re-common 0.2.54__tar.gz → 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {re_common-0.2.54 → re_common-2.0.1}/PKG-INFO +1 -1
- re_common-2.0.1/README.md +10 -0
- re_common-2.0.1/re_common/v2/baselibrary/s3object/baseboto3.py +230 -0
- re_common-2.0.1/re_common/v2/baselibrary/tools/dict_tools.py +24 -0
- re_common-2.0.1/re_common/v2/baselibrary/tools/search_hash_tools.py +33 -0
- re_common-2.0.1/re_common/v2/baselibrary/tools/text_matcher.py +223 -0
- re_common-2.0.1/re_common/v2/baselibrary/tools/unionfind_tools.py +60 -0
- re_common-2.0.1/re_common/v2/baselibrary/utils/BusinessStringUtil.py +74 -0
- re_common-2.0.1/re_common/v2/baselibrary/utils/author_smi.py +308 -0
- re_common-2.0.1/re_common/v2/baselibrary/utils/basedict.py +26 -0
- re_common-2.0.1/re_common/v2/baselibrary/utils/basehdfs.py +127 -0
- re_common-2.0.1/re_common/v2/baselibrary/utils/json_cls.py +11 -0
- re_common-2.0.1/re_common/v2/baselibrary/utils/string_bool.py +9 -0
- re_common-2.0.1/re_common/v2/baselibrary/utils/string_clear.py +98 -0
- re_common-2.0.1/re_common/v2/baselibrary/utils/stringutils.py +95 -0
- {re_common-0.2.54 → re_common-2.0.1}/re_common.egg-info/PKG-INFO +1 -1
- re_common-2.0.1/re_common.egg-info/SOURCES.txt +27 -0
- {re_common-0.2.54 → re_common-2.0.1}/setup.py +3 -2
- re_common-0.2.54/README.md +0 -10
- re_common-0.2.54/re_common/baselibrary/__init__.py +0 -4
- re_common-0.2.54/re_common/baselibrary/baseabs/__init__.py +0 -7
- re_common-0.2.54/re_common/baselibrary/baseabs/baseabs.py +0 -26
- re_common-0.2.54/re_common/baselibrary/database/mbuilder.py +0 -132
- re_common-0.2.54/re_common/baselibrary/database/moudle.py +0 -93
- re_common-0.2.54/re_common/baselibrary/database/msqlite3.py +0 -194
- re_common-0.2.54/re_common/baselibrary/database/mysql.py +0 -169
- re_common-0.2.54/re_common/baselibrary/database/sql_factory.py +0 -26
- re_common-0.2.54/re_common/baselibrary/mthread/MThreadingRun.py +0 -486
- re_common-0.2.54/re_common/baselibrary/mthread/MThreadingRunEvent.py +0 -349
- re_common-0.2.54/re_common/baselibrary/mthread/__init__.py +0 -3
- re_common-0.2.54/re_common/baselibrary/mthread/mythreading.py +0 -695
- re_common-0.2.54/re_common/baselibrary/pakge_other/socks.py +0 -404
- re_common-0.2.54/re_common/baselibrary/readconfig/config_factory.py +0 -18
- re_common-0.2.54/re_common/baselibrary/readconfig/ini_config.py +0 -317
- re_common-0.2.54/re_common/baselibrary/readconfig/toml_config.py +0 -49
- re_common-0.2.54/re_common/baselibrary/temporary/envdata.py +0 -36
- re_common-0.2.54/re_common/baselibrary/tools/all_requests/aiohttp_request.py +0 -118
- re_common-0.2.54/re_common/baselibrary/tools/all_requests/httpx_requet.py +0 -102
- re_common-0.2.54/re_common/baselibrary/tools/all_requests/mrequest.py +0 -412
- re_common-0.2.54/re_common/baselibrary/tools/all_requests/requests_request.py +0 -81
- re_common-0.2.54/re_common/baselibrary/tools/batch_compre/__init__.py +0 -0
- re_common-0.2.54/re_common/baselibrary/tools/batch_compre/bijiao_batch.py +0 -31
- re_common-0.2.54/re_common/baselibrary/tools/contrast_db3.py +0 -123
- re_common-0.2.54/re_common/baselibrary/tools/copy_file.py +0 -39
- re_common-0.2.54/re_common/baselibrary/tools/db3_2_sizedb3.py +0 -102
- re_common-0.2.54/re_common/baselibrary/tools/foreachgz.py +0 -40
- re_common-0.2.54/re_common/baselibrary/tools/get_attr.py +0 -11
- re_common-0.2.54/re_common/baselibrary/tools/image_to_pdf.py +0 -62
- re_common-0.2.54/re_common/baselibrary/tools/java_code_deal.py +0 -139
- re_common-0.2.54/re_common/baselibrary/tools/javacode.py +0 -79
- re_common-0.2.54/re_common/baselibrary/tools/mdb_db3.py +0 -48
- re_common-0.2.54/re_common/baselibrary/tools/merge_file.py +0 -171
- re_common-0.2.54/re_common/baselibrary/tools/merge_gz_file.py +0 -165
- re_common-0.2.54/re_common/baselibrary/tools/mhdfstools/__init__.py +0 -0
- re_common-0.2.54/re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +0 -42
- re_common-0.2.54/re_common/baselibrary/tools/mhdfstools/hdfst.py +0 -42
- re_common-0.2.54/re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +0 -38
- re_common-0.2.54/re_common/baselibrary/tools/mongo_tools.py +0 -50
- re_common-0.2.54/re_common/baselibrary/tools/move_file.py +0 -170
- re_common-0.2.54/re_common/baselibrary/tools/move_mongo/__init__.py +0 -0
- re_common-0.2.54/re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +0 -63
- re_common-0.2.54/re_common/baselibrary/tools/move_mongo/move_mongo_table.py +0 -354
- re_common-0.2.54/re_common/baselibrary/tools/move_mongo/use_mttf.py +0 -18
- re_common-0.2.54/re_common/baselibrary/tools/move_mongo/use_mv.py +0 -93
- re_common-0.2.54/re_common/baselibrary/tools/mpandas/__init__.py +0 -0
- re_common-0.2.54/re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +0 -125
- re_common-0.2.54/re_common/baselibrary/tools/mpandas/pandas_visualization.py +0 -8
- re_common-0.2.54/re_common/baselibrary/tools/myparsel.py +0 -104
- re_common-0.2.54/re_common/baselibrary/tools/rename_dir_file.py +0 -37
- re_common-0.2.54/re_common/baselibrary/tools/sequoiadb_utils.py +0 -398
- re_common-0.2.54/re_common/baselibrary/tools/split_line_to_many.py +0 -25
- re_common-0.2.54/re_common/baselibrary/tools/stringtodicts.py +0 -33
- re_common-0.2.54/re_common/baselibrary/tools/workwechant_bot.py +0 -84
- re_common-0.2.54/re_common/baselibrary/utils/__init__.py +0 -0
- re_common-0.2.54/re_common/baselibrary/utils/baseaiohttp.py +0 -296
- re_common-0.2.54/re_common/baselibrary/utils/baseaiomysql.py +0 -87
- re_common-0.2.54/re_common/baselibrary/utils/baseallstep.py +0 -191
- re_common-0.2.54/re_common/baselibrary/utils/baseavro.py +0 -19
- re_common-0.2.54/re_common/baselibrary/utils/baseboto3.py +0 -291
- re_common-0.2.54/re_common/baselibrary/utils/basecsv.py +0 -32
- re_common-0.2.54/re_common/baselibrary/utils/basedict.py +0 -133
- re_common-0.2.54/re_common/baselibrary/utils/basedir.py +0 -241
- re_common-0.2.54/re_common/baselibrary/utils/baseencode.py +0 -351
- re_common-0.2.54/re_common/baselibrary/utils/baseencoding.py +0 -29
- re_common-0.2.54/re_common/baselibrary/utils/baseesdsl.py +0 -86
- re_common-0.2.54/re_common/baselibrary/utils/baseexcel.py +0 -264
- re_common-0.2.54/re_common/baselibrary/utils/baseexcept.py +0 -109
- re_common-0.2.54/re_common/baselibrary/utils/basefile.py +0 -654
- re_common-0.2.54/re_common/baselibrary/utils/baseftp.py +0 -214
- re_common-0.2.54/re_common/baselibrary/utils/basegzip.py +0 -60
- re_common-0.2.54/re_common/baselibrary/utils/basehdfs.py +0 -135
- re_common-0.2.54/re_common/baselibrary/utils/basehttpx.py +0 -268
- re_common-0.2.54/re_common/baselibrary/utils/baseip.py +0 -87
- re_common-0.2.54/re_common/baselibrary/utils/basejson.py +0 -2
- re_common-0.2.54/re_common/baselibrary/utils/baselist.py +0 -32
- re_common-0.2.54/re_common/baselibrary/utils/basemotor.py +0 -190
- re_common-0.2.54/re_common/baselibrary/utils/basemssql.py +0 -98
- re_common-0.2.54/re_common/baselibrary/utils/baseodbc.py +0 -113
- re_common-0.2.54/re_common/baselibrary/utils/basepandas.py +0 -302
- re_common-0.2.54/re_common/baselibrary/utils/basepeewee.py +0 -11
- re_common-0.2.54/re_common/baselibrary/utils/basepika.py +0 -180
- re_common-0.2.54/re_common/baselibrary/utils/basepydash.py +0 -143
- re_common-0.2.54/re_common/baselibrary/utils/basepymongo.py +0 -230
- re_common-0.2.54/re_common/baselibrary/utils/basequeue.py +0 -22
- re_common-0.2.54/re_common/baselibrary/utils/baserar.py +0 -57
- re_common-0.2.54/re_common/baselibrary/utils/baserequest.py +0 -279
- re_common-0.2.54/re_common/baselibrary/utils/baseset.py +0 -8
- re_common-0.2.54/re_common/baselibrary/utils/basesmb.py +0 -403
- re_common-0.2.54/re_common/baselibrary/utils/basestring.py +0 -382
- re_common-0.2.54/re_common/baselibrary/utils/basetime.py +0 -320
- re_common-0.2.54/re_common/baselibrary/utils/basetuple.py +0 -0
- re_common-0.2.54/re_common/baselibrary/utils/baseurl.py +0 -121
- re_common-0.2.54/re_common/baselibrary/utils/basezip.py +0 -57
- re_common-0.2.54/re_common/baselibrary/utils/core/__init__.py +0 -8
- re_common-0.2.54/re_common/baselibrary/utils/core/bottomutils.py +0 -18
- re_common-0.2.54/re_common/baselibrary/utils/core/mdeprecated.py +0 -327
- re_common-0.2.54/re_common/baselibrary/utils/core/mlamada.py +0 -16
- re_common-0.2.54/re_common/baselibrary/utils/core/msginfo.py +0 -25
- re_common-0.2.54/re_common/baselibrary/utils/core/requests_core.py +0 -103
- re_common-0.2.54/re_common/baselibrary/utils/fateadm.py +0 -429
- re_common-0.2.54/re_common/baselibrary/utils/importfun.py +0 -123
- re_common-0.2.54/re_common/baselibrary/utils/mfaker.py +0 -57
- re_common-0.2.54/re_common/baselibrary/utils/my_abc/__init__.py +0 -3
- re_common-0.2.54/re_common/baselibrary/utils/my_abc/better_abc.py +0 -32
- re_common-0.2.54/re_common/baselibrary/utils/mylogger.py +0 -414
- re_common-0.2.54/re_common/baselibrary/utils/myredisclient.py +0 -861
- re_common-0.2.54/re_common/baselibrary/utils/pipupgrade.py +0 -21
- re_common-0.2.54/re_common/baselibrary/utils/ringlist.py +0 -85
- re_common-0.2.54/re_common/baselibrary/utils/version_compare.py +0 -36
- re_common-0.2.54/re_common/baselibrary/utils/ydmhttp.py +0 -126
- re_common-0.2.54/re_common/facade/__init__.py +0 -1
- re_common-0.2.54/re_common/facade/lazy_import.py +0 -11
- re_common-0.2.54/re_common/facade/loggerfacade.py +0 -25
- re_common-0.2.54/re_common/facade/mysqlfacade.py +0 -467
- re_common-0.2.54/re_common/facade/now.py +0 -31
- re_common-0.2.54/re_common/facade/sqlite3facade.py +0 -257
- re_common-0.2.54/re_common/facade/use/__init__.py +0 -0
- re_common-0.2.54/re_common/facade/use/mq_use_facade.py +0 -83
- re_common-0.2.54/re_common/facade/use/proxy_use_facade.py +0 -20
- re_common-0.2.54/re_common/libtest/__init__.py +0 -0
- re_common-0.2.54/re_common/libtest/base_dict_test.py +0 -19
- re_common-0.2.54/re_common/libtest/baseavro_test.py +0 -13
- re_common-0.2.54/re_common/libtest/basefile_test.py +0 -14
- re_common-0.2.54/re_common/libtest/basemssql_test.py +0 -77
- re_common-0.2.54/re_common/libtest/baseodbc_test.py +0 -8
- re_common-0.2.54/re_common/libtest/basepandas_test.py +0 -38
- re_common-0.2.54/re_common/libtest/get_attr_test/__init__.py +0 -0
- re_common-0.2.54/re_common/libtest/get_attr_test/get_attr_test_settings.py +0 -14
- re_common-0.2.54/re_common/libtest/get_attr_test/settings.py +0 -55
- re_common-0.2.54/re_common/libtest/idencode_test.py +0 -54
- re_common-0.2.54/re_common/libtest/iniconfig_test.py +0 -35
- re_common-0.2.54/re_common/libtest/ip_test.py +0 -35
- re_common-0.2.54/re_common/libtest/merge_file_test.py +0 -20
- re_common-0.2.54/re_common/libtest/mfaker_test.py +0 -9
- re_common-0.2.54/re_common/libtest/mm3_test.py +0 -32
- re_common-0.2.54/re_common/libtest/mylogger_test.py +0 -89
- re_common-0.2.54/re_common/libtest/myparsel_test.py +0 -28
- re_common-0.2.54/re_common/libtest/mysql_test.py +0 -151
- re_common-0.2.54/re_common/libtest/pymongo_test.py +0 -21
- re_common-0.2.54/re_common/libtest/split_test.py +0 -12
- re_common-0.2.54/re_common/libtest/sqlite3_merge_test.py +0 -6
- re_common-0.2.54/re_common/libtest/sqlite3_test.py +0 -34
- re_common-0.2.54/re_common/libtest/tomlconfig_test.py +0 -30
- re_common-0.2.54/re_common/libtest/use_tools_test/__init__.py +0 -3
- re_common-0.2.54/re_common/libtest/user/__init__.py +0 -5
- re_common-0.2.54/re_common/studio/__init__.py +0 -5
- re_common-0.2.54/re_common/studio/assignment_expressions.py +0 -37
- re_common-0.2.54/re_common/studio/mydash/__init__.py +0 -0
- re_common-0.2.54/re_common/studio/mydash/test1.py +0 -19
- re_common-0.2.54/re_common/studio/pydashstudio/__init__.py +0 -0
- re_common-0.2.54/re_common/studio/pydashstudio/first.py +0 -9
- re_common-0.2.54/re_common/studio/streamlitstudio/__init__.py +0 -0
- re_common-0.2.54/re_common/studio/streamlitstudio/first_app.py +0 -66
- re_common-0.2.54/re_common/studio/streamlitstudio/uber_pickups.py +0 -24
- re_common-0.2.54/re_common/studio/test.py +0 -19
- re_common-0.2.54/re_common/vip/__init__.py +0 -0
- re_common-0.2.54/re_common/vip/base_step_process.py +0 -11
- re_common-0.2.54/re_common/vip/baseencodeid.py +0 -91
- re_common-0.2.54/re_common/vip/changetaskname.py +0 -28
- re_common-0.2.54/re_common/vip/core_var.py +0 -24
- re_common-0.2.54/re_common/vip/mmh3Hash.py +0 -90
- re_common-0.2.54/re_common/vip/proxy/__init__.py +0 -0
- re_common-0.2.54/re_common/vip/proxy/allproxys.py +0 -127
- re_common-0.2.54/re_common/vip/proxy/allproxys_thread.py +0 -159
- re_common-0.2.54/re_common/vip/proxy/cnki_proxy.py +0 -153
- re_common-0.2.54/re_common/vip/proxy/kuaidaili.py +0 -87
- re_common-0.2.54/re_common/vip/proxy/proxy_all.py +0 -113
- re_common-0.2.54/re_common/vip/proxy/update_kuaidaili_0.py +0 -42
- re_common-0.2.54/re_common/vip/proxy/wanfang_proxy.py +0 -152
- re_common-0.2.54/re_common/vip/proxy/wp_proxy_all.py +0 -182
- re_common-0.2.54/re_common/vip/read_rawid_to_txt.py +0 -92
- re_common-0.2.54/re_common/vip/title/__init__.py +0 -5
- re_common-0.2.54/re_common/vip/title/transform/TransformBookTitleToZt.py +0 -125
- re_common-0.2.54/re_common/vip/title/transform/TransformConferenceTitleToZt.py +0 -139
- re_common-0.2.54/re_common/vip/title/transform/TransformCstadTitleToZt.py +0 -196
- re_common-0.2.54/re_common/vip/title/transform/TransformJournalTitleToZt.py +0 -203
- re_common-0.2.54/re_common/vip/title/transform/TransformPatentTitleToZt.py +0 -132
- re_common-0.2.54/re_common/vip/title/transform/TransformRegulationTitleToZt.py +0 -114
- re_common-0.2.54/re_common/vip/title/transform/TransformStandardTitleToZt.py +0 -135
- re_common-0.2.54/re_common/vip/title/transform/TransformThesisTitleToZt.py +0 -135
- re_common-0.2.54/re_common/vip/title/transform/__init__.py +0 -11
- re_common-0.2.54/re_common.egg-info/SOURCES.txt +0 -196
- {re_common-0.2.54 → re_common-2.0.1}/LICENSE +0 -0
- {re_common-0.2.54 → re_common-2.0.1}/re_common/__init__.py +0 -0
- {re_common-0.2.54/re_common/baselibrary/database → re_common-2.0.1/re_common/v2}/__init__.py +0 -0
- {re_common-0.2.54/re_common/baselibrary/pakge_other → re_common-2.0.1/re_common/v2/baselibrary}/__init__.py +0 -0
- {re_common-0.2.54/re_common/baselibrary/readconfig → re_common-2.0.1/re_common/v2/baselibrary/s3object}/__init__.py +0 -0
- {re_common-0.2.54/re_common/baselibrary/temporary → re_common-2.0.1/re_common/v2/baselibrary/tools}/__init__.py +0 -0
- /re_common-0.2.54/re_common/baselibrary/tools/__init__.py → /re_common-2.0.1/re_common/v2/baselibrary/tools/list_tools.py +0 -0
- {re_common-0.2.54/re_common/baselibrary/tools/all_requests → re_common-2.0.1/re_common/v2/baselibrary/utils}/__init__.py +0 -0
- {re_common-0.2.54 → re_common-2.0.1}/re_common.egg-info/dependency_links.txt +0 -0
- {re_common-0.2.54 → re_common-2.0.1}/re_common.egg-info/top_level.txt +0 -0
- {re_common-0.2.54 → re_common-2.0.1}/setup.cfg +0 -0

re_common-2.0.1/re_common/v2/baselibrary/s3object/baseboto3.py
@@ -0,0 +1,230 @@
+from boto3.session import Session
+
+
+class BaseBoto3(object):
+
+    def __init__(self, aws_access_key_id="", aws_secret_access_key="", endpoint_url=""):
+        self.aws_access_key_id = aws_access_key_id
+        self.aws_secret_access_key = aws_secret_access_key
+        self.endpoint_url = endpoint_url
+        self.session = None
+        self.client = None
+        if self.aws_access_key_id and self.aws_secret_access_key and self.endpoint_url:
+            self.conn_session()
+            self.get_client()
+
+    def set_key(self, aws_access_key_id, aws_secret_access_key, endpoint_url):
+        self.aws_access_key_id = aws_access_key_id
+        self.aws_secret_access_key = aws_secret_access_key
+        self.endpoint_url = endpoint_url
+        return self
+
+    def conn_session(self):
+        assert self.aws_access_key_id not in (None, '')
+        assert self.aws_secret_access_key not in (None, '')
+        self.session = Session(aws_access_key_id=self.aws_access_key_id,
+                               aws_secret_access_key=self.aws_secret_access_key)
+        return self.session
+
+    def get_client(self):
+        assert self.session is not None
+        self.client = self.session.client('s3', endpoint_url=self.endpoint_url)
+        return self
+
+    def get_all_buckets(self):
+        """
+        Get information about all buckets.
+        :return:
+        """
+        return self.client.list_buckets()
+
+    def create_buckets(self, buckets_name):
+        """
+        When created through the client from get_client, the response looks like:
+        {'ResponseMetadata': {'RequestId': '16BC90EED4A433C4', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders': {'accept-ranges': 'bytes', 'content-length': '0', 'content-security-policy': 'block-all-mixed-content', 'location': '/create1', 'server': 'MinIO', 'strict-transport-security': 'max-age=31536000; includeSubDomains', 'vary': 'Origin, Accept-Encoding', 'x-amz-request-id': '16BC90EED4A433C4', 'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'date': 'Wed, 01 Dec 2021 07:28:39 GMT'}, 'RetryAttempts': 0}, 'Location': '/create1'}
+        """
+        assert buckets_name.find("_") == -1, "bucket name must not contain an underscore (_)"
+        # Create a new bucket (the bucket name must not contain an underscore)
+        return self.client.create_bucket(Bucket=buckets_name)
+
+    def delete_buckets(self, bucket_name):
+        """
+        Delete a bucket (only an empty bucket can be deleted).
+        :return:
+        """
+        response = self.client.delete_bucket(Bucket=bucket_name)
+        return response
+
+    def get_bucket(self, bucket_name):
+        raise Exception("Not implemented")
+
+    def get_all_objs(self, bucket_name, prefix=None, continuation_token=None):
+        """
+
+        continuation_token: if there are more than 1000 objects, pass the continuation_token from the previous result
+
+        Structure of the response:
+        {'ResponseMetadata': {'RequestId': '1818F447C1E7BA3B', 'HostId': '', 'HTTPStatusCode': 200,
+        'HTTPHeaders': {'accept-ranges': 'bytes', 'content-length': '3182', 'content-security-policy': 'block-all-mixed-content', 'content-type': 'application/xml',
+        'server': 'MinIO', 'strict-transport-security': 'max-age=31536000; includeSubDomains', 'vary': 'Origin, Accept-Encoding', 'x-amz-request-id': '1818F447C1E7BA3B',
+        'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'date': 'Thu, 09 Jan 2025 07:04:05 GMT'}, 'RetryAttempts': 0},
+        'IsTruncated': False, 'Contents':
+        [
+        {'Key': 'zt_file/zt类型样例数据/11_part-00000.gz', 'LastModified': datetime.datetime(2024, 4, 28, 2, 56, 59, 716000, tzinfo=tzutc()), 'ETag': '"e0d635f171bce6a67ad72265e5f9137d-2"',
+        'Size': 18164139, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'minio', 'ID': '02d6176db174dc93cb1b899f7c6078f08654445fe8cf1b6ce98d8855f66bdbf4'}},
+        {'Key': 'zt_file/zt类型样例数据/12_part-00000.gz', 'LastModified': datetime.datetime(2024, 4, 28, 2, 56, 57, 70000, tzinfo=tzutc()), 'ETag': '"f238fe9973a2bc0d3e1562c2938ce897-9"',
+        'Size': 93710911, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'minio', 'ID': '02d6176db174dc93cb1b899f7c6078f08654445fe8cf1b6ce98d8855f66bdbf4'}},
+        ],
+        'Name': 'crawl.dc.cqvip.com', 'Prefix': 'zt_file/zt类型样例数据', 'Delimiter': '',
+        'MaxKeys': 1000, 'EncodingType': 'url', 'KeyCount': 7}
+
+        """
+        if continuation_token:
+            # List the bucket's objects that start with the given prefix
+            response = self.client.list_objects_v2(Bucket=bucket_name,
+                                                   Prefix=prefix,
+                                                   ContinuationToken=continuation_token)
+        else:
+            # List the bucket's objects that start with the given prefix
+            response = self.client.list_objects_v2(Bucket=bucket_name,
+                                                   Prefix=prefix)
+        object_list = []
+        # Check whether any objects were returned
+        if 'Contents' in response:
+            object_list = [obj['Key'] for obj in response['Contents']]
+
+        continuation_token = None
+        # Check whether there are more objects
+        if response.get('IsTruncated'):  # a truncated response means there are more objects
+            continuation_token = response.get('NextContinuationToken')
+
+        return object_list, continuation_token
+
+    def list_prefixes(self, bucket_name, prefix=None, Delimiter="/", continuation_token=None):
+        """
+        List the directories one level below a prefix.
+        prefix: note that this must end with Delimiter, e.g. with Delimiter="/" use prefix="a/"
+        continuation_token: if there are more than 1000 entries, pass the continuation_token from the previous result
+        return: ['a/b/', 'a/c/']; note the returned entries include the prefix; the prefix list contains directories only, never files
+        """
+        if continuation_token:
+            # List the bucket's objects that start with the given prefix
+            response = self.client.list_objects_v2(Bucket=bucket_name,
+                                                   Prefix=prefix,
+                                                   Delimiter=Delimiter,  # use the slash delimiter to emulate a directory structure
+                                                   ContinuationToken=continuation_token)
+        else:
+            # List the bucket's objects that start with the given prefix
+            response = self.client.list_objects_v2(Bucket=bucket_name,
+                                                   Delimiter=Delimiter,  # use the slash delimiter to emulate a directory structure
+                                                   Prefix=prefix)
+        object_list = []
+        # Check whether any objects were returned
+        if 'Contents' in response:
+            object_list = [obj['Key'] for obj in response['Contents']]
+
+        Prefix_list = []
+        # Check whether any directories were returned
+        if 'CommonPrefixes' in response:
+            Prefix_list = [obj['Prefix'] for obj in response['CommonPrefixes']]
+
+        continuation_token = None
+        # Check whether there are more objects
+        if response.get('IsTruncated'):  # a truncated response means there are more objects
+            continuation_token = response.get('NextContinuationToken')
+
+        return object_list, Prefix_list, continuation_token
+
+    def get_object_value(self, bucket_name, file_key, encoding='utf-8'):
+        """
+        Read text data.
+        Returns:
+        """
+        obj = self.client.get_object(Bucket=bucket_name, Key=file_key)
+        body = obj['Body'].read().decode(encoding)
+        return body
+
+    def put_object(self, bucket_name, key, body):
+        """
+        Write content directly to an object.
+        Args:
+            bucket_name:
+            key:
+            body: must already be encoded, e.g. .encode('utf-8')
+
+        Returns:
+        """
+        self.client.put_object(Bucket=bucket_name,
+                               Key=key,
+                               Body=body)
+
+    def download_file(self, bucket_name, key, local_file):
+        """
+        return: None
+        """
+        result = self.client.download_file(bucket_name, key, local_file)
+        return result
+
+    def upload_file(self, bucket_name, key, local_file):
+        """
+        # key is the location in the bucket, e.g. test1/test.pdf
+        :param local_file: local file path
+        :param bucket_name: bucket name
+        :param key: remote file path
+        :return:
+        """
+        self.client.upload_file(local_file, bucket_name, key)
+
+    def download_fileobj(self, bucket_name, key, fileobj):
+        """
+        return: None
+        """
+        result = self.client.download_fileobj(bucket_name, key, fileobj)
+        return result
+
+    def upload_fileobj(self, bucket_name, key, fileobj):
+        # fileobj: a binary file-like object
+        self.client.upload_fileobj(fileobj, bucket_name, key)
+
+    def check_exist_or_file_info(self, bucket_name, key):
+        """
+        Check whether the file exists and fetch its info, e.g.
+        {'ResponseMetadata': {'RequestId': '17E6A65A2B299D3B', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders':
+        {'accept-ranges': 'bytes', 'content-length': '117', 'content-security-policy': 'block-all-mixed-content', 'content-type': 'binary/octet-stream',
+        'etag': '"2237a934f176003e41abf3d733291079"', 'last-modified': 'Thu, 25 Jul 2024 05:49:43 GMT', 'server': 'MinIO',
+        'strict-transport-security': 'max-age=31536000; includeSubDomains', 'vary': 'Origin, Accept-Encoding', 'x-amz-request-id': '17E6A65A2B299D3B',
+        'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'date': 'Mon, 29 Jul 2024 09:53:33 GMT'}, 'RetryAttempts': 0},
+        'AcceptRanges': 'bytes', 'LastModified': datetime.datetime(2024, 7, 25, 5, 49, 43, tzinfo=tzutc()), 'ContentLength': 117, 'ETag': '"2237a934f176003e41abf3d733291079"',
+        'ContentType': 'binary/octet-stream', 'Metadata': {}}
+        """
+        try:
+            obj_info = self.client.head_object(
+                Bucket=bucket_name,
+                Key=key
+            )
+            return obj_info
+        except Exception:
+            return None
+
+    def get_prefix_count(self, bucket_name, obj_count, prefix, continuation_token=None):
+        """
+        Count the objects under a prefix. Slow, because every directory requires its own request.
+        """
+        for index in range(10000):
+            obj_list, dir_list, token = self.list_prefixes(bucket_name=bucket_name,
+                                                           prefix=prefix,
+                                                           continuation_token=continuation_token)
+
+            obj_count = obj_count + len(obj_list)
+            for dir_sub in dir_list:
+                obj_count = self.get_prefix_count(bucket_name, obj_count, dir_sub)
+
+            if token:
+                continuation_token = token
+            else:
+                break
+
+            if index > 10000 - 5:
+                raise Exception("Loop budget exhausted; check the logic for correctness")
+
+        return obj_count
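
For orientation, a minimal sketch of how the paginated listing above is meant to be driven; the endpoint, credentials, and bucket name are placeholders, not values shipped with the package:

    from re_common.v2.baselibrary.s3object.baseboto3 import BaseBoto3

    # Hypothetical MinIO endpoint and credentials
    s3 = BaseBoto3(aws_access_key_id="minio", aws_secret_access_key="secret",
                   endpoint_url="http://127.0.0.1:9000")

    keys, token = s3.get_all_objs("my-bucket", prefix="data/")
    while token:  # list_objects_v2 returns at most 1000 keys per page
        page, token = s3.get_all_objs("my-bucket", prefix="data/", continuation_token=token)
        keys.extend(page)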

re_common-2.0.1/re_common/v2/baselibrary/tools/dict_tools.py
@@ -0,0 +1,24 @@
+class DotDict(dict):
+    """
+    Make a dict behave like an object: keys can be read both with dict syntax and with dot notation.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Recursively convert nested dicts to DotDict
+        for key, value in self.items():
+            if isinstance(value, dict):
+                self[key] = DotDict(value)
+
+    def __getattr__(self, key):
+        try:
+            value = self[key]
+            if isinstance(value, dict):  # if the value is a dict, convert it to DotDict as well
+                return DotDict(value)
+            return value
+        except KeyError:
+            raise AttributeError(f"'DotDict' object has no attribute '{key}'")
+
+    def __setattr__(self, key, value):
+        if isinstance(value, dict):  # if the value is a dict, convert it to DotDict
+            value = DotDict(value)
+        self[key] = value
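
A quick illustration of the behavior the docstring describes:

    d = DotDict({"a": 1, "b": {"c": 2}})
    assert d.a == 1 and d["b"].c == 2   # dict-style and dot-style access both work
    d.e = {"f": 3}                      # dicts assigned via attributes are converted too
    assert d.e.f == 3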

re_common-2.0.1/re_common/v2/baselibrary/tools/search_hash_tools.py
@@ -0,0 +1,33 @@
+from typing import List
+
+import jieba
+from datasketch import MinHash
+
+
+def tokenize(text: str, stopwords=None) -> List[str]:
+    """
+    Tokenize and remove stopwords.
+    """
+    if stopwords is None:
+        stopwords = []
+    words = jieba.lcut(text)
+    # Count the single-character tokens: text that jieba cannot segment may come back as single characters
+    one_char_size = len([i for i in words if len(i) == 1])
+    all_size = len(words)
+    # If the proportion of single-character tokens is too high, fall back to whitespace splitting
+    if all_size != 0 and one_char_size / all_size > 0.6:
+        words = [i for i in text.split() if i.strip()]
+
+    # Filter out stopwords and empty strings
+    words = [w for w in words if w not in stopwords and w.strip()]
+    return words
+
+
+def create_minhash(words: List[str], num_perm=128) -> MinHash:
+    """
+    Create a MinHash from a list of tokens.
+    """
+    minhash = MinHash(num_perm=num_perm)
+    for word in words:
+        minhash.update(word.encode("utf-8"))
+    return minhash
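
The two helpers compose directly: hashing each document's tokens lets datasketch's MinHash.jaccard estimate the Jaccard similarity of the underlying token sets. A small sketch:

    a = create_minhash(tokenize("北京是中国的首都"))
    b = create_minhash(tokenize("中国的首都是北京"))
    print(a.jaccard(b))  # estimated Jaccard similarity of the two token sets (close to 1.0 here)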

re_common-2.0.1/re_common/v2/baselibrary/tools/text_matcher.py
@@ -0,0 +1,223 @@
+import jieba
+import re
+from typing import List, Dict, Tuple, Set, Optional, Union
+from datasketch import MinHash, MinHashLSH
+
+
+class TextMatcher:
+    def __init__(
+            self,
+            threshold: float = 0.5,
+            num_perm: int = 128,
+            is_raw_texts=True,
+            stopwords_path: Optional[str] = None,
+            user_dict_path: Optional[str] = None,
+    ):
+        """
+        Initialize the text matcher.
+
+        Args:
+            threshold: LSH similarity threshold
+            num_perm: number of MinHash permutations
+            stopwords_path: path to a stopwords file
+            user_dict_path: path to a user-defined jieba dictionary
+        """
+        self.threshold = threshold
+        self.num_perm = num_perm
+        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+        # self.minhashes: Dict[str, MinHash] = {}
+        self.raw_texts: Dict[str, str] = {}
+        self.is_raw_texts = is_raw_texts
+        self.doc_counter = 0
+
+        # Load the stopwords
+        self.stopwords: Set[str] = set()
+        if stopwords_path:
+            self.load_stopwords(stopwords_path)
+
+        # Load the user dictionary
+        if user_dict_path:
+            jieba.load_userdict(user_dict_path)
+
+    def load_stopwords(self, stopwords_path: str) -> None:
+        """Load stopwords."""
+        with open(stopwords_path, "r", encoding="utf-8") as f:
+            self.stopwords = set(line.strip() for line in f)
+
+    def preprocess_text(self, text: str) -> str:
+        """
+        Preprocess the text.
+        """
+        # Lowercase
+        text = text.lower()
+        # Remove special characters
+        text = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
+        # Collapse redundant whitespace
+        text = re.sub(r"\s+", " ", text).strip()
+        return text
+
+    def tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize and remove stopwords.
+        """
+        words = jieba.lcut(text)
+        one_char_size = len([i for i in words if len(i) == 1])
+        all_size = len(words)
+        if all_size != 0 and one_char_size / all_size > 0.6:
+            words = [i for i in text.split() if i.strip()]
+
+        # Filter out stopwords and empty strings
+        words = [w for w in words if w not in self.stopwords and w.strip()]
+        return words
+
+    def create_minhash(self, words: List[str]) -> MinHash:
+        """
+        Create a MinHash from a list of tokens.
+        """
+        minhash = MinHash(num_perm=self.num_perm)
+        for word in words:
+            minhash.update(word.encode("utf-8"))
+        return minhash
+
+    def add_document(self, text: str, doc_id: Optional[str] = None) -> str:
+        """
+        Add a document to the index.
+
+        Args:
+            text: document text
+            doc_id: document ID (optional)
+
+        Returns:
+            doc_id: document ID
+        """
+        if doc_id is None:
+            doc_id = f"doc_{self.doc_counter}"
+            self.doc_counter += 1
+
+        # Preprocess and tokenize
+        processed_text = self.preprocess_text(text)
+        words = self.tokenize(processed_text)
+
+        # Create the MinHash
+        minhash = self.create_minhash(words)
+        if self.is_raw_texts:
+            # Store the raw text and the MinHash
+            self.raw_texts[doc_id] = text
+            # self.minhashes[doc_id] = minhash
+
+        # Add to the LSH index
+        self.lsh.insert(doc_id, minhash)
+
+        return doc_id
+
+    def batch_add_documents(self, texts: Dict[str, str]) -> None:
+        """
+        Add documents in batch.
+
+        Args:
+            texts: a {doc_id: text} dict
+        """
+        for doc_id, text in texts.items():
+            self.add_document(text, doc_id)
+
+    def create_query_minhash(self, query: str):
+        # Preprocess the query text
+        processed_query = self.preprocess_text(query)
+        query_words = self.tokenize(processed_query)
+        # print(query_words)
+        query_minhash = self.create_minhash(query_words)
+        return query_minhash
+
+    def find_similar(self, query_minhash: MinHash, return_similarities: bool = False) -> Union[
+            List[str], List[Tuple[str, float]]]:
+        """
+        Find similar documents.
+
+        Args:
+            query_minhash: MinHash of the query text (see create_query_minhash)
+            return_similarities: whether to return similarity scores
+
+        Returns:
+            [(doc_id, similarity), ...] if return_similarities is True,
+            otherwise [doc_id, ...]
+        """
+
+        # Use LSH to find the candidate set
+        similar_docs = self.lsh.query(query_minhash)
+
+        # if return_similarities:
+        #     # Compute the exact Jaccard similarity
+        #     results = []
+        #     for doc_id in similar_docs:
+        #         similarity = query_minhash.jaccard(self.minhashes[doc_id])
+        #         results.append((doc_id, similarity))
+        #     # Sort by similarity, descending
+        #     return sorted(results, key=lambda x: x[1], reverse=True)
+
+        return similar_docs
+
+    def get_text(self, doc_id: str) -> Optional[str]:
+        """Get the raw text."""
+        if self.is_raw_texts:
+            return self.raw_texts.get(doc_id)
+        raise Exception("Raw-text storage is not enabled")
+
+    def remove_document(self, doc_id: str) -> bool:
+        """
+        Remove a document.
+
+        Returns:
+            bool: whether the removal succeeded
+        """
+        # if doc_id not in self.minhashes:
+        #     return False
+
+        self.lsh.remove(doc_id)
+        # del self.minhashes[doc_id]
+        if self.is_raw_texts:
+            del self.raw_texts[doc_id]
+        return True
+
+    def clear(self) -> None:
+        """Clear all data."""
+        self.lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
+        # self.minhashes.clear()
+        self.raw_texts.clear()
+        self.doc_counter = 0
+
+
+if __name__ == "__main__":
+    # Create a matcher instance
+    matcher = TextMatcher(
+        threshold=0.1,  # similarity threshold
+        num_perm=128,  # number of MinHash permutations
+    )
+
+    # Add a single document
+    doc_id = matcher.add_document(
+        "北京是中国的首都"
+    )
+
+    # Add documents in batch
+    docs = {"doc1": "北京是一座现代化的大都市", "doc2": "上海是中国最大的城市", "doc3": "中国的首都是北京"}
+    matcher.batch_add_documents(docs)
+
+    # Build the query MinHash, then look up similar documents
+    query_minhash = matcher.create_query_minhash("北京首都")
+    similar_docs = matcher.find_similar(query_minhash)
+    print("Similar document IDs:", similar_docs)
+
+    # Fetch the raw texts (find_similar returns IDs only: the
+    # exact-similarity branch above is commented out)
+    for doc_id in similar_docs:
+        print(f"Document {doc_id}: {matcher.get_text(doc_id)}")
+
+    # Remove a document
+    matcher.remove_document("doc1")
+
+    # Clear all data
+    matcher.clear()

re_common-2.0.1/re_common/v2/baselibrary/tools/unionfind_tools.py
@@ -0,0 +1,60 @@
+"""
+Union-Find (disjoint set union) is a data structure for managing a partition of elements into groups, mainly used for dynamic connectivity problems. It supports two core operations:
+
+Find: determine which set an element belongs to.
+
+Union: merge two sets into one.
+
+Union-Find is widely used in graph theory, network connectivity, social network analysis, image processing, and other areas.
+"""
+
+
+class UnionFind:
+    def __init__(self):
+        """
+        Initialize the union-find structure.
+        parent and rank are stored in dicts so elements can be added dynamically.
+        """
+        self.parent = {}  # each element's parent node, representing the set as a tree
+        self.rank = {}  # each set's rank (tree height), used to optimize unions
+
+    def find(self, x):
+        """
+        Find the root of element x (with path compression).
+        If the element does not exist yet, it is added dynamically.
+        """
+        if x not in self.parent:  # if x is not in the parent dict
+            self.parent[x] = x  # make x its own parent (initialization)
+            self.rank[x] = 1  # initialize x's rank to 1
+        if self.parent[x] != x:  # if x is not a root (path compression)
+            self.parent[x] = self.find(self.parent[x])  # recursively find the root and reparent x
+        return self.parent[x]  # return x's root
+
+    def union(self, x, y):
+        """
+        Merge the sets containing x and y (union by rank).
+        If an element does not exist yet, it is added dynamically.
+        """
+        root_x = self.find(x)  # find x's root
+        root_y = self.find(y)  # find y's root
+        if root_x != root_y:  # if x and y are in different sets
+            # Union by rank
+            if self.rank[root_x] > self.rank[root_y]:  # x's set has the higher rank
+                self.parent[root_y] = root_x  # point y's root at x's root
+            elif self.rank[root_x] < self.rank[root_y]:  # y's set has the higher rank
+                self.parent[root_x] = root_y  # point x's root at y's root
+            else:  # the two sets have equal rank
+                self.parent[root_y] = root_x  # point y's root at x's root
+                self.rank[root_x] += 1  # and increase the rank of x's set
+
+    def get_groups(self):
+        """
+        Get all groups as a dict whose keys are the roots and whose values are the elements of each group.
+        """
+        groups = {}  # dict that collects the groups
+        for x in self.parent:  # iterate over all elements
+            root = self.find(x)  # find the element's root
+            if root not in groups:  # first time this root is seen
+                groups[root] = []  # start an empty list
+            groups[root].append(x)  # add the element to its root's group
+        return groups  # return the grouping
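
A short usage sketch of the dynamic grouping this class provides:

    uf = UnionFind()
    uf.union("a", "b")
    uf.union("b", "c")
    uf.union("x", "y")
    print(uf.find("a") == uf.find("c"))  # True: a, b, c are in one set
    print(uf.get_groups())               # e.g. {'a': ['a', 'b', 'c'], 'x': ['x', 'y']}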

re_common-2.0.1/re_common/v2/baselibrary/utils/BusinessStringUtil.py
@@ -0,0 +1,74 @@
+# String handling for specific business scenarios; domain-specific utilities
+import re
+
+
+def clean_organ_postcode(organ):
+    """
+    Format an organization-name string: remove bracketed content and standalone 6-digit numbers (postal codes), then clean up punctuation.
+
+    Note: this method replaces formatOrgan on the Java side.
+
+    Args:
+        organ (str): the input organization-name string, which may contain brackets, semicolons, and postal codes.
+
+    Returns:
+        str: the formatted and cleaned organization-name string (no standalone 6-digit numbers).
+    """
+    # If the input is empty, use an empty string to avoid errors in later steps
+    if not organ:
+        organ = ""
+
+    # Remove the content of square brackets and parentheses (including the brackets themselves)
+    organ = re.sub(r"\[.*?\]", "", organ)  # non-greedy match on square-bracket content
+    organ = re.sub(r"\(.*?\)", "", organ)  # non-greedy match on parenthesized content
+
+    # Regular expression matching a standalone 6-digit number
+    # \b is a word boundary, ensuring the 6 digits stand alone (not adjacent to letters, digits, or underscores)
+    organ = re.sub(r"\b[0-9]{6}\b", "", organ)
+
+    # Result list for the processed organization-name parts
+    format_organ = []
+    # Split on semicolons into the individual parts of the organization name
+    organ_parts = organ.split(";")
+
+    # Process each part and append it to the result list
+    for temp_organ in organ_parts:
+        # Strip surrounding whitespace before appending (avoids empty strings left behind by postcode removal)
+        cleaned_part = temp_organ.strip()
+        # Remove leading and trailing punctuation
+        # Regular expression covering common punctuation characters
+        punctuation = r"^[!,.?;:#$%^&*+-]+|[!,.?;:#$%^&*+-]+$"
+        cleaned_part = re.sub(punctuation, "", cleaned_part)
+        if cleaned_part:  # append non-empty parts only
+            format_organ.append(cleaned_part)
+
+    # Join the parts with semicolons
+    format_organ = ";".join(format_organ)
+
+    # Return the final result with surrounding whitespace stripped
+    return format_organ.strip()
+
+
+def get_first_organ(organ):
+    if not organ:
+        return ""
+    organ_list = organ.strip().split(";")
+    for organ_one in organ_list:
+        # Clean out postal codes
+        organ_one = clean_organ_postcode(organ_one)
+        if organ_one:
+            return organ_one
+
+    return ""
+
+
+def get_first_author(author: str) -> str:
+    if not author:
+        return ""
+    au_list = author.strip().split(";")
+    for au in au_list:
+        au = re.sub("\\[.*?]", "", au)
+        au = re.sub("\\(.*?\\)", "", au)
+        if au:
+            return au
+    return ""