re-common 0.2.54__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213) hide show
  1. {re_common-0.2.54 → re_common-2.0.1}/PKG-INFO +1 -1
  2. re_common-2.0.1/README.md +10 -0
  3. re_common-2.0.1/re_common/v2/baselibrary/s3object/baseboto3.py +230 -0
  4. re_common-2.0.1/re_common/v2/baselibrary/tools/dict_tools.py +24 -0
  5. re_common-2.0.1/re_common/v2/baselibrary/tools/search_hash_tools.py +33 -0
  6. re_common-2.0.1/re_common/v2/baselibrary/tools/text_matcher.py +223 -0
  7. re_common-2.0.1/re_common/v2/baselibrary/tools/unionfind_tools.py +60 -0
  8. re_common-2.0.1/re_common/v2/baselibrary/utils/BusinessStringUtil.py +74 -0
  9. re_common-2.0.1/re_common/v2/baselibrary/utils/author_smi.py +308 -0
  10. re_common-2.0.1/re_common/v2/baselibrary/utils/basedict.py +26 -0
  11. re_common-2.0.1/re_common/v2/baselibrary/utils/basehdfs.py +127 -0
  12. re_common-2.0.1/re_common/v2/baselibrary/utils/json_cls.py +11 -0
  13. re_common-2.0.1/re_common/v2/baselibrary/utils/string_bool.py +9 -0
  14. re_common-2.0.1/re_common/v2/baselibrary/utils/string_clear.py +98 -0
  15. re_common-2.0.1/re_common/v2/baselibrary/utils/stringutils.py +95 -0
  16. {re_common-0.2.54 → re_common-2.0.1}/re_common.egg-info/PKG-INFO +1 -1
  17. re_common-2.0.1/re_common.egg-info/SOURCES.txt +27 -0
  18. {re_common-0.2.54 → re_common-2.0.1}/setup.py +3 -2
  19. re_common-0.2.54/README.md +0 -10
  20. re_common-0.2.54/re_common/baselibrary/__init__.py +0 -4
  21. re_common-0.2.54/re_common/baselibrary/baseabs/__init__.py +0 -7
  22. re_common-0.2.54/re_common/baselibrary/baseabs/baseabs.py +0 -26
  23. re_common-0.2.54/re_common/baselibrary/database/mbuilder.py +0 -132
  24. re_common-0.2.54/re_common/baselibrary/database/moudle.py +0 -93
  25. re_common-0.2.54/re_common/baselibrary/database/msqlite3.py +0 -194
  26. re_common-0.2.54/re_common/baselibrary/database/mysql.py +0 -169
  27. re_common-0.2.54/re_common/baselibrary/database/sql_factory.py +0 -26
  28. re_common-0.2.54/re_common/baselibrary/mthread/MThreadingRun.py +0 -486
  29. re_common-0.2.54/re_common/baselibrary/mthread/MThreadingRunEvent.py +0 -349
  30. re_common-0.2.54/re_common/baselibrary/mthread/__init__.py +0 -3
  31. re_common-0.2.54/re_common/baselibrary/mthread/mythreading.py +0 -695
  32. re_common-0.2.54/re_common/baselibrary/pakge_other/socks.py +0 -404
  33. re_common-0.2.54/re_common/baselibrary/readconfig/config_factory.py +0 -18
  34. re_common-0.2.54/re_common/baselibrary/readconfig/ini_config.py +0 -317
  35. re_common-0.2.54/re_common/baselibrary/readconfig/toml_config.py +0 -49
  36. re_common-0.2.54/re_common/baselibrary/temporary/envdata.py +0 -36
  37. re_common-0.2.54/re_common/baselibrary/tools/all_requests/aiohttp_request.py +0 -118
  38. re_common-0.2.54/re_common/baselibrary/tools/all_requests/httpx_requet.py +0 -102
  39. re_common-0.2.54/re_common/baselibrary/tools/all_requests/mrequest.py +0 -412
  40. re_common-0.2.54/re_common/baselibrary/tools/all_requests/requests_request.py +0 -81
  41. re_common-0.2.54/re_common/baselibrary/tools/batch_compre/__init__.py +0 -0
  42. re_common-0.2.54/re_common/baselibrary/tools/batch_compre/bijiao_batch.py +0 -31
  43. re_common-0.2.54/re_common/baselibrary/tools/contrast_db3.py +0 -123
  44. re_common-0.2.54/re_common/baselibrary/tools/copy_file.py +0 -39
  45. re_common-0.2.54/re_common/baselibrary/tools/db3_2_sizedb3.py +0 -102
  46. re_common-0.2.54/re_common/baselibrary/tools/foreachgz.py +0 -40
  47. re_common-0.2.54/re_common/baselibrary/tools/get_attr.py +0 -11
  48. re_common-0.2.54/re_common/baselibrary/tools/image_to_pdf.py +0 -62
  49. re_common-0.2.54/re_common/baselibrary/tools/java_code_deal.py +0 -139
  50. re_common-0.2.54/re_common/baselibrary/tools/javacode.py +0 -79
  51. re_common-0.2.54/re_common/baselibrary/tools/mdb_db3.py +0 -48
  52. re_common-0.2.54/re_common/baselibrary/tools/merge_file.py +0 -171
  53. re_common-0.2.54/re_common/baselibrary/tools/merge_gz_file.py +0 -165
  54. re_common-0.2.54/re_common/baselibrary/tools/mhdfstools/__init__.py +0 -0
  55. re_common-0.2.54/re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +0 -42
  56. re_common-0.2.54/re_common/baselibrary/tools/mhdfstools/hdfst.py +0 -42
  57. re_common-0.2.54/re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +0 -38
  58. re_common-0.2.54/re_common/baselibrary/tools/mongo_tools.py +0 -50
  59. re_common-0.2.54/re_common/baselibrary/tools/move_file.py +0 -170
  60. re_common-0.2.54/re_common/baselibrary/tools/move_mongo/__init__.py +0 -0
  61. re_common-0.2.54/re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +0 -63
  62. re_common-0.2.54/re_common/baselibrary/tools/move_mongo/move_mongo_table.py +0 -354
  63. re_common-0.2.54/re_common/baselibrary/tools/move_mongo/use_mttf.py +0 -18
  64. re_common-0.2.54/re_common/baselibrary/tools/move_mongo/use_mv.py +0 -93
  65. re_common-0.2.54/re_common/baselibrary/tools/mpandas/__init__.py +0 -0
  66. re_common-0.2.54/re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +0 -125
  67. re_common-0.2.54/re_common/baselibrary/tools/mpandas/pandas_visualization.py +0 -8
  68. re_common-0.2.54/re_common/baselibrary/tools/myparsel.py +0 -104
  69. re_common-0.2.54/re_common/baselibrary/tools/rename_dir_file.py +0 -37
  70. re_common-0.2.54/re_common/baselibrary/tools/sequoiadb_utils.py +0 -398
  71. re_common-0.2.54/re_common/baselibrary/tools/split_line_to_many.py +0 -25
  72. re_common-0.2.54/re_common/baselibrary/tools/stringtodicts.py +0 -33
  73. re_common-0.2.54/re_common/baselibrary/tools/workwechant_bot.py +0 -84
  74. re_common-0.2.54/re_common/baselibrary/utils/__init__.py +0 -0
  75. re_common-0.2.54/re_common/baselibrary/utils/baseaiohttp.py +0 -296
  76. re_common-0.2.54/re_common/baselibrary/utils/baseaiomysql.py +0 -87
  77. re_common-0.2.54/re_common/baselibrary/utils/baseallstep.py +0 -191
  78. re_common-0.2.54/re_common/baselibrary/utils/baseavro.py +0 -19
  79. re_common-0.2.54/re_common/baselibrary/utils/baseboto3.py +0 -291
  80. re_common-0.2.54/re_common/baselibrary/utils/basecsv.py +0 -32
  81. re_common-0.2.54/re_common/baselibrary/utils/basedict.py +0 -133
  82. re_common-0.2.54/re_common/baselibrary/utils/basedir.py +0 -241
  83. re_common-0.2.54/re_common/baselibrary/utils/baseencode.py +0 -351
  84. re_common-0.2.54/re_common/baselibrary/utils/baseencoding.py +0 -29
  85. re_common-0.2.54/re_common/baselibrary/utils/baseesdsl.py +0 -86
  86. re_common-0.2.54/re_common/baselibrary/utils/baseexcel.py +0 -264
  87. re_common-0.2.54/re_common/baselibrary/utils/baseexcept.py +0 -109
  88. re_common-0.2.54/re_common/baselibrary/utils/basefile.py +0 -654
  89. re_common-0.2.54/re_common/baselibrary/utils/baseftp.py +0 -214
  90. re_common-0.2.54/re_common/baselibrary/utils/basegzip.py +0 -60
  91. re_common-0.2.54/re_common/baselibrary/utils/basehdfs.py +0 -135
  92. re_common-0.2.54/re_common/baselibrary/utils/basehttpx.py +0 -268
  93. re_common-0.2.54/re_common/baselibrary/utils/baseip.py +0 -87
  94. re_common-0.2.54/re_common/baselibrary/utils/basejson.py +0 -2
  95. re_common-0.2.54/re_common/baselibrary/utils/baselist.py +0 -32
  96. re_common-0.2.54/re_common/baselibrary/utils/basemotor.py +0 -190
  97. re_common-0.2.54/re_common/baselibrary/utils/basemssql.py +0 -98
  98. re_common-0.2.54/re_common/baselibrary/utils/baseodbc.py +0 -113
  99. re_common-0.2.54/re_common/baselibrary/utils/basepandas.py +0 -302
  100. re_common-0.2.54/re_common/baselibrary/utils/basepeewee.py +0 -11
  101. re_common-0.2.54/re_common/baselibrary/utils/basepika.py +0 -180
  102. re_common-0.2.54/re_common/baselibrary/utils/basepydash.py +0 -143
  103. re_common-0.2.54/re_common/baselibrary/utils/basepymongo.py +0 -230
  104. re_common-0.2.54/re_common/baselibrary/utils/basequeue.py +0 -22
  105. re_common-0.2.54/re_common/baselibrary/utils/baserar.py +0 -57
  106. re_common-0.2.54/re_common/baselibrary/utils/baserequest.py +0 -279
  107. re_common-0.2.54/re_common/baselibrary/utils/baseset.py +0 -8
  108. re_common-0.2.54/re_common/baselibrary/utils/basesmb.py +0 -403
  109. re_common-0.2.54/re_common/baselibrary/utils/basestring.py +0 -382
  110. re_common-0.2.54/re_common/baselibrary/utils/basetime.py +0 -320
  111. re_common-0.2.54/re_common/baselibrary/utils/basetuple.py +0 -0
  112. re_common-0.2.54/re_common/baselibrary/utils/baseurl.py +0 -121
  113. re_common-0.2.54/re_common/baselibrary/utils/basezip.py +0 -57
  114. re_common-0.2.54/re_common/baselibrary/utils/core/__init__.py +0 -8
  115. re_common-0.2.54/re_common/baselibrary/utils/core/bottomutils.py +0 -18
  116. re_common-0.2.54/re_common/baselibrary/utils/core/mdeprecated.py +0 -327
  117. re_common-0.2.54/re_common/baselibrary/utils/core/mlamada.py +0 -16
  118. re_common-0.2.54/re_common/baselibrary/utils/core/msginfo.py +0 -25
  119. re_common-0.2.54/re_common/baselibrary/utils/core/requests_core.py +0 -103
  120. re_common-0.2.54/re_common/baselibrary/utils/fateadm.py +0 -429
  121. re_common-0.2.54/re_common/baselibrary/utils/importfun.py +0 -123
  122. re_common-0.2.54/re_common/baselibrary/utils/mfaker.py +0 -57
  123. re_common-0.2.54/re_common/baselibrary/utils/my_abc/__init__.py +0 -3
  124. re_common-0.2.54/re_common/baselibrary/utils/my_abc/better_abc.py +0 -32
  125. re_common-0.2.54/re_common/baselibrary/utils/mylogger.py +0 -414
  126. re_common-0.2.54/re_common/baselibrary/utils/myredisclient.py +0 -861
  127. re_common-0.2.54/re_common/baselibrary/utils/pipupgrade.py +0 -21
  128. re_common-0.2.54/re_common/baselibrary/utils/ringlist.py +0 -85
  129. re_common-0.2.54/re_common/baselibrary/utils/version_compare.py +0 -36
  130. re_common-0.2.54/re_common/baselibrary/utils/ydmhttp.py +0 -126
  131. re_common-0.2.54/re_common/facade/__init__.py +0 -1
  132. re_common-0.2.54/re_common/facade/lazy_import.py +0 -11
  133. re_common-0.2.54/re_common/facade/loggerfacade.py +0 -25
  134. re_common-0.2.54/re_common/facade/mysqlfacade.py +0 -467
  135. re_common-0.2.54/re_common/facade/now.py +0 -31
  136. re_common-0.2.54/re_common/facade/sqlite3facade.py +0 -257
  137. re_common-0.2.54/re_common/facade/use/__init__.py +0 -0
  138. re_common-0.2.54/re_common/facade/use/mq_use_facade.py +0 -83
  139. re_common-0.2.54/re_common/facade/use/proxy_use_facade.py +0 -20
  140. re_common-0.2.54/re_common/libtest/__init__.py +0 -0
  141. re_common-0.2.54/re_common/libtest/base_dict_test.py +0 -19
  142. re_common-0.2.54/re_common/libtest/baseavro_test.py +0 -13
  143. re_common-0.2.54/re_common/libtest/basefile_test.py +0 -14
  144. re_common-0.2.54/re_common/libtest/basemssql_test.py +0 -77
  145. re_common-0.2.54/re_common/libtest/baseodbc_test.py +0 -8
  146. re_common-0.2.54/re_common/libtest/basepandas_test.py +0 -38
  147. re_common-0.2.54/re_common/libtest/get_attr_test/__init__.py +0 -0
  148. re_common-0.2.54/re_common/libtest/get_attr_test/get_attr_test_settings.py +0 -14
  149. re_common-0.2.54/re_common/libtest/get_attr_test/settings.py +0 -55
  150. re_common-0.2.54/re_common/libtest/idencode_test.py +0 -54
  151. re_common-0.2.54/re_common/libtest/iniconfig_test.py +0 -35
  152. re_common-0.2.54/re_common/libtest/ip_test.py +0 -35
  153. re_common-0.2.54/re_common/libtest/merge_file_test.py +0 -20
  154. re_common-0.2.54/re_common/libtest/mfaker_test.py +0 -9
  155. re_common-0.2.54/re_common/libtest/mm3_test.py +0 -32
  156. re_common-0.2.54/re_common/libtest/mylogger_test.py +0 -89
  157. re_common-0.2.54/re_common/libtest/myparsel_test.py +0 -28
  158. re_common-0.2.54/re_common/libtest/mysql_test.py +0 -151
  159. re_common-0.2.54/re_common/libtest/pymongo_test.py +0 -21
  160. re_common-0.2.54/re_common/libtest/split_test.py +0 -12
  161. re_common-0.2.54/re_common/libtest/sqlite3_merge_test.py +0 -6
  162. re_common-0.2.54/re_common/libtest/sqlite3_test.py +0 -34
  163. re_common-0.2.54/re_common/libtest/tomlconfig_test.py +0 -30
  164. re_common-0.2.54/re_common/libtest/use_tools_test/__init__.py +0 -3
  165. re_common-0.2.54/re_common/libtest/user/__init__.py +0 -5
  166. re_common-0.2.54/re_common/studio/__init__.py +0 -5
  167. re_common-0.2.54/re_common/studio/assignment_expressions.py +0 -37
  168. re_common-0.2.54/re_common/studio/mydash/__init__.py +0 -0
  169. re_common-0.2.54/re_common/studio/mydash/test1.py +0 -19
  170. re_common-0.2.54/re_common/studio/pydashstudio/__init__.py +0 -0
  171. re_common-0.2.54/re_common/studio/pydashstudio/first.py +0 -9
  172. re_common-0.2.54/re_common/studio/streamlitstudio/__init__.py +0 -0
  173. re_common-0.2.54/re_common/studio/streamlitstudio/first_app.py +0 -66
  174. re_common-0.2.54/re_common/studio/streamlitstudio/uber_pickups.py +0 -24
  175. re_common-0.2.54/re_common/studio/test.py +0 -19
  176. re_common-0.2.54/re_common/vip/__init__.py +0 -0
  177. re_common-0.2.54/re_common/vip/base_step_process.py +0 -11
  178. re_common-0.2.54/re_common/vip/baseencodeid.py +0 -91
  179. re_common-0.2.54/re_common/vip/changetaskname.py +0 -28
  180. re_common-0.2.54/re_common/vip/core_var.py +0 -24
  181. re_common-0.2.54/re_common/vip/mmh3Hash.py +0 -90
  182. re_common-0.2.54/re_common/vip/proxy/__init__.py +0 -0
  183. re_common-0.2.54/re_common/vip/proxy/allproxys.py +0 -127
  184. re_common-0.2.54/re_common/vip/proxy/allproxys_thread.py +0 -159
  185. re_common-0.2.54/re_common/vip/proxy/cnki_proxy.py +0 -153
  186. re_common-0.2.54/re_common/vip/proxy/kuaidaili.py +0 -87
  187. re_common-0.2.54/re_common/vip/proxy/proxy_all.py +0 -113
  188. re_common-0.2.54/re_common/vip/proxy/update_kuaidaili_0.py +0 -42
  189. re_common-0.2.54/re_common/vip/proxy/wanfang_proxy.py +0 -152
  190. re_common-0.2.54/re_common/vip/proxy/wp_proxy_all.py +0 -182
  191. re_common-0.2.54/re_common/vip/read_rawid_to_txt.py +0 -92
  192. re_common-0.2.54/re_common/vip/title/__init__.py +0 -5
  193. re_common-0.2.54/re_common/vip/title/transform/TransformBookTitleToZt.py +0 -125
  194. re_common-0.2.54/re_common/vip/title/transform/TransformConferenceTitleToZt.py +0 -139
  195. re_common-0.2.54/re_common/vip/title/transform/TransformCstadTitleToZt.py +0 -196
  196. re_common-0.2.54/re_common/vip/title/transform/TransformJournalTitleToZt.py +0 -203
  197. re_common-0.2.54/re_common/vip/title/transform/TransformPatentTitleToZt.py +0 -132
  198. re_common-0.2.54/re_common/vip/title/transform/TransformRegulationTitleToZt.py +0 -114
  199. re_common-0.2.54/re_common/vip/title/transform/TransformStandardTitleToZt.py +0 -135
  200. re_common-0.2.54/re_common/vip/title/transform/TransformThesisTitleToZt.py +0 -135
  201. re_common-0.2.54/re_common/vip/title/transform/__init__.py +0 -11
  202. re_common-0.2.54/re_common.egg-info/SOURCES.txt +0 -196
  203. {re_common-0.2.54 → re_common-2.0.1}/LICENSE +0 -0
  204. {re_common-0.2.54 → re_common-2.0.1}/re_common/__init__.py +0 -0
  205. {re_common-0.2.54/re_common/baselibrary/database → re_common-2.0.1/re_common/v2}/__init__.py +0 -0
  206. {re_common-0.2.54/re_common/baselibrary/pakge_other → re_common-2.0.1/re_common/v2/baselibrary}/__init__.py +0 -0
  207. {re_common-0.2.54/re_common/baselibrary/readconfig → re_common-2.0.1/re_common/v2/baselibrary/s3object}/__init__.py +0 -0
  208. {re_common-0.2.54/re_common/baselibrary/temporary → re_common-2.0.1/re_common/v2/baselibrary/tools}/__init__.py +0 -0
  209. /re_common-0.2.54/re_common/baselibrary/tools/__init__.py → /re_common-2.0.1/re_common/v2/baselibrary/tools/list_tools.py +0 -0
  210. {re_common-0.2.54/re_common/baselibrary/tools/all_requests → re_common-2.0.1/re_common/v2/baselibrary/utils}/__init__.py +0 -0
  211. {re_common-0.2.54 → re_common-2.0.1}/re_common.egg-info/dependency_links.txt +0 -0
  212. {re_common-0.2.54 → re_common-2.0.1}/re_common.egg-info/top_level.txt +0 -0
  213. {re_common-0.2.54 → re_common-2.0.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 0.2.54
3
+ Version: 2.0.1
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -0,0 +1,10 @@
1
+ # re-common
2
+
3
+ #### 介绍
4
+
5
+ v2 会完全重新开始的基础库 会慢慢将master的库转移过来
6
+
7
+ #### 安装教程
8
+
9
+ 1. pip install re-common
10
+
@@ -0,0 +1,230 @@
1
+ from boto3.session import Session
2
+
3
+
4
class BaseBoto3(object):
    """Thin convenience wrapper around a boto3 S3 client.

    Also works against S3-compatible object stores such as MinIO.  When the
    full credential triple is passed to ``__init__`` the session and client
    are created immediately; otherwise call ``set_key()`` and then
    ``conn_session()`` followed by ``get_client()``.
    """

    def __init__(self, aws_access_key_id="", aws_secret_access_key="", endpoint_url=""):
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        self.session = None
        self.client = None
        # Auto-connect only when every credential component is present.
        if self.aws_access_key_id and self.aws_secret_access_key and self.endpoint_url:
            self.conn_session()
            self.get_client()

    def set_key(self, aws_access_key_id, aws_secret_access_key, endpoint_url):
        """Store credentials for a later conn_session(); returns self for chaining."""
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        return self

    def conn_session(self):
        """Create and return the boto3 Session from the stored key pair.

        NOTE: ``assert`` is stripped under ``python -O``; these checks are a
        development aid, not runtime validation.
        """
        assert self.aws_access_key_id not in (None, '')
        assert self.aws_secret_access_key not in (None, '')
        self.session = Session(aws_access_key_id=self.aws_access_key_id,
                               aws_secret_access_key=self.aws_secret_access_key)
        return self.session

    def get_client(self):
        """Create the S3 client bound to ``endpoint_url``; returns self."""
        assert self.session is not None
        self.client = self.session.client('s3', endpoint_url=self.endpoint_url)
        return self

    def get_all_buckets(self):
        """Return the raw ``list_buckets`` response (metadata for all buckets)."""
        return self.client.list_buckets()

    def create_buckets(self, buckets_name):
        """Create a bucket and return the raw ``create_bucket`` response.

        Bucket names must not contain underscores (the assert message below
        is kept verbatim for callers that match on it).
        """
        assert buckets_name.find("_") == -1, "新建一个bucket桶(bucket name 中不能有_下划线)"
        return self.client.create_bucket(Bucket=buckets_name)

    def delete_buckets(self, bucket_name):
        """Delete a bucket (only an empty bucket can be deleted)."""
        response = self.client.delete_bucket(Bucket=bucket_name)
        return response

    def get_bucket(self, bucket_name):
        # Not implemented for the plain boto3 client API.
        raise Exception("无实现方法")

    def get_all_objs(self, bucket_name, prefix=None, continuation_token=None):
        """List object keys under *prefix* in *bucket_name*.

        ``list_objects_v2`` returns at most 1000 keys per call; when the
        response is truncated, pass the returned *continuation_token* into
        the next call to fetch the following page.

        Returns:
            (object_list, continuation_token): the keys of this page and the
            token for the next page (None when the listing is complete).
        """
        if continuation_token:
            response = self.client.list_objects_v2(Bucket=bucket_name,
                                                   Prefix=prefix,
                                                   ContinuationToken=continuation_token)
        else:
            response = self.client.list_objects_v2(Bucket=bucket_name,
                                                   Prefix=prefix)

        object_list = []
        # 'Contents' is absent when the prefix matches no objects.
        if 'Contents' in response:
            object_list = [obj['Key'] for obj in response['Contents']]

        continuation_token = None
        if response.get('IsTruncated'):  # truncated => more pages exist
            continuation_token = response.get('NextContinuationToken')

        return object_list, continuation_token

    def list_prefixes(self, bucket_name, prefix=None, Delimiter="/", continuation_token=None):
        """List the immediate child "directories" (common prefixes) of *prefix*.

        *prefix* must end with *Delimiter* (e.g. ``prefix="a/"`` when
        ``Delimiter="/"``).  Returned prefixes include *prefix* itself,
        e.g. ``['a/b/', 'a/c/']``.

        Returns:
            (object_list, prefix_list, continuation_token): keys directly
            under *prefix*, child prefixes, and the pagination token
            (None when the listing is complete).
        """
        if continuation_token:
            response = self.client.list_objects_v2(Bucket=bucket_name,
                                                   Prefix=prefix,
                                                   Delimiter=Delimiter,  # delimiter simulates a directory tree
                                                   ContinuationToken=continuation_token)
        else:
            response = self.client.list_objects_v2(Bucket=bucket_name,
                                                   Delimiter=Delimiter,
                                                   Prefix=prefix)

        object_list = []
        if 'Contents' in response:
            object_list = [obj['Key'] for obj in response['Contents']]

        Prefix_list = []
        # 'CommonPrefixes' holds the child "directories".
        if 'CommonPrefixes' in response:
            Prefix_list = [obj['Prefix'] for obj in response['CommonPrefixes']]

        continuation_token = None
        if response.get('IsTruncated'):
            continuation_token = response.get('NextContinuationToken')

        return object_list, Prefix_list, continuation_token

    def get_object_value(self, bucket_name, file_key, encoding='utf-8'):
        """Read an object and return its body decoded as text."""
        obj = self.client.get_object(Bucket=bucket_name, Key=file_key)
        body = obj['Body'].read().decode(encoding)
        return body

    def put_object(self, bucket_name, key, body):
        """Write *body* directly to *key*.

        Args:
            bucket_name: target bucket.
            key: object key.
            body: bytes payload (e.g. ``text.encode('utf-8')``).
        """
        self.client.put_object(Bucket=bucket_name,
                               Key=key,
                               Body=body)

    def download_file(self, bucket_name, key, local_file):
        """Download *key* to the local path *local_file*; returns None."""
        result = self.client.download_file(bucket_name, key, local_file)
        return result

    def upload_file(self, bucket_name, key, local_file):
        """Upload a local file.

        :param local_file: local file path
        :param bucket_name: bucket name
        :param key: remote object key, e.g. ``test1/test.pdf``
        """
        self.client.upload_file(local_file, bucket_name, key)

    def download_fileobj(self, bucket_name, key, fileobj):
        """Download *key* into a writable binary file object; returns None."""
        result = self.client.download_fileobj(bucket_name, key, fileobj)
        return result

    def upload_fileobj(self, bucket_name, key, fileobj):
        """Upload a readable binary file object (byte stream) to *key*."""
        self.client.upload_fileobj(fileobj, bucket_name, key)

    def check_exist_or_file_info(self, bucket_name, key):
        """Return ``head_object`` metadata for *key*, or None when the object
        does not exist (or the request fails).

        The response carries ``ContentLength``, ``ETag``, ``LastModified``,
        ``ContentType`` etc.
        """
        try:
            obj_info = self.client.head_object(
                Bucket=bucket_name,
                Key=key
            )
            return obj_info
        except Exception:  # head_object raises when the key is missing
            return None

    def get_prefix_count(self, bucket_name, obj_count, prefix, continuation_token=None):
        """Recursively count the objects under *prefix*.

        One listing request is issued per directory page, so this is slow on
        deep trees.  The pagination loop is capped at 10000 iterations as a
        safety net against a never-ending token chain.
        """
        for index in range(10000):
            obj_list, dir_list, token = self.list_prefixes(bucket_name=bucket_name,
                                                           prefix=prefix,
                                                           continuation_token=continuation_token)

            obj_count = obj_count + len(obj_list)
            # Recurse into each child directory.
            for dir_sub in dir_list:
                obj_count = self.get_prefix_count(bucket_name, obj_count, dir_sub)

            if token:
                continuation_token = token
            else:
                break

            if index > 10000 - 5:
                raise Exception("循环耗尽,请检查逻辑正确性")

        return obj_count
@@ -0,0 +1,24 @@
1
class DotDict(dict):
    """Dict whose keys are also reachable as attributes, recursively.

    ``d["a"]["b"]`` and ``d.a.b`` are equivalent.  Nested plain dicts are
    converted to ``DotDict`` eagerly in ``__init__`` and lazily on first
    attribute access for values assigned later via item syntax.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Recursively convert nested dicts so dotted access works at depth.
        for key, value in self.items():
            if isinstance(value, dict):
                self[key] = DotDict(value)

    def __getattr__(self, key):
        try:
            value = self[key]
        except KeyError:
            raise AttributeError(f"'DotDict' object has no attribute '{key}'")
        if isinstance(value, dict) and not isinstance(value, DotDict):
            # Bug fix: the old code returned a fresh DotDict copy on every
            # access, so attribute writes on nested values were silently
            # lost.  Convert once and store back so mutations persist.
            value = DotDict(value)
            self[key] = value
        return value

    def __setattr__(self, key, value):
        if isinstance(value, dict):  # keep nested values dot-accessible
            value = DotDict(value)
        self[key] = value
@@ -0,0 +1,33 @@
1
+ from typing import List
2
+
3
+ import jieba
4
+ from datasketch import MinHash
5
+
6
+
7
def tokenize(text: str, stopwords=None) -> List[str]:
    """Segment *text* with jieba and drop stopwords.

    Falls back to plain whitespace splitting when jieba yields mostly
    single-character tokens (over 60%), which indicates it failed to
    segment the text meaningfully.
    """
    stopwords = [] if stopwords is None else stopwords
    tokens = jieba.lcut(text)
    total = len(tokens)
    single_char = sum(1 for token in tokens if len(token) == 1)
    # Too many one-character tokens => jieba could not segment this text.
    if total != 0 and single_char / total > 0.6:
        tokens = [piece for piece in text.split() if piece.strip()]
    # Drop stopwords and whitespace-only tokens.
    return [token for token in tokens if token not in stopwords and token.strip()]
24
+
25
+
26
def create_minhash(words: List[str], num_perm=128) -> MinHash:
    """Build a MinHash signature from a list of tokens."""
    signature = MinHash(num_perm=num_perm)
    for token in words:
        signature.update(token.encode("utf-8"))
    return signature
@@ -0,0 +1,223 @@
1
+ import jieba
2
+ import re
3
+ from typing import List, Dict, Tuple, Set, Optional, Union
4
+ from datasketch import MinHash, MinHashLSH
5
+
6
+
7
class TextMatcher:
    """Near-duplicate text search built on jieba tokenization and MinHash LSH.

    Documents are tokenized, hashed into MinHash signatures and indexed in a
    MinHashLSH structure; queries return the ids of candidate-similar docs.
    Per-document MinHashes are intentionally not retained (memory saving),
    so exact Jaccard scores cannot be recomputed after insertion.
    """

    def __init__(
        self,
        threshold: float = 0.5,
        num_perm: int = 128,
        is_raw_texts=True,
        stopwords_path: Optional[str] = None,
        user_dict_path: Optional[str] = None,
    ):
        """
        Initialize the matcher.

        Args:
            threshold: LSH similarity threshold.
            num_perm: number of MinHash permutations.
            is_raw_texts: keep a copy of every added text (needed by get_text).
            stopwords_path: optional stopword file path (one word per line).
            user_dict_path: optional jieba user dictionary path.
        """
        self.threshold = threshold
        self.num_perm = num_perm
        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
        self.raw_texts: Dict[str, str] = {}
        self.is_raw_texts = is_raw_texts
        self.doc_counter = 0  # used to generate doc ids when none is given

        self.stopwords: Set[str] = set()
        if stopwords_path:
            self.load_stopwords(stopwords_path)

        if user_dict_path:
            jieba.load_userdict(user_dict_path)

    def load_stopwords(self, stopwords_path: str) -> None:
        """Load stopwords from a UTF-8 file, one word per line."""
        with open(stopwords_path, "r", encoding="utf-8") as f:
            self.stopwords = set(line.strip() for line in f)

    def preprocess_text(self, text: str) -> str:
        """Lowercase, strip non-word/CJK characters and collapse whitespace."""
        text = text.lower()
        text = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def tokenize(self, text: str) -> List[str]:
        """Segment *text* with jieba and drop stopwords.

        Falls back to whitespace splitting when jieba yields mostly
        single-character tokens (over 60%), indicating it failed to segment
        the text meaningfully.
        """
        words = jieba.lcut(text)
        one_char_size = len([i for i in words if len(i) == 1])
        all_size = len(words)
        if all_size != 0 and one_char_size / all_size > 0.6:
            words = [i for i in text.split() if i.strip()]
        # Drop stopwords and whitespace-only tokens.
        return [w for w in words if w not in self.stopwords and w.strip()]

    def create_minhash(self, words: List[str]) -> MinHash:
        """Build a MinHash signature for a token list."""
        minhash = MinHash(num_perm=self.num_perm)
        for word in words:
            minhash.update(word.encode("utf-8"))
        return minhash

    def add_document(self, text: str, doc_id: Optional[str] = None) -> str:
        """
        Add a document to the index.

        Args:
            text: document text.
            doc_id: explicit document id; auto-generated when omitted.

        Returns:
            The id under which the document was indexed.
        """
        if doc_id is None:
            doc_id = f"doc_{self.doc_counter}"
            self.doc_counter += 1

        processed_text = self.preprocess_text(text)
        minhash = self.create_minhash(self.tokenize(processed_text))

        if self.is_raw_texts:
            # Keep the original (unprocessed) text for get_text().
            self.raw_texts[doc_id] = text

        self.lsh.insert(doc_id, minhash)
        return doc_id

    def batch_add_documents(self, texts: Dict[str, str]) -> None:
        """Add many documents at once from a ``{doc_id: text}`` mapping."""
        for doc_id, text in texts.items():
            self.add_document(text, doc_id)

    def create_query_minhash(self, query: str):
        """Preprocess, tokenize and MinHash a raw query string."""
        processed_query = self.preprocess_text(query)
        query_words = self.tokenize(processed_query)
        return self.create_minhash(query_words)

    def find_similar(self, query_minhash: Union[MinHash, str], return_similarities: bool = False) -> Union[
            List[str], List[Tuple[str, float]]]:
        """
        Find candidate-similar documents.

        Args:
            query_minhash: a prebuilt MinHash, or a raw query string (which
                is hashed via create_query_minhash automatically).
            return_similarities: kept for interface compatibility.  Exact
                scores cannot be computed because per-document MinHashes are
                not stored, so this flag currently has no effect and a plain
                id list is always returned.

        Returns:
            List of candidate doc ids.
        """
        if isinstance(query_minhash, str):
            # Convenience: allow callers to pass the raw query text.
            query_minhash = self.create_query_minhash(query_minhash)
        return self.lsh.query(query_minhash)

    def get_text(self, doc_id: str) -> Optional[str]:
        """Return the stored raw text (requires ``is_raw_texts=True``)."""
        if self.is_raw_texts:
            return self.raw_texts.get(doc_id)
        raise Exception("没有开启存储")

    def remove_document(self, doc_id: str) -> bool:
        """
        Remove a document from the index.

        Returns:
            bool: True when the document was removed, False when *doc_id*
            was not indexed (previously this raised instead of returning
            False, contradicting the documented contract).
        """
        try:
            self.lsh.remove(doc_id)
        except (KeyError, ValueError):
            return False
        if self.is_raw_texts:
            self.raw_texts.pop(doc_id, None)
        return True

    def clear(self) -> None:
        """Drop every indexed document and reset the id counter."""
        self.lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
        self.raw_texts.clear()
        self.doc_counter = 0
189
+
190
+
191
if __name__ == "__main__":
    # Demo: build a matcher with a permissive similarity threshold.
    matcher = TextMatcher(
        threshold=0.1,  # similarity threshold
        num_perm=128,   # number of MinHash permutations
    )

    # Add a single document (id is auto-generated).
    doc_id = matcher.add_document(
        "北京是中国的首都"
    )

    # Batch-add documents with explicit ids.
    docs = {"doc1": "北京是一座现代化的大都市", "doc2": "上海是中国最大的城市", "doc3": "中国的首都是北京"}
    matcher.batch_add_documents(docs)

    # Bug fix: find_similar() takes a MinHash, not a raw string, so build
    # the query signature first (the old demo passed the string directly
    # and then unpacked scores that are never returned, crashing).
    query_minhash = matcher.create_query_minhash("北京首都")
    similar_docs = matcher.find_similar(query_minhash)
    print("相似文档ID:", similar_docs)

    # Per-document MinHashes are not stored, so similarity scores are not
    # available; show the matching texts instead.
    for matched_id in similar_docs:
        print(f"文档 {matched_id}: {matcher.get_text(matched_id)}")

    # Remove one document, then reset the whole index.
    matcher.remove_document("doc1")
    matcher.clear()
@@ -0,0 +1,60 @@
1
+ """
2
+ 并查集(Union-Find)是一种用于管理元素分组的数据结构,主要用于解决动态连通性问题。它支持以下两种核心操作:
3
+
4
+ 查找(Find):确定某个元素属于哪个集合。
5
+
6
+ 合并(Union):将两个集合合并为一个集合。
7
+
8
+ 并查集广泛应用于图论、网络连接、社交网络分析、图像处理等领域。
9
+ """
10
+
11
+
12
class UnionFind:
    """Disjoint-set (union-find) with path compression and union by rank.

    Elements are registered lazily: any value first seen by find() or
    union() becomes its own singleton set.
    """

    def __init__(self):
        # parent[x]: parent pointer; the trees formed by these pointers
        # represent the sets (a root points to itself).
        self.parent = {}
        # rank[x]: upper bound on the height of the tree rooted at x,
        # used to keep merged trees shallow.
        self.rank = {}

    def find(self, x):
        """Return the root representative of *x*, adding *x* if unseen.

        Applies full path compression: every node visited on the walk is
        re-pointed directly at the root.
        """
        if x not in self.parent:
            # Lazily register x as its own singleton set.
            self.parent[x] = x
            self.rank[x] = 1
            return x
        # First pass: locate the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # Second pass: compress the path onto the root.
        while self.parent[x] != root:
            self.parent[x], x = root, self.parent[x]
        return root

    def union(self, x, y):
        """Merge the sets containing *x* and *y* (union by rank).

        Unseen elements are added automatically. When both roots have equal
        rank, x's root wins and its rank grows by one.
        """
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return  # already in the same set
        # Ensure root_x is the root with the greater-or-equal rank.
        if self.rank[root_x] < self.rank[root_y]:
            root_x, root_y = root_y, root_x
        self.parent[root_y] = root_x
        if self.rank[root_x] == self.rank[root_y]:
            self.rank[root_x] += 1

    def get_groups(self):
        """Return {root: [members...]} for every registered element."""
        groups = {}
        # find() only rewrites parent values here (no new keys), so
        # iterating the dict while compressing paths is safe.
        for element in self.parent:
            groups.setdefault(self.find(element), []).append(element)
        return groups
@@ -0,0 +1,74 @@
1
# Business-specific string handling — special-purpose utilities for certain scenarios.
2
+ import re
3
+
4
+
5
def clean_organ_postcode(organ):
    """
    Normalize an organization string: strip bracketed/parenthesized content,
    drop standalone 6-digit numbers (postal codes), trim leading/trailing
    punctuation from each semicolon-separated part, and discard empty parts.

    Note: this replaces the Java-side ``formatOrgan``.

    Args:
        organ (str): raw organization string; may be None or empty and may
            contain brackets, semicolons, and postal codes.

    Returns:
        str: cleaned parts rejoined with ";" (no standalone 6-digit numbers).
    """
    text = organ or ""

    # Remove [...] and (...) including the brackets (non-greedy match).
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\(.*?\)", "", text)

    # Drop standalone 6-digit numbers; \b word boundaries keep digit runs
    # embedded in larger words untouched.
    text = re.sub(r"\b[0-9]{6}\b", "", text)

    # Punctuation to shave off the ends of each part.
    edge_punct = re.compile(r"^[!,.?;:#$%^&*+-]+|[!,.?;:#$%^&*+-]+$")

    kept_parts = []
    for part in text.split(";"):
        # Strip whitespace first (removing a postcode may leave blanks),
        # then shave punctuation from both ends; skip parts that end up empty.
        part = edge_punct.sub("", part.strip())
        if part:
            kept_parts.append(part)

    return ";".join(kept_parts).strip()
50
+
51
+
52
def get_first_organ(organ):
    """Return the first semicolon-separated part of *organ* that survives
    postcode/bracket cleaning, or "" when nothing remains."""
    if not organ:
        return ""
    for part in organ.strip().split(";"):
        cleaned = clean_organ_postcode(part)
        if cleaned:
            return cleaned
    return ""
63
+
64
+
65
def get_first_author(author: str) -> str:
    """Return the first non-empty author from a ";"-separated list, with
    bracketed annotations like "[1]" or "(note)" removed; "" if none."""
    if not author:
        return ""
    for candidate in author.strip().split(";"):
        candidate = re.sub(r"\[.*?]", "", candidate)
        candidate = re.sub(r"\(.*?\)", "", candidate)
        if candidate:
            return candidate
    return ""