re-common 10.0.39__py3-none-any.whl → 10.0.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221):
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +235 -220
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +497 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/tree_processor/__init__.py +0 -0
  172. re_common/v2/baselibrary/tools/tree_processor/builder.py +25 -0
  173. re_common/v2/baselibrary/tools/tree_processor/node.py +13 -0
  174. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  175. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  176. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  177. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  178. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  179. re_common/v2/baselibrary/utils/basedict.py +37 -37
  180. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  181. re_common/v2/baselibrary/utils/basepika.py +180 -180
  182. re_common/v2/baselibrary/utils/basetime.py +94 -77
  183. re_common/v2/baselibrary/utils/db.py +174 -156
  184. re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
  185. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  186. re_common/v2/baselibrary/utils/mq.py +83 -83
  187. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  188. re_common/v2/baselibrary/utils/string_bool.py +187 -186
  189. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  190. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  191. re_common/v2/baselibrary/utils/stringutils.py +312 -271
  192. re_common/vip/base_step_process.py +11 -11
  193. re_common/vip/baseencodeid.py +90 -90
  194. re_common/vip/changetaskname.py +28 -28
  195. re_common/vip/core_var.py +24 -24
  196. re_common/vip/mmh3Hash.py +89 -89
  197. re_common/vip/proxy/allproxys.py +127 -127
  198. re_common/vip/proxy/allproxys_thread.py +159 -159
  199. re_common/vip/proxy/cnki_proxy.py +153 -153
  200. re_common/vip/proxy/kuaidaili.py +87 -87
  201. re_common/vip/proxy/proxy_all.py +113 -113
  202. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  203. re_common/vip/proxy/wanfang_proxy.py +152 -152
  204. re_common/vip/proxy/wp_proxy_all.py +181 -181
  205. re_common/vip/read_rawid_to_txt.py +91 -91
  206. re_common/vip/title/__init__.py +5 -5
  207. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  208. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  209. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  210. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  211. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  212. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  213. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  214. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  215. re_common/vip/title/transform/__init__.py +10 -10
  216. {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/LICENSE +201 -201
  217. {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/METADATA +16 -16
  218. re_common-10.0.41.dist-info/RECORD +252 -0
  219. {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/WHEEL +1 -1
  220. re_common-10.0.39.dist-info/RECORD +0 -248
  221. {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/top_level.txt +0 -0
@@ -1,230 +1,230 @@
1
- from boto3.session import Session
2
-
3
-
4
class BaseBoto3(object):
    """Convenience wrapper around a boto3 S3 client.

    The instance stores the credentials and endpoint, and exposes helpers for
    bucket management, object listing, and upload/download. When all three
    constructor arguments are non-empty the session and client are created
    immediately; otherwise call ``set_key()``, ``conn_session()`` and
    ``get_client()`` yourself before using any other method.
    """

    def __init__(self, aws_access_key_id="", aws_secret_access_key="", endpoint_url=""):
        """Store the connection settings and connect if they are all provided."""
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        self.session = None
        self.client = None
        if self.aws_access_key_id and self.aws_secret_access_key and self.endpoint_url:
            self.conn_session()
            self.get_client()

    def set_key(self, aws_access_key_id, aws_secret_access_key, endpoint_url):
        """Replace the stored credentials and endpoint; returns ``self`` for chaining.

        This does not reconnect: an existing session/client keeps using the
        old settings until ``conn_session()`` and ``get_client()`` are called.
        """
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        return self

    def conn_session(self):
        """Create the boto3 ``Session`` from the stored credentials and return it.

        Raises:
            AssertionError: if either credential is ``None`` or empty.
        """
        # Raised explicitly (instead of ``assert``) so the check still runs
        # under ``python -O``; the exception type is unchanged for callers.
        if self.aws_access_key_id in (None, ''):
            raise AssertionError("aws_access_key_id must be set before connecting")
        if self.aws_secret_access_key in (None, ''):
            raise AssertionError("aws_secret_access_key must be set before connecting")
        self.session = Session(aws_access_key_id=self.aws_access_key_id,
                               aws_secret_access_key=self.aws_secret_access_key)
        return self.session

    def get_client(self):
        """Create the S3 client from the current session; returns ``self``.

        Raises:
            AssertionError: if ``conn_session()`` has not been called yet.
        """
        if self.session is None:
            raise AssertionError("conn_session() must be called before get_client()")
        self.client = self.session.client('s3', endpoint_url=self.endpoint_url)
        return self

    def get_all_buckets(self):
        """Return the raw ``list_buckets`` response describing every bucket."""
        return self.client.list_buckets()

    def create_buckets(self, buckets_name):
        """Create a bucket and return the raw ``create_bucket`` response.

        The response is a dict with a ``'ResponseMetadata'`` entry (request id,
        HTTP status code, HTTP headers, retry attempts) and a ``'Location'``
        entry such as ``'/create1'``.

        Raises:
            AssertionError: if the name contains an underscore.
        """
        # Bucket names must not contain "_"; checked explicitly so the guard
        # is not stripped under ``python -O``.
        if "_" in buckets_name:
            raise AssertionError("新建一个bucket桶(bucket name 中不能有_下划线)")
        return self.client.create_bucket(Bucket=buckets_name)

    def delete_buckets(self, bucket_name):
        """Delete a bucket (only an empty bucket can be deleted) and return the response."""
        return self.client.delete_bucket(Bucket=bucket_name)

    def get_bucket(self, bucket_name):
        """Not implemented; always raises ``Exception``."""
        raise Exception("无实现方法")

    def _list_objects_page(self, bucket_name, prefix=None, continuation_token=None, delimiter=None):
        """Fetch one ``list_objects_v2`` page, sending only the arguments that are set.

        Passing ``Prefix=None`` (or any other ``None`` value) is rejected by
        botocore's parameter validation, so unset options are omitted instead.
        """
        params = {"Bucket": bucket_name}
        if prefix is not None:
            params["Prefix"] = prefix
        if delimiter is not None:
            params["Delimiter"] = delimiter
        if continuation_token:
            params["ContinuationToken"] = continuation_token
        return self.client.list_objects_v2(**params)

    @staticmethod
    def _next_token(response):
        """Return the token for the next page, or ``None`` when the listing is complete."""
        if response.get('IsTruncated'):
            return response.get('NextContinuationToken')
        return None

    def get_all_objs(self, bucket_name, prefix=None, continuation_token=None):
        """List one page of object keys, optionally restricted to a key prefix.

        Args:
            bucket_name: bucket to list.
            prefix: only keys starting with this prefix are returned; ``None``
                lists the whole bucket.
            continuation_token: token returned by a previous call; pass it to
                fetch the next page when there are more than 1000 keys.

        Returns:
            ``(keys, next_token)``. ``next_token`` is ``None`` once the
            listing is no longer truncated.

        The underlying response is a dict with entries such as
        ``'IsTruncated'``, ``'Contents'`` (a list of dicts with ``'Key'``,
        ``'LastModified'``, ``'ETag'``, ``'Size'``, ``'StorageClass'``,
        ``'Owner'``), ``'Name'``, ``'Prefix'``, ``'Delimiter'``,
        ``'MaxKeys'``, ``'EncodingType'`` and ``'KeyCount'``.
        """
        response = self._list_objects_page(bucket_name, prefix, continuation_token)
        # 'Contents' is absent when nothing matches.
        object_list = [obj['Key'] for obj in response.get('Contents', [])]
        return object_list, self._next_token(response)

    def list_prefixes(self, bucket_name, prefix=None, Delimiter="/", continuation_token=None):
        """List one level below ``prefix``: its objects and its sub-"directories".

        Args:
            bucket_name: bucket to list.
            prefix: should end with ``Delimiter`` (for ``Delimiter="/"`` use
                ``prefix="a/"``); ``None`` lists from the bucket root.
            Delimiter: separator used to emulate a directory structure.
            continuation_token: token returned by a previous call; pass it to
                fetch the next page when there are more than 1000 entries.

        Returns:
            ``(keys, prefixes, next_token)``. ``prefixes`` are the common
            prefixes, e.g. ``['a/b/', 'a/c/']``, and include ``prefix`` itself.
        """
        response = self._list_objects_page(bucket_name, prefix, continuation_token,
                                           delimiter=Delimiter)
        # Both entries are absent from the response when empty.
        object_list = [obj['Key'] for obj in response.get('Contents', [])]
        Prefix_list = [obj['Prefix'] for obj in response.get('CommonPrefixes', [])]
        return object_list, Prefix_list, self._next_token(response)

    def get_object_value(self, bucket_name, file_key, encoding='utf-8'):
        """Download an object and return its whole body decoded as text."""
        obj = self.client.get_object(Bucket=bucket_name, Key=file_key)
        return obj['Body'].read().decode(encoding)

    def put_object(self, bucket_name, key, body):
        """Write ``body`` directly to ``key``.

        Args:
            bucket_name: target bucket.
            key: object key to write.
            body: content to store; text must be encoded first, e.g.
                ``text.encode('utf-8')``.
        """
        self.client.put_object(Bucket=bucket_name,
                               Key=key,
                               Body=body)

    def download_file(self, bucket_name, key, local_file):
        """Download ``key`` to the local path ``local_file``; returns ``None``."""
        return self.client.download_file(bucket_name, key, local_file)

    def upload_file(self, bucket_name, key, local_file):
        """Upload a local file.

        Args:
            bucket_name: target bucket.
            key: remote object key, e.g. ``test1/test.pdf``.
            local_file: path of the local file to upload.
        """
        self.client.upload_file(local_file, bucket_name, key)

    def download_fileobj(self, bucket_name, key, fileobj):
        """Download ``key`` into the writable binary file object ``fileobj``; returns ``None``."""
        return self.client.download_fileobj(bucket_name, key, fileobj)

    def upload_fileobj(self, bucket_name, key, fileobj):
        """Upload the contents of the binary stream ``fileobj`` to ``key``."""
        self.client.upload_fileobj(fileobj, bucket_name, key)

    def check_exist_or_file_info(self, bucket_name, key):
        """Return the ``head_object`` metadata for ``key``, or ``None`` on failure.

        On success the result is a dict with entries such as
        ``'ResponseMetadata'``, ``'AcceptRanges'``, ``'LastModified'``,
        ``'ContentLength'``, ``'ETag'``, ``'ContentType'`` and ``'Metadata'``.

        Any error raised by the request (not only "object does not exist")
        yields ``None``, so ``None`` means "could not get the info".
        """
        try:
            return self.client.head_object(Bucket=bucket_name, Key=key)
        except Exception:
            # Deliberately best-effort, but no longer swallows
            # KeyboardInterrupt / SystemExit as the bare ``except:`` did.
            return None

    def get_prefix_count(self, bucket_name, obj_count, prefix, continuation_token=None):
        """Recursively count the objects under ``prefix``.

        Every sub-"directory" needs its own listing request, so this is slow
        on large trees.

        Args:
            bucket_name: bucket to scan.
            obj_count: running total to add to (pass ``0`` to start).
            prefix: directory-like prefix to count under.
            continuation_token: page to resume from, if any.

        Returns:
            ``obj_count`` plus the number of objects found.

        Raises:
            Exception: if a single prefix needs close to 10000 pages, which is
                treated as a sign of a logic error.
        """
        max_pages = 10000
        for index in range(max_pages):
            obj_list, dir_list, token = self.list_prefixes(bucket_name=bucket_name,
                                                           prefix=prefix,
                                                           continuation_token=continuation_token)

            obj_count = obj_count + len(obj_list)
            for dir_sub in dir_list:
                obj_count = self.get_prefix_count(bucket_name, obj_count, dir_sub)

            if not token:
                break
            continuation_token = token

            # Safety net against an endless pagination loop.
            if index > max_pages - 5:
                raise Exception("循环耗尽,请检查逻辑正确性")

        return obj_count
1
+ from boto3.session import Session
2
+
3
+
4
class BaseBoto3(object):
    """Convenience wrapper around a boto3 S3 client.

    The instance stores the credentials and endpoint, and exposes helpers for
    bucket management, object listing, and upload/download. When all three
    constructor arguments are non-empty the session and client are created
    immediately; otherwise call ``set_key()``, ``conn_session()`` and
    ``get_client()`` yourself before using any other method.
    """

    def __init__(self, aws_access_key_id="", aws_secret_access_key="", endpoint_url=""):
        """Store the connection settings and connect if they are all provided."""
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        self.session = None
        self.client = None
        if self.aws_access_key_id and self.aws_secret_access_key and self.endpoint_url:
            self.conn_session()
            self.get_client()

    def set_key(self, aws_access_key_id, aws_secret_access_key, endpoint_url):
        """Replace the stored credentials and endpoint; returns ``self`` for chaining.

        This does not reconnect: an existing session/client keeps using the
        old settings until ``conn_session()`` and ``get_client()`` are called.
        """
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.endpoint_url = endpoint_url
        return self

    def conn_session(self):
        """Create the boto3 ``Session`` from the stored credentials and return it.

        Raises:
            AssertionError: if either credential is ``None`` or empty.
        """
        # Raised explicitly (instead of ``assert``) so the check still runs
        # under ``python -O``; the exception type is unchanged for callers.
        if self.aws_access_key_id in (None, ''):
            raise AssertionError("aws_access_key_id must be set before connecting")
        if self.aws_secret_access_key in (None, ''):
            raise AssertionError("aws_secret_access_key must be set before connecting")
        self.session = Session(aws_access_key_id=self.aws_access_key_id,
                               aws_secret_access_key=self.aws_secret_access_key)
        return self.session

    def get_client(self):
        """Create the S3 client from the current session; returns ``self``.

        Raises:
            AssertionError: if ``conn_session()`` has not been called yet.
        """
        if self.session is None:
            raise AssertionError("conn_session() must be called before get_client()")
        self.client = self.session.client('s3', endpoint_url=self.endpoint_url)
        return self

    def get_all_buckets(self):
        """Return the raw ``list_buckets`` response describing every bucket."""
        return self.client.list_buckets()

    def create_buckets(self, buckets_name):
        """Create a bucket and return the raw ``create_bucket`` response.

        The response is a dict with a ``'ResponseMetadata'`` entry (request id,
        HTTP status code, HTTP headers, retry attempts) and a ``'Location'``
        entry such as ``'/create1'``.

        Raises:
            AssertionError: if the name contains an underscore.
        """
        # Bucket names must not contain "_"; checked explicitly so the guard
        # is not stripped under ``python -O``.
        if "_" in buckets_name:
            raise AssertionError("新建一个bucket桶(bucket name 中不能有_下划线)")
        return self.client.create_bucket(Bucket=buckets_name)

    def delete_buckets(self, bucket_name):
        """Delete a bucket (only an empty bucket can be deleted) and return the response."""
        return self.client.delete_bucket(Bucket=bucket_name)

    def get_bucket(self, bucket_name):
        """Not implemented; always raises ``Exception``."""
        raise Exception("无实现方法")

    def _list_objects_page(self, bucket_name, prefix=None, continuation_token=None, delimiter=None):
        """Fetch one ``list_objects_v2`` page, sending only the arguments that are set.

        Passing ``Prefix=None`` (or any other ``None`` value) is rejected by
        botocore's parameter validation, so unset options are omitted instead.
        """
        params = {"Bucket": bucket_name}
        if prefix is not None:
            params["Prefix"] = prefix
        if delimiter is not None:
            params["Delimiter"] = delimiter
        if continuation_token:
            params["ContinuationToken"] = continuation_token
        return self.client.list_objects_v2(**params)

    @staticmethod
    def _next_token(response):
        """Return the token for the next page, or ``None`` when the listing is complete."""
        if response.get('IsTruncated'):
            return response.get('NextContinuationToken')
        return None

    def get_all_objs(self, bucket_name, prefix=None, continuation_token=None):
        """List one page of object keys, optionally restricted to a key prefix.

        Args:
            bucket_name: bucket to list.
            prefix: only keys starting with this prefix are returned; ``None``
                lists the whole bucket.
            continuation_token: token returned by a previous call; pass it to
                fetch the next page when there are more than 1000 keys.

        Returns:
            ``(keys, next_token)``. ``next_token`` is ``None`` once the
            listing is no longer truncated.

        The underlying response is a dict with entries such as
        ``'IsTruncated'``, ``'Contents'`` (a list of dicts with ``'Key'``,
        ``'LastModified'``, ``'ETag'``, ``'Size'``, ``'StorageClass'``,
        ``'Owner'``), ``'Name'``, ``'Prefix'``, ``'Delimiter'``,
        ``'MaxKeys'``, ``'EncodingType'`` and ``'KeyCount'``.
        """
        response = self._list_objects_page(bucket_name, prefix, continuation_token)
        # 'Contents' is absent when nothing matches.
        object_list = [obj['Key'] for obj in response.get('Contents', [])]
        return object_list, self._next_token(response)

    def list_prefixes(self, bucket_name, prefix=None, Delimiter="/", continuation_token=None):
        """List one level below ``prefix``: its objects and its sub-"directories".

        Args:
            bucket_name: bucket to list.
            prefix: should end with ``Delimiter`` (for ``Delimiter="/"`` use
                ``prefix="a/"``); ``None`` lists from the bucket root.
            Delimiter: separator used to emulate a directory structure.
            continuation_token: token returned by a previous call; pass it to
                fetch the next page when there are more than 1000 entries.

        Returns:
            ``(keys, prefixes, next_token)``. ``prefixes`` are the common
            prefixes, e.g. ``['a/b/', 'a/c/']``, and include ``prefix`` itself.
        """
        response = self._list_objects_page(bucket_name, prefix, continuation_token,
                                           delimiter=Delimiter)
        # Both entries are absent from the response when empty.
        object_list = [obj['Key'] for obj in response.get('Contents', [])]
        Prefix_list = [obj['Prefix'] for obj in response.get('CommonPrefixes', [])]
        return object_list, Prefix_list, self._next_token(response)

    def get_object_value(self, bucket_name, file_key, encoding='utf-8'):
        """Download an object and return its whole body decoded as text."""
        obj = self.client.get_object(Bucket=bucket_name, Key=file_key)
        return obj['Body'].read().decode(encoding)

    def put_object(self, bucket_name, key, body):
        """Write ``body`` directly to ``key``.

        Args:
            bucket_name: target bucket.
            key: object key to write.
            body: content to store; text must be encoded first, e.g.
                ``text.encode('utf-8')``.
        """
        self.client.put_object(Bucket=bucket_name,
                               Key=key,
                               Body=body)

    def download_file(self, bucket_name, key, local_file):
        """Download ``key`` to the local path ``local_file``; returns ``None``."""
        return self.client.download_file(bucket_name, key, local_file)

    def upload_file(self, bucket_name, key, local_file):
        """Upload a local file.

        Args:
            bucket_name: target bucket.
            key: remote object key, e.g. ``test1/test.pdf``.
            local_file: path of the local file to upload.
        """
        self.client.upload_file(local_file, bucket_name, key)

    def download_fileobj(self, bucket_name, key, fileobj):
        """Download ``key`` into the writable binary file object ``fileobj``; returns ``None``."""
        return self.client.download_fileobj(bucket_name, key, fileobj)

    def upload_fileobj(self, bucket_name, key, fileobj):
        """Upload the contents of the binary stream ``fileobj`` to ``key``."""
        self.client.upload_fileobj(fileobj, bucket_name, key)

    def check_exist_or_file_info(self, bucket_name, key):
        """Return the ``head_object`` metadata for ``key``, or ``None`` on failure.

        On success the result is a dict with entries such as
        ``'ResponseMetadata'``, ``'AcceptRanges'``, ``'LastModified'``,
        ``'ContentLength'``, ``'ETag'``, ``'ContentType'`` and ``'Metadata'``.

        Any error raised by the request (not only "object does not exist")
        yields ``None``, so ``None`` means "could not get the info".
        """
        try:
            return self.client.head_object(Bucket=bucket_name, Key=key)
        except Exception:
            # Deliberately best-effort, but no longer swallows
            # KeyboardInterrupt / SystemExit as the bare ``except:`` did.
            return None

    def get_prefix_count(self, bucket_name, obj_count, prefix, continuation_token=None):
        """Recursively count the objects under ``prefix``.

        Every sub-"directory" needs its own listing request, so this is slow
        on large trees.

        Args:
            bucket_name: bucket to scan.
            obj_count: running total to add to (pass ``0`` to start).
            prefix: directory-like prefix to count under.
            continuation_token: page to resume from, if any.

        Returns:
            ``obj_count`` plus the number of objects found.

        Raises:
            Exception: if a single prefix needs close to 10000 pages, which is
                treated as a sign of a logic error.
        """
        max_pages = 10000
        for index in range(max_pages):
            obj_list, dir_list, token = self.list_prefixes(bucket_name=bucket_name,
                                                           prefix=prefix,
                                                           continuation_token=continuation_token)

            obj_count = obj_count + len(obj_list)
            for dir_sub in dir_list:
                obj_count = self.get_prefix_count(bucket_name, obj_count, dir_sub)

            if not token:
                break
            continuation_token = token

            # Safety net against an endless pagination loop.
            if index > max_pages - 5:
                raise Exception("循环耗尽,请检查逻辑正确性")

        return obj_count