re-common 10.0.39__py3-none-any.whl → 10.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +77 -77
- re_common/v2/baselibrary/utils/db.py +156 -156
- re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +186 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +271 -271
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/LICENSE +201 -201
- {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/METADATA +24 -16
- re_common-10.0.40.dist-info/RECORD +249 -0
- {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/WHEEL +1 -1
- re_common-10.0.39.dist-info/RECORD +0 -248
- {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/top_level.txt +0 -0
|
@@ -1,163 +1,163 @@
|
|
|
1
|
-
import gzip
|
|
2
|
-
from io import BytesIO
|
|
3
|
-
|
|
4
|
-
from hdfs import InsecureClient
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class HDFSUtils(object):
    """
    Convenience wrapper around ``hdfs.InsecureClient`` for common HDFS operations.

    Known limitation recorded by the original author: with InsecureClient,
    writing large files can fail to store the data without raising an error.
    """

    def __init__(self, hdfs_url, hdfs_user):
        """
        Create the underlying HDFS client.

        :param hdfs_url: HDFS URL, e.g. "http://namenode:50070"
        :param hdfs_user: HDFS user name
        """
        self.hdfs_url = hdfs_url
        self.hdfs_user = hdfs_user
        self.client = InsecureClient(hdfs_url, user=hdfs_user)

    def upload_file(self, local_path, hdfs_path, overwrite=False):
        """
        Upload a local file to HDFS.

        :param local_path: path of the local file
        :param hdfs_path: destination path on HDFS
        :param overwrite: whether to overwrite an existing file
        :return: None
        """
        # The client takes the HDFS destination first, then the local source.
        self.client.upload(hdfs_path, local_path, overwrite=overwrite)
        print(f"文件上传成功: {local_path} -> {hdfs_path}")

    def download_file(self, hdfs_path, local_path, overwrite=False):
        """
        Download a file from HDFS to the local file system.

        :param hdfs_path: source path on HDFS
        :param local_path: destination path on the local file system
        :param overwrite: whether to overwrite an existing file
        :return: None
        """
        self.client.download(hdfs_path, local_path, overwrite=overwrite)
        print(f"文件下载成功: {hdfs_path} -> {local_path}")

    def delete_file(self, hdfs_path, recursive=False):
        """
        Delete a file or directory on HDFS.

        :param hdfs_path: HDFS file or directory path
        :param recursive: whether to delete a directory recursively
        :return: the value returned by the client's ``delete`` call
                 (HdfsCLI documents it as False when nothing existed at the path)
        """
        deleted = self.client.delete(hdfs_path, recursive=recursive)
        # Only announce success when the client reports that something was removed;
        # previously the message was printed even when the path did not exist.
        if deleted:
            print(f"文件/目录删除成功: {hdfs_path}")
        return deleted

    def create_directory(self, hdfs_path):
        """
        Create a directory on HDFS.

        :param hdfs_path: HDFS directory path
        :return: None
        """
        self.client.makedirs(hdfs_path)
        print(f"目录创建成功: {hdfs_path}")

    def list_files(self, hdfs_path):
        """
        List the entries of an HDFS directory.

        :param hdfs_path: HDFS directory path
        :return: whatever the client's ``list`` call returns for that directory
        """
        return self.client.list(hdfs_path)

    def read_file(self, hdfs_path):
        """
        Read the whole content of an HDFS file into memory.

        :param hdfs_path: HDFS file path
        :return: the content as returned by the client's reader
                 (no encoding is passed, so presumably bytes - confirm)
        """
        with self.client.read(hdfs_path) as reader:
            content = reader.read()
        print(f"文件读取成功: {hdfs_path}")
        return content

    def read_gz_file(self, hdfs_path, encoding='utf-8'):
        """
        Read a gzip-compressed HDFS file and return its decoded text.

        The whole compressed file is loaded into memory before decompression.

        :param hdfs_path: HDFS path of a gzip file (the suffix is not checked here)
        :param encoding: text encoding of the decompressed data (default utf-8)
        :return: decompressed content as a string
        """
        # Read without an encoding so the client hands back the raw compressed bytes.
        with self.client.read(hdfs_path) as reader:
            compressed_data = reader.read()
        with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:
            content = gz_file.read().decode(encoding)
        print(f"文件读取成功: {hdfs_path}")
        return content

    def write_file(self, hdfs_path, content, overwrite=False, encoding='utf-8'):
        """
        Write content to an HDFS file.

        :param hdfs_path: HDFS file path
        :param content: content to write
        :param overwrite: whether to overwrite an existing file
        :param encoding: encoding handed to the client for the written content
        :return: None
        """
        with self.client.write(hdfs_path, overwrite=overwrite, encoding=encoding) as writer:
            writer.write(content)
        print(f"文件写入成功: {hdfs_path}")

    def write_file_kwargs(self, hdfs_path, content, **kwargs):
        """
        Write content to an HDFS file, forwarding arbitrary options to the client.

        :param hdfs_path: HDFS file path
        :param content: content to write
        :param kwargs: keyword arguments passed unchanged to the client's ``write``
        :return: None
        """
        with self.client.write(hdfs_path, **kwargs) as writer:
            writer.write(content)
        print(f"文件写入成功: {hdfs_path}")

    def safe_append_hdfs(self, hdfs_path, content, encoding='utf-8'):
        """
        Append content to an HDFS file, creating the file first if it is missing.

        The existence check and the write are two separate requests, so this is
        not atomic with respect to other writers.

        :param hdfs_path: HDFS file path
        :param content: content to write
        :param encoding: encoding handed to the client (default utf-8, the value
                         that used to be hard-coded)
        :return: None
        :raises Exception: any error from the client is printed and re-raised
        """
        try:
            file_exists = self.file_exists(hdfs_path)
            if not file_exists:
                print(f"文件 {hdfs_path} 不存在,将创建新文件")
            # The first write must not use append mode; later writes append.
            with self.client.write(hdfs_path, encoding=encoding, append=file_exists) as writer:
                writer.write(content)
        except Exception as e:
            print(f"文件操作失败: {str(e)}")
            raise

    def file_exists(self, hdfs_path):
        """
        Check whether a file or directory exists on HDFS.

        :param hdfs_path: HDFS file or directory path
        :return: True if the client returns a status for the path, False otherwise
        """
        # strict=False asks the client for None instead of an error on a missing path.
        status = self.client.status(hdfs_path, strict=False)
        return status is not None

    def rename_file(self, hdfs_src_path, hdfs_dst_path):
        """
        Rename or move a file/directory on HDFS.

        :param hdfs_src_path: source path
        :param hdfs_dst_path: destination path
        :return: None
        """
        self.client.rename(hdfs_src_path, hdfs_dst_path)
        print(f"文件/目录重命名成功: {hdfs_src_path} -> {hdfs_dst_path}")
|
|
1
|
+
import gzip
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
|
|
4
|
+
from hdfs import InsecureClient
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class HDFSUtils(object):
    """
    Convenience wrapper around ``hdfs.InsecureClient`` for common HDFS operations.

    Known limitation recorded by the original author: with InsecureClient,
    writing large files can fail to store the data without raising an error.
    """

    def __init__(self, hdfs_url, hdfs_user):
        """
        Create the underlying HDFS client.

        :param hdfs_url: HDFS URL, e.g. "http://namenode:50070"
        :param hdfs_user: HDFS user name
        """
        self.hdfs_url = hdfs_url
        self.hdfs_user = hdfs_user
        self.client = InsecureClient(hdfs_url, user=hdfs_user)

    def upload_file(self, local_path, hdfs_path, overwrite=False):
        """
        Upload a local file to HDFS.

        :param local_path: path of the local file
        :param hdfs_path: destination path on HDFS
        :param overwrite: whether to overwrite an existing file
        :return: None
        """
        # The client takes the HDFS destination first, then the local source.
        self.client.upload(hdfs_path, local_path, overwrite=overwrite)
        print(f"文件上传成功: {local_path} -> {hdfs_path}")

    def download_file(self, hdfs_path, local_path, overwrite=False):
        """
        Download a file from HDFS to the local file system.

        :param hdfs_path: source path on HDFS
        :param local_path: destination path on the local file system
        :param overwrite: whether to overwrite an existing file
        :return: None
        """
        self.client.download(hdfs_path, local_path, overwrite=overwrite)
        print(f"文件下载成功: {hdfs_path} -> {local_path}")

    def delete_file(self, hdfs_path, recursive=False):
        """
        Delete a file or directory on HDFS.

        :param hdfs_path: HDFS file or directory path
        :param recursive: whether to delete a directory recursively
        :return: the value returned by the client's ``delete`` call
                 (HdfsCLI documents it as False when nothing existed at the path)
        """
        deleted = self.client.delete(hdfs_path, recursive=recursive)
        # Only announce success when the client reports that something was removed;
        # previously the message was printed even when the path did not exist.
        if deleted:
            print(f"文件/目录删除成功: {hdfs_path}")
        return deleted

    def create_directory(self, hdfs_path):
        """
        Create a directory on HDFS.

        :param hdfs_path: HDFS directory path
        :return: None
        """
        self.client.makedirs(hdfs_path)
        print(f"目录创建成功: {hdfs_path}")

    def list_files(self, hdfs_path):
        """
        List the entries of an HDFS directory.

        :param hdfs_path: HDFS directory path
        :return: whatever the client's ``list`` call returns for that directory
        """
        return self.client.list(hdfs_path)

    def read_file(self, hdfs_path):
        """
        Read the whole content of an HDFS file into memory.

        :param hdfs_path: HDFS file path
        :return: the content as returned by the client's reader
                 (no encoding is passed, so presumably bytes - confirm)
        """
        with self.client.read(hdfs_path) as reader:
            content = reader.read()
        print(f"文件读取成功: {hdfs_path}")
        return content

    def read_gz_file(self, hdfs_path, encoding='utf-8'):
        """
        Read a gzip-compressed HDFS file and return its decoded text.

        The whole compressed file is loaded into memory before decompression.

        :param hdfs_path: HDFS path of a gzip file (the suffix is not checked here)
        :param encoding: text encoding of the decompressed data (default utf-8)
        :return: decompressed content as a string
        """
        # Read without an encoding so the client hands back the raw compressed bytes.
        with self.client.read(hdfs_path) as reader:
            compressed_data = reader.read()
        with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:
            content = gz_file.read().decode(encoding)
        print(f"文件读取成功: {hdfs_path}")
        return content

    def write_file(self, hdfs_path, content, overwrite=False, encoding='utf-8'):
        """
        Write content to an HDFS file.

        :param hdfs_path: HDFS file path
        :param content: content to write
        :param overwrite: whether to overwrite an existing file
        :param encoding: encoding handed to the client for the written content
        :return: None
        """
        with self.client.write(hdfs_path, overwrite=overwrite, encoding=encoding) as writer:
            writer.write(content)
        print(f"文件写入成功: {hdfs_path}")

    def write_file_kwargs(self, hdfs_path, content, **kwargs):
        """
        Write content to an HDFS file, forwarding arbitrary options to the client.

        :param hdfs_path: HDFS file path
        :param content: content to write
        :param kwargs: keyword arguments passed unchanged to the client's ``write``
        :return: None
        """
        with self.client.write(hdfs_path, **kwargs) as writer:
            writer.write(content)
        print(f"文件写入成功: {hdfs_path}")

    def safe_append_hdfs(self, hdfs_path, content, encoding='utf-8'):
        """
        Append content to an HDFS file, creating the file first if it is missing.

        The existence check and the write are two separate requests, so this is
        not atomic with respect to other writers.

        :param hdfs_path: HDFS file path
        :param content: content to write
        :param encoding: encoding handed to the client (default utf-8, the value
                         that used to be hard-coded)
        :return: None
        :raises Exception: any error from the client is printed and re-raised
        """
        try:
            file_exists = self.file_exists(hdfs_path)
            if not file_exists:
                print(f"文件 {hdfs_path} 不存在,将创建新文件")
            # The first write must not use append mode; later writes append.
            with self.client.write(hdfs_path, encoding=encoding, append=file_exists) as writer:
                writer.write(content)
        except Exception as e:
            print(f"文件操作失败: {str(e)}")
            raise

    def file_exists(self, hdfs_path):
        """
        Check whether a file or directory exists on HDFS.

        :param hdfs_path: HDFS file or directory path
        :return: True if the client returns a status for the path, False otherwise
        """
        # strict=False asks the client for None instead of an error on a missing path.
        status = self.client.status(hdfs_path, strict=False)
        return status is not None

    def rename_file(self, hdfs_src_path, hdfs_dst_path):
        """
        Rename or move a file/directory on HDFS.

        :param hdfs_src_path: source path
        :param hdfs_dst_path: destination path
        :return: None
        """
        self.client.rename(hdfs_src_path, hdfs_dst_path)
        print(f"文件/目录重命名成功: {hdfs_src_path} -> {hdfs_dst_path}")
|