re-common 10.0.39__py3-none-any.whl → 10.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +235 -220
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +497 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/tree_processor/__init__.py +0 -0
- re_common/v2/baselibrary/tools/tree_processor/builder.py +25 -0
- re_common/v2/baselibrary/tools/tree_processor/node.py +13 -0
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +94 -77
- re_common/v2/baselibrary/utils/db.py +174 -156
- re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +187 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +312 -271
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/LICENSE +201 -201
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/METADATA +16 -16
- re_common-10.0.41.dist-info/RECORD +252 -0
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/WHEEL +1 -1
- re_common-10.0.39.dist-info/RECORD +0 -248
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/top_level.txt +0 -0
|
@@ -1,187 +1,187 @@
|
|
|
1
|
-
import gzip
|
|
2
|
-
import io
|
|
3
|
-
import json
|
|
4
|
-
from io import BytesIO
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import List, Generator
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
9
|
-
from hdfs import InsecureClient
|
|
10
|
-
|
|
11
|
-
from re_common.v2.baselibrary.tools.data_processer.base import BaseFileReader
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class HDFSFileReader(BaseFileReader):
|
|
15
|
-
def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
|
|
16
|
-
super().__init__(batch_size)
|
|
17
|
-
self.client = InsecureClient(hdfs_url, user=hdfs_user)
|
|
18
|
-
|
|
19
|
-
def list_files(self, path: str) -> List[str]:
|
|
20
|
-
return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0] != '_SUCCESS']
|
|
21
|
-
|
|
22
|
-
def count_lines(self, file_path: str) -> int:
|
|
23
|
-
with self.client.read(file_path) as f:
|
|
24
|
-
return sum(1 for _ in f)
|
|
25
|
-
|
|
26
|
-
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
27
|
-
# 批量读取后 处理 缺点 连接可能会断
|
|
28
|
-
with self.client.read(file_path) as f:
|
|
29
|
-
while True:
|
|
30
|
-
batch = []
|
|
31
|
-
for _ in range(self.batch_size):
|
|
32
|
-
try:
|
|
33
|
-
line = next(f)
|
|
34
|
-
line = line.decode('utf-8')
|
|
35
|
-
if line.strip():
|
|
36
|
-
batch.append(line.strip())
|
|
37
|
-
except StopIteration:
|
|
38
|
-
break
|
|
39
|
-
if not batch:
|
|
40
|
-
break
|
|
41
|
-
yield batch
|
|
42
|
-
|
|
43
|
-
def read_all(self, file_path: str) -> List[List[str]]:
|
|
44
|
-
# 一次读取返回所有后批量处理缺点 内存占用
|
|
45
|
-
with self.client.read(file_path) as f:
|
|
46
|
-
lines = [line.decode('utf-8').strip() for line in f if line.decode('utf-8').strip()]
|
|
47
|
-
return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class HDFSGZFileReader(BaseFileReader):
|
|
51
|
-
def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
|
|
52
|
-
super().__init__(batch_size)
|
|
53
|
-
self.hdfs_url = hdfs_url
|
|
54
|
-
self.hdfs_user = hdfs_user
|
|
55
|
-
self.client = None
|
|
56
|
-
|
|
57
|
-
def _init_client(self):
|
|
58
|
-
if self.client is None:
|
|
59
|
-
self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
|
|
60
|
-
return self
|
|
61
|
-
|
|
62
|
-
def list_files(self, path: str) -> List[str]:
|
|
63
|
-
self._init_client()
|
|
64
|
-
return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0].endswith(".gz")]
|
|
65
|
-
|
|
66
|
-
def count_lines(self, file_path: str) -> int:
|
|
67
|
-
self._init_client()
|
|
68
|
-
with self.client.read(file_path) as f:
|
|
69
|
-
with gzip.GzipFile(fileobj=f) as gz:
|
|
70
|
-
return sum(1 for _ in gz)
|
|
71
|
-
|
|
72
|
-
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
73
|
-
self._init_client()
|
|
74
|
-
# 批量读取后 处理 缺点 连接可能会断
|
|
75
|
-
with self.client.read(file_path) as f:
|
|
76
|
-
with gzip.GzipFile(fileobj=f) as gz:
|
|
77
|
-
while True:
|
|
78
|
-
batch = []
|
|
79
|
-
for _ in range(self.batch_size):
|
|
80
|
-
try:
|
|
81
|
-
line = next(gz)
|
|
82
|
-
if line.strip():
|
|
83
|
-
batch.append(line.decode("utf-8"))
|
|
84
|
-
except StopIteration:
|
|
85
|
-
break
|
|
86
|
-
if not batch:
|
|
87
|
-
break
|
|
88
|
-
yield batch
|
|
89
|
-
|
|
90
|
-
def read_all(self, file_path: str) -> List[List[str]]:
|
|
91
|
-
self._init_client()
|
|
92
|
-
# 一次读取返回所有后批量处理缺点 内存占用
|
|
93
|
-
with self.client.read(file_path) as reader:
|
|
94
|
-
compressed_data = reader.read()
|
|
95
|
-
with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:
|
|
96
|
-
content = gz_file.read().decode("utf-8")
|
|
97
|
-
lines = [i for i in content.split("\n") if i.strip()]
|
|
98
|
-
return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
class HDFSParquetFileReader(BaseFileReader):
|
|
102
|
-
def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
|
|
103
|
-
super().__init__(batch_size)
|
|
104
|
-
self.client = InsecureClient(hdfs_url, user=hdfs_user)
|
|
105
|
-
|
|
106
|
-
def list_files(self, path: str) -> List[str]:
|
|
107
|
-
return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0].endswith(".parquet")]
|
|
108
|
-
|
|
109
|
-
def count_lines(self, file_path: str) -> int:
|
|
110
|
-
with self.client.read(file_path) as f:
|
|
111
|
-
data = f.read()
|
|
112
|
-
df = pd.read_parquet(io.BytesIO(data))
|
|
113
|
-
count = len(df)
|
|
114
|
-
return count
|
|
115
|
-
|
|
116
|
-
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
117
|
-
# 批量读取后 处理 缺点 连接可能会断
|
|
118
|
-
with self.client.read(file_path) as f:
|
|
119
|
-
data = f.read()
|
|
120
|
-
df = pd.read_parquet(io.BytesIO(data))
|
|
121
|
-
records = [json.dumps(row, ensure_ascii=False) for row in df.to_dict(orient='records')]
|
|
122
|
-
for i in range(0, len(records), self.batch_size):
|
|
123
|
-
yield records[i: i + self.batch_size]
|
|
124
|
-
|
|
125
|
-
def read_all(self, file_path: str) -> List[List[str]]:
|
|
126
|
-
# 一次读取返回所有后批量处理缺点 内存占用
|
|
127
|
-
with self.client.read(file_path) as f:
|
|
128
|
-
data = f.read()
|
|
129
|
-
df = pd.read_parquet(io.BytesIO(data))
|
|
130
|
-
records = [json.dumps(row, ensure_ascii=False) for row in df.to_dict(orient='records')]
|
|
131
|
-
return [records[i: i + self.batch_size] for i in range(0, len(records), self.batch_size)]
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
class LocalGZFileReader(BaseFileReader):
|
|
135
|
-
def list_files(self, path: str) -> List[str]:
|
|
136
|
-
return [str(p) for p in Path(path).rglob("*.gz")]
|
|
137
|
-
|
|
138
|
-
def count_lines(self, file_path: str) -> int:
|
|
139
|
-
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
|
|
140
|
-
return sum(1 for _ in f)
|
|
141
|
-
|
|
142
|
-
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
143
|
-
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
|
|
144
|
-
while True:
|
|
145
|
-
batch = []
|
|
146
|
-
for _ in range(self.batch_size):
|
|
147
|
-
line = f.readline()
|
|
148
|
-
if not line:
|
|
149
|
-
break
|
|
150
|
-
if line.strip():
|
|
151
|
-
batch.append(line.strip())
|
|
152
|
-
if not batch:
|
|
153
|
-
break
|
|
154
|
-
yield batch
|
|
155
|
-
|
|
156
|
-
def read_all(self, file_path: str) -> List[List[str]]:
|
|
157
|
-
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
|
|
158
|
-
lines = [line.strip() for line in f if line.strip()]
|
|
159
|
-
return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
class LocalFileReader(BaseFileReader):
|
|
163
|
-
def list_files(self, path: str) -> List[str]:
|
|
164
|
-
return [str(p) for p in Path(path).rglob("*") if p.is_file()]
|
|
165
|
-
|
|
166
|
-
def count_lines(self, file_path: str) -> int:
|
|
167
|
-
with open(file_path, 'r', encoding='utf-8') as f:
|
|
168
|
-
return sum(1 for _ in f)
|
|
169
|
-
|
|
170
|
-
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
171
|
-
with open(file_path, 'r', encoding='utf-8') as f:
|
|
172
|
-
while True:
|
|
173
|
-
batch = []
|
|
174
|
-
for _ in range(self.batch_size):
|
|
175
|
-
line = f.readline()
|
|
176
|
-
if not line:
|
|
177
|
-
break
|
|
178
|
-
if line.strip():
|
|
179
|
-
batch.append(line.strip())
|
|
180
|
-
if not batch:
|
|
181
|
-
break
|
|
182
|
-
yield batch
|
|
183
|
-
|
|
184
|
-
def read_all(self, file_path: str) -> List[List[str]]:
|
|
185
|
-
with open(file_path, 'r', encoding='utf-8') as f:
|
|
186
|
-
lines = [line.strip() for line in f if line.strip()]
|
|
187
|
-
return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
1
|
+
import gzip
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import List, Generator
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from hdfs import InsecureClient
|
|
10
|
+
|
|
11
|
+
from re_common.v2.baselibrary.tools.data_processer.base import BaseFileReader
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HDFSFileReader(BaseFileReader):
|
|
15
|
+
def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
|
|
16
|
+
super().__init__(batch_size)
|
|
17
|
+
self.client = InsecureClient(hdfs_url, user=hdfs_user)
|
|
18
|
+
|
|
19
|
+
def list_files(self, path: str) -> List[str]:
|
|
20
|
+
return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0] != '_SUCCESS']
|
|
21
|
+
|
|
22
|
+
def count_lines(self, file_path: str) -> int:
|
|
23
|
+
with self.client.read(file_path) as f:
|
|
24
|
+
return sum(1 for _ in f)
|
|
25
|
+
|
|
26
|
+
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
27
|
+
# 批量读取后 处理 缺点 连接可能会断
|
|
28
|
+
with self.client.read(file_path) as f:
|
|
29
|
+
while True:
|
|
30
|
+
batch = []
|
|
31
|
+
for _ in range(self.batch_size):
|
|
32
|
+
try:
|
|
33
|
+
line = next(f)
|
|
34
|
+
line = line.decode('utf-8')
|
|
35
|
+
if line.strip():
|
|
36
|
+
batch.append(line.strip())
|
|
37
|
+
except StopIteration:
|
|
38
|
+
break
|
|
39
|
+
if not batch:
|
|
40
|
+
break
|
|
41
|
+
yield batch
|
|
42
|
+
|
|
43
|
+
def read_all(self, file_path: str) -> List[List[str]]:
|
|
44
|
+
# 一次读取返回所有后批量处理缺点 内存占用
|
|
45
|
+
with self.client.read(file_path) as f:
|
|
46
|
+
lines = [line.decode('utf-8').strip() for line in f if line.decode('utf-8').strip()]
|
|
47
|
+
return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class HDFSGZFileReader(BaseFileReader):
|
|
51
|
+
def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
|
|
52
|
+
super().__init__(batch_size)
|
|
53
|
+
self.hdfs_url = hdfs_url
|
|
54
|
+
self.hdfs_user = hdfs_user
|
|
55
|
+
self.client = None
|
|
56
|
+
|
|
57
|
+
def _init_client(self):
|
|
58
|
+
if self.client is None:
|
|
59
|
+
self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
|
|
60
|
+
return self
|
|
61
|
+
|
|
62
|
+
def list_files(self, path: str) -> List[str]:
|
|
63
|
+
self._init_client()
|
|
64
|
+
return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0].endswith(".gz")]
|
|
65
|
+
|
|
66
|
+
def count_lines(self, file_path: str) -> int:
|
|
67
|
+
self._init_client()
|
|
68
|
+
with self.client.read(file_path) as f:
|
|
69
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
|
70
|
+
return sum(1 for _ in gz)
|
|
71
|
+
|
|
72
|
+
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
73
|
+
self._init_client()
|
|
74
|
+
# 批量读取后 处理 缺点 连接可能会断
|
|
75
|
+
with self.client.read(file_path) as f:
|
|
76
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
|
77
|
+
while True:
|
|
78
|
+
batch = []
|
|
79
|
+
for _ in range(self.batch_size):
|
|
80
|
+
try:
|
|
81
|
+
line = next(gz)
|
|
82
|
+
if line.strip():
|
|
83
|
+
batch.append(line.decode("utf-8"))
|
|
84
|
+
except StopIteration:
|
|
85
|
+
break
|
|
86
|
+
if not batch:
|
|
87
|
+
break
|
|
88
|
+
yield batch
|
|
89
|
+
|
|
90
|
+
def read_all(self, file_path: str) -> List[List[str]]:
|
|
91
|
+
self._init_client()
|
|
92
|
+
# 一次读取返回所有后批量处理缺点 内存占用
|
|
93
|
+
with self.client.read(file_path) as reader:
|
|
94
|
+
compressed_data = reader.read()
|
|
95
|
+
with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:
|
|
96
|
+
content = gz_file.read().decode("utf-8")
|
|
97
|
+
lines = [i for i in content.split("\n") if i.strip()]
|
|
98
|
+
return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class HDFSParquetFileReader(BaseFileReader):
|
|
102
|
+
def __init__(self, batch_size: int = 1000, hdfs_url: str = "http://VIP-DC-MASTER-2:9870", hdfs_user: str = "root"):
|
|
103
|
+
super().__init__(batch_size)
|
|
104
|
+
self.client = InsecureClient(hdfs_url, user=hdfs_user)
|
|
105
|
+
|
|
106
|
+
def list_files(self, path: str) -> List[str]:
|
|
107
|
+
return [f"{path}/{f[0]}" for f in self.client.list(path, status=True) if f[0].endswith(".parquet")]
|
|
108
|
+
|
|
109
|
+
def count_lines(self, file_path: str) -> int:
|
|
110
|
+
with self.client.read(file_path) as f:
|
|
111
|
+
data = f.read()
|
|
112
|
+
df = pd.read_parquet(io.BytesIO(data))
|
|
113
|
+
count = len(df)
|
|
114
|
+
return count
|
|
115
|
+
|
|
116
|
+
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
117
|
+
# 批量读取后 处理 缺点 连接可能会断
|
|
118
|
+
with self.client.read(file_path) as f:
|
|
119
|
+
data = f.read()
|
|
120
|
+
df = pd.read_parquet(io.BytesIO(data))
|
|
121
|
+
records = [json.dumps(row, ensure_ascii=False) for row in df.to_dict(orient='records')]
|
|
122
|
+
for i in range(0, len(records), self.batch_size):
|
|
123
|
+
yield records[i: i + self.batch_size]
|
|
124
|
+
|
|
125
|
+
def read_all(self, file_path: str) -> List[List[str]]:
|
|
126
|
+
# 一次读取返回所有后批量处理缺点 内存占用
|
|
127
|
+
with self.client.read(file_path) as f:
|
|
128
|
+
data = f.read()
|
|
129
|
+
df = pd.read_parquet(io.BytesIO(data))
|
|
130
|
+
records = [json.dumps(row, ensure_ascii=False) for row in df.to_dict(orient='records')]
|
|
131
|
+
return [records[i: i + self.batch_size] for i in range(0, len(records), self.batch_size)]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class LocalGZFileReader(BaseFileReader):
|
|
135
|
+
def list_files(self, path: str) -> List[str]:
|
|
136
|
+
return [str(p) for p in Path(path).rglob("*.gz")]
|
|
137
|
+
|
|
138
|
+
def count_lines(self, file_path: str) -> int:
|
|
139
|
+
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
|
|
140
|
+
return sum(1 for _ in f)
|
|
141
|
+
|
|
142
|
+
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
143
|
+
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
|
|
144
|
+
while True:
|
|
145
|
+
batch = []
|
|
146
|
+
for _ in range(self.batch_size):
|
|
147
|
+
line = f.readline()
|
|
148
|
+
if not line:
|
|
149
|
+
break
|
|
150
|
+
if line.strip():
|
|
151
|
+
batch.append(line.strip())
|
|
152
|
+
if not batch:
|
|
153
|
+
break
|
|
154
|
+
yield batch
|
|
155
|
+
|
|
156
|
+
def read_all(self, file_path: str) -> List[List[str]]:
|
|
157
|
+
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
|
|
158
|
+
lines = [line.strip() for line in f if line.strip()]
|
|
159
|
+
return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class LocalFileReader(BaseFileReader):
|
|
163
|
+
def list_files(self, path: str) -> List[str]:
|
|
164
|
+
return [str(p) for p in Path(path).rglob("*") if p.is_file()]
|
|
165
|
+
|
|
166
|
+
def count_lines(self, file_path: str) -> int:
|
|
167
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
168
|
+
return sum(1 for _ in f)
|
|
169
|
+
|
|
170
|
+
def read_lines(self, file_path: str) -> Generator[List[str], None, None]:
|
|
171
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
172
|
+
while True:
|
|
173
|
+
batch = []
|
|
174
|
+
for _ in range(self.batch_size):
|
|
175
|
+
line = f.readline()
|
|
176
|
+
if not line:
|
|
177
|
+
break
|
|
178
|
+
if line.strip():
|
|
179
|
+
batch.append(line.strip())
|
|
180
|
+
if not batch:
|
|
181
|
+
break
|
|
182
|
+
yield batch
|
|
183
|
+
|
|
184
|
+
def read_all(self, file_path: str) -> List[List[str]]:
|
|
185
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
186
|
+
lines = [line.strip() for line in f if line.strip()]
|
|
187
|
+
return [lines[i: i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
@@ -1,38 +1,38 @@
|
|
|
1
|
-
import gzip
|
|
2
|
-
from io import BytesIO
|
|
3
|
-
from typing import List
|
|
4
|
-
|
|
5
|
-
from hdfs import InsecureClient
|
|
6
|
-
|
|
7
|
-
from re_common.v2.baselibrary.tools.data_processer.base import BaseFileWriter
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class HDFSFileWriter(BaseFileWriter):
|
|
11
|
-
def __init__(self, file_path: str, hdfs_url: str, hdfs_user: str, *args, **kwargs):
|
|
12
|
-
super().__init__(file_path, *args, **kwargs)
|
|
13
|
-
self.client = InsecureClient(hdfs_url, user=hdfs_user)
|
|
14
|
-
|
|
15
|
-
def write_lines(self, lines: List[str], file_path: str = None):
|
|
16
|
-
if file_path is None:
|
|
17
|
-
file_path = self.file_path
|
|
18
|
-
data = "\n".join(lines).encode(self.encoding)
|
|
19
|
-
if self.compress:
|
|
20
|
-
buf = BytesIO()
|
|
21
|
-
with gzip.GzipFile(fileobj=buf, mode="wb") as gz:
|
|
22
|
-
gz.write(data)
|
|
23
|
-
buf.seek(0)
|
|
24
|
-
self.client.write(file_path, data=buf, overwrite=self.overwrite)
|
|
25
|
-
else:
|
|
26
|
-
self.client.write(file_path, data=data, overwrite=self.overwrite)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class LocalFileWriter(BaseFileWriter):
|
|
30
|
-
def write_lines(self, lines: List[str], file_path: str, compress: bool = True, encoding="utf-8"):
|
|
31
|
-
if compress:
|
|
32
|
-
with gzip.open(file_path, 'wt', encoding=encoding) as f:
|
|
33
|
-
for line in lines:
|
|
34
|
-
f.write(f"{line}\n")
|
|
35
|
-
else:
|
|
36
|
-
with open(file_path, 'w', encoding=encoding) as f:
|
|
37
|
-
for line in lines:
|
|
38
|
-
f.write(f"{line}\n")
|
|
1
|
+
import gzip
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from hdfs import InsecureClient
|
|
6
|
+
|
|
7
|
+
from re_common.v2.baselibrary.tools.data_processer.base import BaseFileWriter
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class HDFSFileWriter(BaseFileWriter):
|
|
11
|
+
def __init__(self, file_path: str, hdfs_url: str, hdfs_user: str, *args, **kwargs):
|
|
12
|
+
super().__init__(file_path, *args, **kwargs)
|
|
13
|
+
self.client = InsecureClient(hdfs_url, user=hdfs_user)
|
|
14
|
+
|
|
15
|
+
def write_lines(self, lines: List[str], file_path: str = None):
|
|
16
|
+
if file_path is None:
|
|
17
|
+
file_path = self.file_path
|
|
18
|
+
data = "\n".join(lines).encode(self.encoding)
|
|
19
|
+
if self.compress:
|
|
20
|
+
buf = BytesIO()
|
|
21
|
+
with gzip.GzipFile(fileobj=buf, mode="wb") as gz:
|
|
22
|
+
gz.write(data)
|
|
23
|
+
buf.seek(0)
|
|
24
|
+
self.client.write(file_path, data=buf, overwrite=self.overwrite)
|
|
25
|
+
else:
|
|
26
|
+
self.client.write(file_path, data=data, overwrite=self.overwrite)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class LocalFileWriter(BaseFileWriter):
|
|
30
|
+
def write_lines(self, lines: List[str], file_path: str, compress: bool = True, encoding="utf-8"):
|
|
31
|
+
if compress:
|
|
32
|
+
with gzip.open(file_path, 'wt', encoding=encoding) as f:
|
|
33
|
+
for line in lines:
|
|
34
|
+
f.write(f"{line}\n")
|
|
35
|
+
else:
|
|
36
|
+
with open(file_path, 'w', encoding=encoding) as f:
|
|
37
|
+
for line in lines:
|
|
38
|
+
f.write(f"{line}\n")
|
|
@@ -1,44 +1,44 @@
|
|
|
1
|
-
class DotDict(dict):
|
|
2
|
-
"""
|
|
3
|
-
让字典成为对象 既可以用字典方式访问 也可以用点访问key
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
def __init__(self, *args, **kwargs):
|
|
7
|
-
super().__init__(*args, **kwargs)
|
|
8
|
-
# 递归地将嵌套字典转换为 DotDict
|
|
9
|
-
for key, value in self.items():
|
|
10
|
-
if isinstance(value, dict):
|
|
11
|
-
self[key] = DotDict(value)
|
|
12
|
-
|
|
13
|
-
def __getattr__(self, key):
|
|
14
|
-
try:
|
|
15
|
-
value = self[key]
|
|
16
|
-
if isinstance(value, dict): # 如果值是字典,继续转换为 DotDict
|
|
17
|
-
return DotDict(value)
|
|
18
|
-
return value
|
|
19
|
-
except KeyError:
|
|
20
|
-
raise AttributeError(f"'DotDict' object has no attribute '{key}'")
|
|
21
|
-
|
|
22
|
-
def __setattr__(self, key, value):
|
|
23
|
-
if isinstance(value, dict): # 如果值是字典,转换为 DotDict
|
|
24
|
-
value = DotDict(value)
|
|
25
|
-
self[key] = value
|
|
26
|
-
|
|
27
|
-
def to_dict(self):
|
|
28
|
-
"""
|
|
29
|
-
将 DotDict 实例转换为普通字典
|
|
30
|
-
"""
|
|
31
|
-
result = {}
|
|
32
|
-
for key, value in self.items():
|
|
33
|
-
if isinstance(value, DotDict):
|
|
34
|
-
result[key] = value.to_dict()
|
|
35
|
-
else:
|
|
36
|
-
result[key] = value
|
|
37
|
-
return result
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def none_to_empty_str(d):
|
|
41
|
-
for k, v in d.items():
|
|
42
|
-
if v is None:
|
|
43
|
-
d[k] = ""
|
|
44
|
-
return d
|
|
1
|
+
class DotDict(dict):
|
|
2
|
+
"""
|
|
3
|
+
让字典成为对象 既可以用字典方式访问 也可以用点访问key
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
def __init__(self, *args, **kwargs):
|
|
7
|
+
super().__init__(*args, **kwargs)
|
|
8
|
+
# 递归地将嵌套字典转换为 DotDict
|
|
9
|
+
for key, value in self.items():
|
|
10
|
+
if isinstance(value, dict):
|
|
11
|
+
self[key] = DotDict(value)
|
|
12
|
+
|
|
13
|
+
def __getattr__(self, key):
|
|
14
|
+
try:
|
|
15
|
+
value = self[key]
|
|
16
|
+
if isinstance(value, dict): # 如果值是字典,继续转换为 DotDict
|
|
17
|
+
return DotDict(value)
|
|
18
|
+
return value
|
|
19
|
+
except KeyError:
|
|
20
|
+
raise AttributeError(f"'DotDict' object has no attribute '{key}'")
|
|
21
|
+
|
|
22
|
+
def __setattr__(self, key, value):
|
|
23
|
+
if isinstance(value, dict): # 如果值是字典,转换为 DotDict
|
|
24
|
+
value = DotDict(value)
|
|
25
|
+
self[key] = value
|
|
26
|
+
|
|
27
|
+
def to_dict(self):
|
|
28
|
+
"""
|
|
29
|
+
将 DotDict 实例转换为普通字典
|
|
30
|
+
"""
|
|
31
|
+
result = {}
|
|
32
|
+
for key, value in self.items():
|
|
33
|
+
if isinstance(value, DotDict):
|
|
34
|
+
result[key] = value.to_dict()
|
|
35
|
+
else:
|
|
36
|
+
result[key] = value
|
|
37
|
+
return result
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def none_to_empty_str(d):
|
|
41
|
+
for k, v in d.items():
|
|
42
|
+
if v is None:
|
|
43
|
+
d[k] = ""
|
|
44
|
+
return d
|