re-common 10.0.39__py3-none-any.whl → 10.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +235 -220
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +497 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/tree_processor/__init__.py +0 -0
- re_common/v2/baselibrary/tools/tree_processor/builder.py +25 -0
- re_common/v2/baselibrary/tools/tree_processor/node.py +13 -0
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +94 -77
- re_common/v2/baselibrary/utils/db.py +174 -156
- re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +187 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +312 -271
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/LICENSE +201 -201
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/METADATA +16 -16
- re_common-10.0.41.dist-info/RECORD +252 -0
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/WHEEL +1 -1
- re_common-10.0.39.dist-info/RECORD +0 -248
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/top_level.txt +0 -0
|
@@ -1,338 +1,338 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import gzip
|
|
3
|
-
import json
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
import sqlite3
|
|
6
|
-
import time
|
|
7
|
-
import os
|
|
8
|
-
from io import BytesIO
|
|
9
|
-
from typing import Callable, Any, List
|
|
10
|
-
|
|
11
|
-
from hdfs import InsecureClient
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class HDFSDataProcessor:
|
|
15
|
-
def __init__(
|
|
16
|
-
self,
|
|
17
|
-
hdfs_url="http://VIP-DC-MASTER-2:9870",
|
|
18
|
-
hdfs_user="root",
|
|
19
|
-
db_file="processed_files.db",
|
|
20
|
-
batch_size=50,
|
|
21
|
-
retry_limit=3,
|
|
22
|
-
):
|
|
23
|
-
self.hdfs_url = hdfs_url
|
|
24
|
-
self.hdfs_user = hdfs_user
|
|
25
|
-
self.db_file = db_file
|
|
26
|
-
self.batch_size = batch_size
|
|
27
|
-
self.retry_limit = retry_limit
|
|
28
|
-
self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
|
|
29
|
-
self.read_hdfs_fanc = {"all": self.all_read_gz, "batch": self.batch_read_gz}
|
|
30
|
-
self.read_hdfs_model = "all"
|
|
31
|
-
self.init_db()
|
|
32
|
-
|
|
33
|
-
def init_db(self):
|
|
34
|
-
"""初始化 SQLite 数据库"""
|
|
35
|
-
with sqlite3.connect(self.db_file) as conn:
|
|
36
|
-
cursor = conn.cursor()
|
|
37
|
-
cursor.execute("""
|
|
38
|
-
CREATE TABLE IF NOT EXISTS processed_files (
|
|
39
|
-
file_path TEXT PRIMARY KEY
|
|
40
|
-
)
|
|
41
|
-
""")
|
|
42
|
-
conn.commit()
|
|
43
|
-
|
|
44
|
-
def save_processed_file(self, file_path):
|
|
45
|
-
"""保存处理过的文件"""
|
|
46
|
-
with sqlite3.connect(self.db_file) as conn:
|
|
47
|
-
cursor = conn.cursor()
|
|
48
|
-
cursor.execute(
|
|
49
|
-
"INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
|
|
50
|
-
(file_path,),
|
|
51
|
-
)
|
|
52
|
-
conn.commit()
|
|
53
|
-
|
|
54
|
-
def is_file_processed(self, file_path):
|
|
55
|
-
"""检查文件是否已处理"""
|
|
56
|
-
with sqlite3.connect(self.db_file) as conn:
|
|
57
|
-
cursor = conn.cursor()
|
|
58
|
-
cursor.execute(
|
|
59
|
-
"SELECT file_path FROM processed_files WHERE file_path = ?",
|
|
60
|
-
(file_path,),
|
|
61
|
-
)
|
|
62
|
-
result = cursor.fetchone()
|
|
63
|
-
return result is not None
|
|
64
|
-
|
|
65
|
-
def list_gz_files(self, hdfs_dir):
|
|
66
|
-
"""列出 HDFS 目录中的所有 gzip 文件"""
|
|
67
|
-
return [f"{hdfs_dir}/{file[0]}" for file in self.client.list(hdfs_dir, status=True) if file[0].endswith(".gz")]
|
|
68
|
-
|
|
69
|
-
def count_total_lines(self, gz_file_path: str):
|
|
70
|
-
with self.client.read(gz_file_path) as hdfs_file:
|
|
71
|
-
with gzip.GzipFile(fileobj=hdfs_file) as gz:
|
|
72
|
-
return sum(1 for _ in gz)
|
|
73
|
-
|
|
74
|
-
def batch_read_gz(self, gz_file_path: str):
|
|
75
|
-
"""分批读取 gz 文件"""
|
|
76
|
-
with self.client.read(gz_file_path) as hdfs_file:
|
|
77
|
-
with gzip.GzipFile(fileobj=hdfs_file) as gz:
|
|
78
|
-
while True:
|
|
79
|
-
lines = []
|
|
80
|
-
for _ in range(self.batch_size):
|
|
81
|
-
try:
|
|
82
|
-
line = next(gz)
|
|
83
|
-
if line.strip(): # 移除空行
|
|
84
|
-
lines.append(line.decode("utf-8")) # 解码
|
|
85
|
-
except StopIteration: # 文件已读完
|
|
86
|
-
break
|
|
87
|
-
if not lines:
|
|
88
|
-
break
|
|
89
|
-
yield lines
|
|
90
|
-
|
|
91
|
-
def all_read_gz(self, gz_file_path: str, encoding="utf-8"):
|
|
92
|
-
"""
|
|
93
|
-
读取 HDFS 上的 .gz 文件内容。
|
|
94
|
-
:param hdfs_path: HDFS 文件路径(必须以 .gz 结尾)
|
|
95
|
-
:param encoding: 文件编码格式(默认 utf-8)
|
|
96
|
-
:return: 文件内容
|
|
97
|
-
"""
|
|
98
|
-
with self.client.read(gz_file_path) as reader: # 以二进制模式读取
|
|
99
|
-
compressed_data = reader.read() # 读取压缩数据
|
|
100
|
-
with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file: # 解压缩
|
|
101
|
-
content = gz_file.read().decode(encoding) # 解码为字符串
|
|
102
|
-
print(f"文件读取成功: {gz_file_path}")
|
|
103
|
-
lines = [i for i in content.split("\n") if i.strip()]
|
|
104
|
-
result = [lines[i : i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
105
|
-
return result
|
|
106
|
-
|
|
107
|
-
async def process_data(self, data, process_func):
|
|
108
|
-
"""处理数据并执行处理函数"""
|
|
109
|
-
retry_count = 0
|
|
110
|
-
while retry_count < self.retry_limit:
|
|
111
|
-
try:
|
|
112
|
-
return await process_func(data) # 成功处理后退出
|
|
113
|
-
except Exception as e:
|
|
114
|
-
retry_count += 1
|
|
115
|
-
print(f"处理数据时发生错误: {e}, 正在重试 {retry_count}/{self.retry_limit}, data: {data}")
|
|
116
|
-
await asyncio.sleep(2**retry_count)
|
|
117
|
-
raise Exception(f"处理数据失败, 达到重试上限, data: {data}")
|
|
118
|
-
|
|
119
|
-
async def process_file(self, hdfs_file_path, process_func, write_dir: str):
|
|
120
|
-
"""处理单个 gz 文件"""
|
|
121
|
-
total_lines = self.count_total_lines(hdfs_file_path)
|
|
122
|
-
processed_lines = 0
|
|
123
|
-
start_time = time.time()
|
|
124
|
-
results = []
|
|
125
|
-
# # 这里根据不同的配置选用不同的读取文件的方法
|
|
126
|
-
for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
|
|
127
|
-
processing_start_time = time.time() # 记录本批处理开始时间
|
|
128
|
-
|
|
129
|
-
tasks = []
|
|
130
|
-
for line in lines:
|
|
131
|
-
try:
|
|
132
|
-
data = json.loads(line)
|
|
133
|
-
tasks.append(self.process_data(data, process_func))
|
|
134
|
-
except json.JSONDecodeError as e:
|
|
135
|
-
raise Exception(f"解析JSON失败: {e}, 行内容: {line.strip()}")
|
|
136
|
-
|
|
137
|
-
# await AsyncTaskPool(self.batch_size).run(tasks) # AsyncTaskPool 适用于一次提交所有任务, 限制并发数执行
|
|
138
|
-
results.extend(await asyncio.gather(*tasks))
|
|
139
|
-
|
|
140
|
-
processed_lines += len(lines)
|
|
141
|
-
|
|
142
|
-
elapsed_time = time.time() - start_time # 已用时间
|
|
143
|
-
processing_time = time.time() - processing_start_time # 本次处理时间
|
|
144
|
-
avg_processing_time = (
|
|
145
|
-
(elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
|
|
146
|
-
) # 平均每条数据的处理时间(毫秒)
|
|
147
|
-
|
|
148
|
-
# 估算剩余时间
|
|
149
|
-
remaining_time = (
|
|
150
|
-
((avg_processing_time / 1000) * (total_lines - processed_lines))
|
|
151
|
-
if processed_lines > 0
|
|
152
|
-
else float("inf")
|
|
153
|
-
)
|
|
154
|
-
|
|
155
|
-
# 显示总进度信息
|
|
156
|
-
print(
|
|
157
|
-
f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
|
|
158
|
-
f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
|
|
159
|
-
f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
|
|
160
|
-
)
|
|
161
|
-
|
|
162
|
-
def generate_write_data(results):
|
|
163
|
-
for res in results:
|
|
164
|
-
yield str(res) + "\n"
|
|
165
|
-
|
|
166
|
-
if write_dir is not None:
|
|
167
|
-
self.client.write(
|
|
168
|
-
write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}",
|
|
169
|
-
data=generate_write_data(results),
|
|
170
|
-
overwrite=True,
|
|
171
|
-
encoding="utf-8",
|
|
172
|
-
)
|
|
173
|
-
|
|
174
|
-
# 最终进度显示
|
|
175
|
-
final_elapsed_time = time.time() - start_time # 最终已用时间
|
|
176
|
-
print(
|
|
177
|
-
f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
|
|
178
|
-
f"总已用时间: {final_elapsed_time:.2f}秒 | "
|
|
179
|
-
f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
|
|
180
|
-
if processed_lines > 0
|
|
181
|
-
else "处理无数据"
|
|
182
|
-
)
|
|
183
|
-
|
|
184
|
-
self.save_processed_file(hdfs_file_path) # 保存处理过的文件
|
|
185
|
-
|
|
186
|
-
async def retry_process_file(self, hdfs_file_path, process_func, write_dir):
|
|
187
|
-
"""带重试机制的文件处理"""
|
|
188
|
-
retry_count = 0
|
|
189
|
-
while retry_count < self.retry_limit:
|
|
190
|
-
try:
|
|
191
|
-
await self.process_file(hdfs_file_path, process_func, write_dir)
|
|
192
|
-
return True # 成功处理后退出
|
|
193
|
-
except Exception as e:
|
|
194
|
-
retry_count += 1
|
|
195
|
-
print(f"处理文件 {hdfs_file_path} 时发生错误: {e},正在重试 {retry_count}/{self.retry_limit}")
|
|
196
|
-
await asyncio.sleep(2**retry_count)
|
|
197
|
-
print(f"处理文件 {hdfs_file_path} 失败,达到重试上限")
|
|
198
|
-
return False
|
|
199
|
-
# raise
|
|
200
|
-
|
|
201
|
-
async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any], write_dir: str = None):
|
|
202
|
-
"""批量更新所有 gz 文件"""
|
|
203
|
-
gz_files = self.list_gz_files(hdfs_dir)
|
|
204
|
-
all_succeed = True
|
|
205
|
-
for hdfs_file_path in gz_files:
|
|
206
|
-
if self.is_file_processed(hdfs_file_path):
|
|
207
|
-
print(f"跳过已处理文件: {hdfs_file_path}")
|
|
208
|
-
continue # 如果文件已处理,跳过
|
|
209
|
-
succeed = await self.retry_process_file(hdfs_file_path, process_func, write_dir) # 处理文件
|
|
210
|
-
if succeed is False:
|
|
211
|
-
all_succeed = False
|
|
212
|
-
|
|
213
|
-
if all_succeed:
|
|
214
|
-
# 处理完成后删除数据库文件
|
|
215
|
-
try:
|
|
216
|
-
if os.path.exists(self.db_file):
|
|
217
|
-
os.remove(self.db_file)
|
|
218
|
-
print(f"已删除断点重试文件: {self.db_file}")
|
|
219
|
-
except Exception as e:
|
|
220
|
-
print(f"删除断点重试文件失败: {e}")
|
|
221
|
-
|
|
222
|
-
async def process_file_bulk(self, hdfs_file_path, process_func):
|
|
223
|
-
"""按批次处理单个文件,批量数据传递给处理函数"""
|
|
224
|
-
total_lines = self.count_total_lines(hdfs_file_path)
|
|
225
|
-
processed_lines = 0
|
|
226
|
-
start_time = time.time()
|
|
227
|
-
|
|
228
|
-
tasks = []
|
|
229
|
-
# 这里根据不同的配置选用不同的读取文件的方法
|
|
230
|
-
for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
|
|
231
|
-
processing_start_time = time.time() # 记录本批处理开始时间
|
|
232
|
-
|
|
233
|
-
batch_data = []
|
|
234
|
-
for line in lines:
|
|
235
|
-
try:
|
|
236
|
-
data = json.loads(line)
|
|
237
|
-
batch_data.append(data)
|
|
238
|
-
except json.JSONDecodeError as e:
|
|
239
|
-
raise Exception(f"解析JSON失败: {e}, 行内容: {line.strip()}")
|
|
240
|
-
|
|
241
|
-
# 处理读取到的批次数据
|
|
242
|
-
if batch_data:
|
|
243
|
-
tasks.append(process_func(batch_data)) # 将批次数据传递给处理函数并收集任务
|
|
244
|
-
processed_lines += len(batch_data) # 更新已处理行数
|
|
245
|
-
|
|
246
|
-
# 当积累的任务数量达到 batch_size 时并发处理所有任务
|
|
247
|
-
if len(tasks) >= self.batch_size:
|
|
248
|
-
await asyncio.gather(*tasks) # 同时处理多个批次
|
|
249
|
-
|
|
250
|
-
elapsed_time = time.time() - start_time # 已用时间
|
|
251
|
-
processing_time = time.time() - processing_start_time # 本次处理时间
|
|
252
|
-
avg_processing_time = (
|
|
253
|
-
(elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
|
|
254
|
-
) # 平均每条数据的处理时间(毫秒)
|
|
255
|
-
|
|
256
|
-
# 估算剩余时间
|
|
257
|
-
remaining_time = (
|
|
258
|
-
((avg_processing_time / 1000) * (total_lines - processed_lines))
|
|
259
|
-
if processed_lines > 0
|
|
260
|
-
else float("inf")
|
|
261
|
-
)
|
|
262
|
-
|
|
263
|
-
# 显示总进度信息
|
|
264
|
-
print(
|
|
265
|
-
f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
|
|
266
|
-
f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
|
|
267
|
-
f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
|
|
268
|
-
)
|
|
269
|
-
|
|
270
|
-
# 清空任务列表,准备下一批处理
|
|
271
|
-
tasks.clear()
|
|
272
|
-
# 处理剩余的任务
|
|
273
|
-
if tasks:
|
|
274
|
-
await asyncio.gather(*tasks) # 处理未达到 batch_size 的剩余任务
|
|
275
|
-
|
|
276
|
-
# 最终进度显示
|
|
277
|
-
final_elapsed_time = time.time() - start_time # 最终已用时间
|
|
278
|
-
print(
|
|
279
|
-
f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
|
|
280
|
-
f"总已用时间: {final_elapsed_time:.2f}秒 | "
|
|
281
|
-
f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
|
|
282
|
-
if processed_lines > 0
|
|
283
|
-
else "处理无数据"
|
|
284
|
-
)
|
|
285
|
-
|
|
286
|
-
self.save_processed_file(hdfs_file_path)
|
|
287
|
-
|
|
288
|
-
async def retry_process_file_bulk(self, hdfs_file_path, process_func):
|
|
289
|
-
"""带重试机制的批量文件处理"""
|
|
290
|
-
retry_count = 0
|
|
291
|
-
while retry_count < self.retry_limit:
|
|
292
|
-
try:
|
|
293
|
-
await self.process_file_bulk(hdfs_file_path, process_func)
|
|
294
|
-
return True # 成功处理后退出
|
|
295
|
-
except Exception as e:
|
|
296
|
-
retry_count += 1
|
|
297
|
-
print(f"处理文件 {hdfs_file_path} 时发生错误: {e},正在重试 {retry_count}/{self.retry_limit}")
|
|
298
|
-
await asyncio.sleep(2**retry_count)
|
|
299
|
-
print(f"处理文件 {hdfs_file_path} 失败,达到重试上限")
|
|
300
|
-
return False
|
|
301
|
-
|
|
302
|
-
async def batch_process_file_bulk(self, hdfs_dir: str, process_func: Callable[[List[dict]], Any]):
|
|
303
|
-
"""批量处理 gz 文件中的数据"""
|
|
304
|
-
gz_files = self.list_gz_files(hdfs_dir)
|
|
305
|
-
all_succeed = True
|
|
306
|
-
for hdfs_file_path in gz_files:
|
|
307
|
-
if self.is_file_processed(hdfs_file_path):
|
|
308
|
-
print(f"跳过已处理文件: {hdfs_file_path}")
|
|
309
|
-
continue # 跳过已处理文件
|
|
310
|
-
succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func)
|
|
311
|
-
if succeed is False:
|
|
312
|
-
all_succeed = False
|
|
313
|
-
|
|
314
|
-
if all_succeed:
|
|
315
|
-
# 处理完成后删除数据库文件
|
|
316
|
-
try:
|
|
317
|
-
if os.path.exists(self.db_file):
|
|
318
|
-
os.remove(self.db_file)
|
|
319
|
-
print(f"已删除断点重试文件: {self.db_file}")
|
|
320
|
-
except Exception as e:
|
|
321
|
-
print(f"删除断点重试文件失败: {e}")
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
# # 使用示例
|
|
325
|
-
# async def update_refer(data: dict):
|
|
326
|
-
# ref_id = data["ref_id"]
|
|
327
|
-
# url = f"http://192.168.98.79:8150/v1/fact_refer/update/{ref_id}"
|
|
328
|
-
# update_data = data["update_data"]
|
|
329
|
-
# if not update_data:
|
|
330
|
-
# return
|
|
331
|
-
#
|
|
332
|
-
# # 此处为实际处理逻辑
|
|
333
|
-
# await ApiNetUtils.fetch_post(url=url, payload=update_data)
|
|
334
|
-
#
|
|
335
|
-
#
|
|
336
|
-
# if __name__ == "__main__":
|
|
337
|
-
# processor = HDFSDataProcessor() # 实例化数据处理类
|
|
338
|
-
# asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", update_refer))
|
|
1
|
+
import asyncio
|
|
2
|
+
import gzip
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import sqlite3
|
|
6
|
+
import time
|
|
7
|
+
import os
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from typing import Callable, Any, List
|
|
10
|
+
|
|
11
|
+
from hdfs import InsecureClient
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HDFSDataProcessor:
|
|
15
|
+
def __init__(
|
|
16
|
+
self,
|
|
17
|
+
hdfs_url="http://VIP-DC-MASTER-2:9870",
|
|
18
|
+
hdfs_user="root",
|
|
19
|
+
db_file="processed_files.db",
|
|
20
|
+
batch_size=50,
|
|
21
|
+
retry_limit=3,
|
|
22
|
+
):
|
|
23
|
+
self.hdfs_url = hdfs_url
|
|
24
|
+
self.hdfs_user = hdfs_user
|
|
25
|
+
self.db_file = db_file
|
|
26
|
+
self.batch_size = batch_size
|
|
27
|
+
self.retry_limit = retry_limit
|
|
28
|
+
self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
|
|
29
|
+
self.read_hdfs_fanc = {"all": self.all_read_gz, "batch": self.batch_read_gz}
|
|
30
|
+
self.read_hdfs_model = "all"
|
|
31
|
+
self.init_db()
|
|
32
|
+
|
|
33
|
+
def init_db(self):
|
|
34
|
+
"""初始化 SQLite 数据库"""
|
|
35
|
+
with sqlite3.connect(self.db_file) as conn:
|
|
36
|
+
cursor = conn.cursor()
|
|
37
|
+
cursor.execute("""
|
|
38
|
+
CREATE TABLE IF NOT EXISTS processed_files (
|
|
39
|
+
file_path TEXT PRIMARY KEY
|
|
40
|
+
)
|
|
41
|
+
""")
|
|
42
|
+
conn.commit()
|
|
43
|
+
|
|
44
|
+
def save_processed_file(self, file_path):
|
|
45
|
+
"""保存处理过的文件"""
|
|
46
|
+
with sqlite3.connect(self.db_file) as conn:
|
|
47
|
+
cursor = conn.cursor()
|
|
48
|
+
cursor.execute(
|
|
49
|
+
"INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
|
|
50
|
+
(file_path,),
|
|
51
|
+
)
|
|
52
|
+
conn.commit()
|
|
53
|
+
|
|
54
|
+
def is_file_processed(self, file_path):
|
|
55
|
+
"""检查文件是否已处理"""
|
|
56
|
+
with sqlite3.connect(self.db_file) as conn:
|
|
57
|
+
cursor = conn.cursor()
|
|
58
|
+
cursor.execute(
|
|
59
|
+
"SELECT file_path FROM processed_files WHERE file_path = ?",
|
|
60
|
+
(file_path,),
|
|
61
|
+
)
|
|
62
|
+
result = cursor.fetchone()
|
|
63
|
+
return result is not None
|
|
64
|
+
|
|
65
|
+
def list_gz_files(self, hdfs_dir):
|
|
66
|
+
"""列出 HDFS 目录中的所有 gzip 文件"""
|
|
67
|
+
return [f"{hdfs_dir}/{file[0]}" for file in self.client.list(hdfs_dir, status=True) if file[0].endswith(".gz")]
|
|
68
|
+
|
|
69
|
+
def count_total_lines(self, gz_file_path: str):
|
|
70
|
+
with self.client.read(gz_file_path) as hdfs_file:
|
|
71
|
+
with gzip.GzipFile(fileobj=hdfs_file) as gz:
|
|
72
|
+
return sum(1 for _ in gz)
|
|
73
|
+
|
|
74
|
+
def batch_read_gz(self, gz_file_path: str):
|
|
75
|
+
"""分批读取 gz 文件"""
|
|
76
|
+
with self.client.read(gz_file_path) as hdfs_file:
|
|
77
|
+
with gzip.GzipFile(fileobj=hdfs_file) as gz:
|
|
78
|
+
while True:
|
|
79
|
+
lines = []
|
|
80
|
+
for _ in range(self.batch_size):
|
|
81
|
+
try:
|
|
82
|
+
line = next(gz)
|
|
83
|
+
if line.strip(): # 移除空行
|
|
84
|
+
lines.append(line.decode("utf-8")) # 解码
|
|
85
|
+
except StopIteration: # 文件已读完
|
|
86
|
+
break
|
|
87
|
+
if not lines:
|
|
88
|
+
break
|
|
89
|
+
yield lines
|
|
90
|
+
|
|
91
|
+
def all_read_gz(self, gz_file_path: str, encoding="utf-8"):
|
|
92
|
+
"""
|
|
93
|
+
读取 HDFS 上的 .gz 文件内容。
|
|
94
|
+
:param hdfs_path: HDFS 文件路径(必须以 .gz 结尾)
|
|
95
|
+
:param encoding: 文件编码格式(默认 utf-8)
|
|
96
|
+
:return: 文件内容
|
|
97
|
+
"""
|
|
98
|
+
with self.client.read(gz_file_path) as reader: # 以二进制模式读取
|
|
99
|
+
compressed_data = reader.read() # 读取压缩数据
|
|
100
|
+
with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file: # 解压缩
|
|
101
|
+
content = gz_file.read().decode(encoding) # 解码为字符串
|
|
102
|
+
print(f"文件读取成功: {gz_file_path}")
|
|
103
|
+
lines = [i for i in content.split("\n") if i.strip()]
|
|
104
|
+
result = [lines[i : i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
|
|
105
|
+
return result
|
|
106
|
+
|
|
107
|
+
async def process_data(self, data, process_func):
|
|
108
|
+
"""处理数据并执行处理函数"""
|
|
109
|
+
retry_count = 0
|
|
110
|
+
while retry_count < self.retry_limit:
|
|
111
|
+
try:
|
|
112
|
+
return await process_func(data) # 成功处理后退出
|
|
113
|
+
except Exception as e:
|
|
114
|
+
retry_count += 1
|
|
115
|
+
print(f"处理数据时发生错误: {e}, 正在重试 {retry_count}/{self.retry_limit}, data: {data}")
|
|
116
|
+
await asyncio.sleep(2**retry_count)
|
|
117
|
+
raise Exception(f"处理数据失败, 达到重试上限, data: {data}")
|
|
118
|
+
|
|
119
|
+
async def process_file(self, hdfs_file_path, process_func, write_dir: str):
    """Process a single gz file batch by batch and optionally write the results.

    Every batch yielded by the configured reader is decoded as JSON lines, each
    record is handed to ``self.process_data`` and the batch is awaited
    concurrently. A progress line is printed after every batch. When
    ``write_dir`` is not None, the collected results are written through
    ``self.client.write`` to ``<write_dir>/<stem of hdfs_file_path>``, one
    ``str(result)`` per line. Finally ``self.save_processed_file`` is called
    for the file.

    Args:
        hdfs_file_path: Path of the file to read.
        process_func: Callable forwarded to ``self.process_data`` with each record.
        write_dir: Directory that receives the results, or None to skip writing.

    Raises:
        Exception: When a line is not valid JSON (chained to the decode error).
            Errors raised while awaiting ``self.process_data`` propagate too.
    """
    total_lines = self.count_total_lines(hdfs_file_path)
    processed_lines = 0
    start_time = time.time()
    results = []
    # The reader implementation is selected by configuration.
    for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
        processing_start_time = time.time()  # start of this batch

        # Decode the whole batch before creating any coroutine: a malformed
        # line then aborts the batch without leaving coroutines un-awaited.
        records = []
        for line in lines:
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                raise Exception(f"解析JSON失败: {e}, 行内容: {line.strip()}") from e

        tasks = [self.process_data(data, process_func) for data in records]
        results.extend(await asyncio.gather(*tasks))

        processed_lines += len(lines)

        elapsed_time = time.time() - start_time  # total time so far
        processing_time = time.time() - processing_start_time  # this batch only
        if processed_lines > 0:
            # Average per record in milliseconds, then a linear estimate of the rest.
            avg_processing_time = (elapsed_time * 1000) / processed_lines
            remaining_time = (avg_processing_time / 1000) * (total_lines - processed_lines)
        else:
            avg_processing_time = float("inf")
            remaining_time = float("inf")

        # Overall progress report.
        print(
            f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
            f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
            f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
        )

    if write_dir is not None:
        self.client.write(
            write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}",
            data=(str(res) + "\n" for res in results),
            overwrite=True,
            encoding="utf-8",
        )

    # Final summary. The average is built separately so that the "no data"
    # fallback replaces only that fragment instead of the whole message.
    final_elapsed_time = time.time() - start_time
    if processed_lines > 0:
        average_text = f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
    else:
        average_text = "处理无数据"
    print(
        f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
        f"总已用时间: {final_elapsed_time:.2f}秒 | "
        f"{average_text}"
    )

    self.save_processed_file(hdfs_file_path)  # remember that this file is done
|
|
185
|
+
|
|
186
|
+
async def retry_process_file(self, hdfs_file_path, process_func, write_dir):
    """Run ``process_file`` with retries and exponential backoff.

    Up to ``self.retry_limit`` attempts are made. After a failed attempt that
    is not the last one, the coroutine sleeps ``2**attempt`` seconds before
    trying again. No sleep follows the final attempt, because nothing would
    run after it.

    Args:
        hdfs_file_path: Path of the file to process.
        process_func: Callable forwarded to ``self.process_file``.
        write_dir: Output directory forwarded to ``self.process_file``.

    Returns:
        True as soon as one attempt succeeds, False once the retry limit is
        reached (the last error is printed, not re-raised).
    """
    retry_count = 0
    while retry_count < self.retry_limit:
        try:
            await self.process_file(hdfs_file_path, process_func, write_dir)
            return True  # success, stop retrying
        except Exception as e:
            retry_count += 1
            print(f"处理文件 {hdfs_file_path} 时发生错误: {e},正在重试 {retry_count}/{self.retry_limit}")
            # Back off only when another attempt will actually follow.
            if retry_count < self.retry_limit:
                await asyncio.sleep(2**retry_count)
    print(f"处理文件 {hdfs_file_path} 失败,达到重试上限")
    return False
|
|
199
|
+
# raise
|
|
200
|
+
|
|
201
|
+
async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any], write_dir: str = None):
    """Process every gz file listed under ``hdfs_dir``, one after another.

    Files for which ``self.is_file_processed`` is true are skipped. All other
    files go through ``self.retry_process_file``. Only when none of them
    reported failure is the local file ``self.db_file`` removed.

    Args:
        hdfs_dir: Directory passed to ``self.list_gz_files``.
        process_func: Callable applied to each decoded record.
        write_dir: Optional output directory forwarded to the per-file step.
    """
    failed_files = []
    for gz_path in self.list_gz_files(hdfs_dir):
        if self.is_file_processed(gz_path):
            print(f"跳过已处理文件: {gz_path}")
            continue
        outcome = await self.retry_process_file(gz_path, process_func, write_dir)
        if outcome is False:
            failed_files.append(gz_path)

    if failed_files:
        # Keep the resume file; presumably it lets a later run skip the
        # files that already succeeded (verify against is_file_processed).
        return

    # Everything succeeded: the resume file is no longer needed.
    try:
        if not os.path.exists(self.db_file):
            return
        os.remove(self.db_file)
        print(f"已删除断点重试文件: {self.db_file}")
    except Exception as e:
        print(f"删除断点重试文件失败: {e}")
|
|
221
|
+
|
|
222
|
+
async def process_file_bulk(self, hdfs_file_path, process_func):
    """Process a single file by handing whole batches to ``process_func``.

    Every batch yielded by the configured reader is decoded as JSON lines and
    passed to ``process_func`` as one list. The resulting awaitables are
    collected and awaited together once ``self.batch_size`` of them have
    accumulated; a progress line is printed after each such flush. Awaitables
    left over at the end are awaited as well, then
    ``self.save_processed_file`` is called for the file.

    Args:
        hdfs_file_path: Path of the file to read.
        process_func: Callable taking a list of decoded records and returning
            an awaitable.

    Raises:
        Exception: When a line is not valid JSON (chained to the decode error).
            Errors raised by the awaited ``process_func`` calls propagate too.
    """
    total_lines = self.count_total_lines(hdfs_file_path)
    processed_lines = 0
    start_time = time.time()

    tasks = []
    # The reader implementation is selected by configuration.
    for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
        processing_start_time = time.time()  # start of this batch

        batch_data = []
        for line in lines:
            try:
                batch_data.append(json.loads(line))
            except json.JSONDecodeError as e:
                raise Exception(f"解析JSON失败: {e}, 行内容: {line.strip()}") from e

        # Queue the decoded batch; counted as processed once it is queued.
        if batch_data:
            tasks.append(process_func(batch_data))
            processed_lines += len(batch_data)

        # Flush once enough batches are queued so they run concurrently.
        if len(tasks) >= self.batch_size:
            await asyncio.gather(*tasks)

            elapsed_time = time.time() - start_time  # total time so far
            processing_time = time.time() - processing_start_time  # this batch only
            if processed_lines > 0:
                # Average per record in milliseconds, then a linear estimate of the rest.
                avg_processing_time = (elapsed_time * 1000) / processed_lines
                remaining_time = (avg_processing_time / 1000) * (total_lines - processed_lines)
            else:
                avg_processing_time = float("inf")
                remaining_time = float("inf")

            # Overall progress report.
            print(
                f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
                f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
                f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
            )

            # Start collecting the next group of batches.
            tasks.clear()

    # Await whatever did not fill a complete group.
    if tasks:
        await asyncio.gather(*tasks)

    # Final summary. The average is built separately so that the "no data"
    # fallback replaces only that fragment instead of the whole message.
    final_elapsed_time = time.time() - start_time
    if processed_lines > 0:
        average_text = f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
    else:
        average_text = "处理无数据"
    print(
        f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
        f"总已用时间: {final_elapsed_time:.2f}秒 | "
        f"{average_text}"
    )

    self.save_processed_file(hdfs_file_path)
|
|
287
|
+
|
|
288
|
+
async def retry_process_file_bulk(self, hdfs_file_path, process_func):
    """Run ``process_file_bulk`` with retries and exponential backoff.

    Up to ``self.retry_limit`` attempts are made. After a failed attempt that
    is not the last one, the coroutine sleeps ``2**attempt`` seconds before
    trying again. No sleep follows the final attempt, because nothing would
    run after it.

    Args:
        hdfs_file_path: Path of the file to process.
        process_func: Callable forwarded to ``self.process_file_bulk``.

    Returns:
        True as soon as one attempt succeeds, False once the retry limit is
        reached (the last error is printed, not re-raised).
    """
    retry_count = 0
    while retry_count < self.retry_limit:
        try:
            await self.process_file_bulk(hdfs_file_path, process_func)
            return True  # success, stop retrying
        except Exception as e:
            retry_count += 1
            print(f"处理文件 {hdfs_file_path} 时发生错误: {e},正在重试 {retry_count}/{self.retry_limit}")
            # Back off only when another attempt will actually follow.
            if retry_count < self.retry_limit:
                await asyncio.sleep(2**retry_count)
    print(f"处理文件 {hdfs_file_path} 失败,达到重试上限")
    return False
|
|
301
|
+
|
|
302
|
+
async def batch_process_file_bulk(self, hdfs_dir: str, process_func: Callable[[List[dict]], Any]):
    """Process every gz file listed under ``hdfs_dir`` in bulk mode.

    Files for which ``self.is_file_processed`` is true are skipped. All other
    files go through ``self.retry_process_file_bulk``. Only when none of them
    reported failure is the local file ``self.db_file`` removed.

    Args:
        hdfs_dir: Directory passed to ``self.list_gz_files``.
        process_func: Callable that receives a list of decoded records.
    """
    failed_files = []
    for gz_path in self.list_gz_files(hdfs_dir):
        if self.is_file_processed(gz_path):
            print(f"跳过已处理文件: {gz_path}")
            continue
        outcome = await self.retry_process_file_bulk(gz_path, process_func)
        if outcome is False:
            failed_files.append(gz_path)

    if failed_files:
        # Keep the resume file; presumably it lets a later run skip the
        # files that already succeeded (verify against is_file_processed).
        return

    # Everything succeeded: the resume file is no longer needed.
    try:
        if not os.path.exists(self.db_file):
            return
        os.remove(self.db_file)
        print(f"已删除断点重试文件: {self.db_file}")
    except Exception as e:
        print(f"删除断点重试文件失败: {e}")
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
# # 使用示例
|
|
325
|
+
# async def update_refer(data: dict):
|
|
326
|
+
# ref_id = data["ref_id"]
|
|
327
|
+
# url = f"http://192.168.98.79:8150/v1/fact_refer/update/{ref_id}"
|
|
328
|
+
# update_data = data["update_data"]
|
|
329
|
+
# if not update_data:
|
|
330
|
+
# return
|
|
331
|
+
#
|
|
332
|
+
# # 此处为实际处理逻辑
|
|
333
|
+
# await ApiNetUtils.fetch_post(url=url, payload=update_data)
|
|
334
|
+
#
|
|
335
|
+
#
|
|
336
|
+
# if __name__ == "__main__":
|
|
337
|
+
# processor = HDFSDataProcessor() # 实例化数据处理类
|
|
338
|
+
# asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", update_refer))
|