re-common: re_common-10.0.37-py3-none-any.whl → re_common-10.0.39-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  182. re_common/v2/baselibrary/utils/mq.py +83 -83
  183. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  184. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  185. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  186. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  187. re_common/v2/baselibrary/utils/stringutils.py +271 -278
  188. re_common/vip/base_step_process.py +11 -11
  189. re_common/vip/baseencodeid.py +90 -90
  190. re_common/vip/changetaskname.py +28 -28
  191. re_common/vip/core_var.py +24 -24
  192. re_common/vip/mmh3Hash.py +89 -89
  193. re_common/vip/proxy/allproxys.py +127 -127
  194. re_common/vip/proxy/allproxys_thread.py +159 -159
  195. re_common/vip/proxy/cnki_proxy.py +153 -153
  196. re_common/vip/proxy/kuaidaili.py +87 -87
  197. re_common/vip/proxy/proxy_all.py +113 -113
  198. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  199. re_common/vip/proxy/wanfang_proxy.py +152 -152
  200. re_common/vip/proxy/wp_proxy_all.py +181 -181
  201. re_common/vip/read_rawid_to_txt.py +91 -91
  202. re_common/vip/title/__init__.py +5 -5
  203. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  204. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  205. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  206. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  207. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  208. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  209. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  210. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  211. re_common/vip/title/transform/__init__.py +10 -10
  212. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
  213. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
  214. re_common-10.0.39.dist-info/RECORD +248 -0
  215. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
  216. re_common-10.0.37.dist-info/RECORD +0 -248
  217. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
re_common/v2/baselibrary/tools/hdfs_data_processer.py
@@ -1,338 +1,338 @@
- import asyncio
- import gzip
- import json
- from pathlib import Path
- import sqlite3
- import time
- import os
- from io import BytesIO
- from typing import Callable, Any, List
-
- from hdfs import InsecureClient
-
-
- class HDFSDataProcessor:
-     def __init__(
-         self,
-         hdfs_url="http://VIP-DC-MASTER-2:9870",
-         hdfs_user="root",
-         db_file="processed_files.db",
-         batch_size=50,
-         retry_limit=3,
-     ):
-         self.hdfs_url = hdfs_url
-         self.hdfs_user = hdfs_user
-         self.db_file = db_file
-         self.batch_size = batch_size
-         self.retry_limit = retry_limit
-         self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
-         self.read_hdfs_fanc = {"all": self.all_read_gz, "batch": self.batch_read_gz}
-         self.read_hdfs_model = "all"
-         self.init_db()
-
-     def init_db(self):
-         """Initialize the SQLite database."""
-         with sqlite3.connect(self.db_file) as conn:
-             cursor = conn.cursor()
-             cursor.execute("""
-                 CREATE TABLE IF NOT EXISTS processed_files (
-                     file_path TEXT PRIMARY KEY
-                 )
-             """)
-             conn.commit()
-
-     def save_processed_file(self, file_path):
-         """Record a file as processed."""
-         with sqlite3.connect(self.db_file) as conn:
-             cursor = conn.cursor()
-             cursor.execute(
-                 "INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
-                 (file_path,),
-             )
-             conn.commit()
-
-     def is_file_processed(self, file_path):
-         """Check whether a file has already been processed."""
-         with sqlite3.connect(self.db_file) as conn:
-             cursor = conn.cursor()
-             cursor.execute(
-                 "SELECT file_path FROM processed_files WHERE file_path = ?",
-                 (file_path,),
-             )
-             result = cursor.fetchone()
-             return result is not None
-
-     def list_gz_files(self, hdfs_dir):
-         """List all gzip files in an HDFS directory."""
-         return [f"{hdfs_dir}/{file[0]}" for file in self.client.list(hdfs_dir, status=True) if file[0].endswith(".gz")]
-
-     def count_total_lines(self, gz_file_path: str):
-         with self.client.read(gz_file_path) as hdfs_file:
-             with gzip.GzipFile(fileobj=hdfs_file) as gz:
-                 return sum(1 for _ in gz)
-
-     def batch_read_gz(self, gz_file_path: str):
-         """Read a gz file in batches."""
-         with self.client.read(gz_file_path) as hdfs_file:
-             with gzip.GzipFile(fileobj=hdfs_file) as gz:
-                 while True:
-                     lines = []
-                     for _ in range(self.batch_size):
-                         try:
-                             line = next(gz)
-                             if line.strip():  # drop empty lines
-                                 lines.append(line.decode("utf-8"))  # decode
-                         except StopIteration:  # end of file reached
-                             break
-                     if not lines:
-                         break
-                     yield lines
-
-     def all_read_gz(self, gz_file_path: str, encoding="utf-8"):
-         """
-         Read the content of a .gz file on HDFS.
-         :param hdfs_path: HDFS file path (must end with .gz)
-         :param encoding: file encoding (default utf-8)
-         :return: file content
-         """
-         with self.client.read(gz_file_path) as reader:  # read in binary mode
-             compressed_data = reader.read()  # read the compressed data
-         with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:  # decompress
-             content = gz_file.read().decode(encoding)  # decode to a string
-         print(f"File read successfully: {gz_file_path}")
-         lines = [i for i in content.split("\n") if i.strip()]
-         result = [lines[i : i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
-         return result
-
-     async def process_data(self, data, process_func):
-         """Process one record by invoking the processing function."""
-         retry_count = 0
-         while retry_count < self.retry_limit:
-             try:
-                 return await process_func(data)  # return once processing succeeds
-             except Exception as e:
-                 retry_count += 1
-                 print(f"Error while processing data: {e}, retrying {retry_count}/{self.retry_limit}, data: {data}")
-                 await asyncio.sleep(2**retry_count)
-         raise Exception(f"Failed to process data, retry limit reached, data: {data}")
-
-     async def process_file(self, hdfs_file_path, process_func, write_dir: str):
-         """Process a single gz file."""
-         total_lines = self.count_total_lines(hdfs_file_path)
-         processed_lines = 0
-         start_time = time.time()
-         results = []
-         # Choose the file-reading method according to the configured mode
-         for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
-             processing_start_time = time.time()  # start time of this batch
-
-             tasks = []
-             for line in lines:
-                 try:
-                     data = json.loads(line)
-                     tasks.append(self.process_data(data, process_func))
-                 except json.JSONDecodeError as e:
-                     raise Exception(f"Failed to parse JSON: {e}, line content: {line.strip()}")
-
-             # await AsyncTaskPool(self.batch_size).run(tasks)  # AsyncTaskPool is for submitting all tasks at once with a concurrency limit
-             results.extend(await asyncio.gather(*tasks))
-
-             processed_lines += len(lines)
-
-             elapsed_time = time.time() - start_time  # time elapsed so far
-             processing_time = time.time() - processing_start_time  # time spent on this batch
-             avg_processing_time = (
-                 (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
-             )  # average processing time per record (milliseconds)
-
-             # Estimate the remaining time
-             remaining_time = (
-                 ((avg_processing_time / 1000) * (total_lines - processed_lines))
-                 if processed_lines > 0
-                 else float("inf")
-             )
-
-             # Report overall progress
-             print(
-                 f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
-                 f"elapsed: {elapsed_time:.2f}s | batch time: {processing_time:.2f}s | "
-                 f"estimated remaining: {remaining_time:.2f}s | average per record: {avg_processing_time:.2f}ms"
-             )
-
-         def generate_write_data(results):
-             for res in results:
-                 yield str(res) + "\n"
-
-         if write_dir is not None:
-             self.client.write(
-                 write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}",
-                 data=generate_write_data(results),
-                 overwrite=True,
-                 encoding="utf-8",
-             )
-
-         # Final progress report
-         final_elapsed_time = time.time() - start_time  # total elapsed time
-         print(
-             f"File: {hdfs_file_path} finished | progress: {processed_lines}/{total_lines} lines | "
-             f"total elapsed: {final_elapsed_time:.2f}s | "
-             f"average per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
-             if processed_lines > 0
-             else "no data processed"
-         )
-
-         self.save_processed_file(hdfs_file_path)  # record the file as processed
-
-     async def retry_process_file(self, hdfs_file_path, process_func, write_dir):
-         """File processing with a retry mechanism."""
-         retry_count = 0
-         while retry_count < self.retry_limit:
-             try:
-                 await self.process_file(hdfs_file_path, process_func, write_dir)
-                 return True  # return once processing succeeds
-             except Exception as e:
-                 retry_count += 1
-                 print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
-                 await asyncio.sleep(2**retry_count)
-         print(f"Failed to process file {hdfs_file_path}, retry limit reached")
-         return False
-         # raise
-
-     async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any], write_dir: str = None):
-         """Batch-update all gz files."""
-         gz_files = self.list_gz_files(hdfs_dir)
-         all_succeed = True
-         for hdfs_file_path in gz_files:
-             if self.is_file_processed(hdfs_file_path):
-                 print(f"Skipping already processed file: {hdfs_file_path}")
-                 continue  # skip files that have already been processed
-             succeed = await self.retry_process_file(hdfs_file_path, process_func, write_dir)  # process the file
-             if succeed is False:
-                 all_succeed = False
-
-         if all_succeed:
-             # Delete the database file once processing is complete
-             try:
-                 if os.path.exists(self.db_file):
-                     os.remove(self.db_file)
-                     print(f"Deleted resume-checkpoint file: {self.db_file}")
-             except Exception as e:
-                 print(f"Failed to delete resume-checkpoint file: {e}")
-
-     async def process_file_bulk(self, hdfs_file_path, process_func):
-         """Process a single file batch by batch, passing each batch to the processing function."""
-         total_lines = self.count_total_lines(hdfs_file_path)
-         processed_lines = 0
-         start_time = time.time()
-
-         tasks = []
-         # Choose the file-reading method according to the configured mode
-         for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
-             processing_start_time = time.time()  # start time of this batch
-
-             batch_data = []
-             for line in lines:
-                 try:
-                     data = json.loads(line)
-                     batch_data.append(data)
-                 except json.JSONDecodeError as e:
-                     raise Exception(f"Failed to parse JSON: {e}, line content: {line.strip()}")
-
-             # Process the batch that was just read
-             if batch_data:
-                 tasks.append(process_func(batch_data))  # pass the batch to the processing function and collect the task
-                 processed_lines += len(batch_data)  # update the processed line count
-
-             # Once the number of pending tasks reaches batch_size, run them all concurrently
-             if len(tasks) >= self.batch_size:
-                 await asyncio.gather(*tasks)  # process several batches at once
-
-                 elapsed_time = time.time() - start_time  # time elapsed so far
-                 processing_time = time.time() - processing_start_time  # time spent on this batch
-                 avg_processing_time = (
-                     (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
-                 )  # average processing time per record (milliseconds)
-
-                 # Estimate the remaining time
-                 remaining_time = (
-                     ((avg_processing_time / 1000) * (total_lines - processed_lines))
-                     if processed_lines > 0
-                     else float("inf")
-                 )
-
-                 # Report overall progress
-                 print(
-                     f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
-                     f"elapsed: {elapsed_time:.2f}s | batch time: {processing_time:.2f}s | "
-                     f"estimated remaining: {remaining_time:.2f}s | average per record: {avg_processing_time:.2f}ms"
-                 )
-
-                 # Clear the task list, ready for the next round of batches
-                 tasks.clear()
-         # Run any remaining tasks
-         if tasks:
-             await asyncio.gather(*tasks)  # process the leftover tasks that never reached batch_size
-
-         # Final progress report
-         final_elapsed_time = time.time() - start_time  # total elapsed time
-         print(
-             f"File: {hdfs_file_path} finished | progress: {processed_lines}/{total_lines} lines | "
-             f"total elapsed: {final_elapsed_time:.2f}s | "
-             f"average per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
-             if processed_lines > 0
-             else "no data processed"
-         )
-
-         self.save_processed_file(hdfs_file_path)
-
-     async def retry_process_file_bulk(self, hdfs_file_path, process_func):
-         """Bulk file processing with a retry mechanism."""
-         retry_count = 0
-         while retry_count < self.retry_limit:
-             try:
-                 await self.process_file_bulk(hdfs_file_path, process_func)
-                 return True  # return once processing succeeds
-             except Exception as e:
-                 retry_count += 1
-                 print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
-                 await asyncio.sleep(2**retry_count)
-         print(f"Failed to process file {hdfs_file_path}, retry limit reached")
-         return False
-
-     async def batch_process_file_bulk(self, hdfs_dir: str, process_func: Callable[[List[dict]], Any]):
-         """Batch-process the data in all gz files."""
-         gz_files = self.list_gz_files(hdfs_dir)
-         all_succeed = True
-         for hdfs_file_path in gz_files:
-             if self.is_file_processed(hdfs_file_path):
-                 print(f"Skipping already processed file: {hdfs_file_path}")
-                 continue  # skip files that have already been processed
-             succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func)
-             if succeed is False:
-                 all_succeed = False
-
-         if all_succeed:
-             # Delete the database file once processing is complete
-             try:
-                 if os.path.exists(self.db_file):
-                     os.remove(self.db_file)
-                     print(f"Deleted resume-checkpoint file: {self.db_file}")
-             except Exception as e:
-                 print(f"Failed to delete resume-checkpoint file: {e}")
-
-
- # # Usage example
- # async def update_refer(data: dict):
- #     ref_id = data["ref_id"]
- #     url = f"http://192.168.98.79:8150/v1/fact_refer/update/{ref_id}"
- #     update_data = data["update_data"]
- #     if not update_data:
- #         return
- #
- #     # The actual processing logic goes here
- #     await ApiNetUtils.fetch_post(url=url, payload=update_data)
- #
- #
- # if __name__ == "__main__":
- #     processor = HDFSDataProcessor()  # instantiate the data processor
- #     asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", update_refer))
+ import asyncio
+ import gzip
+ import json
+ from pathlib import Path
+ import sqlite3
+ import time
+ import os
+ from io import BytesIO
+ from typing import Callable, Any, List
+
+ from hdfs import InsecureClient
+
+
+ class HDFSDataProcessor:
+     def __init__(
+         self,
+         hdfs_url="http://VIP-DC-MASTER-2:9870",
+         hdfs_user="root",
+         db_file="processed_files.db",
+         batch_size=50,
+         retry_limit=3,
+     ):
+         self.hdfs_url = hdfs_url
+         self.hdfs_user = hdfs_user
+         self.db_file = db_file
+         self.batch_size = batch_size
+         self.retry_limit = retry_limit
+         self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
+         self.read_hdfs_fanc = {"all": self.all_read_gz, "batch": self.batch_read_gz}
+         self.read_hdfs_model = "all"
+         self.init_db()
+
+     def init_db(self):
+         """Initialize the SQLite database."""
+         with sqlite3.connect(self.db_file) as conn:
+             cursor = conn.cursor()
+             cursor.execute("""
+                 CREATE TABLE IF NOT EXISTS processed_files (
+                     file_path TEXT PRIMARY KEY
+                 )
+             """)
+             conn.commit()
+
+     def save_processed_file(self, file_path):
+         """Record a file as processed."""
+         with sqlite3.connect(self.db_file) as conn:
+             cursor = conn.cursor()
+             cursor.execute(
+                 "INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)",
+                 (file_path,),
+             )
+             conn.commit()
+
+     def is_file_processed(self, file_path):
+         """Check whether a file has already been processed."""
+         with sqlite3.connect(self.db_file) as conn:
+             cursor = conn.cursor()
+             cursor.execute(
+                 "SELECT file_path FROM processed_files WHERE file_path = ?",
+                 (file_path,),
+             )
+             result = cursor.fetchone()
+             return result is not None
+
+     def list_gz_files(self, hdfs_dir):
+         """List all gzip files in an HDFS directory."""
+         return [f"{hdfs_dir}/{file[0]}" for file in self.client.list(hdfs_dir, status=True) if file[0].endswith(".gz")]
+
+     def count_total_lines(self, gz_file_path: str):
+         with self.client.read(gz_file_path) as hdfs_file:
+             with gzip.GzipFile(fileobj=hdfs_file) as gz:
+                 return sum(1 for _ in gz)
+
+     def batch_read_gz(self, gz_file_path: str):
+         """Read a gz file in batches."""
+         with self.client.read(gz_file_path) as hdfs_file:
+             with gzip.GzipFile(fileobj=hdfs_file) as gz:
+                 while True:
+                     lines = []
+                     for _ in range(self.batch_size):
+                         try:
+                             line = next(gz)
+                             if line.strip():  # drop empty lines
+                                 lines.append(line.decode("utf-8"))  # decode
+                         except StopIteration:  # end of file reached
+                             break
+                     if not lines:
+                         break
+                     yield lines
+
+     def all_read_gz(self, gz_file_path: str, encoding="utf-8"):
+         """
+         Read the content of a .gz file on HDFS.
+         :param hdfs_path: HDFS file path (must end with .gz)
+         :param encoding: file encoding (default utf-8)
+         :return: file content
+         """
+         with self.client.read(gz_file_path) as reader:  # read in binary mode
+             compressed_data = reader.read()  # read the compressed data
+         with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file:  # decompress
+             content = gz_file.read().decode(encoding)  # decode to a string
+         print(f"File read successfully: {gz_file_path}")
+         lines = [i for i in content.split("\n") if i.strip()]
+         result = [lines[i : i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
+         return result
+
+     async def process_data(self, data, process_func):
+         """Process one record by invoking the processing function."""
+         retry_count = 0
+         while retry_count < self.retry_limit:
+             try:
+                 return await process_func(data)  # return once processing succeeds
+             except Exception as e:
+                 retry_count += 1
+                 print(f"Error while processing data: {e}, retrying {retry_count}/{self.retry_limit}, data: {data}")
+                 await asyncio.sleep(2**retry_count)
+         raise Exception(f"Failed to process data, retry limit reached, data: {data}")
+
+     async def process_file(self, hdfs_file_path, process_func, write_dir: str):
+         """Process a single gz file."""
+         total_lines = self.count_total_lines(hdfs_file_path)
+         processed_lines = 0
+         start_time = time.time()
+         results = []
+         # Choose the file-reading method according to the configured mode
+         for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
+             processing_start_time = time.time()  # start time of this batch
+
+             tasks = []
+             for line in lines:
+                 try:
+                     data = json.loads(line)
+                     tasks.append(self.process_data(data, process_func))
+                 except json.JSONDecodeError as e:
+                     raise Exception(f"Failed to parse JSON: {e}, line content: {line.strip()}")
+
+             # await AsyncTaskPool(self.batch_size).run(tasks)  # AsyncTaskPool is for submitting all tasks at once with a concurrency limit
+             results.extend(await asyncio.gather(*tasks))
+
+             processed_lines += len(lines)
+
+             elapsed_time = time.time() - start_time  # time elapsed so far
+             processing_time = time.time() - processing_start_time  # time spent on this batch
+             avg_processing_time = (
+                 (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
+             )  # average processing time per record (milliseconds)
+
+             # Estimate the remaining time
+             remaining_time = (
+                 ((avg_processing_time / 1000) * (total_lines - processed_lines))
+                 if processed_lines > 0
+                 else float("inf")
+             )
+
+             # Report overall progress
+             print(
+                 f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
+                 f"elapsed: {elapsed_time:.2f}s | batch time: {processing_time:.2f}s | "
+                 f"estimated remaining: {remaining_time:.2f}s | average per record: {avg_processing_time:.2f}ms"
+             )
+
+         def generate_write_data(results):
+             for res in results:
+                 yield str(res) + "\n"
+
+         if write_dir is not None:
+             self.client.write(
+                 write_dir.rstrip("/") + f"/{Path(hdfs_file_path).stem}",
+                 data=generate_write_data(results),
+                 overwrite=True,
+                 encoding="utf-8",
+             )
+
+         # Final progress report
+         final_elapsed_time = time.time() - start_time  # total elapsed time
+         print(
+             f"File: {hdfs_file_path} finished | progress: {processed_lines}/{total_lines} lines | "
+             f"total elapsed: {final_elapsed_time:.2f}s | "
+             f"average per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
+             if processed_lines > 0
+             else "no data processed"
+         )
+
+         self.save_processed_file(hdfs_file_path)  # record the file as processed
+
+     async def retry_process_file(self, hdfs_file_path, process_func, write_dir):
+         """File processing with a retry mechanism."""
+         retry_count = 0
+         while retry_count < self.retry_limit:
+             try:
+                 await self.process_file(hdfs_file_path, process_func, write_dir)
+                 return True  # return once processing succeeds
+             except Exception as e:
+                 retry_count += 1
+                 print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
+                 await asyncio.sleep(2**retry_count)
+         print(f"Failed to process file {hdfs_file_path}, retry limit reached")
+         return False
+         # raise
+
+     async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any], write_dir: str = None):
+         """Batch-update all gz files."""
+         gz_files = self.list_gz_files(hdfs_dir)
+         all_succeed = True
+         for hdfs_file_path in gz_files:
+             if self.is_file_processed(hdfs_file_path):
+                 print(f"Skipping already processed file: {hdfs_file_path}")
+                 continue  # skip files that have already been processed
+             succeed = await self.retry_process_file(hdfs_file_path, process_func, write_dir)  # process the file
+             if succeed is False:
+                 all_succeed = False
+
+         if all_succeed:
+             # Delete the database file once processing is complete
+             try:
+                 if os.path.exists(self.db_file):
+                     os.remove(self.db_file)
+                     print(f"Deleted resume-checkpoint file: {self.db_file}")
+             except Exception as e:
+                 print(f"Failed to delete resume-checkpoint file: {e}")
+
+     async def process_file_bulk(self, hdfs_file_path, process_func):
+         """Process a single file batch by batch, passing each batch to the processing function."""
+         total_lines = self.count_total_lines(hdfs_file_path)
+         processed_lines = 0
+         start_time = time.time()
+
+         tasks = []
+         # Choose the file-reading method according to the configured mode
+         for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
+             processing_start_time = time.time()  # start time of this batch
+
+             batch_data = []
+             for line in lines:
+                 try:
+                     data = json.loads(line)
+                     batch_data.append(data)
+                 except json.JSONDecodeError as e:
+                     raise Exception(f"Failed to parse JSON: {e}, line content: {line.strip()}")
+
+             # Process the batch that was just read
+             if batch_data:
+                 tasks.append(process_func(batch_data))  # pass the batch to the processing function and collect the task
+                 processed_lines += len(batch_data)  # update the processed line count
+
+             # Once the number of pending tasks reaches batch_size, run them all concurrently
+             if len(tasks) >= self.batch_size:
+                 await asyncio.gather(*tasks)  # process several batches at once
+
+                 elapsed_time = time.time() - start_time  # time elapsed so far
+                 processing_time = time.time() - processing_start_time  # time spent on this batch
+                 avg_processing_time = (
+                     (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
+                 )  # average processing time per record (milliseconds)
+
+                 # Estimate the remaining time
+                 remaining_time = (
+                     ((avg_processing_time / 1000) * (total_lines - processed_lines))
+                     if processed_lines > 0
+                     else float("inf")
+                 )
+
+                 # Report overall progress
+                 print(
+                     f"File: {hdfs_file_path} progress: {processed_lines}/{total_lines} lines | "
+                     f"elapsed: {elapsed_time:.2f}s | batch time: {processing_time:.2f}s | "
+                     f"estimated remaining: {remaining_time:.2f}s | average per record: {avg_processing_time:.2f}ms"
+                 )
+
+                 # Clear the task list, ready for the next round of batches
+                 tasks.clear()
+         # Run any remaining tasks
+         if tasks:
+             await asyncio.gather(*tasks)  # process the leftover tasks that never reached batch_size
+
+         # Final progress report
+         final_elapsed_time = time.time() - start_time  # total elapsed time
+         print(
+             f"File: {hdfs_file_path} finished | progress: {processed_lines}/{total_lines} lines | "
+             f"total elapsed: {final_elapsed_time:.2f}s | "
+             f"average per record: {(final_elapsed_time * 1000) / processed_lines:.2f}ms"
+             if processed_lines > 0
+             else "no data processed"
+         )
+
+         self.save_processed_file(hdfs_file_path)
+
+     async def retry_process_file_bulk(self, hdfs_file_path, process_func):
+         """Bulk file processing with a retry mechanism."""
+         retry_count = 0
+         while retry_count < self.retry_limit:
+             try:
+                 await self.process_file_bulk(hdfs_file_path, process_func)
+                 return True  # return once processing succeeds
+             except Exception as e:
+                 retry_count += 1
+                 print(f"Error while processing file {hdfs_file_path}: {e}, retrying {retry_count}/{self.retry_limit}")
+                 await asyncio.sleep(2**retry_count)
+         print(f"Failed to process file {hdfs_file_path}, retry limit reached")
+         return False
+
+     async def batch_process_file_bulk(self, hdfs_dir: str, process_func: Callable[[List[dict]], Any]):
+         """Batch-process the data in all gz files."""
+         gz_files = self.list_gz_files(hdfs_dir)
+         all_succeed = True
+         for hdfs_file_path in gz_files:
+             if self.is_file_processed(hdfs_file_path):
+                 print(f"Skipping already processed file: {hdfs_file_path}")
+                 continue  # skip files that have already been processed
+             succeed = await self.retry_process_file_bulk(hdfs_file_path, process_func)
+             if succeed is False:
+                 all_succeed = False
+
+         if all_succeed:
+             # Delete the database file once processing is complete
+             try:
+                 if os.path.exists(self.db_file):
+                     os.remove(self.db_file)
+                     print(f"Deleted resume-checkpoint file: {self.db_file}")
+             except Exception as e:
+                 print(f"Failed to delete resume-checkpoint file: {e}")
+
+
+ # # Usage example
+ # async def update_refer(data: dict):
+ #     ref_id = data["ref_id"]
+ #     url = f"http://192.168.98.79:8150/v1/fact_refer/update/{ref_id}"
+ #     update_data = data["update_data"]
+ #     if not update_data:
+ #         return
+ #
+ #     # The actual processing logic goes here
+ #     await ApiNetUtils.fetch_post(url=url, payload=update_data)
+ #
+ #
+ # if __name__ == "__main__":
+ #     processor = HDFSDataProcessor()  # instantiate the data processor
+ #     asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", update_refer))
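
For reference, the commented-out example at the end of hdfs_data_processer.py suggests the intended call pattern for this class. Below is a minimal sketch along those lines; only the class, its constructor defaults, batch_process_file, and the HDFS path come from the module above, while handle_record and the "ref_id" field are illustrative assumptions rather than part of the package:

import asyncio
from re_common.v2.baselibrary.tools.hdfs_data_processer import HDFSDataProcessor

async def handle_record(data: dict):
    # Hypothetical per-record coroutine; batch_process_file calls it once per parsed
    # JSON line and retries it up to retry_limit times on failure.
    print(data.get("ref_id"))  # "ref_id" is an assumed field, as in the module's commented example

if __name__ == "__main__":
    # Defaults: hdfs_url="http://VIP-DC-MASTER-2:9870", hdfs_user="root", batch_size=50, retry_limit=3
    processor = HDFSDataProcessor()
    asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", handle_record))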