tfduck-bsd 0.18.1__tar.gz → 0.18.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tfduck-bsd might be problematic. Click here for more details.
- {tfduck-bsd-0.18.1/tfduck_bsd.egg-info → tfduck-bsd-0.18.3}/PKG-INFO +1 -1
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/setup.py +1 -1
- tfduck-bsd-0.18.3/tfduck/__init__.py +1 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/tga.py +344 -60
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3/tfduck_bsd.egg-info}/PKG-INFO +1 -1
- tfduck-bsd-0.18.1/tfduck/__init__.py +0 -1
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/LICENSE +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/README.md +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/bin/tfduck +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/setup.cfg +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/bdpmanager.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/table_config.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/example.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/opends.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/sdk.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/common/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/common/defines.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/common/defines_clean.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/common/extendEncoder.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/main.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/oss/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/oss/oss.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/k8s_manage.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/spark_manage.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/s3/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/s3/s3oper.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/sagemaker/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/sagemaker/saoper.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/base_tga.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_ltv.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_retain.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_yh.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/tga_test.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_ltv.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_retain.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_yh.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/thinkdata/__init__.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/thinkdata/query.py +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/SOURCES.txt +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/dependency_links.txt +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/requires.txt +0 -0
- {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__="0.18.3"
|
|
@@ -11,9 +11,10 @@ finally:
|
|
|
11
11
|
BMOBJ.remove_file(local_file)
|
|
12
12
|
|
|
13
13
|
版本记录:
|
|
14
|
-
pyhive=0.6.2
|
|
14
|
+
pyhive=0.6.2
|
|
15
15
|
requests=2.23.0 2.27.1
|
|
16
16
|
"""
|
|
17
|
+
|
|
17
18
|
import requests
|
|
18
19
|
import pandas
|
|
19
20
|
import json
|
|
@@ -22,16 +23,18 @@ import os
|
|
|
22
23
|
import uuid
|
|
23
24
|
import urllib3
|
|
24
25
|
from tfduck.common.defines import BMOBJ, Et
|
|
26
|
+
|
|
25
27
|
# from django.conf import settings
|
|
26
28
|
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, FIRST_COMPLETED
|
|
27
29
|
from tfduck.tga.base_tga import BaseTga
|
|
28
30
|
|
|
31
|
+
|
|
29
32
|
class ThinkDataQuery(BaseTga):
|
|
30
33
|
"""
|
|
31
34
|
@des: thinkdata openapi查询基础类----这个只能再thinkdata内网执行
|
|
32
35
|
"""
|
|
33
36
|
|
|
34
|
-
def __init__(self, query_uri, token, hive_conn_info=[
|
|
37
|
+
def __init__(self, query_uri, token, hive_conn_info=["host", 0]):
|
|
35
38
|
"""
|
|
36
39
|
@des:初始化类
|
|
37
40
|
"""
|
|
@@ -52,53 +55,81 @@ class ThinkDataQuery(BaseTga):
|
|
|
52
55
|
file_path = os.path.join(base_dir, real_name)
|
|
53
56
|
return file_path
|
|
54
57
|
|
|
55
|
-
def g_to_csv_notmp(
|
|
58
|
+
def g_to_csv_notmp(
    self, filepath, df, index=True, compression=None, mode="w", header=True
):
    """
    @des: Write a pandas DataFrame straight to ``filepath`` as CSV. The
    target file itself is written (no temporary file), so the method can
    be used to append to an existing file with ``mode="a"``.

    @params:
        filepath: destination path handed to ``DataFrame.to_csv``
        df: the DataFrame to write
        index: pass ``None`` to leave the row index out of the file; any
            other value keeps pandas' default (the index is written)
        compression: compression format such as 'gzip', 'bz2', 'zip',
            'xz'; when ``None`` the argument is not forwarded and pandas'
            own default applies
        mode: file mode forwarded to ``to_csv`` ("w" or "a")
        header: forwarded to ``to_csv``; controls writing the column row

    @return: always ``True``
    """
    # Only forward the optional arguments the caller actually set, so the
    # pandas defaults stay in force for the others.
    to_csv_options = {"mode": mode, "header": header}
    if index is None:  # do not save the row index
        to_csv_options["index"] = None
    if compression is not None:  # compress the output
        to_csv_options["compression"] = compression
    df.to_csv(filepath, **to_csv_options)
    return True
|
|
74
85
|
|
|
75
|
-
def get_data_csv_i(
|
|
86
|
+
def get_data_csv_i(
|
|
87
|
+
self,
|
|
88
|
+
ctx,
|
|
89
|
+
unique_path,
|
|
90
|
+
sql,
|
|
91
|
+
block_size=100000,
|
|
92
|
+
print_size=100000,
|
|
93
|
+
read_timeout=600,
|
|
94
|
+
upcount=None,
|
|
95
|
+
conn_timeout=30,
|
|
96
|
+
tga_data_timeout=600,
|
|
97
|
+
):
|
|
76
98
|
"""
|
|
77
99
|
@des:从thinkdata的openapi获取数据----流式,为了节省内存---配合下面的getquerycsv
|
|
78
100
|
"""
|
|
79
101
|
session = requests.session()
|
|
80
|
-
post_data = {
|
|
102
|
+
post_data = {
|
|
103
|
+
"token": self.token,
|
|
104
|
+
"sql": sql,
|
|
105
|
+
"timeoutSeconds": tga_data_timeout,
|
|
106
|
+
}
|
|
81
107
|
#
|
|
82
108
|
unique_path = self.gen_local_unique_file()
|
|
83
109
|
#
|
|
84
110
|
BMOBJ.log_error("in query")
|
|
85
111
|
#
|
|
86
|
-
r = session.post(
|
|
87
|
-
|
|
112
|
+
r = session.post(
|
|
113
|
+
self.query_uri,
|
|
114
|
+
data=post_data,
|
|
115
|
+
stream=True,
|
|
116
|
+
verify=False,
|
|
117
|
+
timeout=(conn_timeout, read_timeout),
|
|
118
|
+
)
|
|
88
119
|
datas = []
|
|
89
|
-
i = 0
|
|
120
|
+
i = 0 # 循环引用计数
|
|
90
121
|
icount = 0 # 数据的数量
|
|
91
122
|
cols = [] # 表头
|
|
92
123
|
try:
|
|
93
|
-
row =
|
|
124
|
+
row = ""
|
|
94
125
|
# iter_lines iter_content, chunk_size字节, 下面取100M
|
|
95
|
-
for row in r.iter_lines(chunk_size=1024*1024*100):
|
|
126
|
+
for row in r.iter_lines(chunk_size=1024 * 1024 * 100):
|
|
96
127
|
if not row:
|
|
97
128
|
continue
|
|
98
129
|
data = None
|
|
99
130
|
if i == 0: # 处理header
|
|
100
131
|
data = json.loads(row)
|
|
101
|
-
if
|
|
132
|
+
if data["return_code"] == 0:
|
|
102
133
|
cols = data["data"]["headers"]
|
|
103
134
|
df = pandas.DataFrame(data=[], columns=cols) # 保存表头
|
|
104
135
|
self.g_to_csv_notmp(unique_path, df, index=None)
|
|
@@ -111,6 +142,7 @@ class ThinkDataQuery(BaseTga):
|
|
|
111
142
|
except:
|
|
112
143
|
pass
|
|
113
144
|
datas = []
|
|
145
|
+
raise Exception("sql error")
|
|
114
146
|
break # 表示查询出错,没有消息
|
|
115
147
|
else:
|
|
116
148
|
if row.strip() not in [b"", ""]:
|
|
@@ -120,8 +152,9 @@ class ThinkDataQuery(BaseTga):
|
|
|
120
152
|
i += 1
|
|
121
153
|
if len(datas) == block_size: # 1000000条保存一次
|
|
122
154
|
df = pandas.DataFrame(data=datas, columns=cols) # 保存表头
|
|
123
|
-
self.g_to_csv_notmp(
|
|
124
|
-
|
|
155
|
+
self.g_to_csv_notmp(
|
|
156
|
+
unique_path, df, index=None, mode="a", header=False
|
|
157
|
+
) # 追加保存
|
|
125
158
|
icount += block_size
|
|
126
159
|
datas = []
|
|
127
160
|
if i % print_size == 0:
|
|
@@ -129,8 +162,9 @@ class ThinkDataQuery(BaseTga):
|
|
|
129
162
|
BMOBJ.clog(ctx, f"total: {i}")
|
|
130
163
|
if len(datas) > 0: # 保存最后收尾的
|
|
131
164
|
df = pandas.DataFrame(data=datas, columns=cols) # 保存表头
|
|
132
|
-
self.g_to_csv_notmp(
|
|
133
|
-
|
|
165
|
+
self.g_to_csv_notmp(
|
|
166
|
+
unique_path, df, index=None, mode="a", header=False
|
|
167
|
+
) # 追加保存
|
|
134
168
|
icount += len(datas)
|
|
135
169
|
datas = []
|
|
136
170
|
except Exception as e:
|
|
@@ -151,7 +185,18 @@ class ThinkDataQuery(BaseTga):
|
|
|
151
185
|
pass
|
|
152
186
|
return unique_path
|
|
153
187
|
|
|
154
|
-
def get_data_csv(
|
|
188
|
+
def get_data_csv(
|
|
189
|
+
self,
|
|
190
|
+
ctx,
|
|
191
|
+
sql,
|
|
192
|
+
block_size=100000,
|
|
193
|
+
print_size=100000,
|
|
194
|
+
read_timeout=600,
|
|
195
|
+
upcount=None,
|
|
196
|
+
retry_count=2,
|
|
197
|
+
conn_timeout=30,
|
|
198
|
+
tga_data_timeout=600,
|
|
199
|
+
):
|
|
155
200
|
"""
|
|
156
201
|
@des:从thinkdata的openapi获取数据----流式,为了节省内存---配合下面的getquerycsv
|
|
157
202
|
"""
|
|
@@ -161,33 +206,243 @@ class ThinkDataQuery(BaseTga):
|
|
|
161
206
|
for i in range(retry_count):
|
|
162
207
|
try:
|
|
163
208
|
result = self.get_data_csv_i(
|
|
164
|
-
ctx,
|
|
209
|
+
ctx,
|
|
210
|
+
unique_path,
|
|
211
|
+
sql,
|
|
212
|
+
block_size,
|
|
213
|
+
print_size,
|
|
214
|
+
read_timeout,
|
|
215
|
+
upcount,
|
|
216
|
+
conn_timeout,
|
|
217
|
+
tga_data_timeout,
|
|
218
|
+
)
|
|
165
219
|
return result
|
|
166
220
|
except Exception as e:
|
|
167
221
|
gol_e = e
|
|
168
222
|
BMOBJ.remove_file(unique_path)
|
|
169
223
|
BMOBJ.remove_folder(unique_path)
|
|
170
|
-
|
|
171
|
-
|
|
224
|
+
# modify by yx 2024-05-08---加强出错重试---
|
|
225
|
+
# if str(e).find("Read timed out") != -1 or str(e).find("Connection broken") != -1:
|
|
226
|
+
# BMOBJ.clog(ctx, f'retry {i}')
|
|
227
|
+
# continue
|
|
228
|
+
# else:
|
|
229
|
+
# break
|
|
230
|
+
if 1:
|
|
231
|
+
continue
|
|
232
|
+
if gol_e is not None:
|
|
233
|
+
raise gol_e
|
|
234
|
+
|
|
235
|
+
def get_data_csv_by_str_i(
    self,
    ctx,
    unique_path,
    sql,
    block_size=100000,
    print_size=100000,
    read_timeout=600,
    upcount=None,
    conn_timeout=30,
    tga_data_timeout=600,
):
    """
    @des: Run ``sql`` through the ThinkData open API and stream the
    response into a local CSV file, converting every value to text
    before it is written. Single attempt; the retry loop lives in
    ``get_data_csv_by_str``.

    The response is read line by line. The first non-empty line is
    parsed as JSON and must carry ``return_code == 0``; its
    ``data.headers`` list becomes the CSV header. Every later non-empty
    line is parsed as JSON and buffered as one row (presumably a list of
    cell values in header order -- confirm against the API).

    @params:
        ctx: logging context passed to ``BMOBJ.clog``
        unique_path: CSV file to write; a fresh path is generated only
            when this is empty
        sql: query text sent to the API
        block_size: number of buffered rows that triggers an append
        print_size: log the running line count every this many lines
        read_timeout: HTTP read timeout in seconds
        upcount: if set, a failure after at least this many lines have
            been read is tolerated and the partial file is returned
        conn_timeout: HTTP connect timeout in seconds
        tga_data_timeout: sent to the API as ``timeoutSeconds``

    @return: path of the CSV file that was written
    @raises: Exception("sql error") when the API reports a non-zero
        ``return_code``; any streaming/parsing error unless tolerated
        through ``upcount``
    """
    # Write to the path the caller supplied so that the caller's cleanup
    # on failure removes the file this call actually produced.
    if not unique_path:
        unique_path = self.gen_local_unique_file()

    def _write_rows(rows, columns, mode="a", header=False):
        # dtype=object keeps the parsed values untouched, then everything
        # is turned into text (works around scientific notation in the
        # CSV output).
        frame = pandas.DataFrame(data=rows, columns=columns, dtype=object)
        frame = frame.astype(str)
        frame = frame.astype("string")
        self.g_to_csv_notmp(
            unique_path, frame, index=None, mode=mode, header=header
        )

    session = requests.session()
    post_data = {
        "token": self.token,
        "sql": sql,
        "timeoutSeconds": tga_data_timeout,
    }
    BMOBJ.log_error("in query")
    try:
        r = session.post(
            self.query_uri,
            data=post_data,
            stream=True,
            verify=False,
            timeout=(conn_timeout, read_timeout),
        )
    except Exception:
        # Do not leak the session when the request itself fails.
        session.close()
        raise

    datas = []  # rows buffered since the last append
    i = 0  # number of non-empty lines consumed (header line included)
    cols = []  # column names taken from the header line
    row = ""
    try:
        # chunk_size is in bytes: read up to 100 MB per chunk
        for row in r.iter_lines(chunk_size=1024 * 1024 * 100):
            if not row:
                continue
            data = None
            if i == 0:  # header line
                data = json.loads(row)
                if data["return_code"] == 0:
                    cols = data["data"]["headers"]
                    # start the file with the header row only
                    _write_rows([], cols, mode="w", header=True)
                    data = None
                else:
                    BMOBJ.log_error("sql error:", data)
                    try:
                        BMOBJ.clog(ctx, "sql error:", data)
                    except Exception:
                        # logging is best effort; the real error follows
                        pass
                    datas = []
                    raise Exception("sql error")
            else:
                if row.strip() not in [b"", ""]:
                    data = json.loads(row)
            if data is not None:
                datas.append(data)
            i += 1
            if len(datas) == block_size:  # flush a full block
                _write_rows(datas, cols)
                datas = []
            if i % print_size == 0:
                BMOBJ.clog(ctx, i)
        BMOBJ.clog(ctx, f"total: {i}")
        if len(datas) > 0:  # flush the final partial block
            _write_rows(datas, cols)
            datas = []
    except Exception as e:
        BMOBJ.clog(ctx, "get data error", str(e), row)
        # Re-raise unless enough lines were read to be acceptable; in
        # that case the partial file is returned.
        if upcount is None or i < upcount:
            raise
    finally:
        # Closing is best effort: a failure here must not mask the result
        # or the original error.
        try:
            r.close()
        except Exception:
            pass
        try:
            session.close()
        except Exception:
            pass
    return unique_path
|
|
352
|
+
|
|
353
|
+
def get_data_csv_by_str(
    self,
    ctx,
    sql,
    block_size=100000,
    print_size=100000,
    read_timeout=600,
    upcount=None,
    retry_count=2,
    conn_timeout=30,
    tga_data_timeout=600,
):
    """
    @des: Retrying front end for ``get_data_csv_by_str_i``: streams the
    result of ``sql`` from the ThinkData open API into a local CSV file.

    A unique local path is generated once and handed to every attempt.
    Each attempt that raises has that path removed (as a file and as a
    folder) and is followed by the next attempt, whatever the error was.

    @params:
        retry_count: maximum number of attempts
        (all other arguments are forwarded unchanged to
        ``get_data_csv_by_str_i``)

    @return: whatever the first successful attempt returns; ``None`` if
        no attempt was made
    @raises: the exception of the last attempt when all of them fail
    """
    unique_path = self.gen_local_unique_file()
    last_error = None
    for _attempt in range(retry_count):
        try:
            return self.get_data_csv_by_str_i(
                ctx,
                unique_path,
                sql,
                block_size,
                print_size,
                read_timeout,
                upcount,
                conn_timeout,
                tga_data_timeout,
            )
        except Exception as err:
            last_error = err
            # drop whatever the failed attempt left behind, then retry
            BMOBJ.remove_file(unique_path)
            BMOBJ.remove_folder(unique_path)
    if last_error is not None:
        raise last_error
|
|
177
|
-
|
|
178
|
-
def get_data_raw_pyhive(
|
|
399
|
+
|
|
400
|
+
def get_data_raw_pyhive(
    self,
    ctx,
    sql,
    block_size=100000,
    fetch_size=10000,
    retry_count=2,
    read_timeout=300,
    upcount=None,
    print_size=100000,
    conn_timeout=30,
    tga_data_timeout=600,
):
    """
    @des: Compatibility wrapper -- keeps the pyhive-style signature but
    delegates to ``get_data_csv``, so existing callers do not have to be
    rewritten.

    @params:
        fetch_size: accepted for signature compatibility only; it is not
            forwarded
        (all other arguments are forwarded to ``get_data_csv``)

    @return: the value returned by ``get_data_csv``
    """
    # Arguments are reordered here to match get_data_csv's positional
    # parameter order.
    return self.get_data_csv(
        ctx,
        sql,
        block_size,
        print_size,
        read_timeout,
        upcount,
        retry_count,
        conn_timeout,
        tga_data_timeout,
    )
|
|
184
|
-
|
|
185
428
|
|
|
186
|
-
def get_data_raw_pyhive_bck(
|
|
429
|
+
def get_data_raw_pyhive_bck(
|
|
430
|
+
self,
|
|
431
|
+
ctx,
|
|
432
|
+
sql,
|
|
433
|
+
block_size=100000,
|
|
434
|
+
fetch_size=10000,
|
|
435
|
+
retry_count=2,
|
|
436
|
+
read_timeout=300,
|
|
437
|
+
upcount=None,
|
|
438
|
+
print_size=100000,
|
|
439
|
+
conn_timeout=30,
|
|
440
|
+
tga_data_timeout=600,
|
|
441
|
+
):
|
|
187
442
|
'''
|
|
188
443
|
@des:presto直连方式读取-----重试的方式----当get_data_csv接口出问题,则启用这个接口
|
|
189
444
|
tobj = ThinkDataQuery("http://queryhost:port/querySql", "查询token",
|
|
190
|
-
["presto直连的host", 直连的port])
|
|
445
|
+
["presto直连的host", 直连的port])
|
|
191
446
|
sql = """select * from v_event_7 where "$part_date"='2022-02-24' limit 100 """
|
|
192
447
|
unique_path = tobj.get_data_raw_pyhive({}, sql)
|
|
193
448
|
'''
|
|
@@ -197,25 +452,46 @@ class ThinkDataQuery(BaseTga):
|
|
|
197
452
|
for i in range(retry_count):
|
|
198
453
|
try:
|
|
199
454
|
result = self.get_data_raw_pyhive_i(
|
|
200
|
-
ctx,
|
|
455
|
+
ctx,
|
|
456
|
+
unique_path,
|
|
457
|
+
sql,
|
|
458
|
+
block_size,
|
|
459
|
+
fetch_size,
|
|
460
|
+
read_timeout,
|
|
461
|
+
upcount,
|
|
462
|
+
print_size,
|
|
463
|
+
conn_timeout,
|
|
464
|
+
)
|
|
201
465
|
return result
|
|
202
466
|
except Exception as e:
|
|
203
467
|
gol_e = e
|
|
204
468
|
BMOBJ.remove_file(unique_path)
|
|
205
469
|
BMOBJ.remove_folder(unique_path)
|
|
206
470
|
if str(e).find("Read timed out") != -1:
|
|
207
|
-
BMOBJ.clog(ctx, f
|
|
471
|
+
BMOBJ.clog(ctx, f"retry {i}")
|
|
208
472
|
continue
|
|
209
473
|
else:
|
|
210
474
|
break
|
|
211
475
|
if gol_e is not None:
|
|
212
476
|
raise gol_e
|
|
213
477
|
|
|
214
|
-
def get_data_raw_pyhive_i(
|
|
215
|
-
|
|
478
|
+
def get_data_raw_pyhive_i(
|
|
479
|
+
self,
|
|
480
|
+
ctx,
|
|
481
|
+
unique_path,
|
|
482
|
+
sql,
|
|
483
|
+
block_size=100000,
|
|
484
|
+
fetch_size=10000,
|
|
485
|
+
read_timeout=300,
|
|
486
|
+
upcount=None,
|
|
487
|
+
print_size=100000,
|
|
488
|
+
conn_timeout=30,
|
|
489
|
+
):
|
|
490
|
+
"""
|
|
216
491
|
@des: 内部调用
|
|
217
|
-
|
|
492
|
+
"""
|
|
218
493
|
from pyhive import presto
|
|
494
|
+
|
|
219
495
|
#
|
|
220
496
|
# unique_path = self.gen_local_unique_file()
|
|
221
497
|
# unique_path = "./test.csv"
|
|
@@ -224,20 +500,25 @@ class ThinkDataQuery(BaseTga):
|
|
|
224
500
|
session = requests.session()
|
|
225
501
|
#
|
|
226
502
|
datas = []
|
|
227
|
-
i = 0
|
|
503
|
+
i = 0 # 循环引用计数
|
|
228
504
|
icount = 0 # 数据的数量
|
|
229
505
|
cols = [] # 表头
|
|
230
506
|
try:
|
|
231
|
-
conn = presto.connect(
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
507
|
+
conn = presto.connect(
|
|
508
|
+
host=self.hive_conn_info[0],
|
|
509
|
+
port=int(self.hive_conn_info[1]),
|
|
510
|
+
username="ta",
|
|
511
|
+
catalog="hive",
|
|
512
|
+
schema="ta",
|
|
513
|
+
requests_session=session,
|
|
514
|
+
# 这里stream为true和false没有关系,fetchmany每次都会通过request_session传nexturl重新get获取数据
|
|
515
|
+
# 参考pyhive/presto.py的_fetch_more,每次fetchmany其实是多次fetchone
|
|
516
|
+
requests_kwargs={
|
|
517
|
+
"timeout": (conn_timeout, read_timeout),
|
|
518
|
+
"stream": True,
|
|
519
|
+
"verify": False,
|
|
520
|
+
},
|
|
521
|
+
)
|
|
241
522
|
cursor = conn.cursor()
|
|
242
523
|
cursor.execute(sql)
|
|
243
524
|
BMOBJ.clog(ctx, "文件大小")
|
|
@@ -262,6 +543,7 @@ class ThinkDataQuery(BaseTga):
|
|
|
262
543
|
def yx_fetch_many():
|
|
263
544
|
myres = cursor.fetchmany(fetch_size)
|
|
264
545
|
return myres
|
|
546
|
+
|
|
265
547
|
rows = yx_fetch_many()
|
|
266
548
|
while rows:
|
|
267
549
|
for row in rows:
|
|
@@ -271,8 +553,9 @@ class ThinkDataQuery(BaseTga):
|
|
|
271
553
|
i += 1
|
|
272
554
|
if len(datas) == block_size: # 1000000条保存一次
|
|
273
555
|
df = pandas.DataFrame(data=datas, columns=cols) # 保存表头
|
|
274
|
-
self.g_to_csv_notmp(
|
|
275
|
-
|
|
556
|
+
self.g_to_csv_notmp(
|
|
557
|
+
unique_path, df, index=None, mode="a", header=False
|
|
558
|
+
) # 追加保存
|
|
276
559
|
icount += block_size
|
|
277
560
|
datas = []
|
|
278
561
|
if i % print_size == 0:
|
|
@@ -282,8 +565,9 @@ class ThinkDataQuery(BaseTga):
|
|
|
282
565
|
BMOBJ.clog(ctx, f"total: {i}")
|
|
283
566
|
if len(datas) > 0: # 保存最后收尾的
|
|
284
567
|
df = pandas.DataFrame(data=datas, columns=cols) # 保存表头
|
|
285
|
-
self.g_to_csv_notmp(
|
|
286
|
-
|
|
568
|
+
self.g_to_csv_notmp(
|
|
569
|
+
unique_path, df, index=None, mode="a", header=False
|
|
570
|
+
) # 追加保存
|
|
287
571
|
icount += len(datas)
|
|
288
572
|
datas = []
|
|
289
573
|
except Exception as e:
|
|
@@ -298,13 +582,13 @@ class ThinkDataQuery(BaseTga):
|
|
|
298
582
|
try:
|
|
299
583
|
conn.close()
|
|
300
584
|
except:
|
|
301
|
-
pass
|
|
585
|
+
pass
|
|
302
586
|
try:
|
|
303
587
|
session.close()
|
|
304
588
|
except:
|
|
305
|
-
pass
|
|
589
|
+
pass
|
|
306
590
|
return unique_path
|
|
307
|
-
|
|
591
|
+
|
|
308
592
|
"""
|
|
309
593
|
数据打入接口--start--
|
|
310
594
|
"""
|
|
@@ -312,7 +596,7 @@ class ThinkDataQuery(BaseTga):
|
|
|
312
596
|
def set_tga_user_data(self, tga_app_no, sec_token, url, data, is_set_once=False):
|
|
313
597
|
"""
|
|
314
598
|
@des: 用户数据打入tga
|
|
315
|
-
@params:
|
|
599
|
+
@params:
|
|
316
600
|
tga_app_no: tga项目的id,注意不是app_id
|
|
317
601
|
sec_token: 安全的二次部署服务器的token
|
|
318
602
|
url: 二次部署的服务器打入的url
|
|
@@ -324,7 +608,7 @@ class ThinkDataQuery(BaseTga):
|
|
|
324
608
|
def set_tga_event_data_trac(self, tga_app_no, sec_token, url, data):
|
|
325
609
|
"""
|
|
326
610
|
@des: 普通事件数据打入tga
|
|
327
|
-
@params:
|
|
611
|
+
@params:
|
|
328
612
|
tga_app_no: tga项目的id,注意不是app_id
|
|
329
613
|
sec_token: 安全的二次部署服务器的token
|
|
330
614
|
url: 二次部署的服务器打入的url
|
|
@@ -336,7 +620,7 @@ class ThinkDataQuery(BaseTga):
|
|
|
336
620
|
def set_tga_event_data_trac_update(self, tga_app_no, sec_token, url, data):
|
|
337
621
|
"""
|
|
338
622
|
@des: 可更新事件数据打入tga, 重写部分数据
|
|
339
|
-
@params:
|
|
623
|
+
@params:
|
|
340
624
|
tga_app_no: tga项目的id,注意不是app_id
|
|
341
625
|
sec_token: 安全的二次部署服务器的token
|
|
342
626
|
url: 二次部署的服务器打入的url
|
|
@@ -345,10 +629,10 @@ class ThinkDataQuery(BaseTga):
|
|
|
345
629
|
"""
|
|
346
630
|
pass
|
|
347
631
|
|
|
348
|
-
def set_tga_event_data_trac_overwrite(self, tga_app_no,
|
|
632
|
+
def set_tga_event_data_trac_overwrite(self, tga_app_no, sec_token, url, data):
|
|
349
633
|
"""
|
|
350
634
|
@des: 可更新事件数据打入tga, 重写全部数据
|
|
351
|
-
@params:
|
|
635
|
+
@params:
|
|
352
636
|
tga_app_no: tga项目的id,注意不是app_id
|
|
353
637
|
sec_token: 安全的二次部署服务器的token
|
|
354
638
|
url: 二次部署的服务器打入的url
|
|
@@ -359,4 +643,4 @@ class ThinkDataQuery(BaseTga):
|
|
|
359
643
|
|
|
360
644
|
"""
|
|
361
645
|
数据打入接口----end-----
|
|
362
|
-
"""
|
|
646
|
+
"""
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__="0.18.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|