tfduck-bsd 0.18.2__py3-none-any.whl → 0.18.5__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registries.
Potentially problematic release.
This version of tfduck-bsd might be problematic.
- tfduck/__init__.py +1 -1
- tfduck/tga/tga.py +337 -57
- {tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/METADATA +1 -1
- {tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/RECORD +8 -8
- {tfduck_bsd-0.18.2.data → tfduck_bsd-0.18.5.data}/scripts/tfduck +0 -0
- {tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/LICENSE +0 -0
- {tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/WHEEL +0 -0
- {tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/top_level.txt +0 -0
tfduck/__init__.py
CHANGED
@@ -1 +1 @@
-__version__="0.18.2"
+__version__="0.18.5"
tfduck/tga/tga.py
CHANGED
@@ -11,9 +11,10 @@ finally:
         BMOBJ.remove_file(local_file)
 
 版本记录:
-pyhive=0.6.2
+pyhive=0.6.2
 requests=2.23.0 2.27.1
 """
+
 import requests
 import pandas
 import json
@@ -22,16 +23,18 @@ import os
 import uuid
 import urllib3
 from tfduck.common.defines import BMOBJ, Et
+
 # from django.conf import settings
 from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, FIRST_COMPLETED
 from tfduck.tga.base_tga import BaseTga
 
+
 class ThinkDataQuery(BaseTga):
     """
     @des: thinkdata openapi查询基础类----这个只能再thinkdata内网执行
     """
 
-    def __init__(self, query_uri, token, hive_conn_info=[
+    def __init__(self, query_uri, token, hive_conn_info=["host", 0]):
         """
         @des:初始化类
         """
@@ -52,53 +55,81 @@ class ThinkDataQuery(BaseTga):
         file_path = os.path.join(base_dir, real_name)
         return file_path
 
-    def g_to_csv_notmp(
+    def g_to_csv_notmp(
+        self, filepath, df, index=True, compression=None, mode="w", header=True
+    ):
         """
         @des: pandas生成csv文件---用于追加文件,不能用临时文件
-        compression: 压缩格式 ‘gzip’, ‘bz2’, ‘zip’, ‘xz’.
+        compression: 压缩格式 ‘gzip’, ‘bz2’, ‘zip’, ‘xz’.
         """
         tmp_filepath = filepath
         if index is None:  # 不保存行索引
             if compression is None:  # 不压缩
                 df.to_csv(tmp_filepath, index=None, mode=mode, header=header)
             else:
-                df.to_csv(
-
+                df.to_csv(
+                    tmp_filepath,
+                    index=None,
+                    compression=compression,
+                    mode=mode,
+                    header=header,
+                )
         else:
             if compression is None:  # 不压缩
                 df.to_csv(tmp_filepath, mode=mode, header=header)
             else:
-                df.to_csv(
-
+                df.to_csv(
+                    tmp_filepath, compression=compression, mode=mode, header=header
+                )
         return True
 
-    def get_data_csv_i(
+    def get_data_csv_i(
+        self,
+        ctx,
+        unique_path,
+        sql,
+        block_size=100000,
+        print_size=100000,
+        read_timeout=600,
+        upcount=None,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
         """
         @des:从thinkdata的openapi获取数据----流式,为了节省内存---配合下面的getquerycsv
         """
         session = requests.session()
-        post_data = {
+        post_data = {
+            "token": self.token,
+            "sql": sql,
+            "timeoutSeconds": tga_data_timeout,
+        }
         #
         unique_path = self.gen_local_unique_file()
         #
         BMOBJ.log_error("in query")
         #
-        r = session.post(
-
+        r = session.post(
+            self.query_uri,
+            data=post_data,
+            stream=True,
+            verify=False,
+            timeout=(conn_timeout, read_timeout),
+        )
         datas = []
-        i = 0
+        i = 0  # 循环引用计数
         icount = 0  # 数据的数量
         cols = []  # 表头
         try:
-            row =
+            row = ""
             # iter_lines iter_content, chunk_size字节, 下面取100M
-            for row in r.iter_lines(chunk_size=1024*1024*100):
+            for row in r.iter_lines(chunk_size=1024 * 1024 * 100):
                 if not row:
                     continue
                 data = None
                 if i == 0:  # 处理header
                     data = json.loads(row)
-                    if
+                    if data["return_code"] == 0:
                         cols = data["data"]["headers"]
                         df = pandas.DataFrame(data=[], columns=cols)  # 保存表头
                         self.g_to_csv_notmp(unique_path, df, index=None)
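The rewrapped g_to_csv_notmp calls in this hunk follow a write-header-once, append-without-header pattern: the first write emits only the column row, and each later block is flushed with mode="a", header=False. A minimal standalone sketch of that pattern, with hypothetical column names and output path:

import pandas

# Write the header row once, then append each block without a header,
# the way get_data_csv_i flushes every block_size rows.
cols = ["event", "ts"]  # hypothetical headers
path = "out.csv"        # hypothetical output file

pandas.DataFrame([], columns=cols).to_csv(path, index=None)  # header only

for block in ([["login", 1], ["pay", 2]], [["logout", 3]]):
    df = pandas.DataFrame(block, columns=cols)
    df.to_csv(path, index=None, mode="a", header=False)  # append data rows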
@@ -121,8 +152,9 @@ class ThinkDataQuery(BaseTga):
                     i += 1
                     if len(datas) == block_size:  # 1000000条保存一次
                         df = pandas.DataFrame(data=datas, columns=cols)  # 保存表头
-                        self.g_to_csv_notmp(
-
+                        self.g_to_csv_notmp(
+                            unique_path, df, index=None, mode="a", header=False
+                        )  # 追加保存
                         icount += block_size
                         datas = []
                     if i % print_size == 0:
@@ -130,8 +162,9 @@ class ThinkDataQuery(BaseTga):
             BMOBJ.clog(ctx, f"total: {i}")
             if len(datas) > 0:  # 保存最后收尾的
                 df = pandas.DataFrame(data=datas, columns=cols)  # 保存表头
-                self.g_to_csv_notmp(
-
+                self.g_to_csv_notmp(
+                    unique_path, df, index=None, mode="a", header=False
+                )  # 追加保存
                 icount += len(datas)
                 datas = []
         except Exception as e:
@@ -152,7 +185,18 @@ class ThinkDataQuery(BaseTga):
                 pass
         return unique_path
 
-    def get_data_csv(
+    def get_data_csv(
+        self,
+        ctx,
+        sql,
+        block_size=100000,
+        print_size=100000,
+        read_timeout=600,
+        upcount=None,
+        retry_count=2,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
         """
         @des:从thinkdata的openapi获取数据----流式,为了节省内存---配合下面的getquerycsv
         """
@@ -162,7 +206,16 @@ class ThinkDataQuery(BaseTga):
         for i in range(retry_count):
             try:
                 result = self.get_data_csv_i(
-                    ctx,
+                    ctx,
+                    unique_path,
+                    sql,
+                    block_size,
+                    print_size,
+                    read_timeout,
+                    upcount,
+                    conn_timeout,
+                    tga_data_timeout,
+                )
                 return result
             except Exception as e:
                 gol_e = e
@@ -178,20 +231,218 @@ class ThinkDataQuery(BaseTga):
                 continue
         if gol_e is not None:
             raise gol_e
-
-    def
+
+    def get_data_csv_by_str_i(
+        self,
+        ctx,
+        unique_path,
+        sql,
+        block_size=100000,
+        print_size=100000,
+        read_timeout=600,
+        upcount=None,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
+        """
+        @des:从thinkdata的openapi获取数据----流式,为了节省内存---配合下面的getquerycsv
+        """
+        session = requests.session()
+        post_data = {
+            "token": self.token,
+            "sql": sql,
+            "timeoutSeconds": tga_data_timeout,
+        }
+        #
+        unique_path = self.gen_local_unique_file()
+        #
+        BMOBJ.log_error("in query")
+        #
+        r = session.post(
+            self.query_uri,
+            data=post_data,
+            stream=True,
+            verify=False,
+            timeout=(conn_timeout, read_timeout),
+        )
+        datas = []
+        i = 0  # 循环引用计数
+        icount = 0  # 数据的数量
+        cols = []  # 表头
+        try:
+            row = ""
+            # iter_lines iter_content, chunk_size字节, 下面取100M
+            for row in r.iter_lines(chunk_size=1024 * 1024 * 100):
+                if not row:
+                    continue
+                data = None
+                if i == 0:  # 处理header
+                    data = json.loads(row)
+                    if data["return_code"] == 0:
+                        cols = data["data"]["headers"]
+                        df = pandas.DataFrame(
+                            data=[], columns=cols, dtype=object
+                        )  # 保存表头
+                        # 解决科学计数法的问题
+                        df = df.astype(str)
+                        df = df.astype("string")
+                        #
+                        self.g_to_csv_notmp(unique_path, df, index=None)
+                        data = None
+                    else:
+                        BMOBJ.log_error("sql error:", data)
+                        # BMOBJ.log_error(sql)
+                        try:
+                            BMOBJ.clog(ctx, "sql error:", data)
+                        except:
+                            pass
+                        datas = []
+                        raise Exception("sql error")
+                        break  # 表示查询出错,没有消息
+                else:
+                    if row.strip() not in [b"", ""]:
+                        data = json.loads(row)
+                if data is not None:
+                    datas.append(data)
+                    i += 1
+                    if len(datas) == block_size:  # 1000000条保存一次
+                        df = pandas.DataFrame(
+                            data=datas, columns=cols, dtype=object
+                        )  # 保存表头
+                        # 解决科学计数法的问题
+                        df = df.astype(str)
+                        df = df.astype("string")
+                        #
+                        self.g_to_csv_notmp(
+                            unique_path, df, index=None, mode="a", header=False
+                        )  # 追加保存
+                        icount += block_size
+                        datas = []
+                    if i % print_size == 0:
+                        BMOBJ.clog(ctx, i)
+            BMOBJ.clog(ctx, f"total: {i}")
+            if len(datas) > 0:  # 保存最后收尾的
+                df = pandas.DataFrame(data=datas, columns=cols, dtype=object)  # 保存表头
+                # 解决科学计数法的问题
+                df = df.astype(str)
+                df = df.astype("string")
+                #
+                self.g_to_csv_notmp(
+                    unique_path, df, index=None, mode="a", header=False
+                )  # 追加保存
+                icount += len(datas)
+                datas = []
+        except Exception as e:
+            BMOBJ.clog(ctx, "get data error", str(e), row)
+            if upcount is not None:
+                if i < upcount:  # 看是否达到可以接受的数量,否则重新查询
+                    raise e
+            else:
+                raise e
+        finally:
+            try:
+                r.close()
+            except:
+                pass
+            try:
+                session.close()
+            except:
+                pass
+        return unique_path
+
+    def get_data_csv_by_str(
+        self,
+        ctx,
+        sql,
+        block_size=100000,
+        print_size=100000,
+        read_timeout=600,
+        upcount=None,
+        retry_count=2,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
+        """
+        @des:从thinkdata的openapi获取数据----流式,为了节省内存---配合下面的getquerycsv
+        """
+        # unique_path = "./test.csv"
+        unique_path = self.gen_local_unique_file()
+        gol_e = None
+        for i in range(retry_count):
+            try:
+                result = self.get_data_csv_by_str_i(
+                    ctx,
+                    unique_path,
+                    sql,
+                    block_size,
+                    print_size,
+                    read_timeout,
+                    upcount,
+                    conn_timeout,
+                    tga_data_timeout,
+                )
+                return result
+            except Exception as e:
+                gol_e = e
+                BMOBJ.remove_file(unique_path)
+                BMOBJ.remove_folder(unique_path)
+                # modify by yx 2024-05-08---加强出错重试---
+                # if str(e).find("Read timed out") != -1 or str(e).find("Connection broken") != -1:
+                #     BMOBJ.clog(ctx, f'retry {i}')
+                #     continue
+                # else:
+                #     break
+                if 1:
+                    continue
+        if gol_e is not None:
+            raise gol_e
+
+    def get_data_raw_pyhive(
+        self,
+        ctx,
+        sql,
+        block_size=100000,
+        fetch_size=10000,
+        retry_count=2,
+        read_timeout=300,
+        upcount=None,
+        print_size=100000,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
         """
         @des: 接口装饰器--修改为get_data_csv,防止全面修改代码
         """
-        result = self.get_data_csv(
+        result = self.get_data_csv(
+            ctx,
+            sql,
+            block_size,
+            print_size,
+            read_timeout,
+            upcount,
+            retry_count,
+            conn_timeout,
+            tga_data_timeout,
+        )
         return result
-
 
-    def get_data_raw_pyhive_bck(
+    def get_data_raw_pyhive_bck(
+        self,
+        ctx,
+        sql,
+        block_size=100000,
+        fetch_size=10000,
+        retry_count=2,
+        read_timeout=300,
+        upcount=None,
+        print_size=100000,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
         '''
         @des:presto直连方式读取-----重试的方式----当get_data_csv接口出问题,则启用这个接口
         tobj = ThinkDataQuery("http://queryhost:port/querySql", "查询token",
-            ["presto直连的host", 直连的port])
+                              ["presto直连的host", 直连的port])
         sql = """select * from v_event_7 where "$part_date"='2022-02-24' limit 100 """
         unique_path = tobj.get_data_raw_pyhive({}, sql)
         '''
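Most of this hunk is the new get_data_csv_by_str_i, a string-typed copy of get_data_csv_i: every DataFrame is built with dtype=object and cast via astype(str) and astype("string") before writing, which, per its own comments, avoids scientific notation when numeric-looking values round-trip through pandas. A small sketch of the effect it targets, using a hypothetical ID value:

import pandas

row = ["9223372036854775807001"]  # hypothetical numeric-looking ID from a JSON line

# If the column is allowed to become float64, to_csv emits scientific notation:
print(pandas.DataFrame([row], columns=["uid"]).astype(float).to_csv(index=False))
# uid
# 9.223372036854776e+21

# dtype=object plus the explicit string casts keeps the literal digits:
df = pandas.DataFrame([row], columns=["uid"], dtype=object)
df = df.astype(str)
df = df.astype("string")
print(df.to_csv(index=False))
# uid
# 9223372036854775807001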
@@ -201,25 +452,46 @@ class ThinkDataQuery(BaseTga):
         for i in range(retry_count):
             try:
                 result = self.get_data_raw_pyhive_i(
-                    ctx,
+                    ctx,
+                    unique_path,
+                    sql,
+                    block_size,
+                    fetch_size,
+                    read_timeout,
+                    upcount,
+                    print_size,
+                    conn_timeout,
+                )
                 return result
             except Exception as e:
                 gol_e = e
                 BMOBJ.remove_file(unique_path)
                 BMOBJ.remove_folder(unique_path)
                 if str(e).find("Read timed out") != -1:
-                    BMOBJ.clog(ctx, f
+                    BMOBJ.clog(ctx, f"retry {i}")
                     continue
                 else:
                     break
         if gol_e is not None:
             raise gol_e
 
-    def get_data_raw_pyhive_i(
-
+    def get_data_raw_pyhive_i(
+        self,
+        ctx,
+        unique_path,
+        sql,
+        block_size=100000,
+        fetch_size=10000,
+        read_timeout=300,
+        upcount=None,
+        print_size=100000,
+        conn_timeout=30,
+    ):
+        """
         @des: 内部调用
-
+        """
         from pyhive import presto
+
         #
         # unique_path = self.gen_local_unique_file()
         # unique_path = "./test.csv"
@@ -228,20 +500,25 @@ class ThinkDataQuery(BaseTga):
         session = requests.session()
         #
         datas = []
-        i = 0
+        i = 0  # 循环引用计数
         icount = 0  # 数据的数量
         cols = []  # 表头
         try:
-            conn = presto.connect(
-
-
-
-
-
-
-
-
-
+            conn = presto.connect(
+                host=self.hive_conn_info[0],
+                port=int(self.hive_conn_info[1]),
+                username="ta",
+                catalog="hive",
+                schema="ta",
+                requests_session=session,
+                # 这里stream为true和false没有关系,fetchmany每次都会通过request_session传nexturl重新get获取数据
+                # 参考pyhive/presto.py的_fetch_more,每次fetchmany其实是多次fetchone
+                requests_kwargs={
+                    "timeout": (conn_timeout, read_timeout),
+                    "stream": True,
+                    "verify": False,
+                },
+            )
         cursor = conn.cursor()
         cursor.execute(sql)
         BMOBJ.clog(ctx, "文件大小")
@@ -266,6 +543,7 @@ class ThinkDataQuery(BaseTga):
             def yx_fetch_many():
                 myres = cursor.fetchmany(fetch_size)
                 return myres
+
             rows = yx_fetch_many()
             while rows:
                 for row in rows:
@@ -275,8 +553,9 @@ class ThinkDataQuery(BaseTga):
                         i += 1
                         if len(datas) == block_size:  # 1000000条保存一次
                             df = pandas.DataFrame(data=datas, columns=cols)  # 保存表头
-                            self.g_to_csv_notmp(
-
+                            self.g_to_csv_notmp(
+                                unique_path, df, index=None, mode="a", header=False
+                            )  # 追加保存
                             icount += block_size
                             datas = []
                         if i % print_size == 0:
@@ -286,8 +565,9 @@ class ThinkDataQuery(BaseTga):
             BMOBJ.clog(ctx, f"total: {i}")
             if len(datas) > 0:  # 保存最后收尾的
                 df = pandas.DataFrame(data=datas, columns=cols)  # 保存表头
-                self.g_to_csv_notmp(
-
+                self.g_to_csv_notmp(
+                    unique_path, df, index=None, mode="a", header=False
+                )  # 追加保存
                 icount += len(datas)
                 datas = []
         except Exception as e:
@@ -302,13 +582,13 @@ class ThinkDataQuery(BaseTga):
             try:
                 conn.close()
             except:
-                pass
+                pass
             try:
                 session.close()
             except:
-                pass
+                pass
         return unique_path
-
+
     """
     数据打入接口--start--
     """
@@ -316,7 +596,7 @@ class ThinkDataQuery(BaseTga):
     def set_tga_user_data(self, tga_app_no, sec_token, url, data, is_set_once=False):
         """
         @des: 用户数据打入tga
-        @params:
+        @params:
             tga_app_no: tga项目的id,注意不是app_id
             sec_token: 安全的二次部署服务器的token
             url: 二次部署的服务器打入的url
@@ -328,7 +608,7 @@ class ThinkDataQuery(BaseTga):
     def set_tga_event_data_trac(self, tga_app_no, sec_token, url, data):
         """
         @des: 普通事件数据打入tga
-        @params:
+        @params:
             tga_app_no: tga项目的id,注意不是app_id
             sec_token: 安全的二次部署服务器的token
             url: 二次部署的服务器打入的url
@@ -340,7 +620,7 @@ class ThinkDataQuery(BaseTga):
     def set_tga_event_data_trac_update(self, tga_app_no, sec_token, url, data):
         """
         @des: 可更新事件数据打入tga, 重写部分数据
-        @params:
+        @params:
            tga_app_no: tga项目的id,注意不是app_id
             sec_token: 安全的二次部署服务器的token
             url: 二次部署的服务器打入的url
@@ -349,10 +629,10 @@ class ThinkDataQuery(BaseTga):
         """
         pass
 
-    def set_tga_event_data_trac_overwrite(self, tga_app_no,
+    def set_tga_event_data_trac_overwrite(self, tga_app_no, sec_token, url, data):
         """
         @des: 可更新事件数据打入tga, 重写全部数据
-        @params:
+        @params:
             tga_app_no: tga项目的id,注意不是app_id
             sec_token: 安全的二次部署服务器的token
             url: 二次部署的服务器打入的url
@@ -363,4 +643,4 @@ class ThinkDataQuery(BaseTga):
 
     """
     数据打入接口----end-----
-    """
+    """
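Putting the pieces together with the docstring example in get_data_raw_pyhive_bck above, 0.18.5 usage would look roughly like the following; the endpoint, token, and Presto host/port are placeholders, and the module path is assumed from this diff:

from tfduck.tga.tga import ThinkDataQuery

# Placeholder endpoint/token/host, following the docstring example above.
tobj = ThinkDataQuery("http://queryhost:port/querySql", "query-token",
                      ["presto-host", 8080])
sql = """select * from v_event_7 where "$part_date"='2022-02-24' limit 100 """
csv_path = tobj.get_data_raw_pyhive({}, sql)      # now proxies to get_data_csv
str_csv_path = tobj.get_data_csv_by_str({}, sql)  # new in 0.18.5: all-string columns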
{tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-tfduck/__init__.py,sha256=
+tfduck/__init__.py,sha256=ez0D39S5UwmrhC-yB0KboIMoVyM6XotVwZfeObtnCd8,20
 tfduck/main.py,sha256=zNTC16wkwGJ0QX1-i8vzlGophOxmFuO4SLsF1tkjsbE,14670
 tfduck/bdp_sdk_py/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tfduck/bdp_sdk_py/example.py,sha256=Xq1_gcSyu0zho_iMmMYfYgMblcgF8a-GwRBWPTw0FuU,2879
@@ -26,16 +26,16 @@ tfduck/tga/base_tga.py,sha256=rg1BHaIKGSwUVbOlWSA3Y1arB8J_caemjTjH9jh8GYM,2184
 tfduck/tga/predict_sql_ltv.py,sha256=25rpOZdHyMcEU3O8u67oUpLsTiZburEPsvXR38hTUJ0,3589
 tfduck/tga/predict_sql_retain.py,sha256=Nsl0lSZ_CC8j_GB4jLrJgkoqqDRAAPYwzo7vAOYZ764,19772
 tfduck/tga/predict_sql_yh.py,sha256=uYeuCZX2btxO-pvjrhlmuG35PquJbLvVtQEuEZHu8Cs,3588
-tfduck/tga/tga.py,sha256=
+tfduck/tga/tga.py,sha256=M72cZ1s0MbXxk63tQFBr1CP3junwg_sKe8jfg8t8PWc,22645
 tfduck/tga/tga_test.py,sha256=A3n2LdvgQWlkX6E54K6cnsIUeYrgHlzG3QYbA7ZKgHk,2750
 tfduck/tga/train_sql_ltv.py,sha256=VZqGy0FbwLHkjQClV1IIMd6_E_H6BrIqL5p_WUfTUXc,26668
 tfduck/tga/train_sql_retain.py,sha256=AIOJKWC37j4UdM8JLFS6LdJFtABjMq9gOi3xvAs4fAE,24335
 tfduck/tga/train_sql_yh.py,sha256=nb5BO_vOv0eKY2kOVt5ZOfM1cvfI8j2no8cYpCL_rNE,24378
 tfduck/thinkdata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tfduck/thinkdata/query.py,sha256=DsfcxjZrc0ZFTwN2pI5fKdM1Bwr6ageoPcA2MP3r2bE,1314
-tfduck_bsd-0.18.2.data/scripts/tfduck,sha256=
-tfduck_bsd-0.18.2.dist-info/LICENSE,sha256=
-tfduck_bsd-0.18.2.dist-info/METADATA,sha256=
-tfduck_bsd-0.18.2.dist-info/WHEEL,sha256=
-tfduck_bsd-0.18.2.dist-info/top_level.txt,sha256=
-tfduck_bsd-0.18.2.dist-info/RECORD,,
+tfduck_bsd-0.18.5.data/scripts/tfduck,sha256=UsuoAs4peJW4I-e6Gn91gEToP_YyuUp-rUUg3ObKneY,192
+tfduck_bsd-0.18.5.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
+tfduck_bsd-0.18.5.dist-info/METADATA,sha256=VVIXxggWIfjNbASFNaVmPYKFVlbcwnF0PCTizHuN0TE,1003
+tfduck_bsd-0.18.5.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
+tfduck_bsd-0.18.5.dist-info/top_level.txt,sha256=503etRkoyeI1VYcAwe5KpD5Bamhx0R0y2ofkE8HpRDA,7
+tfduck_bsd-0.18.5.dist-info/RECORD,,
{tfduck_bsd-0.18.2.data → tfduck_bsd-0.18.5.data}/scripts/tfduck: file without changes
{tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/LICENSE: file without changes
{tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/WHEEL: file without changes
{tfduck_bsd-0.18.2.dist-info → tfduck_bsd-0.18.5.dist-info}/top_level.txt: file without changes