tfduck-bsd 0.18.2__tar.gz → 0.18.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tfduck-bsd might be problematic.

Files changed (47)
  1. {tfduck-bsd-0.18.2/tfduck_bsd.egg-info → tfduck-bsd-0.18.3}/PKG-INFO +1 -1
  2. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/setup.py +1 -1
  3. tfduck-bsd-0.18.3/tfduck/__init__.py +1 -0
  4. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/tga.py +337 -57
  5. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3/tfduck_bsd.egg-info}/PKG-INFO +1 -1
  6. tfduck-bsd-0.18.2/tfduck/__init__.py +0 -1
  7. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/LICENSE +0 -0
  8. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/README.md +0 -0
  9. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/bin/tfduck +0 -0
  10. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/setup.cfg +0 -0
  11. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/__init__.py +0 -0
  12. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/__init__.py +0 -0
  13. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/bdpmanager.py +0 -0
  14. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/table_config.py +0 -0
  15. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/example.py +0 -0
  16. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/__init__.py +0 -0
  17. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/opends.py +0 -0
  18. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/sdk.py +0 -0
  19. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/common/__init__.py +0 -0
  20. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/common/defines.py +0 -0
  21. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/common/defines_clean.py +0 -0
  22. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/common/extendEncoder.py +0 -0
  23. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/main.py +0 -0
  24. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/oss/__init__.py +0 -0
  25. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/oss/oss.py +0 -0
  26. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/__init__.py +0 -0
  27. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/k8s_manage.py +0 -0
  28. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/spark_manage.py +0 -0
  29. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/s3/__init__.py +0 -0
  30. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/s3/s3oper.py +0 -0
  31. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/sagemaker/__init__.py +0 -0
  32. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/sagemaker/saoper.py +0 -0
  33. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/__init__.py +0 -0
  34. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/base_tga.py +0 -0
  35. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_ltv.py +0 -0
  36. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_retain.py +0 -0
  37. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_yh.py +0 -0
  38. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/tga_test.py +0 -0
  39. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_ltv.py +0 -0
  40. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_retain.py +0 -0
  41. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_yh.py +0 -0
  42. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/thinkdata/__init__.py +0 -0
  43. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/thinkdata/query.py +0 -0
  44. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/SOURCES.txt +0 -0
  45. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/dependency_links.txt +0 -0
  46. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/requires.txt +0 -0
  47. {tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/top_level.txt +0 -0
{tfduck-bsd-0.18.2/tfduck_bsd.egg-info → tfduck-bsd-0.18.3}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tfduck-bsd
- Version: 0.18.2
+ Version: 0.18.3
  Summary: A small example package
  Home-page: UNKNOWN
  Author: yuanxiao

{tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/setup.py

@@ -8,7 +8,7 @@ with open("README.md", "r") as fh:

  setuptools.setup(
  name="tfduck-bsd",
- version="0.18.2",
+ version="0.18.3",
  author="yuanxiao",
  author_email="yuan6785@163.com",
  description="A small example package",

tfduck-bsd-0.18.3/tfduck/__init__.py

@@ -0,0 +1 @@
+ __version__="0.18.3"
{tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3}/tfduck/tga/tga.py

@@ -11,9 +11,10 @@ finally:
  BMOBJ.remove_file(local_file)

  Version history:
- pyhive=0.6.2
+ pyhive=0.6.2
  requests=2.23.0 2.27.1
  """
+
  import requests
  import pandas
  import json
@@ -22,16 +23,18 @@ import os
  import uuid
  import urllib3
  from tfduck.common.defines import BMOBJ, Et
+
  # from django.conf import settings
  from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, FIRST_COMPLETED
  from tfduck.tga.base_tga import BaseTga

+
  class ThinkDataQuery(BaseTga):
  """
  @des: ThinkData OpenAPI query base class -- can only be run inside the ThinkData intranet
  """

- def __init__(self, query_uri, token, hive_conn_info=['host', 0]):
+ def __init__(self, query_uri, token, hive_conn_info=["host", 0]):
  """
  @des: initialize the class
  """
@@ -52,53 +55,81 @@ class ThinkDataQuery(BaseTga):
  file_path = os.path.join(base_dir, real_name)
  return file_path

- def g_to_csv_notmp(self, filepath, df, index=True, compression=None, mode='w', header=True):
+ def g_to_csv_notmp(
+ self, filepath, df, index=True, compression=None, mode="w", header=True
+ ):
  """
  @des: generate a CSV file with pandas -- used for appending to a file, so a temp file cannot be used
- compression: compression format ‘gzip’, ‘bz2’, ‘zip’, ‘xz’.
+ compression: compression format ‘gzip’, ‘bz2’, ‘zip’, ‘xz’.
  """
  tmp_filepath = filepath
  if index is None: # do not save the row index
  if compression is None: # no compression
  df.to_csv(tmp_filepath, index=None, mode=mode, header=header)
  else:
- df.to_csv(tmp_filepath, index=None,
- compression=compression, mode=mode, header=header)
+ df.to_csv(
+ tmp_filepath,
+ index=None,
+ compression=compression,
+ mode=mode,
+ header=header,
+ )
  else:
  if compression is None: # no compression
  df.to_csv(tmp_filepath, mode=mode, header=header)
  else:
- df.to_csv(tmp_filepath, compression=compression,
- mode=mode, header=header)
+ df.to_csv(
+ tmp_filepath, compression=compression, mode=mode, header=header
+ )
  return True

- def get_data_csv_i(self, ctx, unique_path, sql, block_size=100000, print_size=100000, read_timeout=600, upcount=None, conn_timeout=30, tga_data_timeout=600):
+ def get_data_csv_i(
+ self,
+ ctx,
+ unique_path,
+ sql,
+ block_size=100000,
+ print_size=100000,
+ read_timeout=600,
+ upcount=None,
+ conn_timeout=30,
+ tga_data_timeout=600,
+ ):
  """
  @des: fetch data from the ThinkData OpenAPI -- streamed to save memory -- used together with getquerycsv below
  """
  session = requests.session()
- post_data = {'token': self.token, 'sql': sql, 'timeoutSeconds': tga_data_timeout}
+ post_data = {
+ "token": self.token,
+ "sql": sql,
+ "timeoutSeconds": tga_data_timeout,
+ }
  #
  unique_path = self.gen_local_unique_file()
  #
  BMOBJ.log_error("in query")
  #
- r = session.post(self.query_uri, data=post_data, stream=True,
- verify=False, timeout=(conn_timeout, read_timeout))
+ r = session.post(
+ self.query_uri,
+ data=post_data,
+ stream=True,
+ verify=False,
+ timeout=(conn_timeout, read_timeout),
+ )
  datas = []
- i = 0 # loop counter
+ i = 0  # loop counter
  icount = 0 # number of rows
  cols = [] # column headers
  try:
- row = ''
+ row = ""
  # iter_lines iter_content, chunk_size in bytes, 100 MB below
- for row in r.iter_lines(chunk_size=1024*1024*100):
+ for row in r.iter_lines(chunk_size=1024 * 1024 * 100):
  if not row:
  continue
  data = None
  if i == 0: # handle the header
  data = json.loads(row)
- if(data["return_code"] == 0):
+ if data["return_code"] == 0:
  cols = data["data"]["headers"]
  df = pandas.DataFrame(data=[], columns=cols) # save the header
  self.g_to_csv_notmp(unique_path, df, index=None)
@@ -121,8 +152,9 @@ class ThinkDataQuery(BaseTga):
  i += 1
  if len(datas) == block_size: # save every 1,000,000 rows
  df = pandas.DataFrame(data=datas, columns=cols) # save the header
- self.g_to_csv_notmp(unique_path, df, index=None,
- mode='a', header=False) # append
+ self.g_to_csv_notmp(
+ unique_path, df, index=None, mode="a", header=False
+ ) # append
  icount += block_size
  datas = []
  if i % print_size == 0:
@@ -130,8 +162,9 @@ class ThinkDataQuery(BaseTga):
  BMOBJ.clog(ctx, f"total: {i}")
  if len(datas) > 0: # save the remaining tail
  df = pandas.DataFrame(data=datas, columns=cols) # save the header
- self.g_to_csv_notmp(unique_path, df, index=None,
- mode='a', header=False) # append
+ self.g_to_csv_notmp(
+ unique_path, df, index=None, mode="a", header=False
+ ) # append
  icount += len(datas)
  datas = []
  except Exception as e:
@@ -152,7 +185,18 @@ class ThinkDataQuery(BaseTga):
  pass
  return unique_path

- def get_data_csv(self, ctx, sql, block_size=100000, print_size=100000, read_timeout=600, upcount=None, retry_count=2, conn_timeout=30, tga_data_timeout=600):
+ def get_data_csv(
+ self,
+ ctx,
+ sql,
+ block_size=100000,
+ print_size=100000,
+ read_timeout=600,
+ upcount=None,
+ retry_count=2,
+ conn_timeout=30,
+ tga_data_timeout=600,
+ ):
  """
  @des: fetch data from the ThinkData OpenAPI -- streamed to save memory -- used together with getquerycsv below
  """
@@ -162,7 +206,16 @@ class ThinkDataQuery(BaseTga):
  for i in range(retry_count):
  try:
  result = self.get_data_csv_i(
- ctx, unique_path, sql, block_size, print_size, read_timeout, upcount, conn_timeout, tga_data_timeout)
+ ctx,
+ unique_path,
+ sql,
+ block_size,
+ print_size,
+ read_timeout,
+ upcount,
+ conn_timeout,
+ tga_data_timeout,
+ )
  return result
  except Exception as e:
  gol_e = e
@@ -178,20 +231,218 @@ class ThinkDataQuery(BaseTga):
  continue
  if gol_e is not None:
  raise gol_e
-
- def get_data_raw_pyhive(self, ctx, sql, block_size=100000, fetch_size=10000, retry_count=2, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30, tga_data_timeout=600):
+
+ def get_data_csv_by_str_i(
+ self,
+ ctx,
+ unique_path,
+ sql,
+ block_size=100000,
+ print_size=100000,
+ read_timeout=600,
+ upcount=None,
+ conn_timeout=30,
+ tga_data_timeout=600,
+ ):
+ """
+ @des: fetch data from the ThinkData OpenAPI -- streamed to save memory -- used together with getquerycsv below
+ """
+ session = requests.session()
+ post_data = {
+ "token": self.token,
+ "sql": sql,
+ "timeoutSeconds": tga_data_timeout,
+ }
+ #
+ unique_path = self.gen_local_unique_file()
+ #
+ BMOBJ.log_error("in query")
+ #
+ r = session.post(
+ self.query_uri,
+ data=post_data,
+ stream=True,
+ verify=False,
+ timeout=(conn_timeout, read_timeout),
+ )
+ datas = []
+ i = 0  # loop counter
+ icount = 0  # number of rows
+ cols = []  # column headers
+ try:
+ row = ""
+ # iter_lines iter_content, chunk_size in bytes, 100 MB below
+ for row in r.iter_lines(chunk_size=1024 * 1024 * 100):
+ if not row:
+ continue
+ data = None
+ if i == 0: # handle the header
+ data = json.loads(row)
+ if data["return_code"] == 0:
+ cols = data["data"]["headers"]
+ df = pandas.DataFrame(
+ data=[], columns=cols, type=object
+ ) # save the header
+ # work around the scientific-notation problem
+ df = df.astype(str)
+ df = df.astype("string")
+ #
+ self.g_to_csv_notmp(unique_path, df, index=None)
+ data = None
+ else:
+ BMOBJ.log_error("sql error:", data)
+ # BMOBJ.log_error(sql)
+ try:
+ BMOBJ.clog(ctx, "sql error:", data)
+ except:
+ pass
+ datas = []
+ raise Exception("sql error")
+ break # the query failed, there are no more messages
+ else:
+ if row.strip() not in [b"", ""]:
+ data = json.loads(row)
+ if data is not None:
+ datas.append(data)
+ i += 1
+ if len(datas) == block_size: # save every 1,000,000 rows
+ df = pandas.DataFrame(
+ data=datas, columns=cols, type=object
+ ) # save the header
+ # work around the scientific-notation problem
+ df = df.astype(str)
+ df = df.astype("string")
+ #
+ self.g_to_csv_notmp(
+ unique_path, df, index=None, mode="a", header=False
+ ) # append
+ icount += block_size
+ datas = []
+ if i % print_size == 0:
+ BMOBJ.clog(ctx, i)
+ BMOBJ.clog(ctx, f"total: {i}")
+ if len(datas) > 0: # save the remaining tail
+ df = pandas.DataFrame(data=datas, columns=cols, type=object) # save the header
+ # work around the scientific-notation problem
+ df = df.astype(str)
+ df = df.astype("string")
+ #
+ self.g_to_csv_notmp(
+ unique_path, df, index=None, mode="a", header=False
+ ) # append
+ icount += len(datas)
+ datas = []
+ except Exception as e:
+ BMOBJ.clog(ctx, "get data error", str(e), row)
+ if upcount is not None:
+ if i < upcount: # check whether an acceptable count was reached, otherwise re-query
+ raise e
+ else:
+ raise e
+ finally:
+ try:
+ r.close()
+ except:
+ pass
+ try:
+ session.close()
+ except:
+ pass
+ return unique_path
+
+ def get_data_csv_by_str(
+ self,
+ ctx,
+ sql,
+ block_size=100000,
+ print_size=100000,
+ read_timeout=600,
+ upcount=None,
+ retry_count=2,
+ conn_timeout=30,
+ tga_data_timeout=600,
+ ):
+ """
+ @des: fetch data from the ThinkData OpenAPI -- streamed to save memory -- used together with getquerycsv below
+ """
+ # unique_path = "./test.csv"
+ unique_path = self.gen_local_unique_file()
+ gol_e = None
+ for i in range(retry_count):
+ try:
+ result = self.get_data_csv_by_str_i(
+ ctx,
+ unique_path,
+ sql,
+ block_size,
+ print_size,
+ read_timeout,
+ upcount,
+ conn_timeout,
+ tga_data_timeout,
+ )
+ return result
+ except Exception as e:
+ gol_e = e
+ BMOBJ.remove_file(unique_path)
+ BMOBJ.remove_folder(unique_path)
+ # modify by yx 2024-05-08 -- stronger retry on errors --
+ # if str(e).find("Read timed out") != -1 or str(e).find("Connection broken") != -1:
+ # BMOBJ.clog(ctx, f'retry {i}')
+ # continue
+ # else:
+ # break
+ if 1:
+ continue
+ if gol_e is not None:
+ raise gol_e
+
+ def get_data_raw_pyhive(
+ self,
+ ctx,
+ sql,
+ block_size=100000,
+ fetch_size=10000,
+ retry_count=2,
+ read_timeout=300,
+ upcount=None,
+ print_size=100000,
+ conn_timeout=30,
+ tga_data_timeout=600,
+ ):
  """
  @des: interface decorator -- redirected to get_data_csv so existing code does not have to change
  """
- result = self.get_data_csv(ctx, sql, block_size, print_size, read_timeout, upcount, retry_count, conn_timeout, tga_data_timeout)
+ result = self.get_data_csv(
+ ctx,
+ sql,
+ block_size,
+ print_size,
+ read_timeout,
+ upcount,
+ retry_count,
+ conn_timeout,
+ tga_data_timeout,
+ )
  return result
-

- def get_data_raw_pyhive_bck(self, ctx, sql, block_size=100000, fetch_size=10000, retry_count=2, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30, tga_data_timeout=600):
+ def get_data_raw_pyhive_bck(
+ self,
+ ctx,
+ sql,
+ block_size=100000,
+ fetch_size=10000,
+ retry_count=2,
+ read_timeout=300,
+ upcount=None,
+ print_size=100000,
+ conn_timeout=30,
+ tga_data_timeout=600,
+ ):
  '''
  @des: read via direct Presto connection -- with retries -- used when the get_data_csv interface has problems
  tobj = ThinkDataQuery("http://queryhost:port/querySql", "query token",
- ["presto direct-connect host", direct-connect port])
+ ["presto direct-connect host", direct-connect port])
  sql = """select * from v_event_7 where "$part_date"='2022-02-24' limit 100 """
  unique_path = tobj.get_data_raw_pyhive({}, sql)
  '''
@@ -201,25 +452,46 @@ class ThinkDataQuery(BaseTga):
  for i in range(retry_count):
  try:
  result = self.get_data_raw_pyhive_i(
- ctx, unique_path, sql, block_size, fetch_size, read_timeout, upcount, print_size, conn_timeout)
+ ctx,
+ unique_path,
+ sql,
+ block_size,
+ fetch_size,
+ read_timeout,
+ upcount,
+ print_size,
+ conn_timeout,
+ )
  return result
  except Exception as e:
  gol_e = e
  BMOBJ.remove_file(unique_path)
  BMOBJ.remove_folder(unique_path)
  if str(e).find("Read timed out") != -1:
- BMOBJ.clog(ctx, f'retry {i}')
+ BMOBJ.clog(ctx, f"retry {i}")
  continue
  else:
  break
  if gol_e is not None:
  raise gol_e

- def get_data_raw_pyhive_i(self, ctx, unique_path, sql, block_size=100000, fetch_size=10000, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30):
- '''
+ def get_data_raw_pyhive_i(
+ self,
+ ctx,
+ unique_path,
+ sql,
+ block_size=100000,
+ fetch_size=10000,
+ read_timeout=300,
+ upcount=None,
+ print_size=100000,
+ conn_timeout=30,
+ ):
+ """
  @des: internal call
- '''
+ """
  from pyhive import presto
+
  #
  # unique_path = self.gen_local_unique_file()
  # unique_path = "./test.csv"
@@ -228,20 +500,25 @@ class ThinkDataQuery(BaseTga):
  session = requests.session()
  #
  datas = []
- i = 0 # loop counter
+ i = 0  # loop counter
  icount = 0 # number of rows
  cols = [] # column headers
  try:
- conn = presto.connect(host=self.hive_conn_info[0],
- port=int(self.hive_conn_info[1]),
- username='ta', catalog='hive',
- schema='ta',
- requests_session = session,
- # whether stream is True or False makes no difference here; each fetchmany re-GETs the next URL through request_session
- # see _fetch_more in pyhive/presto.py; each fetchmany is really multiple fetchone calls
- requests_kwargs={"timeout": (
- conn_timeout, read_timeout), "stream": True, "verify": False}
- )
+ conn = presto.connect(
+ host=self.hive_conn_info[0],
+ port=int(self.hive_conn_info[1]),
+ username="ta",
+ catalog="hive",
+ schema="ta",
+ requests_session=session,
+ # whether stream is True or False makes no difference here; each fetchmany re-GETs the next URL through request_session
+ # see _fetch_more in pyhive/presto.py; each fetchmany is really multiple fetchone calls
+ requests_kwargs={
+ "timeout": (conn_timeout, read_timeout),
+ "stream": True,
+ "verify": False,
+ },
+ )
  cursor = conn.cursor()
  cursor.execute(sql)
  BMOBJ.clog(ctx, "文件大小")
@@ -266,6 +543,7 @@ class ThinkDataQuery(BaseTga):
  def yx_fetch_many():
  myres = cursor.fetchmany(fetch_size)
  return myres
+
  rows = yx_fetch_many()
  while rows:
  for row in rows:
@@ -275,8 +553,9 @@ class ThinkDataQuery(BaseTga):
  i += 1
  if len(datas) == block_size: # save every 1,000,000 rows
  df = pandas.DataFrame(data=datas, columns=cols) # save the header
- self.g_to_csv_notmp(unique_path, df, index=None,
- mode='a', header=False) # append
+ self.g_to_csv_notmp(
+ unique_path, df, index=None, mode="a", header=False
+ ) # append
  icount += block_size
  datas = []
  if i % print_size == 0:
@@ -286,8 +565,9 @@ class ThinkDataQuery(BaseTga):
  BMOBJ.clog(ctx, f"total: {i}")
  if len(datas) > 0: # save the remaining tail
  df = pandas.DataFrame(data=datas, columns=cols) # save the header
- self.g_to_csv_notmp(unique_path, df, index=None,
- mode='a', header=False) # append
+ self.g_to_csv_notmp(
+ unique_path, df, index=None, mode="a", header=False
+ ) # append
  icount += len(datas)
  datas = []
  except Exception as e:
@@ -302,13 +582,13 @@ class ThinkDataQuery(BaseTga):
  try:
  conn.close()
  except:
- pass
+ pass
  try:
  session.close()
  except:
- pass
+ pass
  return unique_path
-
+
  """
  Data ingest interfaces -- start --
  """
@@ -316,7 +596,7 @@ class ThinkDataQuery(BaseTga):
  def set_tga_user_data(self, tga_app_no, sec_token, url, data, is_set_once=False):
  """
  @des: push user data into TGA
- @params:
+ @params:
  tga_app_no: the TGA project id (note: not the app_id)
  sec_token: token for the secure secondary-deployment server
  url: ingest URL of the secondary-deployment server
@@ -328,7 +608,7 @@ class ThinkDataQuery(BaseTga):
  def set_tga_event_data_trac(self, tga_app_no, sec_token, url, data):
  """
  @des: push ordinary event data into TGA
- @params:
+ @params:
  tga_app_no: the TGA project id (note: not the app_id)
  sec_token: token for the secure secondary-deployment server
  url: ingest URL of the secondary-deployment server
@@ -340,7 +620,7 @@ class ThinkDataQuery(BaseTga):
  def set_tga_event_data_trac_update(self, tga_app_no, sec_token, url, data):
  """
  @des: push updatable event data into TGA, rewriting part of the data
- @params:
+ @params:
  tga_app_no: the TGA project id (note: not the app_id)
  sec_token: token for the secure secondary-deployment server
  url: ingest URL of the secondary-deployment server
@@ -349,10 +629,10 @@ class ThinkDataQuery(BaseTga):
  """
  pass

- def set_tga_event_data_trac_overwrite(self, tga_app_no, sec_token, url, data):
+ def set_tga_event_data_trac_overwrite(self, tga_app_no, sec_token, url, data):
  """
  @des: push updatable event data into TGA, rewriting all of the data
- @params:
+ @params:
  tga_app_no: the TGA project id (note: not the app_id)
  sec_token: token for the secure secondary-deployment server
  url: ingest URL of the secondary-deployment server
@@ -363,4 +643,4 @@ class ThinkDataQuery(BaseTga):

  """
  Data ingest interfaces -- end --
- """
+ """
{tfduck-bsd-0.18.2 → tfduck-bsd-0.18.3/tfduck_bsd.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tfduck-bsd
- Version: 0.18.2
+ Version: 0.18.3
  Summary: A small example package
  Home-page: UNKNOWN
  Author: yuanxiao

tfduck-bsd-0.18.2/tfduck/__init__.py

@@ -1 +0,0 @@
- __version__="0.18.2"