tfduck-bsd 0.18.1__tar.gz → 0.18.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tfduck-bsd might be problematic.

Files changed (47)
  1. {tfduck-bsd-0.18.1/tfduck_bsd.egg-info → tfduck-bsd-0.18.3}/PKG-INFO +1 -1
  2. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/setup.py +1 -1
  3. tfduck-bsd-0.18.3/tfduck/__init__.py +1 -0
  4. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/tga.py +344 -60
  5. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3/tfduck_bsd.egg-info}/PKG-INFO +1 -1
  6. tfduck-bsd-0.18.1/tfduck/__init__.py +0 -1
  7. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/LICENSE +0 -0
  8. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/README.md +0 -0
  9. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/bin/tfduck +0 -0
  10. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/setup.cfg +0 -0
  11. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/__init__.py +0 -0
  12. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/__init__.py +0 -0
  13. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/bdpmanager.py +0 -0
  14. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/config/table_config.py +0 -0
  15. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/example.py +0 -0
  16. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/__init__.py +0 -0
  17. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/opends.py +0 -0
  18. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/bdp_sdk_py/opends/sdk.py +0 -0
  19. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/common/__init__.py +0 -0
  20. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/common/defines.py +0 -0
  21. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/common/defines_clean.py +0 -0
  22. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/common/extendEncoder.py +0 -0
  23. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/main.py +0 -0
  24. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/oss/__init__.py +0 -0
  25. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/oss/oss.py +0 -0
  26. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/__init__.py +0 -0
  27. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/k8s_manage.py +0 -0
  28. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/pyspark_k8s/spark_manage.py +0 -0
  29. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/s3/__init__.py +0 -0
  30. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/s3/s3oper.py +0 -0
  31. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/sagemaker/__init__.py +0 -0
  32. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/sagemaker/saoper.py +0 -0
  33. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/__init__.py +0 -0
  34. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/base_tga.py +0 -0
  35. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_ltv.py +0 -0
  36. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_retain.py +0 -0
  37. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/predict_sql_yh.py +0 -0
  38. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/tga_test.py +0 -0
  39. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_ltv.py +0 -0
  40. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_retain.py +0 -0
  41. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/tga/train_sql_yh.py +0 -0
  42. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/thinkdata/__init__.py +0 -0
  43. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck/thinkdata/query.py +0 -0
  44. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/SOURCES.txt +0 -0
  45. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/dependency_links.txt +0 -0
  46. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/requires.txt +0 -0
  47. {tfduck-bsd-0.18.1 → tfduck-bsd-0.18.3}/tfduck_bsd.egg-info/top_level.txt +0 -0
--- tfduck-bsd-0.18.1/tfduck_bsd.egg-info/PKG-INFO
+++ tfduck-bsd-0.18.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tfduck-bsd
-Version: 0.18.1
+Version: 0.18.3
 Summary: A small example package
 Home-page: UNKNOWN
 Author: yuanxiao

--- tfduck-bsd-0.18.1/setup.py
+++ tfduck-bsd-0.18.3/setup.py
@@ -8,7 +8,7 @@ with open("README.md", "r") as fh:
 
 setuptools.setup(
     name="tfduck-bsd",
-    version="0.18.1",
+    version="0.18.3",
     author="yuanxiao",
     author_email="yuan6785@163.com",
     description="A small example package",

--- /dev/null
+++ tfduck-bsd-0.18.3/tfduck/__init__.py
@@ -0,0 +1 @@
+__version__="0.18.3"

--- tfduck-bsd-0.18.1/tfduck/tga/tga.py
+++ tfduck-bsd-0.18.3/tfduck/tga/tga.py
@@ -11,9 +11,10 @@ finally:
     BMOBJ.remove_file(local_file)
 
 Version history:
-pyhive=0.6.2
+pyhive=0.6.2
 requests=2.23.0 2.27.1
 """
+
 import requests
 import pandas
 import json
@@ -22,16 +23,18 @@ import os
 import uuid
 import urllib3
 from tfduck.common.defines import BMOBJ, Et
+
 # from django.conf import settings
 from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, FIRST_COMPLETED
 from tfduck.tga.base_tga import BaseTga
 
+
 class ThinkDataQuery(BaseTga):
     """
     @des: thinkdata openapi query base class ---- this can only be executed inside the thinkdata intranet
     """
 
-    def __init__(self, query_uri, token, hive_conn_info=['host', 0]):
+    def __init__(self, query_uri, token, hive_conn_info=["host", 0]):
         """
         @des: initialize the class
         """
@@ -52,53 +55,81 @@ class ThinkDataQuery(BaseTga):
         file_path = os.path.join(base_dir, real_name)
         return file_path
 
-    def g_to_csv_notmp(self, filepath, df, index=True, compression=None, mode='w', header=True):
+    def g_to_csv_notmp(
+        self, filepath, df, index=True, compression=None, mode="w", header=True
+    ):
         """
         @des: write a DataFrame to csv with pandas -- used for appending, so a temp file cannot be used
-        compression: compression format 'gzip', 'bz2', 'zip', 'xz'.
+        compression: compression format 'gzip', 'bz2', 'zip', 'xz'.
         """
         tmp_filepath = filepath
         if index is None:  # do not write the row index
             if compression is None:  # no compression
                 df.to_csv(tmp_filepath, index=None, mode=mode, header=header)
             else:
-                df.to_csv(tmp_filepath, index=None,
-                          compression=compression, mode=mode, header=header)
+                df.to_csv(
+                    tmp_filepath,
+                    index=None,
+                    compression=compression,
+                    mode=mode,
+                    header=header,
+                )
         else:
             if compression is None:  # no compression
                 df.to_csv(tmp_filepath, mode=mode, header=header)
             else:
-                df.to_csv(tmp_filepath, compression=compression,
-                          mode=mode, header=header)
+                df.to_csv(
+                    tmp_filepath, compression=compression, mode=mode, header=header
+                )
         return True
 
-    def get_data_csv_i(self, ctx, unique_path, sql, block_size=100000, print_size=100000, read_timeout=600, upcount=None, conn_timeout=30, tga_data_timeout=600):
+    def get_data_csv_i(
+        self,
+        ctx,
+        unique_path,
+        sql,
+        block_size=100000,
+        print_size=100000,
+        read_timeout=600,
+        upcount=None,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
         """
         @des: fetch data from the thinkdata openapi ---- streamed, to save memory --- pairs with the getquerycsv below
         """
         session = requests.session()
-        post_data = {'token': self.token, 'sql': sql, 'timeoutSeconds': tga_data_timeout}
+        post_data = {
+            "token": self.token,
+            "sql": sql,
+            "timeoutSeconds": tga_data_timeout,
+        }
         #
         unique_path = self.gen_local_unique_file()
         #
         BMOBJ.log_error("in query")
         #
-        r = session.post(self.query_uri, data=post_data, stream=True,
-                         verify=False, timeout=(conn_timeout, read_timeout))
+        r = session.post(
+            self.query_uri,
+            data=post_data,
+            stream=True,
+            verify=False,
+            timeout=(conn_timeout, read_timeout),
+        )
         datas = []
-        i = 0 # loop counter
+        i = 0  # loop counter
         icount = 0  # number of data rows
         cols = []  # header columns
         try:
-            row = ''
+            row = ""
             # iter_lines / iter_content; chunk_size is in bytes, 100 MB below
-            for row in r.iter_lines(chunk_size=1024*1024*100):
+            for row in r.iter_lines(chunk_size=1024 * 1024 * 100):
                 if not row:
                     continue
                 data = None
                 if i == 0:  # handle the header line
                     data = json.loads(row)
-                    if(data["return_code"] == 0):
+                    if data["return_code"] == 0:
                         cols = data["data"]["headers"]
                         df = pandas.DataFrame(data=[], columns=cols)  # write the header
                         self.g_to_csv_notmp(unique_path, df, index=None)
@@ -111,6 +142,7 @@ class ThinkDataQuery(BaseTga):
                         except:
                             pass
                         datas = []
+                        raise Exception("sql error")
                         break  # the query failed; no rows follow
                 else:
                     if row.strip() not in [b"", ""]:
@@ -120,8 +152,9 @@ class ThinkDataQuery(BaseTga):
                     i += 1
                     if len(datas) == block_size:  # save once per 1,000,000 rows
                         df = pandas.DataFrame(data=datas, columns=cols)  # build the block
-                        self.g_to_csv_notmp(unique_path, df, index=None,
-                                            mode='a', header=False)  # append to the file
+                        self.g_to_csv_notmp(
+                            unique_path, df, index=None, mode="a", header=False
+                        )  # append to the file
                         icount += block_size
                         datas = []
                     if i % print_size == 0:
@@ -129,8 +162,9 @@ class ThinkDataQuery(BaseTga):
                         BMOBJ.clog(ctx, i)
             BMOBJ.clog(ctx, f"total: {i}")
             if len(datas) > 0:  # save the remaining tail
                 df = pandas.DataFrame(data=datas, columns=cols)  # build the block
-                self.g_to_csv_notmp(unique_path, df, index=None,
-                                    mode='a', header=False)  # append to the file
+                self.g_to_csv_notmp(
+                    unique_path, df, index=None, mode="a", header=False
+                )  # append to the file
                 icount += len(datas)
                 datas = []
         except Exception as e:
@@ -151,7 +185,18 @@ class ThinkDataQuery(BaseTga):
                 pass
         return unique_path
 
-    def get_data_csv(self, ctx, sql, block_size=100000, print_size=100000, read_timeout=600, upcount=None, retry_count=2, conn_timeout=30, tga_data_timeout=600):
+    def get_data_csv(
+        self,
+        ctx,
+        sql,
+        block_size=100000,
+        print_size=100000,
+        read_timeout=600,
+        upcount=None,
+        retry_count=2,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
         """
         @des: fetch data from the thinkdata openapi ---- streamed, to save memory --- pairs with the getquerycsv below
         """
@@ -161,33 +206,243 @@ class ThinkDataQuery(BaseTga):
         for i in range(retry_count):
             try:
                 result = self.get_data_csv_i(
-                    ctx, unique_path, sql, block_size, print_size, read_timeout, upcount, conn_timeout, tga_data_timeout)
+                    ctx,
+                    unique_path,
+                    sql,
+                    block_size,
+                    print_size,
+                    read_timeout,
+                    upcount,
+                    conn_timeout,
+                    tga_data_timeout,
+                )
                 return result
             except Exception as e:
                 gol_e = e
                 BMOBJ.remove_file(unique_path)
                 BMOBJ.remove_folder(unique_path)
-                if str(e).find("Read timed out") != -1 or str(e).find("Connection broken") != -1:
-                    BMOBJ.clog(ctx, f'retry {i}')
+                # modified by yx 2024-05-08 --- strengthen retrying on errors ---
+                # if str(e).find("Read timed out") != -1 or str(e).find("Connection broken") != -1:
+                #     BMOBJ.clog(ctx, f'retry {i}')
+                #     continue
+                # else:
+                #     break
+                if 1:
+                    continue
+        if gol_e is not None:
+            raise gol_e
+
+    def get_data_csv_by_str_i(
+        self,
+        ctx,
+        unique_path,
+        sql,
+        block_size=100000,
+        print_size=100000,
+        read_timeout=600,
+        upcount=None,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
+        """
+        @des: fetch data from the thinkdata openapi ---- streamed, to save memory --- pairs with the getquerycsv below
+        """
+        session = requests.session()
+        post_data = {
+            "token": self.token,
+            "sql": sql,
+            "timeoutSeconds": tga_data_timeout,
+        }
+        #
+        unique_path = self.gen_local_unique_file()
+        #
+        BMOBJ.log_error("in query")
+        #
+        r = session.post(
+            self.query_uri,
+            data=post_data,
+            stream=True,
+            verify=False,
+            timeout=(conn_timeout, read_timeout),
+        )
+        datas = []
+        i = 0  # loop counter
+        icount = 0  # number of data rows
+        cols = []  # header columns
+        try:
+            row = ""
+            # iter_lines / iter_content; chunk_size is in bytes, 100 MB below
+            for row in r.iter_lines(chunk_size=1024 * 1024 * 100):
+                if not row:
                     continue
+                data = None
+                if i == 0:  # handle the header line
+                    data = json.loads(row)
+                    if data["return_code"] == 0:
+                        cols = data["data"]["headers"]
+                        df = pandas.DataFrame(
+                            data=[], columns=cols, type=object
+                        )  # write the header
+                        # work around the scientific-notation problem
+                        df = df.astype(str)
+                        df = df.astype("string")
+                        #
+                        self.g_to_csv_notmp(unique_path, df, index=None)
+                        data = None
+                    else:
+                        BMOBJ.log_error("sql error:", data)
+                        # BMOBJ.log_error(sql)
+                        try:
+                            BMOBJ.clog(ctx, "sql error:", data)
+                        except:
+                            pass
+                        datas = []
+                        raise Exception("sql error")
+                        break  # the query failed; no rows follow
                 else:
-                    break
+                    if row.strip() not in [b"", ""]:
+                        data = json.loads(row)
+                if data is not None:
+                    datas.append(data)
+                    i += 1
+                    if len(datas) == block_size:  # save once per 1,000,000 rows
+                        df = pandas.DataFrame(
+                            data=datas, columns=cols, type=object
+                        )  # build the block
+                        # work around the scientific-notation problem
+                        df = df.astype(str)
+                        df = df.astype("string")
+                        #
+                        self.g_to_csv_notmp(
+                            unique_path, df, index=None, mode="a", header=False
+                        )  # append to the file
+                        icount += block_size
+                        datas = []
+                    if i % print_size == 0:
+                        BMOBJ.clog(ctx, i)
+            BMOBJ.clog(ctx, f"total: {i}")
+            if len(datas) > 0:  # save the remaining tail
+                df = pandas.DataFrame(data=datas, columns=cols, type=object)  # build the block
+                # work around the scientific-notation problem
+                df = df.astype(str)
+                df = df.astype("string")
+                #
+                self.g_to_csv_notmp(
+                    unique_path, df, index=None, mode="a", header=False
+                )  # append to the file
+                icount += len(datas)
+                datas = []
+        except Exception as e:
+            BMOBJ.clog(ctx, "get data error", str(e), row)
+            if upcount is not None:
+                if i < upcount:  # check whether an acceptable row count was reached; otherwise re-query
+                    raise e
+            else:
+                raise e
+        finally:
+            try:
+                r.close()
+            except:
+                pass
+            try:
+                session.close()
+            except:
+                pass
+        return unique_path
+
+    def get_data_csv_by_str(
+        self,
+        ctx,
+        sql,
+        block_size=100000,
+        print_size=100000,
+        read_timeout=600,
+        upcount=None,
+        retry_count=2,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
+        """
+        @des: fetch data from the thinkdata openapi ---- streamed, to save memory --- pairs with the method above
+        """
+        # unique_path = "./test.csv"
+        unique_path = self.gen_local_unique_file()
+        gol_e = None
+        for i in range(retry_count):
+            try:
+                result = self.get_data_csv_by_str_i(
+                    ctx,
+                    unique_path,
+                    sql,
+                    block_size,
+                    print_size,
+                    read_timeout,
+                    upcount,
+                    conn_timeout,
+                    tga_data_timeout,
+                )
+                return result
+            except Exception as e:
+                gol_e = e
+                BMOBJ.remove_file(unique_path)
+                BMOBJ.remove_folder(unique_path)
+                # modified by yx 2024-05-08 --- strengthen retrying on errors ---
+                # if str(e).find("Read timed out") != -1 or str(e).find("Connection broken") != -1:
+                #     BMOBJ.clog(ctx, f'retry {i}')
+                #     continue
+                # else:
+                #     break
+                if 1:
+                    continue
         if gol_e is not None:
             raise gol_e
-
-    def get_data_raw_pyhive(self, ctx, sql, block_size=100000, fetch_size=10000, retry_count=2, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30, tga_data_timeout=600):
+
+    def get_data_raw_pyhive(
+        self,
+        ctx,
+        sql,
+        block_size=100000,
+        fetch_size=10000,
+        retry_count=2,
+        read_timeout=300,
+        upcount=None,
+        print_size=100000,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
         """
         @des: interface decorator -- changed to call get_data_csv, to avoid rewriting code everywhere
         """
-        result = self.get_data_csv(ctx, sql, block_size, print_size, read_timeout, upcount, retry_count, conn_timeout, tga_data_timeout)
+        result = self.get_data_csv(
+            ctx,
+            sql,
+            block_size,
+            print_size,
+            read_timeout,
+            upcount,
+            retry_count,
+            conn_timeout,
+            tga_data_timeout,
+        )
         return result
-
 
-    def get_data_raw_pyhive_bck(self, ctx, sql, block_size=100000, fetch_size=10000, retry_count=2, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30, tga_data_timeout=600):
+    def get_data_raw_pyhive_bck(
+        self,
+        ctx,
+        sql,
+        block_size=100000,
+        fetch_size=10000,
+        retry_count=2,
+        read_timeout=300,
+        upcount=None,
+        print_size=100000,
+        conn_timeout=30,
+        tga_data_timeout=600,
+    ):
         '''
         @des: read via a direct presto connection ----- the retrying variant ---- when the get_data_csv interface has problems, switch to this interface
         tobj = ThinkDataQuery("http://queryhost:port/querySql", "query token",
-                              ["direct presto host", direct port])
+                              ["direct presto host", direct port])
         sql = """select * from v_event_7 where "$part_date"='2022-02-24' limit 100 """
         unique_path = tobj.get_data_raw_pyhive({}, sql)
         '''
@@ -197,25 +452,46 @@ class ThinkDataQuery(BaseTga):
         for i in range(retry_count):
             try:
                 result = self.get_data_raw_pyhive_i(
-                    ctx, unique_path, sql, block_size, fetch_size, read_timeout, upcount, print_size, conn_timeout)
+                    ctx,
+                    unique_path,
+                    sql,
+                    block_size,
+                    fetch_size,
+                    read_timeout,
+                    upcount,
+                    print_size,
+                    conn_timeout,
+                )
                 return result
             except Exception as e:
                 gol_e = e
                 BMOBJ.remove_file(unique_path)
                 BMOBJ.remove_folder(unique_path)
                 if str(e).find("Read timed out") != -1:
-                    BMOBJ.clog(ctx, f'retry {i}')
+                    BMOBJ.clog(ctx, f"retry {i}")
                     continue
                 else:
                     break
         if gol_e is not None:
             raise gol_e
 
-    def get_data_raw_pyhive_i(self, ctx, unique_path, sql, block_size=100000, fetch_size=10000, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30):
-        '''
+    def get_data_raw_pyhive_i(
+        self,
+        ctx,
+        unique_path,
+        sql,
+        block_size=100000,
+        fetch_size=10000,
+        read_timeout=300,
+        upcount=None,
+        print_size=100000,
+        conn_timeout=30,
+    ):
+        """
         @des: internal call
-        '''
+        """
         from pyhive import presto
+
         #
         # unique_path = self.gen_local_unique_file()
         # unique_path = "./test.csv"
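
Unlike the openapi path, this presto wrapper keeps selective retries: only "Read timed out" errors are retried, and anything else fails fast. The pattern in isolation, as a minimal sketch around a hypothetical fetch callable:

    def fetch_with_retry(fetch, retry_count=2):
        # Retry read timeouts; re-raise everything else immediately.
        last_err = None
        for attempt in range(retry_count):
            try:
                return fetch()
            except Exception as e:
                last_err = e
                if "Read timed out" in str(e):  # transient: try again
                    continue
                break  # non-transient: give up
        if last_err is not None:
            raise last_err
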
@@ -224,20 +500,25 @@ class ThinkDataQuery(BaseTga):
         session = requests.session()
         #
         datas = []
-        i = 0 # loop counter
+        i = 0  # loop counter
         icount = 0  # number of data rows
         cols = []  # header columns
         try:
-            conn = presto.connect(host=self.hive_conn_info[0],
-                                  port=int(self.hive_conn_info[1]),
-                                  username='ta', catalog='hive',
-                                  schema='ta',
-                                  requests_session = session,
-                                  # whether stream is True or False makes no difference here: every fetchmany re-GETs the next uri through request_session
-                                  # see _fetch_more in pyhive/presto.py; each fetchmany is actually multiple fetchone calls
-                                  requests_kwargs={"timeout": (
-                                      conn_timeout, read_timeout), "stream": True, "verify": False}
-                                  )
+            conn = presto.connect(
+                host=self.hive_conn_info[0],
+                port=int(self.hive_conn_info[1]),
+                username="ta",
+                catalog="hive",
+                schema="ta",
+                requests_session=session,
+                # whether stream is True or False makes no difference here: every fetchmany re-GETs the next uri through request_session
+                # see _fetch_more in pyhive/presto.py; each fetchmany is actually multiple fetchone calls
+                requests_kwargs={
+                    "timeout": (conn_timeout, read_timeout),
+                    "stream": True,
+                    "verify": False,
+                },
+            )
             cursor = conn.cursor()
             cursor.execute(sql)
             BMOBJ.clog(ctx, "file size")
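
The connect call above relies on pyhive's requests_session / requests_kwargs passthrough so the per-request timeout, streaming, and TLS-verification flags reach the underlying HTTP calls. A condensed sketch of the same direct-presto read path, with hypothetical host, port, and output file:

    import pandas as pd
    import requests
    from pyhive import presto

    session = requests.session()
    conn = presto.connect(
        host="presto-host", port=8285,  # hypothetical coordinator address
        username="ta", catalog="hive", schema="ta",
        requests_session=session,
        requests_kwargs={"timeout": (30, 300), "stream": True, "verify": False},
    )
    cursor = conn.cursor()
    cursor.execute("""select * from v_event_7 where "$part_date"='2022-02-24' limit 100 """)
    cols = [d[0] for d in cursor.description]  # column names from the DB-API description
    pd.DataFrame(data=[], columns=cols).to_csv("out.csv", index=None)  # header once
    rows = cursor.fetchmany(10000)
    while rows:  # stream result pages into the CSV
        pd.DataFrame(rows, columns=cols).to_csv("out.csv", index=None, mode="a", header=False)
        rows = cursor.fetchmany(10000)
    conn.close()
    session.close()
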
@@ -262,6 +543,7 @@ class ThinkDataQuery(BaseTga):
             def yx_fetch_many():
                 myres = cursor.fetchmany(fetch_size)
                 return myres
+
             rows = yx_fetch_many()
             while rows:
                 for row in rows:
@@ -271,8 +553,9 @@ class ThinkDataQuery(BaseTga):
                    i += 1
                    if len(datas) == block_size:  # save once per 1,000,000 rows
                        df = pandas.DataFrame(data=datas, columns=cols)  # build the block
-                       self.g_to_csv_notmp(unique_path, df, index=None,
-                                           mode='a', header=False)  # append to the file
+                       self.g_to_csv_notmp(
+                           unique_path, df, index=None, mode="a", header=False
+                       )  # append to the file
                        icount += block_size
                        datas = []
                    if i % print_size == 0:
@@ -282,8 +565,9 @@ class ThinkDataQuery(BaseTga):
             BMOBJ.clog(ctx, f"total: {i}")
             if len(datas) > 0:  # save the remaining tail
                 df = pandas.DataFrame(data=datas, columns=cols)  # build the block
-                self.g_to_csv_notmp(unique_path, df, index=None,
-                                    mode='a', header=False)  # append to the file
+                self.g_to_csv_notmp(
+                    unique_path, df, index=None, mode="a", header=False
+                )  # append to the file
                 icount += len(datas)
                 datas = []
         except Exception as e:
@@ -298,13 +582,13 @@ class ThinkDataQuery(BaseTga):
             try:
                 conn.close()
             except:
-                pass
+                pass
             try:
                 session.close()
             except:
-                pass
+                pass
         return unique_path
-
+
     """
     Data ingestion interfaces --start--
     """
@@ -312,7 +596,7 @@ class ThinkDataQuery(BaseTga):
     def set_tga_user_data(self, tga_app_no, sec_token, url, data, is_set_once=False):
         """
         @des: push user data into tga
-        @params: 
+        @params:
         tga_app_no: id of the tga project; note this is not the app_id
         sec_token: token of the secured secondary-deployment server
         url: ingestion url of the secondary-deployment server
@@ -324,7 +608,7 @@ class ThinkDataQuery(BaseTga):
     def set_tga_event_data_trac(self, tga_app_no, sec_token, url, data):
         """
         @des: push ordinary event data into tga
-        @params: 
+        @params:
         tga_app_no: id of the tga project; note this is not the app_id
         sec_token: token of the secured secondary-deployment server
         url: ingestion url of the secondary-deployment server
@@ -336,7 +620,7 @@ class ThinkDataQuery(BaseTga):
     def set_tga_event_data_trac_update(self, tga_app_no, sec_token, url, data):
         """
         @des: push updatable event data into tga, rewriting part of the data
-        @params: 
+        @params:
         tga_app_no: id of the tga project; note this is not the app_id
         sec_token: token of the secured secondary-deployment server
         url: ingestion url of the secondary-deployment server
@@ -345,10 +629,10 @@ class ThinkDataQuery(BaseTga):
         """
         pass
 
-    def set_tga_event_data_trac_overwrite(self, tga_app_no, sec_token, url, data):
+    def set_tga_event_data_trac_overwrite(self, tga_app_no, sec_token, url, data):
         """
         @des: push updatable event data into tga, rewriting all of the data
-        @params: 
+        @params:
         tga_app_no: id of the tga project; note this is not the app_id
         sec_token: token of the secured secondary-deployment server
         url: ingestion url of the secondary-deployment server
@@ -359,4 +643,4 @@ class ThinkDataQuery(BaseTga):
 
     """
     Data ingestion interfaces ----end-----
-    """
+    """

--- tfduck-bsd-0.18.1/PKG-INFO
+++ tfduck-bsd-0.18.3/tfduck_bsd.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tfduck-bsd
-Version: 0.18.1
+Version: 0.18.3
 Summary: A small example package
 Home-page: UNKNOWN
 Author: yuanxiao

--- tfduck-bsd-0.18.1/tfduck/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__="0.18.1"