tfduck-bsd 0.17.9__tar.gz → 0.18.1__tar.gz
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Potentially problematic release: this version of tfduck-bsd may be problematic.
- {tfduck-bsd-0.17.9/tfduck_bsd.egg-info → tfduck-bsd-0.18.1}/PKG-INFO +1 -1
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/setup.py +1 -1
- tfduck-bsd-0.18.1/tfduck/__init__.py +1 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/tga.py +69 -12
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/train_sql_ltv.py +1 -1
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/train_sql_retain.py +1 -1
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1/tfduck_bsd.egg-info}/PKG-INFO +1 -1
- tfduck-bsd-0.17.9/tfduck/__init__.py +0 -1
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/LICENSE +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/README.md +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/bin/tfduck +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/setup.cfg +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/bdp_sdk_py/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/bdp_sdk_py/config/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/bdp_sdk_py/config/bdpmanager.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/bdp_sdk_py/config/table_config.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/bdp_sdk_py/example.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/bdp_sdk_py/opends/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/bdp_sdk_py/opends/opends.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/bdp_sdk_py/opends/sdk.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/common/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/common/defines.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/common/defines_clean.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/common/extendEncoder.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/main.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/oss/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/oss/oss.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/pyspark_k8s/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/pyspark_k8s/k8s_manage.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/pyspark_k8s/spark_manage.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/s3/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/s3/s3oper.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/sagemaker/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/sagemaker/saoper.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/base_tga.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/predict_sql_ltv.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/predict_sql_retain.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/predict_sql_yh.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/tga_test.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/tga/train_sql_yh.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/thinkdata/__init__.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck/thinkdata/query.py +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck_bsd.egg-info/SOURCES.txt +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck_bsd.egg-info/dependency_links.txt +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck_bsd.egg-info/requires.txt +0 -0
- {tfduck-bsd-0.17.9 → tfduck-bsd-0.18.1}/tfduck_bsd.egg-info/top_level.txt +0 -0
tfduck-bsd-0.18.1/tfduck/__init__.py
@@ -0,0 +1 @@
+__version__="0.18.1"
tfduck/tga/tga.py
@@ -72,24 +72,25 @@ class ThinkDataQuery(BaseTga):
                           mode=mode, header=header)
         return True
 
-    def get_data_csv_i(self, ctx, unique_path, sql, block_size=100000, print_size=100000, read_timeout=600, upcount=None):
+    def get_data_csv_i(self, ctx, unique_path, sql, block_size=100000, print_size=100000, read_timeout=600, upcount=None, conn_timeout=30, tga_data_timeout=600):
         """
         @des: Fetch data from the ThinkData OpenAPI -- streamed, to save memory -- used together with getquerycsv below
         """
         session = requests.session()
-        post_data = {'token': self.token, 'sql': sql}
+        post_data = {'token': self.token, 'sql': sql, 'timeoutSeconds': tga_data_timeout}
         #
         unique_path = self.gen_local_unique_file()
         #
         BMOBJ.log_error("in query")
         #
         r = session.post(self.query_uri, data=post_data, stream=True,
-                         verify=False, timeout=(
+                         verify=False, timeout=(conn_timeout, read_timeout))
         datas = []
         i = 0  # loop counter
         icount = 0  # number of data rows
         cols = []  # header columns
         try:
+            row = ''
             # iter_lines / iter_content; chunk_size is in bytes, 100 MB here
             for row in r.iter_lines(chunk_size=1024*1024*100):
                 if not row:
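The two changes above add a server-side query limit ('timeoutSeconds' in the POST body) and complete the client-side timeout as a (connect, read) tuple, which is how requests interprets a two-element timeout. A minimal sketch of the streamed-download pattern, with a placeholder URL and token:

import requests

session = requests.session()
post_data = {'token': 'demo-token', 'sql': 'select 1',
             'timeoutSeconds': 600}               # server-side limit (tga_data_timeout)
r = session.post('https://queryhost.example/querySql', data=post_data,
                 stream=True, timeout=(30, 600))  # (conn_timeout, read_timeout)
for row in r.iter_lines(chunk_size=1024 * 1024):  # bytes per underlying read
    if not row:
        continue
    print(row[:80])                               # process one line of the result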
@@ -133,7 +134,7 @@ class ThinkDataQuery(BaseTga):
                     icount += len(datas)
                     datas = []
         except Exception as e:
-            BMOBJ.clog(ctx, "get data error", str(e))
+            BMOBJ.clog(ctx, "get data error", str(e), row)
             if upcount is not None:
                 if i < upcount:  # check whether an acceptable row count was reached; otherwise re-query
                     raise e
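The extra row argument in the log call relies on the row = '' pre-binding added in the first hunk: if the request fails before iter_lines yields anything, the handler would otherwise hit a NameError instead of logging. The pattern, reduced to a sketch:

last = ''                     # pre-bind the loop variable for the handler
try:
    for last in ('a', 'b', 'c'):
        raise RuntimeError('boom')
except Exception as e:
    print('error:', e, 'last item seen:', repr(last))  # 'a', never a NameError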
@@ -150,7 +151,7 @@ class ThinkDataQuery(BaseTga):
             pass
         return unique_path
 
-    def get_data_csv(self, ctx, sql, block_size=100000, print_size=100000, read_timeout=600, upcount=None, retry_count=2):
+    def get_data_csv(self, ctx, sql, block_size=100000, print_size=100000, read_timeout=600, upcount=None, retry_count=2, conn_timeout=30, tga_data_timeout=600):
         """
         @des: Fetch data from the ThinkData OpenAPI -- streamed, to save memory -- used together with getquerycsv below
         """
@@ -160,7 +161,7 @@ class ThinkDataQuery(BaseTga):
         for i in range(retry_count):
             try:
                 result = self.get_data_csv_i(
-                    ctx, unique_path, sql, block_size, print_size, read_timeout, upcount)
+                    ctx, unique_path, sql, block_size, print_size, read_timeout, upcount, conn_timeout, tga_data_timeout)
                 return result
             except Exception as e:
                 gol_e = e
@@ -174,15 +175,15 @@ class ThinkDataQuery(BaseTga):
|
|
|
174
175
|
if gol_e is not None:
|
|
175
176
|
raise gol_e
|
|
176
177
|
|
|
177
|
-
def get_data_raw_pyhive(self, ctx, sql, block_size=100000, fetch_size=10000, retry_count=2, read_timeout=300, upcount=None, print_size=100000):
|
|
178
|
+
def get_data_raw_pyhive(self, ctx, sql, block_size=100000, fetch_size=10000, retry_count=2, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30, tga_data_timeout=600):
|
|
178
179
|
"""
|
|
179
180
|
@des: 接口装饰器--修改为get_data_csv,防止全面修改代码
|
|
180
181
|
"""
|
|
181
|
-
result = self.get_data_csv(ctx, sql, block_size, print_size, read_timeout, upcount, retry_count)
|
|
182
|
+
result = self.get_data_csv(ctx, sql, block_size, print_size, read_timeout, upcount, retry_count, conn_timeout, tga_data_timeout)
|
|
182
183
|
return result
|
|
183
184
|
|
|
184
185
|
|
|
185
|
-
def get_data_raw_pyhive_bck(self, ctx, sql, block_size=100000, fetch_size=10000, retry_count=2, read_timeout=300, upcount=None, print_size=100000):
|
|
186
|
+
def get_data_raw_pyhive_bck(self, ctx, sql, block_size=100000, fetch_size=10000, retry_count=2, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30, tga_data_timeout=600):
|
|
186
187
|
'''
|
|
187
188
|
@des:presto直连方式读取-----重试的方式----当get_data_csv接口出问题,则启用这个接口
|
|
188
189
|
tobj = ThinkDataQuery("http://queryhost:port/querySql", "查询token",
|
|
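All of the new keyword arguments thread from the public wrappers down to get_data_csv_i. A hedged usage sketch -- the constructor call follows the truncated docstring fragment above (further constructor arguments are elided in this diff), and the import path, host, token, SQL, and ctx values are placeholders:

from tfduck.tga.tga import ThinkDataQuery  # assumed module path, per the file list

ctx = None  # task-context object passed through to BMOBJ.clog; placeholder here
tobj = ThinkDataQuery("http://queryhost:port/querySql", "query-token")
csv_path = tobj.get_data_raw_pyhive(
    ctx, "select * from some_table limit 10",
    read_timeout=900,       # per-read wait on the streamed response
    conn_timeout=10,        # TCP connect timeout
    tga_data_timeout=900,   # 'timeoutSeconds' sent to the server
)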
@@ -196,7 +197,7 @@ class ThinkDataQuery(BaseTga):
         for i in range(retry_count):
             try:
                 result = self.get_data_raw_pyhive_i(
-                    ctx, unique_path, sql, block_size, fetch_size, read_timeout, upcount, print_size)
+                    ctx, unique_path, sql, block_size, fetch_size, read_timeout, upcount, print_size, conn_timeout)
                 return result
             except Exception as e:
                 gol_e = e
@@ -210,7 +211,7 @@ class ThinkDataQuery(BaseTga):
         if gol_e is not None:
             raise gol_e
 
-    def get_data_raw_pyhive_i(self, ctx, unique_path, sql, block_size=100000, fetch_size=10000, read_timeout=300, upcount=None, print_size=100000):
+    def get_data_raw_pyhive_i(self, ctx, unique_path, sql, block_size=100000, fetch_size=10000, read_timeout=300, upcount=None, print_size=100000, conn_timeout=30):
         '''
         @des: Internal helper
         '''
@@ -235,7 +236,7 @@ class ThinkDataQuery(BaseTga):
             # stream=True or False makes no difference here: every fetchmany passes next_uri through the request session and re-GETs the data
             # see _fetch_more in pyhive/presto.py; each fetchmany is actually multiple fetchone calls
             requests_kwargs={"timeout": (
-
+                conn_timeout, read_timeout), "stream": True, "verify": False}
         )
         cursor = conn.cursor()
         cursor.execute(sql)
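pyhive's Presto cursor forwards requests_kwargs to the HTTP calls it makes while paging results, so the completed tuple above now bounds every next_uri fetch as well. A minimal connection sketch with placeholder host details (the surrounding connect() arguments in tga.py are not shown in this diff):

from pyhive import presto

conn = presto.connect(
    host='queryhost', port=8080, username='tga',
    requests_kwargs={'timeout': (30, 300),   # (conn_timeout, read_timeout)
                     'stream': True, 'verify': False},
)
cursor = conn.cursor()
cursor.execute('select 1')
print(cursor.fetchmany(10000))               # each batch may re-GET the next_uri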
@@ -303,3 +304,59 @@ class ThinkDataQuery(BaseTga):
         except:
             pass
         return unique_path
+
+    """
+    Data ingestion interface -- start --
+    """
+
+    def set_tga_user_data(self, tga_app_no, sec_token, url, data, is_set_once=False):
+        """
+        @des: Push user data into TGA
+        @params:
+            tga_app_no: the TGA project id; note this is not the app_id
+            sec_token: token of the secured second-deployment server
+            url: ingestion URL on the second-deployment server
+            data: [['TGA user distinct_id, pass None if empty', 'TGA user account_id, pass None if empty', 'user properties to push, as a dict'], ...]
+                e.g. [['eqw31231231', 'fads21321312312', {'a': 1, 'b': 2}], ...]
+        """
+        pass
+
+    def set_tga_event_data_trac(self, tga_app_no, sec_token, url, data):
+        """
+        @des: Push ordinary event data into TGA
+        @params:
+            tga_app_no: the TGA project id; note this is not the app_id
+            sec_token: token of the secured second-deployment server
+            url: ingestion URL on the second-deployment server
+            data: [['event name', 'TGA user distinct_id, pass None if empty', 'TGA user account_id, pass None if empty', 'event properties to push, as a dict'], ...]
+                e.g. [['new_session', 'eqw31231231', 'fads21321312312', {'a': 1, 'b': 2}], ...]
+        """
+        pass
+
+    def set_tga_event_data_trac_update(self, tga_app_no, sec_token, url, data):
+        """
+        @des: Push updatable event data into TGA; rewrites part of the record
+        @params:
+            tga_app_no: the TGA project id; note this is not the app_id
+            sec_token: token of the secured second-deployment server
+            url: ingestion URL on the second-deployment server
+            data: [['event name', 'unique event id', 'TGA user distinct_id, pass None if empty', 'TGA user account_id, pass None if empty', 'event properties to push, as a dict'], ...]
+                e.g. [['new_session', 'event_id_123', 'eqw31231231', 'fads21321312312', {'a': 1, 'b': 2}], ...]
+        """
+        pass
+
+    def set_tga_event_data_trac_overwrite(self, tga_app_no, sec_token, url, data):
+        """
+        @des: Push updatable event data into TGA; rewrites the whole record
+        @params:
+            tga_app_no: the TGA project id; note this is not the app_id
+            sec_token: token of the secured second-deployment server
+            url: ingestion URL on the second-deployment server
+            data: [['event name', 'unique event id', 'TGA user distinct_id, pass None if empty', 'TGA user account_id, pass None if empty', 'event properties to push, as a dict'], ...]
+                e.g. [['new_session', 'event_id_123', 'eqw31231231', 'fads21321312312', {'a': 1, 'b': 2}], ...]
+        """
+        pass
+
+    """
+    Data ingestion interface -- end --
+    """
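The four ingestion methods above are declared but empty in this release (each body is just pass); only the payload shape in their docstrings is fixed. A sketch of arguments built to that shape, continuing the tobj from the earlier sketch, with every id and URL hypothetical:

user_rows = [
    # [distinct_id or None, account_id or None, user-property dict]
    ['eqw31231231', 'fads21321312312', {'a': 1, 'b': 2}],
    [None, 'acct-42', {'level': 7}],
]
event_rows = [
    # [event name, distinct_id or None, account_id or None, property dict]
    ['new_session', 'eqw31231231', 'fads21321312312', {'a': 1, 'b': 2}],
]
tobj.set_tga_user_data('tga-app-no', 'sec-token', 'https://ingest.example/push',
                       user_rows, is_set_once=False)
tobj.set_tga_event_data_trac('tga-app-no', 'sec-token', 'https://ingest.example/push',
                             event_rows)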
tfduck/tga/train_sql_ltv.py (the file list shows the same one-line change in train_sql_retain.py)
@@ -335,7 +335,7 @@ class TrainFeatureSql(BaseTga):
             -- use user_user_id directly as the sort value; the sort value must be unique, otherwise the join below will not match up and many null rows are produced
             -- user_user_id as tt_stable_rand -- safest, but cannot be shuffled, so the sample is not randomly distributed across each day
             -- bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand -- may produce some left-null rows, but only a few; filter them out and the result is unaffected, and it supports shuffled sampling
-            a3.user_distinct_id as tt_stable_rand --
+            a3.user_distinct_id as tt_stable_rand -- safest: unique even when shuffled, so even if the feature values are taken in several LIMIT chunks, those chunks still match up
 
             FROM
             (
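The restored SQL comment explains the choice of sort key: when feature rows are pulled in several ORDER BY ... LIMIT chunks, only a unique key guarantees every query sees the same total order. A small Python illustration of how ties break that guarantee, with made-up rows:

rows = [('u1', 0), ('u2', 0), ('u3', 0)]             # all tied on the sort key
chunk1 = sorted(rows, key=lambda r: r[1])[:2]        # first query: "LIMIT 2"
chunk2 = sorted(rows[::-1], key=lambda r: r[1])[2:]  # second query sees ties in another order
print(chunk1 + chunk2)  # [('u1', 0), ('u2', 0), ('u1', 0)] -- u1 twice, u3 lost
# A unique key such as user_distinct_id makes every ordering identical,
# so separately taken chunks always line up.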
tfduck-bsd-0.17.9/tfduck/__init__.py
@@ -1 +0,0 @@
-__version__="0.17.9"