tfduck-bsd 0.16.1__tar.gz → 0.16.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tfduck-bsd might be problematic. Click here for more details.
- {tfduck-bsd-0.16.1/tfduck_bsd.egg-info → tfduck-bsd-0.16.2}/PKG-INFO +1 -1
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/setup.py +1 -1
- tfduck-bsd-0.16.2/tfduck/__init__.py +1 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/pyspark_k8s/spark_manage.py +2 -2
- tfduck-bsd-0.16.2/tfduck/tga/predict_sql_ltv.py +90 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/predict_sql_retain.py +1 -1
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/predict_sql_yh.py +1 -1
- tfduck-bsd-0.16.2/tfduck/tga/tga_test.py +88 -0
- tfduck-bsd-0.16.2/tfduck/tga/train_sql_ltv.py +603 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/train_sql_retain.py +1 -1
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/train_sql_yh.py +2 -2
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2/tfduck_bsd.egg-info}/PKG-INFO +1 -1
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/SOURCES.txt +3 -0
- tfduck-bsd-0.16.1/tfduck/__init__.py +0 -1
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/LICENSE +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/README.md +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/bin/tfduck +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/setup.cfg +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/config/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/config/bdpmanager.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/config/table_config.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/example.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/opends/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/opends/opends.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/opends/sdk.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/common/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/common/defines.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/common/extendEncoder.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/main.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/oss/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/oss/oss.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/pyspark_k8s/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/pyspark_k8s/k8s_manage.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/s3/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/s3/s3oper.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/sagemaker/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/sagemaker/saoper.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/base_tga.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/tga.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/thinkdata/__init__.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/thinkdata/query.py +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/dependency_links.txt +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/requires.txt +0 -0
- {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__="0.16.2"
|
|
@@ -364,7 +364,7 @@ if __name__ == "__main__":
|
|
|
364
364
|
# # "容器内path,没有被污染也可以不设置"
|
|
365
365
|
# os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python342342342342"
|
|
366
366
|
|
|
367
|
-
if
|
|
367
|
+
if 1: # 同步--天天--测试上传代码路径
|
|
368
368
|
sm = SparkManage(
|
|
369
369
|
ctx={},
|
|
370
370
|
# code_path='/Users/yuanxiao/workspace/djcelery44/djcelery44/scripts/tools/debug/test_presto_s3',
|
|
@@ -380,7 +380,7 @@ if __name__ == "__main__":
|
|
|
380
380
|
)
|
|
381
381
|
sm.upload_code()
|
|
382
382
|
sm.submit_spark_task()
|
|
383
|
-
if
|
|
383
|
+
if 0: # 同步--天天---测试直接上传代码内容
|
|
384
384
|
with open('/Users/yuanxiao/workspace/djcelery44/djcelery44/scripts/tools/debug/test_hello/main_script.py', 'r') as f:
|
|
385
385
|
code_content = f.read()
|
|
386
386
|
#
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
'''
|
|
2
|
+
@des: tga获取批量转换特征值数据拉取的sql基类
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
准备工作:
|
|
6
|
+
--------------------------------------
|
|
7
|
+
gconf的属性有这些:
|
|
8
|
+
########
|
|
9
|
+
part_date_start # 特征值取新增用户的开始日期, 例如"2022-05-20", 字符串类型
|
|
10
|
+
part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
|
|
11
|
+
level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
|
|
12
|
+
tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
|
|
13
|
+
tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
|
|
14
|
+
########
|
|
15
|
+
|
|
16
|
+
如何使用(需要自己定义一个子类继承这个类, 代码如下):
|
|
17
|
+
#######
|
|
18
|
+
from magic_number_train_tf_2.src.common.config import Config as GAConfig # 这里换成你自己的项目
|
|
19
|
+
from tfduck.tga.predict_sql_ltv import PredictFeatureSql
|
|
20
|
+
|
|
21
|
+
def call_method(self, ctx=None, **kwargs):
|
|
22
|
+
"""
|
|
23
|
+
@des: 每个具体项目的训练特征数据拉取
|
|
24
|
+
"""
|
|
25
|
+
gconf = GAConfig.getInstance() # 保证gconf有上面准备工作的属性
|
|
26
|
+
"""
|
|
27
|
+
设置项目属性-----下面的都需要自己配置------
|
|
28
|
+
"""
|
|
29
|
+
# 需要的事件
|
|
30
|
+
need_events = (
|
|
31
|
+
'new_device', 'new_session', 'act_level_path'
|
|
32
|
+
)
|
|
33
|
+
# 需要的属性
|
|
34
|
+
need_event_attrs = (
|
|
35
|
+
"#lib", "#country_code", "$part_event", "#user_id", "sdk_session_time",
|
|
36
|
+
"object_type", "act", "object_number", "act_object",
|
|
37
|
+
)
|
|
38
|
+
# 特征值名称(不要用字母a作为key)---
|
|
39
|
+
feature_names = {
|
|
40
|
+
'b': '常规关卡通关',
|
|
41
|
+
'c': '冒险关卡通关'
|
|
42
|
+
}
|
|
43
|
+
# 特征值sql---模板保持不变,变里面的内容即可---
|
|
44
|
+
sub_sql_fs = {
|
|
45
|
+
'b': """
|
|
46
|
+
--%(real_des)s
|
|
47
|
+
sum(
|
|
48
|
+
if(
|
|
49
|
+
"$part_event"='act_level_path' and object_type='normal' and act='win' and rt_dur<%(level_dur)s,
|
|
50
|
+
1,
|
|
51
|
+
0)
|
|
52
|
+
) as %(sub_field_name)s
|
|
53
|
+
""",
|
|
54
|
+
'c': """
|
|
55
|
+
--%(real_des)s
|
|
56
|
+
sum(
|
|
57
|
+
if(
|
|
58
|
+
"$part_event"='act_level_path' and object_type='adventure' and act='win' and rt_dur<%(level_dur)s,
|
|
59
|
+
1,
|
|
60
|
+
0)
|
|
61
|
+
) as %(sub_field_name)s
|
|
62
|
+
"""
|
|
63
|
+
}
|
|
64
|
+
# 额外的用户属性比如(可选参数)
|
|
65
|
+
addon_attrs = ["#screen_width", "#screen_height"]
|
|
66
|
+
"""
|
|
67
|
+
创建调用实例---最好通过dict创建实例
|
|
68
|
+
"""
|
|
69
|
+
pf_sql_obj = PredictFeatureSql(
|
|
70
|
+
ctx = ctx,
|
|
71
|
+
gconf = gconf,
|
|
72
|
+
need_events = need_events,
|
|
73
|
+
need_event_attrs = need_event_attrs,
|
|
74
|
+
feature_names = feature_names,
|
|
75
|
+
sub_sql_fs = sub_sql_fs,
|
|
76
|
+
addon_attrs = addon_attrs # 可选参数
|
|
77
|
+
)
|
|
78
|
+
sql = pf_sql_obj.get_sql() # 这个sql就是拉取特征值的sql
|
|
79
|
+
#######
|
|
80
|
+
'''
|
|
81
|
+
from tfduck.common.defines import BMOBJ, Et
|
|
82
|
+
from tfduck.tga.predict_sql_retain import PredictFeatureSql as BasePredictFeatureSql
|
|
83
|
+
import arrow
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class PredictFeatureSql(BasePredictFeatureSql):
    """
    @des: Base class of the SQL template used for batch-transform
          (prediction) feature extraction in the LTV flavour.

    The class body is empty: all behaviour is inherited unchanged from
    ``PredictFeatureSql`` in ``tfduck.tga.predict_sql_retain``.
    """
    pass
|
|
@@ -10,7 +10,7 @@ gconf的属性有这些:
|
|
|
10
10
|
part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
|
|
11
11
|
level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
|
|
12
12
|
tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
|
|
13
|
-
tga_event_table # tga
|
|
13
|
+
tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
|
|
14
14
|
########
|
|
15
15
|
|
|
16
16
|
如何使用(需要自己定义一个子类继承这个类, 代码如下):
|
|
@@ -10,7 +10,7 @@ gconf的属性有这些:
|
|
|
10
10
|
part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
|
|
11
11
|
level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
|
|
12
12
|
tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
|
|
13
|
-
tga_event_table # tga
|
|
13
|
+
tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
|
|
14
14
|
########
|
|
15
15
|
|
|
16
16
|
如何使用(需要自己定义一个子类继承这个类, 代码如下):
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from train_sql_ltv import TrainFeatureSql
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test():
    """
    @des: Build a sample LTV training query with TrainFeatureSql and return it.

    The generated SQL is also copied to the system clipboard when the
    optional ``pyperclip`` package is installed; when it is missing the
    SQL is still returned instead of the whole call failing.

    Returns:
        str: the SQL text produced by ``TrainFeatureSql.get_sql()``.
    """

    class GConf(object):
        """Minimal stand-in for the project configuration object."""

        def __init__(self):
            self.part_date_start = "2023-01-15"
            self.part_date_end = "2023-01-17"
            # Label: all LTV accumulated from day 0 to day 14 after registration.
            self.label_durs = [int(86400*0), int(86400*14)]
            # Feature windows in seconds, ascending. Cast to int because the
            # module documentation specifies an integer array.
            self.level_durs = [int(86400*0.25), int(86400*0.5),
                               86400, 86400*2, 86400*3]
            self.tga_user_table = "v_user_30"
            self.tga_event_table = "v_event_30"

    gconf = GConf()

    need_events = (
        'new_device',
        'new_session',
        'g_push',
        'sdk_close_session'
    )
    need_event_attrs = (
        "#lib",
        "pn",
        "#device_model",
        "#screen_width",
        "#screen_height",
        "#country_code",
        "$part_event",
        "#user_id"
    )
    feature_names = {
        'b': '打开游戏次数',
        'c': '本地推送打开游戏次数'
    }
    addon_attrs = [
        "#device_model",
        "#screen_width",
        "#screen_height"
    ]
    sub_sql_fs = {
        'b':
        # number of game launches
        """
            --%(real_des)s
            sum(
                if(
                    "$part_event"='new_session' and rt_dur<%(level_dur)s,
                    1,
                    0)
            ) as %(sub_field_name)s
        """, 'c':
        # number of game launches triggered by a local push
        """
            --%(real_des)s
            sum(
                if(
                    "$part_event"='g_push' and rt_dur<%(level_dur)s,
                    1,
                    0)
            ) as %(sub_field_name)s
        """
    }
    user_cols2 = """ "#lib"='Android' """
    tf_sql_obj = TrainFeatureSql(
        ctx={},
        gconf=gconf,
        need_events=need_events,
        need_event_attrs=need_event_attrs,
        feature_names=feature_names,
        sub_sql_fs=sub_sql_fs,
        user_cols2=user_cols2,
        addon_attrs=addon_attrs,  # optional
        line_value_rd=0.5,  # optional
        mode="iaa+iap"
    )
    sql = tf_sql_obj.get_sql()  # the SQL that pulls the feature values

    # Copying to the clipboard is a convenience only: pyperclip is not a
    # declared dependency, so its absence must not prevent returning the SQL.
    try:
        import pyperclip
    except ImportError:
        pyperclip = None
    if pyperclip is not None:
        pyperclip.copy(sql)
    return sql
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
if __name__ == "__main__":
|
|
88
|
+
test()
|
|
@@ -0,0 +1,603 @@
|
|
|
1
|
+
'''
|
|
2
|
+
@des: tga获取训练特征值数据拉取的sql基类
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
准备工作:
|
|
6
|
+
--------------------------------------
|
|
7
|
+
gconf的属性有这些:
|
|
8
|
+
########
|
|
9
|
+
part_date_start # 特征值取新增用户的开始日期, 例如"2022-05-20", 字符串类型
|
|
10
|
+
part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
|
|
11
|
+
level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
|
|
12
|
+
label_durs # 取多少边界算ltv标签值,例如[int(86400*14), int(86400*17)],整型数组(单位秒);这里是把注册后第14到17天之间(左闭右开)产生的收入累加作为ltv标签值
|
|
13
|
+
tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
|
|
14
|
+
tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
|
|
15
|
+
########
|
|
16
|
+
|
|
17
|
+
如何使用(需要自己定义一个子类继承这个类, 代码如下):
|
|
18
|
+
#######
|
|
19
|
+
from magic_number_train_tf_2.src.common.config import Config as GAConfig # 这里换成你自己的项目
|
|
20
|
+
from tfduck.tga.train_sql_ltv import TrainFeatureSql
|
|
21
|
+
|
|
22
|
+
def call_method(self, ctx=None, **kwargs):
|
|
23
|
+
"""
|
|
24
|
+
@des: 每个具体项目的训练特征数据拉取
|
|
25
|
+
"""
|
|
26
|
+
gconf = GAConfig.getInstance() # 保证gconf有上面准备工作的属性
|
|
27
|
+
"""
|
|
28
|
+
设置项目属性-----下面的都需要自己配置------
|
|
29
|
+
"""
|
|
30
|
+
# 需要的事件
|
|
31
|
+
need_events = (
|
|
32
|
+
'new_device', 'new_session', 'act_level_path'
|
|
33
|
+
)
|
|
34
|
+
# 需要的属性
|
|
35
|
+
need_event_attrs = (
|
|
36
|
+
"#lib", "#country_code", "$part_event", "#user_id", "sdk_session_time",
|
|
37
|
+
"object_type", "act", "object_number", "act_object",
|
|
38
|
+
)
|
|
39
|
+
# 特征值名称--不要用字母a作为key---
|
|
40
|
+
feature_names = {
|
|
41
|
+
'b': '常规关卡通关',
|
|
42
|
+
'c': '冒险关卡通关'
|
|
43
|
+
}
|
|
44
|
+
# 特征值sql---模板保持不变,变里面的内容即可---
|
|
45
|
+
sub_sql_fs = {
|
|
46
|
+
'b': """
|
|
47
|
+
--%(real_des)s
|
|
48
|
+
sum(
|
|
49
|
+
if(
|
|
50
|
+
"$part_event"='act_level_path' and object_type='normal' and act='win' and rt_dur<%(level_dur)s,
|
|
51
|
+
1,
|
|
52
|
+
0)
|
|
53
|
+
) as %(sub_field_name)s
|
|
54
|
+
""",
|
|
55
|
+
'c': """
|
|
56
|
+
--%(real_des)s
|
|
57
|
+
sum(
|
|
58
|
+
if(
|
|
59
|
+
"$part_event"='act_level_path' and object_type='adventure' and act='win' and rt_dur<%(level_dur)s,
|
|
60
|
+
1,
|
|
61
|
+
0)
|
|
62
|
+
) as %(sub_field_name)s
|
|
63
|
+
"""
|
|
64
|
+
}
|
|
65
|
+
# 额外的用户属性比如(可选参数)
|
|
66
|
+
addon_attrs = ["#screen_width", "#screen_height"]
|
|
67
|
+
# line_value_rd ltv的线性值大于多少算正样本,小于多少算负样本,这样可以控制正负样本比例
|
|
68
|
+
line_value_rd = 0.5
|
|
69
|
+
# mode 标签值iaa(只计算iaa), iap(只计算iap), iaa+iap(计算iaa+iap)
|
|
70
|
+
mode = 'iaa+iap'
|
|
71
|
+
"""
|
|
72
|
+
创建调用实例---最好通过dict创建实例
|
|
73
|
+
注意: 针对一个tga项目属于多个项目的打点情况,例如3dfish, 可以适时启用user_cols1,user_cols2或者event_cols1
|
|
74
|
+
a. user_cols1 参数为用户过滤条件(user表)
|
|
75
|
+
如果有的话, 例如: user_cols1 = """ "afrawip__meida_source"='FaceBook' OR "afrawip__app_id"='123456' """
|
|
76
|
+
b. user_cols2 参数为用户过滤条件(event表的new_device事件)
|
|
77
|
+
如果有的话, 例如: user_cols2 = """ "#lib"='Android' OR "#lib"='iOS' """
|
|
78
|
+
c. event_cols1 参数为事件过滤条件(event表的所有计算特征值的事件)
|
|
79
|
+
如果有的话, 例如: event_cols1 = """ "#lib"='Android' OR "#lib"='iOS' """
|
|
80
|
+
"""
|
|
81
|
+
tf_sql_obj = TrainFeatureSql(
|
|
82
|
+
ctx = ctx,
|
|
83
|
+
gconf = gconf,
|
|
84
|
+
need_events = need_events,
|
|
85
|
+
need_event_attrs = need_event_attrs,
|
|
86
|
+
feature_names = feature_names,
|
|
87
|
+
sub_sql_fs = sub_sql_fs,
|
|
88
|
+
addon_attrs = addon_attrs, # 可选参数
line_value_rd = line_value_rd, # 可选参数
mode = mode # 可选参数
|
|
89
|
+
)
|
|
90
|
+
sql = tf_sql_obj.get_sql() # 这个sql就是拉取特征值的sql
|
|
91
|
+
|
|
92
|
+
#######
|
|
93
|
+
'''
|
|
94
|
+
from tfduck.tga.base_tga import BaseTga
|
|
95
|
+
from tfduck.common.defines import BMOBJ, Et
|
|
96
|
+
import arrow
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class TrainFeatureSql(BaseTga):
|
|
100
|
+
"""
|
|
101
|
+
@des:sql训练模板基类
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
def __init__(self,
             ctx=None,
             gconf=None,
             need_events=None,
             need_event_attrs=None,
             feature_names=None,
             sub_sql_fs=None,
             user_cols1='1=1',
             user_cols2='1=1',
             event_cols1='1=1',
             addon_attrs=None,
             line_value_rd=0.5,
             mode='iaa+iap',
             **kwargs):
    """
    @des: Store the configuration used to assemble the LTV training SQL.

    Args:
        ctx: caller context, stored as-is.
        gconf: configuration object; other methods read part_date_start,
            part_date_end, level_durs, label_durs, tga_user_table and
            tga_event_table from it.
        need_events: event names to load for feature computation.
        need_event_attrs: event columns to select.
        feature_names: mapping of feature key -> description. The key
            equal to the reserved user-table alias ('a') is not allowed.
        sub_sql_fs: mapping of feature key -> SQL template with the
            placeholders %(real_des)s, %(level_dur)s, %(sub_field_name)s.
        user_cols1: SQL filter applied to the user table.
        user_cols2: SQL filter applied to the new_device events.
        event_cols1: SQL filter applied to the feature events.
        addon_attrs: optional list of extra columns to output; anything
            that is not a list is treated as "no extra columns".
        line_value_rd: LTV threshold separating positive and negative samples.
        mode: which revenue forms the label: 'iaa', 'iap' or 'iaa+iap'.
        **kwargs: any additional attributes, set verbatim on the instance.

    Raises:
        Et: if feature_names uses the reserved key.
    """
    self.gconf = gconf
    self.ctx = ctx
    # Base attributes
    self.label_col = "is_rd"
    self.base_event_table = 'all_need_event'
    self.base_user_table = 'a'
    self.base_feature_table = 'features_t'
    # Negative:positive sample ratio. A linear label has no natural ratio,
    # so line_value_rd supplies the cut-off instead.
    self.nav_neg_multi = 2
    self.line_value_rd = line_value_rd
    self.mode = mode

    # Project attributes
    self.need_events = need_events
    self.need_event_attrs = need_event_attrs
    # feature_names defaults to None; treat that as "no features" instead of
    # failing with a TypeError on the membership test below.
    user_features = dict(feature_names) if feature_names else {}
    if self.base_user_table in user_features:
        raise Et(2, f"attr name '{self.base_user_table}' cannt be used")
    # The reserved entry goes first; its SQL is fixed and never templated.
    self.feature_names = {self.base_user_table: '用户属性和标签'}
    self.feature_names.update(user_features)
    self.sub_sql_fs = sub_sql_fs if sub_sql_fs is not None else {}
    #
    self.user_cols1 = user_cols1
    self.user_cols2 = user_cols2
    self.event_cols1 = event_cols1
    # Copy so that neither a shared default nor the caller's list is aliased.
    self.addon_attrs = list(addon_attrs) if isinstance(addon_attrs, list) else []
    # Other attributes
    for k, v in kwargs.items():
        setattr(self, k, v)
|
|
152
|
+
|
|
153
|
+
def get_first_col_name(self):
    """
    @des: Return the output column name of the first real feature at the
          first duration window, e.g. 'b_0_v'.

    The reserved user-table entry is skipped explicitly rather than
    assumed to sit at index 0.

    Raises:
        Et: if no feature besides the reserved entry is configured.
    """
    for key in self.feature_names:
        if key != self.base_user_table:
            return f"{key}_0_v"
    raise Et(2, "no feature names configured")
|
|
158
|
+
|
|
159
|
+
def get_real_feature_names(self, mode='d'):
    """
    @des: Map every generated feature column to a readable name.

    Args:
        mode: 'h' labels each window in whole hours (truncated);
            any other value labels it in days, as an integer when the
            window is a whole number of days, else rounded to one decimal.

    Returns:
        dict: '<key>_<window index>_v' -> '<description>_<N>h' or
        '<description>_<N>d', for every feature except the reserved
        user-table entry.
    """
    # The suffix depends only on the window, so compute it once per window
    # instead of once per (feature, window) pair.
    suffixes = []
    for level_dur in self.gconf.level_durs:
        if mode == 'h':
            suffixes.append(f"{int(level_dur/3600)}h")
            continue
        js = level_dur/86400
        js = int(js) if js == int(js) else round(js, 1)
        suffixes.append(f"{js}d")

    real_features_names = {}
    for k, v in self.feature_names.items():
        if k == self.base_user_table:
            continue
        for i, suffix in enumerate(suffixes):
            real_features_names[f"{k}_{i}_v"] = f"{v}_{suffix}"
    return real_features_names
|
|
179
|
+
|
|
180
|
+
def get_threshold_sql(self, col_name):
    """
    @des: Render the feature SQL for one output column such as 'g_1_v'.

    The column name is parsed from the right ('<key>_<index>' with an
    optional trailing '_v'), so feature keys may themselves contain
    underscores.

    Args:
        col_name: output column name, e.g. 'g_1_v'.

    Returns:
        str: the feature template filled with real_des, level_dur and
        sub_field_name.

    Raises:
        Et: if col_name has no '<key>_<index>' shape or the key has no
            template in sub_sql_fs.
    """
    base_name = col_name[:-2] if col_name.endswith("_v") else col_name
    first_col_name, _, second_col_index = base_name.rpartition("_")
    if not first_col_name:
        raise Et(2, f"error col_name {col_name}")
    sub_field_name = "%s_%s_v" % (first_col_name, second_col_index)
    real_des = "%s %s" % (self.feature_names.get(
        first_col_name), second_col_index)
    level_dur = self.gconf.level_durs[int(second_col_index)]
    f_sub_sql = self.sub_sql_fs.get(first_col_name)
    if not f_sub_sql:
        raise Et(2, f"error first_col_name {first_col_name}")
    return f_sub_sql % {
        "real_des": real_des,
        "level_dur": level_dur,
        "sub_field_name": sub_field_name,
    }
|
|
196
|
+
|
|
197
|
+
def get_sql_config(self):
    """
    @des: Compute every date boundary used by the SQL in one place, so
          that later changes only have to be made here.

    Returns:
        dict with the keys
        days             -- whole days between start and end date, at least 1
                            (used downstream to scale the sample LIMITs),
        part_date_start  -- gconf.part_date_start, unchanged,
        part_date_end    -- gconf.part_date_end, unchanged,
        feature_date_end -- end date + int(level_durs[-1]/86400) + 2 days,
        label_date_start -- start date + int(label_durs[0]/86400) - 2 days,
        label_date_end   -- end date + int(label_durs[1]/86400) + 2 days,
        new_device_start -- start date formatted as YYYY-MM-DD,
        new_device_end   -- end date + 1 day (exclusive upper bound).

    Raises:
        Et: if part_date_end is earlier than part_date_start.
    """
    gconf = self.gconf
    #
    part_date_start = gconf.part_date_start
    part_date_end = gconf.part_date_end
    start = arrow.get(part_date_start)
    end = arrow.get(part_date_end)
    span_days = (end - start).days
    if span_days < 0:
        raise Et(
            2, f"part_date_end {part_date_end} is earlier than part_date_start {part_date_start}")
    # A single-day range (start == end) still covers one registration day;
    # without the clamp the sample LIMITs derived from `days` become 0 and
    # the query returns no rows.
    days = max(span_days, 1)
    # Events of the N days after registration form the feature pool; N follows
    # the largest feature window, plus two days of margin.
    after_feature_day = int(gconf.level_durs[-1]/86400)+2
    feature_date_end = end.shift(
        days=after_feature_day).format("YYYY-MM-DD")
    # Label window, widened by two days on each side.
    label_durs = gconf.label_durs
    label_date_start = start.shift(
        days=int(label_durs[0]/86400)-2).format("YYYY-MM-DD")
    label_date_end = end.shift(
        days=int(label_durs[1]/86400)+2).format("YYYY-MM-DD")
    #
    new_device_start = start.format("YYYY-MM-DD")
    new_device_end = end.shift(days=1).format("YYYY-MM-DD")
    comm_cc = {
        "days": days,
        'part_date_start': part_date_start,
        'part_date_end': part_date_end,
        'feature_date_end': feature_date_end,
        'label_date_start': label_date_start,
        'label_date_end': label_date_end,
        'new_device_start': new_device_start,
        'new_device_end': new_device_end
    }
    return comm_cc
|
|
231
|
+
|
|
232
|
+
def get_event_sql(self):
    """
    @des: Build the first part of the training query: the CTE chain
          new_user -> user_label_table -> nav_table / neg_table ->
          union_all -> all_need_event.

    The label (is_rd) is the revenue selected by self.mode, summed over
    events whose offset from registration lies in
    [label_durs[0], label_durs[1]) seconds.

    Differences from the previous revision:
      * user_cols2 and event_cols1 are wrapped in parentheses, so a filter
        such as ``"#lib"='Android' OR "#lib"='iOS'`` can no longer escape
        the surrounding AND-ed date and event predicates.
      * positive samples use ``is_rd > line_value_rd`` (was ``>=``), so a
        user sitting exactly on the threshold is no longer selected into
        both the positive and the negative set.
      * an unsupported mode raises Et with the accepted values instead of
        a bare ValueError from list.index.

    Returns:
        str: SQL text starting with ``with new_user as (`` and ending
        after the all_need_event CTE, without a final SELECT.

    Raises:
        Et: if self.mode is not one of 'iaa+iap', 'iap', 'iaa'.
    """
    gconf = self.gconf
    comm_cc = self.get_sql_config()

    # Label expression per mode; the text is emitted verbatim into the SQL.
    label_exprs = {
        'iaa+iap': "cast((a2_1.itemrevenue + a2_1.ad_price) as double) as revenue_price -- 将iaa和iap的值相加,作为标签",
        'iap': "cast(a2_1.itemrevenue as double) as revenue_price -- 将iap作为标签",
        'iaa': "cast(a2_1.ad_price as double) as revenue_price -- 将iaa作为标签",
    }
    if self.mode not in label_exprs:
        raise Et(
            2, f"error mode {self.mode}, expected one of {list(label_exprs)}")
    revenue_select = label_exprs[self.mode]

    def pad(day, delta):
        """Shift a date by `delta` days and format it as YYYY-MM-DD."""
        return arrow.get(day).shift(days=delta).format("YYYY-MM-DD")

    # NOTE(review): in union_all the negative rows are numbered with
    # PARTITION BY is_rd. For a continuous label that numbering restarts for
    # every distinct label value, so the nav_neg_multi cap applies per value
    # rather than to the negative set as a whole - confirm this is intended.
    # NOTE(review): sdk_ad_price is annotated as cents below but divided by
    # 1000 to obtain dollars - confirm the unit.
    sql = f"""
        -- des: add by yuanxiao for machine learn train
        with new_user as (
            -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件
            SELECT
                a1.user_register_time as user_register_time,
                a1."#user_id" as user_user_id,
                b1."#user_id" as event_user_id,
                CASE WHEN b1."yiap__itemrevenue" is NULL THEN 0 ELSE b1."yiap__itemrevenue" END yiap__itemrevenue,
                CASE WHEN b1."sdk_ad_price" is NULL THEN 0 ELSE b1."sdk_ad_price" END sdk_ad_price,
                floor(to_unixtime(b1.event_time_utc))-floor(to_unixtime(a1.user_register_time)) as pd_rt_dur
            FROM
            (
                -- 获取指定日期的注册用户,以防万一,使用窗口函数去个重
                SELECT
                    *
                FROM
                (
                    SELECT
                        *,
                        row_number() OVER (PARTITION BY "#user_id" ORDER BY user_register_time ) AS row_no
                    FROM
                    (
                        SELECT
                            b.event_time_utc as user_register_time, a."#user_id"
                        FROM
                        (
                            SELECT
                                "#user_id"
                            FROM
                                {gconf.tga_user_table}
                            WHERE
                                {self.user_cols1}
                        ) a
                        INNER JOIN
                        (
                            SELECT
                                *
                            FROM (
                                SELECT
                                    "#user_id",
                                    {self.tran_dt_by_zone("#event_time", "#zone_offset", 0)} as event_time_utc,
                                    "$part_event"
                                FROM
                                    {gconf.tga_event_table}
                                WHERE
                                    "$part_event" = 'new_device'
                                    AND "$part_date" >= '{pad(comm_cc["new_device_start"], -2)}'
                                    AND "$part_date" <= '{pad(comm_cc["new_device_end"], 2)}'
                                    AND ({self.user_cols2})
                            )
                            WHERE
                                event_time_utc >= timestamp '{comm_cc["new_device_start"]}'
                                AND event_time_utc < timestamp '{comm_cc["new_device_end"]}'
                        ) b
                        ON a."#user_id" = b."#user_id"
                    )
                )
                WHERE row_no=1
            ) a1
            LEFT JOIN
            (
                -- 获取指定日期的7-10天后的所有事件
                SELECT
                    *
                FROM (
                    SELECT
                        "#user_id",
                        "yiap__itemrevenue", --- 美元
                        "sdk_ad_price", -- 美分
                        "$part_event",
                        {self.tran_dt_by_zone("#event_time", "#zone_offset", 0)} as event_time_utc
                    FROM
                        {gconf.tga_event_table}
                    WHERE
                        "$part_event" in ('server_iap', 'impression_ad')
                        AND "$part_date" >= '{pad(comm_cc["label_date_start"], -2)}'
                        AND "$part_date" <= '{pad(comm_cc["label_date_end"], 2)}'
                )
                WHERE
                    event_time_utc >= timestamp '{comm_cc["label_date_start"]}'
                    AND event_time_utc <= timestamp '{comm_cc["label_date_end"]}'
            ) b1
            ON a1."#user_id" = b1."#user_id"
        )
        , user_label_table as (
            -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量 为0[流失用户] 和 大于0[留存优质用户] 的数量
            SELECT
                -- SUM(CASE WHEN a3.event_count=0 THEN 1 ELSE 0 END) as miss_count,
                -- SUM(CASE WHEN a3.event_count>0 THEN 1 ELSE 0 END) as high_value_count
                a3.revenue_price as is_rd, -- 将指定mode的ltv值作为标签
                a3.user_user_id,
                a3.user_register_time,
                -- 计算固定随机值,打乱顺序
                (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
            FROM
            (
                -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
                SELECT
                    a2_1.user_user_id,
                    a2_1.user_register_time,
                    {revenue_select}
                FROM (
                    SELECT
                        a2.user_user_id as user_user_id,
                        a2.user_register_time as user_register_time,
                        SUM(
                            CASE
                                WHEN a2.pd_rt_dur IS NULL THEN 0
                                WHEN a2.pd_rt_dur<{gconf.label_durs[0]} OR a2.pd_rt_dur>={gconf.label_durs[1]} THEN 0
                                ELSE a2."yiap__itemrevenue"
                            END
                        ) AS itemrevenue, -- 美元
                        SUM(
                            CASE
                                WHEN a2.pd_rt_dur IS NULL THEN 0
                                WHEN a2.pd_rt_dur<{gconf.label_durs[0]} OR a2.pd_rt_dur>={gconf.label_durs[1]} THEN 0
                                ELSE a2."sdk_ad_price"
                            END
                        )/1000 AS ad_price -- 这里除以1000变成美元
                    FROM
                    (
                        new_user
                    ) a2
                    GROUP BY a2.user_user_id, a2.user_register_time
                ) a2_1
            ) a3
        )
        , nav_table as (
            select
                *
            from
                user_label_table
            where
                is_rd>{self.line_value_rd}
            -- 固定排序,防止找不到用户, 不适用于超大数据量
            order by tt_stable_rand
            limit {6000* comm_cc['days']}
        )
        , neg_table as (
            select
                *
            from
                user_label_table
            where
                is_rd<={self.line_value_rd}
            -- 固定排序,防止找不到用户, 不适用于超大数据量
            order by tt_stable_rand
            limit {12000* comm_cc['days']}
        )
        , union_all as (
            -- 保持正负样本固定比例1:2
            select
                *
            from (
                (
                    select is_rd,user_register_time, user_user_id from nav_table
                )
                UNION ALL
                (
                    select
                        is_rd,user_register_time, user_user_id
                    from (
                        SELECT *, row_number() OVER (
                            PARTITION BY is_rd
                            ORDER BY tt_stable_rand
                        ) AS kere_nopoa_end_0
                        FROM neg_table
                    ) where kere_nopoa_end_0 < (select count(1)*{self.nav_neg_multi} from nav_table)
                )
            ) st
        )
        , all_need_event as (
            SELECT
                *
            FROM (
                -- 获取正负标签样本需要的事件
                select
                    a5.is_rd,
                    a5.user_register_time,
                    a5.user_user_id,
                    floor(to_unixtime(b5.event_time_utc))-floor(to_unixtime(a5.user_register_time)) as rt_dur,
                    c5."afrawip__meida_source" as "afrawip__meida_source_ikfdssausercommonend",
                    c5."#account_id" as "#account_id_ikfdssausercommonend",
                    c5."#distinct_id" as "#distinct_id_ikfdssausercommonend",
                    b5.*,
                    b5.event_time_utc as "#event_time"
                from
                    union_all a5
                INNER JOIN
                (
                    SELECT
                        *
                    FROM (
                        SELECT
                            -- *
                            {','.join(['"%s"'%x for x in self.need_event_attrs])},
                            {self.tran_dt_by_zone("#event_time", "#zone_offset", 0)} as event_time_utc
                        FROM
                            {gconf.tga_event_table}
                        WHERE
                            "$part_date" >= '{pad(comm_cc["part_date_start"], -2)}'
                            AND "$part_date" <= '{pad(comm_cc["feature_date_end"], 2)}'
                            AND "$part_event" in ({','.join(["'%s'"%x for x in self.need_events])})
                            AND ({self.event_cols1})
                    )
                    WHERE
                        event_time_utc >= timestamp '{comm_cc["part_date_start"]}'
                        AND event_time_utc <= timestamp '{comm_cc["feature_date_end"]}'
                )
                as b5
                ON a5.user_user_id = b5."#user_id"
                INNER JOIN
                (
                    SELECT
                        "#user_id","afrawip__meida_source","#account_id","#distinct_id"
                    FROM
                        {gconf.tga_user_table}
                )
                as c5
                ON a5.user_user_id = c5."#user_id"
            ) a6
            WHERE
                rt_dur>=0 AND rt_dur<={gconf.level_durs[-1]} -- N小时内的事件
        )
        """
    return sql
|
|
472
|
+
|
|
473
|
+
def get_sub_sql_i(self, sub_sql_f, col_name, col_des, base_event_table='all_need_event', base_user_table='a'):
    """
    @des: Internal helper - expand one feature template over every
          duration window in gconf.level_durs.

    sub_sql_f is a %-style template of this form:

        --%(real_des)s
        sum(
            if(
                "$part_event"='act_level_path' and object_type='normal' and act='win' and rt_dur<%(level_dur)s,
                1,
                0)
        ) as %(sub_field_name)s

    Args:
        sub_sql_f: the template described above.
        col_name: feature key; window i produces the field '<col_name>_<i>'.
        col_des: feature description; window i is described as '<col_des> <i>'.
        base_event_table: accepted but not used by this method.
        base_user_table: accepted but not used by this method.

    Returns:
        tuple: (list of field names, list of rendered SQL fragments),
        both in window order.
    """
    field_names = []
    rendered = []
    for window_index, window_seconds in enumerate(self.gconf.level_durs):
        field_name = "%s_%s" % (col_name, window_index)
        field_names.append(field_name)
        rendered.append(sub_sql_f % {
            'real_des': "%s %s" % (col_des, window_index),
            'level_dur': window_seconds,
            'sub_field_name': field_name,
        })
    return field_names, rendered
|
|
503
|
+
|
|
504
|
+
def get_feature_sql(self):
    """
    @des: Build the second part of the training query: the user_tzz CTE
          plus the final SELECT.

    user_tzz left-joins one row of user attributes and label per user
    (the earliest event row in the base event table) with the per-user
    feature aggregates rendered from sub_sql_fs. The final SELECT keeps
    only rows whose first feature column is not NULL.

    The output always contains is_rd, user_user_id,
    afrawip__meida_source_ikfdssausercommonend, #lib and #country_code,
    followed by the stripped, de-duplicated addon_attrs.

    Returns:
        str: SQL text starting with ``, user_tzz as (``, meant to be
        appended to the output of get_event_sql().

    Raises:
        Et: if a feature key has no template in sub_sql_fs, or if no
            feature column is produced at all.
    """
    base_event_table = self.base_event_table
    base_user_table = self.base_user_table
    base_feature_table = self.base_feature_table

    # Render every feature template over every duration window.
    sub_sqls = []
    sub_field_names = []
    for col_name, col_des in self.feature_names.items():
        # The reserved user-table entry has fixed SQL and no template.
        if col_name == base_user_table:
            continue
        sub_sql_f = self.sub_sql_fs.get(col_name)
        if not sub_sql_f:
            raise Et(2, f"error col_name: {col_name} ")
        _sub_field_names, _sub_sqls = self.get_sub_sql_i(
            sub_sql_f, col_name, col_des)
        sub_sqls.extend(_sub_sqls)
        sub_field_names.extend(_sub_field_names)
    if not sub_field_names:
        # Without this guard the SELECT lists below end in a dangling comma.
        raise Et(2, "no feature columns to select")

    # Columns taken from the user/label side of the join.
    base_user_table_fields = [
        "is_rd", "user_user_id", "afrawip__meida_source_ikfdssausercommonend", "#lib", "#country_code"]
    for addon_attr in self.addon_attrs:
        _addon_attr = addon_attr.strip()
        if _addon_attr not in base_user_table_fields:
            base_user_table_fields.append(_addon_attr)

    # Outer select list: qualified user columns, then the feature columns
    # renamed to '<field>_v'.
    user_selects = ",".join(
        ['%s."%s"' % (base_user_table, ptf) for ptf in base_user_table_fields])
    feature_selects = ",".join(
        [f"{base_feature_table}.{item} as {item}_v" for item in sub_field_names])
    sub_selects = f"{user_selects},{feature_selects}"
    # Inner select list: the same user columns, unqualified.
    quoted_fields = ",".join(
        ['"%s"' % ptf for ptf in base_user_table_fields])
    sub_join_sqls = ",".join(sub_sqls)

    sql = f"""
        , user_tzz as (
            select
                {sub_selects}
            from (
                --用户属性和标签
                select
                    {quoted_fields}
                from
                (
                    select
                        {quoted_fields},
                        row_number() OVER (PARTITION BY user_user_id ORDER BY "#event_time" asc) AS row_no
                    from
                        {base_event_table}
                )
                where
                    row_no=1
            ) {base_user_table}
            left join
            ( --- 特征值计算
                select
                    user_user_id,
                    {sub_join_sqls}
                from
                    {base_event_table}
                group by user_user_id
            ) {base_feature_table}
            on {base_user_table}.user_user_id={base_feature_table}.user_user_id
        )
        -- with结束没有逗号, 过滤左连接没有特征值的行,调试的时候取消where条件
        select * from user_tzz where {self.get_first_col_name()} is not NULL
        """
    return sql
|
|
594
|
+
|
|
595
|
+
def get_sql(self):
    """
    @des: Return the complete training query: the event CTE chain from
          get_event_sql() followed by the feature CTE and final SELECT
          from get_feature_sql().
    """
    event_part = self.get_event_sql()
    feature_part = self.get_feature_sql()
    return f"""
        {event_part}
        {feature_part}
        """
|
|
@@ -11,7 +11,7 @@ gconf的属性有这些:
|
|
|
11
11
|
level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
|
|
12
12
|
label_durs # 取多少边界算留存,例如[int(86400*14), int(86400*17)],整型数组;这里是取14到17天发生过事件的作为正向用户
|
|
13
13
|
tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
|
|
14
|
-
tga_event_table # tga
|
|
14
|
+
tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
|
|
15
15
|
########
|
|
16
16
|
|
|
17
17
|
如何使用(需要自己定义一个子类继承这个类, 代码如下):
|
|
@@ -9,9 +9,9 @@ gconf的属性有这些:
|
|
|
9
9
|
part_date_start # 特征值取新增用户的开始日期, 例如"2022-05-20", 字符串类型
|
|
10
10
|
part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
|
|
11
11
|
level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
|
|
12
|
-
label_durs #
|
|
12
|
+
label_durs # 取多少边界算硬核用户标签,例如[int(86400*14), int(86400*17)],整型数组;这里是取14到17天发生高粘度事件的作为正向用户
|
|
13
13
|
tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
|
|
14
|
-
tga_event_table # tga
|
|
14
|
+
tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
|
|
15
15
|
########
|
|
16
16
|
|
|
17
17
|
如何使用(需要自己定义一个子类继承这个类, 代码如下):
|
|
@@ -26,9 +26,12 @@ tfduck/sagemaker/__init__.py
|
|
|
26
26
|
tfduck/sagemaker/saoper.py
|
|
27
27
|
tfduck/tga/__init__.py
|
|
28
28
|
tfduck/tga/base_tga.py
|
|
29
|
+
tfduck/tga/predict_sql_ltv.py
|
|
29
30
|
tfduck/tga/predict_sql_retain.py
|
|
30
31
|
tfduck/tga/predict_sql_yh.py
|
|
31
32
|
tfduck/tga/tga.py
|
|
33
|
+
tfduck/tga/tga_test.py
|
|
34
|
+
tfduck/tga/train_sql_ltv.py
|
|
32
35
|
tfduck/tga/train_sql_retain.py
|
|
33
36
|
tfduck/tga/train_sql_yh.py
|
|
34
37
|
tfduck/thinkdata/__init__.py
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__="0.16.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|