tfduck-bsd 0.16.1__tar.gz → 0.16.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tfduck-bsd might be problematic.

Files changed (46)
  1. {tfduck-bsd-0.16.1/tfduck_bsd.egg-info → tfduck-bsd-0.16.2}/PKG-INFO +1 -1
  2. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/setup.py +1 -1
  3. tfduck-bsd-0.16.2/tfduck/__init__.py +1 -0
  4. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/pyspark_k8s/spark_manage.py +2 -2
  5. tfduck-bsd-0.16.2/tfduck/tga/predict_sql_ltv.py +90 -0
  6. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/predict_sql_retain.py +1 -1
  7. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/predict_sql_yh.py +1 -1
  8. tfduck-bsd-0.16.2/tfduck/tga/tga_test.py +88 -0
  9. tfduck-bsd-0.16.2/tfduck/tga/train_sql_ltv.py +603 -0
  10. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/train_sql_retain.py +1 -1
  11. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/train_sql_yh.py +2 -2
  12. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2/tfduck_bsd.egg-info}/PKG-INFO +1 -1
  13. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/SOURCES.txt +3 -0
  14. tfduck-bsd-0.16.1/tfduck/__init__.py +0 -1
  15. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/LICENSE +0 -0
  16. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/README.md +0 -0
  17. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/bin/tfduck +0 -0
  18. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/setup.cfg +0 -0
  19. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/__init__.py +0 -0
  20. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/config/__init__.py +0 -0
  21. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/config/bdpmanager.py +0 -0
  22. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/config/table_config.py +0 -0
  23. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/example.py +0 -0
  24. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/opends/__init__.py +0 -0
  25. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/opends/opends.py +0 -0
  26. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/bdp_sdk_py/opends/sdk.py +0 -0
  27. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/common/__init__.py +0 -0
  28. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/common/defines.py +0 -0
  29. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/common/extendEncoder.py +0 -0
  30. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/main.py +0 -0
  31. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/oss/__init__.py +0 -0
  32. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/oss/oss.py +0 -0
  33. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/pyspark_k8s/__init__.py +0 -0
  34. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/pyspark_k8s/k8s_manage.py +0 -0
  35. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/s3/__init__.py +0 -0
  36. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/s3/s3oper.py +0 -0
  37. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/sagemaker/__init__.py +0 -0
  38. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/sagemaker/saoper.py +0 -0
  39. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/__init__.py +0 -0
  40. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/base_tga.py +0 -0
  41. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/tga.py +0 -0
  42. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/thinkdata/__init__.py +0 -0
  43. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/thinkdata/query.py +0 -0
  44. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/dependency_links.txt +0 -0
  45. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/requires.txt +0 -0
  46. {tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/top_level.txt +0 -0
{tfduck-bsd-0.16.1/tfduck_bsd.egg-info → tfduck-bsd-0.16.2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tfduck-bsd
- Version: 0.16.1
+ Version: 0.16.2
  Summary: A small example package
  Home-page: UNKNOWN
  Author: yuanxiao
{tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/setup.py
@@ -8,7 +8,7 @@ with open("README.md", "r") as fh:
 
  setuptools.setup(
  name="tfduck-bsd",
- version="0.16.1",
+ version="0.16.2",
  author="yuanxiao",
  author_email="yuan6785@163.com",
  description="A small example package",
tfduck-bsd-0.16.2/tfduck/__init__.py
@@ -0,0 +1 @@
+ __version__="0.16.2"
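With tfduck/__init__.py now defining __version__, the installed package can report its own version. A minimal sketch, assuming tfduck-bsd 0.16.2 is installed and imported as tfduck:

    import tfduck
    print(tfduck.__version__)  # expected to print 0.16.2 for this release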
{tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/pyspark_k8s/spark_manage.py
@@ -364,7 +364,7 @@ if __name__ == "__main__":
  # # "容器内path,没有被污染也可以不设置"
  # os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python342342342342"
 
- if 0: # 同步--天天--测试上传代码路径
+ if 1: # 同步--天天--测试上传代码路径
  sm = SparkManage(
  ctx={},
  # code_path='/Users/yuanxiao/workspace/djcelery44/djcelery44/scripts/tools/debug/test_presto_s3',
@@ -380,7 +380,7 @@ if __name__ == "__main__":
  )
  sm.upload_code()
  sm.submit_spark_task()
- if 1: # 同步--天天---测试直接上传代码内容
+ if 0: # 同步--天天---测试直接上传代码内容
  with open('/Users/yuanxiao/workspace/djcelery44/djcelery44/scripts/tools/debug/test_hello/main_script.py', 'r') as f:
  code_content = f.read()
  #
tfduck-bsd-0.16.2/tfduck/tga/predict_sql_ltv.py
@@ -0,0 +1,90 @@
+ '''
+ @des: tga获取批量转换特征值数据拉取的sql基类
+
+
+ 准备工作:
+ --------------------------------------
+ gconf的属性有这些:
+ ########
+ part_date_start # 特征值取新增用户的开始日期, 例如"2022-05-20", 字符串类型
+ part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
+ level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
+ tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
+ tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
+ ########
+
+ 如何使用(需要自己定义一个子类继承这个类, 代码如下):
+ #######
+ from magic_number_train_tf_2.src.common.config import Config as GAConfig # 这里换成你自己的项目
+ from tfduck.tga.predict_sql_ltv import PredictFeatureSql
+
+ def call_method(self, ctx=None, **kwargs):
+ """
+ @des: 每个具体项目的训练特征数据拉取
+ """
+ gconf = GAConfig.getInstance() # 保证gconf有上面准备工作的属性
+ """
+ 设置项目属性-----下面的都需要自己配置------
+ """
+ # 需要的事件
+ need_events = (
+ 'new_device', 'new_session', 'act_level_path'
+ )
+ # 需要的属性
+ need_event_attrs = (
+ "#lib", "#country_code", "$part_event", "#user_id", "sdk_session_time",
+ "object_type", "act", "object_number", "act_object",
+ )
+ # 特征值名称(不要用字母a作为key)---
+ feature_names = {
+ 'b': '常规关卡通关',
+ 'c': '冒险关卡通关'
+ }
+ # 特征值sql---模板保持不变,变里面的内容即可---
+ sub_sql_fs = {
+ 'b': """
+ --%(real_des)s
+ sum(
+ if(
+ "$part_event"='act_level_path' and object_type='normal' and act='win' and rt_dur<%(level_dur)s,
+ 1,
+ 0)
+ ) as %(sub_field_name)s
+ """,
+ 'c': """
+ --%(real_des)s
+ sum(
+ if(
+ "$part_event"='act_level_path' and object_type='adventure' and act='win' and rt_dur<%(level_dur)s,
+ 1,
+ 0)
+ ) as %(sub_field_name)s
+ """
+ }
+ # 额外的用户属性比如(可选参数)
+ addon_attrs = ["#screen_width", "#screen_height"]
+ """
+ 创建调用实例---最好通过dict创建实例
+ """
+ pf_sql_obj = PredictFeatureSql(
+ ctx = ctx,
+ gconf = gconf,
+ need_events = need_events,
+ need_event_attrs = need_event_attrs,
+ feature_names = feature_names,
+ sub_sql_fs = sub_sql_fs,
+ addon_attrs = addon_attrs # 可选参数
+ )
+ sql = pf_sql_obj.get_sql() # 这个sql就是拉取特征值的sql
+ #######
+ '''
+ from tfduck.common.defines import BMOBJ, Et
+ from tfduck.tga.predict_sql_retain import PredictFeatureSql as BasePredictFeatureSql
+ import arrow
+
+
+ class PredictFeatureSql(BasePredictFeatureSql):
+ """
+ @des:sql批量转换模板基类
+ """
+ pass
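predict_sql_ltv.py only re-exports PredictFeatureSql from tfduck.tga.predict_sql_retain under the LTV module name; the docstring above describes the intended usage. A minimal, self-contained sketch of that usage, with hypothetical config values mirroring the docstring examples (the gconf attributes and constructor keywords are taken from the documentation above, not verified against predict_sql_retain.py):

    from tfduck.tga.predict_sql_ltv import PredictFeatureSql

    class GConf(object):
        # hypothetical stand-in for a project config carrying the attributes the docstring requires
        part_date_start = "2022-05-20"
        part_date_end = "2022-05-25"
        level_durs = [43200, 43200 * 2]   # seconds, ascending
        tga_user_table = "v_user_7"
        tga_event_table = "v_event_7"

    pf_sql_obj = PredictFeatureSql(
        ctx={},
        gconf=GConf(),
        need_events=('new_device', 'new_session', 'act_level_path'),
        need_event_attrs=("#lib", "#country_code", "$part_event", "#user_id"),
        feature_names={'b': 'normal levels cleared'},   # do not use 'a' as a key
        sub_sql_fs={'b': """
            --%(real_des)s
            sum(
                if("$part_event"='act_level_path' and object_type='normal' and act='win' and rt_dur<%(level_dur)s, 1, 0)
            ) as %(sub_field_name)s
        """},
        addon_attrs=["#screen_width", "#screen_height"],  # optional
    )
    sql = pf_sql_obj.get_sql()  # the SQL that pulls the prediction features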
{tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/predict_sql_retain.py
@@ -10,7 +10,7 @@ gconf的属性有这些:
  part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
  level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
  tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
- tga_event_table # tga用户表名, 例如 "v_user_7", 字符串
+ tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
  ########
 
  如何使用(需要自己定义一个子类继承这个类, 代码如下):
{tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/predict_sql_yh.py
@@ -10,7 +10,7 @@ gconf的属性有这些:
  part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
  level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
  tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
- tga_event_table # tga用户表名, 例如 "v_user_7", 字符串
+ tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
  ########
 
  如何使用(需要自己定义一个子类继承这个类, 代码如下):
tfduck-bsd-0.16.2/tfduck/tga/tga_test.py
@@ -0,0 +1,88 @@
+ # -*- coding: utf-8 -*-
+ from train_sql_ltv import TrainFeatureSql
+
+
+ def test():
+ class GConf(object):
+ def __init__(self):
+ self.part_date_start = "2023-01-15"
+ self.part_date_end = "2023-01-17"
+ # 注册后0-14天的所有ltv作为标签
+ self.label_durs = [int(86400*0), int(86400*14)]
+ self.level_durs = [86400*0.25, 86400*0.5,
+ 86400, 86400*2, 86400*3] # 由小到大排列
+ self.tga_user_table = "v_user_30"
+ self.tga_event_table = "v_event_30"
+
+ gconf = GConf()
+
+ need_events = (
+ 'new_device',
+ 'new_session',
+ 'g_push',
+ 'sdk_close_session'
+ )
+ need_event_attrs = (
+ "#lib",
+ "pn",
+ "#device_model",
+ "#screen_width",
+ "#screen_height",
+ "#country_code",
+ "$part_event",
+ "#user_id"
+ )
+ feature_names = {
+ 'b': '打开游戏次数',
+ 'c': '本地推送打开游戏次数'
+ }
+ addon_attrs = [
+ "#device_model",
+ "#screen_width",
+ "#screen_height"
+ ]
+ sub_sql_fs = {
+ 'b':
+ # 打开游戏次数
+ """
+ --%(real_des)s
+ sum(
+ if(
+ "$part_event"='new_session' and rt_dur<%(level_dur)s,
+ 1,
+ 0)
+ ) as %(sub_field_name)s
+ """, 'c':
+ # 本地推送打开游戏次数
+ """
+ --%(real_des)s
+ sum(
+ if(
+ "$part_event"='g_push' and rt_dur<%(level_dur)s,
+ 1,
+ 0)
+ ) as %(sub_field_name)s
+ """
+ }
+ user_cols2 = """ "#lib"='Android' """
+ tf_sql_obj = TrainFeatureSql(
+ ctx={},
+ gconf=gconf,
+ need_events=need_events,
+ need_event_attrs=need_event_attrs,
+ feature_names=feature_names,
+ sub_sql_fs=sub_sql_fs,
+ user_cols2=user_cols2,
+ addon_attrs=addon_attrs, # 可选参数
+ line_value_rd=0.5, # 可选参数
+ mode="iaa+iap"
+ )
+ sql = tf_sql_obj.get_sql() # 这个sql就是拉取特征值的sql
+ # 将sql复制到剪贴板
+ import pyperclip
+ pyperclip.copy(sql)
+ return sql
+
+
+ if __name__ == "__main__":
+ test()
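The sub_sql_fs entries above are plain %-format templates; for each key, TrainFeatureSql fills real_des, level_dur and sub_field_name once per entry of gconf.level_durs (see get_sub_sql_i in train_sql_ltv.py below). A small sketch of what the first expansion of template 'b' from this test would look like, with values chosen to match the test config:

    template = """
        --%(real_des)s
        sum(
            if(
            "$part_event"='new_session' and rt_dur<%(level_dur)s,
            1,
            0)
        ) as %(sub_field_name)s
    """
    # first level_dur is 86400*0.25 == 21600.0 seconds; field names follow the "<key>_<index>" pattern
    print(template % {"real_des": "打开游戏次数 0", "level_dur": 86400 * 0.25, "sub_field_name": "b_0"})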
tfduck-bsd-0.16.2/tfduck/tga/train_sql_ltv.py
@@ -0,0 +1,603 @@
+ '''
+ @des: tga获取训练特征值数据拉取的sql基类
+
+
+ 准备工作:
+ --------------------------------------
+ gconf的属性有这些:
+ ########
+ part_date_start # 特征值取新增用户的开始日期, 例如"2022-05-20", 字符串类型
+ part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
+ level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
+ label_durs # 取多少边界算ltv标签值,例如[int(86400*14), int(86400*17)],整型数组;这里是取14到17天发生过事件的作为正向用户
+ tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
+ tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
+ ########
+
+ 如何使用(需要自己定义一个子类继承这个类, 代码如下):
+ #######
+ from magic_number_train_tf_2.src.common.config import Config as GAConfig # 这里换成你自己的项目
+ from tfduck.tga.train_sql_ltv import TrainFeatureSql
+
+ def call_method(self, ctx=None, **kwargs):
+ """
+ @des: 每个具体项目的训练特征数据拉取
+ """
+ gconf = GAConfig.getInstance() # 保证gconf有上面准备工作的属性
+ """
+ 设置项目属性-----下面的都需要自己配置------
+ """
+ # 需要的事件
+ need_events = (
+ 'new_device', 'new_session', 'act_level_path'
+ )
+ # 需要的属性
+ need_event_attrs = (
+ "#lib", "#country_code", "$part_event", "#user_id", "sdk_session_time",
+ "object_type", "act", "object_number", "act_object",
+ )
+ # 特征值名称--不要用字母a作为key---
+ feature_names = {
+ 'b': '常规关卡通关',
+ 'c': '冒险关卡通关'
+ }
+ # 特征值sql---模板保持不变,变里面的内容即可---
+ sub_sql_fs = {
+ 'b': """
+ --%(real_des)s
+ sum(
+ if(
+ "$part_event"='act_level_path' and object_type='normal' and act='win' and rt_dur<%(level_dur)s,
+ 1,
+ 0)
+ ) as %(sub_field_name)s
+ """,
+ 'c': """
+ --%(real_des)s
+ sum(
+ if(
+ "$part_event"='act_level_path' and object_type='adventure' and act='win' and rt_dur<%(level_dur)s,
+ 1,
+ 0)
+ ) as %(sub_field_name)s
+ """
+ }
+ # 额外的用户属性比如(可选参数)
+ addon_attrs = ["#screen_width", "#screen_height"]
+ # line_value_rd ltv的线性值大于多少算正样本,小于多少算负样本,这样可以控制正负样本比例
+ line_value_rd = 0.5
+ # mode 标签值iaa(只计算iaa), iap(只计算iap), iaa+iap(计算iaa+iap)
+ mode = 'iaa+iap'
+ """
+ 创建调用实例---最好通过dict创建实例
+ 注意: 针对一个tga项目属于多个项目的打点情况,例如3dfish, 可以适时启用user_cols1,user_cols2或者event_cols1
+ a. user_cols1 参数为用户过滤条件(user表)
+ 如果有的话, 例如: user_cols1 = """ "afrawip__meida_source"='FaceBook' OR "afrawip__app_id"='123456' """
+ b. user_cols2 参数为用户过滤条件(event表的new_device事件)
+ 如果有的话, 例如: user_cols2 = """ "#lib"='Android' OR "#lib"='iOS' """
+ c. event_cols1 参数为事件过滤条件(event表的所有计算特征值的事件)
+ 如果有的话, 例如: event_cols1 = """ "#lib"='Android' OR "#lib"='iOS' """
+ """
+ tf_sql_obj = TrainFeatureSql(
+ ctx = ctx,
+ gconf = gconf,
+ need_events = need_events,
+ need_event_attrs = need_event_attrs,
+ feature_names = feature_names,
+ sub_sql_fs = sub_sql_fs,
+ addon_attrs = addon_attrs # 可选参数
+ )
+ sql = tf_sql_obj.get_sql() # 这个sql就是拉取特征值的sql
+
+ #######
+ '''
+ from tfduck.tga.base_tga import BaseTga
+ from tfduck.common.defines import BMOBJ, Et
+ import arrow
+
+
+ class TrainFeatureSql(BaseTga):
+ """
+ @des:sql训练模板基类
+ """
+
+ def __init__(self,
+ ctx=None,
+ gconf=None,
+ need_events=None,
+ need_event_attrs=None,
+ feature_names=None,
+ sub_sql_fs=None,
+ user_cols1='1=1',
+ user_cols2='1=1',
+ event_cols1='1=1',
+ addon_attrs=[],
+ line_value_rd=0.5,
+ mode='iaa+iap',
+ **kwargs):
+ """
+ @des: 参数说明看上面的文档说明
+ """
+ self.gconf = gconf
+ self.ctx = ctx
+ # 基础属性
+ self.label_col = "is_rd"
+ self.base_event_table = 'all_need_event'
+ self.base_user_table = 'a'
+ self.base_feature_table = 'features_t'
+ self.nav_neg_multi = 2 # 负正样本比例, 线性值无正负样本比例的说法, 但可以给一个中间值,大于多少,小于多少即可
+ self.line_value_rd = line_value_rd # ltv的线性值大于多少算正样本,小于多少算负样本,这样可以控制正负样本比例
+ self.mode = mode # mode 标签值iaa(只计算iaa), iap(只计算iap), iaa+iap(计算iaa+iap)
+
+ # 项目属性
+ self.need_events = need_events
+ self.need_event_attrs = need_event_attrs
+ self.feature_names = feature_names
+ if self.base_user_table in self.feature_names:
+ raise Et(2, f"attr name '{self.base_user_table}' cannt be used")
+ feature_names_all = {
+ self.base_user_table: '用户属性和标签', # 此属性固定sql,不需要拼接
+ }
+ feature_names_all.update(self.feature_names)
+ self.feature_names = feature_names_all
+ self.sub_sql_fs = sub_sql_fs
+ #
+ self.user_cols1 = user_cols1
+ self.user_cols2 = user_cols2
+ self.event_cols1 = event_cols1
+ self.addon_attrs = addon_attrs if isinstance(addon_attrs, list) else []
+ # 其他属性
+ for k, v in kwargs.items():
+ setattr(self, k, v)
+
+ def get_first_col_name(self):
+ """
+ @des: 获取第一个特征属性列
+ """
+ return f"{list(self.feature_names.keys())[1]}_0_v"
+
+ def get_real_feature_names(self, mode='d'):
+ """
+ @des: 获取真正的特征值列表
+ """
+ real_features_names = {}
+ sub_len = len(self.gconf.level_durs)
+ for k, v in self.feature_names.items():
+ if k != self.base_user_table:
+ for i in range(sub_len):
+ if mode == 'h':
+ rv = f"{v}_{int(self.gconf.level_durs[i]/3600)}h"
+ else:
+ js = self.gconf.level_durs[i]/86400
+ if js == int(js):
+ js = int(js)
+ else:
+ js = round(js, 1)
+ rv = f"{v}_{js}d"
+ real_features_names[f"{k}_{i}_v"] = rv
+ return real_features_names
+
+ def get_threshold_sql(self, col_name):
+ """
+ @des: 根据col_name例如 g_1_v获取特征值计算的sql
+ """
+ first_col_name, second_col_index = col_name.split(
+ "_")[0], col_name.split("_")[1]
+ sub_field_name = "%s_%s_v" % (first_col_name, second_col_index)
+ real_des = "%s %s" % (self.feature_names.get(
+ first_col_name), second_col_index)
+ level_dur = self.gconf.level_durs[int(second_col_index)]
+ f_sub_sql = self.sub_sql_fs.get(first_col_name)
+ if not f_sub_sql:
+ raise Et(2, f"error first_col_name {first_col_name}")
+ sub_sql = f_sub_sql % {
+ "real_des": real_des, "level_dur": level_dur, 'sub_field_name': sub_field_name}
+ return sub_sql
+
+ def get_sql_config(self):
+ """
+ @des: 计算各种时间间隔和配置,统一从一个方法读取,后面方便改
+ """
+ gconf = self.gconf
+ #
+ part_date_start = gconf.part_date_start
+ part_date_end = gconf.part_date_end
+ days = (arrow.get(part_date_end)-arrow.get(part_date_start)).days
+ # 取用户新增后的N天作为事件池数据来取特征值,比如7日预测14日,就取8-9比较合适,根据level_durs[-1]决定
+ after_feature_day = int(gconf.level_durs[-1]/86400)+2 # 多取两天即可
+ feature_date_end = arrow.get(part_date_end).shift(days=after_feature_day).format(
+ "YYYY-MM-DD") # 增加到8天的事件数据,因为现在有7日预测30日
+ label_durs = gconf.label_durs
+ label_date_start = arrow.get(part_date_start).shift(
+ days=int(label_durs[0]/86400)-2).format("YYYY-MM-DD")
+ label_date_end = arrow.get(part_date_end).shift(
+ days=int(label_durs[1]/86400)+2).format("YYYY-MM-DD")
+ #
+ new_device_start = arrow.get(part_date_start).format("YYYY-MM-DD")
+ new_device_end = arrow.get(part_date_end).shift(
+ days=1).format("YYYY-MM-DD")
+ comm_cc = {
+ "days": days,
+ 'part_date_start': part_date_start,
+ 'part_date_end': part_date_end,
+ 'feature_date_end': feature_date_end,
+ 'label_date_start': label_date_start,
+ 'label_date_end': label_date_end,
+ 'new_device_start': new_device_start,
+ 'new_device_end': new_device_end
+ }
+ # print(111111, comm_cc)
+ return comm_cc
+
+ def get_event_sql(self):
+ """
+ @des: 获取N日ltv用户数据
+ """
+ gconf = self.gconf
+ comm_cc = self.get_sql_config()
+ #
+ sql = f"""
+ -- des: add by yuanxiao for machine learn train
+ with new_user as (
+ -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件
+ SELECT
+ a1.user_register_time as user_register_time,
+ a1."#user_id" as user_user_id,
+ b1."#user_id" as event_user_id,
+ CASE WHEN b1."yiap__itemrevenue" is NULL THEN 0 ELSE b1."yiap__itemrevenue" END yiap__itemrevenue,
+ CASE WHEN b1."sdk_ad_price" is NULL THEN 0 ELSE b1."sdk_ad_price" END sdk_ad_price,
+ floor(to_unixtime(b1.event_time_utc))-floor(to_unixtime(a1.user_register_time)) as pd_rt_dur
+ FROM
+ (
+ -- 获取指定日期的注册用户,以防万一,使用窗口函数去个重
+ SELECT
+ *
+ FROM
+ (
+ SELECT
+ *,
+ row_number() OVER (PARTITION BY "#user_id" ORDER BY user_register_time ) AS row_no
+ FROM
+ (
+ SELECT
+ b.event_time_utc as user_register_time, a."#user_id"
+ FROM
+ (
+ SELECT
+ "#user_id"
+ FROM
+ {gconf.tga_user_table}
+ WHERE
+ {self.user_cols1}
+ ) a
+ INNER JOIN
+ (
+ SELECT
+ *
+ FROM (
+ SELECT
+ "#user_id",
+ {self.tran_dt_by_zone("#event_time", "#zone_offset", 0)} as event_time_utc,
+ "$part_event"
+ FROM
+ {gconf.tga_event_table}
+ WHERE
+ "$part_event" = 'new_device'
+ AND "$part_date" >= '{arrow.get(comm_cc["new_device_start"]).shift(days=-2).format("YYYY-MM-DD")}'
+ AND "$part_date" <= '{arrow.get(comm_cc["new_device_end"]).shift(days=2).format("YYYY-MM-DD")}'
+ AND {self.user_cols2}
+ )
+ WHERE
+ event_time_utc >= timestamp '{comm_cc["new_device_start"]}'
+ AND event_time_utc < timestamp '{comm_cc["new_device_end"]}'
+ ) b
+ ON a."#user_id" = b."#user_id"
+ )
+ )
+ WHERE row_no=1
+ ) a1
+ LEFT JOIN
+ (
+ -- 获取指定日期的7-10天后的所有事件
+ SELECT
+ *
+ FROM (
+ SELECT
+ "#user_id",
+ "yiap__itemrevenue", --- 美元
+ "sdk_ad_price", -- 美分
+ "$part_event",
+ {self.tran_dt_by_zone("#event_time", "#zone_offset", 0)} as event_time_utc
+ FROM
+ {gconf.tga_event_table}
+ WHERE
+ "$part_event" in ('server_iap', 'impression_ad')
+ AND "$part_date" >= '{arrow.get(comm_cc["label_date_start"]).shift(days=-2).format("YYYY-MM-DD")}'
+ AND "$part_date" <= '{arrow.get(comm_cc["label_date_end"]).shift(days=2).format("YYYY-MM-DD")}'
+ )
+ WHERE
+ event_time_utc >= timestamp '{comm_cc["label_date_start"]}'
+ AND event_time_utc <= timestamp '{comm_cc["label_date_end"]}'
+ ) b1
+ ON a1."#user_id" = b1."#user_id"
+ )
+ , user_label_table as (
+ -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量 为0[流失用户] 和 大于0[留存优质用户] 的数量
+ SELECT
+ -- SUM(CASE WHEN a3.event_count=0 THEN 1 ELSE 0 END) as miss_count,
+ -- SUM(CASE WHEN a3.event_count>0 THEN 1 ELSE 0 END) as high_value_count
+ a3.revenue_price as is_rd, -- 将指定mode的ltv值作为标签
+ a3.user_user_id,
+ a3.user_register_time,
+ -- 计算固定随机值,打乱顺序
+ (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
+ FROM
+ (
+ -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
+ SELECT
+ a2_1.user_user_id,
+ a2_1.user_register_time,
+ {
+ [
+ "cast((a2_1.itemrevenue + a2_1.ad_price) as double) as revenue_price -- 将iaa和iap的值相加,作为标签",
+ "cast(a2_1.itemrevenue as double) as revenue_price -- 将iap作为标签",
+ "cast(a2_1.ad_price as double) as revenue_price -- 将iaa作为标签",
+ ][['iaa+iap','iap','iaa'].index(self.mode)]
+ }
+ FROM (
+ SELECT
+ a2.user_user_id as user_user_id,
+ a2.user_register_time as user_register_time,
+ SUM(
+ CASE
+ WHEN a2.pd_rt_dur IS NULL THEN 0
+ WHEN a2.pd_rt_dur<{gconf.label_durs[0]} OR a2.pd_rt_dur>={gconf.label_durs[1]} THEN 0
+ ELSE a2."yiap__itemrevenue"
+ END
+ ) AS itemrevenue, -- 美元
+ SUM(
+ CASE
+ WHEN a2.pd_rt_dur IS NULL THEN 0
+ WHEN a2.pd_rt_dur<{gconf.label_durs[0]} OR a2.pd_rt_dur>={gconf.label_durs[1]} THEN 0
+ ELSE a2."sdk_ad_price"
+ END
+ )/1000 AS ad_price -- 这里除以1000变成美元
+ FROM
+ (
+ new_user
+ ) a2
+ GROUP BY a2.user_user_id, a2.user_register_time
+ ) a2_1
+ ) a3
+ )
+ , nav_table as (
+ select
+ *
+ from
+ user_label_table
+ where
+ is_rd>={self.line_value_rd}
+ -- 固定排序,防止找不到用户, 不适用于超大数据量
+ order by tt_stable_rand
+ limit {6000* comm_cc['days']}
+ )
+ , neg_table as (
+ select
+ *
+ from
+ user_label_table
+ where
+ is_rd<={self.line_value_rd}
+ -- 固定排序,防止找不到用户, 不适用于超大数据量
+ order by tt_stable_rand
+ limit {12000* comm_cc['days']}
+ )
+ , union_all as (
+ -- 保持正负样本固定比例1:2
+ select
+ *
+ from (
+ (
+ select is_rd,user_register_time, user_user_id from nav_table
+ )
+ UNION ALL
+ (
+ select
+ is_rd,user_register_time, user_user_id
+ from (
+ SELECT *, row_number() OVER (
+ PARTITION BY is_rd
+ ORDER BY tt_stable_rand
+ ) AS kere_nopoa_end_0
+ FROM neg_table
+ ) where kere_nopoa_end_0 < (select count(1)*{self.nav_neg_multi} from nav_table)
+ )
+ ) st
+ )
+ , all_need_event as (
+ SELECT
+ *
+ FROM (
+ -- 获取正负标签样本需要的事件
+ select
+ a5.is_rd,
+ a5.user_register_time,
+ a5.user_user_id,
+ floor(to_unixtime(b5.event_time_utc))-floor(to_unixtime(a5.user_register_time)) as rt_dur,
+ c5."afrawip__meida_source" as "afrawip__meida_source_ikfdssausercommonend",
+ c5."#account_id" as "#account_id_ikfdssausercommonend",
+ c5."#distinct_id" as "#distinct_id_ikfdssausercommonend",
+ b5.*,
+ b5.event_time_utc as "#event_time"
+ from
+ union_all a5
+ INNER JOIN
+ (
+ SELECT
+ *
+ FROM (
+ SELECT
+ -- *
+ {','.join(['"%s"'%x for x in self.need_event_attrs])},
+ {self.tran_dt_by_zone("#event_time", "#zone_offset", 0)} as event_time_utc
+ FROM
+ {gconf.tga_event_table}
+ WHERE
+ "$part_date" >= '{arrow.get(comm_cc["part_date_start"]).shift(days=-2).format("YYYY-MM-DD")}'
+ AND "$part_date" <= '{arrow.get(comm_cc["feature_date_end"]).shift(days=2).format("YYYY-MM-DD")}'
+ AND "$part_event" in ({','.join(["'%s'"%x for x in self.need_events])})
+ AND {self.event_cols1}
+ )
+ WHERE
+ event_time_utc >= timestamp '{comm_cc["part_date_start"]}'
+ AND event_time_utc <= timestamp '{comm_cc["feature_date_end"]}'
+ )
+ as b5
+ ON a5.user_user_id = b5."#user_id"
+ INNER JOIN
+ (
+ SELECT
+ "#user_id","afrawip__meida_source","#account_id","#distinct_id"
+ FROM
+ {gconf.tga_user_table}
+ )
+ as c5
+ ON a5.user_user_id = c5."#user_id"
+ ) a6
+ WHERE
+ rt_dur>=0 AND rt_dur<={gconf.level_durs[-1]} -- N小时内的事件
+ )
+ """
+ return sql
+
+ def get_sub_sql_i(self, sub_sql_f, col_name, col_des, base_event_table='all_need_event', base_user_table='a'):
+ """
+ @des: 内部调用
+ """
+ gconf = self.gconf
+ sub_field_f = "%(col)s_%(dur_i)s"
+ real_des_f = "%(des)s %(dur_i)s"
+ sub_field_names = []
+ sub_sqls = []
+ for _dur_i, level_dur in enumerate(gconf.level_durs):
+ real_des = real_des_f % {'des': col_des, 'dur_i': _dur_i}
+ sub_field_name = sub_field_f % {'col': col_name, 'dur_i': _dur_i}
+ sub_field_names.append(sub_field_name)
+ # sub_sql_f格式如下
+ # sub_sql_f = """
+ # --%(real_des)s
+ # sum(
+ # if(
+ # "$part_event"=='act_level_path' and object_type='normal' and act='win' and rt_dur<%(level_dur)s,
+ # 1,
+ # 0)
+ # ) as %(sub_field_name)s
+ # """
+ sub_sql = sub_sql_f % {
+ 'real_des': real_des,
+ 'level_dur': level_dur,
+ 'sub_field_name': sub_field_name
+ }
+ sub_sqls.append(sub_sql)
+ return sub_field_names, sub_sqls
+
+ def get_feature_sql(self):
+ """
+ @des: 提取特征值的sql
+ """
+ gconf = self.gconf
+ """
+ 构建level_durs的sql
+ """
+ base_event_table = self.base_event_table
+ base_user_table = self.base_user_table
+ base_feature_table = self.base_feature_table
+ feature_names = self.feature_names
+ comm_cc = self.get_sql_config()
+ #
+ sub_sqls = []
+ sub_field_names = []
+ for col_name, col_des in feature_names.items():
+ sub_sql_f = ""
+ if col_name == 'a':
+ continue
+ else:
+ sub_sql_f = self.sub_sql_fs.get(col_name)
+ if not sub_sql_f:
+ raise Et(2, f"error col_name: {col_name} ")
+ # 拼接结果
+ if sub_sql_f:
+ _sub_field_names, _sub_sqls = self.get_sub_sql_i(
+ sub_sql_f, col_name, col_des)
+ sub_sqls.extend(_sub_sqls)
+ sub_field_names.extend(_sub_field_names)
+
+ """
+ 构建select选项
+ """
+ # 带表名的字段 表名.field
+ base_user_table_fields = [
+ "is_rd", "user_user_id", "afrawip__meida_source_ikfdssausercommonend", "#lib", "#country_code"]
+ for addon_attr in self.addon_attrs:
+ _addon_attr = addon_attr.strip()
+ if _addon_attr not in base_user_table_fields:
+ base_user_table_fields.append(_addon_attr)
+ #
+ sub_selects = ",".join(
+ [f"{base_feature_table}.{item} as {item}_v" for item in sub_field_names])
+ sub_selects = f"""{",".join(['%s."%s"'%(base_user_table, ptf) for ptf in base_user_table_fields])},{sub_selects}"""
+ # 不带表名的字段
+ # sub_selects_unique = ",".join(
+ # [f'"{item}_v"' for item in sub_field_names])
+ # sub_selects_unique = f"""{",".join(['"%s"'%ptf for ptf in base_user_table_fields])},{sub_selects_unique}"""
+ """
+ 构建sub_sqls
+ """
+ sub_join_sqls = ",".join(sub_sqls)
+ """
+ 拼接最后的sql
+ """
+ sql = f"""
+ , user_tzz as (
+ select
+ {sub_selects}
+ from (
+ --用户属性和标签
+ select
+ {",".join(['"%s"'%ptf for ptf in base_user_table_fields])}
+ from
+ (
+ select
+ {",".join(['"%s"'%ptf for ptf in base_user_table_fields])},
+ row_number() OVER (PARTITION BY user_user_id ORDER BY "#event_time" asc) AS row_no
+ from
+ {base_event_table}
+ )
+ where
+ row_no=1
+ ) {base_user_table}
+ left join
+ ( --- 特征值计算
+ select
+ user_user_id,
+ {sub_join_sqls}
+ from
+ {base_event_table}
+ group by user_user_id
+ ) {base_feature_table}
+ on {base_user_table}.user_user_id={base_feature_table}.user_user_id
+ )
+ -- with结束没有逗号, 过滤左连接没有特征值的行,调试的时候取消where条件
+ select * from user_tzz where {self.get_first_col_name()} is not NULL
+ """
+ return sql
+
+ def get_sql(self):
+ # print("---------------------------------")
+ event_sql = self.get_event_sql()
+ feature_sql = self.get_feature_sql()
+ sql = f"""
+ {event_sql}
+ {feature_sql}
+ """
+ return sql
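For orientation, a short sketch of the date windows that get_sql_config() derives, using the GConf values from tga_test.py above; the trailing comments show the resulting values, re-derived here by hand rather than taken from a run of the code:

    import arrow

    part_date_start, part_date_end = "2023-01-15", "2023-01-17"
    level_durs = [86400*0.25, 86400*0.5, 86400, 86400*2, 86400*3]
    label_durs = [int(86400*0), int(86400*14)]

    days = (arrow.get(part_date_end) - arrow.get(part_date_start)).days        # 2
    after_feature_day = int(level_durs[-1] / 86400) + 2                         # 5 (two extra days of events)
    feature_date_end = arrow.get(part_date_end).shift(days=after_feature_day).format("YYYY-MM-DD")                 # 2023-01-22
    label_date_start = arrow.get(part_date_start).shift(days=int(label_durs[0] / 86400) - 2).format("YYYY-MM-DD")  # 2023-01-13
    label_date_end = arrow.get(part_date_end).shift(days=int(label_durs[1] / 86400) + 2).format("YYYY-MM-DD")      # 2023-02-02
    new_device_start = arrow.get(part_date_start).format("YYYY-MM-DD")          # 2023-01-15
    new_device_end = arrow.get(part_date_end).shift(days=1).format("YYYY-MM-DD")  # 2023-01-18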
{tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/train_sql_retain.py
@@ -11,7 +11,7 @@ gconf的属性有这些:
  level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
  label_durs # 取多少边界算留存,例如[int(86400*14), int(86400*17)],整型数组;这里是取14到17天发生过事件的作为正向用户
  tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
- tga_event_table # tga用户表名, 例如 "v_user_7", 字符串
+ tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
  ########
 
  如何使用(需要自己定义一个子类继承这个类, 代码如下):
{tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck/tga/train_sql_yh.py
@@ -9,9 +9,9 @@ gconf的属性有这些:
  part_date_start # 特征值取新增用户的开始日期, 例如"2022-05-20", 字符串类型
  part_date_end # 特征值取新增用户的结束日期, 例如"2022-05-25", 字符串类型
  level_durs # 特征取值段,由小到大排列, 例如 [43200, 43200*2], 这里是秒为单位,整型数组;多少时间段发生事件作为特征值
- label_durs # 取多少边界算留存,例如[int(86400*14), int(86400*17)],整型数组;这里是取14到17天发生高粘度事件的作为正向用户
+ label_durs # 取多少边界算硬核用户标签,例如[int(86400*14), int(86400*17)],整型数组;这里是取14到17天发生高粘度事件的作为正向用户
  tga_user_table # tga用户表名, 例如 "v_user_7", 字符串
- tga_event_table # tga用户表名, 例如 "v_user_7", 字符串
+ tga_event_table # tga事件表名, 例如 "v_event_7", 字符串
  ########
 
  如何使用(需要自己定义一个子类继承这个类, 代码如下):
{tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2/tfduck_bsd.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tfduck-bsd
- Version: 0.16.1
+ Version: 0.16.2
  Summary: A small example package
  Home-page: UNKNOWN
  Author: yuanxiao
{tfduck-bsd-0.16.1 → tfduck-bsd-0.16.2}/tfduck_bsd.egg-info/SOURCES.txt
@@ -26,9 +26,12 @@ tfduck/sagemaker/__init__.py
  tfduck/sagemaker/saoper.py
  tfduck/tga/__init__.py
  tfduck/tga/base_tga.py
+ tfduck/tga/predict_sql_ltv.py
  tfduck/tga/predict_sql_retain.py
  tfduck/tga/predict_sql_yh.py
  tfduck/tga/tga.py
+ tfduck/tga/tga_test.py
+ tfduck/tga/train_sql_ltv.py
  tfduck/tga/train_sql_retain.py
  tfduck/tga/train_sql_yh.py
  tfduck/thinkdata/__init__.py
tfduck-bsd-0.16.1/tfduck/__init__.py
@@ -1 +0,0 @@
- __version__="0.16.1"