tfduck-bsd 0.16.4__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tfduck-bsd might be problematic. Click here for more details.
- tfduck/__init__.py +1 -1
- tfduck/tga/base_tga.py +1 -0
- tfduck/tga/predict_sql_ltv.py +4 -2
- tfduck/tga/predict_sql_retain.py +17 -3
- tfduck/tga/predict_sql_yh.py +4 -2
- tfduck/tga/train_sql_ltv.py +17 -7
- tfduck/tga/train_sql_retain.py +16 -7
- tfduck/tga/train_sql_yh.py +15 -7
- {tfduck_bsd-0.16.4.dist-info → tfduck_bsd-0.17.0.dist-info}/METADATA +1 -1
- {tfduck_bsd-0.16.4.dist-info → tfduck_bsd-0.17.0.dist-info}/RECORD +14 -14
- {tfduck_bsd-0.16.4.data → tfduck_bsd-0.17.0.data}/scripts/tfduck +0 -0
- {tfduck_bsd-0.16.4.dist-info → tfduck_bsd-0.17.0.dist-info}/LICENSE +0 -0
- {tfduck_bsd-0.16.4.dist-info → tfduck_bsd-0.17.0.dist-info}/WHEEL +0 -0
- {tfduck_bsd-0.16.4.dist-info → tfduck_bsd-0.17.0.dist-info}/top_level.txt +0 -0
tfduck/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__="0.16.4"
|
|
1
|
+
__version__="0.17.0"
|
tfduck/tga/base_tga.py
CHANGED
tfduck/tga/predict_sql_ltv.py
CHANGED
|
@@ -61,8 +61,10 @@ def call_method(self, ctx=None, **kwargs):
|
|
|
61
61
|
) as %(sub_field_name)s
|
|
62
62
|
"""
|
|
63
63
|
}
|
|
64
|
-
#
|
|
65
|
-
addon_attrs = ["#screen_width", "#screen_height"]
|
|
64
|
+
# 额外的event表属性和预查询的用户属性base_user_table_fields里面选,比如(可选参数)
|
|
65
|
+
addon_attrs = ["#screen_width", "#screen_height"]
|
|
66
|
+
# 是否等待用户注册后经过max(gconf.level_durs)的时间才开始计算特征值(可选参数)---默认是不等待
|
|
67
|
+
wait_dur_get_feature
|
|
66
68
|
"""
|
|
67
69
|
创建调用实例---最好通过dict创建实例
|
|
68
70
|
"""
|
tfduck/tga/predict_sql_retain.py
CHANGED
|
@@ -61,8 +61,10 @@ def call_method(self, ctx=None, **kwargs):
|
|
|
61
61
|
) as %(sub_field_name)s
|
|
62
62
|
"""
|
|
63
63
|
}
|
|
64
|
-
#
|
|
65
|
-
addon_attrs = ["#screen_width", "#screen_height"]
|
|
64
|
+
# 额外的event表属性和预查询的用户属性base_user_table_fields里面选,比如(可选参数)
|
|
65
|
+
addon_attrs = ["#screen_width", "#screen_height"]
|
|
66
|
+
# 是否等待用户注册后经过max(gconf.level_durs)的时间才开始计算特征值(可选参数)---默认是不等待
|
|
67
|
+
wait_dur_get_feature
|
|
66
68
|
"""
|
|
67
69
|
创建调用实例---最好通过dict创建实例
|
|
68
70
|
注意: 针对一个tga项目属于多个项目的打点情况,例如3dfish, 可以适时启用user_cols1,user_cols2或者event_cols1
|
|
@@ -106,6 +108,7 @@ class PredictFeatureSql(BaseTga):
|
|
|
106
108
|
user_cols2='1=1',
|
|
107
109
|
event_cols1='1=1',
|
|
108
110
|
addon_attrs=[],
|
|
111
|
+
wait_dur_get_feature=False,
|
|
109
112
|
**kwargs):
|
|
110
113
|
"""
|
|
111
114
|
@des: 参数说明看上面的文档说明
|
|
@@ -135,6 +138,8 @@ class PredictFeatureSql(BaseTga):
|
|
|
135
138
|
self.user_cols2 = user_cols2
|
|
136
139
|
self.event_cols1 = event_cols1
|
|
137
140
|
self.addon_attrs = addon_attrs if isinstance(addon_attrs, list) else []
|
|
141
|
+
#
|
|
142
|
+
self.wait_dur_get_feature = wait_dur_get_feature
|
|
138
143
|
# 其他属性
|
|
139
144
|
for k, v in kwargs.items():
|
|
140
145
|
setattr(self, k, v)
|
|
@@ -223,6 +228,12 @@ class PredictFeatureSql(BaseTga):
|
|
|
223
228
|
gconf = self.gconf
|
|
224
229
|
comm_cc = self.get_sql_config()
|
|
225
230
|
#
|
|
231
|
+
if self.wait_dur_get_feature:
|
|
232
|
+
# 这里需要转为真正的utc时间和当前时间的差值, 否则会读本地时区,不准确
|
|
233
|
+
wait_dur_get_feature_sql = f"to_unixtime(current_timestamp)-to_unixtime(try_cast (concat(to_char(event_time_utc, 'yyyy-mm-dd hh24:mi:ss'),' UTC') as timestamp with time zone)) >= {max(gconf.level_durs)}"
|
|
234
|
+
else:
|
|
235
|
+
wait_dur_get_feature_sql = "1=1"
|
|
236
|
+
#
|
|
226
237
|
sql = f"""
|
|
227
238
|
-- des: add by yuanxiao for machine learn predict
|
|
228
239
|
with new_user as (
|
|
@@ -273,6 +284,7 @@ class PredictFeatureSql(BaseTga):
|
|
|
273
284
|
WHERE
|
|
274
285
|
event_time_utc >= timestamp '{comm_cc["new_device_start"]}'
|
|
275
286
|
AND event_time_utc < timestamp '{comm_cc["new_device_end"]}'
|
|
287
|
+
AND {wait_dur_get_feature_sql}
|
|
276
288
|
) b
|
|
277
289
|
ON a."#user_id" = b."#user_id"
|
|
278
290
|
)
|
|
@@ -289,9 +301,11 @@ class PredictFeatureSql(BaseTga):
|
|
|
289
301
|
a5.user_register_time,
|
|
290
302
|
a5.user_user_id,
|
|
291
303
|
floor(to_unixtime(b5.event_time_utc))-floor(to_unixtime(a5.user_register_time)) as rt_dur,
|
|
304
|
+
-- floor(to_unixtime(try_cast (concat(to_char(b5.event_time_utc, 'yyyy-mm-dd hh24:mi:ss'),' UTC') as timestamp with time zone)))-floor(to_unixtime(try_cast (concat(to_char(a5.user_register_time, 'yyyy-mm-dd hh24:mi:ss'),' UTC') as timestamp with time zone))) as rt_dur,
|
|
292
305
|
c5."afrawip__meida_source" as "afrawip__meida_source_ikfdssausercommonend",
|
|
293
306
|
c5."#account_id" as "#account_id_ikfdssausercommonend",
|
|
294
307
|
c5."#distinct_id" as "#distinct_id_ikfdssausercommonend",
|
|
308
|
+
c5."afrawip__campaign" as "afrawip__campaign_ikfdssausercommonend",
|
|
295
309
|
b5.*,
|
|
296
310
|
b5.event_time_utc as "#event_time"
|
|
297
311
|
from
|
|
@@ -322,7 +336,7 @@ class PredictFeatureSql(BaseTga):
|
|
|
322
336
|
INNER JOIN
|
|
323
337
|
(
|
|
324
338
|
SELECT
|
|
325
|
-
"#user_id","afrawip__meida_source","#account_id","#distinct_id"
|
|
339
|
+
"#user_id","afrawip__meida_source","#account_id","#distinct_id","afrawip__campaign"
|
|
326
340
|
FROM
|
|
327
341
|
{gconf.tga_user_table}
|
|
328
342
|
)
|
tfduck/tga/predict_sql_yh.py
CHANGED
|
@@ -61,8 +61,10 @@ def call_method(self, ctx=None, **kwargs):
|
|
|
61
61
|
) as %(sub_field_name)s
|
|
62
62
|
"""
|
|
63
63
|
}
|
|
64
|
-
#
|
|
65
|
-
addon_attrs = ["#screen_width", "#screen_height"]
|
|
64
|
+
# 额外的event表属性和预查询的用户属性base_user_table_fields里面选,比如(可选参数)
|
|
65
|
+
addon_attrs = ["#screen_width", "#screen_height"]
|
|
66
|
+
# 是否等待用户注册后经过max(gconf.level_durs)的时间才开始计算特征值(可选参数)---默认是不等待
|
|
67
|
+
wait_dur_get_feature
|
|
66
68
|
"""
|
|
67
69
|
创建调用实例---最好通过dict创建实例
|
|
68
70
|
"""
|
tfduck/tga/train_sql_ltv.py
CHANGED
|
@@ -62,7 +62,7 @@ def call_method(self, ctx=None, **kwargs):
|
|
|
62
62
|
) as %(sub_field_name)s
|
|
63
63
|
"""
|
|
64
64
|
}
|
|
65
|
-
#
|
|
65
|
+
# 额外的event表属性和预查询的用户属性base_user_table_fields里面选,比如(可选参数)
|
|
66
66
|
addon_attrs = ["#screen_width", "#screen_height"]
|
|
67
67
|
# line_value_rd ltv的线性值大于多少算正样本,小于多少算负样本,这样可以控制正负样本比例
|
|
68
68
|
line_value_rd = 0.5
|
|
@@ -243,6 +243,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
243
243
|
SELECT
|
|
244
244
|
a1.user_register_time as user_register_time,
|
|
245
245
|
a1."#user_id" as user_user_id,
|
|
246
|
+
a1."#distinct_id" as user_distinct_id,
|
|
246
247
|
b1."#user_id" as event_user_id,
|
|
247
248
|
CASE WHEN b1."yiap__itemrevenue" is NULL THEN 0 ELSE b1."yiap__itemrevenue" END yiap__itemrevenue,
|
|
248
249
|
CASE WHEN b1."sdk_ad_price" is NULL THEN 0 ELSE b1."sdk_ad_price" END sdk_ad_price,
|
|
@@ -260,11 +261,11 @@ class TrainFeatureSql(BaseTga):
|
|
|
260
261
|
FROM
|
|
261
262
|
(
|
|
262
263
|
SELECT
|
|
263
|
-
b.event_time_utc as user_register_time, a."#user_id"
|
|
264
|
+
b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
|
|
264
265
|
FROM
|
|
265
266
|
(
|
|
266
267
|
SELECT
|
|
267
|
-
"#user_id"
|
|
268
|
+
"#user_id","#distinct_id"
|
|
268
269
|
FROM
|
|
269
270
|
{gconf.tga_user_table}
|
|
270
271
|
WHERE
|
|
@@ -329,13 +330,19 @@ class TrainFeatureSql(BaseTga):
|
|
|
329
330
|
a3.revenue_price as is_rd, -- 将指定mode的ltv值作为标签
|
|
330
331
|
a3.user_user_id,
|
|
331
332
|
a3.user_register_time,
|
|
332
|
-
-- 计算固定随机值,打乱顺序
|
|
333
|
-
(abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
333
|
+
-- 计算固定随机值,打乱顺序(废弃)--计算一个固定的采样值--现在也失效了,因为tga上云到k8s,不同节点计算,这个值也会变, 而且不同的id生成的数字可能一样,这样就不能达到目的了
|
|
334
|
+
-- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
335
|
+
-- 直接用user_user_id作为排序值,排序的必须是唯一的,否则下面会对不上,会产生很多null的数据
|
|
336
|
+
-- user_user_id as tt_stable_rand -- 这种方式最保险,但是不能乱序,这样采样的数据就不是随机分布在每天的
|
|
337
|
+
-- bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand -- 这种方式可能会产生left null的情况,但是是少数,过滤掉就行,不影响结果,但支持乱序采样
|
|
338
|
+
a3.user_distinct_id as tt_stable_rand -- 这种方式最保险,即是乱序也是唯一
|
|
339
|
+
|
|
334
340
|
FROM
|
|
335
341
|
(
|
|
336
342
|
-- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
|
|
337
343
|
SELECT
|
|
338
344
|
a2_1.user_user_id,
|
|
345
|
+
a2_1.user_distinct_id,
|
|
339
346
|
a2_1.user_register_time,
|
|
340
347
|
{
|
|
341
348
|
[
|
|
@@ -347,6 +354,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
347
354
|
FROM (
|
|
348
355
|
SELECT
|
|
349
356
|
a2.user_user_id as user_user_id,
|
|
357
|
+
a2.user_distinct_id as user_distinct_id,
|
|
350
358
|
a2.user_register_time as user_register_time,
|
|
351
359
|
SUM(
|
|
352
360
|
CASE
|
|
@@ -366,7 +374,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
366
374
|
(
|
|
367
375
|
new_user
|
|
368
376
|
) a2
|
|
369
|
-
GROUP BY a2.user_user_id, a2.user_register_time
|
|
377
|
+
GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
|
|
370
378
|
) a2_1
|
|
371
379
|
) a3
|
|
372
380
|
)
|
|
@@ -427,6 +435,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
427
435
|
c5."afrawip__meida_source" as "afrawip__meida_source_ikfdssausercommonend",
|
|
428
436
|
c5."#account_id" as "#account_id_ikfdssausercommonend",
|
|
429
437
|
c5."#distinct_id" as "#distinct_id_ikfdssausercommonend",
|
|
438
|
+
c5."afrawip__campaign" as "afrawip__campaign_ikfdssausercommonend",
|
|
430
439
|
b5.*,
|
|
431
440
|
b5.event_time_utc as "#event_time"
|
|
432
441
|
from
|
|
@@ -457,7 +466,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
457
466
|
INNER JOIN
|
|
458
467
|
(
|
|
459
468
|
SELECT
|
|
460
|
-
"#user_id","afrawip__meida_source","#account_id","#distinct_id"
|
|
469
|
+
"#user_id","afrawip__meida_source","#account_id","#distinct_id","afrawip__campaign"
|
|
461
470
|
FROM
|
|
462
471
|
{gconf.tga_user_table}
|
|
463
472
|
)
|
|
@@ -588,6 +597,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
588
597
|
on {base_user_table}.user_user_id={base_feature_table}.user_user_id
|
|
589
598
|
)
|
|
590
599
|
-- with结束没有逗号, 过滤左连接没有特征值的行,调试的时候取消where条件
|
|
600
|
+
-- 出现null过滤掉就行,少量的不管,因为tt_stable_rand的构建方法会有较小影响s
|
|
591
601
|
select * from user_tzz where {self.get_first_col_name()} is not NULL
|
|
592
602
|
"""
|
|
593
603
|
return sql
|
tfduck/tga/train_sql_retain.py
CHANGED
|
@@ -62,7 +62,7 @@ def call_method(self, ctx=None, **kwargs):
|
|
|
62
62
|
) as %(sub_field_name)s
|
|
63
63
|
"""
|
|
64
64
|
}
|
|
65
|
-
#
|
|
65
|
+
# 额外的event表属性和预查询的用户属性base_user_table_fields里面选,比如(可选参数)
|
|
66
66
|
addon_attrs = ["#screen_width", "#screen_height"]
|
|
67
67
|
"""
|
|
68
68
|
创建调用实例---最好通过dict创建实例
|
|
@@ -232,6 +232,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
232
232
|
SELECT
|
|
233
233
|
a1.user_register_time as user_register_time,
|
|
234
234
|
a1."#user_id" as user_user_id,
|
|
235
|
+
a1."#distinct_id" as user_distinct_id,
|
|
235
236
|
b1."#user_id" as event_user_id,
|
|
236
237
|
-- b1."event_time_utc" as pd_event_time_utc,
|
|
237
238
|
floor(to_unixtime(b1.event_time_utc))-floor(to_unixtime(a1.user_register_time)) as pd_rt_dur
|
|
@@ -248,11 +249,12 @@ class TrainFeatureSql(BaseTga):
|
|
|
248
249
|
FROM
|
|
249
250
|
(
|
|
250
251
|
SELECT
|
|
251
|
-
b.event_time_utc as user_register_time, a."#user_id"
|
|
252
|
+
b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
|
|
252
253
|
FROM
|
|
253
254
|
(
|
|
254
255
|
SELECT
|
|
255
|
-
"#user_id"
|
|
256
|
+
"#user_id",
|
|
257
|
+
"#distinct_id"
|
|
256
258
|
FROM
|
|
257
259
|
{gconf.tga_user_table}
|
|
258
260
|
WHERE
|
|
@@ -314,13 +316,18 @@ class TrainFeatureSql(BaseTga):
|
|
|
314
316
|
CASE WHEN a3.event_count>0 THEN '1' ELSE '0' END as is_rd,
|
|
315
317
|
a3.user_user_id,
|
|
316
318
|
a3.user_register_time,
|
|
317
|
-
-- 计算固定随机值,打乱顺序
|
|
318
|
-
(abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
319
|
+
-- 计算固定随机值,打乱顺序(废弃)--计算一个固定的采样值--现在也失效了,因为tga上云到k8s,不同节点计算,这个值也会变, 而且不同的id生成的数字可能一样,这样就不能达到目的了
|
|
320
|
+
-- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
321
|
+
-- 直接用user_user_id作为排序值,排序的必须是唯一的,否则下面会对不上,会产生很多null的数据
|
|
322
|
+
-- user_user_id as tt_stable_rand -- 这种方式最保险,但是不能乱序,这样采样的数据就不是随机分布在每天的
|
|
323
|
+
-- bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand -- 这种方式也不太行,数据会几种在某一天
|
|
324
|
+
a3.user_distinct_id as tt_stable_rand -- 这种方式最保险,即是乱序也是唯一
|
|
319
325
|
FROM
|
|
320
326
|
(
|
|
321
327
|
-- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
|
|
322
328
|
SELECT
|
|
323
329
|
a2.user_user_id as user_user_id,
|
|
330
|
+
a2.user_distinct_id as user_distinct_id,
|
|
324
331
|
a2.user_register_time as user_register_time,
|
|
325
332
|
SUM(
|
|
326
333
|
CASE
|
|
@@ -333,7 +340,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
333
340
|
(
|
|
334
341
|
new_user
|
|
335
342
|
) a2
|
|
336
|
-
GROUP BY a2.user_user_id, a2.user_register_time
|
|
343
|
+
GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
|
|
337
344
|
) a3
|
|
338
345
|
)
|
|
339
346
|
, nav_table as (
|
|
@@ -393,6 +400,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
393
400
|
c5."afrawip__meida_source" as "afrawip__meida_source_ikfdssausercommonend",
|
|
394
401
|
c5."#account_id" as "#account_id_ikfdssausercommonend",
|
|
395
402
|
c5."#distinct_id" as "#distinct_id_ikfdssausercommonend",
|
|
403
|
+
c5."afrawip__campaign" as "afrawip__campaign_ikfdssausercommonend",
|
|
396
404
|
b5.*,
|
|
397
405
|
b5.event_time_utc as "#event_time"
|
|
398
406
|
from
|
|
@@ -423,7 +431,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
423
431
|
INNER JOIN
|
|
424
432
|
(
|
|
425
433
|
SELECT
|
|
426
|
-
"#user_id","afrawip__meida_source","#account_id","#distinct_id"
|
|
434
|
+
"#user_id","afrawip__meida_source","#account_id","#distinct_id","afrawip__campaign"
|
|
427
435
|
FROM
|
|
428
436
|
{gconf.tga_user_table}
|
|
429
437
|
)
|
|
@@ -554,6 +562,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
554
562
|
on {base_user_table}.user_user_id={base_feature_table}.user_user_id
|
|
555
563
|
)
|
|
556
564
|
-- with结束没有逗号, 过滤左连接没有特征值的行,调试的时候取消where条件
|
|
565
|
+
-- 出现null过滤掉就行,少量的不管,因为tt_stable_rand的构建方法会有较小影响
|
|
557
566
|
select * from user_tzz where {self.get_first_col_name()} is not NULL
|
|
558
567
|
"""
|
|
559
568
|
return sql
|
tfduck/tga/train_sql_yh.py
CHANGED
|
@@ -62,7 +62,7 @@ def call_method(self, ctx=None, **kwargs):
|
|
|
62
62
|
) as %(sub_field_name)s
|
|
63
63
|
"""
|
|
64
64
|
}
|
|
65
|
-
#
|
|
65
|
+
# 额外的event表属性和预查询的用户属性base_user_table_fields里面选,比如(可选参数)
|
|
66
66
|
addon_attrs = ["#screen_width", "#screen_height"]
|
|
67
67
|
"""
|
|
68
68
|
创建调用实例---最好通过dict创建实例
|
|
@@ -232,6 +232,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
232
232
|
SELECT
|
|
233
233
|
a1.user_register_time as user_register_time,
|
|
234
234
|
a1."#user_id" as user_user_id,
|
|
235
|
+
a1."#distinct_id" as user_distinct_id,
|
|
235
236
|
b1."#user_id" as event_user_id,
|
|
236
237
|
-- b1."event_time_utc" as pd_event_time_utc,
|
|
237
238
|
floor(to_unixtime(b1.event_time_utc))-floor(to_unixtime(a1.user_register_time)) as pd_rt_dur
|
|
@@ -248,11 +249,11 @@ class TrainFeatureSql(BaseTga):
|
|
|
248
249
|
FROM
|
|
249
250
|
(
|
|
250
251
|
SELECT
|
|
251
|
-
b.event_time_utc as user_register_time, a."#user_id"
|
|
252
|
+
b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
|
|
252
253
|
FROM
|
|
253
254
|
(
|
|
254
255
|
SELECT
|
|
255
|
-
"#user_id"
|
|
256
|
+
"#user_id","#distinct_id"
|
|
256
257
|
FROM
|
|
257
258
|
{gconf.tga_user_table}
|
|
258
259
|
WHERE
|
|
@@ -318,20 +319,25 @@ class TrainFeatureSql(BaseTga):
|
|
|
318
319
|
CASE WHEN a3.event_count>0 THEN '1' ELSE '0' END as is_rd,
|
|
319
320
|
a3.user_user_id,
|
|
320
321
|
a3.user_register_time,
|
|
321
|
-
-- 计算固定随机值,打乱顺序
|
|
322
|
-
(abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
322
|
+
-- 计算固定随机值,打乱顺序(废弃)--计算一个固定的采样值--现在也失效了,因为tga上云到k8s,不同节点计算,这个值也会变, 而且不同的id生成的数字可能一样,这样就不能达到目的了
|
|
323
|
+
-- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
324
|
+
-- 直接用user_user_id作为排序值,排序的必须是唯一的,否则下面会对不上,会产生很多null的数据
|
|
325
|
+
-- user_user_id as tt_stable_rand -- 这种方式最保险,但是不能乱序,这样采样的数据就不是随机分布在每天的
|
|
326
|
+
--bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand -- 这种方式可能会产生left null的情况,但是是少数,过滤掉就行,不影响结果,但支持乱序采样
|
|
327
|
+
a3.user_distinct_id as tt_stable_rand -- 这种方式最保险,即是乱序也是唯一
|
|
323
328
|
FROM
|
|
324
329
|
(
|
|
325
330
|
-- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
|
|
326
331
|
SELECT
|
|
327
332
|
a2.user_user_id as user_user_id,
|
|
333
|
+
a2.user_distinct_id as user_distinct_id,
|
|
328
334
|
a2.user_register_time as user_register_time,
|
|
329
335
|
SUM(CASE WHEN a2.event_user_id IS NULL THEN 0 ELSE 1 END) AS event_count
|
|
330
336
|
FROM
|
|
331
337
|
(
|
|
332
338
|
new_user
|
|
333
339
|
) a2
|
|
334
|
-
GROUP BY a2.user_user_id, a2.user_register_time
|
|
340
|
+
GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
|
|
335
341
|
) a3
|
|
336
342
|
)
|
|
337
343
|
, nav_table as (
|
|
@@ -391,6 +397,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
391
397
|
c5."afrawip__meida_source" as "afrawip__meida_source_ikfdssausercommonend",
|
|
392
398
|
c5."#account_id" as "#account_id_ikfdssausercommonend",
|
|
393
399
|
c5."#distinct_id" as "#distinct_id_ikfdssausercommonend",
|
|
400
|
+
c5."afrawip__campaign" as "afrawip__campaign_ikfdssausercommonend",
|
|
394
401
|
b5.*,
|
|
395
402
|
b5.event_time_utc as "#event_time"
|
|
396
403
|
from
|
|
@@ -421,7 +428,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
421
428
|
INNER JOIN
|
|
422
429
|
(
|
|
423
430
|
SELECT
|
|
424
|
-
"#user_id","afrawip__meida_source","#account_id","#distinct_id"
|
|
431
|
+
"#user_id","afrawip__meida_source","#account_id","#distinct_id","afrawip__campaign"
|
|
425
432
|
FROM
|
|
426
433
|
{gconf.tga_user_table}
|
|
427
434
|
)
|
|
@@ -552,6 +559,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
552
559
|
on {base_user_table}.user_user_id={base_feature_table}.user_user_id
|
|
553
560
|
)
|
|
554
561
|
-- with结束没有逗号, 过滤左连接没有特征值的行,调试的时候取消where条件
|
|
562
|
+
-- 出现null过滤掉就行,少量的不管,因为tt_stable_rand的构建方法会有较小影响
|
|
555
563
|
select * from user_tzz where {self.get_first_col_name()} is not NULL
|
|
556
564
|
"""
|
|
557
565
|
return sql
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
tfduck/__init__.py,sha256=
|
|
1
|
+
tfduck/__init__.py,sha256=UOdb1TyDAAWyMDClpACUUPZRxaht77qoX9AuhbBhsAs,20
|
|
2
2
|
tfduck/main.py,sha256=zNTC16wkwGJ0QX1-i8vzlGophOxmFuO4SLsF1tkjsbE,14670
|
|
3
3
|
tfduck/bdp_sdk_py/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
tfduck/bdp_sdk_py/example.py,sha256=Xq1_gcSyu0zho_iMmMYfYgMblcgF8a-GwRBWPTw0FuU,2879
|
|
@@ -21,20 +21,20 @@ tfduck/s3/s3oper.py,sha256=U1kYqNlVUX6SqTLnVRGqYbP0uRgWrW16N8pruQsxhW0,17761
|
|
|
21
21
|
tfduck/sagemaker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
tfduck/sagemaker/saoper.py,sha256=oVdxoUeXrM4sGXrTg8F-ZXG0lF6VXfjZ9gW3Q7ubjkU,11991
|
|
23
23
|
tfduck/tga/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
tfduck/tga/base_tga.py,sha256=
|
|
25
|
-
tfduck/tga/predict_sql_ltv.py,sha256=
|
|
26
|
-
tfduck/tga/predict_sql_retain.py,sha256=
|
|
27
|
-
tfduck/tga/predict_sql_yh.py,sha256=
|
|
24
|
+
tfduck/tga/base_tga.py,sha256=rg1BHaIKGSwUVbOlWSA3Y1arB8J_caemjTjH9jh8GYM,2184
|
|
25
|
+
tfduck/tga/predict_sql_ltv.py,sha256=25rpOZdHyMcEU3O8u67oUpLsTiZburEPsvXR38hTUJ0,3589
|
|
26
|
+
tfduck/tga/predict_sql_retain.py,sha256=Nsl0lSZ_CC8j_GB4jLrJgkoqqDRAAPYwzo7vAOYZ764,19772
|
|
27
|
+
tfduck/tga/predict_sql_yh.py,sha256=uYeuCZX2btxO-pvjrhlmuG35PquJbLvVtQEuEZHu8Cs,3588
|
|
28
28
|
tfduck/tga/tga.py,sha256=bAW_RFV0Xclgol-MdP9NOWHwNeGLaVSVLqE9Ji-f_Hw,12496
|
|
29
29
|
tfduck/tga/tga_test.py,sha256=A3n2LdvgQWlkX6E54K6cnsIUeYrgHlzG3QYbA7ZKgHk,2750
|
|
30
|
-
tfduck/tga/train_sql_ltv.py,sha256=
|
|
31
|
-
tfduck/tga/train_sql_retain.py,sha256=
|
|
32
|
-
tfduck/tga/train_sql_yh.py,sha256=
|
|
30
|
+
tfduck/tga/train_sql_ltv.py,sha256=iJwKzc_u5o_KZ5ZyFsHyQN0cUnPAYX96NXuwPCtK27A,26563
|
|
31
|
+
tfduck/tga/train_sql_retain.py,sha256=q385ZRzg9cymWqd_7qi-c5vX87mzL-pKC5vYyfCxS98,24326
|
|
32
|
+
tfduck/tga/train_sql_yh.py,sha256=IJNr7U__dOqsPNpF0Y47_EdW8D3ScTZ_YR8cde-VViM,24367
|
|
33
33
|
tfduck/thinkdata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
34
|
tfduck/thinkdata/query.py,sha256=DsfcxjZrc0ZFTwN2pI5fKdM1Bwr6ageoPcA2MP3r2bE,1314
|
|
35
|
-
tfduck_bsd-0.16.4.data/scripts/tfduck,sha256=UsuoAs4peJW4I-e6Gn91gEToP_YyuUp-rUUg3ObKneY,192
|
|
36
|
-
tfduck_bsd-0.16.4.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
|
|
37
|
-
tfduck_bsd-0.
|
|
38
|
-
tfduck_bsd-0.16.4.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
|
|
39
|
-
tfduck_bsd-0.16.4.dist-info/top_level.txt,sha256=503etRkoyeI1VYcAwe5KpD5Bamhx0R0y2ofkE8HpRDA,7
|
|
40
|
-
tfduck_bsd-0.16.4.dist-info/RECORD,,
|
|
35
|
+
tfduck_bsd-0.17.0.data/scripts/tfduck,sha256=UsuoAs4peJW4I-e6Gn91gEToP_YyuUp-rUUg3ObKneY,192
|
|
36
|
+
tfduck_bsd-0.17.0.dist-info/LICENSE,sha256=2bm9uFabQZ3Ykb_SaSU_uUbAj2-htc6WJQmS_65qD00,1073
|
|
37
|
+
tfduck_bsd-0.17.0.dist-info/METADATA,sha256=P6YmdRS7t2nyfnzlei7Mmq9SNfBLHCfFA1Xpb794spU,971
|
|
38
|
+
tfduck_bsd-0.17.0.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
|
|
39
|
+
tfduck_bsd-0.17.0.dist-info/top_level.txt,sha256=503etRkoyeI1VYcAwe5KpD5Bamhx0R0y2ofkE8HpRDA,7
|
|
40
|
+
tfduck_bsd-0.17.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|