tfduck-bsd 0.16.7__tar.gz → 0.17.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tfduck-bsd might be problematic. Click here for more details.
- {tfduck-bsd-0.16.7/tfduck_bsd.egg-info → tfduck-bsd-0.17.0}/PKG-INFO +1 -1
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/setup.py +1 -1
- tfduck-bsd-0.17.0/tfduck/__init__.py +1 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/train_sql_ltv.py +11 -4
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/train_sql_retain.py +10 -4
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/train_sql_yh.py +9 -4
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0/tfduck_bsd.egg-info}/PKG-INFO +1 -1
- tfduck-bsd-0.16.7/tfduck/__init__.py +0 -1
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/LICENSE +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/README.md +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/bin/tfduck +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/setup.cfg +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/config/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/config/bdpmanager.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/config/table_config.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/example.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/opends/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/opends/opends.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/opends/sdk.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/common/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/common/defines.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/common/extendEncoder.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/main.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/oss/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/oss/oss.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/pyspark_k8s/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/pyspark_k8s/k8s_manage.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/pyspark_k8s/spark_manage.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/s3/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/s3/s3oper.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/sagemaker/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/sagemaker/saoper.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/base_tga.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/predict_sql_ltv.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/predict_sql_retain.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/predict_sql_yh.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/tga.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/tga_test.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/thinkdata/__init__.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/thinkdata/query.py +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck_bsd.egg-info/SOURCES.txt +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck_bsd.egg-info/dependency_links.txt +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck_bsd.egg-info/requires.txt +0 -0
- {tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck_bsd.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__="0.17.0"
|
|
@@ -243,6 +243,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
243
243
|
SELECT
|
|
244
244
|
a1.user_register_time as user_register_time,
|
|
245
245
|
a1."#user_id" as user_user_id,
|
|
246
|
+
a1."#distinct_id" as user_distinct_id,
|
|
246
247
|
b1."#user_id" as event_user_id,
|
|
247
248
|
CASE WHEN b1."yiap__itemrevenue" is NULL THEN 0 ELSE b1."yiap__itemrevenue" END yiap__itemrevenue,
|
|
248
249
|
CASE WHEN b1."sdk_ad_price" is NULL THEN 0 ELSE b1."sdk_ad_price" END sdk_ad_price,
|
|
@@ -260,11 +261,11 @@ class TrainFeatureSql(BaseTga):
|
|
|
260
261
|
FROM
|
|
261
262
|
(
|
|
262
263
|
SELECT
|
|
263
|
-
b.event_time_utc as user_register_time, a."#user_id"
|
|
264
|
+
b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
|
|
264
265
|
FROM
|
|
265
266
|
(
|
|
266
267
|
SELECT
|
|
267
|
-
"#user_id"
|
|
268
|
+
"#user_id","#distinct_id"
|
|
268
269
|
FROM
|
|
269
270
|
{gconf.tga_user_table}
|
|
270
271
|
WHERE
|
|
@@ -332,12 +333,16 @@ class TrainFeatureSql(BaseTga):
|
|
|
332
333
|
-- 计算固定随机值,打乱顺序(废弃)--计算一个固定的采样值--现在也失效了,因为tga上云到k8s,不同节点计算,这个值也会变, 而且不同的id生成的数字可能一样,这样就不能达到目的了
|
|
333
334
|
-- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
334
335
|
-- 直接用user_user_id作为排序值,排序的必须是唯一的,否则下面会对不上,会产生很多null的数据
|
|
335
|
-
user_user_id as tt_stable_rand
|
|
336
|
+
-- user_user_id as tt_stable_rand -- 这种方式最保险,但是不能乱序,这样采样的数据就不是随机分布在每天的
|
|
337
|
+
-- bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand -- 这种方式可能会产生left null的情况,但是是少数,过滤掉就行,不影响结果,但支持乱序采样
|
|
338
|
+
a3.user_distinct_id as tt_stable_rand -- 这种方式最保险,既是乱序也是唯一
|
|
339
|
+
|
|
336
340
|
FROM
|
|
337
341
|
(
|
|
338
342
|
-- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
|
|
339
343
|
SELECT
|
|
340
344
|
a2_1.user_user_id,
|
|
345
|
+
a2_1.user_distinct_id,
|
|
341
346
|
a2_1.user_register_time,
|
|
342
347
|
{
|
|
343
348
|
[
|
|
@@ -349,6 +354,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
349
354
|
FROM (
|
|
350
355
|
SELECT
|
|
351
356
|
a2.user_user_id as user_user_id,
|
|
357
|
+
a2.user_distinct_id as user_distinct_id,
|
|
352
358
|
a2.user_register_time as user_register_time,
|
|
353
359
|
SUM(
|
|
354
360
|
CASE
|
|
@@ -368,7 +374,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
368
374
|
(
|
|
369
375
|
new_user
|
|
370
376
|
) a2
|
|
371
|
-
GROUP BY a2.user_user_id, a2.user_register_time
|
|
377
|
+
GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
|
|
372
378
|
) a2_1
|
|
373
379
|
) a3
|
|
374
380
|
)
|
|
@@ -591,6 +597,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
591
597
|
on {base_user_table}.user_user_id={base_feature_table}.user_user_id
|
|
592
598
|
)
|
|
593
599
|
-- with结束没有逗号, 过滤左连接没有特征值的行,调试的时候取消where条件
|
|
600
|
+
-- 出现null过滤掉就行,少量的不管,因为tt_stable_rand的构建方法会有较小影响
|
|
594
601
|
select * from user_tzz where {self.get_first_col_name()} is not NULL
|
|
595
602
|
"""
|
|
596
603
|
return sql
|
|
@@ -232,6 +232,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
232
232
|
SELECT
|
|
233
233
|
a1.user_register_time as user_register_time,
|
|
234
234
|
a1."#user_id" as user_user_id,
|
|
235
|
+
a1."#distinct_id" as user_distinct_id,
|
|
235
236
|
b1."#user_id" as event_user_id,
|
|
236
237
|
-- b1."event_time_utc" as pd_event_time_utc,
|
|
237
238
|
floor(to_unixtime(b1.event_time_utc))-floor(to_unixtime(a1.user_register_time)) as pd_rt_dur
|
|
@@ -248,11 +249,12 @@ class TrainFeatureSql(BaseTga):
|
|
|
248
249
|
FROM
|
|
249
250
|
(
|
|
250
251
|
SELECT
|
|
251
|
-
b.event_time_utc as user_register_time, a."#user_id"
|
|
252
|
+
b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
|
|
252
253
|
FROM
|
|
253
254
|
(
|
|
254
255
|
SELECT
|
|
255
|
-
"#user_id"
|
|
256
|
+
"#user_id",
|
|
257
|
+
"#distinct_id"
|
|
256
258
|
FROM
|
|
257
259
|
{gconf.tga_user_table}
|
|
258
260
|
WHERE
|
|
@@ -317,12 +319,15 @@ class TrainFeatureSql(BaseTga):
|
|
|
317
319
|
-- 计算固定随机值,打乱顺序(废弃)--计算一个固定的采样值--现在也失效了,因为tga上云到k8s,不同节点计算,这个值也会变, 而且不同的id生成的数字可能一样,这样就不能达到目的了
|
|
318
320
|
-- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
319
321
|
-- 直接用user_user_id作为排序值,排序的必须是唯一的,否则下面会对不上,会产生很多null的数据
|
|
320
|
-
user_user_id as tt_stable_rand
|
|
322
|
+
-- user_user_id as tt_stable_rand -- 这种方式最保险,但是不能乱序,这样采样的数据就不是随机分布在每天的
|
|
323
|
+
-- bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand -- 这种方式也不太行,数据会集中在某一天
|
|
324
|
+
a3.user_distinct_id as tt_stable_rand -- 这种方式最保险,既是乱序也是唯一
|
|
321
325
|
FROM
|
|
322
326
|
(
|
|
323
327
|
-- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
|
|
324
328
|
SELECT
|
|
325
329
|
a2.user_user_id as user_user_id,
|
|
330
|
+
a2.user_distinct_id as user_distinct_id,
|
|
326
331
|
a2.user_register_time as user_register_time,
|
|
327
332
|
SUM(
|
|
328
333
|
CASE
|
|
@@ -335,7 +340,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
335
340
|
(
|
|
336
341
|
new_user
|
|
337
342
|
) a2
|
|
338
|
-
GROUP BY a2.user_user_id, a2.user_register_time
|
|
343
|
+
GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
|
|
339
344
|
) a3
|
|
340
345
|
)
|
|
341
346
|
, nav_table as (
|
|
@@ -557,6 +562,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
557
562
|
on {base_user_table}.user_user_id={base_feature_table}.user_user_id
|
|
558
563
|
)
|
|
559
564
|
-- with结束没有逗号, 过滤左连接没有特征值的行,调试的时候取消where条件
|
|
565
|
+
-- 出现null过滤掉就行,少量的不管,因为tt_stable_rand的构建方法会有较小影响
|
|
560
566
|
select * from user_tzz where {self.get_first_col_name()} is not NULL
|
|
561
567
|
"""
|
|
562
568
|
return sql
|
|
@@ -232,6 +232,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
232
232
|
SELECT
|
|
233
233
|
a1.user_register_time as user_register_time,
|
|
234
234
|
a1."#user_id" as user_user_id,
|
|
235
|
+
a1."#distinct_id" as user_distinct_id,
|
|
235
236
|
b1."#user_id" as event_user_id,
|
|
236
237
|
-- b1."event_time_utc" as pd_event_time_utc,
|
|
237
238
|
floor(to_unixtime(b1.event_time_utc))-floor(to_unixtime(a1.user_register_time)) as pd_rt_dur
|
|
@@ -248,11 +249,11 @@ class TrainFeatureSql(BaseTga):
|
|
|
248
249
|
FROM
|
|
249
250
|
(
|
|
250
251
|
SELECT
|
|
251
|
-
b.event_time_utc as user_register_time, a."#user_id"
|
|
252
|
+
b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
|
|
252
253
|
FROM
|
|
253
254
|
(
|
|
254
255
|
SELECT
|
|
255
|
-
"#user_id"
|
|
256
|
+
"#user_id","#distinct_id"
|
|
256
257
|
FROM
|
|
257
258
|
{gconf.tga_user_table}
|
|
258
259
|
WHERE
|
|
@@ -321,19 +322,22 @@ class TrainFeatureSql(BaseTga):
|
|
|
321
322
|
-- 计算固定随机值,打乱顺序(废弃)--计算一个固定的采样值--现在也失效了,因为tga上云到k8s,不同节点计算,这个值也会变, 而且不同的id生成的数字可能一样,这样就不能达到目的了
|
|
322
323
|
-- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
|
|
323
324
|
-- 直接用user_user_id作为排序值,排序的必须是唯一的,否则下面会对不上,会产生很多null的数据
|
|
324
|
-
user_user_id as tt_stable_rand
|
|
325
|
+
-- user_user_id as tt_stable_rand -- 这种方式最保险,但是不能乱序,这样采样的数据就不是随机分布在每天的
|
|
326
|
+
--bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand -- 这种方式可能会产生left null的情况,但是是少数,过滤掉就行,不影响结果,但支持乱序采样
|
|
327
|
+
a3.user_distinct_id as tt_stable_rand -- 这种方式最保险,既是乱序也是唯一
|
|
325
328
|
FROM
|
|
326
329
|
(
|
|
327
330
|
-- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
|
|
328
331
|
SELECT
|
|
329
332
|
a2.user_user_id as user_user_id,
|
|
333
|
+
a2.user_distinct_id as user_distinct_id,
|
|
330
334
|
a2.user_register_time as user_register_time,
|
|
331
335
|
SUM(CASE WHEN a2.event_user_id IS NULL THEN 0 ELSE 1 END) AS event_count
|
|
332
336
|
FROM
|
|
333
337
|
(
|
|
334
338
|
new_user
|
|
335
339
|
) a2
|
|
336
|
-
GROUP BY a2.user_user_id, a2.user_register_time
|
|
340
|
+
GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
|
|
337
341
|
) a3
|
|
338
342
|
)
|
|
339
343
|
, nav_table as (
|
|
@@ -555,6 +559,7 @@ class TrainFeatureSql(BaseTga):
|
|
|
555
559
|
on {base_user_table}.user_user_id={base_feature_table}.user_user_id
|
|
556
560
|
)
|
|
557
561
|
-- with结束没有逗号, 过滤左连接没有特征值的行,调试的时候取消where条件
|
|
562
|
+
-- 出现null过滤掉就行,少量的不管,因为tt_stable_rand的构建方法会有较小影响
|
|
558
563
|
select * from user_tzz where {self.get_first_col_name()} is not NULL
|
|
559
564
|
"""
|
|
560
565
|
return sql
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__="0.16.7"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|