PyPI - tfduck-bsd - Versions diffs - 0.16.7__tar.gz → 0.17.0__tar.gz - Mend

tfduck-bsd 0.16.7.tar.gz → 0.17.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tfduck-bsd might be problematic. Click here for more details.

Files changed (46)

{tfduck-bsd-0.16.7/tfduck_bsd.egg-info → tfduck-bsd-0.17.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tfduck-bsd
-Version: 0.16.7
+Version: 0.17.0
 Summary: A small example package
 Home-page: UNKNOWN
 Author: yuanxiao

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/setup.py RENAMED Viewed

@@ -8,7 +8,7 @@ with open("README.md", "r") as fh:
 setuptools.setup(
     name="tfduck-bsd",
-    version="0.16.7",
+    version="0.17.0",
     author="yuanxiao",
     author_email="yuan6785@163.com",
     description="A small example package",

tfduck-bsd-0.17.0/tfduck/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__="0.17.0"

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/train_sql_ltv.py RENAMED Viewed

@@ -243,6 +243,7 @@ class TrainFeatureSql(BaseTga):
                 SELECT
                     a1.user_register_time as user_register_time,
                     a1."#user_id" as user_user_id,
+                    a1."#distinct_id" as user_distinct_id,
                     b1."#user_id" as event_user_id,
                     CASE WHEN b1."yiap__itemrevenue" is NULL THEN 0 ELSE b1."yiap__itemrevenue" END yiap__itemrevenue,
                     CASE WHEN b1."sdk_ad_price" is NULL THEN 0 ELSE b1."sdk_ad_price" END sdk_ad_price,
@@ -260,11 +261,11 @@ class TrainFeatureSql(BaseTga):
                         FROM
                         (
                             SELECT
-                                b.event_time_utc as user_register_time, a."#user_id"
+                                b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
                             FROM
                             (
                                 SELECT
-                                    "#user_id"
+                                    "#user_id","#distinct_id"
                                 FROM
                                     {gconf.tga_user_table}
                                 WHERE
@@ -332,12 +333,16 @@ class TrainFeatureSql(BaseTga):
             -- 计算固定随机值，打乱顺序(废弃)--计算一个固定的采样值--现在也失效了，因为tga上云到k8s，不同节点计算，这个值也会变, 而且不同的id生成的数字可能一样，这样就不能达到目的了
             -- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
             -- 直接用user_user_id作为排序值，排序的必须是唯一的，否则下面会对不上，会产生很多null的数据
-            user_user_id as tt_stable_rand
+            -- user_user_id as tt_stable_rand  -- 这种方式最保险，但是不能乱序，这样采样的数据就不是随机分布在每天的
+            -- bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand  -- 这种方式可能会产生left null的情况，但是是少数，过滤掉就行，不影响结果，但支持乱序采样
+            a3.user_distinct_id as tt_stable_rand  -- 这种方式最保险，即是乱序也是唯一
         FROM
         (
             -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
             SELECT
                a2_1.user_user_id,
+               a2_1.user_distinct_id,
                a2_1.user_register_time,
                {
                 [
@@ -349,6 +354,7 @@ class TrainFeatureSql(BaseTga):
             FROM (
                 SELECT
                     a2.user_user_id as user_user_id,
+                    a2.user_distinct_id as user_distinct_id,
                     a2.user_register_time as user_register_time,
                     SUM(
                         CASE
@@ -368,7 +374,7 @@ class TrainFeatureSql(BaseTga):
                 (
                     new_user
                 ) a2
-                GROUP BY a2.user_user_id, a2.user_register_time
+                GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
             ) a2_1
         ) a3
     )
@@ -591,6 +597,7 @@ class TrainFeatureSql(BaseTga):
         on {base_user_table}.user_user_id={base_feature_table}.user_user_id
     )
     -- with结束没有逗号, 过滤左连接没有特征值的行，调试的时候取消where条件
+    -- 出现null过滤掉就行，少量的不管，因为tt_stable_rand的构建方法会有较小影响s
     select * from user_tzz where {self.get_first_col_name()} is not NULL
     """
         return sql

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/train_sql_retain.py RENAMED Viewed

@@ -232,6 +232,7 @@ class TrainFeatureSql(BaseTga):
                 SELECT
                     a1.user_register_time as user_register_time,
                     a1."#user_id" as user_user_id,
+                    a1."#distinct_id" as user_distinct_id,
                     b1."#user_id" as event_user_id,
                     -- b1."event_time_utc" as pd_event_time_utc,
                     floor(to_unixtime(b1.event_time_utc))-floor(to_unixtime(a1.user_register_time)) as pd_rt_dur
@@ -248,11 +249,12 @@ class TrainFeatureSql(BaseTga):
                         FROM
                         (
                             SELECT
-                                b.event_time_utc as user_register_time, a."#user_id"
+                                b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
                             FROM
                             (
                                 SELECT
-                                    "#user_id"
+                                    "#user_id",
+                                    "#distinct_id"
                                 FROM
                                     {gconf.tga_user_table}
                                 WHERE
@@ -317,12 +319,15 @@ class TrainFeatureSql(BaseTga):
             -- 计算固定随机值，打乱顺序(废弃)--计算一个固定的采样值--现在也失效了，因为tga上云到k8s，不同节点计算，这个值也会变, 而且不同的id生成的数字可能一样，这样就不能达到目的了
             -- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
             -- 直接用user_user_id作为排序值，排序的必须是唯一的，否则下面会对不上，会产生很多null的数据
-            user_user_id as tt_stable_rand
+            -- user_user_id as tt_stable_rand  -- 这种方式最保险，但是不能乱序，这样采样的数据就不是随机分布在每天的
+            -- bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand  -- 这种方式也不太行，数据会几种在某一天
+            a3.user_distinct_id as tt_stable_rand  -- 这种方式最保险，即是乱序也是唯一
         FROM
         (
             -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
             SELECT
                 a2.user_user_id as user_user_id,
+                a2.user_distinct_id as user_distinct_id,
                 a2.user_register_time as user_register_time,
                 SUM(
                     CASE
@@ -335,7 +340,7 @@ class TrainFeatureSql(BaseTga):
             (
                 new_user
             ) a2
-            GROUP BY a2.user_user_id, a2.user_register_time
+            GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
         ) a3
     )
     , nav_table as (
@@ -557,6 +562,7 @@ class TrainFeatureSql(BaseTga):
         on {base_user_table}.user_user_id={base_feature_table}.user_user_id
     )
     -- with结束没有逗号, 过滤左连接没有特征值的行，调试的时候取消where条件
+    -- 出现null过滤掉就行，少量的不管，因为tt_stable_rand的构建方法会有较小影响
     select * from user_tzz where {self.get_first_col_name()} is not NULL
     """
         return sql

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/train_sql_yh.py RENAMED Viewed

@@ -232,6 +232,7 @@ class TrainFeatureSql(BaseTga):
                 SELECT
                     a1.user_register_time as user_register_time,
                     a1."#user_id" as user_user_id,
+                    a1."#distinct_id" as user_distinct_id,
                     b1."#user_id" as event_user_id,
                     -- b1."event_time_utc" as pd_event_time_utc,
                     floor(to_unixtime(b1.event_time_utc))-floor(to_unixtime(a1.user_register_time)) as pd_rt_dur
@@ -248,11 +249,11 @@ class TrainFeatureSql(BaseTga):
                         FROM
                         (
                             SELECT
-                                b.event_time_utc as user_register_time, a."#user_id"
+                                b.event_time_utc as user_register_time, a."#user_id", a."#distinct_id"
                             FROM
                             (
                                 SELECT
-                                    "#user_id"
+                                    "#user_id","#distinct_id"
                                 FROM
                                     {gconf.tga_user_table}
                                 WHERE
@@ -321,19 +322,22 @@ class TrainFeatureSql(BaseTga):
             -- 计算固定随机值，打乱顺序(废弃)--计算一个固定的采样值--现在也失效了，因为tga上云到k8s，不同节点计算，这个值也会变, 而且不同的id生成的数字可能一样，这样就不能达到目的了
             -- (abs(from_ieee754_64(xxhash64(cast(cast(user_user_id as varchar) as varbinary)))) % 100) / 100. as tt_stable_rand
             -- 直接用user_user_id作为排序值，排序的必须是唯一的，否则下面会对不上，会产生很多null的数据
-            user_user_id as tt_stable_rand
+            -- user_user_id as tt_stable_rand  -- 这种方式最保险，但是不能乱序，这样采样的数据就不是随机分布在每天的
+            --bitwise_xor(user_user_id, 906867964886667264) as tt_stable_rand  -- 这种方式可能会产生left null的情况，但是是少数，过滤掉就行，不影响结果，但支持乱序采样
+            a3.user_distinct_id as tt_stable_rand  -- 这种方式最保险，即是乱序也是唯一
         FROM
         (
             -- 获取指定日期的注册用户 连接 这些用户在7-10天后的触发的事件 的 数量
             SELECT
                 a2.user_user_id as user_user_id,
+                a2.user_distinct_id as user_distinct_id,
                 a2.user_register_time as user_register_time,
                 SUM(CASE WHEN a2.event_user_id IS NULL THEN 0 ELSE 1 END) AS event_count
             FROM
             (
                 new_user
             ) a2
-            GROUP BY a2.user_user_id, a2.user_register_time
+            GROUP BY a2.user_user_id, a2.user_distinct_id, a2.user_register_time
         ) a3
     )
     , nav_table as (
@@ -555,6 +559,7 @@ class TrainFeatureSql(BaseTga):
         on {base_user_table}.user_user_id={base_feature_table}.user_user_id
     )
     -- with结束没有逗号, 过滤左连接没有特征值的行，调试的时候取消where条件
+    -- 出现null过滤掉就行，少量的不管，因为tt_stable_rand的构建方法会有较小影响
     select * from user_tzz where {self.get_first_col_name()} is not NULL
     """
         return sql

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0/tfduck_bsd.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tfduck-bsd
-Version: 0.16.7
+Version: 0.17.0
 Summary: A small example package
 Home-page: UNKNOWN
 Author: yuanxiao

tfduck-bsd-0.16.7/tfduck/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- __version__="0.16.7"

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/LICENSE RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/README.md RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/bin/tfduck RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/setup.cfg RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/config/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/config/bdpmanager.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/config/table_config.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/example.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/opends/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/opends/opends.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/bdp_sdk_py/opends/sdk.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/common/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/common/defines.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/common/extendEncoder.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/main.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/oss/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/oss/oss.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/pyspark_k8s/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/pyspark_k8s/k8s_manage.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/pyspark_k8s/spark_manage.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/s3/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/s3/s3oper.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/sagemaker/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/sagemaker/saoper.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/base_tga.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/predict_sql_ltv.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/predict_sql_retain.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/predict_sql_yh.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/tga.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/tga/tga_test.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/thinkdata/__init__.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck/thinkdata/query.py RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck_bsd.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck_bsd.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck_bsd.egg-info/requires.txt RENAMED Viewed

File without changes

{tfduck-bsd-0.16.7 → tfduck-bsd-0.17.0}/tfduck_bsd.egg-info/top_level.txt RENAMED Viewed

File without changes

tfduck-bsd 0.16.7__tar.gz → 0.17.0__tar.gz

Potentially problematic release.

tfduck-bsd 0.16.7.tar.gz → 0.17.0.tar.gz