mdbq 2.2.2__tar.gz → 2.2.4__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- {mdbq-2.2.2 → mdbq-2.2.4}/PKG-INFO +1 -1
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/aggregation/query_data.py +42 -3
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq.egg-info/PKG-INFO +1 -1
- {mdbq-2.2.2 → mdbq-2.2.4}/setup.py +1 -1
- {mdbq-2.2.2 → mdbq-2.2.4}/README.txt +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/__version__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/aggregation/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/aggregation/aggregation.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/aggregation/df_types.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/aggregation/mysql_types.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/aggregation/optimize_data.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/bdup/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/bdup/bdup.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/clean/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/clean/data_clean.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/company/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/company/copysh.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/company/home_sh.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/config/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/config/get_myconf.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/config/products.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/config/set_support.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/config/update_conf.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/dataframe/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/dataframe/converter.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/log/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/log/mylogger.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/mongo/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/mongo/mongo.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/mysql/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/mysql/mysql.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/mysql/s_query.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/mysql/year_month_day.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/other/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/other/porxy.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/other/pov_city.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/other/sku_picture.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/other/ua_sj.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/pbix/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/pbix/pbix_refresh.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/pbix/refresh_all.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/pbix/refresh_all_old.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq/spider/__init__.py +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq.egg-info/SOURCES.txt +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq.egg-info/dependency_links.txt +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/mdbq.egg-info/top_level.txt +0 -0
- {mdbq-2.2.2 → mdbq-2.2.4}/setup.cfg +0 -0
@@ -1,4 +1,6 @@
|
|
1
1
|
# -*- coding: UTF-8 –*-
|
2
|
+
import re
|
3
|
+
|
2
4
|
from mdbq.mongo import mongo
|
3
5
|
from mdbq.mysql import mysql
|
4
6
|
from mdbq.mysql import s_query
|
@@ -14,8 +16,6 @@ import getpass
|
|
14
16
|
import json
|
15
17
|
import os
|
16
18
|
|
17
|
-
from sqlalchemy.event import remove
|
18
|
-
|
19
19
|
"""
|
20
20
|
程序用于下载数据库(调用 s_query.py 下载并清洗), 并对数据进行聚合清洗, 不会更新数据库信息;
|
21
21
|
|
@@ -862,7 +862,47 @@ class GroupBy:
|
|
862
862
|
}
|
863
863
|
)
|
864
864
|
df.insert(loc=1, column='推广渠道', value='万相台无界版') # df中插入新列
|
865
|
+
# 1. 匹配 L后面接 2 个或以上数字,不区分大小写,示例:L345
|
866
|
+
# 2. 其余情况,L 后面接多个数字的都会被第一条 if 命中,不区分大小写
|
867
|
+
df['消费力层级'] = df.apply(
|
868
|
+
lambda x:
|
869
|
+
''.join(re.findall(r'(l\d+)', x['人群名字'].upper(), re.IGNORECASE)) if re.findall(r'(l\d{2,})', x['人群名字'], re.IGNORECASE)
|
870
|
+
else 'L5' if re.findall(r'(l\d*5)', x['人群名字'], re.IGNORECASE)
|
871
|
+
else 'L4' if re.findall(r'(l\d*4)', x['人群名字'], re.IGNORECASE)
|
872
|
+
else 'L3' if re.findall(r'(l\d*3)', x['人群名字'], re.IGNORECASE)
|
873
|
+
else 'L2' if re.findall(r'(l\d*2)', x['人群名字'], re.IGNORECASE)
|
874
|
+
else 'L1' if re.findall(r'(l\d*1)', x['人群名字'], re.IGNORECASE)
|
875
|
+
else '', axis=1)
|
876
|
+
# 1. 匹配连续的 4 个数字且后面不能接数字或"元"或汉字,筛掉的人群示例:月均消费6000元|受众20240729175213|xxx2024真皮公文包
|
877
|
+
# 2. 匹配 2数字_2数字且前面不能是数字,合法匹配:人群_30_50_促; 非法示例:L345_3040 避免识别出 35~20 岁用户的情况
|
878
|
+
# pattern = r'(\d{4})(?!\d|[\u4e00-\u9fa5])' # 匹配 4 个数字,后面不能接数字或汉字
|
879
|
+
# pattern = r'(?<![\d\u4e00-\u9fa5])(\d{4})' # 匹配前面不是数字或汉字的 4 个连续数字
|
880
|
+
|
881
|
+
# 匹配 4 个数字,前面和后面都不能是数字或汉字
|
882
|
+
pattern1 = r'(?<![\d\u4e00-\u9fa5])(\d{4})(?!\d|[\u4e00-\u9fa5])'
|
883
|
+
# 匹配指定字符,前面不能是数字或 l 或 L 开头
|
884
|
+
pattern2 = r'(?<![\dlL])(\d{2}_\d{2})'
|
885
|
+
df['用户年龄'] = df.apply(
|
886
|
+
lambda x:
|
887
|
+
''.join(re.findall(pattern1, x['人群名字'].upper())) if re.findall(pattern1, x['人群名字'])
|
888
|
+
# else ''.join(re.findall(r'[^\d|l|L](\d{2}_\d{2})', x['人群名字'].upper())) if re.findall(r'[^\d|l|L](\d{2}_\d{2})', x['人群名字'])
|
889
|
+
else ''.join(re.findall(pattern2, x['人群名字'].upper())) if re.findall(pattern2, x['人群名字'])
|
890
|
+
else ''.join(re.findall(r'(\d{2}-\d{2})岁', x['人群名字'].upper())) if re.findall(r'(\d{2}-\d{2})岁', x['人群名字'])
|
891
|
+
else '', axis=1)
|
892
|
+
df['用户年龄'] = df['用户年龄'].apply(
|
893
|
+
lambda x: f'{x[:2]}~{x[2:4]}' if str(x).isdigit()
|
894
|
+
else str(x).replace('_', '~') if '_' in x
|
895
|
+
else str(x).replace('-', '~') if '-' in x
|
896
|
+
else x
|
897
|
+
)
|
898
|
+
# 年龄层不能是 0 开头
|
899
|
+
df['用户年龄'] = df['用户年龄'].apply(
|
900
|
+
lambda x: '' if str(x).startswith('0') else x)
|
901
|
+
# df = df.head(1000)
|
902
|
+
# df.to_csv('/Users/xigua/Downloads/test.csv', index=False, header=True, encoding='utf-8_sig')
|
903
|
+
# breakpoint()
|
865
904
|
return df
|
905
|
+
|
866
906
|
elif '天猫_关键词报表' in table_name:
|
867
907
|
df.rename(columns={
|
868
908
|
'场景名字': '营销场景',
|
@@ -1774,4 +1814,3 @@ if __name__ == '__main__':
|
|
1774
1814
|
data_aggregation(service_databases=[{'company': 'mysql'}], months=1) # 正常的聚合所有数据
|
1775
1815
|
# data_aggregation_one(service_databases=[{'company': 'mysql'}], months=1) # 单独聚合某一个数据库,具体库进函数编辑
|
1776
1816
|
# optimize_data.op_data(service_databases=[{'company': 'mysql'}], days=3650) # 立即启动对聚合数据的清理工作
|
1777
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|