mdbq 2.4.6__py3-none-any.whl → 2.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +3 -1
- mdbq/aggregation/query_data.py +219 -51
- {mdbq-2.4.6.dist-info → mdbq-2.4.8.dist-info}/METADATA +1 -1
- {mdbq-2.4.6.dist-info → mdbq-2.4.8.dist-info}/RECORD +6 -6
- {mdbq-2.4.6.dist-info → mdbq-2.4.8.dist-info}/WHEEL +0 -0
- {mdbq-2.4.6.dist-info → mdbq-2.4.8.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -260,6 +260,7 @@ class DatabaseUpdate:
|
|
260
260
|
if len(df) == 0:
|
261
261
|
print(f'{name} 报表数据为空')
|
262
262
|
check_remove_file = True
|
263
|
+
os.remove(os.path.join(root, name))
|
263
264
|
continue
|
264
265
|
df.replace(to_replace=[','], value='', regex=True, inplace=True)
|
265
266
|
df.insert(loc=0, column='日期', value=pattern[0][1])
|
@@ -398,7 +399,7 @@ class DatabaseUpdate:
|
|
398
399
|
pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
|
399
400
|
if not pattern or '省份城市分析2' not in name:
|
400
401
|
print(f'{name} 不支持或已转换的表格')
|
401
|
-
|
402
|
+
os.remove(os.path.join(root, name)) # 直接删掉,避免被分到原始文件, encoding 不同会引发错误
|
402
403
|
check_remove_file = True
|
403
404
|
continue
|
404
405
|
date = '-'.join(pattern[0][1:])
|
@@ -406,6 +407,7 @@ class DatabaseUpdate:
|
|
406
407
|
if len(df) == 0:
|
407
408
|
print(f'{name} 报表数据为空')
|
408
409
|
check_remove_file = True
|
410
|
+
os.remove(os.path.join(root, name))
|
409
411
|
continue
|
410
412
|
df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
|
411
413
|
df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
|
mdbq/aggregation/query_data.py
CHANGED
@@ -998,6 +998,45 @@ class GroupBy:
|
|
998
998
|
# df = df.head(1000)
|
999
999
|
# df.to_csv('/Users/xigua/Downloads/test.csv', index=False, header=True, encoding='utf-8_sig')
|
1000
1000
|
# breakpoint()
|
1001
|
+
|
1002
|
+
# 下面是添加人群 AIPL 分类
|
1003
|
+
dir_file = f'\\\\192.168.1.198\\时尚事业部\\01.运营部\\0-电商周报-每周五更新\\分类配置文件.xlsx'
|
1004
|
+
dir_file2 = '/Volumes/时尚事业部/01.运营部/0-电商周报-每周五更新/分类配置文件.xlsx'
|
1005
|
+
if platform.system() == 'Windows':
|
1006
|
+
dir_file3 = 'C:\\同步空间\\BaiduSyncdisk\\原始文件2\\分类配置文件.xlsx'
|
1007
|
+
else:
|
1008
|
+
dir_file3 = '/Users/xigua/数据中心/原始文件2/分类配置文件.xlsx'
|
1009
|
+
if not os.path.isfile(dir_file):
|
1010
|
+
dir_file = dir_file2
|
1011
|
+
if not os.path.isfile(dir_file):
|
1012
|
+
dir_file = dir_file3
|
1013
|
+
if os.path.isfile(dir_file):
|
1014
|
+
df_fl = pd.read_excel(dir_file, sheet_name='人群分类', header=0)
|
1015
|
+
df_fl = df_fl[['人群名字', '人群分类']]
|
1016
|
+
# 合并并获取分类信息
|
1017
|
+
df = pd.merge(df, df_fl, left_on=['人群名字'], right_on=['人群名字'], how='left')
|
1018
|
+
df['人群分类'].fillna('', inplace=True)
|
1019
|
+
if '人群分类' in df.columns.tolist():
|
1020
|
+
# 这行决定了,从文件中读取的分类信息优先级高于内部函数的分类规则
|
1021
|
+
# 这个 lambda 适配人群名字中带有特定标识的分类,强匹配
|
1022
|
+
df['人群分类'] = df.apply(
|
1023
|
+
lambda x: self.set_crowd(keyword=str(x['人群名字']), as_file=False) if x['人群分类'] == ''
|
1024
|
+
else x['人群分类'], axis=1
|
1025
|
+
)
|
1026
|
+
# 这个 lambda 适配人群名字中聚类的特征字符,弱匹配
|
1027
|
+
df['人群分类'] = df.apply(
|
1028
|
+
lambda x: self.set_crowd2(keyword=str(x['人群名字']), as_file=False) if x['人群分类'] == ''
|
1029
|
+
else x['人群分类'], axis=1
|
1030
|
+
)
|
1031
|
+
else:
|
1032
|
+
df['人群分类'] = df['人群名字'].apply(lambda x: self.set_crowd(keyword=str(x), as_file=False))
|
1033
|
+
df['人群分类'] = df.apply(
|
1034
|
+
lambda x: self.set_crowd2(keyword=str(x['人群名字']), as_file=False) if x['人群分类'] == ''
|
1035
|
+
else x['人群分类'], axis=1
|
1036
|
+
)
|
1037
|
+
df['人群分类'] = df['人群分类'].apply(lambda x: str(x).upper() if x else x)
|
1038
|
+
# df.to_csv('/Users/xigua/Downloads/test_人群分类.csv', index=False, header=True, encoding='utf-8_sig')
|
1039
|
+
# breakpoint()
|
1001
1040
|
return df
|
1002
1041
|
|
1003
1042
|
elif '天猫_关键词报表' in table_name:
|
@@ -1048,7 +1087,7 @@ class GroupBy:
|
|
1048
1087
|
dir_file = dir_file2
|
1049
1088
|
if os.path.isfile(dir_file):
|
1050
1089
|
df_fl = pd.read_excel(dir_file, sheet_name='关键词分类', header=0)
|
1051
|
-
df_fl.rename(columns={'分类1': '词分类'}, inplace=True)
|
1090
|
+
# df_fl.rename(columns={'分类1': '词分类'}, inplace=True)
|
1052
1091
|
df_fl = df_fl[['关键词', '词分类']]
|
1053
1092
|
# 合并并获取词分类信息
|
1054
1093
|
df = pd.merge(df, df_fl, left_on=['词名字/词包名字'], right_on=['关键词'], how='left')
|
@@ -1620,6 +1659,138 @@ class GroupBy:
|
|
1620
1659
|
break
|
1621
1660
|
return result
|
1622
1661
|
|
1662
|
+
def set_crowd(self, keyword, as_file=False):
    """Classify a promotion crowd name into an AIPL tier by explicit marker.

    This is the "strong match" rule: the crowd name must carry an explicit
    tier tag such as ``_a``, ``a_``, ``_pl`` or ``pl_`` (case-insensitive).

    Args:
        keyword: Crowd name; it is converted with ``str()`` before matching.
        as_file: Unused by this method; kept so the signature stays
            parallel to ``set_crowd2``.

    Returns:
        ``'A'``, ``'I'``, ``'P'`` or ``'L'`` for the first tier whose
        pattern matches, or ``''`` when no tier marker is found.
    """
    text = str(keyword)
    # Tiers are tested in this order and the first hit wins.
    # The P pattern previously contained an empty alternative ("||"), which
    # matched the empty string at every position and made the trailing
    # "^pl_" alternative unreachable; it is now a plain alternation.
    tier_patterns = (
        ('A', r'_a$|_a_|_ai|^a_'),
        ('I', r'_i$|_i_|^i_'),
        ('P', r'_p$|_p_|_pl|^p_|^pl_'),
        ('L', r'_l$|_l_|^l_'),
    )
    for tier, pattern in tier_patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return tier
    return ''
|
1696
|
+
|
1697
|
+
def set_crowd2(self, keyword, as_file=False):
    """Classify a promotion crowd name into an AIPL tier by feature keyword.

    This is the "weak match" rule: the crowd name is searched
    (case-insensitively) for characteristic substrings. Tiers are tested in
    the order A, I, P, L and, within a tier, keywords are tested in list
    order; the first hit decides the result.

    Args:
        keyword: Crowd name; it is converted with ``str()`` before matching.
        as_file: When true, the built-in rule table is also dumped as JSON
            to ``分类配置_推广人群分类_函数内置规则.json`` under
            ``self.output`` before matching.

    Returns:
        ``'A'``, ``'I'``, ``'P'`` or ``'L'``, or ``''`` when no keyword
        occurs in the name.
    """
    datas = [
        {
            '类别': 'A',
            '值': [
                '相似宝贝',
                '相似店铺',
                '类目',
                '88VIP',
                '拉新',
                '潮流',
                '会场',
                '意向',
                '>>',  # system-recommended search-related crowds
                '关键词:',  # system-recommended search-related crowds
                '关键词_',  # self-built search-related crowds
                '扩展',
                '敏感人群',
                '尝鲜',
                '小二推荐',
                '竞争',
                '资深',
                '女王节',
                '本行业',
                '618',
                '包包树',
                '迪桑娜',
                '菲安妮',
                '卡思乐',
                '场景词',
                '竞对',
                '精选',
                '发现',
                # A missing comma here used to fuse the next two entries into
                # the single, never-matching keyword '行业mvp特征继承'.
                '行业mvp',
                '特征继承',
                '机会',
                '推荐',
                '智能定向',
            ],
        },
        {
            '类别': 'I',
            '值': [
                '行动',
                '收加',
                '收藏',
                '加购',
                '促首购',
                '店铺优惠券',
                '高转化',
                '认知',
                '喜欢我',  # system-recommended item-related crowds
                '未购买',
                '种草',
                '兴趣',
                '本店',
                '领券',
            ],
        },
        {
            '类别': 'P',
            '值': [
                '万里马',
                '购买',
                '已购',
                # A missing comma here used to fuse the next two entries into
                # the single, never-matching keyword '促复购店铺会员'.
                '促复购',
                '店铺会员',
                '店铺粉丝',
                '转化',
            ],
        },
        {
            '类别': 'L',
            '值': [
                'L人群',
            ],
        },
    ]
    if as_file:
        # ensure_ascii=False writes the Chinese text verbatim, so the file
        # encoding is pinned instead of relying on the platform default.
        # The stray breakpoint() that followed the dump was removed: it would
        # halt any non-interactive run that enabled as_file.
        rules_path = os.path.join(self.output, '分类配置_推广人群分类_函数内置规则.json')
        with open(rules_path, 'w', encoding='utf-8') as f:
            json.dump(datas, f, ensure_ascii=False, sort_keys=False, indent=4)

    text = str(keyword)
    for data in datas:
        for item in data['值']:
            # Keywords are literal substrings, so escape them rather than
            # letting any metacharacter be interpreted as regex syntax.
            if re.search(re.escape(item), text, re.IGNORECASE):
                return data['类别']
    return ''
|
1793
|
+
|
1623
1794
|
# @try_except
|
1624
1795
|
def performance(self, bb_tg=True):
|
1625
1796
|
# print(self.data_tgyj)
|
@@ -1870,7 +2041,7 @@ def data_aggregation_one(service_databases=[{}], months=1):
|
|
1870
2041
|
) # 3. 回传数据库
|
1871
2042
|
|
1872
2043
|
|
1873
|
-
def data_aggregation(service_databases=[{}], months=1):
|
2044
|
+
def data_aggregation(service_databases=[{}], months=1, is_juhe=True):
|
1874
2045
|
"""
|
1875
2046
|
1. 从数据库中读取数据
|
1876
2047
|
2. 数据聚合清洗
|
@@ -2054,54 +2225,51 @@ def data_aggregation(service_databases=[{}], months=1):
|
|
2054
2225
|
icm_update=unique_key_list,
|
2055
2226
|
service_database=service_database,
|
2056
2227
|
) # 3. 回传数据库
|
2057
|
-
|
2058
|
-
|
2059
|
-
|
2060
|
-
|
2061
|
-
|
2062
|
-
|
2063
|
-
|
2064
|
-
|
2065
|
-
|
2066
|
-
|
2067
|
-
|
2068
|
-
|
2069
|
-
|
2070
|
-
|
2071
|
-
|
2072
|
-
|
2073
|
-
|
2074
|
-
|
2075
|
-
|
2076
|
-
|
2077
|
-
|
2078
|
-
|
2079
|
-
|
2080
|
-
|
2081
|
-
|
2082
|
-
|
2083
|
-
|
2084
|
-
|
2085
|
-
|
2086
|
-
|
2087
|
-
|
2088
|
-
|
2089
|
-
|
2090
|
-
|
2091
|
-
|
2092
|
-
|
2093
|
-
|
2094
|
-
|
2095
|
-
|
2096
|
-
|
2097
|
-
|
2098
|
-
|
2099
|
-
|
2100
|
-
|
2101
|
-
|
2102
|
-
service_database=service_database,
|
2103
|
-
)
|
2104
|
-
|
2228
|
+
if is_juhe:
|
2229
|
+
res = g.performance(bb_tg=True) # 盈亏表,依赖其他表,单独做
|
2230
|
+
m.df_to_mysql(
|
2231
|
+
df=res,
|
2232
|
+
db_name='聚合数据',
|
2233
|
+
table_name='_全店商品销售',
|
2234
|
+
move_insert=True, # 先删除,再插入
|
2235
|
+
# df_sql=True,
|
2236
|
+
# drop_duplicates=False,
|
2237
|
+
# icm_update=['日期', '商品id'], # 设置唯一主键
|
2238
|
+
service_database=service_database,
|
2239
|
+
)
|
2240
|
+
res = g.performance(bb_tg=False) # 盈亏表,依赖其他表,单独做
|
2241
|
+
m.df_to_mysql(
|
2242
|
+
df=res,
|
2243
|
+
db_name='聚合数据',
|
2244
|
+
table_name='_推广商品销售',
|
2245
|
+
move_insert=True, # 先删除,再插入
|
2246
|
+
# df_sql=True,
|
2247
|
+
# drop_duplicates=False,
|
2248
|
+
# icm_update=['日期', '商品id'], # 设置唯一主键
|
2249
|
+
service_database=service_database,
|
2250
|
+
)
|
2251
|
+
res = g.performance_concat(bb_tg=False) # 推广主体合并直播表,依赖其他表,单独做
|
2252
|
+
m.df_to_mysql(
|
2253
|
+
df=res,
|
2254
|
+
db_name='聚合数据',
|
2255
|
+
table_name='天猫_推广汇总',
|
2256
|
+
move_insert=True, # 先删除,再插入
|
2257
|
+
# df_sql=True,
|
2258
|
+
# drop_duplicates=False,
|
2259
|
+
# icm_update=['日期', '推广渠道', '营销场景', '商品id', '花费', '展现量', '点击量'], # 设置唯一主键
|
2260
|
+
service_database=service_database,
|
2261
|
+
)
|
2262
|
+
res = g.performance_jd(jd_tg=False) # 盈亏表,依赖其他表,单独做
|
2263
|
+
m.df_to_mysql(
|
2264
|
+
df=res,
|
2265
|
+
db_name='聚合数据',
|
2266
|
+
table_name='_京东_推广商品销售',
|
2267
|
+
move_insert=True, # 先删除,再插入
|
2268
|
+
# df_sql=True,
|
2269
|
+
# drop_duplicates=False,
|
2270
|
+
# icm_update=['日期', '跟单sku id', '货号', '花费'], # 设置唯一主键
|
2271
|
+
service_database=service_database,
|
2272
|
+
)
|
2105
2273
|
|
2106
2274
|
# 这里要注释掉,不然 copysh.py 可能有问题,这里主要修改配置文件,后续触发 home_lx 的 optimize_datas.py(有s)程序进行全局清理
|
2107
2275
|
# optimize_data.op_data(service_databases=service_databases, days=3650) # 立即启动对聚合数据的清理工作
|
@@ -2112,6 +2280,6 @@ def main():
|
|
2112
2280
|
|
2113
2281
|
|
2114
2282
|
if __name__ == '__main__':
|
2115
|
-
data_aggregation(service_databases=[{'company': 'mysql'}], months=
|
2283
|
+
data_aggregation(service_databases=[{'company': 'mysql'}], months=0, is_juhe=False) # 正常的聚合所有数据
|
2116
2284
|
# data_aggregation_one(service_databases=[{'company': 'mysql'}], months=1) # 单独聚合某一个数据库,具体库进函数编辑
|
2117
2285
|
# optimize_data.op_data(service_databases=[{'company': 'mysql'}], days=3650) # 立即启动对聚合数据的清理工作
|
@@ -1,11 +1,11 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
2
|
mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/aggregation.py,sha256=
|
4
|
+
mdbq/aggregation/aggregation.py,sha256=nPp5fOLktxejNEak3SyTnKLjwzK1l2xjbV45X-I4LFQ,76131
|
5
5
|
mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
|
6
6
|
mdbq/aggregation/mysql_types.py,sha256=DQYROALDiwjJzjhaJfIIdnsrNs11i5BORlj_v6bp67Y,11062
|
7
7
|
mdbq/aggregation/optimize_data.py,sha256=Wis40oL04M7E1pkvgNPjyVFAUe-zgjimjIVAikxYY8Y,4418
|
8
|
-
mdbq/aggregation/query_data.py,sha256=
|
8
|
+
mdbq/aggregation/query_data.py,sha256=mL9kGu1sZf0bIRI_s2PwF12tPR0z7jGkC_qWAz_5wG8,103128
|
9
9
|
mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
|
10
10
|
mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
|
11
11
|
mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
|
@@ -41,7 +41,7 @@ mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
|
41
41
|
mdbq/req_post/req_tb.py,sha256=PexWSCPJNM6Tv0ol4lAWIhlOwsAr_frnjtcdSHCFiek,36179
|
42
42
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
43
43
|
mdbq/spider/aikucun.py,sha256=Olq7IJP9itM4wuNxZeHOG-Q3i8wWyB4hY8TUGGwCvQ0,14104
|
44
|
-
mdbq-2.4.
|
45
|
-
mdbq-2.4.
|
46
|
-
mdbq-2.4.
|
47
|
-
mdbq-2.4.
|
44
|
+
mdbq-2.4.8.dist-info/METADATA,sha256=1PkjCLlpg0ipzA5WgllXzHDYkMUb10_sMB0RU2xIIww,245
|
45
|
+
mdbq-2.4.8.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
46
|
+
mdbq-2.4.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
47
|
+
mdbq-2.4.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|