PyPI - mdbq - Versions diffs - 1.9.5__py3-none-any.whl → 1.9.7__py3-none-any.whl - Mend

mdbq: mdbq-1.9.5-py3-none-any.whl → mdbq-1.9.7-py3-none-any.whl

This diff shows the differences between the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.

Files changed (7)

mdbq/aggregation/aggregation.py +101 -2
mdbq/clean/data_clean.py +3 -1
mdbq/company/copysh.py +1 -0
{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/METADATA +1 -1
{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/RECORD +7 -7
{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/WHEEL +0 -0
{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/top_level.txt +0 -0

mdbq/aggregation/aggregation.py CHANGED Viewed

@@ -59,6 +59,7 @@ class DatabaseUpdate:
         for root, dirs, files in os.walk(self.path, topdown=False):
             for name in files:
+                check_remove_file = False  # 设置这个参数的目的: 避免误删其他文件， 不是本程序数据清洗覆盖的文件不做干预
                 if '~$' in name or '.DS' in name or '.localized' in name or '.ini' in name or '$RECYCLE.BIN' in name or 'Icon' in name:
                     continue
                 db_name = None  # 初始化/重置变量，避免进入下一个循环
@@ -93,17 +94,21 @@ class DatabaseUpdate:
                             ck = df.columns.tolist()
                             if '场景名字' not in ck:
                                 print(f'1.2.0 {name} 报表字段缺失, 请选择Pbix数据模板下载')
+                                check_remove_file = True
                                 continue
                         if len(df) == 0:
                             print(f'1.3.0 {name} 报表是空的, 请重新下载')
+                            check_remove_file = True
                             continue
                         cols = df.columns.tolist()
                         if '日期' not in cols:
                             print(f'1.4.0 {name} 报表不包含分日数据, 已跳过')
+                            check_remove_file = True
                             continue
                         if '省' in cols:
                             if '市' not in cols:
                                 print(f'1.5.0 {name} 请下载市级地域报表，而不是省报表')
+                                check_remove_file = True
                                 continue
                         # df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True)  # 替换掉特殊字符
                         # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
@@ -114,11 +119,13 @@ class DatabaseUpdate:
                         else:
                             db_name = '推广数据2'
                             collection_name = f'{tg_name}'
+                        check_remove_file = True
                 if name.endswith('.csv') and '超级直播' in name:
                     # 超级直播
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     pattern = re.findall(r'(.*_)\d{8}_\d{6}', name)
                     if not pattern:  # 说明已经转换过
@@ -130,27 +137,34 @@ class DatabaseUpdate:
                         shop_name = ''
                     # df.replace(to_replace=['\\N'], value=0, regex=False, inplace=True)  # 替换掉特殊字符
                     # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('.xls') and '短直联投' in name:
                     # 短直联投
                     df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
                     df = pd.concat(df)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('.xls') and '视频加速推广' in name:
                     # 超级短视频
                     df = pd.read_excel(os.path.join(root, name), sheet_name=None, header=0)
                     df = pd.concat(df)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     # df.replace(to_replace=[''], value=0, regex=False, inplace=True)
+                    check_remove_file = True
                 if '人群报表汇总' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
+                    check_remove_file = True
                 # ----------------- 推广报表 分割线 -----------------
                 # ----------------- 推广报表 分割线 -----------------
                 date01 = re.findall(r'(\d{4}-\d{2}-\d{2})_\d{4}-\d{2}-\d{2}', str(name))
@@ -161,6 +175,7 @@ class DatabaseUpdate:
                     df = pd.read_excel(os.path.join(root, name), header=5)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     # df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
                     # df.replace(to_replace=[','], value='', regex=True, inplace=True)
@@ -186,15 +201,19 @@ class DatabaseUpdate:
                             collection_name='店铺来源_月数据_旧版'
                         else:
                             collection_name='店铺来源_日数据_旧版'
+                    check_remove_file = True
                 elif name.endswith('.csv') and '客户运营平台_客户列表' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.xlsx') and '直播分场次效果' in name:
                     pattern = re.findall(r'(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})', name)
                     if pattern:
+                        check_remove_file = True
                         continue
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                     df.replace(to_replace=[','], value='', regex=True, inplace=True)
@@ -203,6 +222,7 @@ class DatabaseUpdate:
                     df['日期'] = df['日期'].apply(
                         lambda x: pd.to_datetime(str(x).split(' ')[0], format='%Y-%m-%d', errors='ignore') if x else x)
                     df.insert(loc=1, column='店铺', value='万里马官方旗舰店')
+                    check_remove_file = True
                 elif name.endswith('.xls') and '生意参谋' in name and '无线店铺三级流量来源详情' in name:
                     # 店铺来源，手淘搜索，关键词
@@ -210,6 +230,7 @@ class DatabaseUpdate:
                     df = pd.read_excel(os.path.join(root, name), header=5)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.replace(to_replace=[','], value='', regex=True, inplace=True)
                     df.insert(loc=0, column='日期', value=pattern[0][1])
@@ -221,12 +242,14 @@ class DatabaseUpdate:
                     if pattern[0][0] != pattern[0][1]:
                         data_lis = pattern[0][0] + '_' + pattern[0][1]
                         df.insert(loc=1, column='数据周期', value=data_lis)
+                    check_remove_file = True
                 elif name.endswith('.xls') and '生意参谋' in name and '商品_全部' in name:
                     # 店铺商品排行
                     df = pd.read_excel(os.path.join(root, name), header=4)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     # df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
                     # df.replace(to_replace=[','], value='', regex=True, inplace=True)
@@ -234,18 +257,22 @@ class DatabaseUpdate:
                     if date01[0] != date02[0]:
                         data_lis = date01[0] + '_' + date02[0]
                         df.insert(loc=1, column='数据周期', value=data_lis)
+                    check_remove_file = True
                 elif name.endswith('.xls') and '参谋店铺整体日报' in name:
                     # 自助取数，店铺日报
                     df = pd.read_excel(os.path.join(root, name), header=7)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.rename(columns={'统计日期': '日期'}, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('.xls') and '参谋每日流量_自助取数_新版' in name:
                     # 自助取数，每日流量
                     df = pd.read_excel(os.path.join(root, name), header=7)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.rename(columns={'统计日期': '日期'}, inplace=True)
                     # 2024-2-19 官方更新了推广渠道来源名称，自助取数没有更新，这里强制更改
@@ -258,11 +285,13 @@ class DatabaseUpdate:
                         else '智能场景' if x == '智能场景(原万相台)'
                         else x
                     )
+                    check_remove_file = True
                 elif name.endswith('.xls') and '商品sku' in name:
                     # 自助取数，商品sku
                     df = pd.read_excel(os.path.join(root, name), header=7)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.rename(columns={
                         '统计日期': '日期',
@@ -270,11 +299,13 @@ class DatabaseUpdate:
                         'SKU ID': 'sku id',
                         '商品SKU': '商品sku',
                     }, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('.xls') and '参谋店铺流量来源（月）' in name:
                     # 自助取数，月店铺流量来源
                     df = pd.read_excel(os.path.join(root, name), header=7)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.rename(columns={'统计日期': '数据周期'}, inplace=True)
                     # 2024-2-19 官方更新了推广渠道来源名称，自助取数没有更新，这里强制更改
@@ -288,47 +319,56 @@ class DatabaseUpdate:
                         else x
                     )
                     df['日期'] = df['数据周期'].apply(lambda x: re.findall('(.*) ~', x)[0])
+                    check_remove_file = True
                 elif name.endswith('.csv') and 'baobei' in name:
                     # 生意经宝贝指标日数据
                     date = re.findall(r's-(\d{4})(\d{2})(\d{2})\.', str(name))
                     if not date:  # 阻止月数据及已转换的表格
                         print(f'{name}  不支持或是已转换的表格')
                         # os.remove(os.path.join(root, name))  # 直接删掉，避免被分到原始文件, encoding 不同会引发错误
+                        check_remove_file = True
                         continue
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
                         os.remove(os.path.join(root, name))
+                        check_remove_file = True
                         continue
                     if '日期' in df.columns.tolist():
                         df.pop('日期')
                     new_date = '-'.join(date[0])
                     df.insert(loc=0, column='日期', value=new_date)
                     df.replace(to_replace=['--'], value='', regex=False, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '店铺销售指标' in name:
                     # 生意经, 店铺指标，仅限月数据，实际日指标也可以
                     name_st = re.findall(r'(.*)\(分日', name)
                     if not name_st:
                         print(f'{name}  已转换的表格')
+                        check_remove_file = True
                         continue
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df['日期'] = df['日期'].astype(str).apply(
                         lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', x)[0]) if x else x)
                     df.replace(to_replace=['--'], value='', regex=False, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('csv') and '省份城市分析' in name:
                     # 生意经，地域分布, 仅限日数据
                     pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\.', name)
                     if not pattern or '省份城市分析2' not in name:
                         print(f'{name}  不支持或已转换的表格')
                         # os.remove(os.path.join(root, name))  # 直接删掉，避免被分到原始文件, encoding 不同会引发错误
+                        check_remove_file = True
                         continue
                     date = '-'.join(pattern[0][1:])
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df['省'] = df['省份'].apply(lambda x: x if ' ├─ ' not in x and ' └─ ' not in x else None)
                     df['城市'] = df[['省份', '省']].apply(lambda x: '汇总' if x['省'] else x['省份'], axis=1)
@@ -342,12 +382,14 @@ class DatabaseUpdate:
                     df['省+市'] = df[['省份', '城市']].apply(lambda x: f'{x["省份"]}-{x["城市"]}', axis=1)
                     df.replace('NAN', 0, inplace=True)
                     df['笔单价'] = df.apply(lambda x: 0 if x['销售量'] == 0 else 0 if x['销售量'] == '0' else x['笔单价'], axis=1)
+                    check_remove_file = True
                 elif name.endswith('csv') and 'order' in name:
                     # 生意经，订单数据，仅限月数据
                     pattern = re.findall(r'(.*)(\d{4})(\d{2})(\d{2})-(\d{4})(\d{2})(\d{2})', name)
                     if not pattern:
                         print(f'{name}  不支持或已转换的表格')
                         # os.remove(os.path.join(root, name))  # 直接删掉，避免被分到原始文件, encoding 不同会引发错误
+                        check_remove_file = True
                         continue
                     date1 = pattern[0][1:4]
                     date1 = '-'.join(date1)
@@ -357,6 +399,7 @@ class DatabaseUpdate:
                     df = pd.read_csv(os.path.join(root, name), encoding='gb18030', header=0, na_filter=False)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.insert(loc=0, column='日期', value=date1)
                     df.insert(loc=1, column='数据周期', value=date)
@@ -365,30 +408,38 @@ class DatabaseUpdate:
                     df.rename(columns={'宝贝标题': '商品标题', '宝贝链接': '商品链接'}, inplace=True)
                     df['颜色编码'] = df['商家编码'].apply(
                         lambda x: ''.join(re.findall(r' .*(\d{4})$', str(x))) if x else x)
+                    check_remove_file = True
                 elif name.endswith('.xlsx') and '直播间成交订单明细' in name:
                     # 直播间成交订单明细
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.rename(columns={'场次ID': '场次id', '商品ID': '商品id'}, inplace=True)
                     df['日期'] = df['支付时间'].apply(lambda x: x.strftime('%Y-%m-%d'))
+                    check_remove_file = True
                 elif name.endswith('.xlsx') and '直播间大盘数据' in name:
                     # 直播间大盘数据
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.rename(columns={'统计日期': '日期'}, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('.xls') and '直播业绩-成交拆解' in name:
                     # 直播业绩-成交拆解
                     df = pd.read_excel(os.path.join(root, name), header=5)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.rename(columns={'统计日期': '日期'}, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '淘宝店铺数据' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '人群洞察' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                     df.replace(to_replace=['--'], value='', regex=False, inplace=True)
@@ -397,37 +448,50 @@ class DatabaseUpdate:
                         if is_move:
                             try:
                                 os.remove(os.path.join(root, name))  # 是否移除原文件
+                                check_remove_file = True
                             except Exception as e:
                                 print(f'{name},  {e}')
                         continue
                 elif name.endswith('.csv') and '客户_客户概况_画像' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '市场排行_店铺' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_商品发现' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '类目洞察_属性分析_分析明细_汇总' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_商品发现' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '类目洞察_价格分析_分析明细_汇总' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '搜索排行_搜索' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '竞店分析-销售分析-关键指标对比' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '竞店分析-销售分析-top商品榜' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '竞店分析-来源分析-入店来源' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '竞店分析-来源分析-入店搜索词' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    check_remove_file = True
                 # ----------------------- 京东数据处理分界线 -----------------------
                 # ----------------------- 京东数据处理分界线 -----------------------
                 elif name.endswith('.xlsx') and '店铺来源_流量来源' in name:
                     # 京东店铺来源
                     if '按天' not in name:
                         print(f'{name} 京东流量请按天下载')
+                        check_remove_file = True
                         continue
                     date01 = re.findall(r'(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
                     new_date01 = f'{date01[0][0]}-{date01[0][1]}-{date01[0][2]}'
@@ -436,6 +500,7 @@ class DatabaseUpdate:
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.insert(loc=0, column='日期', value=new_date01)
                     if new_date01 != new_date02:
@@ -444,17 +509,20 @@ class DatabaseUpdate:
                     for col_2024 in cols:  # 京东这个表有字段加了去年日期，删除这些同比数据字段，不然列数量爆炸
                         if '20' in col_2024 and '流量来源' in name:
                             df.drop(col_2024, axis=1, inplace=True)
+                    check_remove_file = True
                 elif name.endswith('.xlsx') and '全部渠道_商品明细' in name:
                     # 京东商品明细 文件转换
                     date1 = re.findall(r'_(\d{4})(\d{2})(\d{2})_全部', str(name))
                     if not date1[0]:
                         print(f'{name}: 仅支持日数据')
+                        check_remove_file = True
                         continue
                     if date1:
                         date1 = f'{date1[0][0]}-{date1[0][1]}-{date1[0][2]}'
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     if '10035975359247' in df['商品ID'].values or '10056642622343' in df['商品ID'].values:
                         new_name = f'sku_{date1}_全部渠道_商品明细.csv'
@@ -471,30 +539,37 @@ class DatabaseUpdate:
                     elif 'spu' in new_name:
                         db_name = '京东数据2'
                         collection_name = 'spu_商品明细'
+                    check_remove_file = True
                 elif name.endswith('.xlsx') and '搜索分析-排名定位-商品词下排名' in name:
                     # 京东商品词下排名
                     try:
                         pattern = re.findall(r'(\d{4}-\d{2}-\d{2})-(\d{4}-\d{2}-\d{2})', name)
                         if not pattern:
+                            check_remove_file = True
                             continue
                         if pattern[0][0] == pattern[0][1]:
                             print(f'{name}: 检测到数据周期异常，仅支持7天数据')
+                            check_remove_file = True
                             continue
                         df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
                         if len(df) == 0:
                             print(f'{name} 报表数据为空')
+                            check_remove_file = True
                             continue
                         if len(df.columns.tolist()) < 20:
                             print(f'{name}: 报表可能缺失诊断数据')
                             os.remove(os.path.join(root, name))
+                            check_remove_file = True
                             continue
                         df.rename(columns={'商品的ID': 'skuid'}, inplace=True)
                         for col in ['词人气', '搜索点击率']:
                             if col in df.columns.tolist():
                                 df[col] = df[col].apply(lambda x: round(x, 6) if x else x)
+                        check_remove_file = True
                     except Exception as e:
                         print(e)
                         print(name, '报错')
+                        check_remove_file = True
                         continue
                 elif name.endswith('.xlsx') and '搜索分析-排名定位-商品排名' in name:
                     # 京东商品排名
@@ -502,11 +577,13 @@ class DatabaseUpdate:
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.insert(0, '日期', date_in)  # 插入新列
                     df.rename(columns={'SKU': 'skuid'}, inplace=True)
                     if '点击率' in df.columns.tolist():
                         df['点击率'] = df['点击率'].apply(lambda x: round(x, 6) if x else x)
+                    check_remove_file = True
                 elif name.endswith('.xls') and '竞店概况_竞店详情' in name:
                     # 京东，竞争-竞店概况-竞店详情-全部渠道
                     date01 = re.findall(r'全部渠道_(\d{4})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})', str(name))
@@ -515,68 +592,87 @@ class DatabaseUpdate:
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df.insert(loc=0, column='日期', value=start_date)
-                elif name.endswith('.xls') and 'JD店铺日报_店铺' in name:
+                    check_remove_file = True
+                elif name.endswith('.xls') and ('JD店铺日报_店铺' in name or '店铺_20' in name):
                     # 京东 自助报表  店铺日报
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
+                        continue
+                    if '访客数-全部渠道' not in df.columns.tolist():  # 识别是否真的京东日报
                         continue
                     df['日期'] = df['日期'].apply(
                         lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
                     )
+                    check_remove_file = True
                 elif name.endswith('.xls') and '商家榜单_女包_整体' in name:
                     # 京东 行业 商家榜单
                     date2 = re.findall(r'_\d{8}-\d+', name)
                     if date2:
                         print(f'{name}: 请下载日数据，不支持其他周期')
                         # os.remove(os.path.join(root, name))  # 直接删掉，避免被分到原始文件, encoding 不同会引发错误
+                        check_remove_file = True
                         continue
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df['日期'] = df['日期'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:8]}')
                     df.insert(loc=0, column='类型', value='商家榜单')
+                    check_remove_file = True
                 elif name.endswith('.xlsx') and '批量SKU导出-批量任务' in name:
                     # 京东 sku 导出
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     d_time = datetime.datetime.today().strftime('%Y-%m-%d')
                     df.insert(loc=0, column='日期', value=d_time)
                     df['商品链接'] = df['商品链接'].apply(lambda x: f'https://{x}' if x else x)
+                    check_remove_file = True
                 elif name.endswith('.xlsx') and '批量SPU导出-批量任务' in name:
                     # 京东 spu 导出
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     d_time = datetime.datetime.today().strftime('%Y-%m-%d')
                     df.insert(loc=0, column='日期', value=d_time)
+                    check_remove_file = True
                 elif name.endswith('.csv') and '万里马箱包推广1_完整点击成交' in name:
                     # 京东推广数据
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
+                    check_remove_file = True
                 elif name.endswith('.csv') and '万里马箱包推广1_京东推广搜索词_pbix同步不要' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df['日期'] = df['日期'].apply(lambda x: f'{str(x)[:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
                     df['是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
                     df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
+                    check_remove_file = True
                 elif name.endswith('.xlsx') and '零售明细统计' in name:
                     df = pd.read_excel(os.path.join(root, name), header=0)
                     if len(df) == 0:
                         print(f'{name} 报表数据为空')
+                        check_remove_file = True
                         continue
                     df = df[df['缩略图'] != '合计']
+                    check_remove_file = True
                 elif name.endswith('.csv') and '营销概况_全站营销' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=1, na_filter=False)
                     df = df[(df['日期'] != '日期') & (df['日期'] != '汇总') & (df['日期'] != '0') & (df['花费'] != '0') & (df['花费'] != '0.00')]
@@ -584,6 +680,7 @@ class DatabaseUpdate:
                     df.drop("'当前时间'", axis=1, inplace=True)
                     df.rename(columns={'全站ROI': '全站roi'}, inplace=True)
                     df.insert(loc=1, column='产品线', value='全站营销')
+                    check_remove_file = True
                 elif name.endswith('.csv') and '关键词点击成交报表_pbix同步_勿删改' in name:
                     df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
                     for col in df.columns.tolist():
@@ -595,6 +692,7 @@ class DatabaseUpdate:
                     df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore')
                     # min_clm = str(df['日期'].min()).split(' ')[0]
                     # max_clm = str(df['日期'].max()).split(' ')[0]
+                    check_remove_file = True
                 # 商品素材，必须保持放在最后处理
                 elif name.endswith('xlsx'):
@@ -619,8 +717,9 @@ class DatabaseUpdate:
                             collection_name = '商品素材导出'
                         else:
                             df = pd.DataFrame()
+                        check_remove_file = True
-                if is_move:
+                if is_move and check_remove_file:
                     try:
                         os.remove(os.path.join(root, name))  # 是否移除原文件
                     except Exception as e:

mdbq/clean/data_clean.py CHANGED Viewed

@@ -895,13 +895,15 @@ class DataClean:
                             m.df_to_mysql(df=df, db_name='京东数据1', tabel_name='京东_竞店监控_日数据')
                         os.remove(os.path.join(root, name))
-                    elif name.endswith('.xls') and '店铺' in name:
+                    elif name.endswith('.xls') and ('JD店铺日报_店铺' in name or '店铺_20' in name):
                         # 京东 自助报表  店铺日报
                         df = pd.read_excel(os.path.join(root, name), header=0)
                         if len(df) == 0:
                             print(f'{name} 报表数据为空')
                             os.remove(os.path.join(root, name))
                             continue
+                        if '访客数-全部渠道' not in df.columns.tolist():  # 识别是否真的京东日报
+                            continue
                         df['日期'] = df['日期'].apply(
                             lambda x: '-'.join(re.findall(r'(\d{4})(\d{2})(\d{2})', str(x))[0])
                         )

mdbq/company/copysh.py CHANGED Viewed

@@ -377,3 +377,4 @@ if __name__ == '__main__':
     main()
     # # 聚合数据，并清理聚合数据
     # query_data.data_aggregation(service_databases=[{'company': 'mysql'}], months=1)

{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mdbq
-Version: 1.9.5
+Version: 1.9.7
 Home-page: https://pypi.org/project/mdbsql
 Author: xigua,
 Author-email: 2587125111@qq.com

{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
 mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
-mdbq/aggregation/aggregation.py,sha256=dQdaZZ8PD8uHY5opW9M6EIEONv-q_V-e_XtvITtJNrc,67166
+mdbq/aggregation/aggregation.py,sha256=QAN378cXlkwonHUDSBYfdZRfHBuqft_HR7Vfr8l87-k,72085
 mdbq/aggregation/df_types.py,sha256=oQJS2IBU3_IO6GMgbssHuC2yCjNnbta0QPGrFOwNLnU,7591
 mdbq/aggregation/mysql_types.py,sha256=DQYROALDiwjJzjhaJfIIdnsrNs11i5BORlj_v6bp67Y,11062
 mdbq/aggregation/optimize_data.py,sha256=u2Kl_MFtZueXJ57ycy4H2OhXD431RctUYJYCl637uT0,4176
@@ -9,9 +9,9 @@ mdbq/aggregation/query_data.py,sha256=32NjVVYLnfFkzD8TflmNVhpdQTLRRUrb9toMGApSOC
 mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
 mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
 mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
-mdbq/clean/data_clean.py,sha256=VI_f9mQ2tHExBytQCCkajbPVpC0yNEOaVCtySe_OW40,100789
+mdbq/clean/data_clean.py,sha256=y83uqOyM6nL0d3ClUqYMjE23ghBEkhz9uv19qrxA8NA,100980
 mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
-mdbq/company/copysh.py,sha256=4PGjvmPzvrmstOaAwHQGFXIGCWqqNXZEOYf1QdUvMlI,17762
+mdbq/company/copysh.py,sha256=VUaaJPXPYPHWwnkdK77PWz_dAXZyEmYBA9Df1yROHAc,17764
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/config/get_myconf.py,sha256=-CFEW0dQh4OIwVgwK-cL0eVp1LN3PjJgN89d4P5TB9I,6011
 mdbq/config/products.py,sha256=vIK8DJ-F3XXwvNPK-4OJq2tZITNlL6Sub8QBdoOng8U,5676
@@ -36,7 +36,7 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
 mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
 mdbq/pbix/refresh_all.py,sha256=0uAnBKCd5cx5FLTkawN1GV9yi87rfyMgYal5LABtumQ,7186
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq-1.9.5.dist-info/METADATA,sha256=qbeZPyNml9_seMx78A_nUdztJUVCi1xK8_E2MpdEu_4,245
-mdbq-1.9.5.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-mdbq-1.9.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
-mdbq-1.9.5.dist-info/RECORD,,
+mdbq-1.9.7.dist-info/METADATA,sha256=rqBOduo-xKxLyXbxt83RXob4dVYqlhdql_WL06TysmY,245
+mdbq-1.9.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+mdbq-1.9.7.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-1.9.7.dist-info/RECORD,,

{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{mdbq-1.9.5.dist-info → mdbq-1.9.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

mdbq 1.9.5__py3-none-any.whl → 1.9.7__py3-none-any.whl

mdbq 1.9.5__py3-none-any.whl → 1.9.7__py3-none-any.whl