mdbq 2.6.5__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff shows the contents of publicly released versions of this package as they appear in their public registry, and the changes between those versions. It is provided for informational purposes only.
@@ -1329,7 +1329,7 @@ if __name__ == '__main__':
  db_name = '京东数据3'
  table_name = '京东商智_spu_商品明细'
  upload_dir(
- path='/Users/xigua/数据中心/原始文件3/京东报表/spu_商品明细',
+ path='/Users/xigua/数据中心/原始文件3/京东报表/spu_商品明细qwqw',
  db_name=db_name,
  collection_name=table_name,
  dbs={'mysql': True, 'mongodb': False},
@@ -423,7 +423,6 @@ class MysqlDatasQuery:
  '总订单行': 1,
  '总订单金额': 1,
  '总加购数': 1,
- '下单新客数(去重)': 1,
  '领券数': 1,
  '商品关注数': 1,
  '店铺关注数': 1,
@@ -493,24 +492,26 @@ class MysqlDatasQuery:
  return pd.to_datetime(start_date), pd.to_datetime(end_date)

  @try_except
- def tm_search(self):
+ def se_search(self):
  start_date, end_date = self.months_data(num=self.months)
  projection = {
  '日期': 1,
- '关键词': 1,
+ '店铺名称': 1,
+ '搜索词': 1,
+ '词类型': 1,
  '访客数': 1,
+ '加购人数': 1,
+ '商品收藏人数': 1,
  '支付转化率': 1,
- '支付金额': 1,
- '下单金额': 1,
  '支付买家数': 1,
- '下单买家数': 1,
- '加购人数': 1,
+ '支付金额': 1,
  '新访客': 1,
- '店铺名称': 1,
+ '客单价': 1,
+ 'uv价值': 1,
  }
  df = self.download.data_to_df(
  db_name='生意参谋3',
- table_name='店铺来源_手淘搜索',
+ table_name='手淘搜索_本店引流词',
  start_date=start_date,
  end_date=end_date,
  projection=projection,
@@ -1462,7 +1463,6 @@ class GroupBy:
  '总订单行': ('总订单行', np.max),
  '总订单金额': ('总订单金额', np.max),
  '总加购数': ('总加购数', np.max),
- '下单新客数': ('下单新客数(去重)', np.max),
  '领券数': ('领券数', np.max),
  '商品关注数': ('商品关注数', np.max),
  '店铺关注数': ('店铺关注数', np.max)
@@ -1476,17 +1476,16 @@ class GroupBy:
  return df
  elif '天猫店铺来源_手淘搜索' in table_name:
  df = df.groupby(
- ['日期', '关键词', '店铺名称'],
+ ['日期', '店铺名称', '词类型', '搜索词'],
  as_index=False).agg(
  **{
  '访客数': ('访客数', np.max),
- '支付转化率': ('支付转化率', np.max),
+ '加购人数': ('加购人数', np.max),
  '支付金额': ('支付金额', np.max),
- '下单金额': ('下单金额', np.max),
+ '支付转化率': ('支付转化率', np.max),
  '支付买家数': ('支付买家数', np.max),
- '下单买家数': ('下单买家数', np.max),
- '加购人数': ('加购人数', np.max),
- '新访客': ('新访客', np.max),
+ '客单价': ('客单价', np.max),
+ 'uv价值': ('uv价值', np.max)
  }
  )
  return df
@@ -2077,6 +2076,7 @@ def data_aggregation(service_databases=[{}], months=1, is_juhe=True):
  2. 数据聚合清洗
  3. 统一回传数据库: <聚合数据> (不再导出为文件)
  公司台式机调用
+ months: 1+,写 0 表示当月数据,但在每月 1 号时可能会因为返回空数据出错
  """
  for service_database in service_databases:
  for service_name, database in service_database.items():
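A call matching the signature in the hunk header above might look like the sketch below (illustrative only; the import path and service value mirror calls that appear elsewhere in this diff). Per the added docstring note, months=0 limits the run to the current month and may fail with empty data on the 1st:

    from mdbq.aggregation import query_data

    # aggregate roughly the last month of data and push it back to the 聚合数据 database
    query_data.data_aggregation(service_databases=[{'company': 'mysql'}], months=1, is_juhe=True)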
@@ -2183,14 +2183,14 @@ def data_aggregation(service_databases=[{}], months=1, is_juhe=True):
  '数据库名': '聚合数据',
  '集合名': '天猫店铺来源_手淘搜索', # 暂缺
  '唯一主键': ['日期', '关键词', '访客数'],
- '数据主体': sdq.tm_search(),
- },
- {
- '数据库名': '聚合数据',
- '集合名': '生意参谋_直播场次分析', # 暂缺
- '唯一主键': ['场次id'],
- '数据主体': sdq.zb_ccfx(),
+ '数据主体': sdq.se_search(),
  },
+ # {
+ # '数据库名': '聚合数据',
+ # '集合名': '生意参谋_直播场次分析', # 暂缺
+ # '唯一主键': ['场次id'],
+ # '数据主体': sdq.zb_ccfx(),
+ # },
  {
  '数据库名': '聚合数据',
  '集合名': '多店推广场景_按日聚合',
@@ -2304,6 +2304,6 @@ def main():


  if __name__ == '__main__':
- data_aggregation(service_databases=[{'company': 'mysql'}], months=0, is_juhe=True) # 正常的聚合所有数据
+ data_aggregation(service_databases=[{'company': 'mysql'}], months=24, is_juhe=False) # 正常的聚合所有数据
  # data_aggregation_one(service_databases=[{'company': 'mysql'}], months=1) # 单独聚合某一个数据库,具体库进函数编辑
  # optimize_data.op_data(service_databases=[{'company': 'mysql'}], days=3650) # 立即启动对聚合数据的清理工作
@@ -151,7 +151,7 @@ class DataClean:
  self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
  os.remove(os.path.join(root, name))
  elif name.endswith('.xls') and '手淘搜索_本店引流词_' in name:
- df = pd.read_excel(os.path.join(root, name), header=5)
+ df = pd.read_excel(os.path.join(root, name), header=5, engine='xlrd')
  if len(df) == 0:
  print(f'{name} 报表数据不能为空')
  continue
@@ -382,7 +382,7 @@ class DataClean:
  sheets4 = ['账户', '推广计划', '推广单元', '创意', '品牌流量包', '定向人群'] # 品销宝
  file_name4 = os.path.splitext(name)[0] # 明星店铺报表
  for sheet4 in sheets4:
- df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='openpyxl')
+ df = pd.read_excel(os.path.join(root, name), sheet_name=sheet4, header=0, engine='xlrd')
  if len(df) == 0:
  print(f'{name} 报表数据为空')
  os.remove(os.path.join(root, name))
@@ -765,11 +765,11 @@ class DataClean:
  continue

  if name.endswith('.xlsx') and '京东推广_' in name:
- df = pd.read_excel(os.path.join(root, name), header=0)
+ df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
  new_name = f'py_xg_{name}'
  os.rename(os.path.join(root, name), os.path.join(root, new_name))
  elif name.endswith('.xlsx') and '京东商智_sku_商品明细' in name:
- df = pd.read_excel(os.path.join(root, name), header=0)
+ df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
  df.replace(to_replace=['-'], value='', regex=False, inplace=True)
  pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})', name)[0]
  df.insert(loc=0, column='日期', value=pattern)
@@ -780,7 +780,7 @@ class DataClean:
  index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
  os.remove(os.path.join(root, name))
  elif name.endswith('.xlsx') and '京东商智_spu_商品明细' in name:
- df = pd.read_excel(os.path.join(root, name), header=0)
+ df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
  df.replace(to_replace=['-'], value='', regex=False, inplace=True)
  pattern = re.findall(r'_(\d{4}-\d{2}-\d{2})', name)[0]
  df.insert(loc=0, column='日期', value=pattern)
@@ -791,7 +791,7 @@ class DataClean:
  index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
  os.remove(os.path.join(root, name))
  elif name.endswith('.xlsx') and '京东商智_店铺来源_三级来源' in name:
- df = pd.read_excel(os.path.join(root, name), header=0)
+ df = pd.read_excel(os.path.join(root, name), header=0, engine='openpyxl')
  df.replace(to_replace=['-'], value='', regex=False, inplace=True)
  df.rename(columns={'时间': '日期'}, inplace=True)
  for col in df.columns.tolist():
@@ -870,7 +870,7 @@ class DataClean:

  if name.endswith('.xlsx') and '商品素材_' in name:
  shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)_', name)[0]
- df = pd.read_excel(os.path.join(root, name), header=0)
+ df = pd.read_excel(os.path.join(root, name), header=0, engine='xlrd')
  if '日期' not in df.columns.tolist():
  df.insert(loc=0, column='日期', value=datetime.datetime.today().strftime('%Y-%m-%d'))
  if '店铺名称' not in df.columns.tolist():
@@ -1276,7 +1276,7 @@ class DataClean:
  new_path = os.path.join(root, zip_name_1) # 拼接解压后的文件路径
  if os.path.isfile(new_path) and '全部渠道_商品明细' in new_path: # 是否存在和包内同名的文件
  # 专门处理京东文件, 已过期可删
- df = pd.read_excel(new_path)
+ df = pd.read_excel(new_path, engine='xlrd')
  try:
  pattern1 = re.findall(r'\d{8}_(\d{4})(\d{2})(\d{2})_全部渠道_商品明细',
  name)
@@ -1466,6 +1466,57 @@ def test():
  # print(e)


+ def date_table(service_databases=[{}]):
+ """
+ 生成 pbix 使用的日期表
+ """
+ start_date = '2022-01-01' # 日期表的起始日期
+ yesterday = time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400))
+ dic = pd.date_range(start=start_date, end=yesterday)
+ df = pd.DataFrame(dic, columns=['日期'])
+ df.sort_values('日期', ascending=True, ignore_index=True, inplace=True)
+ df.reset_index(inplace=True)
+ # inplace 添加索引到 df
+ p = df.pop('index')
+ df['月2'] = df['日期']
+ df['月2'] = df['月2'].dt.month
+ df['日期'] = df['日期'].dt.date # 日期格式保留年月日,去掉时分秒
+ df['年'] = df['日期'].apply(lambda x: str(x).split('-')[0] + '年')
+ df['月'] = df['月2'].apply(lambda x: str(x) + '月')
+ # df.drop('月2', axis=1, inplace=True)
+ mon = df.pop('月2')
+ df['日'] = df['日期'].apply(lambda x: str(x).split('-')[2])
+ df['年月'] = df.apply(lambda x: x['年'] + x['月'], axis=1)
+ df['月日'] = df.apply(lambda x: x['月'] + x['日'] + '日', axis=1)
+ df['第n周'] = df['日期'].apply(lambda x: x.strftime('第%W周'))
+ df['索引'] = p
+ df['月索引'] = mon
+ df.sort_values('日期', ascending=False, ignore_index=True, inplace=True)
+
+ for service_database in service_databases:
+ for service_name, database in service_database.items():
+ username, password, host, port = get_myconf.select_config_values(
+ target_service=service_name,
+ database=database,
+ )
+ m = mysql.MysqlUpload(
+ username=username,
+ password=password,
+ host=host,
+ port=port,
+ )
+ m.df_to_mysql(
+ df=df,
+ db_name='聚合数据',
+ table_name='日期表',
+ move_insert=True, # 先删除,再插入
+ df_sql=False, # 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重
+ drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
+ filename=None, # 用来追踪处理进度
+ service_database=service_database, # 用来追踪处理进度
+ )
+
+
  def main(service_databases=None, is_mysql=False):
  """
  is_mysql: 调试时加,False: 是否后续的聚合数据
@@ -1505,6 +1556,8 @@ def main(service_databases=None, is_mysql=False):
  if not is_mysql:
  return

+ # 更新日期表
+ date_table(service_databases=service_databases)
  # 更新货品年份基准表, 属性设置 2 - 货品年份基准
  p = products.Products()
  p.to_mysql(service_databases=service_databases)
@@ -1544,16 +1597,16 @@


  if __name__ == '__main__':
- main(
- service_databases = [
- {'company': 'mysql'},
- # {'home_lx': 'mysql'},
- # {'home_lx': 'mongodb'},
- # {'nas': 'mysql'},
- ],
- is_mysql = False, # 清理聚合数据
- )
-
+ # main(
+ # service_databases = [
+ # {'company': 'mysql'},
+ # # {'home_lx': 'mysql'},
+ # # {'home_lx': 'mongodb'},
+ # # {'nas': 'mysql'},
+ # ],
+ # is_mysql = False, # 清理聚合数据
+ # )
+ date_table(service_databases=[{'company': 'mysql'}])
  # c = DataClean(
  # path=upload_path, # 源文件目录,下载文件夹
  # source_path=source_path3, # 原始文件保存目录
@@ -1564,3 +1617,4 @@ if __name__ == '__main__':


  # test()
+
mdbq/company/copysh.py CHANGED
@@ -22,6 +22,7 @@ from mdbq.config import products
  from mdbq.mysql import mysql
  from mdbq.pbix import refresh_all
  from mdbq.other import sku_picture
+ from mdbq.clean import clean_upload
  warnings.filterwarnings('ignore')

@@ -338,13 +339,16 @@ def op_data(days: int =100):
  # 清理所有非聚合数据的库
  optimize_data.op_data(
  db_name_lists=[
- '京东数据2',
+ '京东数据3',
+ '属性设置3',
  '推广数据2',
- '市场数据2',
- '生意参谋2',
+ '生意参谋3',
+ '推广数据_淘宝店',
+ '爱库存2'
+ '生意参谋3',
  '生意经2',
- '属性设置2',
  # '聚合数据', # 不在这里清理聚合数据, 还未开始聚合呢
+ '达摩盘3',
  ],
  days=days,
  )
@@ -363,34 +367,17 @@ def main():
  while True:
  res, d_path = u.check_date() # 文件中的 ch_record 值,决定是否执行更新
  if res:
- upload_path = f'windows/{str(datetime.date.today().strftime("%Y-%m"))}/{str(datetime.date.today())}'
+ upload_path = f'windows2/{str(datetime.date.today().strftime("%Y-%m"))}/{str(datetime.date.today())}'
  b = bdup.BaiDu()
+ # 从百度云下载文件
  b.download_dir(local_path=d_path, remote_path=upload_path)

- dp = aggregation.DatabaseUpdate(path=d_path)
- dp.new_unzip(is_move=True)
- dp.cleaning(is_move=True, is_except=[]) # 公司台式机需要移除自身下载的文件
- dp.upload_df(service_databases=[{'company': 'mysql'}])
- dp.date_table(service_databases=[{'company': 'mysql'}]) # 因为日期表不受 days 参数控制,因此单独更新日期表
- dp.other_table(service_databases=[{'company': 'mysql'}]) # 上传 support 文件夹下的 主推商品.csv
- # 更新货品年份基准表, 属性设置 2 - 货品年份基准
- p = products.Products()
- p.to_mysql(service_databases=[
- # {'home_lx': 'mysql'},
- {'company': 'mysql'}
- ]
+ # 对文件进行清洗和上传数据库
+ clean_upload.main(
+ service_databases = [{'company': 'mysql'}],
+ is_mysql = False, # 清理聚合数据
  )

- if datetime.datetime.now().day in [1, 3, 7, 9, 12, 16, 19, 22, 25, 27]:
- sku_picture.download_spu(
- service_name='company',
- database='mysql',
- db_name='属性设置2',
- table_name='商品spu素材下载记录',
- col_name='商品图片',
- save_path=os.path.join(f'\\\\192.168.1.198\\时尚事业部\\01.运营部\\天猫报表\\其他文件', '商品id_商家编码_图片'),
- )
-
  # 此操作用于修改 .copysh_conf 文件,将 ch_record 改为 false (更新完成)
  w = update_conf.UpdateConf()
  w.update_config(filename='.copysh_conf', option='ch_record', new_value='False')
@@ -0,0 +1,417 @@
+ # -*- coding: UTF-8 –*-
+ import os
+ import platform
+ import warnings
+ import getpass
+ import sys
+ import configparser
+ import datetime
+ import shutil
+ import time
+ import re
+ import socket
+ from dateutil.utils import today
+ from mdbq.bdup import bdup
+ from mdbq.aggregation import aggregation
+ from mdbq.aggregation import query_data
+ from mdbq.aggregation import optimize_data
+ from mdbq.config import update_conf
+ from mdbq.config import get_myconf
+ from mdbq.config import set_support
+ from mdbq.config import products
+ from mdbq.mysql import mysql
+ from mdbq.pbix import refresh_all
+ from mdbq.other import sku_picture
+ warnings.filterwarnings('ignore')
+
+
+ class TbFiles:
+ """
+ 用于在公司台式机中 定时同步pandas数据源文件到共享
+ """
+ def __init__(self):
+
+ support_path = set_support.SetSupport(dirname='support').dirname
+
+ self.my_conf = os.path.join(support_path, '.copysh_conf')
+ self.path1 = os.path.join(support_path, 'tb_list.txt')
+ self.path2 = os.path.join(support_path, 'cp_list.txt')
+ self.d_path = None
+ self.data_path = None
+ self.share_path = None
+ self.before_max_time = []
+ self.sleep_minutes = 30
+ self.tomorrow = datetime.date.today()
+
+ def check_change(self):
+ """ 检查 source_path 的所有文件修改日期, 函数返回最新修改日期 """
+ source_path = os.path.join(self.data_path, 'pandas数据源')
+ if not os.path.exists(source_path):
+ return
+ results = []
+ for root, dirs, files in os.walk(source_path, topdown=False):
+ for name in files:
+ if '~$' in name or 'baiduyun' in name or name.startswith('.') or 'Icon' in name or 'xunlei' in name:
+ continue # 排除这些文件的变动
+ # stat_info = os.path.getmtime(os.path.join(root, name))
+ _c = os.stat(os.path.join(root, name)).st_mtime # 读取文件的元信息 >>>文件修改时间
+ c_time = datetime.datetime.fromtimestamp(_c) # 格式化修改时间
+ results.append(c_time)
+ return max(results).strftime('%Y%m%d%H%M%S')
+
+ def check_conf(self):
+ if not os.path.isfile(self.my_conf):
+ self.set_conf() # 添加配置文件
+ print('因缺少配置文件, 已自动初始化')
+ config = configparser.ConfigParser() # 初始化configparser类
+ try:
+ config.read(self.my_conf, 'UTF-8')
+ self.d_path = config.get('database', 'd_path')
+ self.data_path = config.get('database', 'data_path')
+ self.share_path = config.get('database', 'share_path')
+ if self.d_path is None or self.data_path is None or self.share_path is None:
+ self.set_conf()
+ print('配置文件部分值不完整, 已自动初始化')
+ if not os.path.exists(self.d_path) or not os.path.exists(self.data_path) or not os.path.exists(self.share_path):
+ self.set_conf()
+ print('配置文件异常(可能跨系统), 已自动初始化')
+ except Exception as e:
+ print(e)
+ print('配置文件部分值缺失, 已自动初始化')
+ self.set_conf()
+ sys.path.append(self.share_path)
+
+ def set_conf(self):
+ if platform.system() == 'Windows':
+ self.d_path = os.path.join('C:\\Users', getpass.getuser(), 'Downloads')
+ self.data_path = os.path.join('C:\\同步空间', 'BaiduSyncdisk')
+ self.share_path = os.path.join('\\\\192.168.1.198', '时尚事业部\\01.运营部\\天猫报表') # 共享文件根目录
+ elif platform.system() == 'Darwin':
+ self.d_path = os.path.join('/Users', getpass.getuser(), 'Downloads')
+ self.data_path = os.path.join('/Users', getpass.getuser(), '数据中心')
+ self.share_path = os.path.join('/Volumes/时尚事业部/01.运营部/天猫报表') # 共享文件根目录
+ else:
+ self.d_path = 'Downloads'
+ self.data_path = os.path.join(getpass.getuser(), '数据中心')
+ self.share_path = os.path.join('/Volumes/时尚事业部/01.运营部/天猫报表') # 共享文件根目录
+
+ if not os.path.exists(self.share_path):
+ self.share_path = re.sub('时尚事业部', '时尚事业部-1', self.share_path)
+
+ with open(self.my_conf, 'w+', encoding='utf-8') as f:
+ f.write('[database]\n')
+ f.write(f'# 配置文件\n')
+ f.write('# 下载目录\n')
+ f.write(f'd_path = {self.d_path}\n\n')
+ f.write('# 数据中心目录\n')
+ f.write(f'data_path = {self.data_path}\n\n')
+ f.write('# 共享目录\n')
+ f.write(f'share_path = {self.share_path}\n\n')
+ f.write('# 公司台式机中,用于触发下载百度云文件,更新至本机数据库\n')
+ f.write(f'ch_record = False\n\n')
+ print('目录初始化!')
+
+ def tb_file(self):
+
+ self.check_conf() # 检查配置文件
+
+ now_max_time = self.check_change()
+ if now_max_time in self.before_max_time:
+ return # 不更新
+ else:
+ self.before_max_time = [] # 重置变量,以免越来越占内存
+ self.before_max_time.append(now_max_time)
+
+ now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
+ res = self.check_upload_mysql()
+ if not res:
+ print(f'检测到源文件修改, 但今日已经同步过, 不再同步')
+ return
+ print(f'{now}pandas数据源文件修改, 触发同步 ({self.sleep_minutes}分钟后开始)')
+
+ if not os.path.exists(self.data_path):
+ print(f'{self.data_path}: 本地目录不存在或配置文件异常, 无法同步此目录')
+ return None
+ if not os.path.exists(self.share_path):
+ print(f'{self.share_path}: 本机未连接共享或配置文件异常, 无法同步')
+ return None
+
+ time.sleep(self.sleep_minutes*60) # 开始同步前休眠时间
+ recent_time = 48 # 同步近N小时内更新过的文件,单位:小时
+ tb_list = []
+ pd_list = []
+ try:
+ with open(self.path1, 'r', encoding='utf-8') as f:
+ content = f.readlines()
+ content = [item.strip() for item in content if not item.strip().startswith('#')]
+ tb_list = [item for item in content if item]
+
+ with open(self.path2, 'r', encoding='utf-8') as f:
+ content = f.readlines()
+ content = [item.strip() for item in content if not item.strip().startswith('#')]
+ pd_list = [item for item in content if item]
+ except Exception as e:
+ print(e)
+
+ source_path = os.path.join(self.data_path, 'pandas数据源') # \BaiduSyncdisk\pandas数据源
+ target_path = os.path.join(self.share_path, 'pandas数据源') # \01.运营部\天猫报表\pandas数据源
+
+ if not os.path.exists(target_path): # 检查共享主目录,创建目录
+ os.makedirs(target_path, exist_ok=True)
+
+ # 删除共享的副本
+ file_list = os.listdir(self.share_path)
+ for file_1 in file_list:
+ if '副本_' in file_1 or 'con' in file_1: # or '.DS' in file_1
+ try:
+ os.remove(os.path.join(self.share_path, file_1))
+ print(f'移除: {os.path.join(self.share_path, file_1)}')
+ except Exception as e:
+ print(e)
+ print(f'移除失败:{os.path.join(self.share_path, file_1)}')
+ file_list2 = os.listdir(target_path) # 删除乱七八糟的临时文件
+ for file_1 in file_list2:
+ if '.DS' in file_1 or 'con' in file_1:
+ try:
+ os.remove(os.path.join(target_path, file_1))
+ print(f'移除: {os.path.join(target_path, file_1)}')
+ except Exception as e:
+ print(e)
+
+ # 删除 run_py的 副本
+ del_p = os.path.join(self.data_path, '自动0备份', 'py', '数据更新', 'run_py')
+ for file_1 in os.listdir(del_p):
+ if '副本_' in file_1:
+ try:
+ os.remove(os.path.join(del_p, file_1))
+ print(f'移除: {os.path.join(del_p, file_1)}')
+ except Exception as e:
+ print(e)
+ print(f'移除失败:{os.path.join(del_p, file_1)}')
+
+ now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ print(f'{now} 正在同步文件...')
+ # 复制 run_py的文件到共享
+ for file_1 in tb_list:
+ s = os.path.join(del_p, file_1)
+ t = os.path.join(self.share_path, file_1)
+ try:
+ shutil.copy2(s, t)
+ now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
+ print(f'{now}复制: {s}')
+ except Exception as e:
+ print(e)
+ s1 = os.path.join(del_p, f'副本_{file_1}')
+ t1 = os.path.join(self.share_path, f'副本_{file_1}')
+ shutil.copy2(s, s1) # 创建副本
+ shutil.copy2(s1, t1) # 复制副本到共享
+ now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
+ print(f'{now}已创建副本 -->> {s1}')
+
+ # 同步 pandas 文件到共享
+ now_time = time.time()
+ for filenames in pd_list:
+ src = os.path.join(source_path, filenames) # 原位置,可能是文件或文件夹
+ dst = os.path.join(target_path, filenames) # 目标位置,可能是文件或文件夹
+ if os.path.isdir(src): # 如果是文件夹
+ for root, dirs, files in os.walk(src, topdown=False):
+ for name in files:
+ if '~$' in name or 'DS_Store' in name:
+ continue
+ if name.endswith('csv') or name.endswith('xlsx') or name.endswith('pbix') or name.endswith(
+ 'xls'):
+ new_src = os.path.join(root, name)
+ # share_path = dst + '\\' + new_src.split(src)[1] # 拼接目标路径
+ share_path = os.path.join(f'{dst}{new_src.split(src)[1]}') # 拼接目标路径
+ ls_paths = os.path.dirname(os.path.abspath(share_path)) # 获取上级目录,用来创建
+ if not os.path.exists(ls_paths): # 目录不存在则创建
+ os.makedirs(ls_paths, exist_ok=True)
+ c_stat = os.stat(new_src).st_mtime # 读取文件的元信息 >>>文件修改时间
+ if now_time - c_stat < recent_time * 3600: # 仅同步近期更新的文件
+ # res_name = os.path.basename(new_src)
+ try:
+ shutil.copy2(new_src, share_path)
+ now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
+ print(f'{now}复制文件: {new_src}')
+ except Exception as e:
+ print(e)
+ elif os.path.isfile(src) and 'DS_Store' not in src: # 如果是文件
+ if src.endswith('csv') or src.endswith('xlsx') or src.endswith('pbix') or src.endswith('xls'):
+ c_stat = os.stat(src).st_mtime # 读取文件的元信息 >>>文件修改时间
+ if now_time - c_stat < recent_time * 3600:
+ ls_paths = os.path.dirname(os.path.abspath(src)) # 获取上级目录,用来创建
+ if not os.path.exists(ls_paths): # 目录不存在则创建
+ os.makedirs(ls_paths, exist_ok=True)
+ # new_name = os.path.basename(src)
+ try:
+ shutil.copy2(src, dst)
+ now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
+ print(f'{now}复制文件: {src}')
+ except Exception as e:
+ print(e)
+ else:
+ print(f'{src} 所需同步的文件不存在,请检查:pd_list参数')
+
+ # 刷新共享位置的指定文件/文件夹
+ excel_path = os.path.join(self.share_path, 'EXCEL报表')
+ files = os.listdir(excel_path)
+ files = [f'{excel_path}\\{item}' for item in files if item.endswith('.xlsx') or item.endswith('.xls')]
+ r = refresh_all.RefreshAll()
+ for file in files:
+ if '~' in file or 'DS_Store' in file or 'baidu' in file or 'xunlei' in file:
+ continue
+ if file.endswith('.xlsx') or file.endswith('.xls'):
+ r.refresh_excel(file=file)
+ time.sleep(5)
+
+ # 临时加的
+ # excel_file = f'\\\\192.168.1.198\\时尚事业部\\01.运营部\\0-电商周报-每周五更新\\0-WLM_运营周报-1012输出.xlsx'
+ dir_files = f'\\\\192.168.1.198\\时尚事业部\\01.运营部\\0-电商周报-每周五更新'
+ files = os.listdir(dir_files)
+ for file in files:
+ if file.endswith('.xlsx') and file.startswith('0-WLM_运营周报') and '~' not in file and 'baidu' not in file:
+ excel_file = os.path.join(dir_files, file)
+ r.refresh_excel(file=excel_file)
+
+ self.before_max_time = self.check_change() # 重置值, 避免重复同步
+
+ now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ print(f'{now} 同步完成!')
+
+ def check_upload_mysql(self):
+ # 每天只更新一次
+ today = datetime.date.today()
+ if today == self.tomorrow:
+ self.tomorrow = today + datetime.timedelta(days=1)
+ return True
+ else:
+ return False
+
+
+ class UpdateMysql:
+ def __init__(self):
+ support_path = set_support.SetSupport(dirname='support').dirname
+ self.my_conf = os.path.join(support_path, '.copysh_conf')
+ self.ch_record = False
+ self.d_path = None
+
+ def check_date(self):
+ """ 检查公司台式机 .copysh_conf 文件中的 ch_record 值,决定是否执行更新"""
+ config = configparser.ConfigParser() # 初始化configparser类
+ try:
+ config.read(self.my_conf, 'UTF-8')
+ self.ch_record = config.get('database', 'ch_record').lower()
+ self.d_path = config.get('database', 'd_path')
+ except Exception as e:
+ print(e)
+ if self.ch_record == 'false':
+ return False, self.d_path
+ elif self.ch_record == 'true':
+ return True, self.d_path
+ else:
+ print(f'配置可能有误: {self.ch_record}, self.ch_record 值应为: true 或 false')
+ return False, self.d_path
+
+
+ def op_data(days: int =100):
+
+ # 清理数据库, 除了 聚合数据
+ if socket.gethostname() == 'company': # 公司台式机自身运行
+ # # Mysql
+ # username, password, host, port = get_myconf.select_config_values(
+ # target_service='company',
+ # database='mysql',
+ # )
+ # s = mysql.OptimizeDatas(username=username, password=password, host=host, port=port)
+ # s.db_name_lists = [
+ # '京东数据2',
+ # '推广数据2',
+ # '市场数据2',
+ # '生意参谋2',
+ # '生意经2',
+ # '属性设置2',
+ # # '聚合数据', # 不在这里清理聚合数据, 还未开始聚合呢
+ # ]
+ # s.days = days
+ # s.optimize_list()
+
+ # 清理所有非聚合数据的库
+ optimize_data.op_data(
+ db_name_lists=[
+ '京东数据2',
+ '推广数据2',
+ '市场数据2',
+ '生意参谋2',
+ '生意经2',
+ '属性设置2',
+ # '聚合数据', # 不在这里清理聚合数据, 还未开始聚合呢
+ ],
+ days=days,
+ )
+
+ # 数据聚合
+ query_data.data_aggregation(service_databases=[{'company': 'mysql'}], months=3,)
+ time.sleep(60)
+
+ # 清理聚合数据
+ optimize_data.op_data(db_name_lists=['聚合数据'], days=3650, )
+
+
+ def main():
+ t = TbFiles()
+ u = UpdateMysql()
+ while True:
+ res, d_path = u.check_date() # 文件中的 ch_record 值,决定是否执行更新
+ if res:
+ upload_path = f'windows/{str(datetime.date.today().strftime("%Y-%m"))}/{str(datetime.date.today())}'
+ b = bdup.BaiDu()
+ b.download_dir(local_path=d_path, remote_path=upload_path)
+
+ dp = aggregation.DatabaseUpdate(path=d_path)
+ dp.new_unzip(is_move=True)
+ dp.cleaning(is_move=True, is_except=[]) # 公司台式机需要移除自身下载的文件
+ dp.upload_df(service_databases=[{'company': 'mysql'}])
+ dp.date_table(service_databases=[{'company': 'mysql'}]) # 因为日期表不受 days 参数控制,因此单独更新日期表
+ dp.other_table(service_databases=[{'company': 'mysql'}]) # 上传 support 文件夹下的 主推商品.csv
+ # 更新货品年份基准表, 属性设置 2 - 货品年份基准
+ p = products.Products()
+ p.to_mysql(service_databases=[
+ # {'home_lx': 'mysql'},
+ {'company': 'mysql'}
+ ]
+ )
+
+ if datetime.datetime.now().day in [1, 3, 7, 9, 12, 16, 19, 22, 25, 27]:
+ sku_picture.download_spu(
+ service_name='company',
+ database='mysql',
+ db_name='属性设置2',
+ table_name='商品spu素材下载记录',
+ col_name='商品图片',
+ save_path=os.path.join(f'\\\\192.168.1.198\\时尚事业部\\01.运营部\\天猫报表\\其他文件', '商品id_商家编码_图片'),
+ )
+
+ # 此操作用于修改 .copysh_conf 文件,将 ch_record 改为 false (更新完成)
+ w = update_conf.UpdateConf()
+ w.update_config(filename='.copysh_conf', option='ch_record', new_value='False')
+ time.sleep(60)
+ op_data(days=100) # 数据清理和聚合
+
+ t.sleep_minutes = 5 # 同步前休眠时间
+ t.tb_file()
+ time.sleep(600) # 检测间隔
+
+
+ if __name__ == '__main__':
+ main()
+ # # 聚合数据,并清理聚合数据
+ # query_data.data_aggregation(service_databases=[{'company': 'mysql'}], months=1)
+
+ # sku_picture.download_spu(
+ # service_name='company',
+ # database='mysql',
+ # db_name='属性设置2',
+ # table_name='商品spu素材下载记录',
+ # col_name='商品图片',
+ # save_path=os.path.join(f'\\\\192.168.1.198\\时尚事业部\\01.运营部\\天猫报表\\其他文件', '商品id_商家编码_图片'),
+ # )
@@ -37,6 +37,7 @@ class DataFrameConverter(object):
  df.replace(to_replace=['="'], value='', regex=True, inplace=True) # ="和"不可以放在一起清洗, 因为有: id=86785565
  df.replace(to_replace=['"'], value='', regex=True, inplace=True)
  cols = df.columns.tolist()
+
  df.reset_index(inplace=True, drop=True) # 重置索引,避免下面的 df.loc[0, col] 会出错

  for col in cols:
@@ -81,7 +82,9 @@ class DataFrameConverter(object):
  df[col] = df[col].apply(lambda x: pd.to_datetime(x))
  except:
  pass
- new_col = col.lower()
+ new_col = re.sub(r'[()()-,,$%&~^、* ]', '_', col.lower())
+ new_col = re.sub(r'_{2,}', '_', new_col)
+ new_col = re.sub(r'_+$', '', new_col)
  df.rename(columns={col: new_col}, inplace=True)
  df.fillna(0, inplace=True)
  return df
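The three re.sub calls added above lowercase a column name, turn brackets and punctuation into underscores, collapse repeated underscores, and strip any trailing underscore before the rename. A standalone sketch of the same transformation (the helper name and sample value are illustrative, not part of the package):

    import re

    def normalize_col(col: str) -> str:
        # same three substitutions as the lines added to converter.py
        new_col = re.sub(r'[()()-,,$%&~^、* ]', '_', col.lower())
        new_col = re.sub(r'_{2,}', '_', new_col)
        return re.sub(r'_+$', '', new_col)

    print(normalize_col('支付转化率 (%)'))  # -> 支付转化率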
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: mdbq
- Version: 2.6.5
+ Version: 2.6.7
  Home-page: https://pypi.org/project/mdbsql
  Author: xigua,
  Author-email: 2587125111@qq.com
@@ -1,18 +1,19 @@
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
  mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
- mdbq/aggregation/aggregation.py,sha256=5WnLHNResPSMNNFYqt2trvw3PQM3XCHQD-XMZkfMYBM,76602
+ mdbq/aggregation/aggregation.py,sha256=aAAYq3-I4dqqXFFGwznihDl9ELajfi1NTIFdPFJ0Z_0,76606
  mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
  mdbq/aggregation/mysql_types.py,sha256=DQYROALDiwjJzjhaJfIIdnsrNs11i5BORlj_v6bp67Y,11062
  mdbq/aggregation/optimize_data.py,sha256=gdScrgTAb6RbXHZy1LitX7lggMGn1GTLhkYSgztfwew,4903
- mdbq/aggregation/query_data.py,sha256=0NGYmfl1klQryriHu4V6_Twi9WPERHbl56X3kUqmZaY,102619
+ mdbq/aggregation/query_data.py,sha256=Ob5PHdAzYF4gcZ85YX4R91-XKB53Gs48OKzAXGNK-6M,102603
  mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
  mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
  mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
- mdbq/clean/clean_upload.py,sha256=I9aJL-674ISOi5ZAbeGViRRKlcMW2bXQ2TGGBOQvzh4,81148
+ mdbq/clean/clean_upload.py,sha256=js0lngM43eAFUQ_J3RvqE4HFMO-9VAWTUbNY1H98LZo,83703
  mdbq/clean/data_clean.py,sha256=ucfslhqXVZoH2QaXHSAWDky0GhIvH9f4GeNaHg4SrFE,104790
  mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
- mdbq/company/copysh.py,sha256=NvlXCBZBcO2GIT5nLRYYqhOyHWM1-1RE7DHvgbj6jmQ,19723
+ mdbq/company/copysh.py,sha256=6RR2wbpUXHCrbdPzBJZOvPx_mhWWCtqeZwZ7x5B-r5s,18781
+ mdbq/company/copysh_bak.py,sha256=NvlXCBZBcO2GIT5nLRYYqhOyHWM1-1RE7DHvgbj6jmQ,19723
  mdbq/company/home_sh.py,sha256=42CZ2tZIXHLl2mOl2gk2fZnjH2IHh1VJ1s3qHABjonY,18021
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
  mdbq/config/get_myconf.py,sha256=cmNvsyoNa0RbZ9FOTjSd3jyyGwkxjUo0phvdHbGlrms,6010
@@ -20,7 +21,7 @@ mdbq/config/products.py,sha256=hN9UMkM6j76HYMulTYdtr3mOhh9QdpvvrLH14a_mbFY,5980
  mdbq/config/set_support.py,sha256=xkZCX6y9Bq1ppBpJAofld4B2YtchA7fl0eT3dx3CrSI,777
  mdbq/config/update_conf.py,sha256=taL3ZqKgiVWwUrDFuaYhim9a72Hm4BHRhhDscJTziR8,4535
  mdbq/dataframe/__init__.py,sha256=2HtCN8AdRj53teXDqzysC1h8aPL-mMFy561ESmhehGQ,22
- mdbq/dataframe/converter.py,sha256=KNHxk3dNw1ycOpcnTg83yHrV9B3pvoYwK3Wc_bzk2NE,4314
+ mdbq/dataframe/converter.py,sha256=3n3_FKBxv7bFWeRcmv9CfiApFXuvvbRwZxTwR-SLGzU,4461
  mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
  mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
  mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
@@ -42,7 +43,7 @@ mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
  mdbq/req_post/req_tb.py,sha256=PexWSCPJNM6Tv0ol4lAWIhlOwsAr_frnjtcdSHCFiek,36179
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
  mdbq/spider/aikucun.py,sha256=4Y5zd64hZUFtll8AdpUc2napDas-La-A6XzAhb2mLv0,17157
- mdbq-2.6.5.dist-info/METADATA,sha256=mUyIb-qC1-GsTA2eIp_1_-oUUkJa8rXo9eLDw9PmUko,245
- mdbq-2.6.5.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
- mdbq-2.6.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
- mdbq-2.6.5.dist-info/RECORD,,
+ mdbq-2.6.7.dist-info/METADATA,sha256=85v7mF0rMpWyIE8IJqMd8h8LUXHctsnE6qcQWhl4ENs,245
+ mdbq-2.6.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+ mdbq-2.6.7.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+ mdbq-2.6.7.dist-info/RECORD,,