mdbq 2.5.8__py3-none-any.whl → 2.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/optimize_data.py +41 -40
- mdbq/clean/clean_upload.py +263 -47
- mdbq/mysql/mysql.py +7 -5
- mdbq/spider/aikucun.py +64 -7
- {mdbq-2.5.8.dist-info → mdbq-2.5.9.dist-info}/METADATA +2 -2
- {mdbq-2.5.8.dist-info → mdbq-2.5.9.dist-info}/RECORD +8 -8
- {mdbq-2.5.8.dist-info → mdbq-2.5.9.dist-info}/WHEEL +1 -1
- {mdbq-2.5.8.dist-info → mdbq-2.5.9.dist-info}/top_level.txt +0 -0
mdbq/aggregation/optimize_data.py CHANGED
@@ -57,8 +57,8 @@ def restart_mongodb():
     subprocess.call(command, shell=True)
 
 
-def op_data(db_name_lists, service_databases=None, days: int = 63, is_mongo=True, is_mysql=True):
-    """
+def op_data(db_name_lists, service_databases=[{'home_lx': 'mysql', 'home_lx': 'mongodb'}], days: int = 63, is_mongo=True, is_mysql=True):
+    """ """
     # for service_database in service_databases:
     #     for service_name, database in service_database.items():
     #         username, password, host, port = get_myconf.select_config_values(target_service=service_name, database=database)
@@ -68,47 +68,48 @@ def op_data(db_name_lists, service_databases=None, days: int = 63, is_mongo=True
     # ]
     # s.days = days
     # s.optimize_list()
+    for service_database in service_databases:
+        for service_name, database in service_database.items():
+            if socket.gethostname() == 'xigua_lx' or socket.gethostname() == 'xigua1' or socket.gethostname() == 'Mac2.local':
+                # mongodb
+                if is_mongo and database == 'mongodb':
+                    username, password, host, port = get_myconf.select_config_values(
+                        target_service=service_name,
+                        database=database,
+                    )
+                    m = mongo.OptimizeDatas(username=username, password=password, host=host, port=port)
+                    m.db_name_lists = db_name_lists
+                    m.days = days
+                    m.optimize_list()
+                    if m.client:
+                        m.client.close()
+                    print(f'已关闭 mongodb 连接')
 
-
-
-    if is_mongo:
-        username, password, host, port = get_myconf.select_config_values(
-            target_service='home_lx',
-            database='mongodb',
-        )
-        m = mongo.OptimizeDatas(username=username, password=password, host=host, port=port)
-        m.db_name_lists = db_name_lists
-        m.days = days
-        m.optimize_list()
-        if m.client:
-            m.client.close()
-        print(f'已关闭 mongodb 连接')
+                if socket.gethostname() == 'xigua_lx':
+                    restart_mongodb()  # mongodb 太占内存了, 重启服务, 释放内存
 
-
-
+                # Mysql
+                if is_mysql and database == 'mysql':
+                    username, password, host, port = get_myconf.select_config_values(
+                        target_service=service_name,
+                        database=database,
+                    )
+                    s = mysql.OptimizeDatas(username=username, password=password, host=host, port=port)
+                    s.db_name_lists = db_name_lists
+                    s.days = days
+                    s.optimize_list()
 
-
-
-
-
-
-
-
-
-
-
-
-    elif socket.gethostname() == 'company':
-        # Mysql
-        if is_mysql:
-            username, password, host, port = get_myconf.select_config_values(
-                target_service='company',
-                database='mysql',
-            )
-            s = mysql.OptimizeDatas(username=username, password=password, host=host, port=port)
-            s.db_name_lists = db_name_lists
-            s.days = days
-            s.optimize_list()
+            elif socket.gethostname() == 'company':
+                # Mysql
+                if is_mysql and database == 'mysql':
+                    username, password, host, port = get_myconf.select_config_values(
+                        target_service=service_name,
+                        database=database,
+                    )
+                    s = mysql.OptimizeDatas(username=username, password=password, host=host, port=port)
+                    s.db_name_lists = db_name_lists
+                    s.days = days
+                    s.optimize_list()
 
 
 if __name__ == '__main__':
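Note on the op_data rewrite: the hard-coded home_lx and company branches are replaced by a loop over service_databases, a list of {service_name: engine} dicts, dispatched on hostname and engine. A minimal usage sketch (the database name is illustrative):

    from mdbq.aggregation import optimize_data

    optimize_data.op_data(
        db_name_lists=['天猫_生意参谋'],             # databases whose tables get optimized (illustrative)
        service_databases=[{'home_lx': 'mysql'}],   # one dict per service -> engine pair
        days=63,          # only data from the last 63 days
        is_mongo=False,   # skip the MongoDB branch
        is_mysql=True,
    )

Two caveats visible in the diff: the function only does work on the hostnames wired into its branches (xigua_lx, xigua1, Mac2.local, company), and the new default [{'home_lx': 'mysql', 'home_lx': 'mongodb'}] repeats its key, so Python collapses it to {'home_lx': 'mongodb'}; pass two separate dicts to run both engines.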
mdbq/clean/clean_upload.py CHANGED
@@ -83,6 +83,141 @@ class DataClean:
         os.makedirs(_save_paths, exist_ok=True)
         _df.to_csv(os.path.join(_save_paths, filenames), encoding=encoding, index=False, header=True)
 
+    def sycm_tm(self, path=None, is_except=[]):
+        """ 天猫 生意参谋数据 """
+        if not path:
+            path = self.path
+        report_names = [
+            {
+                '文件简称': '商品排行',  # 文件名中包含的字符
+                '数据库名': '天猫_生意参谋',
+                '集合名称': '商品排行',
+            },
+            {
+                '文件简称': '店铺来源_来源构成_万里马官方旗舰店',  # 文件名中包含的字符
+                '数据库名': '天猫_生意参谋',
+                '集合名称': '店铺流量来源构成',
+            },
+        ]
+        for root, dirs, files in os.walk(path, topdown=False):
+            for name in files:
+                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
+                    continue
+                if 'py_xg' in name:
+                    continue
+                is_continue = False
+                if is_except:
+                    for item in is_except:
+                        if item in os.path.join(root, name):
+                            # print(name)
+                            is_continue = True
+                            break
+                if is_continue:  # 需要排除不做处理的文件或文件夹
+                    continue
+
+                # 这里排除掉非目标报表
+                is_continue = False
+                db_name = None  # 初始化参数
+                collection_name = None
+                for item in report_names:
+                    if item['文件简称'] in name:
+                        db_name = item['数据库名']
+                        collection_name = item['集合名称']
+                        is_continue = True
+                if not is_continue:
+                    continue
+                if name.endswith('.xls') and '商品排行_万里马官方旗舰店' in name:
+                    df = pd.read_excel(os.path.join(root, name), header=4)
+                    if len(df) == 0:
+                        print(f'{name} 报表数据为空')
+                        continue
+                    df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
+                    df.replace(to_replace=[','], value='', regex=True, inplace=True)
+                    df.rename(columns={'统计日期': '日期', '商品ID': '商品id'}, inplace=True)
+                    shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)', name)[0]
+                    df.insert(loc=1, column='店铺名称', value=shop_name)
+                    new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
+                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
+                    os.remove(os.path.join(root, name))
+                elif name.endswith('.csv') and '_来源构成_万里马官方旗舰店' in name:
+                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
+                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
+                    os.remove(os.path.join(root, name))
+
+                # 将数据传入 self.datas 等待更新进数据库
+                if not db_name or not collection_name:
+                    # print(f'db_name/collection_name 不能为空')
+                    continue
+                self.datas.append(
+                    {
+                        '数据库名': db_name,
+                        '集合名称': collection_name,
+                        '数据主体': df,
+                        '文件名': name,
+                    }
+                )
+
+    def dmp_tm(self, path=None, is_except=[]):
+        """ 天猫 达摩盘 """
+        if not path:
+            path = self.path
+        report_names = [
+            {
+                '文件简称': '我的人群属性',  # 文件名中包含的字符
+                '数据库名': '达摩盘3',
+                '集合名称': '我的人群属性',
+            },
+        ]
+        for root, dirs, files in os.walk(path, topdown=False):
+            for name in files:
+                if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
+                    continue
+                if 'py_xg' in name:
+                    continue
+                is_continue = False
+                if is_except:
+                    for item in is_except:
+                        if item in os.path.join(root, name):
+                            # print(name)
+                            is_continue = True
+                            break
+                if is_continue:  # 需要排除不做处理的文件或文件夹
+                    continue
+
+                # 这里排除掉非目标报表
+                is_continue = False
+                db_name = None  # 初始化参数
+                collection_name = None
+                for item in report_names:
+                    if item['文件简称'] in name:
+                        db_name = item['数据库名']
+                        collection_name = item['集合名称']
+                        is_continue = True
+                if not is_continue:
+                    continue
+                if name.endswith('.csv') and '人群属性_万里马官方旗舰店' in name:  # 推广类报表
+                    df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
+                    if len(df) == 0:
+                        print(f'{name} 报表数据为空')
+                        continue
+                    new_name = f'py_xg_{os.path.splitext(name)[0]}.csv'
+                    self.save_to_csv(df, root, new_name, encoding='utf-8_sig')
+                    os.remove(os.path.join(root, name))
+
+                # 将数据传入 self.datas 等待更新进数据库
+                if not db_name or not collection_name:
+                    # print(f'db_name/collection_name 不能为空')
+                    continue
+                self.datas.append(
+                    {
+                        '数据库名': db_name,
+                        '集合名称': collection_name,
+                        '数据主体': df,
+                        '文件名': name,
+                    }
+                )
+
     def tg_reports(self, path=None, is_except=[]):
         """ 处理天猫淘宝推广类报表 """
         if not path:
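Both new cleaners share one routing convention: report_names maps a filename fragment ('文件简称') to a target database ('数据库名') and collection ('集合名称'), and a fragment contained in the filename selects the target (the package's loop keeps the last matching entry; with disjoint fragments that amounts to this). Condensed into a stand-alone sketch, where route() is a hypothetical helper, not part of the package:

    report_names = [
        {'文件简称': '商品排行', '数据库名': '天猫_生意参谋', '集合名称': '商品排行'},
        {'文件简称': '我的人群属性', '数据库名': '达摩盘3', '集合名称': '我的人群属性'},
    ]

    def route(filename):
        for item in report_names:
            if item['文件简称'] in filename:
                return item['数据库名'], item['集合名称']
        return None   # unmatched files are skipped by the caller

    print(route('商品排行_万里马官方旗舰店_2024-01-01.xls'))
    # ('天猫_生意参谋', '商品排行')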
@@ -172,7 +307,7 @@ class DataClean:
 
                 # 这里排除掉非推广类报表
                 is_continue = False
-                db_name = None
+                db_name = None  # 初始化参数
                 collection_name = None
                 for item in report_names:
                     if item['文件简称'] in name:
@@ -234,7 +369,7 @@ class DataClean:
 
                 # 将数据传入 self.datas 等待更新进数据库
                 if not db_name or not collection_name:
-                    print(f'db_name/collection_name 不能为空')
+                    # print(f'db_name/collection_name 不能为空')
                     continue
                 self.datas.append(
                     {
@@ -290,7 +425,7 @@ class DataClean:
 
                 # 这里排除掉非目标报表
                 is_continue = False
-                db_name = None
+                db_name = None  # 初始化参数
                 collection_name = None
                 for item in report_names:
                     if item['文件简称'] in name:
@@ -303,7 +438,7 @@ class DataClean:
                 if name.endswith('.csv') and 'baobei' in name:
                     encoding = self.get_encoding(file_path=os.path.join(root, name))
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
-                    pattern = re.findall(r'-(\d{4})(\d{2})(\d{2})
+                    pattern = re.findall(r'-(\d{4})(\d{2})(\d{2})\W', name)[0]
                     df['日期'] = '-'.join(pattern)
                     df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                     new_name = f'py_xg_天猫_baobeitrains_{'-'.join(pattern)}.csv'
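The corrected pattern here (and in the 淘宝 twin of this branch further down) ends in \W, so any non-word character after the date still delimits the match; that covers both a plain .csv suffix and browser duplicates like ...27 (2).csv. Demonstration with invented filenames:

    import re

    for name in ('baobeitrains-20240101.csv', 'baobeitrains-20240101 (2).csv'):
        y, m, d = re.findall(r'-(\d{4})(\d{2})(\d{2})\W', name)[0]
        print('-'.join((y, m, d)))   # 2024-01-01 both times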
@@ -327,7 +462,7 @@ class DataClean:
                 elif name.endswith('.csv') and '省份城市分析' in name:
                     encoding = self.get_encoding(file_path=os.path.join(root, name))
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
-                    pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})
+                    pattern = re.findall(r'(.*[\u4e00-\u9fa5])(\d{4})(\d{2})(\d{2})\W', name)[0]  # 注意后面可能有小括号 ...27 (2).csv
                     date = '-'.join(pattern[1:])
                     new_name = f'py_xg_天猫_{pattern[0]}-{date}.csv'
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
@@ -350,9 +485,9 @@ class DataClean:
                     os.remove(os.path.join(root, name))
                 elif name.endswith('.csv') and '店铺销售指标' in name:
                     # 生意经, 店铺指标,仅限月数据,实际日指标也可以
-                    name_st = re.findall(r'(
+                    name_st = re.findall(r'([\u4e00-\u9fa5]+)\(分日', name)
                     if not name_st:
-                        print(f'{name}
+                        print(f'{name} 正则提取文件名失败')
                         continue
                     encoding = self.get_encoding(file_path=os.path.join(root, name))
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
@@ -375,7 +510,7 @@ class DataClean:
 
                 # 将数据传入 self.datas 等待更新进数据库
                 if not db_name or not collection_name:
-                    print(f'db_name/collection_name 不能为空')
+                    # print(f'db_name/collection_name 不能为空')
                     continue
                 self.datas.append(
                     {
@@ -431,7 +566,7 @@ class DataClean:
 
                 # 这里排除掉非目标报表
                 is_continue = False
-                db_name = None
+                db_name = None  # 初始化参数
                 collection_name = None
                 for item in report_names:
                     if item['文件简称'] in name:
@@ -444,7 +579,7 @@ class DataClean:
                 if name.endswith('.csv') and 'baobei' in name:
                     encoding = self.get_encoding(file_path=os.path.join(root, name))
                     df = pd.read_csv(os.path.join(root, name), encoding=encoding, header=0, na_filter=False)
-                    pattern = re.findall(r'-(\d{4})(\d{2})(\d{2})
+                    pattern = re.findall(r'-(\d{4})(\d{2})(\d{2})\W', name)[0]
                     df['日期'] = '-'.join(pattern)
                     df.replace(to_replace=['--'], value='', regex=False, inplace=True)
                     new_name = f'py_xg_淘宝_baobeitrains_{'-'.join(pattern)}.csv'
@@ -516,7 +651,7 @@ class DataClean:
 
                 # 将数据传入 self.datas 等待更新进数据库
                 if not db_name or not collection_name:
-                    print(f'db_name/collection_name 不能为空')
+                    # print(f'db_name/collection_name 不能为空')
                     continue
                 self.datas.append(
                     {
@@ -582,7 +717,7 @@ class DataClean:
 
                 # 这里排除掉非目标报表
                 is_continue = False
-                db_name = None
+                db_name = None  # 初始化参数
                 collection_name = None
                 for item in report_names:
                     if item['文件简称'] in name:
@@ -633,7 +768,7 @@ class DataClean:
 
                 # 将数据传入 self.datas 等待更新进数据库
                 if not db_name or not collection_name:
-                    print(f'db_name/collection_name 不能为空')
+                    # print(f'db_name/collection_name 不能为空')
                     continue
                 # print(name)
                 self.datas.append(
@@ -663,6 +798,8 @@ class DataClean:
                             break
                 if is_continue:  # 需要排除不做处理的文件或文件夹
                     continue
+                db_name = None  # 初始化参数
+                collection_name = None
 
                 if name.endswith('.xlsx') and '商品素材_' in name:
                     shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)_', name)[0]
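Initializing db_name and collection_name at the top of each iteration matters because both are assigned only inside the matching branch: without the reset, a file that matches nothing would either raise NameError (first file) or silently reuse the previous file's targets. The guard in miniature, with hypothetical targets:

    files = ['商品素材_整理_万里马官方旗舰店.xlsx', '说明文档.txt']
    for name in files:
        db_name = None            # reset per file, as this hunk now does
        collection_name = None
        if '商品素材_' in name:
            db_name, collection_name = 'some_db', 'some_collection'
        if not db_name or not collection_name:
            continue              # unmatched files are skipped, not mis-routed
        print(name, '->', db_name, collection_name)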
@@ -681,7 +818,7 @@ class DataClean:
 
                 # 将数据传入 self.datas 等待更新进数据库
                 if not db_name or not collection_name:
-                    print(f'db_name/collection_name 不能为空')
+                    # print(f'db_name/collection_name 不能为空')
                     continue
                 self.datas.append(
                     {
@@ -718,6 +855,70 @@ class DataClean:
             os.remove(old_file)  # 如果存在则移除
         shutil.move(os.path.join(path, _name), t2)  # 将文件从下载文件夹移到目标位置
 
+    def move_sycm(self, path=None, is_except=[]):
+        """ 生意参谋 """
+        if not path:
+            path = self.path
+        for root, dirs, files in os.walk(path, topdown=False):
+            for name in files:
+                # print(name)
+                is_continue = False
+                if is_except:
+                    for item in is_except:
+                        # print(item, f'-----', os.path.join(root, name))
+                        if item in os.path.join(root, name):
+                            # print(name)
+                            is_continue = True
+                            break
+                if is_continue:  # 需要排除不做处理的文件或文件夹
+                    continue
+
+                # print(is_except, is_continue)
+                def bib(paths, _as_month=None):
+                    """闭包函数"""
+                    self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
+
+                if 'py_xg' not in name:  # 排除非目标文件
+                    continue
+
+                if name.endswith('.csv') and '商品排行_万里马官方旗舰店' in name:
+                    t_path = os.path.join(self.source_path, '天猫_生意参谋', '商品排行')
+                    bib(t_path, _as_month=True)
+                elif name.endswith('.csv') and '店铺来源_来源构成_万里马官方旗舰店' in name:
+                    t_path = os.path.join(self.source_path, '天猫_生意参谋', '店铺流量来源')
+                    bib(t_path, _as_month=True)
+
+    def move_dmp(self, path=None, is_except=[]):
+        """ 达摩盘 """
+        if not path:
+            path = self.path
+        for root, dirs, files in os.walk(path, topdown=False):
+            for name in files:
+                # print(name)
+                is_continue = False
+                if is_except:
+                    for item in is_except:
+                        # print(item, f'-----', os.path.join(root, name))
+                        if item in os.path.join(root, name):
+                            # print(name)
+                            is_continue = True
+                            break
+                if is_continue:  # 需要排除不做处理的文件或文件夹
+                    continue
+
+                # print(is_except, is_continue)
+                def bib(paths, _as_month=None):
+                    """闭包函数"""
+                    self.move_files(path=path, _name=name, target_path=paths, _as_month=_as_month)
+
+                if 'py_xg' not in name:  # 排除非目标文件
+                    continue
+
+                if name.endswith('.csv') and '人群属性_万里马官方旗舰店' in name:
+                    t_path = os.path.join(self.source_path, '天猫_达摩盘', '我的人群属性')
+                    bib(t_path, _as_month=True)
+
+
     # @try_except
     def move_sjy(self, path=None, is_except=[]):
         if not path:
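Both movers funnel every match through a small closure, bib, which captures the current name and source path from the enclosing loop, so each branch only supplies the destination (and whether to bucket by month). The same pattern self-contained, with make_mover standing in for the bound method:

    import os, shutil

    def make_mover(src_dir, filename):
        def bib(target_dir):
            # stand-in for self.move_files(path=..., _name=..., target_path=...)
            os.makedirs(target_dir, exist_ok=True)
            shutil.move(os.path.join(src_dir, filename),
                        os.path.join(target_dir, filename))
        return bib

    # mover = make_mover('/tmp/下载', 'py_xg_商品排行_万里马官方旗舰店.csv')
    # mover('/tmp/原始文件/天猫_生意参谋/商品排行')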
@@ -1142,8 +1343,10 @@ class DataClean:
         df_to_json.as_json_file()  # 写入 json 文件, 包含数据的 dtypes 信息
 
 
-def main(service_databases=None):
-
+def main(service_databases=None, is_mysql=False):
+    """
+    is_mysql: 调试时加,False: 是否后续的聚合数据
+    """
 
     if not service_databases:
         service_databases = [
@@ -1153,23 +1356,30 @@ def main(service_databases=None):
             # {'nas': 'mysql'},
         ]
 
-
+    cn = DataClean(
         path=upload_path,  # 源文件目录,下载文件夹
         source_path=source_path3,  # 原始文件保存目录
         service_databases=service_databases
     )
-
-
-
-
-
-
-
-
-
-
-
-
+    cn.new_unzip(is_move=True)  # 解压文件, is_move 解压后是否删除原 zip 压缩文件
+    cn.sycm_tm(is_except=['except'])  # 天猫生意参谋
+    cn.dmp_tm(is_except=['except'])  # 达摩盘
+    cn.tg_reports(is_except=['except'])  # 推广报表,天猫淘宝共同清洗
+    cn.syj_reports_tm(is_except=['except'])  # 天猫生意经
+    # cn.syj_reports_tb(is_except=['except'])  # 淘宝生意经,不可以和天猫同时运行
+    cn.jd_reports(is_except=['except'])  # 清洗京东报表
+    cn.sp_scene_clean(is_except=['except'])  # 商品素材
+    cn.upload_df(service_databases=service_databases)  # 上传数据库
+
+    cn.move_sycm(is_except=['临时文件', ])  # 生意参谋,移到文件到原始文件夹
+    cn.move_dmp(is_except=['临时文件', ])  # 达摩盘
+    cn.move_sjy(is_except=['临时文件',])  # 生意经,移到文件到原始文件夹
+    cn.move_jd(is_except=['临时文件', ])  # 京东,移到文件到原始文件夹
+    cn.move_tg_tm(is_except=['临时文件', ])  # 天猫,移到文件到原始文件夹
+    cn.move_tg_tb(is_except=['临时文件', ])  # 淘宝店,移到文件到原始文件夹
+
+    if not is_mysql:
+        return
 
     # 更新货品年份基准表, 属性设置 2 - 货品年份基准
     p = products.Products()
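The net effect of the new is_mysql flag: cleaning, uploading, and the file moves always run, while the heavier tail (goods-year baseline, op_data, aggregation) is reached only when is_mysql=True. In outline, with the bodies reduced to prints:

    def main(service_databases=None, is_mysql=False):
        print('clean, upload, move files')   # cn.sycm_tm(...) ... cn.move_tg_tb(...)
        if not is_mysql:
            return                           # debug runs stop here
        print('aggregation tail')            # products baseline, op_data(), query_data

    main(is_mysql=False)   # prints only the first stage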
@@ -1196,6 +1406,7 @@ def main(service_databases=None):
         days=100,
         is_mongo=True,
         is_mysql=True,
+        service_databases=service_databases
     )
 
     # 数据聚合
@@ -1214,7 +1425,7 @@ def main(service_databases=None):
 
 def test():
     # main_key = '单元报表'
-    path = f'/Users/xigua/数据中心/原始文件2
+    path = f'/Users/xigua/数据中心/原始文件2/生意参谋/商品排行qweqeqwe'
     for root, dirs, files in os.walk(path, topdown=False):
         for name in files:
             if '~$' in name or '.DS' in name or '.localized' in name or '.jpg' in name or '.png' in name:
@@ -1227,30 +1438,34 @@ def test():
             if name.endswith('.csv'):
                 print(name)
                 df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-
-
-
-
-
-
+                if '店铺名称' not in df.columns.tolist():
+                    df.insert(loc=1, column='店铺名称', value='万里马官方旗舰店')
+                df.replace(to_replace=['-'], value=0, regex=False, inplace=True)
+                df.replace(to_replace=[','], value='', regex=True, inplace=True)
+                if '统计日期' in df.columns.tolist() and '日期' not in df.columns.tolist():
+                    df.rename(columns={'统计日期': '日期', '商品ID': '商品id'}, inplace=True)
+                # shop_name = re.findall(r'_([\u4e00-\u9fffA-Za-z]+店)', name)[0]
+                # df.insert(loc=1, column='店铺名称', value=shop_name)
+
+                date_all = re.findall(r'_(\d{4}-\d{2}-\d{2})_', name)[0]
 
                 date = re.findall(r'_(\d{4}-\d{2})-\d{2}', name)[0]
 
-                new_path = f'/Users/xigua/数据中心/原始文件3
+                new_path = f'/Users/xigua/数据中心/原始文件3/天猫_生意参谋/商品排行/{date}'
                 # new_path = os.path.join(new_path, date)  # 添加 年月分类
                 if not os.path.exists(new_path):
                     os.makedirs(new_path, exist_ok=True)
                 # print(date_all)
 
-                new_name = f'py_xg_
+                new_name = f'py_xg_商品排行_万里马官方旗舰店_{date_all}.csv'
                 # print(os.path.join(new_path, new_name))
                 # breakpoint()
-
-                try:
-
-
-                except Exception as e:
-
+                df.to_csv(os.path.join(new_path, new_name), encoding='utf-8_sig', index=False, header=True)
+                # try:
+                #     df.to_excel(os.path.join(new_path, new_name),
+                #         index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
+                # except Exception as e:
+                #     print(e)
 
 
 
@@ -1258,11 +1473,12 @@ def test():
 if __name__ == '__main__':
     main(
         service_databases = [
-
-            {'home_lx': 'mysql'},
+            {'company': 'mysql'},
+            # {'home_lx': 'mysql'},
             # {'home_lx': 'mongodb'},
             # {'nas': 'mysql'},
-        ]
+        ],
+        is_mysql = False,
     )
 
     # c = DataClean(
mdbq/mysql/mysql.py CHANGED
@@ -68,7 +68,7 @@ class MysqlUpload:
 
         return wrapper
 
-    @try_except
+    # @try_except
    def df_to_mysql(self, df, table_name, db_name='远程数据源', icm_update=[], service_database={'home_lx': 'mysql'}, move_insert=False, df_sql=False, drop_duplicates=False, filename=None, count=None, json_path=None, reset_id=False):
         """
         将 df 写入数据库
@@ -403,6 +403,8 @@ class MysqlUpload:
             return 'INT'
         elif dtype == 'float64':
             res = find_longest_decimal_value(df[col].tolist())  # 取小数位数最长的值
+            if 'e' in str(res):
+                res = round(float(res), 4)
             int_step = len(str(res).split('.')[0])  # 整数位数长度
             f_step = len(str(res).split('.')[1])  # 小数位数长度
 
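The two-line guard added here matters because str() renders very small or very large floats in scientific notation, and the digit counting just below assumes a plain 'integer.fraction' form:

    res = 1e-05
    print(str(res))              # '1e-05': no '.', so str(res).split('.')[1] raises IndexError
    res = round(float(res), 4)   # -> 0.0, whose '0.0' form splits cleanly
    print(str(res).split('.'))   # ['0', '0']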
@@ -415,17 +417,17 @@ class MysqlUpload:
         elif int_step >= 4 and f_step >= 0:
             return 'decimal(10, 2)'
         elif int_step >= 2 and f_step >= 6:
-            return 'decimal(12,
+            return 'decimal(12, 4)'
         elif int_step >= 2 and f_step > 4:
-            return 'decimal(
+            return 'decimal(12, 4)'
         elif int_step >= 2 and f_step > 2:
             return 'decimal(10, 4)'
         elif int_step >= 2 and f_step >= 0:
             return 'decimal(10, 2)'
         elif int_step >= 1 and f_step >= 6:
-            return 'decimal(
+            return 'decimal(12, 4)'
         elif int_step >= 1 and f_step > 4:
-            return 'decimal(
+            return 'decimal(12, 4)'
         elif int_step >= 1 and f_step > 2:
             return 'decimal(10, 4)'
         else:
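After this hunk, every visible branch with five or more decimal digits maps to decimal(12, 4), replacing the previously inconsistent (and here truncated) widths. Condensed from the branches shown, as a sketch only; branches above and below the hunk are not visible in this diff:

    def decimal_type(int_step: int, f_step: int) -> str:
        if int_step >= 4:
            return 'decimal(10, 2)'
        if f_step > 4:            # the >= 6 and > 4 branches now agree
            return 'decimal(12, 4)'
        if f_step > 2:
            return 'decimal(10, 4)'
        if int_step >= 2:
            return 'decimal(10, 2)'
        raise NotImplementedError('handled by the else branch outside this hunk')

    print(decimal_type(2, 6))    # decimal(12, 4)

Since decimal(12, 4) keeps at most 8 integer digits and 4 decimals, long fractions are now uniformly capped at 4 places.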
mdbq/spider/aikucun.py CHANGED
@@ -8,6 +8,7 @@ import platform
 import re
 import time
 import warnings
+import requests
 import pandas as pd
 from selenium import webdriver
 from selenium.webdriver.support.wait import WebDriverWait
@@ -18,6 +19,7 @@ from mdbq.config import set_support
 from selenium.webdriver.common.keys import Keys
 from mdbq.aggregation import aggregation
 from mdbq.clean import data_clean
+from mdbq.other import ua_sj
 
 warnings.filterwarnings('ignore')
 
@@ -41,6 +43,7 @@ else:
     D_PATH = str(pathlib.Path(f'/Users/{getpass.getuser()}/Downloads'))
     Share_Path = str(pathlib.Path('/Volumes/时尚事业部/01.运营部/天猫报表'))  # 共享文件根目录
     Source_Path = str(pathlib.Path(Data_Path, '原始文件2'))
+upload_path = os.path.join(D_PATH, '数据上传中心')  # 此目录位于下载文件夹
 
 
 def get_cookie_aikucun():
@@ -84,6 +87,7 @@ def get_cookie_aikucun():
     _driver.get(_url)
     time.sleep(0.1)
     _driver.maximize_window()  # 窗口最大化 方便后续加载数据
+    print(f'请登录并切换到百宝箱,再保存 cookies: \n https://treasurebox.aikucun.com/dashboard/commodity/ranking/merchant?LS=true&shopId=1814114991487782914&from=menu&v=0.1936043279838604')
     breakpoint()
 
     d_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
@@ -122,7 +126,7 @@ class AikuCun:
 
     def login(self, shop_name='aikucun'):
         option = webdriver.ChromeOptions()
-
+        option.add_argument("--headless")  # 设置无界面模式
         # 调整chrome启动配置
         option.add_argument("--disable-gpu")
         option.add_argument("--no-sandbox")
@@ -154,6 +158,9 @@ class AikuCun:
         option.add_experimental_option('prefs', prefs)
         option.add_experimental_option('excludeSwitches', ['enable-automation'])  # 实验性参数, 左上角小字
 
+        # 修改默认下载文件夹路径
+        option.add_experimental_option("prefs", {"download.default_directory": f'{upload_path}'})
+
         # # 通过excludeSwitches参数禁用默认的启动路径
         # option.add_experimental_option('excludeSwitches', ['enable-automation'])
 
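One caveat on this hunk: Selenium's add_experimental_option('prefs', ...) assigns the whole prefs capability, so this second call replaces the prefs dict registered a few lines earlier instead of merging with it. If both sets of preferences are needed, build one merged dict (the popups entry below is only an example pref):

    from selenium import webdriver

    option = webdriver.ChromeOptions()
    prefs = {
        'profile.default_content_settings.popups': 0,      # example earlier pref
        'download.default_directory': '/tmp/数据上传中心',   # would be upload_path here
    }
    option.add_experimental_option('prefs', prefs)          # one call, one merged dict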
@@ -238,7 +245,7 @@ class AikuCun:
         elements = _driver.find_elements(
             By.XPATH, '//button/span[contains(text(), "查询")]')
         _driver.execute_script("arguments[0].click();", elements[0])  # 点击
-        time.sleep(
+        time.sleep(5)
         wait.until(EC.presence_of_element_located(
             (By.XPATH,
              '//button[@class="el-button el-button--primary el-button--small is-plain"]/span[contains(text(), "下载数据")]')))
@@ -246,12 +253,12 @@ class AikuCun:
             By.XPATH,
             '//button[@class="el-button el-button--primary el-button--small is-plain"]/span[contains(text(), "下载数据")]')
         _driver.execute_script("arguments[0].click();", elements[0])  # 点击
-        time.sleep(
+        time.sleep(5)
         self.clean_data(date=new_date)
         _driver.quit()
 
     def clean_data(self, date):
-        for root, dirs, files in os.walk(
+        for root, dirs, files in os.walk(upload_path, topdown=False):
             for name in files:
                 if '~$' in name or 'DS_Store' in name:
                     continue
@@ -280,11 +287,11 @@ class AikuCun:
 
 def akucun():
     akc = AikuCun()
-    akc.get_data(shop_name='aikucun', date_num=
+    akc.get_data(shop_name='aikucun', date_num=10)  # 获取最近 N 天数据,0表示今天
     # akc.clean_data()
 
     # 新版 数据分类
-    dp = aggregation.DatabaseUpdate(path=
+    dp = aggregation.DatabaseUpdate(path=upload_path)
     dp.new_unzip(is_move=True)
     dp.cleaning(is_move=False, is_except=['临时文件'])  # 清洗数据, 存入 self.datas, 不需要立即移除文件,仍保留文件到原始文件中
     # 将 self.datas 更新至数据库
@@ -295,7 +302,7 @@ def akucun():
         # {'nas': 'mysql'},
     ])
     # 数据分类
-    c = data_clean.DataClean(path=
+    c = data_clean.DataClean(path=upload_path, source_path=Source_Path)
     c.set_up_to_mogo = False  # 不再使用 data_clean 更新数据库,改为 aggregation.py
     c.set_up_to_mysql = False  # 不再使用 data_clean 更新数据库,改为 aggregation.py
     c.new_unzip(is_move=True, )  # 解压文件
@@ -303,7 +310,57 @@ def akucun():
     c.move_all(is_except=['临时文件'])  # 移到文件到原始文件夹
 
 
+class AikuCunNew:
+
+    def __init__(self, shop_name,):
+        self.shop_name = shop_name
+        self.today = datetime.date.today()
+        self.headers = {'User-Agent': ua_sj.get_ua()}
+        self.cookie_path = os.path.join(set_support.SetSupport(dirname='support').dirname, 'cookies')
+        self.cookies = {}
+        self.get_cookies()  # 更新 self.cookies 的值
+        self.support_path = set_support.SetSupport(dirname='support').dirname
+        self.start_date = (self.today - datetime.timedelta(days=15)).strftime('%Y-%m-%d')
+        self.end_date = (self.today - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
+
+    def akc(self):
+        """
+        """
+        start_date = self.start_date
+        end_date = self.end_date
+        url = 'https://treasurebox.aikucun.com/api/web/merchant/treasure/commodity/list/down?'
+        self.headers.update({'Referer': 'https://treasurebox.aikucun.com/dashboard/commodity/ranking/merchant?LS=true&shopId=1814114991487782914&from=menu&v=0.1936043279838604'})
+        now = datetime.datetime.now()
+        timestamp_ms = round(time.mktime(now.timetuple()) * 1000 + now.microsecond / 1000)
+        data = {
+            'time': timestamp_ms,
+            'sign': '2DA6A7580C859B374AE830CAD78BB84B'
+        }
+        res = requests.post(
+            url,
+            headers=self.headers,
+            cookies=self.cookies,
+            params=data
+        )
+        print(res.text)
+
+
+
+    def get_cookies(self):
+        files = os.listdir(self.cookie_path)
+        for file in files:
+            if self.shop_name in file and '~' not in file:
+                with open(os.path.join(self.cookie_path, file), 'r') as f:
+                    cookies_data = json.load(f)
+                break
+        for data in cookies_data:
+            self.cookies.update({data['name']: data['value']})
+
+
 if __name__ == '__main__':
     pass
     # get_cookie_aikucun()
     akucun()
+
+    # a = AikuCunNew(shop_name='aikucun')
+    # a.akc()
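Two notes on AikuCunNew. First, the millisecond timestamp built from time.mktime plus microseconds is equivalent to the usual epoch expression:

    import datetime, time

    now = datetime.datetime.now()
    timestamp_ms = round(time.mktime(now.timetuple()) * 1000 + now.microsecond / 1000)
    assert abs(timestamp_ms - int(time.time() * 1000)) < 2000   # same clock, ms units

Second, the hard-coded 'sign' value is whatever the original capture produced; if the server derives it from the timestamp, each request will presumably need a freshly computed signature. Note also that get_cookies() references cookies_data after its loop, so it raises NameError when no cookie file matches shop_name.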
{mdbq-2.5.8.dist-info → mdbq-2.5.9.dist-info}/RECORD CHANGED
@@ -4,12 +4,12 @@ mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,
 mdbq/aggregation/aggregation.py,sha256=nPp5fOLktxejNEak3SyTnKLjwzK1l2xjbV45X-I4LFQ,76131
 mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
 mdbq/aggregation/mysql_types.py,sha256=DQYROALDiwjJzjhaJfIIdnsrNs11i5BORlj_v6bp67Y,11062
-mdbq/aggregation/optimize_data.py,sha256=
+mdbq/aggregation/optimize_data.py,sha256=gdScrgTAb6RbXHZy1LitX7lggMGn1GTLhkYSgztfwew,4903
 mdbq/aggregation/query_data.py,sha256=WKe42Xq1Gi-ELuIT0k2jh3X4-R7heb0ub3Mj3yuCRAk,103635
 mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
 mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
 mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
-mdbq/clean/clean_upload.py,sha256=
+mdbq/clean/clean_upload.py,sha256=bQwpzQcLxyELqmvQ_kRPSPkt0gyCcUN8jWai-Nmculc,76755
 mdbq/clean/data_clean.py,sha256=ucfslhqXVZoH2QaXHSAWDky0GhIvH9f4GeNaHg4SrFE,104790
 mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
 mdbq/company/copysh.py,sha256=NvlXCBZBcO2GIT5nLRYYqhOyHWM1-1RE7DHvgbj6jmQ,19723
@@ -26,7 +26,7 @@ mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
 mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
 mdbq/mongo/mongo.py,sha256=v9qvrp6p1ZRWuPpbSilqveiE0FEcZF7U5xUPI0RN4xs,31880
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/mysql.py,sha256=
+mdbq/mysql/mysql.py,sha256=F2AidJpHcof7vXXc3ReG24Et-ki-fKYdy8LeQH_Yh-g,47105
 mdbq/mysql/s_query.py,sha256=37GGHzRpycfUjsYEoQgDpdEs9JwjW-LxFXnGwwP2b2Q,8403
 mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -41,8 +41,8 @@ mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,
 mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/req_post/req_tb.py,sha256=PexWSCPJNM6Tv0ol4lAWIhlOwsAr_frnjtcdSHCFiek,36179
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq/spider/aikucun.py,sha256=
-mdbq-2.5.
-mdbq-2.5.
-mdbq-2.5.
-mdbq-2.5.
+mdbq/spider/aikucun.py,sha256=3EjeTPbwk_qLGMVqDhBZoEPGfD2oM-SBiODjxLL3A8U,16883
+mdbq-2.5.9.dist-info/METADATA,sha256=0KYiPP9keTjCLiq4FSHmrJja0fUWmsa1ty6AWA_I_G0,245
+mdbq-2.5.9.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+mdbq-2.5.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-2.5.9.dist-info/RECORD,,
{mdbq-2.5.8.dist-info → mdbq-2.5.9.dist-info}/top_level.txt
File without changes
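For anyone verifying the new RECORD entries: wheel RECORD hashes are urlsafe base64-encoded SHA-256 digests with the '=' padding stripped (per the wheel spec). A small checker:

    import base64, hashlib

    def record_hash(path: str) -> str:
        with open(path, 'rb') as f:
            digest = hashlib.sha256(f.read()).digest()
        return 'sha256=' + base64.urlsafe_b64encode(digest).rstrip(b'=').decode()

    # record_hash('mdbq/mysql/mysql.py')
    # expected: sha256=F2AidJpHcof7vXXc3ReG24Et-ki-fKYdy8LeQH_Yh-g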