mdbq 3.2.7__py3-none-any.whl → 3.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +16 -16
- mdbq/aggregation/query_data.py +111 -40
- mdbq/mysql/mysql.py +29 -6
- mdbq/spider/aikucun.py +99 -29
- {mdbq-3.2.7.dist-info → mdbq-3.2.9.dist-info}/METADATA +1 -1
- {mdbq-3.2.7.dist-info → mdbq-3.2.9.dist-info}/RECORD +8 -8
- {mdbq-3.2.7.dist-info → mdbq-3.2.9.dist-info}/WHEEL +0 -0
- {mdbq-3.2.7.dist-info → mdbq-3.2.9.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -1171,9 +1171,9 @@ def upload_dir(path, db_name, collection_name, json_path=None):
|
|
1171
1171
|
for col in df.columns.tolist():
|
1172
1172
|
df[col] = df[col].apply(lambda x: 0 if str(x) == '' else x)
|
1173
1173
|
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1174
|
+
if '更新时间' not in df.columns.tolist():
|
1175
|
+
df['更新时间'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
1176
|
+
|
1177
1177
|
m.df_to_mysql(df=df, db_name=db_name, table_name=collection_name,
|
1178
1178
|
move_insert=False, # 先删除,再插入
|
1179
1179
|
df_sql = True,
|
@@ -1272,7 +1272,7 @@ def cut_as_year_month(as_month=False):
|
|
1272
1272
|
|
1273
1273
|
|
1274
1274
|
if __name__ == '__main__':
|
1275
|
-
cut_as_year_month(as_month=False)
|
1275
|
+
# cut_as_year_month(as_month=False)
|
1276
1276
|
|
1277
1277
|
# username = 'root'
|
1278
1278
|
# password = ''
|
@@ -1281,22 +1281,22 @@ if __name__ == '__main__':
|
|
1281
1281
|
#
|
1282
1282
|
# # 上传 1 个文件到数据库
|
1283
1283
|
# one_file_to_mysql(
|
1284
|
-
# file=r'/Users/xigua/Downloads
|
1284
|
+
# file=r'/Users/xigua/Downloads/城市等级.csv',
|
1285
1285
|
# db_name='属性设置3',
|
1286
|
-
# table_name='
|
1286
|
+
# table_name='城市等级',
|
1287
1287
|
# )
|
1288
1288
|
|
1289
1289
|
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
|
1294
|
-
|
1295
|
-
|
1296
|
-
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1290
|
+
col = 1
|
1291
|
+
if col:
|
1292
|
+
# 上传一个目录到指定数据库
|
1293
|
+
db_name = '爱库存2'
|
1294
|
+
table_name = '商品spu榜单'
|
1295
|
+
upload_dir(
|
1296
|
+
path=r'/Users/xigua/Downloads/数据上传中心',
|
1297
|
+
db_name=db_name,
|
1298
|
+
collection_name=table_name,
|
1299
|
+
)
|
1300
1300
|
|
1301
1301
|
|
1302
1302
|
|
mdbq/aggregation/query_data.py
CHANGED
@@ -6,6 +6,7 @@ from mdbq.mysql import s_query
|
|
6
6
|
from mdbq.aggregation import optimize_data
|
7
7
|
from mdbq.config import myconfig
|
8
8
|
from mdbq.config import products
|
9
|
+
from mdbq.config import set_support
|
9
10
|
import datetime
|
10
11
|
from dateutil.relativedelta import relativedelta
|
11
12
|
import pandas as pd
|
@@ -18,10 +19,12 @@ import os
|
|
18
19
|
import time
|
19
20
|
import calendar
|
20
21
|
import concurrent.futures
|
22
|
+
import traceback
|
21
23
|
|
22
24
|
"""
|
23
25
|
|
24
26
|
"""
|
27
|
+
error_file = os.path.join(set_support.SetSupport(dirname='support').dirname, 'error.log')
|
25
28
|
username, password, host, port, service_database = None, None, None, None, None,
|
26
29
|
if socket.gethostname() in ['xigua_lx', 'xigua1', 'MacBookPro']:
|
27
30
|
conf = myconfig.main()
|
@@ -65,6 +68,12 @@ class MysqlDatasQuery:
|
|
65
68
|
return func(*args, **kwargs)
|
66
69
|
except Exception as e:
|
67
70
|
print(f'{func.__name__}, {e}') # 将异常信息返回
|
71
|
+
with open(error_file, 'a') as f:
|
72
|
+
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
73
|
+
f.write(f'\n{now}\n')
|
74
|
+
# f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n') # 发生异常所在的文件
|
75
|
+
traceback.print_exc(file=open(error_file, 'a')) # 返回完整的堆栈信息
|
76
|
+
print(f'更多信息请查看日志文件: {error_file}')
|
68
77
|
|
69
78
|
return wrapper
|
70
79
|
|
@@ -1825,22 +1834,52 @@ class MysqlDatasQuery:
|
|
1825
1834
|
}
|
1826
1835
|
min_date = df['日期'].min()
|
1827
1836
|
max_date = df['日期'].max()
|
1837
|
+
new_dict = {
|
1838
|
+
'日期': '',
|
1839
|
+
'店铺名称': '',
|
1840
|
+
'场次信息': '',
|
1841
|
+
'场次id': '',
|
1842
|
+
'直播开播时间': '',
|
1843
|
+
'开播时长': '',
|
1844
|
+
'封面图点击率': '',
|
1845
|
+
'观看人数': '',
|
1846
|
+
'观看次数': '',
|
1847
|
+
'新增粉丝数': '',
|
1848
|
+
'流量券消耗': '',
|
1849
|
+
'观看总时长': '',
|
1850
|
+
'人均观看时长': '',
|
1851
|
+
'次均观看时长': '',
|
1852
|
+
'商品点击人数': '',
|
1853
|
+
'商品点击次数': '',
|
1854
|
+
'商品点击率': '',
|
1855
|
+
'加购人数': '',
|
1856
|
+
'加购件数': '',
|
1857
|
+
'加购次数': '',
|
1858
|
+
'成交金额': '',
|
1859
|
+
'成交人数': '',
|
1860
|
+
'成交件数': '',
|
1861
|
+
'成交笔数': '',
|
1862
|
+
'成交转化率': '',
|
1863
|
+
'退款人数': '',
|
1864
|
+
'退款笔数': '',
|
1865
|
+
'退款件数': '',
|
1866
|
+
'退款金额': '',
|
1867
|
+
'预售定金支付金额': '',
|
1868
|
+
'预售预估总金额': '',
|
1869
|
+
}
|
1828
1870
|
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
1829
1871
|
print(f'{now} 正在更新: mysql ({host}:{port}) {db_name}/{table_name} -> {min_date}~{max_date}')
|
1830
|
-
|
1831
|
-
|
1832
|
-
|
1833
|
-
|
1834
|
-
|
1835
|
-
|
1836
|
-
|
1837
|
-
|
1838
|
-
|
1839
|
-
|
1840
|
-
|
1841
|
-
set_typ=set_typ,
|
1842
|
-
|
1843
|
-
)
|
1872
|
+
for dict_data in df.to_dict(orient='records'):
|
1873
|
+
new_dict.update(dict_data)
|
1874
|
+
m_engine.dict_to_mysql(
|
1875
|
+
db_name=db_name,
|
1876
|
+
table_name=table_name,
|
1877
|
+
dict_data=new_dict,
|
1878
|
+
unique_main_key=None,
|
1879
|
+
icm_update=['场次id'], # 唯一组合键
|
1880
|
+
main_key=None, # 指定索引列, 通常用日期列,默认会设置日期为索引
|
1881
|
+
set_typ=set_typ, # 指定数据类型
|
1882
|
+
)
|
1844
1883
|
return True
|
1845
1884
|
|
1846
1885
|
# @try_except
|
@@ -2234,29 +2273,61 @@ class MysqlDatasQuery:
|
|
2234
2273
|
set_typ = {
|
2235
2274
|
'日期': 'date',
|
2236
2275
|
'店铺名称': 'varchar(100)',
|
2237
|
-
'序号': 'int',
|
2238
2276
|
'spu_id': 'varchar(100)',
|
2239
2277
|
'图片': 'varchar(255)',
|
2278
|
+
'序号': 'smallint',
|
2279
|
+
'商品名称': 'varchar(255)',
|
2280
|
+
'商品款号': 'varchar(255)',
|
2281
|
+
'一级类目名称': 'varchar(255)',
|
2282
|
+
'二级类目名称': 'varchar(255)',
|
2283
|
+
'三级类目名称': 'varchar(255)',
|
2240
2284
|
'数据更新时间': 'timestamp',
|
2285
|
+
'更新时间': 'timestamp',
|
2241
2286
|
}
|
2242
2287
|
min_date = df['日期'].min()
|
2243
2288
|
max_date = df['日期'].max()
|
2244
2289
|
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
2245
2290
|
print(f'{now} 正在更新: mysql ({host}:{port}) {db_name}/{table_name} -> {min_date}~{max_date}')
|
2246
|
-
|
2247
|
-
|
2248
|
-
|
2249
|
-
|
2250
|
-
|
2251
|
-
|
2252
|
-
|
2253
|
-
|
2254
|
-
|
2255
|
-
|
2256
|
-
|
2257
|
-
|
2258
|
-
|
2259
|
-
|
2291
|
+
new_dict = {
|
2292
|
+
'日期': '',
|
2293
|
+
'店铺名称': '',
|
2294
|
+
'序号': '',
|
2295
|
+
'商品名称': '',
|
2296
|
+
'spu_id': '',
|
2297
|
+
'商品款号': '',
|
2298
|
+
'一级类目名称': '',
|
2299
|
+
'二级类目名称': '',
|
2300
|
+
'三级类目名称': '',
|
2301
|
+
'访客量': '',
|
2302
|
+
'浏览量': '',
|
2303
|
+
'下单gmv': '',
|
2304
|
+
'成交gmv': '',
|
2305
|
+
'支付人数_成交': '',
|
2306
|
+
}
|
2307
|
+
for dict_data in df.to_dict(orient='records'):
|
2308
|
+
new_dict.update(dict_data)
|
2309
|
+
m_engine.dict_to_mysql(
|
2310
|
+
db_name='爱库存2',
|
2311
|
+
table_name='商品spu榜单',
|
2312
|
+
dict_data=new_dict,
|
2313
|
+
icm_update=['日期', '店铺名称', 'spu_id', '商品款号'],
|
2314
|
+
unique_main_key=None,
|
2315
|
+
set_typ=set_typ,
|
2316
|
+
)
|
2317
|
+
# m_engine.df_to_mysql(
|
2318
|
+
# df=df,
|
2319
|
+
# db_name=db_name,
|
2320
|
+
# table_name=table_name,
|
2321
|
+
# icm_update=[], # 增量更新, 在聚合数据中使用,其他不要用
|
2322
|
+
# move_insert=True, # 先删除,再插入
|
2323
|
+
# df_sql=False, # 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重
|
2324
|
+
# drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
2325
|
+
# count=None,
|
2326
|
+
# filename=None, # 用来追踪处理进度
|
2327
|
+
# reset_id=False, # 是否重置自增列
|
2328
|
+
# set_typ=set_typ,
|
2329
|
+
#
|
2330
|
+
# )
|
2260
2331
|
return True
|
2261
2332
|
|
2262
2333
|
def deeplink(self, db_name='聚合数据', table_name='达摩盘_deeplink人群洞察'):
|
@@ -3227,10 +3298,10 @@ def main(days=150, months=3):
|
|
3227
3298
|
|
3228
3299
|
|
3229
3300
|
if __name__ == '__main__':
|
3230
|
-
main(
|
3231
|
-
|
3232
|
-
|
3233
|
-
)
|
3301
|
+
# main(
|
3302
|
+
# days=150, # 清理聚合数据的日期长度
|
3303
|
+
# months=3 # 生成聚合数据的长度
|
3304
|
+
# )
|
3234
3305
|
|
3235
3306
|
# query_list = [query1, query2]
|
3236
3307
|
# # 使用 ThreadPoolExecutor 来并行运行
|
@@ -3244,10 +3315,10 @@ if __name__ == '__main__':
|
|
3244
3315
|
# ),
|
3245
3316
|
# }
|
3246
3317
|
|
3247
|
-
#
|
3248
|
-
|
3249
|
-
|
3250
|
-
|
3251
|
-
|
3252
|
-
|
3253
|
-
|
3318
|
+
# 3. 清理聚合数据
|
3319
|
+
optimize_data.op_data(
|
3320
|
+
db_name_lists=['聚合数据'],
|
3321
|
+
days=3650, # 清理聚合数据的日期长度
|
3322
|
+
is_mongo=False,
|
3323
|
+
is_mysql=True,
|
3324
|
+
)
|
mdbq/mysql/mysql.py
CHANGED
@@ -12,7 +12,9 @@ import pandas as pd
|
|
12
12
|
from sqlalchemy import create_engine
|
13
13
|
import os
|
14
14
|
import calendar
|
15
|
+
from mdbq.config import set_support
|
15
16
|
from mdbq.config import myconfig
|
17
|
+
import traceback
|
16
18
|
|
17
19
|
warnings.filterwarnings('ignore')
|
18
20
|
"""
|
@@ -25,6 +27,7 @@ warnings.filterwarnings('ignore')
|
|
25
27
|
3. 小数必须使用 decimal, 禁止 float 和 double, 因为计算精度差异,后续需要聚合数据时会引发很多问题
|
26
28
|
|
27
29
|
"""
|
30
|
+
error_file = os.path.join(set_support.SetSupport(dirname='support').dirname, 'error.log')
|
28
31
|
|
29
32
|
|
30
33
|
def is_valid_date(date_string):
|
@@ -112,6 +115,12 @@ class MysqlUpload:
|
|
112
115
|
return func(*args, **kwargs)
|
113
116
|
except Exception as e:
|
114
117
|
print(f'{func.__name__}, {e}') # 将异常信息返回
|
118
|
+
with open(error_file, 'a') as f:
|
119
|
+
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
120
|
+
f.write(f'\n{now}\n')
|
121
|
+
# f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n') # 发生异常所在的文件
|
122
|
+
traceback.print_exc(file=open(error_file, 'a')) # 返回完整的堆栈信息
|
123
|
+
print(f'更多信息请查看日志文件: {error_file}')
|
115
124
|
|
116
125
|
return wrapper
|
117
126
|
|
@@ -127,6 +136,10 @@ class MysqlUpload:
|
|
127
136
|
set_typ: {}
|
128
137
|
allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
|
129
138
|
"""
|
139
|
+
if icm_update:
|
140
|
+
if main_key or unique_main_key:
|
141
|
+
print(f'icm_update/unique_main_key/unique_main_key 参数不能同时设定')
|
142
|
+
return
|
130
143
|
if not main_key:
|
131
144
|
main_key = []
|
132
145
|
if not unique_main_key:
|
@@ -196,6 +209,7 @@ class MysqlUpload:
|
|
196
209
|
else:
|
197
210
|
sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
198
211
|
# sql = f"ALTER TABLE `{table_name}` ADD COLUMN `{col}` {dtypes[col]} NOT NULL;"
|
212
|
+
# print(sql)
|
199
213
|
cursor.execute(sql)
|
200
214
|
print(f"添加列: {col}({dtypes[col]})") # 添加列并指定数据类型
|
201
215
|
|
@@ -458,6 +472,15 @@ class MysqlUpload:
|
|
458
472
|
filename: 用来追踪处理进度,传这个参数是方便定位产生错误的文件
|
459
473
|
allow_not_null: 创建允许插入空值的列,正常情况下不允许空值
|
460
474
|
"""
|
475
|
+
if icm_update:
|
476
|
+
if move_insert or df_sql or drop_duplicates:
|
477
|
+
print(f'icm_update/move_insert/df_sql/drop_duplicates 参数不能同时设定')
|
478
|
+
return
|
479
|
+
if move_insert:
|
480
|
+
if icm_update or df_sql or drop_duplicates:
|
481
|
+
print(f'icm_update/move_insert/df_sql/drop_duplicates 参数不能同时设定')
|
482
|
+
return
|
483
|
+
|
461
484
|
self.filename = filename
|
462
485
|
if isinstance(df, pd.DataFrame):
|
463
486
|
if len(df) == 0:
|
@@ -861,6 +884,12 @@ class OptimizeDatas:
|
|
861
884
|
return func(*args, **kwargs)
|
862
885
|
except Exception as e:
|
863
886
|
print(f'{func.__name__}, {e}') # 将异常信息返回
|
887
|
+
with open(error_file, 'a') as f:
|
888
|
+
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
889
|
+
f.write(f'\n{now}\n')
|
890
|
+
# f.write(f'报错的文件:\n{e.__traceback__.tb_frame.f_globals["__file__"]}\n') # 发生异常所在的文件
|
891
|
+
traceback.print_exc(file=open(error_file, 'a')) # 返回完整的堆栈信息
|
892
|
+
print(f'更多信息请查看日志文件: {error_file}')
|
864
893
|
|
865
894
|
return wrapper
|
866
895
|
|
@@ -1162,9 +1191,3 @@ if __name__ == '__main__':
|
|
1162
1191
|
data = conf['Windows']['xigua_lx']['mysql']['local']
|
1163
1192
|
username, password, host, port = data['username'], data['password'], data['host'], data['port']
|
1164
1193
|
print(username, password, host, port)
|
1165
|
-
|
1166
|
-
df = pd.read_excel('/Users/xigua/Downloads/66563857.xlsx')
|
1167
|
-
ss = MysqlUpload(username, password, host, port)
|
1168
|
-
res, data = ss.convert_df_dtypes(df=df)
|
1169
|
-
print(data)
|
1170
|
-
print(res)
|
mdbq/spider/aikucun.py
CHANGED
@@ -108,7 +108,29 @@ def get_cookie_aikucun():
|
|
108
108
|
time.sleep(0.1)
|
109
109
|
_driver.maximize_window() # 窗口最大化 方便后续加载数据
|
110
110
|
print(f'请登录并切换到百宝箱,再保存 cookies: \n https://treasurebox.aikucun.com/dashboard/commodity/ranking/merchant?LS=true&shopId=1814114991487782914&from=menu&v=0.1936043279838604')
|
111
|
-
|
111
|
+
wait = WebDriverWait(_driver, timeout=15)
|
112
|
+
input_box = wait.until(
|
113
|
+
EC.element_to_be_clickable(
|
114
|
+
(By.XPATH, '//input[@placeholder="请输入用户名"]'))) #
|
115
|
+
input_box.send_keys('广东万里马实业股份有限公司')
|
116
|
+
input_box = wait.until(
|
117
|
+
EC.element_to_be_clickable(
|
118
|
+
(By.XPATH, '//input[@placeholder="请输入密码"]'))) #
|
119
|
+
input_box.send_keys('wlm123$$$')
|
120
|
+
time.sleep(0.1)
|
121
|
+
elements = _driver.find_elements(
|
122
|
+
By.XPATH, '//button[@class="merchant_login_btn" and contains(text(), "登录")]')
|
123
|
+
_driver.execute_script("arguments[0].click();", elements[0])
|
124
|
+
for i in range(100):
|
125
|
+
try:
|
126
|
+
wait.until(
|
127
|
+
EC.element_to_be_clickable(
|
128
|
+
(By.XPATH, '//div[@class="user-info nav-user-slider"]')))
|
129
|
+
_driver.get(' https://treasurebox.aikucun.com/dashboard/commodity/ranking/merchant?LS=true&shopId=1814114991487782914&from=menu&v=0.1936043279838604')
|
130
|
+
time.sleep(3)
|
131
|
+
break
|
132
|
+
except:
|
133
|
+
time.sleep(5)
|
112
134
|
|
113
135
|
d_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
114
136
|
print(f'{d_time} 登录成功,正在获取cookie...')
|
@@ -116,13 +138,16 @@ def get_cookie_aikucun():
|
|
116
138
|
|
117
139
|
# 将cookies保存为json格式
|
118
140
|
cookies_list = _driver.get_cookies()
|
141
|
+
new_cookies_list = []
|
119
142
|
for cookie in cookies_list:
|
120
143
|
# 该字段有问题所以删除就可以
|
121
|
-
if '
|
122
|
-
|
144
|
+
if 'HWWAFSESTIME' in cookie:
|
145
|
+
continue
|
146
|
+
else:
|
147
|
+
new_cookies_list.append(cookie)
|
123
148
|
json_file = os.path.join(cookie_path, filename_aikucun)
|
124
149
|
with open(json_file, 'w', encoding='utf-8') as f:
|
125
|
-
json.dump(
|
150
|
+
json.dump(new_cookies_list, f, ensure_ascii=False, sort_keys=True, indent=4)
|
126
151
|
print(f'cookie已保存: {json_file}')
|
127
152
|
|
128
153
|
# _file = os.path.join(cookie_path, filename_aikucun)
|
@@ -147,9 +172,10 @@ class AikuCun:
|
|
147
172
|
self.sp_url = 'https://treasurebox.aikucun.com/dashboard/commodity/ranking/merchant?LS=true&shopId=1814114991487782914&from=menu&v=0.1936043279838604'
|
148
173
|
self.cookie_path = os.path.join(set_support.SetSupport(dirname='support').dirname, 'cookies')
|
149
174
|
|
150
|
-
def login(self, shop_name='aikucun'):
|
175
|
+
def login(self, shop_name='aikucun', headless=False):
|
151
176
|
option = webdriver.ChromeOptions()
|
152
|
-
|
177
|
+
if headless:
|
178
|
+
option.add_argument("--headless") # 设置无界面模式
|
153
179
|
# 调整chrome启动配置
|
154
180
|
option.add_argument("--disable-gpu")
|
155
181
|
option.add_argument("--no-sandbox")
|
@@ -224,13 +250,13 @@ class AikuCun:
|
|
224
250
|
time.sleep(3)
|
225
251
|
return _driver
|
226
252
|
|
227
|
-
def get_data(self, shop_name='aikucun', date_num=1):
|
253
|
+
def get_data(self, shop_name='aikucun', date_num=1, headless=True):
|
228
254
|
"""
|
229
255
|
date_num: 获取最近 N 天数据,0表示今天
|
230
256
|
所有数据都是逐日下载
|
231
257
|
"""
|
232
258
|
|
233
|
-
_driver = self.login(shop_name=shop_name)
|
259
|
+
_driver = self.login(shop_name=shop_name, headless=headless)
|
234
260
|
|
235
261
|
_driver.get(self.sp_url)
|
236
262
|
time.sleep(3)
|
@@ -239,7 +265,8 @@ class AikuCun:
|
|
239
265
|
today = datetime.date.today()
|
240
266
|
for date_s in range(date_num):
|
241
267
|
new_date = today - datetime.timedelta(days=date_s) # 会用作文件名
|
242
|
-
|
268
|
+
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
269
|
+
print(f'{now} 正在下载爱库存文件 {date_s+1}/{date_num}: {new_date}')
|
243
270
|
str_date = str(new_date)[2:]
|
244
271
|
wait = WebDriverWait(_driver, timeout=15) #
|
245
272
|
elements = _driver.find_elements(
|
@@ -273,6 +300,15 @@ class AikuCun:
|
|
273
300
|
wait.until(EC.presence_of_element_located(
|
274
301
|
(By.XPATH,
|
275
302
|
'//button[@class="el-button el-button--primary el-button--small is-plain"]/span[contains(text(), "下载数据")]')))
|
303
|
+
|
304
|
+
elements = _driver.find_elements(
|
305
|
+
By.XPATH,
|
306
|
+
'//div[@class="ak-page-list__table-empty" and contains(text(), "暂无数据")]')
|
307
|
+
if elements:
|
308
|
+
print(f'cookies 可能已过期,无法下载')
|
309
|
+
_driver.quit()
|
310
|
+
return
|
311
|
+
|
276
312
|
elements = _driver.find_elements(
|
277
313
|
By.XPATH,
|
278
314
|
'//button[@class="el-button el-button--primary el-button--small is-plain"]/span[contains(text(), "下载数据")]')
|
@@ -283,10 +319,18 @@ class AikuCun:
|
|
283
319
|
|
284
320
|
def clean_data(self, date):
|
285
321
|
set_typ = {
|
322
|
+
'日期': 'date',
|
286
323
|
'店铺名称': 'varchar(100)',
|
287
324
|
'spu_id': 'varchar(100)',
|
288
325
|
'图片': 'varchar(255)',
|
326
|
+
'序号': 'smallint',
|
327
|
+
'商品名称': 'varchar(255)',
|
328
|
+
'商品款号': 'varchar(255)',
|
329
|
+
'一级类目名称': 'varchar(255)',
|
330
|
+
'二级类目名称': 'varchar(255)',
|
331
|
+
'三级类目名称': 'varchar(255)',
|
289
332
|
'数据更新时间': 'timestamp',
|
333
|
+
'更新时间': 'timestamp',
|
290
334
|
}
|
291
335
|
for root, dirs, files in os.walk(upload_path, topdown=False):
|
292
336
|
for name in files:
|
@@ -307,33 +351,60 @@ class AikuCun:
|
|
307
351
|
df.insert(loc=0, column='日期', value=date) # df中插入新列
|
308
352
|
df.insert(loc=1, column='店铺名称', value='爱库存平台') # df中插入新列
|
309
353
|
df.rename(columns={'spuId': 'spu_id'}, inplace=True)
|
310
|
-
df['数据更新时间'] = pd.to_datetime(df['数据更新时间'], format='%Y-%m-%d %H:%M:%S', errors='ignore')
|
354
|
+
# df['数据更新时间'] = pd.to_datetime(df['数据更新时间'], format='%Y-%m-%d %H:%M:%S', errors='ignore')
|
311
355
|
# df['数据更新时间'] = df['数据更新时间'].apply(lambda x: re.sub(' ', ' ', str(x)) if x else x)
|
312
356
|
# print(df['数据更新时间'])
|
313
357
|
# breakpoint()
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
358
|
+
new_dict = {
|
359
|
+
'日期': '',
|
360
|
+
'店铺名称': '',
|
361
|
+
'序号': '',
|
362
|
+
'商品名称': '',
|
363
|
+
'spu_id': '',
|
364
|
+
'商品款号': '',
|
365
|
+
'一级类目名称': '',
|
366
|
+
'二级类目名称': '',
|
367
|
+
'三级类目名称': '',
|
368
|
+
'访客量': '',
|
369
|
+
'浏览量': '',
|
370
|
+
'下单gmv': '',
|
371
|
+
'成交gmv': '',
|
372
|
+
'支付人数_成交': '',
|
373
|
+
}
|
374
|
+
for dict_data in df.to_dict(orient='records'):
|
375
|
+
new_dict.update(dict_data)
|
376
|
+
new_dict.update({'更新时间': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')})
|
377
|
+
m_engine.dict_to_mysql(
|
378
|
+
db_name='爱库存2',
|
379
|
+
table_name='商品spu榜单',
|
380
|
+
dict_data=new_dict,
|
381
|
+
icm_update=['日期', '店铺名称', 'spu_id', '商品款号'],
|
382
|
+
unique_main_key=None,
|
383
|
+
set_typ=set_typ,
|
384
|
+
)
|
385
|
+
|
386
|
+
# m_engine.df_to_mysql(
|
387
|
+
# df=df,
|
388
|
+
# db_name='爱库存2',
|
389
|
+
# table_name='商品spu榜单',
|
390
|
+
# icm_update=[], # 增量更新, 在聚合数据中使用,其他不要用
|
391
|
+
# move_insert=False, # 先删除,再插入
|
392
|
+
# df_sql=True, # 值为 True 时使用 df.to_sql 函数上传整个表, 不会排重
|
393
|
+
# drop_duplicates=False, # 值为 True 时检查重复数据再插入,反之直接上传,会比较慢
|
394
|
+
# count=None,
|
395
|
+
# filename=None, # 用来追踪处理进度
|
396
|
+
# reset_id=False, # 是否重置自增列
|
397
|
+
# set_typ=set_typ,
|
398
|
+
# )
|
328
399
|
|
329
400
|
new_name = f'爱库存_商品榜单_spu_{date}_{date}.csv'
|
330
401
|
df.to_csv(os.path.join(root, new_name), encoding='utf-8_sig', index=False)
|
331
402
|
os.remove(os.path.join(root, name))
|
332
403
|
|
333
404
|
|
334
|
-
def akucun():
|
405
|
+
def akucun(headless=True, date_num=10):
|
335
406
|
akc = AikuCun()
|
336
|
-
akc.get_data(shop_name='aikucun', date_num=
|
407
|
+
akc.get_data(shop_name='aikucun', date_num=date_num, headless=headless) # 获取最近 N 天数据,0表示今天
|
337
408
|
# akc.clean_data()
|
338
409
|
|
339
410
|
# # 新版 数据分类
|
@@ -404,9 +475,8 @@ class AikuCunNew:
|
|
404
475
|
|
405
476
|
|
406
477
|
if __name__ == '__main__':
|
407
|
-
|
408
|
-
|
409
|
-
akucun()
|
478
|
+
# get_cookie_aikucun() # 登录并获取 cookies
|
479
|
+
akucun(date_num=100, headless=True) # 下载数据
|
410
480
|
|
411
481
|
# a = AikuCunNew(shop_name='aikucun')
|
412
482
|
# a.akc()
|
@@ -1,11 +1,11 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
2
|
mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/aggregation.py,sha256=
|
4
|
+
mdbq/aggregation/aggregation.py,sha256=ygQYYbxTn7utNPgwiz6MmBSWlq5JrXB-2NU0V75b6Us,74640
|
5
5
|
mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
|
6
6
|
mdbq/aggregation/mysql_types.py,sha256=YTGyrF9vcRgfkQbpT-e-JdJ7c7VF1dDHgyx9YZRES8w,10934
|
7
7
|
mdbq/aggregation/optimize_data.py,sha256=RXIv7cACCgYyehAxMjUYi_S7rVyjIwXKWMaM3nduGtA,3068
|
8
|
-
mdbq/aggregation/query_data.py,sha256=
|
8
|
+
mdbq/aggregation/query_data.py,sha256=yU-PUMY5mTKAhGTH9yCe897MzSRme2toCJBqXaUrJUQ,148223
|
9
9
|
mdbq/aggregation/query_data_bak.py,sha256=r1FU0C4zjXln7oVSrRkElh4Ehl-9mYhGcq57jLbViUA,104071
|
10
10
|
mdbq/aggregation/query_data_bak20241124.py,sha256=oY95ZK3qt3Wx9pdZKZ5cvDh45Yi5yGj1kl8G6riumHA,144513
|
11
11
|
mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
|
@@ -28,7 +28,7 @@ mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
|
|
28
28
|
mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
|
29
29
|
mdbq/mongo/mongo.py,sha256=v9qvrp6p1ZRWuPpbSilqveiE0FEcZF7U5xUPI0RN4xs,31880
|
30
30
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
31
|
-
mdbq/mysql/mysql.py,sha256=
|
31
|
+
mdbq/mysql/mysql.py,sha256=z3RXzPiVQzJzPBoyLr1XL5QXAtXehjbkxWVoBCQBaqY,64373
|
32
32
|
mdbq/mysql/recheck_mysql.py,sha256=rgTpvDMWYTyEn7UQdlig-pdXDluTgiU8JG6lkMh8DV0,8665
|
33
33
|
mdbq/mysql/s_query.py,sha256=MbIprZ4yJDAZ9AahZPzl7hqS695Vs0P-AJNwAtA_EEc,9287
|
34
34
|
mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
|
@@ -45,8 +45,8 @@ mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,
|
|
45
45
|
mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
46
46
|
mdbq/req_post/req_tb.py,sha256=qg7pet73IgKGmCwxaeUyImJIoeK_pBQT9BBKD7fkBNg,36160
|
47
47
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
48
|
-
mdbq/spider/aikucun.py,sha256=
|
49
|
-
mdbq-3.2.
|
50
|
-
mdbq-3.2.
|
51
|
-
mdbq-3.2.
|
52
|
-
mdbq-3.2.
|
48
|
+
mdbq/spider/aikucun.py,sha256=nIKKZOZbemKqcrikcrMmtksLgJjjzeU0I99teBgU1jE,22439
|
49
|
+
mdbq-3.2.9.dist-info/METADATA,sha256=VVlPtpTiP4PO4M02keRDAf3m98dzf8_noBXsu_TV-L0,243
|
50
|
+
mdbq-3.2.9.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
51
|
+
mdbq-3.2.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
52
|
+
mdbq-3.2.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|