mdbq 4.0.9__py3-none-any.whl → 4.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/query_data.py +3 -2
- mdbq/mysql/deduplicator.py +4 -1
- mdbq/mysql/s_query.py +190 -151
- mdbq/mysql/uploader.py +2 -0
- mdbq/spider/aikucun.py +53 -46
- {mdbq-4.0.9.dist-info → mdbq-4.0.11.dist-info}/METADATA +1 -1
- {mdbq-4.0.9.dist-info → mdbq-4.0.11.dist-info}/RECORD +10 -11
- mdbq/log/spider_logging.py +0 -47
- {mdbq-4.0.9.dist-info → mdbq-4.0.11.dist-info}/WHEEL +0 -0
- {mdbq-4.0.9.dist-info → mdbq-4.0.11.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '4.0.9'
+VERSION = '4.0.11'
mdbq/aggregation/query_data.py
CHANGED
@@ -1557,7 +1557,6 @@ class MysqlDatasQuery:
             'unique_keys': [['日期', '店铺id', '商品id']],  # 唯一约束列表
         }
 
-
     @upload_data_decorator()
     def spph(self, db_name='聚合数据', table_name='天猫_商品排行'):
         """ """
@@ -3677,6 +3676,7 @@ def query3(months=1, download_manager=None):
 
 
 def main(months=3):
+    logger.info('数据聚合任务开始')
     # 1. 更新日期表 更新货品年份基准表, 属性设置 3 - 货品年份基准
     date_table()
     # 2. 数据聚合
@@ -3685,11 +3685,12 @@ def main(months=3):
         password=password,
         host=host,
         port=port,
-        maxconnections=
+        maxconnections=20,
     )
     query1(download_manager=download_manager, months=months)
     query2(download_manager=download_manager, months=months)
     query3(download_manager=download_manager, months=months)
+    logger.info('数据聚合完成')
 
 
 if __name__ == '__main__':
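The functional change here is pinning `maxconnections=20` on the shared download manager that query1–query3 reuse. For readers unfamiliar with how DBUtils treats these pool limits, a minimal standalone sketch (not from the package; host and credentials are placeholders) using the same `PooledDB` settings that s_query.py builds below:

import pymysql
from dbutils.pooled_db import PooledDB

# Sketch only, placeholder credentials. With blocking=True, a caller asking
# for a 21st connection waits until one is returned instead of raising.
pool = PooledDB(
    creator=pymysql,     # module whose connect() creates raw connections
    maxconnections=20,   # hard cap, matching the new default
    mincached=2,         # idle connections opened eagerly at startup
    maxcached=5,         # idle connections kept around after release
    blocking=True,
    host='localhost', port=3306, user='user',
    password='password', charset='utf8mb4',
)

conn = pool.connection()  # borrow a connection (may block once 20 are in use)
try:
    cursor = conn.cursor()
    cursor.execute('SELECT 1')
    print(cursor.fetchone())
    cursor.close()
finally:
    conn.close()          # returns the connection to the pool, not closes it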
mdbq/mysql/deduplicator.py
CHANGED
@@ -6,6 +6,7 @@ import warnings
 import pymysql
 import os
 from mdbq.log import mylogger
+from mdbq.config import config
 from typing import List, Dict, Optional, Any, Tuple
 from dbutils.pooled_db import PooledDB
 import threading
@@ -1348,7 +1349,7 @@ class MySQLDeduplicator:
 
 
 def main():
-
+    logger.info('去重任务开始')
     dir_path = os.path.expanduser("~")
     my_cont = config.read_config(file_path=os.path.join(dir_path, 'spd.txt'))
     username, password, host, port = my_cont['username'], my_cont['password'], my_cont['host'], int(my_cont['port'])
@@ -1401,6 +1402,8 @@ def main():
 
     # 关闭连接
     deduplicator.close()
+    logger.info('去重任务结束')
+
 
 if __name__ == '__main__':
     main()
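Both this entry point and aikucun.py below read connection settings the same way. For reference, the pattern as it appears in the hunks above, with `~/spd.txt` expected to hold username/password/host/port keys:

import os
from mdbq.config import config

# Credentials live in a plain config file in the user's home directory;
# read_config() returns a mapping with the connection fields.
dir_path = os.path.expanduser("~")
my_cont = config.read_config(file_path=os.path.join(dir_path, 'spd.txt'))
username, password, host, port = (
    my_cont['username'], my_cont['password'],
    my_cont['host'], int(my_cont['port']),
)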
mdbq/mysql/s_query.py
CHANGED
@@ -35,7 +35,7 @@ class QueryDatas:
     """
 
     def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4',
-
+                 pool_size: int = 20, mincached: int = 2, maxcached: int = 5,
                  connect_timeout: int = 10, read_timeout: int = 30, write_timeout: int = 30,
                  max_retries: int = 3, retry_waiting_time: int = 5, collation: str = 'utf8mb4_0900_ai_ci') -> None:
         """
@@ -47,9 +47,9 @@ class QueryDatas:
             host: 数据库主机
             port: 数据库端口
             charset: 字符集,默认utf8mb4
-
-            mincached:
-            maxcached:
+            pool_size: 最大活动连接数,默认20
+            mincached: 最小缓存连接数,空闲连接数量,默认2
+            maxcached: 最大缓存连接数,最大空闲连接数,默认5
             connect_timeout: 连接超时时间,默认10秒
             read_timeout: 读取超时时间,默认30秒
             write_timeout: 写入超时时间,默认30秒
@@ -87,14 +87,14 @@ class QueryDatas:
             'write_timeout': write_timeout,
             'autocommit': True
         }
-        self.pool = self._create_connection_pool(
+        self.pool = self._create_connection_pool(pool_size, mincached, maxcached)
 
-    def _create_connection_pool(self,
+    def _create_connection_pool(self, pool_size: int, mincached: int, maxcached: int) -> PooledDB:
         """
         创建数据库连接池
 
         Args:
-
+            pool_size: 最大连接数
             mincached: 最小缓存连接数
             maxcached: 最大缓存连接数
 
@@ -122,7 +122,7 @@ class QueryDatas:
         }
         pool_params = {
             'creator': pymysql,
-            'maxconnections':
+            'maxconnections': pool_size,
             'mincached': mincached,
             'maxcached': maxcached,
             'blocking': True,
@@ -133,7 +133,7 @@ class QueryDatas:
         try:
             pool = PooledDB(**pool_params, **connection_params)
             logger.debug('连接池创建成功', {
-                '连接池大小':
+                '连接池大小': pool_size,
                 '最小缓存': mincached,
                 '最大缓存': maxcached,
                 '主机': self.host,
@@ -253,20 +253,8 @@ class QueryDatas:
 
     # @_execute_with_retry
     def _get_connection(self, db_name: Optional[str] = None) -> pymysql.connections.Connection:
-        """
-        从连接池获取数据库连接
-
-        Args:
-            db_name: 可选的数据库名,如果提供则会在连接后选择该数据库
-
-        Returns:
-            数据库连接对象
-
-        Raises:
-            ConnectionError: 当获取连接失败时抛出
-        """
+        """从连接池获取数据库连接"""
         try:
-            # 只在连续失败次数达到阈值时检查健康状态
             if self._pool_stats['consecutive_failures'] >= self._pool_stats['max_consecutive_failures']:
                 if not self._check_pool_health():
                     logger.warning('连接池不健康,尝试重新创建')
@@ -282,66 +270,184 @@ class QueryDatas:
             error_code = e.args[0] if e.args else None
             if error_code in (2003, 2006, 2013):
                 logger.error('数据库连接错误', {
+                    '库': db_name,
                     '错误代码': error_code,
                     '错误信息': str(e),
-                    '数据库': db_name
                 })
                 self.pool = self._create_connection_pool(10, 2, 5)
                 self._pool_stats['consecutive_failures'] = 0
                 raise ConnectionError(f'数据库连接错误: {str(e)}')
-
-            raise
+            raise
         except Exception as e:
             logger.error('从连接池获取数据库连接失败', {
+                '库': db_name,
                 '错误': str(e),
-                '数据库': db_name
             })
             raise ConnectionError(f'连接数据库失败: {str(e)}')
 
     # @_execute_with_retry
-    def _execute_query(self, sql: str, params: tuple = None, db_name: str = None
+    def _execute_query(self, sql: str, params: tuple = None, db_name: str = None,
+                       fetch_all: bool = True, error_handling: bool = True) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
+        """执行SQL查询的通用方法"""
+        try:
+            if sql.upper().startswith('SHOW DATABASES'):
+                with closing(self._get_connection()) as connection:
+                    with closing(connection.cursor()) as cursor:
+                        cursor.execute(sql, params)
+                        return cursor.fetchall() if fetch_all else cursor.fetchone()
+            else:
+                with closing(self._get_connection(db_name)) as connection:
+                    with closing(connection.cursor()) as cursor:
+                        cursor.execute(sql, params)
+                        return cursor.fetchall() if fetch_all else cursor.fetchone()
+        except pymysql.OperationalError as e:
+            error_code = e.args[0] if e.args else None
+            if error_handling:
+                if error_code in (1045, 1049):  # 访问被拒绝或数据库不存在
+                    logger.error('数据库访问错误', {
+                        'SQL': sql,
+                        '参数': params,
+                        '库': db_name,
+                        '错误代码': error_code,
+                        '错误信息': str(e)
+                    })
+                else:
+                    logger.error('数据库操作错误', {
+                        '库': db_name,
+                        'SQL': sql,
+                        '参数': params,
+                        '错误代码': error_code,
+                        '错误信息': str(e)
+                    })
+                return None
+            raise
+        except Exception as e:
+            if error_handling:
+                logger.error('执行SQL查询失败', {
+                    '库': db_name,
+                    'SQL': sql,
+                    '参数': params,
+                    '错误类型': type(e).__name__,
+                    '错误信息': str(e)
+                })
+                return None
+            raise
+
+    def _get_table_info(self, db_name: str, table_name: str, info_type: Literal['columns', 'dtypes', 'exists'] = 'exists') -> Union[bool, List[Dict[str, Any]], List[str]]:
         """
-
+        获取表信息的通用方法。
 
         Args:
-            sql: SQL查询语句
-            params: 查询参数
             db_name: 数据库名
-
+            table_name: 表名
+            info_type: 信息类型
+                - 'exists': 检查表是否存在(默认)
+                - 'columns': 获取列名列表
+                - 'dtypes': 获取列名和类型
+
         Returns:
-
+            根据info_type返回不同类型的信息:
+            - 'exists': 返回bool,表示表是否存在
+            - 'columns': 返回列名列表
+            - 'dtypes': 返回列名和类型的列表
         """
         try:
-
-
-
-
+            if info_type == 'exists':
+                result = self._execute_query("SHOW DATABASES LIKE %s", (db_name,))
+                if not result:
+                    all_dbs = self._execute_query("SHOW DATABASES")
+                    available_dbs = [db['Database'] for db in all_dbs] if all_dbs else []
+                    logger.info('数据库不存在', {
+                        '库': db_name,
+                        '可用的数据库': available_dbs,
+                        '可能的原因': '数据库名称错误或没有访问权限'
+                    })
+                    return False
+
+                result = self._execute_query("SHOW TABLES LIKE %s", (table_name,), db_name=db_name)
+                if not result:
+                    all_tables = self._execute_query("SHOW TABLES", db_name=db_name)
+                    available_tables = [table[f'Tables_in_{db_name}'] for table in all_tables] if all_tables else []
+                    logger.info('表不存在', {
+                        '库': db_name,
+                        '表': table_name,
+                        '可用的表': available_tables,
+                        '可能的原因': '表名称错误或没有访问权限'
+                    })
+                    return False
+                return True
+
+            elif info_type == 'columns':
+                sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
+                result = self._execute_query(sql, (db_name, table_name))
+                return [col['COLUMN_NAME'] for col in result] if result else []
+
+            elif info_type == 'dtypes':
+                sql = 'SELECT COLUMN_NAME, COLUMN_TYPE FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
+                return self._execute_query(sql, (db_name, table_name)) or []
+
         except Exception as e:
-            logger.error('
-                '
-                '
-                '
+            logger.error('获取表信息失败', {
+                '库': db_name,
+                '表': table_name,
+                '信息类型': info_type,
                 '错误类型': type(e).__name__,
                 '错误信息': str(e)
             })
-            return
+            return [] if info_type != 'exists' else False
 
-    def
-        """
-
+    def check_infos(self, db_name: str, table_name: str) -> bool:
+        """检查数据库和数据表是否存在"""
+        return self._get_table_info(db_name, table_name, 'exists')
+
+    def _format_columns(self, columns: List[str]) -> str:
+        """格式化列名列表为SQL语句"""
+        return ', '.join([f'`{col}`' for col in columns])
+
+    def columns_to_list(self, db_name: str, table_name: str, columns_name: List[str], where: str = None) -> List[Dict[str, Any]]:
+        """获取数据表的指定列数据"""
+        if not self._get_table_info(db_name, table_name):
+            return []
 
-
-        db_name
-
-            condition: SQL条件字符串(不含WHERE)
-            columns: 查询字段字符串或以逗号分隔的字段名,默认'更新时间'
+        try:
+            existing_columns = self._get_table_info(db_name, table_name, 'columns')
+            columns_name = [col for col in columns_name if col in existing_columns]
 
-
-
-
-
-
+            if not columns_name:
+                logger.info('未找到匹配的列名', {'库': db_name, '表': table_name, '请求列': columns_name})
+                return []
+
+            sql = f"SELECT {self._format_columns(columns_name)} FROM `{db_name}`.`{table_name}`"
+            if where:
+                sql += f" WHERE {where}"
 
+            logger.debug('执行列查询', {'库': db_name, '表': table_name, 'SQL': sql})
+            return self._execute_query(sql, db_name=db_name) or []
+
+        except Exception as e:
+            logger.error('列查询失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
+            return []
+
+    def dtypes_to_list(self, db_name: str, table_name: str, columns_name: List[str] = None) -> List[Dict[str, Any]]:
+        """获取数据表的列名和类型"""
+        if not self._get_table_info(db_name, table_name):
+            return []
+
+        try:
+            result = self._get_table_info(db_name, table_name, 'dtypes')
+            if columns_name:
+                columns_name = set(columns_name)
+                result = [row for row in result if row['COLUMN_NAME'] in columns_name]
+            return result
+        except Exception as e:
+            logger.error('获取列类型失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
+            return []
+
+    def check_condition(self, db_name: str, table_name: str, condition: str, columns: str = '更新时间') -> Optional[List[Dict[str, Any]]]:
+        """按指定条件查询数据库表"""
+        if not self._get_table_info(db_name, table_name):
+            return None
+
         sql = f"SELECT {columns} FROM `{table_name}` WHERE {condition}"
         logger.debug('执行SQL查询', {'库': db_name, '表': table_name, 'SQL': sql})
         return self._execute_query(sql, db_name=db_name)
@@ -598,98 +704,6 @@ class QueryDatas:
                 df[col] = df[col].astype(float)
         return df
 
-    # @_execute_with_retry
-    def columns_to_list(self, db_name, table_name, columns_name, where: str = None) -> list:
-        """
-        获取数据表的指定列, 支持where条件筛选, 返回列表字典。
-        :param db_name: 数据库名
-        :param table_name: 表名
-        :param columns_name: 需要获取的列名列表
-        :param where: 可选,SQL条件字符串(不含WHERE)
-        :return: [{列1:值, 列2:值, ...}, ...]
-        """
-        if not self.check_infos(db_name, table_name):
-            return []
-
-        try:
-            with closing(self._get_connection(db_name)) as connection:
-                with closing(connection.cursor()) as cursor:
-                    sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
-                    cursor.execute(sql, (db_name, table_name))
-                    cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()]
-                    columns_name = [item for item in columns_name if item in cols_exist]
-                    if not columns_name:
-                        logger.info('未找到匹配的列名', {'库': db_name, '表': table_name, '请求列': columns_name})
-                        return []
-                    columns_in = ', '.join([f'`{col}`' for col in columns_name])
-                    sql = f"SELECT {columns_in} FROM `{db_name}`.`{table_name}`"
-                    if where:
-                        sql += f" WHERE {where}"
-                    logger.debug('执行列查询', {'库': db_name, '表': table_name, 'SQL': sql})
-                    cursor.execute(sql)
-                    column_values = cursor.fetchall()
-                    return column_values
-        except Exception as e:
-            logger.error('列查询失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
-            return []
-
-    # @_execute_with_retry
-    def dtypes_to_list(self, db_name, table_name, columns_name=None) -> list:
-        """
-        获取数据表的列名和类型, 支持只返回部分字段类型。
-        :param db_name: 数据库名
-        :param table_name: 表名
-        :param columns_name: 可选,字段名列表,仅返回这些字段的类型
-        :return: [{'COLUMN_NAME': ..., 'COLUMN_TYPE': ...}, ...]
-        """
-        if not self.check_infos(db_name, table_name):
-            return []
-
-        try:
-            with closing(self._get_connection(db_name)) as connection:
-                with closing(connection.cursor()) as cursor:
-                    sql = 'SELECT COLUMN_NAME, COLUMN_TYPE FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
-                    cursor.execute(sql, (db_name, table_name))
-                    column_name_and_type = cursor.fetchall()
-                    if columns_name:
-                        columns_name = set(columns_name)
-                        column_name_and_type = [row for row in column_name_and_type if row['COLUMN_NAME'] in columns_name]
-                    return column_name_and_type
-        except Exception as e:
-            logger.error('获取列类型失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
-            return []
-
-    # @_execute_with_retry
-    def check_infos(self, db_name, table_name) -> bool:
-        """
-        检查数据库和数据表是否存在。
-        :param db_name: 数据库名
-        :param table_name: 表名
-        :return: 存在返回True,否则False
-        """
-        try:
-            # 检查数据库是否存在
-            result = self._execute_query("SHOW DATABASES LIKE %s", (db_name,))
-            if not result:
-                logger.info('数据库不存在', {'库': db_name})
-                return False
-
-            # 检查表是否存在
-            result = self._execute_query("SHOW TABLES LIKE %s", (table_name,), db_name=db_name)
-            if not result:
-                logger.info('表不存在', {'库': db_name, '表': table_name})
-                return False
-            return True
-
-        except Exception as e:
-            logger.error('检查数据库或表失败', {
-                '库': db_name,
-                '表': table_name,
-                '错误类型': type(e).__name__,
-                '错误信息': str(e)
-            })
-            return False
-
     def __enter__(self):
         """上下文管理器入口"""
         return self
@@ -703,7 +717,7 @@ class QueryDatas:
         if hasattr(self, 'pool') and self.pool is not None:
             try:
                 self.pool.close()
-                logger.
+                logger.debug('连接池已关闭', {
                     '主机': self.host,
                     '端口': self.port
                 })
@@ -772,6 +786,8 @@ class QueryDatas:
             - 当return_format='list_dict'时,返回列表字典
             - 如果查询失败,返回空的DataFrame或空列表
         """
+        start_time = time.time()
+
         if not db_name or not table_name:
             logger.error('数据库名和表名不能为空', {'库': db_name, '表': table_name})
             return [] if return_format == 'list_dict' else pd.DataFrame()
@@ -786,7 +802,7 @@ class QueryDatas:
         start_date, end_date = self._validate_date_range(start_date, end_date, db_name, table_name)
 
         # 检查数据库和表是否存在
-        if not self.
+        if not self._get_table_info(db_name, table_name):
            return [] if return_format == 'list_dict' else pd.DataFrame()
        try:
            with closing(self._get_connection(db_name)) as connection:
@@ -863,7 +879,7 @@ class QueryDatas:
                 target_time = 1.0  # 期望每批1秒
 
                 while offset < total_count:
-
+                    _p_time = time.time()
                     # 添加分页参数
                     page_sql = f"{base_sql} LIMIT %s OFFSET %s"
                     page_params = list(params) + [page_size, offset]
@@ -881,7 +897,7 @@ class QueryDatas:
                     else:
                         all_results = pd.concat([all_results, pd.DataFrame(page_results)], ignore_index=True)
 
-                    duration = time.time() -
+                    duration = time.time() - _p_time
                     page_size = self._adjust_page_size(duration, page_size, min_size, max_size, target_time)
                     offset += len(page_results)
                     logger.debug('分页查询进度', {
@@ -896,6 +912,21 @@ class QueryDatas:
 
             if return_format == 'df' and isinstance(all_results, pd.DataFrame) and not all_results.empty:
                 all_results = self._convert_decimal_columns(all_results)
+            logger.info('查询完成', {
+                '库': db_name,
+                '表': table_name,
+                '总记录数': total_count,
+                '已获取记录数': len(all_results) if return_format == 'list_dict' else len(all_results.index),
+                '查询耗时': f'{time.time() - start_time:.2f}s',
+                '查询参数': {
+                    '开始日期': start_date,
+                    '结束日期': end_date,
+                    '日期字段': date_field,
+                    '限制行数': limit,
+                    '分页大小': page_size,
+                    '返回数据格式': return_format,
+                }
+            })
             return all_results
 
         except Exception as e:
@@ -903,7 +934,15 @@ class QueryDatas:
                 '库': db_name,
                 '表': table_name,
                 '错误类型': type(e).__name__,
-                '错误信息': str(e)
+                '错误信息': str(e),
+                '查询参数': {
+                    '开始日期': start_date,
+                    '结束日期': end_date,
+                    '日期字段': date_field,
+                    '限制行数': limit,
+                    '分页大小': page_size,
+                    '返回数据格式': return_format,
+                }
             })
             return [] if return_format == 'list_dict' else pd.DataFrame()
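Taken together, the refactor folds check_infos, columns_to_list and dtypes_to_list into thin wrappers over _get_table_info(), which in turn routes everything through _execute_query(). A short usage sketch of the resulting public surface (database, table and column names below are illustrative, not from the package):

from mdbq.mysql import s_query

# QueryDatas is a context manager; exiting the block closes the pool.
with s_query.QueryDatas(username='user', password='password',
                        host='localhost', port=3306, pool_size=20) as qd:
    if qd.check_infos('my_db', 'my_table'):              # info_type='exists'
        cols = qd.columns_to_list(                       # info_type='columns'
            'my_db', 'my_table',
            columns_name=['日期', '更新时间'],
            where="日期 >= '2025-01-01'",
        )
        dtypes = qd.dtypes_to_list('my_db', 'my_table')  # info_type='dtypes'
        print(cols, dtypes)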
mdbq/mysql/uploader.py
CHANGED
mdbq/spider/aikucun.py
CHANGED
@@ -25,9 +25,7 @@ config_file = os.path.join(dir_path, 'spd.txt')
 content = config.read_config(file_path=config_file)
 username, password, host, port = content['username'], content['password'], content['host'], content['port']
 
-uld = uploader.MySQLUploader(username=username, password=password, host=host, port=int(port), pool_size=10)
 # 实例化一个数据查询类,用来获取 cookies 表数据
-download = s_query.QueryDatas(username=username, password=password, host=host, port=port)
 logger = mylogger.MyLogger(
     logging_mode='file',
     log_level='info',
@@ -48,15 +46,15 @@ def keep_connect(_db_name, _config, max_try: int=10):
             connection = pymysql.connect(**_config)  # 连接数据库
             return connection
         except Exception as e:
-            logger.error(
+            logger.error('连接失败', {'数据库': _db_name, '主机': host, '端口': port, '重试次数': attempts, '最大重试次数': max_try, '错误信息': e})
             attempts += 1
             time.sleep(30)
-    logger.error(
+    logger.error('连接失败', {'数据库': _db_name, '主机': host, '端口': port, '重试次数': attempts, '最大重试次数': max_try})
     return None
 
 
 class AikuCun:
-    def __init__(self):
+    def __init__(self, uld_manager, download_manager):
         self.url = 'https://gray-merc.aikucun.com/index.html'
         self.db_name = 'cookie文件'
         self.table_name = 'main_aikucun'
@@ -66,6 +64,8 @@ class AikuCun:
         self.start_date = (self.today - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
         self.end_date = (self.today - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
         self.error_count = 0
+        self.uld = uld_manager
+        self.download = download_manager
 
     def logining(self, shop_name='aikucun', headless=False):
         option = webdriver.ChromeOptions()
@@ -171,7 +171,7 @@ class AikuCun:
 
     def save_token(self):
         if not self.token:
-
+            logger.error('self.token 不能为空')
             return
         set_typ = {
             '日期': 'DATE',
@@ -182,11 +182,11 @@ class AikuCun:
             '更新时间': 'timestamp'
         }
         # 更新至数据库记录
-        uld.upload_data(
+        self.uld.upload_data(
             db_name=self.db_name,
             table_name=self.table_name,
             data=self.token,
-            set_typ=
+            set_typ=set_typ,
             primary_keys=[],
             check_duplicate=False,
             update_on_duplicate=False,
@@ -209,7 +209,7 @@ class AikuCun:
         self.end_date = end_date
         date_list = otk.dates_between(start_date=self.start_date, end_date=self.end_date)
 
-        df = download.data_to_df(
+        df = self.download.data_to_df(
             db_name=self.db_name,
             table_name=self.table_name,
             start_date='2025-03-07',
@@ -230,7 +230,7 @@ class AikuCun:
         idx = df.groupby(['平台', '店铺名称'])['更新时间'].idxmax()
         df = df.loc[idx][['token']]
         if len(df) == 0:
-
+            logger.error(f'从数据库获取的 token 不能为空')
             return
         self.token = df.iloc[0, 0]
 
@@ -247,7 +247,7 @@ class AikuCun:
         results = []
         for date in date_list:
             if self.error_count > 5:
-
+                logger.logger('已退出请求 -> self.error_count > 5')
                 break
             req_date = re.sub('-', '', date)
             data = {
@@ -273,16 +273,15 @@ class AikuCun:
                 # cookies=cookies,
                 data=json.dumps(data)
             )
-
-            # print(res.json())
+            logger.info('获取数据', {'进度': num/len(date_list), '日期': date, '榜单类型': item_type})
             if not res.json().get('success', None):
-
+                logger.error('没有获取到数据, 请求不成功, 如果连续请求失败 > 5, 则需重新获取cookie后继续')
                 num += 1
                 self.error_count += 1
                 time.sleep(1)
                 continue
             if not res.json().get('data', {}).get('rows', None):
-
+                logger.error("返回的数据字典异常, ['data']['rows'] 不能为空")
                 num += 1
                 self.error_count += 1
                 time.sleep(1)
@@ -291,7 +290,7 @@ class AikuCun:
             num += 1
             time.sleep(1)
             if num % 32 == 0:
-
+                logger.info("避免频繁请求, 正在休眠...")
                 # time.sleep(60)
 
         return results
@@ -413,18 +412,18 @@ class AikuCun:
             '尺码': 'varchar(50)',
             '货号': 'varchar(50)',  # 款号 + 颜色编码
         }
-
+        logger.info('更新数据库', {'店铺名称': self.shop_name, '库': db_name, '表': table_name})
         if 'spu' in table_name:
             drop_dup = ['日期', '平台', '店铺名称', '商品款号', '访客量']
         else:
             drop_dup = ['日期', '平台', '店铺名称', '条码']
-        uld.upload_data(
+        self.uld.upload_data(
             db_name=db_name,
             table_name=table_name,
             data=_results,
             set_typ=set_typ,  # 定义列和数据类型
             primary_keys=[],  # 创建唯一主键
-            check_duplicate=
+            check_duplicate=False,  # 检查重复数据
             update_on_duplicate=False,  # 遇到重复时更新数据,默认 False 跳过
             duplicate_columns=drop_dup,  # 指定排重的组合键
             allow_null=False,  # 允许插入空值
@@ -470,36 +469,44 @@ class AikuCun:
             headers=headers,
             data=json.dumps(data)
         )
-        print(res.json())
 
 
 def main(start_date, end_date=None, item_type=['spu']):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    db_config = {
+        'username': username,
+        'password': password,
+        'host': host,
+        'port': int(port),
+        'pool_size': 3
+    }
+    with uploader.MySQLUploader(**db_config) as uld:
+        with s_query.QueryDatas(**db_config) as download:
+            ak = AikuCun(uld_manager=uld, download_manager=download)
+            # ak.get_sign()
+            for type_ in item_type:
+                if type_ not in ['spu', 'sku']:
+                    logger.error(f'{item_type} 非法参数: {type_}')
+                    continue
+                for i in range(2):
+                    data_list = ak.get_data_from_bbx(
+                        start_date=start_date,
+                        end_date=end_date,
+                        item_type=type_,
+                        page_num=1,
+                        page_size=300
+                    )
+                    if not data_list:
+                        ak.logining()
+                        ak.save_token()
+                        ak.error_count = 0  # 重置错误计数器
+                    else:
+                        break
 
-
-
-
-
-
+                ak.insert_datas(
+                    data_list=data_list,
+                    db_name='爱库存2',
+                    table_name=f'{type_}榜单'
+                )
 
 
 
@@ -508,7 +515,7 @@ if __name__ == '__main__':
         start_date='2025-05-13',
         # end_date='2025-04-28',  # 不传则默认到今天
         item_type=[
-
+            'spu',
             'sku'
         ]
     )
{mdbq-4.0.9.dist-info → mdbq-4.0.11.dist-info}/RECORD
CHANGED
@@ -1,18 +1,17 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=PQJs_Lgx6OvamcsXbLCVuBAvLc7j2xwJDZEWigwyUy8,18
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
-mdbq/aggregation/query_data.py,sha256=
+mdbq/aggregation/query_data.py,sha256=SM8cS9lBKmhLBQdwJz-sRu9bl7w1HS0MEq10s6Tqf_0,166777
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=9w_o5mYB3FooIxobq_lSa6oCYTKIhPxDFox-jeLtUHI,21714
-mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=
+mdbq/mysql/deduplicator.py,sha256=fS1dSs92vN15tuqmAKrUVdKk6z9dwW_Fe9WHMBYsy2U,73172
 mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
-mdbq/mysql/s_query.py,sha256=
+mdbq/mysql/s_query.py,sha256=RPC-KZVuqPlCSmpmtUmYAOJdxJT01i0DvlIbmum4MxM,42882
 mdbq/mysql/unique_.py,sha256=Wgqq_PjAAD757JTa10wjYaJgssZ_C_ypU6DW56jbuyw,21074
-mdbq/mysql/uploader.py,sha256=
+mdbq/mysql/uploader.py,sha256=wX2gHhVQJwGErnjUbLnsljkZ8Yd3YK-HS3P7q8DizAA,81053
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -24,8 +23,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
 mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=vpBuNc22uj9Vr-_Dh25_wpwWM1e-072EAAIBdB_IpL0,23494
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq/spider/aikucun.py,sha256=
-mdbq-4.0.9.dist-info/METADATA,sha256=
-mdbq-4.0.9.dist-info/WHEEL,sha256=
-mdbq-4.0.9.dist-info/top_level.txt,sha256=
-mdbq-4.0.9.dist-info/RECORD,,
+mdbq/spider/aikucun.py,sha256=7oquQ2RIJr6B1xblQMfnmHzteOlvHA7dIcPRaAPfHBc,21546
+mdbq-4.0.11.dist-info/METADATA,sha256=zZh35aA-suJ3B_v39Mw8V_O2GSdOLdylfNPl_E99uqQ,364
+mdbq-4.0.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-4.0.11.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-4.0.11.dist-info/RECORD,,
mdbq/log/spider_logging.py
DELETED
@@ -1,47 +0,0 @@
-import logging
-from logging.handlers import RotatingFileHandler
-import platform
-import os
-import sys
-import getpass
-
-
-def setup_logging(reMoveOldHandler=True, filename='spider_tg.log'):
-    """
-    reMoveOldHandler: 替换根日志记录器的所有现有处理器
-    """
-    dir_path = os.path.expanduser("~")
-    if not os.path.isdir(os.path.join(dir_path, 'logfile')):
-        os.makedirs(os.path.join(dir_path, 'logfile'))
-
-    log_file = os.path.join(dir_path, 'logfile', filename)
-    file_handler = RotatingFileHandler(
-        filename=log_file,
-        maxBytes=3*1024*1024,  # 3MB
-        backupCount=10,
-        encoding='utf-8'  # 明确指定编码(避免Windows乱码)
-    )
-    stream_handler = logging.StreamHandler()  # 终端输出Handler
-    formatter = logging.Formatter(
-        fmt='[%(asctime)s] %(levelname)s %(message)s',
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
-    file_handler.setFormatter(formatter)
-    stream_handler.setFormatter(formatter)  # 终端使用相同格式
-    file_handler.setLevel(logging.INFO)
-    stream_handler.setLevel(logging.INFO)
-
-    # 获取根日志记录器并添加Handler
-    logger = logging.getLogger()
-    if reMoveOldHandler:
-        # 移除根日志记录器的所有现有处理器
-        for handler in logger.handlers[:]:  # 使用[:]来创建handlers列表的一个副本,因为我们在迭代时修改列表
-            logger.removeHandler(handler)
-    logger.addHandler(file_handler)
-    logger.addHandler(stream_handler)
-    logger.setLevel(logging.INFO)  # 设置根日志级别
-    return logger
-
-
-if __name__ == '__main__':
-    pass
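setup_logging() has no direct replacement in 4.0.11; the spider module now builds its logger via mdbq.log.mylogger (see aikucun.py above). For downstream code that imported setup_logging(), a standard-library equivalent of the deleted behavior (a sketch mirroring the removed module, not part of mdbq):

import logging
import os
from logging.handlers import RotatingFileHandler

def setup_logging(filename='spider_tg.log'):
    # Rotating file handler plus console output on the root logger,
    # matching the deleted module's limits (3MB per file, 10 backups).
    log_dir = os.path.join(os.path.expanduser('~'), 'logfile')
    os.makedirs(log_dir, exist_ok=True)
    formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    file_handler = RotatingFileHandler(os.path.join(log_dir, filename),
                                       maxBytes=3 * 1024 * 1024,
                                       backupCount=10, encoding='utf-8')
    stream_handler = logging.StreamHandler()
    root = logging.getLogger()
    for handler in root.handlers[:]:  # drop existing handlers, as the old module did
        root.removeHandler(handler)
    for handler in (file_handler, stream_handler):
        handler.setFormatter(formatter)
        handler.setLevel(logging.INFO)
        root.addHandler(handler)
    root.setLevel(logging.INFO)
    return root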
{mdbq-4.0.9.dist-info → mdbq-4.0.11.dist-info}/WHEEL
File without changes
{mdbq-4.0.9.dist-info → mdbq-4.0.11.dist-info}/top_level.txt
File without changes