mdbq 4.0.9__py3-none-any.whl → 4.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
-VERSION = '4.0.9'
+VERSION = '4.0.11'
mdbq/aggregation/query_data.py CHANGED
@@ -1557,7 +1557,6 @@ class MysqlDatasQuery:
             'unique_keys': [['日期', '店铺id', '商品id']],  # 唯一约束列表
         }
 
-
     @upload_data_decorator()
     def spph(self, db_name='聚合数据', table_name='天猫_商品排行'):
         """ """
@@ -3677,6 +3676,7 @@ def query3(months=1, download_manager=None):
 
 
 def main(months=3):
+    logger.info('数据聚合任务开始')
     # 1. 更新日期表 更新货品年份基准表, 属性设置 3 - 货品年份基准
     date_table()
     # 2. 数据聚合
@@ -3685,11 +3685,12 @@ def main(months=3):
        password=password,
        host=host,
        port=port,
-       maxconnections=30,
+       maxconnections=20,
    )
    query1(download_manager=download_manager, months=months)
    query2(download_manager=download_manager, months=months)
    query3(download_manager=download_manager, months=months)
+   logger.info('数据聚合完成')
 
 
 if __name__ == '__main__':
mdbq/mysql/deduplicator.py CHANGED
@@ -6,6 +6,7 @@ import warnings
 import pymysql
 import os
 from mdbq.log import mylogger
+from mdbq.config import config
 from typing import List, Dict, Optional, Any, Tuple
 from dbutils.pooled_db import PooledDB
 import threading
@@ -1348,7 +1349,7 @@ class MySQLDeduplicator:
 
 
 def main():
-    from mdbq.config import config
+    logger.info('去重任务开始')
     dir_path = os.path.expanduser("~")
     my_cont = config.read_config(file_path=os.path.join(dir_path, 'spd.txt'))
     username, password, host, port = my_cont['username'], my_cont['password'], my_cont['host'], int(my_cont['port'])
@@ -1401,6 +1402,8 @@ def main():
 
     # 关闭连接
    deduplicator.close()
+   logger.info('去重任务结束')
+
 
 if __name__ == '__main__':
     main()
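
Note: the hunks above move the config import to module scope and bracket main() with start/end logs, so the module now assumes config is importable at load time. For reference, a minimal sketch of the credential-loading pattern this file relies on; the path and keys are taken from the hunk, while the MySQLDeduplicator constructor arguments are not shown in this diff and are assumed:

    import os
    from mdbq.config import config  # module-level import as of 4.0.11, per the hunk above

    dir_path = os.path.expanduser("~")
    my_cont = config.read_config(file_path=os.path.join(dir_path, 'spd.txt'))
    username, password, host, port = (
        my_cont['username'], my_cont['password'], my_cont['host'], int(my_cont['port'])
    )
    # Constructor kwargs assumed for illustration; only close() is visible in this diff.
    # deduplicator = MySQLDeduplicator(username=username, password=password, host=host, port=port)
    # ...
    # deduplicator.close()
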
mdbq/mysql/s_query.py CHANGED
@@ -35,7 +35,7 @@ class QueryDatas:
     """
 
     def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4',
-                 maxconnections: int = 20, mincached: int = 2, maxcached: int = 5,
+                 pool_size: int = 20, mincached: int = 2, maxcached: int = 5,
                  connect_timeout: int = 10, read_timeout: int = 30, write_timeout: int = 30,
                  max_retries: int = 3, retry_waiting_time: int = 5, collation: str = 'utf8mb4_0900_ai_ci') -> None:
         """
@@ -47,9 +47,9 @@ class QueryDatas:
            host: 数据库主机
            port: 数据库端口
            charset: 字符集,默认utf8mb4
-           maxconnections: 最大连接数,默认20
-           mincached: 最小缓存连接数,默认2
-           maxcached: 最大缓存连接数,默认5
+           pool_size: 最大活动连接数,默认20
+           mincached: 最小缓存连接数,空闲连接数量,默认2
+           maxcached: 最大缓存连接数,最大空闲连接数,默认5
            connect_timeout: 连接超时时间,默认10秒
            read_timeout: 读取超时时间,默认30秒
            write_timeout: 写入超时时间,默认30秒
@@ -87,14 +87,14 @@ class QueryDatas:
            'write_timeout': write_timeout,
            'autocommit': True
        }
-       self.pool = self._create_connection_pool(maxconnections, mincached, maxcached)
+       self.pool = self._create_connection_pool(pool_size, mincached, maxcached)
 
-    def _create_connection_pool(self, maxconnections: int, mincached: int, maxcached: int) -> PooledDB:
+    def _create_connection_pool(self, pool_size: int, mincached: int, maxcached: int) -> PooledDB:
        """
        创建数据库连接池
 
        Args:
-           maxconnections: 最大连接数
+           pool_size: 最大连接数
            mincached: 最小缓存连接数
            maxcached: 最大缓存连接数
 
@@ -122,7 +122,7 @@ class QueryDatas:
        }
        pool_params = {
            'creator': pymysql,
-           'maxconnections': maxconnections,
+           'maxconnections': pool_size,
            'mincached': mincached,
            'maxcached': maxcached,
            'blocking': True,
@@ -133,7 +133,7 @@ class QueryDatas:
        try:
            pool = PooledDB(**pool_params, **connection_params)
            logger.debug('连接池创建成功', {
-               '连接池大小': maxconnections,
+               '连接池大小': pool_size,
                '最小缓存': mincached,
                '最大缓存': maxcached,
                '主机': self.host,
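
Callers constructing QueryDatas directly must rename the keyword: maxconnections is now pool_size (internally it is still handed to PooledDB as maxconnections, as the hunk above shows). A minimal sketch, assuming the credential variables are already loaded:

    from mdbq.mysql import s_query

    # 4.0.9: s_query.QueryDatas(..., maxconnections=20)
    qd = s_query.QueryDatas(
        username=username, password=password,
        host=host, port=int(port),
        pool_size=20,   # max active connections (renamed kwarg in 4.0.11)
        mincached=2,    # idle connections kept warm
        maxcached=5,    # cap on idle connections
    )
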
@@ -253,20 +253,8 @@ class QueryDatas:
 
     # @_execute_with_retry
     def _get_connection(self, db_name: Optional[str] = None) -> pymysql.connections.Connection:
-        """
-        从连接池获取数据库连接
-
-        Args:
-            db_name: 可选的数据库名,如果提供则会在连接后选择该数据库
-
-        Returns:
-            数据库连接对象
-
-        Raises:
-            ConnectionError: 当获取连接失败时抛出
-        """
+        """从连接池获取数据库连接"""
         try:
-            # 只在连续失败次数达到阈值时检查健康状态
             if self._pool_stats['consecutive_failures'] >= self._pool_stats['max_consecutive_failures']:
                 if not self._check_pool_health():
                     logger.warning('连接池不健康,尝试重新创建')
@@ -282,66 +270,184 @@ class QueryDatas:
             error_code = e.args[0] if e.args else None
             if error_code in (2003, 2006, 2013):
                 logger.error('数据库连接错误', {
+                    '库': db_name,
                     '错误代码': error_code,
                     '错误信息': str(e),
-                    '数据库': db_name
                 })
                 self.pool = self._create_connection_pool(10, 2, 5)
                 self._pool_stats['consecutive_failures'] = 0
                 raise ConnectionError(f'数据库连接错误: {str(e)}')
-            else:
-                raise
+            raise
         except Exception as e:
             logger.error('从连接池获取数据库连接失败', {
+                '库': db_name,
                 '错误': str(e),
-                '数据库': db_name
             })
             raise ConnectionError(f'连接数据库失败: {str(e)}')
 
     # @_execute_with_retry
-    def _execute_query(self, sql: str, params: tuple = None, db_name: str = None) -> Optional[List[Dict[str, Any]]]:
+    def _execute_query(self, sql: str, params: tuple = None, db_name: str = None,
+                       fetch_all: bool = True, error_handling: bool = True) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
+        """执行SQL查询的通用方法"""
+        try:
+            if sql.upper().startswith('SHOW DATABASES'):
+                with closing(self._get_connection()) as connection:
+                    with closing(connection.cursor()) as cursor:
+                        cursor.execute(sql, params)
+                        return cursor.fetchall() if fetch_all else cursor.fetchone()
+            else:
+                with closing(self._get_connection(db_name)) as connection:
+                    with closing(connection.cursor()) as cursor:
+                        cursor.execute(sql, params)
+                        return cursor.fetchall() if fetch_all else cursor.fetchone()
+        except pymysql.OperationalError as e:
+            error_code = e.args[0] if e.args else None
+            if error_handling:
+                if error_code in (1045, 1049):  # 访问被拒绝或数据库不存在
+                    logger.error('数据库访问错误', {
+                        'SQL': sql,
+                        '参数': params,
+                        '库': db_name,
+                        '错误代码': error_code,
+                        '错误信息': str(e)
+                    })
+                else:
+                    logger.error('数据库操作错误', {
+                        '库': db_name,
+                        'SQL': sql,
+                        '参数': params,
+                        '错误代码': error_code,
+                        '错误信息': str(e)
+                    })
+                return None
+            raise
+        except Exception as e:
+            if error_handling:
+                logger.error('执行SQL查询失败', {
+                    '库': db_name,
+                    'SQL': sql,
+                    '参数': params,
+                    '错误类型': type(e).__name__,
+                    '错误信息': str(e)
+                })
+                return None
+            raise
+
+    def _get_table_info(self, db_name: str, table_name: str, info_type: Literal['columns', 'dtypes', 'exists'] = 'exists') -> Union[bool, List[Dict[str, Any]], List[str]]:
         """
-        执行SQL查询的通用方法。
+        获取表信息的通用方法。
 
         Args:
-            sql: SQL查询语句
-            params: 查询参数
             db_name: 数据库名
-
+            table_name: 表名
+            info_type: 信息类型
+                - 'exists': 检查表是否存在(默认)
+                - 'columns': 获取列名列表
+                - 'dtypes': 获取列名和类型
+
         Returns:
-            查询结果列表,如果查询失败返回None
+            根据info_type返回不同类型的信息:
+            - 'exists': 返回bool,表示表是否存在
+            - 'columns': 返回列名列表
+            - 'dtypes': 返回列名和类型的列表
         """
         try:
-            with closing(self._get_connection(db_name)) as connection:
-                with closing(connection.cursor()) as cursor:
-                    cursor.execute(sql, params)
-                    return cursor.fetchall()
+            if info_type == 'exists':
+                result = self._execute_query("SHOW DATABASES LIKE %s", (db_name,))
+                if not result:
+                    all_dbs = self._execute_query("SHOW DATABASES")
+                    available_dbs = [db['Database'] for db in all_dbs] if all_dbs else []
+                    logger.info('数据库不存在', {
+                        '库': db_name,
+                        '可用的数据库': available_dbs,
+                        '可能的原因': '数据库名称错误或没有访问权限'
+                    })
+                    return False
+
+                result = self._execute_query("SHOW TABLES LIKE %s", (table_name,), db_name=db_name)
+                if not result:
+                    all_tables = self._execute_query("SHOW TABLES", db_name=db_name)
+                    available_tables = [table[f'Tables_in_{db_name}'] for table in all_tables] if all_tables else []
+                    logger.info('表不存在', {
+                        '库': db_name,
+                        '表': table_name,
+                        '可用的表': available_tables,
+                        '可能的原因': '表名称错误或没有访问权限'
+                    })
+                    return False
+                return True
+
+            elif info_type == 'columns':
+                sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
+                result = self._execute_query(sql, (db_name, table_name))
+                return [col['COLUMN_NAME'] for col in result] if result else []
+
+            elif info_type == 'dtypes':
+                sql = 'SELECT COLUMN_NAME, COLUMN_TYPE FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
+                return self._execute_query(sql, (db_name, table_name)) or []
+
         except Exception as e:
-            logger.error('执行SQL查询失败', {
-                'SQL': sql,
-                '参数': params,
-                '数据库': db_name,
+            logger.error('获取表信息失败', {
+                '库': db_name,
+                '表': table_name,
+                '信息类型': info_type,
                 '错误类型': type(e).__name__,
                 '错误信息': str(e)
             })
-            return None
+            return [] if info_type != 'exists' else False
 
-    def check_condition(self, db_name: str, table_name: str, condition: str, columns: str = '更新时间') -> Optional[List[Dict[str, Any]]]:
-        """
-        按指定条件查询数据库表,返回满足条件的指定字段数据。
+    def check_infos(self, db_name: str, table_name: str) -> bool:
+        """检查数据库和数据表是否存在"""
+        return self._get_table_info(db_name, table_name, 'exists')
+
+    def _format_columns(self, columns: List[str]) -> str:
+        """格式化列名列表为SQL语句"""
+        return ', '.join([f'`{col}`' for col in columns])
+
+    def columns_to_list(self, db_name: str, table_name: str, columns_name: List[str], where: str = None) -> List[Dict[str, Any]]:
+        """获取数据表的指定列数据"""
+        if not self._get_table_info(db_name, table_name):
+            return []
 
-        Args:
-            db_name: 数据库名
-            table_name: 表名
-            condition: SQL条件字符串(不含WHERE)
-            columns: 查询字段字符串或以逗号分隔的字段名,默认'更新时间'
+        try:
+            existing_columns = self._get_table_info(db_name, table_name, 'columns')
+            columns_name = [col for col in columns_name if col in existing_columns]
 
-        Returns:
-            查询结果列表,如果查询失败返回None
-        """
-        if not self.check_infos(db_name, table_name):
-            return None
+            if not columns_name:
+                logger.info('未找到匹配的列名', {'库': db_name, '表': table_name, '请求列': columns_name})
+                return []
+
+            sql = f"SELECT {self._format_columns(columns_name)} FROM `{db_name}`.`{table_name}`"
+            if where:
+                sql += f" WHERE {where}"
 
+            logger.debug('执行列查询', {'库': db_name, '表': table_name, 'SQL': sql})
+            return self._execute_query(sql, db_name=db_name) or []
+
+        except Exception as e:
+            logger.error('列查询失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
+            return []
+
+    def dtypes_to_list(self, db_name: str, table_name: str, columns_name: List[str] = None) -> List[Dict[str, Any]]:
+        """获取数据表的列名和类型"""
+        if not self._get_table_info(db_name, table_name):
+            return []
+
+        try:
+            result = self._get_table_info(db_name, table_name, 'dtypes')
+            if columns_name:
+                columns_name = set(columns_name)
+                result = [row for row in result if row['COLUMN_NAME'] in columns_name]
+            return result
+        except Exception as e:
+            logger.error('获取列类型失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
+            return []
+
+    def check_condition(self, db_name: str, table_name: str, condition: str, columns: str = '更新时间') -> Optional[List[Dict[str, Any]]]:
+        """按指定条件查询数据库表"""
+        if not self._get_table_info(db_name, table_name):
+            return None
+
         sql = f"SELECT {columns} FROM `{table_name}` WHERE {condition}"
         logger.debug('执行SQL查询', {'库': db_name, '表': table_name, 'SQL': sql})
         return self._execute_query(sql, db_name=db_name)
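
The public lookups above now all funnel through _get_table_info, with one behavioral wrinkle worth noting: the list-returning helpers degrade to [] on failure while check_condition still returns None, so callers must not treat the two interchangeably. A short usage sketch of the consolidated surface; the database and table names are borrowed from the query_data.py hunks and are illustrative only:

    # qd is a QueryDatas instance as constructed above
    if qd.check_infos('聚合数据', '天猫_商品排行'):                      # exists-check
        dtypes = qd.dtypes_to_list('聚合数据', '天猫_商品排行')           # [{'COLUMN_NAME': ..., 'COLUMN_TYPE': ...}, ...]
        rows = qd.columns_to_list('聚合数据', '天猫_商品排行',
                                  columns_name=['日期', '店铺id'],
                                  where="日期 >= '2025-05-01'")
        recent = qd.check_condition('聚合数据', '天猫_商品排行',
                                    condition="更新时间 >= '2025-05-01'")  # None if the table is missing
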
@@ -598,98 +704,6 @@ class QueryDatas:
                     df[col] = df[col].astype(float)
         return df
 
-    # @_execute_with_retry
-    def columns_to_list(self, db_name, table_name, columns_name, where: str = None) -> list:
-        """
-        获取数据表的指定列, 支持where条件筛选, 返回列表字典。
-        :param db_name: 数据库名
-        :param table_name: 表名
-        :param columns_name: 需要获取的列名列表
-        :param where: 可选,SQL条件字符串(不含WHERE)
-        :return: [{列1:值, 列2:值, ...}, ...]
-        """
-        if not self.check_infos(db_name, table_name):
-            return []
-
-        try:
-            with closing(self._get_connection(db_name)) as connection:
-                with closing(connection.cursor()) as cursor:
-                    sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
-                    cursor.execute(sql, (db_name, table_name))
-                    cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()]
-                    columns_name = [item for item in columns_name if item in cols_exist]
-                    if not columns_name:
-                        logger.info('未找到匹配的列名', {'库': db_name, '表': table_name, '请求列': columns_name})
-                        return []
-                    columns_in = ', '.join([f'`{col}`' for col in columns_name])
-                    sql = f"SELECT {columns_in} FROM `{db_name}`.`{table_name}`"
-                    if where:
-                        sql += f" WHERE {where}"
-                    logger.debug('执行列查询', {'库': db_name, '表': table_name, 'SQL': sql})
-                    cursor.execute(sql)
-                    column_values = cursor.fetchall()
-                    return column_values
-        except Exception as e:
-            logger.error('列查询失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
-            return []
-
-    # @_execute_with_retry
-    def dtypes_to_list(self, db_name, table_name, columns_name=None) -> list:
-        """
-        获取数据表的列名和类型, 支持只返回部分字段类型。
-        :param db_name: 数据库名
-        :param table_name: 表名
-        :param columns_name: 可选,字段名列表,仅返回这些字段的类型
-        :return: [{'COLUMN_NAME': ..., 'COLUMN_TYPE': ...}, ...]
-        """
-        if not self.check_infos(db_name, table_name):
-            return []
-
-        try:
-            with closing(self._get_connection(db_name)) as connection:
-                with closing(connection.cursor()) as cursor:
-                    sql = 'SELECT COLUMN_NAME, COLUMN_TYPE FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
-                    cursor.execute(sql, (db_name, table_name))
-                    column_name_and_type = cursor.fetchall()
-                    if columns_name:
-                        columns_name = set(columns_name)
-                        column_name_and_type = [row for row in column_name_and_type if row['COLUMN_NAME'] in columns_name]
-                    return column_name_and_type
-        except Exception as e:
-            logger.error('获取列类型失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
-            return []
-
-    # @_execute_with_retry
-    def check_infos(self, db_name, table_name) -> bool:
-        """
-        检查数据库和数据表是否存在。
-        :param db_name: 数据库名
-        :param table_name: 表名
-        :return: 存在返回True,否则False
-        """
-        try:
-            # 检查数据库是否存在
-            result = self._execute_query("SHOW DATABASES LIKE %s", (db_name,))
-            if not result:
-                logger.info('数据库不存在', {'库': db_name})
-                return False
-
-            # 检查表是否存在
-            result = self._execute_query("SHOW TABLES LIKE %s", (table_name,), db_name=db_name)
-            if not result:
-                logger.info('表不存在', {'库': db_name, '表': table_name})
-                return False
-            return True
-
-        except Exception as e:
-            logger.error('检查数据库或表失败', {
-                '库': db_name,
-                '表': table_name,
-                '错误类型': type(e).__name__,
-                '错误信息': str(e)
-            })
-            return False
-
-
     def __enter__(self):
         """上下文管理器入口"""
         return self
@@ -703,7 +717,7 @@ class QueryDatas:
         if hasattr(self, 'pool') and self.pool is not None:
             try:
                 self.pool.close()
-                logger.info('连接池已关闭', {
+                logger.debug('连接池已关闭', {
                     '主机': self.host,
                     '端口': self.port
                 })
@@ -772,6 +786,8 @@ class QueryDatas:
            - 当return_format='list_dict'时,返回列表字典
            - 如果查询失败,返回空的DataFrame或空列表
        """
+       start_time = time.time()
+
        if not db_name or not table_name:
            logger.error('数据库名和表名不能为空', {'库': db_name, '表': table_name})
            return [] if return_format == 'list_dict' else pd.DataFrame()
@@ -786,7 +802,7 @@ class QueryDatas:
        start_date, end_date = self._validate_date_range(start_date, end_date, db_name, table_name)
 
        # 检查数据库和表是否存在
-       if not self.check_infos(db_name, table_name):
+       if not self._get_table_info(db_name, table_name):
            return [] if return_format == 'list_dict' else pd.DataFrame()
        try:
            with closing(self._get_connection(db_name)) as connection:
@@ -863,7 +879,7 @@ class QueryDatas:
                    target_time = 1.0  # 期望每批1秒
 
                    while offset < total_count:
-                       start_time = time.time()
+                       _p_time = time.time()
                        # 添加分页参数
                        page_sql = f"{base_sql} LIMIT %s OFFSET %s"
                        page_params = list(params) + [page_size, offset]
@@ -881,7 +897,7 @@ class QueryDatas:
                        else:
                            all_results = pd.concat([all_results, pd.DataFrame(page_results)], ignore_index=True)
 
-                       duration = time.time() - start_time
+                       duration = time.time() - _p_time
                        page_size = self._adjust_page_size(duration, page_size, min_size, max_size, target_time)
                        offset += len(page_results)
                        logger.debug('分页查询进度', {
@@ -896,6 +912,21 @@ class QueryDatas:
 
            if return_format == 'df' and isinstance(all_results, pd.DataFrame) and not all_results.empty:
                all_results = self._convert_decimal_columns(all_results)
+           logger.info('查询完成', {
+               '库': db_name,
+               '表': table_name,
+               '总记录数': total_count,
+               '已获取记录数': len(all_results) if return_format == 'list_dict' else len(all_results.index),
+               '查询耗时': f'{time.time() - start_time:.2f}s',
+               '查询参数': {
+                   '开始日期': start_date,
+                   '结束日期': end_date,
+                   '日期字段': date_field,
+                   '限制行数': limit,
+                   '分页大小': page_size,
+                   '返回数据格式': return_format,
+               }
+           })
           return all_results
 
        except Exception as e:
@@ -903,7 +934,15 @@ class QueryDatas:
                '库': db_name,
                '表': table_name,
                '错误类型': type(e).__name__,
-               '错误信息': str(e)
+               '错误信息': str(e),
+               '查询参数': {
+                   '开始日期': start_date,
+                   '结束日期': end_date,
+                   '日期字段': date_field,
+                   '限制行数': limit,
+                   '分页大小': page_size,
+                   '返回数据格式': return_format,
+               }
            })
            return [] if return_format == 'list_dict' else pd.DataFrame()
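
data_to_df now times the whole call (start_time is captured at entry; the per-page timer was renamed _p_time so it no longer shadows it) and emits a single '查询完成' summary log with row counts, elapsed seconds, and the effective query parameters. A minimal call sketch; the keyword names follow the parameters echoed in the logging hunk above and the names are illustrative:

    df = qd.data_to_df(
        db_name='聚合数据',
        table_name='天猫_商品排行',
        start_date='2025-05-01',
        end_date='2025-05-13',
        return_format='df',   # or 'list_dict' for a list of row dicts
    )
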
mdbq/mysql/uploader.py CHANGED
@@ -539,6 +539,8 @@ class MySQLUploader:
                 is_nan = True
             elif str(value).lower() in ['nan', 'none']:
                 is_nan = True
+            elif value == '':
+                is_nan = True
             if is_nan:
                 if not allow_null:
                     if 'int' in column_type_lower:
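
The uploader change widens what counts as missing during value validation: besides None/NaN and the case-insensitive strings 'nan' and 'none', an empty string now also sets is_nan. A standalone restatement of the check, assuming it sits in the per-value validation path as the context above suggests:

    import math

    def looks_null(value) -> bool:
        # Mirrors the 4.0.11 branch order: real None/NaN first,
        # then the string forms, and now the empty string.
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return True
        if str(value).lower() in ['nan', 'none']:
            return True
        if value == '':
            return True
        return False

With allow_null=False (as in the aikucun.py calls below), such values take the type-dependent fallback branch shown in the hunk rather than being inserted as NULL.
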
mdbq/spider/aikucun.py CHANGED
@@ -25,9 +25,7 @@ config_file = os.path.join(dir_path, 'spd.txt')
 content = config.read_config(file_path=config_file)
 username, password, host, port = content['username'], content['password'], content['host'], content['port']
 
-uld = uploader.MySQLUploader(username=username, password=password, host=host, port=int(port), pool_size=10)
 # 实例化一个数据查询类,用来获取 cookies 表数据
-download = s_query.QueryDatas(username=username, password=password, host=host, port=port)
 logger = mylogger.MyLogger(
     logging_mode='file',
     log_level='info',
@@ -48,15 +46,15 @@ def keep_connect(_db_name, _config, max_try: int=10):
             connection = pymysql.connect(**_config)  # 连接数据库
             return connection
         except Exception as e:
-            logger.error(f'{_db_name}: 连接失败,正在重试: {host}:{port} {attempts}/{max_try} {e}')
+            logger.error('连接失败', {'数据库': _db_name, '主机': host, '端口': port, '重试次数': attempts, '最大重试次数': max_try, '错误信息': e})
             attempts += 1
             time.sleep(30)
-    logger.error(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
+    logger.error('连接失败', {'数据库': _db_name, '主机': host, '端口': port, '重试次数': attempts, '最大重试次数': max_try})
     return None
 
 
 class AikuCun:
-    def __init__(self):
+    def __init__(self, uld_manager, download_manager):
         self.url = 'https://gray-merc.aikucun.com/index.html'
         self.db_name = 'cookie文件'
         self.table_name = 'main_aikucun'
@@ -66,6 +64,8 @@ class AikuCun:
         self.start_date = (self.today - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
         self.end_date = (self.today - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
         self.error_count = 0
+        self.uld = uld_manager
+        self.download = download_manager
 
     def logining(self, shop_name='aikucun', headless=False):
         option = webdriver.ChromeOptions()
@@ -171,7 +171,7 @@ class AikuCun:
 
     def save_token(self):
         if not self.token:
-            print('self.token 不能为空')
+            logger.error('self.token 不能为空')
             return
         set_typ = {
             '日期': 'DATE',
@@ -182,11 +182,11 @@ class AikuCun:
             '更新时间': 'timestamp'
         }
         # 更新至数据库记录
-        uld.upload_data(
+        self.uld.upload_data(
             db_name=self.db_name,
             table_name=self.table_name,
             data=self.token,
-            set_typ={},
+            set_typ=set_typ,
             primary_keys=[],
             check_duplicate=False,
             update_on_duplicate=False,
@@ -209,7 +209,7 @@ class AikuCun:
         self.end_date = end_date
         date_list = otk.dates_between(start_date=self.start_date, end_date=self.end_date)
 
-        df = download.data_to_df(
+        df = self.download.data_to_df(
             db_name=self.db_name,
             table_name=self.table_name,
             start_date='2025-03-07',
@@ -230,7 +230,7 @@ class AikuCun:
         idx = df.groupby(['平台', '店铺名称'])['更新时间'].idxmax()
         df = df.loc[idx][['token']]
         if len(df) == 0:
-            print(f'从数据库获取的 token 不能为空')
+            logger.error(f'从数据库获取的 token 不能为空')
             return
         self.token = df.iloc[0, 0]
@@ -247,7 +247,7 @@ class AikuCun:
         results = []
         for date in date_list:
             if self.error_count > 5:
-                print('已退出请求 -> self.error_count > 5')
+                logger.error('已退出请求 -> self.error_count > 5')
                 break
             req_date = re.sub('-', '', date)
             data = {
@@ -273,16 +273,15 @@ class AikuCun:
                 # cookies=cookies,
                 data=json.dumps(data)
             )
-            print(f'正在获取数据({num}/{len(date_list)}): {item_type}榜单 {date}')
-            # print(res.json())
+            logger.info('获取数据', {'进度': num/len(date_list), '日期': date, '榜单类型': item_type})
             if not res.json().get('success', None):
-                print('没有获取到数据, 请求不成功, 如果连续请求失败 > 5, 则需重新获取cookie后继续')
+                logger.error('没有获取到数据, 请求不成功, 如果连续请求失败 > 5, 则需重新获取cookie后继续')
                 num += 1
                 self.error_count += 1
                 time.sleep(1)
                 continue
             if not res.json().get('data', {}).get('rows', None):
-                print("返回的数据字典异常, ['data']['rows'] 不能为空")
+                logger.error("返回的数据字典异常, ['data']['rows'] 不能为空")
                 num += 1
                 self.error_count += 1
                 time.sleep(1)
@@ -291,7 +290,7 @@ class AikuCun:
             num += 1
             time.sleep(1)
             if num % 32 == 0:
-                print("避免频繁请求, 正在休眠...")
+                logger.info("避免频繁请求, 正在休眠...")
                 # time.sleep(60)
 
         return results
@@ -413,18 +412,18 @@ class AikuCun:
             '尺码': 'varchar(50)',
             '货号': 'varchar(50)',  # 款号 + 颜色编码
         }
-        print(f'{self.shop_name} 正在更新数据库 {db_name} -> {table_name}...')
+        logger.info('更新数据库', {'店铺名称': self.shop_name, '库': db_name, '表': table_name})
         if 'spu' in table_name:
             drop_dup = ['日期', '平台', '店铺名称', '商品款号', '访客量']
         else:
             drop_dup = ['日期', '平台', '店铺名称', '条码']
-        uld.upload_data(
+        self.uld.upload_data(
             db_name=db_name,
             table_name=table_name,
             data=_results,
             set_typ=set_typ,  # 定义列和数据类型
             primary_keys=[],  # 创建唯一主键
-            check_duplicate=True,  # 检查重复数据
+            check_duplicate=False,  # 检查重复数据
             update_on_duplicate=False,  # 遇到重复时更新数据,默认 False 跳过
             duplicate_columns=drop_dup,  # 指定排重的组合键
             allow_null=False,  # 允许插入空值
@@ -470,36 +469,44 @@ class AikuCun:
             headers=headers,
             data=json.dumps(data)
         )
-        print(res.json())
 
 
 def main(start_date, end_date=None, item_type=['spu']):
-    ak = AikuCun()
-    # ak.get_sign()
-    for type_ in item_type:
-        if type_ not in ['spu', 'sku']:
-            print(f'{item_type} 非法参数: {type_}')
-            continue
-        for i in range(2):
-            data_list = ak.get_data_from_bbx(
-                start_date=start_date,
-                end_date=end_date,
-                item_type=type_,
-                page_num=1,
-                page_size=300
-            )
-            if not data_list:
-                ak.logining()
-                ak.save_token()
-                ak.error_count = 0  # 重置错误计数器
-            else:
-                break
+    db_config = {
+        'username': username,
+        'password': password,
+        'host': host,
+        'port': int(port),
+        'pool_size': 3
+    }
+    with uploader.MySQLUploader(**db_config) as uld:
+        with s_query.QueryDatas(**db_config) as download:
+            ak = AikuCun(uld_manager=uld, download_manager=download)
+            # ak.get_sign()
+            for type_ in item_type:
+                if type_ not in ['spu', 'sku']:
+                    logger.error(f'{item_type} 非法参数: {type_}')
+                    continue
+                for i in range(2):
+                    data_list = ak.get_data_from_bbx(
+                        start_date=start_date,
+                        end_date=end_date,
+                        item_type=type_,
+                        page_num=1,
+                        page_size=300
+                    )
+                    if not data_list:
+                        ak.logining()
+                        ak.save_token()
+                        ak.error_count = 0  # 重置错误计数器
+                    else:
+                        break
 
-        ak.insert_datas(
-            data_list=data_list,
-            db_name='爱库存2',
-            table_name=f'{type_}榜单'
-        )
+                ak.insert_datas(
+                    data_list=data_list,
+                    db_name='爱库存2',
+                    table_name=f'{type_}榜单'
+                )
 
 
 
@@ -508,7 +515,7 @@ if __name__ == '__main__':
         start_date='2025-05-13',
         # end_date='2025-04-28', # 不传则默认到今天
         item_type=[
-            # 'spu',
+            'spu',
            'sku'
        ]
    )
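
main() now owns the database handles: both MySQLUploader and QueryDatas are opened as context managers, so their pools are closed even if the scrape raises, and importing the module no longer opens connections (the old module-level uld/download globals are gone). The same pattern, extracted for reuse elsewhere; it assumes the credential variables defined at the top of this file:

    db_config = {
        'username': username,
        'password': password,
        'host': host,
        'port': int(port),
        'pool_size': 3,   # both classes accept this kwarg as of 4.0.11
    }
    with uploader.MySQLUploader(**db_config) as uld:
        with s_query.QueryDatas(**db_config) as download:
            ak = AikuCun(uld_manager=uld, download_manager=download)
            # ... fetch and insert as in main() above ...
    # both pools are closed here, on success or failure
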
mdbq-4.0.9.dist-info/METADATA → mdbq-4.0.11.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mdbq
-Version: 4.0.9
+Version: 4.0.11
 Home-page: https://pypi.org/project/mdbq
 Author: xigua,
 Author-email: 2587125111@qq.com
mdbq-4.0.9.dist-info/RECORD → mdbq-4.0.11.dist-info/RECORD CHANGED
@@ -1,18 +1,17 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=N6Y-iQbP8CjtdAYEfFoeANx5OHaGKVkL-lm9Q2bnRnY,17
+mdbq/__version__.py,sha256=PQJs_Lgx6OvamcsXbLCVuBAvLc7j2xwJDZEWigwyUy8,18
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
-mdbq/aggregation/query_data.py,sha256=SiUr-JC-L2YllQa6mTV8MaaY1LhATvHl4ApyBQdncrU,166695
+mdbq/aggregation/query_data.py,sha256=SM8cS9lBKmhLBQdwJz-sRu9bl7w1HS0MEq10s6Tqf_0,166777
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=9w_o5mYB3FooIxobq_lSa6oCYTKIhPxDFox-jeLtUHI,21714
-mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=8v3MC6TJ0YEiExWrTP9OXAxTYnL9XbpYL2vWaER1h2M,73099
+mdbq/mysql/deduplicator.py,sha256=fS1dSs92vN15tuqmAKrUVdKk6z9dwW_Fe9WHMBYsy2U,73172
 mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
-mdbq/mysql/s_query.py,sha256=FSFrFZE5yzEbnpLrN2AmlRZ_VvTvfpIWaQUjZfLIi9g,40342
+mdbq/mysql/s_query.py,sha256=RPC-KZVuqPlCSmpmtUmYAOJdxJT01i0DvlIbmum4MxM,42882
 mdbq/mysql/unique_.py,sha256=Wgqq_PjAAD757JTa10wjYaJgssZ_C_ypU6DW56jbuyw,21074
-mdbq/mysql/uploader.py,sha256=wNQE7UjCEyAKri9CnQXO7d6EVXCaYqFze2i2tcGAVpw,81001
+mdbq/mysql/uploader.py,sha256=wX2gHhVQJwGErnjUbLnsljkZ8Yd3YK-HS3P7q8DizAA,81053
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -24,8 +23,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
 mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=vpBuNc22uj9Vr-_Dh25_wpwWM1e-072EAAIBdB_IpL0,23494
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq/spider/aikucun.py,sha256=hPRzLQvFIF4ibN8aP3Dg_ru5meac90faPyzOB22cj-o,20965
-mdbq-4.0.9.dist-info/METADATA,sha256=0yI54Kb8_qdyDNnmOWFaBmlgAONodPDBAYbaMKnAF5A,363
-mdbq-4.0.9.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-mdbq-4.0.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
-mdbq-4.0.9.dist-info/RECORD,,
+mdbq/spider/aikucun.py,sha256=7oquQ2RIJr6B1xblQMfnmHzteOlvHA7dIcPRaAPfHBc,21546
+mdbq-4.0.11.dist-info/METADATA,sha256=zZh35aA-suJ3B_v39Mw8V_O2GSdOLdylfNPl_E99uqQ,364
+mdbq-4.0.11.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-4.0.11.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-4.0.11.dist-info/RECORD,,
mdbq/log/spider_logging.py DELETED
@@ -1,47 +0,0 @@
-import logging
-from logging.handlers import RotatingFileHandler
-import platform
-import os
-import sys
-import getpass
-
-
-def setup_logging(reMoveOldHandler=True, filename='spider_tg.log'):
-    """
-    reMoveOldHandler: 替换根日志记录器的所有现有处理器
-    """
-    dir_path = os.path.expanduser("~")
-    if not os.path.isdir(os.path.join(dir_path, 'logfile')):
-        os.makedirs(os.path.join(dir_path, 'logfile'))
-
-    log_file = os.path.join(dir_path, 'logfile', filename)
-    file_handler = RotatingFileHandler(
-        filename=log_file,
-        maxBytes=3*1024*1024,  # 3MB
-        backupCount=10,
-        encoding='utf-8'  # 明确指定编码(避免Windows乱码)
-    )
-    stream_handler = logging.StreamHandler()  # 终端输出Handler
-    formatter = logging.Formatter(
-        fmt='[%(asctime)s] %(levelname)s %(message)s',
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
-    file_handler.setFormatter(formatter)
-    stream_handler.setFormatter(formatter)  # 终端使用相同格式
-    file_handler.setLevel(logging.INFO)
-    stream_handler.setLevel(logging.INFO)
-
-    # 获取根日志记录器并添加Handler
-    logger = logging.getLogger()
-    if reMoveOldHandler:
-        # 移除根日志记录器的所有现有处理器
-        for handler in logger.handlers[:]:  # 使用[:]创建handlers列表的副本,因为迭代时会修改列表
-            logger.removeHandler(handler)
-    logger.addHandler(file_handler)
-    logger.addHandler(stream_handler)
-    logger.setLevel(logging.INFO)  # 设置根日志级别
-    return logger
-
-
-if __name__ == '__main__':
-    pass
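
spider_logging.py is deleted in 4.0.11 with no shim, so any import of mdbq.log.spider_logging.setup_logging will now fail; the surviving in-package logger is mdbq.log.mylogger. A minimal substitute sketch, using only the MyLogger arguments visible in the aikucun.py hunks above (other MyLogger options are not shown in this diff):

    from mdbq.log import mylogger

    logger = mylogger.MyLogger(
        logging_mode='file',
        log_level='info',
    )
    # As used throughout 4.0.11, MyLogger methods take a message
    # plus an optional dict of structured fields:
    logger.info('任务开始', {'模块': 'spider'})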