mdbq 3.11.2__py3-none-any.whl → 3.11.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/query_data.py +0 -3
- mdbq/log/mylogger.py +1 -1
- mdbq/mysql/deduplicator.py +1 -1
- mdbq/mysql/mysql.py +3 -4
- mdbq/mysql/uploader.py +193 -343
- mdbq/spider/aikucun.py +2 -20
- {mdbq-3.11.2.dist-info → mdbq-3.11.4.dist-info}/METADATA +1 -1
- {mdbq-3.11.2.dist-info → mdbq-3.11.4.dist-info}/RECORD +11 -11
- {mdbq-3.11.2.dist-info → mdbq-3.11.4.dist-info}/WHEEL +0 -0
- {mdbq-3.11.2.dist-info → mdbq-3.11.4.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.11.2'
|
1
|
+
VERSION = '3.11.4'
|
mdbq/aggregation/query_data.py
CHANGED
@@ -9,14 +9,11 @@ import pandas as pd
|
|
9
9
|
import numpy as np
|
10
10
|
from functools import wraps
|
11
11
|
import platform
|
12
|
-
import json
|
13
12
|
import os
|
14
13
|
import time
|
15
14
|
import calendar
|
16
15
|
import concurrent.futures
|
17
|
-
import traceback
|
18
16
|
import logging
|
19
|
-
import sys
|
20
17
|
|
21
18
|
"""
|
22
19
|
|
mdbq/log/mylogger.py
CHANGED
mdbq/mysql/deduplicator.py
CHANGED
mdbq/mysql/mysql.py
CHANGED
@@ -7,10 +7,9 @@ import warnings
|
|
7
7
|
import pymysql
|
8
8
|
import pandas as pd
|
9
9
|
from sqlalchemy import create_engine
|
10
|
-
import os
|
11
10
|
from mdbq.other import otk
|
12
11
|
from mdbq.log import mylogger
|
13
|
-
import
|
12
|
+
import math
|
14
13
|
|
15
14
|
warnings.filterwarnings('ignore')
|
16
15
|
"""
|
@@ -131,7 +130,7 @@ class MysqlUpload:
|
|
131
130
|
new_dict_data: dict = {}
|
132
131
|
for k, v in dict_data.items():
|
133
132
|
k = str(k).lower()
|
134
|
-
k = re.sub(r'[()\-,,$&~^、 ()\"\'
|
133
|
+
k = re.sub(r'[()\-,,$&~^、 ()\"\'"="·/。》《><!!`]', '_', k, re.IGNORECASE)
|
135
134
|
k = k.replace(')', '')
|
136
135
|
k = re.sub(r'_{2,}', '_', k)
|
137
136
|
k = re.sub(r'_+$', '', k)
|
@@ -526,7 +525,7 @@ class MysqlUpload:
|
|
526
525
|
new_dict_data = {}
|
527
526
|
for k, v in dict_data.items():
|
528
527
|
k = str(k).lower()
|
529
|
-
k = re.sub(r'[()\-,,$&~^、 ()\"\'
|
528
|
+
k = re.sub(r'[()\-,,$&~^、 ()\"\'"="·/。》《><!!`]', '_', k, re.IGNORECASE)
|
530
529
|
k = k.replace(')', '')
|
531
530
|
k = re.sub(r'_{2,}', '_', k)
|
532
531
|
k = re.sub(r'_+$', '', k)
|
mdbq/mysql/uploader.py
CHANGED
@@ -10,8 +10,8 @@ from mdbq.log import mylogger
|
|
10
10
|
from typing import Union, List, Dict, Optional, Any, Tuple, Set
|
11
11
|
from dbutils.pooled_db import PooledDB
|
12
12
|
import json
|
13
|
-
from collections import OrderedDict
|
14
13
|
import sys
|
14
|
+
from decimal import Decimal, InvalidOperation
|
15
15
|
|
16
16
|
warnings.filterwarnings('ignore')
|
17
17
|
logger = mylogger.MyLogger(
|
@@ -28,62 +28,44 @@ logger = mylogger.MyLogger(
|
|
28
28
|
)
|
29
29
|
|
30
30
|
|
31
|
-
def count_decimal_places(num_str):
|
31
|
+
def count_decimal_places(num_str: str) -> Tuple[int, int]:
|
32
32
|
"""
|
33
|
-
|
34
|
-
|
35
|
-
:param num_str: 数字字符串
|
36
|
-
:return: 返回元组(整数位数, 小数位数)
|
37
|
-
:raises: 无显式抛出异常,但正则匹配失败时返回(0, 0)
|
38
|
-
"""
|
39
|
-
match = re.match(r'^[-+]?\d+(\.\d+)?([eE][-+]?\d+)?$', str(num_str))
|
40
|
-
if match:
|
41
|
-
# 如果是科学计数法
|
42
|
-
match = re.findall(r'(\d+)\.(\d+)[eE][-+]?(\d+)$', str(num_str))
|
43
|
-
if match:
|
44
|
-
if len(match[0]) == 3:
|
45
|
-
if int(match[0][2]) < len(match[0][1]):
|
46
|
-
# count_int 清除整数部分开头的 0 并计算整数位数
|
47
|
-
count_int = len(re.sub('^0+', '', str(match[0][0]))) + int(match[0][2])
|
48
|
-
# 计算小数位数
|
49
|
-
count_float = len(match[0][1]) - int(match[0][2])
|
50
|
-
return count_int, count_float
|
51
|
-
# 如果是普通小数
|
52
|
-
match = re.findall(r'(\d+)\.(\d+)$', str(num_str))
|
53
|
-
if match:
|
54
|
-
count_int = len(re.sub('^0+', '', str(match[0][0])))
|
55
|
-
count_float = len(match[0][1])
|
56
|
-
return count_int, count_float # 计算小数位数
|
57
|
-
return 0, 0
|
58
|
-
|
59
|
-
|
60
|
-
class StatementCache(OrderedDict):
|
61
|
-
"""
|
62
|
-
基于OrderedDict实现的LRU缓存策略,用于缓存SQL语句
|
63
|
-
|
64
|
-
这个类继承自OrderedDict,实现了最近最少使用(LRU)的缓存策略。
|
65
|
-
当缓存达到最大容量时,会自动删除最早添加的项。
|
33
|
+
统计小数点前后位数,支持科学计数法。
|
34
|
+
返回:(整数位数, 小数位数)
|
66
35
|
"""
|
36
|
+
try:
|
37
|
+
d = Decimal(str(num_str))
|
38
|
+
sign, digits, exponent = d.as_tuple()
|
39
|
+
int_part = len(digits) + exponent if exponent < 0 else len(digits)
|
40
|
+
dec_part = -exponent if exponent < 0 else 0
|
41
|
+
return max(int_part, 0), max(dec_part, 0)
|
42
|
+
except (InvalidOperation, ValueError, TypeError):
|
43
|
+
return (0, 0)
|
44
|
+
|
45
|
+
|
46
|
+
class StatementCache(dict):
|
47
|
+
"""简单LRU缓存实现,用于SQL语句缓存"""
|
67
48
|
def __init__(self, maxsize=100):
|
68
|
-
"""
|
69
|
-
初始化缓存
|
70
|
-
|
71
|
-
:param maxsize: 最大缓存大小,默认为100条SQL语句
|
72
|
-
"""
|
73
49
|
super().__init__()
|
74
|
-
self.
|
75
|
-
|
50
|
+
self._maxsize = maxsize
|
51
|
+
self._order = []
|
52
|
+
def __getitem__(self, key):
|
53
|
+
value = super().__getitem__(key)
|
54
|
+
self._order.remove(key)
|
55
|
+
self._order.append(key)
|
56
|
+
return value
|
76
57
|
def __setitem__(self, key, value):
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
"""
|
58
|
+
if key in self:
|
59
|
+
self._order.remove(key)
|
60
|
+
elif len(self._order) >= self._maxsize:
|
61
|
+
oldest = self._order.pop(0)
|
62
|
+
super().__delitem__(oldest)
|
83
63
|
super().__setitem__(key, value)
|
84
|
-
|
85
|
-
|
86
|
-
|
64
|
+
self._order.append(key)
|
65
|
+
def get(self, key, default=None):
|
66
|
+
if key in self:
|
67
|
+
return self[key]
|
68
|
+
return default
|
87
69
|
|
88
70
|
class MySQLUploader:
|
89
71
|
"""
|
@@ -194,22 +176,22 @@ class MySQLUploader:
|
|
194
176
|
logger.error('连接池创建失败', {'error': str(e), 'host': self.host, 'port': self.port})
|
195
177
|
raise ConnectionError(f'连接池创建失败: {str(e)}')
|
196
178
|
|
197
|
-
|
179
|
+
@staticmethod
|
180
|
+
def _execute_with_retry(func):
|
198
181
|
"""
|
199
182
|
带重试机制的装饰器,用于数据库操作
|
200
|
-
|
201
183
|
:param func: 被装饰的函数
|
202
184
|
:return: 装饰后的函数
|
203
185
|
:raises: 可能抛出原始异常或最后一次重试的异常
|
204
186
|
"""
|
205
187
|
@wraps(func)
|
206
|
-
def wrapper(*args, **kwargs):
|
188
|
+
def wrapper(self, *args, **kwargs):
|
207
189
|
last_exception = None
|
208
190
|
operation = func.__name__
|
209
191
|
logger.debug(f'开始执行操作: {operation}', {'max_retries': self.max_retries})
|
210
192
|
for attempt in range(self.max_retries):
|
211
193
|
try:
|
212
|
-
result = func(*args, **kwargs)
|
194
|
+
result = func(self, *args, **kwargs)
|
213
195
|
if attempt > 0:
|
214
196
|
logger.info('操作成功(重试后)', {'operation': operation, 'attempts': attempt + 1})
|
215
197
|
else:
|
@@ -255,6 +237,7 @@ class MySQLUploader:
|
|
255
237
|
raise last_exception if last_exception else Exception('发生未知错误')
|
256
238
|
return wrapper
|
257
239
|
|
240
|
+
@_execute_with_retry
|
258
241
|
def _get_connection(self) -> pymysql.connections.Connection:
|
259
242
|
"""
|
260
243
|
从连接池获取数据库连接
|
@@ -270,6 +253,7 @@ class MySQLUploader:
|
|
270
253
|
logger.error('获取数据库连接失败', {'error': str(e)})
|
271
254
|
raise ConnectionError(f'连接数据库失败: {str(e)}')
|
272
255
|
|
256
|
+
@_execute_with_retry
|
273
257
|
def _check_database_exists(self, db_name: str) -> bool:
|
274
258
|
"""
|
275
259
|
检查数据库是否存在
|
@@ -280,6 +264,7 @@ class MySQLUploader:
|
|
280
264
|
"""
|
281
265
|
db_name = self._validate_identifier(db_name)
|
282
266
|
sql = 'SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s'
|
267
|
+
conn = None
|
283
268
|
try:
|
284
269
|
with self._get_connection() as conn:
|
285
270
|
with conn.cursor() as cursor:
|
@@ -291,6 +276,7 @@ class MySQLUploader:
|
|
291
276
|
logger.error('检查数据库是否存在时出错', {'库': db_name, '错误': str(e)})
|
292
277
|
raise
|
293
278
|
|
279
|
+
@_execute_with_retry
|
294
280
|
def _create_database(self, db_name: str) -> None:
|
295
281
|
"""
|
296
282
|
创建数据库
|
@@ -300,6 +286,7 @@ class MySQLUploader:
|
|
300
286
|
"""
|
301
287
|
db_name = self._validate_identifier(db_name)
|
302
288
|
sql = f'CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}'
|
289
|
+
conn = None
|
303
290
|
try:
|
304
291
|
with self._get_connection() as conn:
|
305
292
|
with conn.cursor() as cursor:
|
@@ -308,7 +295,8 @@ class MySQLUploader:
|
|
308
295
|
logger.info('数据库已创建', {'库': db_name})
|
309
296
|
except Exception as e:
|
310
297
|
logger.error('无法创建数据库', {'库': db_name, '错误': str(e)})
|
311
|
-
conn
|
298
|
+
if conn is not None:
|
299
|
+
conn.rollback()
|
312
300
|
raise
|
313
301
|
|
314
302
|
def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
|
@@ -317,12 +305,12 @@ class MySQLUploader:
|
|
317
305
|
|
318
306
|
:param table_name: 基础表名
|
319
307
|
:param date_value: 日期值
|
320
|
-
:param partition_by: 分表方式 ('year' 或 'month')
|
308
|
+
:param partition_by: 分表方式 ('year' 或 'month' 或 'none')
|
321
309
|
:return: 分表名称
|
322
310
|
:raises ValueError: 如果日期格式无效或分表方式无效
|
323
311
|
"""
|
324
312
|
try:
|
325
|
-
date_obj = self._validate_datetime(date_value, True)
|
313
|
+
date_obj = self._validate_datetime(value=date_value, date_type=True, no_log=False)
|
326
314
|
except ValueError:
|
327
315
|
logger.error('无效的日期格式', {'表': table_name, '日期值': date_value})
|
328
316
|
raise ValueError(f"`{table_name}` 无效的日期格式: `{date_value}`")
|
@@ -345,10 +333,8 @@ class MySQLUploader:
|
|
345
333
|
if not identifier or not isinstance(identifier, str):
|
346
334
|
logger.error('无效的标识符', {'标识符': identifier})
|
347
335
|
raise ValueError(f"无效的标识符: `{identifier}`")
|
348
|
-
|
349
|
-
|
350
|
-
else:
|
351
|
-
cleaned = identifier
|
336
|
+
# 始终做特殊字符清理
|
337
|
+
cleaned = re.sub(r'[^-\uFFFF\w\u4e00-\u9fff$]', '_', identifier)
|
352
338
|
cleaned = re.sub(r'_+', '_', cleaned).strip('_')
|
353
339
|
if not cleaned:
|
354
340
|
logger.error('无法清理异常标识符', {'原始标识符': identifier})
|
@@ -362,6 +348,7 @@ class MySQLUploader:
|
|
362
348
|
return f"`{cleaned}`"
|
363
349
|
return cleaned
|
364
350
|
|
351
|
+
@_execute_with_retry
|
365
352
|
def _check_table_exists(self, db_name: str, table_name: str) -> bool:
|
366
353
|
"""
|
367
354
|
检查表是否存在
|
@@ -396,6 +383,7 @@ class MySQLUploader:
|
|
396
383
|
logger.debug('表存在检查', {'库': db_name, '表': table_name, '存在': result})
|
397
384
|
return result
|
398
385
|
|
386
|
+
@_execute_with_retry
|
399
387
|
def _create_table(
|
400
388
|
self,
|
401
389
|
db_name: str,
|
@@ -407,16 +395,7 @@ class MySQLUploader:
|
|
407
395
|
allow_null: bool = False
|
408
396
|
) -> None:
|
409
397
|
"""
|
410
|
-
|
411
|
-
|
412
|
-
:param db_name: 数据库名
|
413
|
-
:param table_name: 表名
|
414
|
-
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
415
|
-
:param primary_keys: 主键列列表,可选
|
416
|
-
:param date_column: 日期列名,可选,如果存在将设置为索引
|
417
|
-
:param indexes: 需要创建索引的列列表,可选
|
418
|
-
:param allow_null: 是否允许空值,默认为False
|
419
|
-
:raises: 可能抛出数据库相关异常
|
398
|
+
创建数据表,优化索引创建方式
|
420
399
|
"""
|
421
400
|
db_name = self._validate_identifier(db_name)
|
422
401
|
table_name = self._validate_identifier(table_name)
|
@@ -439,48 +418,44 @@ class MySQLUploader:
|
|
439
418
|
primary_keys = ['id']
|
440
419
|
safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
|
441
420
|
primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
|
421
|
+
# 索引统一在CREATE TABLE中定义
|
422
|
+
index_defs = []
|
423
|
+
if date_column and date_column in set_typ:
|
424
|
+
safe_date_col = self._validate_identifier(date_column)
|
425
|
+
index_defs.append(f"INDEX `idx_{safe_date_col}` (`{safe_date_col}`)")
|
426
|
+
if indexes:
|
427
|
+
for idx_col in indexes:
|
428
|
+
if idx_col in set_typ:
|
429
|
+
safe_idx_col = self._validate_identifier(idx_col)
|
430
|
+
index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
|
431
|
+
index_sql = (',' + ','.join(index_defs)) if index_defs else ''
|
442
432
|
sql = f"""
|
443
433
|
CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
|
444
434
|
{','.join(column_defs)}
|
445
435
|
{primary_key_sql}
|
436
|
+
{index_sql}
|
446
437
|
) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
|
447
438
|
"""
|
439
|
+
conn = None
|
448
440
|
try:
|
449
441
|
with self._get_connection() as conn:
|
450
442
|
with conn.cursor() as cursor:
|
451
443
|
cursor.execute(sql)
|
452
|
-
logger.info('数据表已创建', {'库': db_name, '表': table_name})
|
453
|
-
index_statements = []
|
454
|
-
if date_column and date_column in set_typ:
|
455
|
-
safe_date_col = self._validate_identifier(date_column)
|
456
|
-
index_statements.append(
|
457
|
-
f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
|
458
|
-
)
|
459
|
-
if indexes:
|
460
|
-
for idx_col in indexes:
|
461
|
-
if idx_col in set_typ:
|
462
|
-
safe_idx_col = self._validate_identifier(idx_col)
|
463
|
-
index_statements.append(
|
464
|
-
f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
|
465
|
-
)
|
466
|
-
if index_statements:
|
467
|
-
with conn.cursor() as cursor:
|
468
|
-
for stmt in index_statements:
|
469
|
-
cursor.execute(stmt)
|
470
|
-
logger.debug('执行索引语句', {'SQL': stmt})
|
471
444
|
conn.commit()
|
472
|
-
logger.info('
|
445
|
+
logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes})
|
473
446
|
except Exception as e:
|
474
447
|
logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
475
|
-
conn
|
448
|
+
if conn is not None:
|
449
|
+
conn.rollback()
|
476
450
|
raise
|
477
451
|
|
478
|
-
def _validate_datetime(self, value: str, date_type: bool = False) -> Any:
|
452
|
+
def _validate_datetime(self, value: str, date_type: bool = False, no_log: bool = False) -> Any:
|
479
453
|
"""
|
480
454
|
验证并标准化日期时间格式
|
481
455
|
|
482
456
|
:param value: 日期时间值
|
483
457
|
:param date_type: 是否返回日期类型(True)或字符串(False)
|
458
|
+
:param no_log: 记录日志,默认为False
|
484
459
|
:return: 标准化后的日期时间字符串或日期对象
|
485
460
|
:raises ValueError: 当日期格式无效时抛出
|
486
461
|
"""
|
@@ -508,22 +483,28 @@ class MySQLUploader:
|
|
508
483
|
return result
|
509
484
|
except ValueError:
|
510
485
|
continue
|
511
|
-
|
486
|
+
if not no_log:
|
487
|
+
logger.error('无效的日期格式', {'值': value})
|
512
488
|
raise ValueError(f"无效的日期格式: `{value}`")
|
513
489
|
|
514
|
-
def _validate_value(self, value: Any, column_type: str, allow_null: bool) -> Any:
|
490
|
+
def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
|
515
491
|
"""
|
516
492
|
根据列类型验证并转换数据值
|
517
493
|
|
518
494
|
:param value: 要验证的值
|
519
495
|
:param column_type: 列的数据类型
|
520
496
|
:param allow_null: 是否允许空值
|
497
|
+
:param db_name: 数据库名(用于日志)
|
498
|
+
:param table_name: 表名(用于日志)
|
499
|
+
:param col_name: 列名(用于日志)
|
521
500
|
:return: 转换后的值
|
522
501
|
:raises ValueError: 当值转换失败时抛出
|
523
502
|
"""
|
524
503
|
if value is None:
|
525
504
|
if not allow_null:
|
526
|
-
logger.warning('字段值为None
|
505
|
+
logger.warning('字段值为None但不允许空值, 已填充为none', {
|
506
|
+
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
507
|
+
})
|
527
508
|
return 'none'
|
528
509
|
return None
|
529
510
|
try:
|
@@ -536,14 +517,18 @@ class MySQLUploader:
|
|
536
517
|
logger.debug('百分比字符串转小数', {'原始': value, '结果': decimal_value})
|
537
518
|
return decimal_value
|
538
519
|
except ValueError:
|
539
|
-
logger.warning('百分比字符串转小数失败', {
|
520
|
+
logger.warning('百分比字符串转小数失败', {
|
521
|
+
'库': db_name, '表': table_name, '列': col_name, '原始': value
|
522
|
+
})
|
540
523
|
elif 'int' in column_type_lower:
|
541
524
|
if isinstance(value, str):
|
542
525
|
value = value.replace(',', '').strip()
|
543
526
|
try:
|
544
527
|
return int(float(value))
|
545
528
|
except ValueError:
|
546
|
-
logger.error('字符串转整数失败', {
|
529
|
+
logger.error('字符串转整数失败', {
|
530
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value
|
531
|
+
})
|
547
532
|
raise ValueError(f"`{value}` -> 无法转为整数")
|
548
533
|
return int(value) if value is not None else None
|
549
534
|
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
@@ -555,9 +540,11 @@ class MySQLUploader:
|
|
555
540
|
return value.strftime('%Y-%m-%d %H:%M:%S')
|
556
541
|
elif isinstance(value, str):
|
557
542
|
try:
|
558
|
-
return self._validate_datetime(value)
|
543
|
+
return self._validate_datetime(value=value, date_type=False, no_log=False)
|
559
544
|
except ValueError as e:
|
560
|
-
logger.error('无效日期格式', {
|
545
|
+
logger.error('无效日期格式', {
|
546
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value, '错误': str(e)
|
547
|
+
})
|
561
548
|
raise ValueError(f"无效日期格式: `{value}` -> {str(e)}")
|
562
549
|
return str(value)
|
563
550
|
elif 'char' in column_type_lower or 'text' in column_type_lower:
|
@@ -569,9 +556,12 @@ class MySQLUploader:
|
|
569
556
|
else:
|
570
557
|
return value
|
571
558
|
except (ValueError, TypeError) as e:
|
572
|
-
logger.error('数据类型转换异常', {
|
559
|
+
logger.error('数据类型转换异常', {
|
560
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value, '目标类型': column_type, '错误': str(e)
|
561
|
+
})
|
573
562
|
raise ValueError(f"转换异常 -> 无法将 `{value}` 的数据类型转为: `{column_type}` -> {str(e)}")
|
574
563
|
|
564
|
+
@_execute_with_retry
|
575
565
|
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
|
576
566
|
"""
|
577
567
|
获取表的列名和数据类型
|
@@ -665,11 +655,12 @@ class MySQLUploader:
|
|
665
655
|
transaction_mode=transaction_mode
|
666
656
|
)
|
667
657
|
|
668
|
-
def _infer_data_type(self, value: Any) -> str:
|
658
|
+
def _infer_data_type(self, value: Any, no_log: bool = False) -> str:
|
669
659
|
"""
|
670
660
|
根据值推断合适的MySQL数据类型
|
671
661
|
|
672
662
|
:param value: 要推断的值
|
663
|
+
:param no_log: 记录日志,默认为False
|
673
664
|
:return: MySQL数据类型字符串
|
674
665
|
"""
|
675
666
|
if value is None or str(value).lower() in ['', 'none', 'nan']:
|
@@ -707,7 +698,7 @@ class MySQLUploader:
|
|
707
698
|
elif isinstance(value, str):
|
708
699
|
# 尝试判断是否是日期时间
|
709
700
|
try:
|
710
|
-
self._validate_datetime(value)
|
701
|
+
self._validate_datetime(value=value, date_type=False, no_log=no_log)
|
711
702
|
return 'DATETIME'
|
712
703
|
except ValueError:
|
713
704
|
pass
|
@@ -748,7 +739,9 @@ class MySQLUploader:
|
|
748
739
|
self,
|
749
740
|
data: Union[Dict, List[Dict], pd.DataFrame],
|
750
741
|
set_typ: Dict[str, str],
|
751
|
-
allow_null: bool = False
|
742
|
+
allow_null: bool = False,
|
743
|
+
db_name: str = None,
|
744
|
+
table_name: str = None,
|
752
745
|
) -> Tuple[List[Dict], Dict[str, str]]:
|
753
746
|
"""
|
754
747
|
准备要上传的数据,验证并转换数据类型
|
@@ -805,15 +798,17 @@ class MySQLUploader:
|
|
805
798
|
if col in set_typ:
|
806
799
|
filtered_set_typ[col] = set_typ[col]
|
807
800
|
else:
|
808
|
-
# 如果列不在set_typ
|
809
|
-
sample_values = [row[col] for row in data if col in row and row[col] is not None][:
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
801
|
+
# 如果列不在set_typ中,采样多个非None值推断类型
|
802
|
+
sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
|
803
|
+
inferred_type = None
|
804
|
+
for val in sample_values:
|
805
|
+
inferred_type = self._infer_data_type(val, no_log=True) # 推断日期类型不记录日志, 避免日志噪音过多
|
806
|
+
if inferred_type:
|
807
|
+
break
|
808
|
+
if not inferred_type:
|
809
|
+
inferred_type = 'VARCHAR(255)'
|
810
|
+
filtered_set_typ[col] = inferred_type
|
811
|
+
logger.debug(f"自动推断列 `{col}` 的数据类型为: `{inferred_type}`")
|
817
812
|
|
818
813
|
prepared_data = []
|
819
814
|
for row_idx, row in enumerate(data, 1):
|
@@ -830,7 +825,7 @@ class MySQLUploader:
|
|
830
825
|
prepared_row[col_name] = None
|
831
826
|
else:
|
832
827
|
try:
|
833
|
-
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null)
|
828
|
+
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
|
834
829
|
except ValueError as e:
|
835
830
|
logger.error('数据验证失败', {
|
836
831
|
'列': col_name,
|
@@ -862,7 +857,7 @@ class MySQLUploader:
|
|
862
857
|
transaction_mode: str = "batch"
|
863
858
|
):
|
864
859
|
"""
|
865
|
-
|
860
|
+
上传数据到数据库的主入口方法,分表逻辑异常处理统计丢弃数据
|
866
861
|
|
867
862
|
:param db_name: 数据库名
|
868
863
|
:param table_name: 表名
|
@@ -872,7 +867,7 @@ class MySQLUploader:
|
|
872
867
|
:param check_duplicate: 是否检查重复数据,默认为False
|
873
868
|
:param duplicate_columns: 用于检查重复的列,可选
|
874
869
|
:param allow_null: 是否允许空值,默认为False
|
875
|
-
:param partition_by: 分表方式('year'
|
870
|
+
:param partition_by: 分表方式('year'、'month'、'None'),可选
|
876
871
|
:param partition_date_column: 用于分表的日期列名,默认为'日期'
|
877
872
|
:param auto_create: 表不存在时是否自动创建,默认为True
|
878
873
|
:param indexes: 需要创建索引的列列表,可选
|
@@ -888,6 +883,7 @@ class MySQLUploader:
|
|
888
883
|
|
889
884
|
batch_id = f"batch_{int(time.time() * 1000)}"
|
890
885
|
success_flag = False
|
886
|
+
dropped_rows = 0
|
891
887
|
|
892
888
|
logger.info("开始上传", {
|
893
889
|
'库': db_name,
|
@@ -924,7 +920,7 @@ class MySQLUploader:
|
|
924
920
|
raise ValueError("分表方式必须是 'year' 或 'month' 或 'None'")
|
925
921
|
|
926
922
|
# 准备数据
|
927
|
-
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null)
|
923
|
+
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null, db_name, table_name)
|
928
924
|
|
929
925
|
# 检查数据库是否存在
|
930
926
|
if not self._check_database_exists(db_name):
|
@@ -951,8 +947,8 @@ class MySQLUploader:
|
|
951
947
|
'row': self._shorten_for_log(row),
|
952
948
|
'func': sys._getframe().f_code.co_name,
|
953
949
|
})
|
954
|
-
|
955
|
-
|
950
|
+
dropped_rows += 1
|
951
|
+
continue
|
956
952
|
part_table = self._get_partition_table_name(
|
957
953
|
table_name,
|
958
954
|
str(row[partition_date_column]),
|
@@ -969,7 +965,8 @@ class MySQLUploader:
|
|
969
965
|
'error': str(e),
|
970
966
|
'func': sys._getframe().f_code.co_name,
|
971
967
|
})
|
972
|
-
|
968
|
+
dropped_rows += 1
|
969
|
+
continue
|
973
970
|
|
974
971
|
# 对每个分表执行上传
|
975
972
|
for part_table, part_data in partitioned_data.items():
|
@@ -1016,10 +1013,11 @@ class MySQLUploader:
|
|
1016
1013
|
'表': table_name,
|
1017
1014
|
'批次': batch_id,
|
1018
1015
|
'finish': success_flag,
|
1019
|
-
|
1020
|
-
'
|
1016
|
+
'数据行': initial_row_count,
|
1017
|
+
'丢弃行数': dropped_rows
|
1021
1018
|
})
|
1022
1019
|
|
1020
|
+
@_execute_with_retry
|
1023
1021
|
def _insert_data(
|
1024
1022
|
self,
|
1025
1023
|
db_name: str,
|
@@ -1184,34 +1182,27 @@ class MySQLUploader:
|
|
1184
1182
|
update_on_duplicate: bool
|
1185
1183
|
) -> str:
|
1186
1184
|
"""
|
1187
|
-
准备插入SQL
|
1188
|
-
|
1189
|
-
1. 当 check_duplicate=False 时,忽略 duplicate_columns 和 update_on_duplicate 参数,直接插入全部data。
|
1190
|
-
2. 当 check_duplicate=False 且 update_on_duplicate=True 时,由于 check_duplicate=False,直接插入全部data。
|
1191
|
-
3. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=True 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1192
|
-
4. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=False 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1193
|
-
5. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=True 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1194
|
-
6. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=False 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1195
|
-
|
1185
|
+
准备插入SQL语句, 增加StatementCache缓存
|
1196
1186
|
"""
|
1187
|
+
cache_key = (db_name, table_name, tuple(sorted(set_typ.items())), check_duplicate, tuple(duplicate_columns) if duplicate_columns else (), update_on_duplicate)
|
1188
|
+
cached = self._prepared_statements.get(cache_key)
|
1189
|
+
if cached:
|
1190
|
+
return cached
|
1197
1191
|
# 获取所有列名(排除id)
|
1198
1192
|
all_columns = [col for col in set_typ.keys()
|
1199
1193
|
if col.lower() != 'id']
|
1200
|
-
|
1201
|
-
# 情况1-2:不检查重复
|
1202
1194
|
if not check_duplicate:
|
1203
|
-
|
1195
|
+
sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
|
1204
1196
|
update_on_duplicate)
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1211
|
-
|
1212
|
-
# 情况3-6:检查重复
|
1213
|
-
return self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
1197
|
+
else:
|
1198
|
+
dup_cols = duplicate_columns if duplicate_columns else [
|
1199
|
+
col for col in all_columns
|
1200
|
+
if col.lower() not in self.base_excute_col
|
1201
|
+
]
|
1202
|
+
sql = self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
1214
1203
|
dup_cols, update_on_duplicate, set_typ)
|
1204
|
+
self._prepared_statements[cache_key] = sql
|
1205
|
+
return sql
|
1215
1206
|
|
1216
1207
|
def _execute_batch_insert(
|
1217
1208
|
self,
|
@@ -1226,10 +1217,8 @@ class MySQLUploader:
|
|
1226
1217
|
transaction_mode: str,
|
1227
1218
|
update_on_duplicate: bool = False
|
1228
1219
|
) -> Tuple[int, int, int]:
|
1229
|
-
"""
|
1230
|
-
|
1220
|
+
"""执行批量插入操作,优化batch和hybrid模式"""
|
1231
1221
|
def get_optimal_batch_size(total_rows: int) -> int:
|
1232
|
-
# 根据数据量调整批量大小
|
1233
1222
|
if total_rows <= 100:
|
1234
1223
|
return total_rows
|
1235
1224
|
elif total_rows <= 1000:
|
@@ -1238,205 +1227,64 @@ class MySQLUploader:
|
|
1238
1227
|
return 1000
|
1239
1228
|
else:
|
1240
1229
|
return 2000
|
1241
|
-
|
1242
1230
|
batch_size = get_optimal_batch_size(len(data))
|
1243
|
-
|
1244
|
-
# 获取所有列名(排除id列)
|
1245
|
-
all_columns = [col for col in set_typ.keys()
|
1246
|
-
if col.lower() != 'id']
|
1247
|
-
|
1231
|
+
all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
|
1248
1232
|
total_inserted = 0
|
1249
1233
|
total_skipped = 0
|
1250
1234
|
total_failed = 0
|
1251
|
-
|
1252
1235
|
with self._get_connection() as conn:
|
1253
1236
|
with conn.cursor() as cursor:
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1237
|
+
if transaction_mode == 'batch':
|
1238
|
+
for i in range(0, len(data), batch_size):
|
1239
|
+
batch = data[i:i + batch_size]
|
1240
|
+
values_list = []
|
1241
|
+
for row in batch:
|
1242
|
+
values = [row.get(col) for col in all_columns]
|
1243
|
+
if check_duplicate and not update_on_duplicate:
|
1244
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1245
|
+
values += [row.get(col) for col in dup_cols]
|
1246
|
+
values_list.append(values)
|
1247
|
+
try:
|
1248
|
+
cursor.executemany(sql, values_list)
|
1249
|
+
conn.commit()
|
1250
|
+
total_inserted += len(batch)
|
1251
|
+
except Exception as e:
|
1252
|
+
conn.rollback()
|
1253
|
+
total_failed += len(batch)
|
1254
|
+
logger.error('批量插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1255
|
+
elif transaction_mode == 'hybrid':
|
1256
|
+
hybrid_n = 100 # 可配置
|
1257
|
+
for i in range(0, len(data), hybrid_n):
|
1258
|
+
batch = data[i:i + hybrid_n]
|
1259
|
+
for row in batch:
|
1260
|
+
try:
|
1261
|
+
values = [row.get(col) for col in all_columns]
|
1262
|
+
if check_duplicate and not update_on_duplicate:
|
1263
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1264
|
+
values += [row.get(col) for col in dup_cols]
|
1265
|
+
cursor.execute(sql, values)
|
1266
|
+
total_inserted += 1
|
1267
|
+
except Exception as e:
|
1268
|
+
conn.rollback()
|
1269
|
+
total_failed += 1
|
1270
|
+
logger.error('hybrid单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1271
|
+
conn.commit()
|
1272
|
+
else: # row模式
|
1273
|
+
for row in data:
|
1274
|
+
try:
|
1275
|
+
values = [row.get(col) for col in all_columns]
|
1276
|
+
if check_duplicate and not update_on_duplicate:
|
1277
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1278
|
+
values += [row.get(col) for col in dup_cols]
|
1279
|
+
cursor.execute(sql, values)
|
1280
|
+
conn.commit()
|
1281
|
+
total_inserted += 1
|
1282
|
+
except Exception as e:
|
1283
|
+
conn.rollback()
|
1284
|
+
total_failed += 1
|
1285
|
+
logger.error('单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1267
1286
|
return total_inserted, total_skipped, total_failed
|
1268
1287
|
|
1269
|
-
def _process_batch(
|
1270
|
-
self,
|
1271
|
-
conn,
|
1272
|
-
cursor,
|
1273
|
-
db_name: str,
|
1274
|
-
table_name: str,
|
1275
|
-
batch: List[Dict],
|
1276
|
-
all_columns: List[str],
|
1277
|
-
sql: str,
|
1278
|
-
check_duplicate: bool,
|
1279
|
-
duplicate_columns: Optional[List[str]],
|
1280
|
-
batch_id: Optional[str],
|
1281
|
-
transaction_mode: str,
|
1282
|
-
update_on_duplicate: bool = False
|
1283
|
-
) -> Tuple[int, int, int]:
|
1284
|
-
"""
|
1285
|
-
处理单个批次的数据插入
|
1286
|
-
|
1287
|
-
:param conn: 数据库连接对象
|
1288
|
-
:param cursor: 数据库游标对象
|
1289
|
-
:param db_name: 数据库名
|
1290
|
-
:param table_name: 表名
|
1291
|
-
:param batch: 当前批次的数据(字典列表)
|
1292
|
-
:param all_columns: 需要插入的所有列名
|
1293
|
-
:param sql: 执行的SQL语句
|
1294
|
-
:param check_duplicate: 是否检查重复
|
1295
|
-
:param duplicate_columns: 排重列
|
1296
|
-
:param batch_id: 批次ID
|
1297
|
-
:param transaction_mode: 事务模式
|
1298
|
-
:param update_on_duplicate: 遇到重复时是否更新
|
1299
|
-
:return: (插入数, 跳过数, 失败数)
|
1300
|
-
"""
|
1301
|
-
batch_inserted = 0
|
1302
|
-
batch_skipped = 0
|
1303
|
-
batch_failed = 0
|
1304
|
-
batch_size = len(batch)
|
1305
|
-
logger.debug('批次插入开始', {
|
1306
|
-
'库': db_name,
|
1307
|
-
'表': table_name,
|
1308
|
-
'批次ID': batch_id,
|
1309
|
-
'批次大小': batch_size,
|
1310
|
-
'事务模式': transaction_mode,
|
1311
|
-
'SQL预览': sql[:200],
|
1312
|
-
'排重': check_duplicate,
|
1313
|
-
'排重列': duplicate_columns,
|
1314
|
-
'允许更新': update_on_duplicate,
|
1315
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1316
|
-
})
|
1317
|
-
if transaction_mode == 'batch':
|
1318
|
-
try:
|
1319
|
-
for row_idx, row in enumerate(batch, 1):
|
1320
|
-
result = self._process_single_row(
|
1321
|
-
db_name, table_name, cursor, row, all_columns, sql,
|
1322
|
-
check_duplicate, duplicate_columns, update_on_duplicate
|
1323
|
-
)
|
1324
|
-
if result == 'inserted':
|
1325
|
-
batch_inserted += 1
|
1326
|
-
elif result == 'skipped':
|
1327
|
-
batch_skipped += 1
|
1328
|
-
else:
|
1329
|
-
batch_failed += 1
|
1330
|
-
conn.commit()
|
1331
|
-
logger.debug('批次插入成功', {
|
1332
|
-
'库': db_name,
|
1333
|
-
'表': table_name,
|
1334
|
-
'批次ID': batch_id,
|
1335
|
-
'插入': batch_inserted,
|
1336
|
-
'跳过': batch_skipped,
|
1337
|
-
'失败': batch_failed
|
1338
|
-
})
|
1339
|
-
except Exception as e:
|
1340
|
-
conn.rollback()
|
1341
|
-
batch_failed += len(batch)
|
1342
|
-
logger.error('批次插入失败', {
|
1343
|
-
'库': db_name,
|
1344
|
-
'表': table_name,
|
1345
|
-
'批次ID': batch_id,
|
1346
|
-
'错误': str(e),
|
1347
|
-
'SQL预览': sql[:200],
|
1348
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1349
|
-
})
|
1350
|
-
else: # row 或 hybrid 模式
|
1351
|
-
for row_idx, row in enumerate(batch, 1):
|
1352
|
-
try:
|
1353
|
-
result = self._process_single_row(
|
1354
|
-
db_name, table_name, cursor, row, all_columns, sql,
|
1355
|
-
check_duplicate, duplicate_columns, update_on_duplicate
|
1356
|
-
)
|
1357
|
-
if result == 'inserted':
|
1358
|
-
batch_inserted += 1
|
1359
|
-
elif result == 'skipped':
|
1360
|
-
batch_skipped += 1
|
1361
|
-
else:
|
1362
|
-
batch_failed += 1
|
1363
|
-
conn.commit()
|
1364
|
-
logger.debug('单行插入成功', {
|
1365
|
-
'库': db_name,
|
1366
|
-
'表': table_name,
|
1367
|
-
'批次ID': batch_id,
|
1368
|
-
'行号': row_idx,
|
1369
|
-
'插入状态': result
|
1370
|
-
})
|
1371
|
-
except Exception as e:
|
1372
|
-
conn.rollback()
|
1373
|
-
batch_failed += 1
|
1374
|
-
logger.error('单行插入失败', {
|
1375
|
-
'库': db_name,
|
1376
|
-
'表': table_name,
|
1377
|
-
'批次ID': batch_id,
|
1378
|
-
'行号': row_idx,
|
1379
|
-
'错误': str(e),
|
1380
|
-
'SQL预览': sql[:200],
|
1381
|
-
'数据': self._shorten_for_log(row)
|
1382
|
-
})
|
1383
|
-
logger.debug('批次插入结束', {
|
1384
|
-
'库': db_name,
|
1385
|
-
'表': table_name,
|
1386
|
-
'批次ID': batch_id,
|
1387
|
-
'插入': batch_inserted,
|
1388
|
-
'跳过': batch_skipped,
|
1389
|
-
'失败': batch_failed,
|
1390
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1391
|
-
})
|
1392
|
-
return batch_inserted, batch_skipped, batch_failed
|
1393
|
-
|
1394
|
-
def _process_single_row(
|
1395
|
-
self,
|
1396
|
-
db_name: str,
|
1397
|
-
table_name: str,
|
1398
|
-
cursor,
|
1399
|
-
row: Dict,
|
1400
|
-
all_columns: List[str],
|
1401
|
-
sql: str,
|
1402
|
-
check_duplicate: bool,
|
1403
|
-
duplicate_columns: Optional[List[str]],
|
1404
|
-
update_on_duplicate: bool = False
|
1405
|
-
) -> str:
|
1406
|
-
"""
|
1407
|
-
处理单行数据插入
|
1408
|
-
|
1409
|
-
:param db_name: 数据库名
|
1410
|
-
:param table_name: 表名
|
1411
|
-
:param cursor: 数据库游标对象
|
1412
|
-
:param row: 单行数据(字典)
|
1413
|
-
:param all_columns: 需要插入的所有列名
|
1414
|
-
:param sql: 执行的SQL语句
|
1415
|
-
:param check_duplicate: 是否检查重复
|
1416
|
-
:param duplicate_columns: 排重列
|
1417
|
-
:param update_on_duplicate: 遇到重复时是否更新
|
1418
|
-
:return: 'inserted' | 'skipped' | 'failed'
|
1419
|
-
"""
|
1420
|
-
try:
|
1421
|
-
# 构造参数
|
1422
|
-
values = [row.get(col) for col in all_columns]
|
1423
|
-
if check_duplicate:
|
1424
|
-
# 需要为 WHERE NOT EXISTS 语句补充参数
|
1425
|
-
if not update_on_duplicate:
|
1426
|
-
# duplicate_columns 为空时,默认用所有列(排除id/更新时间)
|
1427
|
-
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1428
|
-
values = values + [row.get(col) for col in dup_cols]
|
1429
|
-
cursor.execute(sql, values)
|
1430
|
-
except Exception as e:
|
1431
|
-
logger.error('单行插入失败', {
|
1432
|
-
'库': db_name,
|
1433
|
-
'表': table_name,
|
1434
|
-
'row': self._shorten_for_log(row),
|
1435
|
-
'错误': str(e)
|
1436
|
-
})
|
1437
|
-
return 'failed'
|
1438
|
-
return 'inserted'
|
1439
|
-
|
1440
1288
|
def close(self) -> None:
|
1441
1289
|
"""
|
1442
1290
|
关闭连接池并清理资源
|
@@ -1447,10 +1295,10 @@ class MySQLUploader:
|
|
1447
1295
|
try:
|
1448
1296
|
if hasattr(self, 'pool') and self.pool is not None:
|
1449
1297
|
try:
|
1450
|
-
self.pool.close()
|
1298
|
+
# self.pool.close() # PooledDB 没有 close 方法
|
1299
|
+
self.pool = None
|
1451
1300
|
except Exception as e:
|
1452
1301
|
logger.warning('关闭连接池时出错', {'error': str(e)})
|
1453
|
-
self.pool = None
|
1454
1302
|
logger.info('连接池关闭', {'uploader.py': '连接池关闭'})
|
1455
1303
|
except Exception as e:
|
1456
1304
|
logger.error('关闭连接池失败', {'error': str(e)})
|
@@ -1458,11 +1306,12 @@ class MySQLUploader:
|
|
1458
1306
|
|
1459
1307
|
def _check_pool_health(self) -> bool:
|
1460
1308
|
"""
|
1461
|
-
|
1462
|
-
:return: 连接池健康返回True,否则返回False
|
1309
|
+
检查连接池健康状态,防止连接泄露
|
1463
1310
|
"""
|
1464
1311
|
conn = None
|
1465
1312
|
try:
|
1313
|
+
if not hasattr(self, 'pool') or self.pool is None:
|
1314
|
+
return False
|
1466
1315
|
conn = self.pool.connection()
|
1467
1316
|
conn.ping(reconnect=True)
|
1468
1317
|
logger.debug('连接池健康检查通过')
|
@@ -1471,12 +1320,13 @@ class MySQLUploader:
|
|
1471
1320
|
logger.warning('连接池健康检查失败', {'error': str(e)})
|
1472
1321
|
return False
|
1473
1322
|
finally:
|
1474
|
-
if conn:
|
1323
|
+
if conn is not None:
|
1475
1324
|
try:
|
1476
1325
|
conn.close()
|
1477
1326
|
except Exception as e:
|
1478
1327
|
logger.warning('关闭连接时出错', {'error': str(e)})
|
1479
1328
|
|
1329
|
+
@staticmethod
|
1480
1330
|
def retry_on_failure(max_retries: int = 3, delay: int = 1):
|
1481
1331
|
"""
|
1482
1332
|
通用重试装饰器
|
mdbq/spider/aikucun.py
CHANGED
@@ -3,10 +3,8 @@ import datetime
|
|
3
3
|
import requests
|
4
4
|
import json
|
5
5
|
import os
|
6
|
-
import sys
|
7
6
|
import re
|
8
7
|
import time
|
9
|
-
import warnings
|
10
8
|
import platform
|
11
9
|
import getpass
|
12
10
|
from selenium import webdriver
|
@@ -15,20 +13,18 @@ from selenium.webdriver.common.by import By
|
|
15
13
|
from selenium.webdriver.support import expected_conditions as EC
|
16
14
|
from selenium.webdriver.chrome.service import Service
|
17
15
|
import pymysql
|
18
|
-
|
19
|
-
from mdbq.log import spider_logging
|
20
|
-
from mdbq.mysql import mysql
|
16
|
+
from mdbq.mysql import uploader
|
21
17
|
from mdbq.mysql import s_query
|
22
18
|
from mdbq.config import config
|
23
19
|
from mdbq.other import ua_sj
|
24
20
|
from mdbq.other import otk
|
21
|
+
from mdbq.log import mylogger
|
25
22
|
|
26
23
|
dir_path = os.path.expanduser("~")
|
27
24
|
config_file = os.path.join(dir_path, 'spd.txt')
|
28
25
|
content = config.read_config(file_path=config_file)
|
29
26
|
username, password, host, port = content['username'], content['password'], content['host'], content['port']
|
30
27
|
|
31
|
-
# m_engine = mysql.MysqlUpload(username=username, password=password, host=host, port=port, charset='utf8mb4')
|
32
28
|
uld = uploader.MySQLUploader(username=username, password=password, host=host, port=int(port), pool_size=10)
|
33
29
|
# 实例化一个数据查询类,用来获取 cookies 表数据
|
34
30
|
download = s_query.QueryDatas(username=username, password=password, host=host, port=port)
|
@@ -188,11 +184,6 @@ class AikuCun:
|
|
188
184
|
'更新时间': 'timestamp'
|
189
185
|
}
|
190
186
|
# 更新至数据库记录
|
191
|
-
# m_engine.dict_to_mysql(
|
192
|
-
# db_name=self.db_name,
|
193
|
-
# table_name=self.table_name,
|
194
|
-
# dict_data=self.token,
|
195
|
-
# )
|
196
187
|
uld.upload_data(
|
197
188
|
db_name=self.db_name,
|
198
189
|
table_name=self.table_name,
|
@@ -429,15 +420,6 @@ class AikuCun:
|
|
429
420
|
drop_dup = ['日期', '平台', '店铺名称', '商品款号', '访客量']
|
430
421
|
else:
|
431
422
|
drop_dup = ['日期', '平台', '店铺名称', '条码']
|
432
|
-
# m_engine.insert_many_dict(
|
433
|
-
# db_name=db_name,
|
434
|
-
# table_name=table_name,
|
435
|
-
# dict_data_list=_results,
|
436
|
-
# icm_update=drop_dup, # 唯一组合键
|
437
|
-
# # unique_main_key=['人群id'],
|
438
|
-
# set_typ=set_typ,
|
439
|
-
# allow_not_null=False, # 创建允许插入空值的列
|
440
|
-
# )
|
441
423
|
uld.upload_data(
|
442
424
|
db_name=db_name,
|
443
425
|
table_name=table_name,
|
@@ -1,17 +1,17 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=1DrHdseKlRiF6aYmgAcpC6gl2Za3P3wK0gh0svyeqSQ,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/query_data.py,sha256=
|
4
|
+
mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
6
6
|
mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
|
7
7
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
8
|
-
mdbq/log/mylogger.py,sha256=
|
8
|
+
mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
|
9
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
|
-
mdbq/mysql/deduplicator.py,sha256=
|
12
|
-
mdbq/mysql/mysql.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=bIV010UkFfSUONY6-756x3tDVO4k6q3pqxoY3Z2xT-k,32990
|
12
|
+
mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
|
13
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
14
|
-
mdbq/mysql/uploader.py,sha256=
|
14
|
+
mdbq/mysql/uploader.py,sha256=K6PBLkmur-2dYdJ78_wFQPolSS44EiQfetDM0owy4E0,61699
|
15
15
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
16
16
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
17
17
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -23,8 +23,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
|
|
23
23
|
mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
24
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
25
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
26
|
-
mdbq/spider/aikucun.py,sha256=
|
27
|
-
mdbq-3.11.
|
28
|
-
mdbq-3.11.
|
29
|
-
mdbq-3.11.
|
30
|
-
mdbq-3.11.
|
26
|
+
mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
|
27
|
+
mdbq-3.11.4.dist-info/METADATA,sha256=Tnz1f-xIVia0AgsFI4nxuiHIjcfT1bxHk5zxHn23q9g,364
|
28
|
+
mdbq-3.11.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.11.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.11.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|