mdbq 3.11.1__py3-none-any.whl → 3.11.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/query_data.py +0 -3
- mdbq/log/mylogger.py +1 -1
- mdbq/mysql/deduplicator.py +1 -1
- mdbq/mysql/mysql.py +3 -4
- mdbq/mysql/uploader.py +212 -359
- mdbq/spider/aikucun.py +2 -20
- {mdbq-3.11.1.dist-info → mdbq-3.11.3.dist-info}/METADATA +1 -1
- {mdbq-3.11.1.dist-info → mdbq-3.11.3.dist-info}/RECORD +11 -11
- {mdbq-3.11.1.dist-info → mdbq-3.11.3.dist-info}/WHEEL +0 -0
- {mdbq-3.11.1.dist-info → mdbq-3.11.3.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.11.1'
|
1
|
+
VERSION = '3.11.3'
|
mdbq/aggregation/query_data.py
CHANGED
@@ -9,14 +9,11 @@ import pandas as pd
|
|
9
9
|
import numpy as np
|
10
10
|
from functools import wraps
|
11
11
|
import platform
|
12
|
-
import json
|
13
12
|
import os
|
14
13
|
import time
|
15
14
|
import calendar
|
16
15
|
import concurrent.futures
|
17
|
-
import traceback
|
18
16
|
import logging
|
19
|
-
import sys
|
20
17
|
|
21
18
|
"""
|
22
19
|
|
mdbq/log/mylogger.py
CHANGED
mdbq/mysql/deduplicator.py
CHANGED
mdbq/mysql/mysql.py
CHANGED
@@ -7,10 +7,9 @@ import warnings
|
|
7
7
|
import pymysql
|
8
8
|
import pandas as pd
|
9
9
|
from sqlalchemy import create_engine
|
10
|
-
import os
|
11
10
|
from mdbq.other import otk
|
12
11
|
from mdbq.log import mylogger
|
13
|
-
import
|
12
|
+
import math
|
14
13
|
|
15
14
|
warnings.filterwarnings('ignore')
|
16
15
|
"""
|
@@ -131,7 +130,7 @@ class MysqlUpload:
|
|
131
130
|
new_dict_data: dict = {}
|
132
131
|
for k, v in dict_data.items():
|
133
132
|
k = str(k).lower()
|
134
|
-
k = re.sub(r'[()\-,,$&~^、 ()\"\'
|
133
|
+
k = re.sub(r'[()\-,,$&~^、 ()\"\'"="·/。》《><!!`]', '_', k, re.IGNORECASE)
|
135
134
|
k = k.replace(')', '')
|
136
135
|
k = re.sub(r'_{2,}', '_', k)
|
137
136
|
k = re.sub(r'_+$', '', k)
|
@@ -526,7 +525,7 @@ class MysqlUpload:
|
|
526
525
|
new_dict_data = {}
|
527
526
|
for k, v in dict_data.items():
|
528
527
|
k = str(k).lower()
|
529
|
-
k = re.sub(r'[()\-,,$&~^、 ()\"\'
|
528
|
+
k = re.sub(r'[()\-,,$&~^、 ()\"\'"="·/。》《><!!`]', '_', k, re.IGNORECASE)
|
530
529
|
k = k.replace(')', '')
|
531
530
|
k = re.sub(r'_{2,}', '_', k)
|
532
531
|
k = re.sub(r'_+$', '', k)
|
mdbq/mysql/uploader.py
CHANGED
@@ -10,8 +10,8 @@ from mdbq.log import mylogger
|
|
10
10
|
from typing import Union, List, Dict, Optional, Any, Tuple, Set
|
11
11
|
from dbutils.pooled_db import PooledDB
|
12
12
|
import json
|
13
|
-
from collections import OrderedDict
|
14
13
|
import sys
|
14
|
+
from decimal import Decimal, InvalidOperation
|
15
15
|
|
16
16
|
warnings.filterwarnings('ignore')
|
17
17
|
logger = mylogger.MyLogger(
|
@@ -28,62 +28,44 @@ logger = mylogger.MyLogger(
|
|
28
28
|
)
|
29
29
|
|
30
30
|
|
31
|
-
def count_decimal_places(num_str):
|
31
|
+
def count_decimal_places(num_str: str) -> Tuple[int, int]:
|
32
32
|
"""
|
33
|
-
|
34
|
-
|
35
|
-
:param num_str: 数字字符串
|
36
|
-
:return: 返回元组(整数位数, 小数位数)
|
37
|
-
:raises: 无显式抛出异常,但正则匹配失败时返回(0, 0)
|
38
|
-
"""
|
39
|
-
match = re.match(r'^[-+]?\d+(\.\d+)?([eE][-+]?\d+)?$', str(num_str))
|
40
|
-
if match:
|
41
|
-
# 如果是科学计数法
|
42
|
-
match = re.findall(r'(\d+)\.(\d+)[eE][-+]?(\d+)$', str(num_str))
|
43
|
-
if match:
|
44
|
-
if len(match[0]) == 3:
|
45
|
-
if int(match[0][2]) < len(match[0][1]):
|
46
|
-
# count_int 清除整数部分开头的 0 并计算整数位数
|
47
|
-
count_int = len(re.sub('^0+', '', str(match[0][0]))) + int(match[0][2])
|
48
|
-
# 计算小数位数
|
49
|
-
count_float = len(match[0][1]) - int(match[0][2])
|
50
|
-
return count_int, count_float
|
51
|
-
# 如果是普通小数
|
52
|
-
match = re.findall(r'(\d+)\.(\d+)$', str(num_str))
|
53
|
-
if match:
|
54
|
-
count_int = len(re.sub('^0+', '', str(match[0][0])))
|
55
|
-
count_float = len(match[0][1])
|
56
|
-
return count_int, count_float # 计算小数位数
|
57
|
-
return 0, 0
|
58
|
-
|
59
|
-
|
60
|
-
class StatementCache(OrderedDict):
|
61
|
-
"""
|
62
|
-
基于OrderedDict实现的LRU缓存策略,用于缓存SQL语句
|
63
|
-
|
64
|
-
这个类继承自OrderedDict,实现了最近最少使用(LRU)的缓存策略。
|
65
|
-
当缓存达到最大容量时,会自动删除最早添加的项。
|
33
|
+
统计小数点前后位数,支持科学计数法。
|
34
|
+
返回:(整数位数, 小数位数)
|
66
35
|
"""
|
36
|
+
try:
|
37
|
+
d = Decimal(str(num_str))
|
38
|
+
sign, digits, exponent = d.as_tuple()
|
39
|
+
int_part = len(digits) + exponent if exponent < 0 else len(digits)
|
40
|
+
dec_part = -exponent if exponent < 0 else 0
|
41
|
+
return max(int_part, 0), max(dec_part, 0)
|
42
|
+
except (InvalidOperation, ValueError, TypeError):
|
43
|
+
return (0, 0)
|
44
|
+
|
45
|
+
|
46
|
+
class StatementCache(dict):
|
47
|
+
"""简单LRU缓存实现,用于SQL语句缓存"""
|
67
48
|
def __init__(self, maxsize=100):
|
68
|
-
"""
|
69
|
-
初始化缓存
|
70
|
-
|
71
|
-
:param maxsize: 最大缓存大小,默认为100条SQL语句
|
72
|
-
"""
|
73
49
|
super().__init__()
|
74
|
-
self.
|
75
|
-
|
50
|
+
self._maxsize = maxsize
|
51
|
+
self._order = []
|
52
|
+
def __getitem__(self, key):
|
53
|
+
value = super().__getitem__(key)
|
54
|
+
self._order.remove(key)
|
55
|
+
self._order.append(key)
|
56
|
+
return value
|
76
57
|
def __setitem__(self, key, value):
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
"""
|
58
|
+
if key in self:
|
59
|
+
self._order.remove(key)
|
60
|
+
elif len(self._order) >= self._maxsize:
|
61
|
+
oldest = self._order.pop(0)
|
62
|
+
super().__delitem__(oldest)
|
83
63
|
super().__setitem__(key, value)
|
84
|
-
|
85
|
-
|
86
|
-
|
64
|
+
self._order.append(key)
|
65
|
+
def get(self, key, default=None):
|
66
|
+
if key in self:
|
67
|
+
return self[key]
|
68
|
+
return default
|
87
69
|
|
88
70
|
class MySQLUploader:
|
89
71
|
"""
|
@@ -194,22 +176,22 @@ class MySQLUploader:
|
|
194
176
|
logger.error('连接池创建失败', {'error': str(e), 'host': self.host, 'port': self.port})
|
195
177
|
raise ConnectionError(f'连接池创建失败: {str(e)}')
|
196
178
|
|
197
|
-
|
179
|
+
@staticmethod
|
180
|
+
def _execute_with_retry(func):
|
198
181
|
"""
|
199
182
|
带重试机制的装饰器,用于数据库操作
|
200
|
-
|
201
183
|
:param func: 被装饰的函数
|
202
184
|
:return: 装饰后的函数
|
203
185
|
:raises: 可能抛出原始异常或最后一次重试的异常
|
204
186
|
"""
|
205
187
|
@wraps(func)
|
206
|
-
def wrapper(*args, **kwargs):
|
188
|
+
def wrapper(self, *args, **kwargs):
|
207
189
|
last_exception = None
|
208
190
|
operation = func.__name__
|
209
191
|
logger.debug(f'开始执行操作: {operation}', {'max_retries': self.max_retries})
|
210
192
|
for attempt in range(self.max_retries):
|
211
193
|
try:
|
212
|
-
result = func(*args, **kwargs)
|
194
|
+
result = func(self, *args, **kwargs)
|
213
195
|
if attempt > 0:
|
214
196
|
logger.info('操作成功(重试后)', {'operation': operation, 'attempts': attempt + 1})
|
215
197
|
else:
|
@@ -255,6 +237,7 @@ class MySQLUploader:
|
|
255
237
|
raise last_exception if last_exception else Exception('发生未知错误')
|
256
238
|
return wrapper
|
257
239
|
|
240
|
+
@_execute_with_retry
|
258
241
|
def _get_connection(self) -> pymysql.connections.Connection:
|
259
242
|
"""
|
260
243
|
从连接池获取数据库连接
|
@@ -270,6 +253,7 @@ class MySQLUploader:
|
|
270
253
|
logger.error('获取数据库连接失败', {'error': str(e)})
|
271
254
|
raise ConnectionError(f'连接数据库失败: {str(e)}')
|
272
255
|
|
256
|
+
@_execute_with_retry
|
273
257
|
def _check_database_exists(self, db_name: str) -> bool:
|
274
258
|
"""
|
275
259
|
检查数据库是否存在
|
@@ -280,6 +264,7 @@ class MySQLUploader:
|
|
280
264
|
"""
|
281
265
|
db_name = self._validate_identifier(db_name)
|
282
266
|
sql = 'SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s'
|
267
|
+
conn = None
|
283
268
|
try:
|
284
269
|
with self._get_connection() as conn:
|
285
270
|
with conn.cursor() as cursor:
|
@@ -291,6 +276,7 @@ class MySQLUploader:
|
|
291
276
|
logger.error('检查数据库是否存在时出错', {'库': db_name, '错误': str(e)})
|
292
277
|
raise
|
293
278
|
|
279
|
+
@_execute_with_retry
|
294
280
|
def _create_database(self, db_name: str) -> None:
|
295
281
|
"""
|
296
282
|
创建数据库
|
@@ -300,6 +286,7 @@ class MySQLUploader:
|
|
300
286
|
"""
|
301
287
|
db_name = self._validate_identifier(db_name)
|
302
288
|
sql = f'CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}'
|
289
|
+
conn = None
|
303
290
|
try:
|
304
291
|
with self._get_connection() as conn:
|
305
292
|
with conn.cursor() as cursor:
|
@@ -308,7 +295,8 @@ class MySQLUploader:
|
|
308
295
|
logger.info('数据库已创建', {'库': db_name})
|
309
296
|
except Exception as e:
|
310
297
|
logger.error('无法创建数据库', {'库': db_name, '错误': str(e)})
|
311
|
-
conn
|
298
|
+
if conn is not None:
|
299
|
+
conn.rollback()
|
312
300
|
raise
|
313
301
|
|
314
302
|
def _get_partition_table_name(self, table_name: str, date_value: str, partition_by: str) -> str:
|
@@ -345,10 +333,8 @@ class MySQLUploader:
|
|
345
333
|
if not identifier or not isinstance(identifier, str):
|
346
334
|
logger.error('无效的标识符', {'标识符': identifier})
|
347
335
|
raise ValueError(f"无效的标识符: `{identifier}`")
|
348
|
-
|
349
|
-
|
350
|
-
else:
|
351
|
-
cleaned = identifier
|
336
|
+
# 始终做特殊字符清理
|
337
|
+
cleaned = re.sub(r'[^-\uFFFF\w\u4e00-\u9fff$]', '_', identifier)
|
352
338
|
cleaned = re.sub(r'_+', '_', cleaned).strip('_')
|
353
339
|
if not cleaned:
|
354
340
|
logger.error('无法清理异常标识符', {'原始标识符': identifier})
|
@@ -362,6 +348,7 @@ class MySQLUploader:
|
|
362
348
|
return f"`{cleaned}`"
|
363
349
|
return cleaned
|
364
350
|
|
351
|
+
@_execute_with_retry
|
365
352
|
def _check_table_exists(self, db_name: str, table_name: str) -> bool:
|
366
353
|
"""
|
367
354
|
检查表是否存在
|
@@ -396,6 +383,7 @@ class MySQLUploader:
|
|
396
383
|
logger.debug('表存在检查', {'库': db_name, '表': table_name, '存在': result})
|
397
384
|
return result
|
398
385
|
|
386
|
+
@_execute_with_retry
|
399
387
|
def _create_table(
|
400
388
|
self,
|
401
389
|
db_name: str,
|
@@ -407,16 +395,7 @@ class MySQLUploader:
|
|
407
395
|
allow_null: bool = False
|
408
396
|
) -> None:
|
409
397
|
"""
|
410
|
-
|
411
|
-
|
412
|
-
:param db_name: 数据库名
|
413
|
-
:param table_name: 表名
|
414
|
-
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
415
|
-
:param primary_keys: 主键列列表,可选
|
416
|
-
:param date_column: 日期列名,可选,如果存在将设置为索引
|
417
|
-
:param indexes: 需要创建索引的列列表,可选
|
418
|
-
:param allow_null: 是否允许空值,默认为False
|
419
|
-
:raises: 可能抛出数据库相关异常
|
398
|
+
创建数据表,优化索引创建方式
|
420
399
|
"""
|
421
400
|
db_name = self._validate_identifier(db_name)
|
422
401
|
table_name = self._validate_identifier(table_name)
|
@@ -439,40 +418,35 @@ class MySQLUploader:
|
|
439
418
|
primary_keys = ['id']
|
440
419
|
safe_primary_keys = [self._validate_identifier(pk) for pk in primary_keys]
|
441
420
|
primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
|
421
|
+
# 索引统一在CREATE TABLE中定义
|
422
|
+
index_defs = []
|
423
|
+
if date_column and date_column in set_typ:
|
424
|
+
safe_date_col = self._validate_identifier(date_column)
|
425
|
+
index_defs.append(f"INDEX `idx_{safe_date_col}` (`{safe_date_col}`)")
|
426
|
+
if indexes:
|
427
|
+
for idx_col in indexes:
|
428
|
+
if idx_col in set_typ:
|
429
|
+
safe_idx_col = self._validate_identifier(idx_col)
|
430
|
+
index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
|
431
|
+
index_sql = (',' + ','.join(index_defs)) if index_defs else ''
|
442
432
|
sql = f"""
|
443
433
|
CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
|
444
434
|
{','.join(column_defs)}
|
445
435
|
{primary_key_sql}
|
436
|
+
{index_sql}
|
446
437
|
) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
|
447
438
|
"""
|
439
|
+
conn = None
|
448
440
|
try:
|
449
441
|
with self._get_connection() as conn:
|
450
442
|
with conn.cursor() as cursor:
|
451
443
|
cursor.execute(sql)
|
452
|
-
logger.info('数据表已创建', {'库': db_name, '表': table_name})
|
453
|
-
index_statements = []
|
454
|
-
if date_column and date_column in set_typ:
|
455
|
-
safe_date_col = self._validate_identifier(date_column)
|
456
|
-
index_statements.append(
|
457
|
-
f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_date_col}` (`{safe_date_col}`)"
|
458
|
-
)
|
459
|
-
if indexes:
|
460
|
-
for idx_col in indexes:
|
461
|
-
if idx_col in set_typ:
|
462
|
-
safe_idx_col = self._validate_identifier(idx_col)
|
463
|
-
index_statements.append(
|
464
|
-
f"ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)"
|
465
|
-
)
|
466
|
-
if index_statements:
|
467
|
-
with conn.cursor() as cursor:
|
468
|
-
for stmt in index_statements:
|
469
|
-
cursor.execute(stmt)
|
470
|
-
logger.debug('执行索引语句', {'SQL': stmt})
|
471
444
|
conn.commit()
|
472
|
-
logger.info('
|
445
|
+
logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes})
|
473
446
|
except Exception as e:
|
474
447
|
logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
475
|
-
conn
|
448
|
+
if conn is not None:
|
449
|
+
conn.rollback()
|
476
450
|
raise
|
477
451
|
|
478
452
|
def _validate_datetime(self, value: str, date_type: bool = False) -> Any:
|
@@ -511,19 +485,24 @@ class MySQLUploader:
|
|
511
485
|
logger.error('无效的日期格式', {'值': value})
|
512
486
|
raise ValueError(f"无效的日期格式: `{value}`")
|
513
487
|
|
514
|
-
def _validate_value(self, value: Any, column_type: str, allow_null: bool) -> Any:
|
488
|
+
def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
|
515
489
|
"""
|
516
490
|
根据列类型验证并转换数据值
|
517
491
|
|
518
492
|
:param value: 要验证的值
|
519
493
|
:param column_type: 列的数据类型
|
520
494
|
:param allow_null: 是否允许空值
|
495
|
+
:param db_name: 数据库名(用于日志)
|
496
|
+
:param table_name: 表名(用于日志)
|
497
|
+
:param col_name: 列名(用于日志)
|
521
498
|
:return: 转换后的值
|
522
499
|
:raises ValueError: 当值转换失败时抛出
|
523
500
|
"""
|
524
501
|
if value is None:
|
525
502
|
if not allow_null:
|
526
|
-
logger.warning('字段值为None
|
503
|
+
logger.warning('字段值为None但不允许空值, 已填充为none', {
|
504
|
+
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
505
|
+
})
|
527
506
|
return 'none'
|
528
507
|
return None
|
529
508
|
try:
|
@@ -536,14 +515,18 @@ class MySQLUploader:
|
|
536
515
|
logger.debug('百分比字符串转小数', {'原始': value, '结果': decimal_value})
|
537
516
|
return decimal_value
|
538
517
|
except ValueError:
|
539
|
-
logger.warning('百分比字符串转小数失败', {
|
518
|
+
logger.warning('百分比字符串转小数失败', {
|
519
|
+
'库': db_name, '表': table_name, '列': col_name, '原始': value
|
520
|
+
})
|
540
521
|
elif 'int' in column_type_lower:
|
541
522
|
if isinstance(value, str):
|
542
523
|
value = value.replace(',', '').strip()
|
543
524
|
try:
|
544
525
|
return int(float(value))
|
545
526
|
except ValueError:
|
546
|
-
logger.error('字符串转整数失败', {
|
527
|
+
logger.error('字符串转整数失败', {
|
528
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value
|
529
|
+
})
|
547
530
|
raise ValueError(f"`{value}` -> 无法转为整数")
|
548
531
|
return int(value) if value is not None else None
|
549
532
|
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
@@ -557,7 +540,9 @@ class MySQLUploader:
|
|
557
540
|
try:
|
558
541
|
return self._validate_datetime(value)
|
559
542
|
except ValueError as e:
|
560
|
-
logger.error('无效日期格式', {
|
543
|
+
logger.error('无效日期格式', {
|
544
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value, '错误': str(e)
|
545
|
+
})
|
561
546
|
raise ValueError(f"无效日期格式: `{value}` -> {str(e)}")
|
562
547
|
return str(value)
|
563
548
|
elif 'char' in column_type_lower or 'text' in column_type_lower:
|
@@ -569,9 +554,12 @@ class MySQLUploader:
|
|
569
554
|
else:
|
570
555
|
return value
|
571
556
|
except (ValueError, TypeError) as e:
|
572
|
-
logger.error('数据类型转换异常', {
|
557
|
+
logger.error('数据类型转换异常', {
|
558
|
+
'库': db_name, '表': table_name, '列': col_name, '值': value, '目标类型': column_type, '错误': str(e)
|
559
|
+
})
|
573
560
|
raise ValueError(f"转换异常 -> 无法将 `{value}` 的数据类型转为: `{column_type}` -> {str(e)}")
|
574
561
|
|
562
|
+
@_execute_with_retry
|
575
563
|
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
|
576
564
|
"""
|
577
565
|
获取表的列名和数据类型
|
@@ -593,7 +581,10 @@ class MySQLUploader:
|
|
593
581
|
with self._get_connection() as conn:
|
594
582
|
with conn.cursor() as cursor:
|
595
583
|
cursor.execute(sql, (db_name, table_name))
|
596
|
-
|
584
|
+
if self.case_sensitive:
|
585
|
+
set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
|
586
|
+
else:
|
587
|
+
set_typ = {row['COLUMN_NAME'].lower(): row['DATA_TYPE'] for row in cursor.fetchall()}
|
597
588
|
logger.debug('获取表的列信息', {'库': db_name, '表': table_name, '列信息': set_typ})
|
598
589
|
return set_typ
|
599
590
|
except Exception as e:
|
@@ -727,39 +718,38 @@ class MySQLUploader:
|
|
727
718
|
"""
|
728
719
|
1. pandas:规范化列名
|
729
720
|
2. 字典列表:规范化每个字典的键
|
730
|
-
|
731
|
-
参数:
|
732
|
-
data: 输入数据,支持两种类型:
|
733
|
-
- pandas.DataFrame:将规范化其列名
|
734
|
-
- List[Dict[str, Any]]:将规范化列表中每个字典的键
|
735
721
|
"""
|
736
722
|
if isinstance(data, pd.DataFrame):
|
737
|
-
|
723
|
+
if self.case_sensitive:
|
724
|
+
data.columns = [self._validate_identifier(col) for col in data.columns]
|
725
|
+
else:
|
726
|
+
data.columns = [self._validate_identifier(col).lower() for col in data.columns]
|
738
727
|
return data
|
739
728
|
elif isinstance(data, list):
|
740
|
-
|
729
|
+
if self.case_sensitive:
|
730
|
+
return [{self._validate_identifier(k): v for k, v in item.items()} for item in data]
|
731
|
+
else:
|
732
|
+
return [{self._validate_identifier(k).lower(): v for k, v in item.items()} for item in data]
|
741
733
|
return data
|
742
734
|
|
743
735
|
def _prepare_data(
|
744
736
|
self,
|
745
737
|
data: Union[Dict, List[Dict], pd.DataFrame],
|
746
738
|
set_typ: Dict[str, str],
|
747
|
-
allow_null: bool = False
|
739
|
+
allow_null: bool = False,
|
740
|
+
db_name: str = None,
|
741
|
+
table_name: str = None,
|
748
742
|
) -> Tuple[List[Dict], Dict[str, str]]:
|
749
743
|
"""
|
750
744
|
准备要上传的数据,验证并转换数据类型
|
751
|
-
|
752
|
-
:param data: 输入数据,可以是字典、字典列表或DataFrame
|
753
|
-
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
754
|
-
:param allow_null: 是否允许空值
|
755
|
-
:return: 元组(准备好的数据列表, 过滤后的列类型字典)
|
756
|
-
:raises ValueError: 当数据验证失败时抛出
|
757
745
|
"""
|
758
746
|
# 统一数据格式为字典列表
|
759
747
|
if isinstance(data, pd.DataFrame):
|
760
748
|
try:
|
761
|
-
|
762
|
-
|
749
|
+
if self.case_sensitive:
|
750
|
+
data.columns = [col for col in data.columns]
|
751
|
+
else:
|
752
|
+
data.columns = [col.lower() for col in data.columns]
|
763
753
|
data = data.replace({pd.NA: None}).to_dict('records')
|
764
754
|
except Exception as e:
|
765
755
|
logger.error('数据转字典时发生错误', {
|
@@ -769,10 +759,15 @@ class MySQLUploader:
|
|
769
759
|
})
|
770
760
|
raise ValueError(f"数据转字典时发生错误: {e}")
|
771
761
|
elif isinstance(data, dict):
|
772
|
-
|
762
|
+
if self.case_sensitive:
|
763
|
+
data = [{k: v for k, v in data.items()}]
|
764
|
+
else:
|
765
|
+
data = [{k.lower(): v for k, v in data.items()}]
|
773
766
|
elif isinstance(data, list) and all(isinstance(item, dict) for item in data):
|
774
|
-
|
775
|
-
|
767
|
+
if self.case_sensitive:
|
768
|
+
data = [{k: v for k, v in item.items()} for item in data]
|
769
|
+
else:
|
770
|
+
data = [{k.lower(): v for k, v in item.items()} for item in data]
|
776
771
|
else:
|
777
772
|
logger.error('数据结构必须是字典、列表、字典列表或dataframe', {
|
778
773
|
'data': self._shorten_for_log(data),
|
@@ -783,8 +778,11 @@ class MySQLUploader:
|
|
783
778
|
# 统一处理原始数据中列名的特殊字符
|
784
779
|
data = self.normalize_column_names(data)
|
785
780
|
|
786
|
-
#
|
787
|
-
|
781
|
+
# set_typ的键处理
|
782
|
+
if self.case_sensitive:
|
783
|
+
set_typ = {k: v for k, v in set_typ.items()}
|
784
|
+
else:
|
785
|
+
set_typ = {k.lower(): v for k, v in set_typ.items()}
|
788
786
|
|
789
787
|
# 获取数据中实际存在的列名
|
790
788
|
data_columns = set()
|
@@ -797,25 +795,25 @@ class MySQLUploader:
|
|
797
795
|
if col in set_typ:
|
798
796
|
filtered_set_typ[col] = set_typ[col]
|
799
797
|
else:
|
800
|
-
# 如果列不在set_typ
|
801
|
-
sample_values = [row[col] for row in data if col in row and row[col] is not None][:
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
798
|
+
# 如果列不在set_typ中,采样多个非None值推断类型
|
799
|
+
sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
|
800
|
+
inferred_type = None
|
801
|
+
for val in sample_values:
|
802
|
+
inferred_type = self._infer_data_type(val)
|
803
|
+
if inferred_type:
|
804
|
+
break
|
805
|
+
if not inferred_type:
|
806
|
+
inferred_type = 'VARCHAR(255)'
|
807
|
+
filtered_set_typ[col] = inferred_type
|
808
|
+
logger.debug(f"自动推断列 `{col}` 的数据类型为: `{inferred_type}`")
|
810
809
|
|
811
810
|
prepared_data = []
|
812
811
|
for row_idx, row in enumerate(data, 1):
|
813
812
|
prepared_row = {}
|
814
813
|
for col_name in filtered_set_typ:
|
815
814
|
# 跳过id列,不允许外部传入id
|
816
|
-
if col_name.lower() == 'id':
|
815
|
+
if (self.case_sensitive and col_name == 'id') or (not self.case_sensitive and col_name.lower() == 'id'):
|
817
816
|
continue
|
818
|
-
|
819
817
|
if col_name not in row:
|
820
818
|
if not allow_null:
|
821
819
|
error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`"
|
@@ -824,7 +822,7 @@ class MySQLUploader:
|
|
824
822
|
prepared_row[col_name] = None
|
825
823
|
else:
|
826
824
|
try:
|
827
|
-
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null)
|
825
|
+
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
|
828
826
|
except ValueError as e:
|
829
827
|
logger.error('数据验证失败', {
|
830
828
|
'列': col_name,
|
@@ -856,7 +854,7 @@ class MySQLUploader:
|
|
856
854
|
transaction_mode: str = "batch"
|
857
855
|
):
|
858
856
|
"""
|
859
|
-
|
857
|
+
上传数据到数据库的主入口方法,分表逻辑异常处理统计丢弃数据
|
860
858
|
|
861
859
|
:param db_name: 数据库名
|
862
860
|
:param table_name: 表名
|
@@ -882,6 +880,7 @@ class MySQLUploader:
|
|
882
880
|
|
883
881
|
batch_id = f"batch_{int(time.time() * 1000)}"
|
884
882
|
success_flag = False
|
883
|
+
dropped_rows = 0
|
885
884
|
|
886
885
|
logger.info("开始上传", {
|
887
886
|
'库': db_name,
|
@@ -918,7 +917,7 @@ class MySQLUploader:
|
|
918
917
|
raise ValueError("分表方式必须是 'year' 或 'month' 或 'None'")
|
919
918
|
|
920
919
|
# 准备数据
|
921
|
-
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null)
|
920
|
+
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null, db_name, table_name)
|
922
921
|
|
923
922
|
# 检查数据库是否存在
|
924
923
|
if not self._check_database_exists(db_name):
|
@@ -945,8 +944,8 @@ class MySQLUploader:
|
|
945
944
|
'row': self._shorten_for_log(row),
|
946
945
|
'func': sys._getframe().f_code.co_name,
|
947
946
|
})
|
948
|
-
|
949
|
-
|
947
|
+
dropped_rows += 1
|
948
|
+
continue
|
950
949
|
part_table = self._get_partition_table_name(
|
951
950
|
table_name,
|
952
951
|
str(row[partition_date_column]),
|
@@ -963,7 +962,8 @@ class MySQLUploader:
|
|
963
962
|
'error': str(e),
|
964
963
|
'func': sys._getframe().f_code.co_name,
|
965
964
|
})
|
966
|
-
|
965
|
+
dropped_rows += 1
|
966
|
+
continue
|
967
967
|
|
968
968
|
# 对每个分表执行上传
|
969
969
|
for part_table, part_data in partitioned_data.items():
|
@@ -1010,10 +1010,11 @@ class MySQLUploader:
|
|
1010
1010
|
'表': table_name,
|
1011
1011
|
'批次': batch_id,
|
1012
1012
|
'finish': success_flag,
|
1013
|
-
|
1014
|
-
'
|
1013
|
+
'数据行': initial_row_count,
|
1014
|
+
'丢弃行数': dropped_rows
|
1015
1015
|
})
|
1016
1016
|
|
1017
|
+
@_execute_with_retry
|
1017
1018
|
def _insert_data(
|
1018
1019
|
self,
|
1019
1020
|
db_name: str,
|
@@ -1178,34 +1179,27 @@ class MySQLUploader:
|
|
1178
1179
|
update_on_duplicate: bool
|
1179
1180
|
) -> str:
|
1180
1181
|
"""
|
1181
|
-
准备插入SQL
|
1182
|
-
|
1183
|
-
1. 当 check_duplicate=False 时,忽略 duplicate_columns 和 update_on_duplicate 参数,直接插入全部data。
|
1184
|
-
2. 当 check_duplicate=False 且 update_on_duplicate=True 时,由于 check_duplicate=False,直接插入全部data。
|
1185
|
-
3. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=True 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1186
|
-
4. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=False 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1187
|
-
5. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=True 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1188
|
-
6. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=False 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1189
|
-
|
1182
|
+
准备插入SQL语句, 增加StatementCache缓存
|
1190
1183
|
"""
|
1184
|
+
cache_key = (db_name, table_name, tuple(sorted(set_typ.items())), check_duplicate, tuple(duplicate_columns) if duplicate_columns else (), update_on_duplicate)
|
1185
|
+
cached = self._prepared_statements.get(cache_key)
|
1186
|
+
if cached:
|
1187
|
+
return cached
|
1191
1188
|
# 获取所有列名(排除id)
|
1192
1189
|
all_columns = [col for col in set_typ.keys()
|
1193
1190
|
if col.lower() != 'id']
|
1194
|
-
|
1195
|
-
# 情况1-2:不检查重复
|
1196
1191
|
if not check_duplicate:
|
1197
|
-
|
1192
|
+
sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
|
1198
1193
|
update_on_duplicate)
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1206
|
-
# 情况3-6:检查重复
|
1207
|
-
return self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
1194
|
+
else:
|
1195
|
+
dup_cols = duplicate_columns if duplicate_columns else [
|
1196
|
+
col for col in all_columns
|
1197
|
+
if col.lower() not in self.base_excute_col
|
1198
|
+
]
|
1199
|
+
sql = self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
1208
1200
|
dup_cols, update_on_duplicate, set_typ)
|
1201
|
+
self._prepared_statements[cache_key] = sql
|
1202
|
+
return sql
|
1209
1203
|
|
1210
1204
|
def _execute_batch_insert(
|
1211
1205
|
self,
|
@@ -1220,10 +1214,8 @@ class MySQLUploader:
|
|
1220
1214
|
transaction_mode: str,
|
1221
1215
|
update_on_duplicate: bool = False
|
1222
1216
|
) -> Tuple[int, int, int]:
|
1223
|
-
"""
|
1224
|
-
|
1217
|
+
"""执行批量插入操作,优化batch和hybrid模式"""
|
1225
1218
|
def get_optimal_batch_size(total_rows: int) -> int:
|
1226
|
-
# 根据数据量调整批量大小
|
1227
1219
|
if total_rows <= 100:
|
1228
1220
|
return total_rows
|
1229
1221
|
elif total_rows <= 1000:
|
@@ -1232,205 +1224,64 @@ class MySQLUploader:
|
|
1232
1224
|
return 1000
|
1233
1225
|
else:
|
1234
1226
|
return 2000
|
1235
|
-
|
1236
1227
|
batch_size = get_optimal_batch_size(len(data))
|
1237
|
-
|
1238
|
-
# 获取所有列名(排除id列)
|
1239
|
-
all_columns = [col for col in set_typ.keys()
|
1240
|
-
if col.lower() != 'id']
|
1241
|
-
|
1228
|
+
all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
|
1242
1229
|
total_inserted = 0
|
1243
1230
|
total_skipped = 0
|
1244
1231
|
total_failed = 0
|
1245
|
-
|
1246
1232
|
with self._get_connection() as conn:
|
1247
1233
|
with conn.cursor() as cursor:
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1234
|
+
if transaction_mode == 'batch':
|
1235
|
+
for i in range(0, len(data), batch_size):
|
1236
|
+
batch = data[i:i + batch_size]
|
1237
|
+
values_list = []
|
1238
|
+
for row in batch:
|
1239
|
+
values = [row.get(col) for col in all_columns]
|
1240
|
+
if check_duplicate and not update_on_duplicate:
|
1241
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1242
|
+
values += [row.get(col) for col in dup_cols]
|
1243
|
+
values_list.append(values)
|
1244
|
+
try:
|
1245
|
+
cursor.executemany(sql, values_list)
|
1246
|
+
conn.commit()
|
1247
|
+
total_inserted += len(batch)
|
1248
|
+
except Exception as e:
|
1249
|
+
conn.rollback()
|
1250
|
+
total_failed += len(batch)
|
1251
|
+
logger.error('批量插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1252
|
+
elif transaction_mode == 'hybrid':
|
1253
|
+
hybrid_n = 100 # 可配置
|
1254
|
+
for i in range(0, len(data), hybrid_n):
|
1255
|
+
batch = data[i:i + hybrid_n]
|
1256
|
+
for row in batch:
|
1257
|
+
try:
|
1258
|
+
values = [row.get(col) for col in all_columns]
|
1259
|
+
if check_duplicate and not update_on_duplicate:
|
1260
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1261
|
+
values += [row.get(col) for col in dup_cols]
|
1262
|
+
cursor.execute(sql, values)
|
1263
|
+
total_inserted += 1
|
1264
|
+
except Exception as e:
|
1265
|
+
conn.rollback()
|
1266
|
+
total_failed += 1
|
1267
|
+
logger.error('hybrid单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1268
|
+
conn.commit()
|
1269
|
+
else: # row模式
|
1270
|
+
for row in data:
|
1271
|
+
try:
|
1272
|
+
values = [row.get(col) for col in all_columns]
|
1273
|
+
if check_duplicate and not update_on_duplicate:
|
1274
|
+
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1275
|
+
values += [row.get(col) for col in dup_cols]
|
1276
|
+
cursor.execute(sql, values)
|
1277
|
+
conn.commit()
|
1278
|
+
total_inserted += 1
|
1279
|
+
except Exception as e:
|
1280
|
+
conn.rollback()
|
1281
|
+
total_failed += 1
|
1282
|
+
logger.error('单行插入失败', {'库': db_name, '表': table_name, '错误': str(e)})
|
1261
1283
|
return total_inserted, total_skipped, total_failed
|
1262
1284
|
|
1263
|
-
def _process_batch(
|
1264
|
-
self,
|
1265
|
-
conn,
|
1266
|
-
cursor,
|
1267
|
-
db_name: str,
|
1268
|
-
table_name: str,
|
1269
|
-
batch: List[Dict],
|
1270
|
-
all_columns: List[str],
|
1271
|
-
sql: str,
|
1272
|
-
check_duplicate: bool,
|
1273
|
-
duplicate_columns: Optional[List[str]],
|
1274
|
-
batch_id: Optional[str],
|
1275
|
-
transaction_mode: str,
|
1276
|
-
update_on_duplicate: bool = False
|
1277
|
-
) -> Tuple[int, int, int]:
|
1278
|
-
"""
|
1279
|
-
处理单个批次的数据插入
|
1280
|
-
|
1281
|
-
:param conn: 数据库连接对象
|
1282
|
-
:param cursor: 数据库游标对象
|
1283
|
-
:param db_name: 数据库名
|
1284
|
-
:param table_name: 表名
|
1285
|
-
:param batch: 当前批次的数据(字典列表)
|
1286
|
-
:param all_columns: 需要插入的所有列名
|
1287
|
-
:param sql: 执行的SQL语句
|
1288
|
-
:param check_duplicate: 是否检查重复
|
1289
|
-
:param duplicate_columns: 排重列
|
1290
|
-
:param batch_id: 批次ID
|
1291
|
-
:param transaction_mode: 事务模式
|
1292
|
-
:param update_on_duplicate: 遇到重复时是否更新
|
1293
|
-
:return: (插入数, 跳过数, 失败数)
|
1294
|
-
"""
|
1295
|
-
batch_inserted = 0
|
1296
|
-
batch_skipped = 0
|
1297
|
-
batch_failed = 0
|
1298
|
-
batch_size = len(batch)
|
1299
|
-
logger.debug('批次插入开始', {
|
1300
|
-
'库': db_name,
|
1301
|
-
'表': table_name,
|
1302
|
-
'批次ID': batch_id,
|
1303
|
-
'批次大小': batch_size,
|
1304
|
-
'事务模式': transaction_mode,
|
1305
|
-
'SQL预览': sql[:200],
|
1306
|
-
'排重': check_duplicate,
|
1307
|
-
'排重列': duplicate_columns,
|
1308
|
-
'允许更新': update_on_duplicate,
|
1309
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1310
|
-
})
|
1311
|
-
if transaction_mode == 'batch':
|
1312
|
-
try:
|
1313
|
-
for row_idx, row in enumerate(batch, 1):
|
1314
|
-
result = self._process_single_row(
|
1315
|
-
db_name, table_name, cursor, row, all_columns, sql,
|
1316
|
-
check_duplicate, duplicate_columns, update_on_duplicate
|
1317
|
-
)
|
1318
|
-
if result == 'inserted':
|
1319
|
-
batch_inserted += 1
|
1320
|
-
elif result == 'skipped':
|
1321
|
-
batch_skipped += 1
|
1322
|
-
else:
|
1323
|
-
batch_failed += 1
|
1324
|
-
conn.commit()
|
1325
|
-
logger.debug('批次插入成功', {
|
1326
|
-
'库': db_name,
|
1327
|
-
'表': table_name,
|
1328
|
-
'批次ID': batch_id,
|
1329
|
-
'插入': batch_inserted,
|
1330
|
-
'跳过': batch_skipped,
|
1331
|
-
'失败': batch_failed
|
1332
|
-
})
|
1333
|
-
except Exception as e:
|
1334
|
-
conn.rollback()
|
1335
|
-
batch_failed += len(batch)
|
1336
|
-
logger.error('批次插入失败', {
|
1337
|
-
'库': db_name,
|
1338
|
-
'表': table_name,
|
1339
|
-
'批次ID': batch_id,
|
1340
|
-
'错误': str(e),
|
1341
|
-
'SQL预览': sql[:200],
|
1342
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1343
|
-
})
|
1344
|
-
else: # row 或 hybrid 模式
|
1345
|
-
for row_idx, row in enumerate(batch, 1):
|
1346
|
-
try:
|
1347
|
-
result = self._process_single_row(
|
1348
|
-
db_name, table_name, cursor, row, all_columns, sql,
|
1349
|
-
check_duplicate, duplicate_columns, update_on_duplicate
|
1350
|
-
)
|
1351
|
-
if result == 'inserted':
|
1352
|
-
batch_inserted += 1
|
1353
|
-
elif result == 'skipped':
|
1354
|
-
batch_skipped += 1
|
1355
|
-
else:
|
1356
|
-
batch_failed += 1
|
1357
|
-
conn.commit()
|
1358
|
-
logger.debug('单行插入成功', {
|
1359
|
-
'库': db_name,
|
1360
|
-
'表': table_name,
|
1361
|
-
'批次ID': batch_id,
|
1362
|
-
'行号': row_idx,
|
1363
|
-
'插入状态': result
|
1364
|
-
})
|
1365
|
-
except Exception as e:
|
1366
|
-
conn.rollback()
|
1367
|
-
batch_failed += 1
|
1368
|
-
logger.error('单行插入失败', {
|
1369
|
-
'库': db_name,
|
1370
|
-
'表': table_name,
|
1371
|
-
'批次ID': batch_id,
|
1372
|
-
'行号': row_idx,
|
1373
|
-
'错误': str(e),
|
1374
|
-
'SQL预览': sql[:200],
|
1375
|
-
'数据': self._shorten_for_log(row)
|
1376
|
-
})
|
1377
|
-
logger.debug('批次插入结束', {
|
1378
|
-
'库': db_name,
|
1379
|
-
'表': table_name,
|
1380
|
-
'批次ID': batch_id,
|
1381
|
-
'插入': batch_inserted,
|
1382
|
-
'跳过': batch_skipped,
|
1383
|
-
'失败': batch_failed,
|
1384
|
-
'数据样例': self._shorten_for_log(batch, 2)
|
1385
|
-
})
|
1386
|
-
return batch_inserted, batch_skipped, batch_failed
|
1387
|
-
|
1388
|
-
def _process_single_row(
|
1389
|
-
self,
|
1390
|
-
db_name: str,
|
1391
|
-
table_name: str,
|
1392
|
-
cursor,
|
1393
|
-
row: Dict,
|
1394
|
-
all_columns: List[str],
|
1395
|
-
sql: str,
|
1396
|
-
check_duplicate: bool,
|
1397
|
-
duplicate_columns: Optional[List[str]],
|
1398
|
-
update_on_duplicate: bool = False
|
1399
|
-
) -> str:
|
1400
|
-
"""
|
1401
|
-
处理单行数据插入
|
1402
|
-
|
1403
|
-
:param db_name: 数据库名
|
1404
|
-
:param table_name: 表名
|
1405
|
-
:param cursor: 数据库游标对象
|
1406
|
-
:param row: 单行数据(字典)
|
1407
|
-
:param all_columns: 需要插入的所有列名
|
1408
|
-
:param sql: 执行的SQL语句
|
1409
|
-
:param check_duplicate: 是否检查重复
|
1410
|
-
:param duplicate_columns: 排重列
|
1411
|
-
:param update_on_duplicate: 遇到重复时是否更新
|
1412
|
-
:return: 'inserted' | 'skipped' | 'failed'
|
1413
|
-
"""
|
1414
|
-
try:
|
1415
|
-
# 构造参数
|
1416
|
-
values = [row.get(col) for col in all_columns]
|
1417
|
-
if check_duplicate:
|
1418
|
-
# 需要为 WHERE NOT EXISTS 语句补充参数
|
1419
|
-
if not update_on_duplicate:
|
1420
|
-
# duplicate_columns 为空时,默认用所有列(排除id/更新时间)
|
1421
|
-
dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
|
1422
|
-
values = values + [row.get(col) for col in dup_cols]
|
1423
|
-
cursor.execute(sql, values)
|
1424
|
-
except Exception as e:
|
1425
|
-
logger.error('单行插入失败', {
|
1426
|
-
'库': db_name,
|
1427
|
-
'表': table_name,
|
1428
|
-
'row': self._shorten_for_log(row),
|
1429
|
-
'错误': str(e)
|
1430
|
-
})
|
1431
|
-
return 'failed'
|
1432
|
-
return 'inserted'
|
1433
|
-
|
1434
1285
|
def close(self) -> None:
|
1435
1286
|
"""
|
1436
1287
|
关闭连接池并清理资源
|
@@ -1441,10 +1292,10 @@ class MySQLUploader:
|
|
1441
1292
|
try:
|
1442
1293
|
if hasattr(self, 'pool') and self.pool is not None:
|
1443
1294
|
try:
|
1444
|
-
self.pool.close()
|
1295
|
+
# self.pool.close() # PooledDB 没有 close 方法
|
1296
|
+
self.pool = None
|
1445
1297
|
except Exception as e:
|
1446
1298
|
logger.warning('关闭连接池时出错', {'error': str(e)})
|
1447
|
-
self.pool = None
|
1448
1299
|
logger.info('连接池关闭', {'uploader.py': '连接池关闭'})
|
1449
1300
|
except Exception as e:
|
1450
1301
|
logger.error('关闭连接池失败', {'error': str(e)})
|
@@ -1452,11 +1303,12 @@ class MySQLUploader:
|
|
1452
1303
|
|
1453
1304
|
def _check_pool_health(self) -> bool:
|
1454
1305
|
"""
|
1455
|
-
|
1456
|
-
:return: 连接池健康返回True,否则返回False
|
1306
|
+
检查连接池健康状态,防止连接泄露
|
1457
1307
|
"""
|
1458
1308
|
conn = None
|
1459
1309
|
try:
|
1310
|
+
if not hasattr(self, 'pool') or self.pool is None:
|
1311
|
+
return False
|
1460
1312
|
conn = self.pool.connection()
|
1461
1313
|
conn.ping(reconnect=True)
|
1462
1314
|
logger.debug('连接池健康检查通过')
|
@@ -1465,12 +1317,13 @@ class MySQLUploader:
|
|
1465
1317
|
logger.warning('连接池健康检查失败', {'error': str(e)})
|
1466
1318
|
return False
|
1467
1319
|
finally:
|
1468
|
-
if conn:
|
1320
|
+
if conn is not None:
|
1469
1321
|
try:
|
1470
1322
|
conn.close()
|
1471
1323
|
except Exception as e:
|
1472
1324
|
logger.warning('关闭连接时出错', {'error': str(e)})
|
1473
1325
|
|
1326
|
+
@staticmethod
|
1474
1327
|
def retry_on_failure(max_retries: int = 3, delay: int = 1):
|
1475
1328
|
"""
|
1476
1329
|
通用重试装饰器
|
mdbq/spider/aikucun.py
CHANGED
@@ -3,10 +3,8 @@ import datetime
|
|
3
3
|
import requests
|
4
4
|
import json
|
5
5
|
import os
|
6
|
-
import sys
|
7
6
|
import re
|
8
7
|
import time
|
9
|
-
import warnings
|
10
8
|
import platform
|
11
9
|
import getpass
|
12
10
|
from selenium import webdriver
|
@@ -15,20 +13,18 @@ from selenium.webdriver.common.by import By
|
|
15
13
|
from selenium.webdriver.support import expected_conditions as EC
|
16
14
|
from selenium.webdriver.chrome.service import Service
|
17
15
|
import pymysql
|
18
|
-
|
19
|
-
from mdbq.log import spider_logging
|
20
|
-
from mdbq.mysql import mysql
|
16
|
+
from mdbq.mysql import uploader
|
21
17
|
from mdbq.mysql import s_query
|
22
18
|
from mdbq.config import config
|
23
19
|
from mdbq.other import ua_sj
|
24
20
|
from mdbq.other import otk
|
21
|
+
from mdbq.log import mylogger
|
25
22
|
|
26
23
|
dir_path = os.path.expanduser("~")
|
27
24
|
config_file = os.path.join(dir_path, 'spd.txt')
|
28
25
|
content = config.read_config(file_path=config_file)
|
29
26
|
username, password, host, port = content['username'], content['password'], content['host'], content['port']
|
30
27
|
|
31
|
-
# m_engine = mysql.MysqlUpload(username=username, password=password, host=host, port=port, charset='utf8mb4')
|
32
28
|
uld = uploader.MySQLUploader(username=username, password=password, host=host, port=int(port), pool_size=10)
|
33
29
|
# 实例化一个数据查询类,用来获取 cookies 表数据
|
34
30
|
download = s_query.QueryDatas(username=username, password=password, host=host, port=port)
|
@@ -188,11 +184,6 @@ class AikuCun:
|
|
188
184
|
'更新时间': 'timestamp'
|
189
185
|
}
|
190
186
|
# 更新至数据库记录
|
191
|
-
# m_engine.dict_to_mysql(
|
192
|
-
# db_name=self.db_name,
|
193
|
-
# table_name=self.table_name,
|
194
|
-
# dict_data=self.token,
|
195
|
-
# )
|
196
187
|
uld.upload_data(
|
197
188
|
db_name=self.db_name,
|
198
189
|
table_name=self.table_name,
|
@@ -429,15 +420,6 @@ class AikuCun:
|
|
429
420
|
drop_dup = ['日期', '平台', '店铺名称', '商品款号', '访客量']
|
430
421
|
else:
|
431
422
|
drop_dup = ['日期', '平台', '店铺名称', '条码']
|
432
|
-
# m_engine.insert_many_dict(
|
433
|
-
# db_name=db_name,
|
434
|
-
# table_name=table_name,
|
435
|
-
# dict_data_list=_results,
|
436
|
-
# icm_update=drop_dup, # 唯一组合键
|
437
|
-
# # unique_main_key=['人群id'],
|
438
|
-
# set_typ=set_typ,
|
439
|
-
# allow_not_null=False, # 创建允许插入空值的列
|
440
|
-
# )
|
441
423
|
uld.upload_data(
|
442
424
|
db_name=db_name,
|
443
425
|
table_name=table_name,
|
@@ -1,17 +1,17 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=SerN98H6Mx8rHVh-jf2Nmc7iZHb02NHGVphB1O5jKwE,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/query_data.py,sha256=
|
4
|
+
mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
6
6
|
mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
|
7
7
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
8
|
-
mdbq/log/mylogger.py,sha256=
|
8
|
+
mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
|
9
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
|
-
mdbq/mysql/deduplicator.py,sha256=
|
12
|
-
mdbq/mysql/mysql.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=bIV010UkFfSUONY6-756x3tDVO4k6q3pqxoY3Z2xT-k,32990
|
12
|
+
mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
|
13
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
14
|
-
mdbq/mysql/uploader.py,sha256=
|
14
|
+
mdbq/mysql/uploader.py,sha256=3fXyNA0GzBNaadAh6cOgbuUEvY4IAhKn4apgbkToEno,61321
|
15
15
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
16
16
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
17
17
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -23,8 +23,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
|
|
23
23
|
mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
24
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
25
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
26
|
-
mdbq/spider/aikucun.py,sha256=
|
27
|
-
mdbq-3.11.
|
28
|
-
mdbq-3.11.
|
29
|
-
mdbq-3.11.
|
30
|
-
mdbq-3.11.
|
26
|
+
mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
|
27
|
+
mdbq-3.11.3.dist-info/METADATA,sha256=tgDHEyJKxO0ML-gUTBap1b6yP-xv5sEA_SsfVJ_31C0,364
|
28
|
+
mdbq-3.11.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.11.3.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.11.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|