mdbq 3.9.8__py3-none-any.whl → 3.9.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +5 -8
- mdbq/mysql/uploader.py +276 -116
- {mdbq-3.9.8.dist-info → mdbq-3.9.10.dist-info}/METADATA +1 -1
- {mdbq-3.9.8.dist-info → mdbq-3.9.10.dist-info}/RECORD +7 -7
- {mdbq-3.9.8.dist-info → mdbq-3.9.10.dist-info}/WHEEL +0 -0
- {mdbq-3.9.8.dist-info → mdbq-3.9.10.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.9.8'
|
1
|
+
VERSION = '3.9.10'
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -17,14 +17,14 @@ from collections import defaultdict
|
|
17
17
|
warnings.filterwarnings('ignore')
|
18
18
|
logger = mylogger.MyLogger(
|
19
19
|
name='deduplicator',
|
20
|
-
logging_mode='
|
21
|
-
log_level='
|
20
|
+
logging_mode='both',
|
21
|
+
log_level='info',
|
22
22
|
log_file='deduplicator.log',
|
23
23
|
log_format='json',
|
24
24
|
max_log_size=50,
|
25
25
|
backup_count=5,
|
26
26
|
enable_async=False, # 是否启用异步日志
|
27
|
-
sample_rate=
|
27
|
+
sample_rate=1, # 采样50%的DEBUG/INFO日志
|
28
28
|
sensitive_fields=[], # 敏感字段列表
|
29
29
|
)
|
30
30
|
|
@@ -116,10 +116,7 @@ class MySQLDeduplicator:
|
|
116
116
|
self._processing_tables = set() # 正在处理的表集合
|
117
117
|
|
118
118
|
# 系统数据库列表
|
119
|
-
self.SYSTEM_DATABASES = {
|
120
|
-
'information_schema', 'mysql',
|
121
|
-
'performance_schema', 'sys'
|
122
|
-
}
|
119
|
+
self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
|
123
120
|
|
124
121
|
def _get_connection(self):
|
125
122
|
"""从连接池获取连接"""
|
@@ -580,7 +577,7 @@ class MySQLDeduplicator:
|
|
580
577
|
def main():
|
581
578
|
deduplicator = MySQLDeduplicator(
|
582
579
|
username='root',
|
583
|
-
password='
|
580
|
+
password='188988yang188',
|
584
581
|
host='localhost',
|
585
582
|
port=3306
|
586
583
|
)
|
mdbq/mysql/uploader.py
CHANGED
@@ -17,20 +17,26 @@ from collections import OrderedDict
|
|
17
17
|
warnings.filterwarnings('ignore')
|
18
18
|
logger = mylogger.MyLogger(
|
19
19
|
name='uploader',
|
20
|
-
logging_mode='
|
21
|
-
log_level='
|
20
|
+
logging_mode='both',
|
21
|
+
log_level='info',
|
22
22
|
log_file='uploader.log',
|
23
23
|
log_format='json',
|
24
24
|
max_log_size=50,
|
25
25
|
backup_count=5,
|
26
26
|
enable_async=False, # 是否启用异步日志
|
27
|
-
sample_rate=
|
27
|
+
sample_rate=1, # 采样50%的DEBUG/INFO日志
|
28
28
|
sensitive_fields=[], # 敏感字段列表
|
29
29
|
)
|
30
30
|
|
31
31
|
|
32
32
|
def count_decimal_places(num_str):
|
33
|
-
"""
|
33
|
+
"""
|
34
|
+
计算数字字符串的小数位数,支持科学计数法
|
35
|
+
|
36
|
+
:param num_str: 数字字符串
|
37
|
+
:return: 返回元组(整数位数, 小数位数)
|
38
|
+
:raises: 无显式抛出异常,但正则匹配失败时返回(0, 0)
|
39
|
+
"""
|
34
40
|
match = re.match(r'^[-+]?\d+(\.\d+)?([eE][-+]?\d+)?$', str(num_str))
|
35
41
|
if match:
|
36
42
|
# 如果是科学计数法
|
@@ -53,8 +59,13 @@ def count_decimal_places(num_str):
|
|
53
59
|
|
54
60
|
|
55
61
|
class StatementCache(OrderedDict):
|
56
|
-
"""LRU
|
62
|
+
"""基于OrderedDict实现的LRU缓存策略,用于缓存SQL语句"""
|
57
63
|
def __init__(self, maxsize=100):
|
64
|
+
"""
|
65
|
+
初始化缓存
|
66
|
+
|
67
|
+
:param maxsize: 最大缓存大小,默认为100
|
68
|
+
"""
|
58
69
|
super().__init__()
|
59
70
|
self.maxsize = maxsize
|
60
71
|
|
@@ -88,7 +99,6 @@ class MySQLUploader:
|
|
88
99
|
:param port: 数据库端口,默认为3306
|
89
100
|
:param charset: 字符集,默认为utf8mb4
|
90
101
|
:param collation: 排序规则,默认为utf8mb4_0900_ai_ci
|
91
|
-
|
92
102
|
:param max_retries: 最大重试次数,默认为10
|
93
103
|
:param retry_interval: 重试间隔(秒),默认为10
|
94
104
|
:param pool_size: 连接池大小,默认为5
|
@@ -111,7 +121,7 @@ class MySQLUploader:
|
|
111
121
|
self.write_timeout = write_timeout
|
112
122
|
self.ssl = ssl
|
113
123
|
self._prepared_statements = StatementCache(maxsize=100)
|
114
|
-
self._max_cached_statements = 100
|
124
|
+
self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
|
115
125
|
self._table_metadata_cache = {}
|
116
126
|
self.metadata_cache_ttl = 300 # 5分钟缓存时间
|
117
127
|
|
@@ -119,7 +129,12 @@ class MySQLUploader:
|
|
119
129
|
self.pool = self._create_connection_pool()
|
120
130
|
|
121
131
|
def _create_connection_pool(self) -> PooledDB:
|
122
|
-
"""
|
132
|
+
"""
|
133
|
+
创建数据库连接池
|
134
|
+
|
135
|
+
:return: PooledDB连接池实例
|
136
|
+
:raises ConnectionError: 当连接池创建失败时抛出
|
137
|
+
"""
|
123
138
|
if hasattr(self, 'pool') and self.pool is not None and self._check_pool_health():
|
124
139
|
return self.pool
|
125
140
|
|
@@ -157,22 +172,29 @@ class MySQLUploader:
|
|
157
172
|
|
158
173
|
try:
|
159
174
|
pool = PooledDB(**pool_params)
|
160
|
-
elapsed = time.time() - start_time
|
175
|
+
elapsed = round(time.time() - start_time, 2)
|
161
176
|
logger.info("连接池创建成功", {
|
162
177
|
'pool_size': self.pool_size,
|
163
|
-
'
|
178
|
+
'耗时': elapsed
|
164
179
|
})
|
165
180
|
return pool
|
166
181
|
except Exception as e:
|
167
|
-
elapsed = time.time() - start_time
|
182
|
+
elapsed = round(time.time() - start_time, 2)
|
168
183
|
self.pool = None
|
169
184
|
logger.error("连接池创建失败", {
|
170
185
|
'error': str(e),
|
171
|
-
'
|
186
|
+
'耗时': elapsed
|
172
187
|
})
|
173
188
|
raise ConnectionError(f"连接池创建失败: {str(e)}")
|
174
189
|
|
175
190
|
def _execute_with_retry(self, func):
|
191
|
+
"""
|
192
|
+
带重试机制的装饰器,用于数据库操作
|
193
|
+
|
194
|
+
:param func: 被装饰的函数
|
195
|
+
:return: 装饰后的函数
|
196
|
+
:raises: 可能抛出原始异常或最后一次重试的异常
|
197
|
+
"""
|
176
198
|
@wraps(func)
|
177
199
|
def wrapper(*args, **kwargs):
|
178
200
|
last_exception = None
|
@@ -187,18 +209,18 @@ class MySQLUploader:
|
|
187
209
|
for attempt in range(self.max_retries):
|
188
210
|
try:
|
189
211
|
result = func(*args, **kwargs)
|
190
|
-
elapsed = time.time() - start_time
|
212
|
+
elapsed = round(time.time() - start_time, 2)
|
191
213
|
|
192
214
|
if attempt > 0:
|
193
215
|
logger.info("操作成功(重试后)", {
|
194
216
|
'operation': operation,
|
195
217
|
'attempts': attempt + 1,
|
196
|
-
'
|
218
|
+
'耗时': elapsed
|
197
219
|
})
|
198
220
|
else:
|
199
221
|
logger.debug("操作成功", {
|
200
222
|
'operation': operation,
|
201
|
-
'
|
223
|
+
'耗时': elapsed
|
202
224
|
})
|
203
225
|
|
204
226
|
return result
|
@@ -230,15 +252,15 @@ class MySQLUploader:
|
|
230
252
|
'error': str(reconnect_error)
|
231
253
|
})
|
232
254
|
else:
|
233
|
-
elapsed = time.time() - start_time
|
234
|
-
error_details['
|
255
|
+
elapsed = round(time.time() - start_time, 2)
|
256
|
+
error_details['耗时'] = elapsed
|
235
257
|
logger.error(f"操作最终失败 {error_details}")
|
236
258
|
|
237
259
|
except pymysql.IntegrityError as e:
|
238
|
-
elapsed = time.time() - start_time
|
260
|
+
elapsed = round(time.time() - start_time, 2)
|
239
261
|
logger.error("完整性约束错误", {
|
240
262
|
'operation': operation,
|
241
|
-
'
|
263
|
+
'耗时': elapsed,
|
242
264
|
'error_code': e.args[0] if e.args else None,
|
243
265
|
'error_message': e.args[1] if len(e.args) > 1 else None
|
244
266
|
})
|
@@ -246,10 +268,10 @@ class MySQLUploader:
|
|
246
268
|
|
247
269
|
except Exception as e:
|
248
270
|
last_exception = e
|
249
|
-
elapsed = time.time() - start_time
|
271
|
+
elapsed = round(time.time() - start_time, 2)
|
250
272
|
logger.error("发生意外错误", {
|
251
273
|
'operation': operation,
|
252
|
-
'
|
274
|
+
'耗时': elapsed,
|
253
275
|
'error_type': type(e).__name__,
|
254
276
|
'error_message': str(e),
|
255
277
|
'error_args': e.args if hasattr(e, 'args') else None
|
@@ -261,7 +283,12 @@ class MySQLUploader:
|
|
261
283
|
return wrapper
|
262
284
|
|
263
285
|
def _get_connection(self):
|
264
|
-
"""
|
286
|
+
"""
|
287
|
+
从连接池获取数据库连接
|
288
|
+
|
289
|
+
:return: 数据库连接对象
|
290
|
+
:raises ConnectionError: 当获取连接失败时抛出
|
291
|
+
"""
|
265
292
|
try:
|
266
293
|
conn = self.pool.connection()
|
267
294
|
logger.debug("获取数据库连接")
|
@@ -271,7 +298,13 @@ class MySQLUploader:
|
|
271
298
|
raise ConnectionError(f"连接数据库失败: {str(e)}")
|
272
299
|
|
273
300
|
def _check_database_exists(self, db_name: str) -> bool:
|
274
|
-
"""
|
301
|
+
"""
|
302
|
+
检查数据库是否存在
|
303
|
+
|
304
|
+
:param db_name: 数据库名称
|
305
|
+
:return: 存在返回True,否则返回False
|
306
|
+
:raises: 可能抛出数据库相关异常
|
307
|
+
"""
|
275
308
|
db_name = self._validate_identifier(db_name)
|
276
309
|
sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
|
277
310
|
|
@@ -287,7 +320,12 @@ class MySQLUploader:
|
|
287
320
|
raise
|
288
321
|
|
289
322
|
def _create_database(self, db_name: str):
|
290
|
-
"""
|
323
|
+
"""
|
324
|
+
创建数据库
|
325
|
+
|
326
|
+
:param db_name: 要创建的数据库名称
|
327
|
+
:raises: 可能抛出数据库相关异常
|
328
|
+
"""
|
291
329
|
db_name = self._validate_identifier(db_name)
|
292
330
|
sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"
|
293
331
|
|
@@ -329,18 +367,17 @@ class MySQLUploader:
|
|
329
367
|
elif partition_by == 'month':
|
330
368
|
return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
|
331
369
|
else:
|
332
|
-
error_msg = "
|
370
|
+
error_msg = "分表方式必须是 'year' 或 'month'"
|
333
371
|
logger.error(error_msg)
|
334
372
|
raise ValueError(error_msg)
|
335
373
|
|
336
374
|
def _validate_identifier(self, identifier: str) -> str:
|
337
375
|
"""
|
338
|
-
验证并清理数据库标识符(
|
339
|
-
防止SQL注入和非法字符
|
376
|
+
验证并清理数据库标识符(表名、列名等)
|
340
377
|
|
341
378
|
:param identifier: 要验证的标识符
|
342
379
|
:return: 清理后的安全标识符
|
343
|
-
:raises ValueError:
|
380
|
+
:raises ValueError: 当标识符无效时抛出
|
344
381
|
"""
|
345
382
|
if not identifier or not isinstance(identifier, str):
|
346
383
|
error_msg = f"无效的标识符: {identifier}"
|
@@ -348,7 +385,11 @@ class MySQLUploader:
|
|
348
385
|
raise ValueError(error_msg)
|
349
386
|
|
350
387
|
# 移除非法字符,只保留字母、数字、下划线和美元符号
|
351
|
-
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
|
388
|
+
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
|
389
|
+
|
390
|
+
# 将多个连续的下划线替换为单个下划线, 移除开头和结尾的下划线
|
391
|
+
cleaned = re.sub(r'_+', '_', cleaned).strip('_')
|
392
|
+
|
352
393
|
if not cleaned:
|
353
394
|
error_msg = f"无法清理异常标识符: {identifier}"
|
354
395
|
logger.error(error_msg)
|
@@ -366,7 +407,14 @@ class MySQLUploader:
|
|
366
407
|
return cleaned
|
367
408
|
|
368
409
|
def _check_table_exists(self, db_name: str, table_name: str) -> bool:
|
369
|
-
"""
|
410
|
+
"""
|
411
|
+
检查表是否存在
|
412
|
+
|
413
|
+
:param db_name: 数据库名
|
414
|
+
:param table_name: 表名
|
415
|
+
:return: 存在返回True,否则返回False
|
416
|
+
:raises: 可能抛出数据库相关异常
|
417
|
+
"""
|
370
418
|
cache_key = f"{db_name}.{table_name}"
|
371
419
|
if cache_key in self._table_metadata_cache:
|
372
420
|
cached_time, result = self._table_metadata_cache[cache_key]
|
@@ -410,9 +458,11 @@ class MySQLUploader:
|
|
410
458
|
:param db_name: 数据库名
|
411
459
|
:param table_name: 表名
|
412
460
|
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
413
|
-
:param primary_keys:
|
414
|
-
:param date_column:
|
415
|
-
:param indexes:
|
461
|
+
:param primary_keys: 主键列列表,可选
|
462
|
+
:param date_column: 日期列名,可选,如果存在将设置为索引
|
463
|
+
:param indexes: 需要创建索引的列列表,可选
|
464
|
+
:param allow_null: 是否允许空值,默认为False
|
465
|
+
:raises: 可能抛出数据库相关异常
|
416
466
|
"""
|
417
467
|
db_name = self._validate_identifier(db_name)
|
418
468
|
table_name = self._validate_identifier(table_name)
|
@@ -501,7 +551,14 @@ class MySQLUploader:
|
|
501
551
|
raise
|
502
552
|
|
503
553
|
def _validate_datetime(self, value, date_type=False):
|
504
|
-
"""
|
554
|
+
"""
|
555
|
+
验证并标准化日期时间格式
|
556
|
+
|
557
|
+
:param value: 日期时间值
|
558
|
+
:param date_type: 是否返回日期类型(True)或字符串(False)
|
559
|
+
:return: 标准化后的日期时间字符串或日期对象
|
560
|
+
:raises ValueError: 当日期格式无效时抛出
|
561
|
+
"""
|
505
562
|
formats = [
|
506
563
|
'%Y-%m-%d %H:%M:%S',
|
507
564
|
'%Y-%m-%d',
|
@@ -524,26 +581,49 @@ class MySQLUploader:
|
|
524
581
|
continue
|
525
582
|
raise ValueError(f"无效的日期格式2: {value}")
|
526
583
|
|
527
|
-
def _validate_value(self, value: Any, column_type: str) -> Any:
|
584
|
+
def _validate_value(self, value: Any, column_type: str, allow_null: bool) -> Any:
|
528
585
|
"""
|
529
|
-
|
586
|
+
根据列类型验证并转换数据值
|
530
587
|
|
531
588
|
:param value: 要验证的值
|
532
589
|
:param column_type: 列的数据类型
|
533
|
-
:
|
534
|
-
:
|
590
|
+
:param allow_null: 是否允许空值
|
591
|
+
:return: 转换后的值
|
592
|
+
:raises ValueError: 当值转换失败时抛出
|
535
593
|
"""
|
536
594
|
if value is None:
|
595
|
+
if not allow_null:
|
596
|
+
return 'none'
|
537
597
|
return None
|
538
598
|
|
539
599
|
try:
|
540
600
|
column_type_lower = column_type.lower()
|
541
601
|
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
602
|
+
# 处理百分比值
|
603
|
+
if isinstance(value, str):
|
604
|
+
if value.endswith('%'):
|
605
|
+
try:
|
606
|
+
# 移除百分号并转换为小数
|
607
|
+
percent_value = float(value.strip().replace('%', ''))
|
608
|
+
decimal_value = percent_value / 100
|
609
|
+
return decimal_value
|
610
|
+
except ValueError:
|
611
|
+
pass # 如果不是有效的百分比数字,继续正常处理
|
612
|
+
|
613
|
+
elif 'int' in column_type_lower:
|
614
|
+
if isinstance(value, str):
|
615
|
+
# 移除可能的逗号和空格
|
616
|
+
value = value.replace(',', '').strip()
|
617
|
+
# 尝试转换为浮点数再转整数
|
618
|
+
try:
|
619
|
+
return int(float(value))
|
620
|
+
except ValueError:
|
621
|
+
raise ValueError(f"`{value}` 无法转为整数")
|
622
|
+
return int(value) if value is not None else None
|
546
623
|
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
624
|
+
if isinstance(value, str):
|
625
|
+
# 处理可能包含逗号的数字字符串
|
626
|
+
value = value.replace(',', '')
|
547
627
|
return float(value) if value is not None else None
|
548
628
|
elif '日期' in column_type_lower or 'time' in column_type_lower:
|
549
629
|
if isinstance(value, (datetime.datetime, pd.Timestamp)):
|
@@ -560,17 +640,23 @@ class MySQLUploader:
|
|
560
640
|
return value.replace('\\', '\\\\').replace("'", "\\'")
|
561
641
|
return str(value)
|
562
642
|
elif 'json' in column_type_lower:
|
563
|
-
import json
|
564
643
|
return json.dumps(value) if value is not None else None
|
565
644
|
else:
|
566
645
|
return value
|
567
646
|
except (ValueError, TypeError) as e:
|
568
|
-
error_msg = f"
|
647
|
+
error_msg = f"转换异常 -> 无法将 `{value}` 的数据类型转为: `{column_type}` -> {str(e)}"
|
569
648
|
logger.error(error_msg)
|
570
649
|
raise ValueError(error_msg)
|
571
650
|
|
572
651
|
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
|
573
|
-
"""
|
652
|
+
"""
|
653
|
+
获取表的列名和数据类型
|
654
|
+
|
655
|
+
:param db_name: 数据库名
|
656
|
+
:param table_name: 表名
|
657
|
+
:return: 列名和数据类型字典 {列名: 数据类型}
|
658
|
+
:raises: 可能抛出数据库相关异常
|
659
|
+
"""
|
574
660
|
db_name = self._validate_identifier(db_name)
|
575
661
|
table_name = self._validate_identifier(table_name)
|
576
662
|
sql = """
|
@@ -604,7 +690,8 @@ class MySQLUploader:
|
|
604
690
|
auto_create: bool,
|
605
691
|
date_column: Optional[str],
|
606
692
|
indexes: Optional[List[str]],
|
607
|
-
batch_id: Optional[str] = None
|
693
|
+
batch_id: Optional[str] = None,
|
694
|
+
update_on_duplicate: bool = False
|
608
695
|
):
|
609
696
|
"""实际执行表上传的方法"""
|
610
697
|
# 检查表是否存在
|
@@ -634,19 +721,26 @@ class MySQLUploader:
|
|
634
721
|
# 插入数据
|
635
722
|
self._insert_data(
|
636
723
|
db_name, table_name, data, set_typ,
|
637
|
-
check_duplicate, duplicate_columns
|
724
|
+
check_duplicate, duplicate_columns,
|
725
|
+
batch_id=batch_id,
|
726
|
+
update_on_duplicate=update_on_duplicate
|
638
727
|
)
|
639
728
|
|
640
729
|
def _infer_data_type(self, value: Any) -> str:
|
641
730
|
"""
|
642
|
-
|
731
|
+
根据值推断合适的MySQL数据类型
|
643
732
|
|
644
733
|
:param value: 要推断的值
|
645
734
|
:return: MySQL数据类型字符串
|
646
735
|
"""
|
647
|
-
if value is None:
|
736
|
+
if value is None or str(value).lower() in ['', 'none', 'nan']:
|
648
737
|
return 'VARCHAR(255)' # 默认字符串类型
|
649
738
|
|
739
|
+
# 检查是否是百分比字符串
|
740
|
+
if isinstance(value, str):
|
741
|
+
if value.endswith('%'):
|
742
|
+
return 'DECIMAL(10,4)' # 百分比统一使用DECIMAL(10,4)
|
743
|
+
|
650
744
|
if isinstance(value, bool):
|
651
745
|
return 'TINYINT(1)'
|
652
746
|
elif isinstance(value, int):
|
@@ -692,6 +786,26 @@ class MySQLUploader:
|
|
692
786
|
else:
|
693
787
|
return 'VARCHAR(255)'
|
694
788
|
|
789
|
+
def normalize_column_names(self, data: Union[pd.DataFrame, List[Dict[str, Any]]]) -> Union[
|
790
|
+
pd.DataFrame, List[Dict[str, Any]]]:
|
791
|
+
"""
|
792
|
+
1. pandas:规范化列名
|
793
|
+
2. 字典列表:规范化每个字典的键
|
794
|
+
|
795
|
+
参数:
|
796
|
+
data: 输入数据,支持两种类型:
|
797
|
+
- pandas.DataFrame:将规范化其列名
|
798
|
+
- List[Dict[str, Any]]:将规范化列表中每个字典的键
|
799
|
+
"""
|
800
|
+
if isinstance(data, pd.DataFrame):
|
801
|
+
# 处理DataFrame
|
802
|
+
data.columns = [self._validate_identifier(col) for col in data.columns]
|
803
|
+
return data
|
804
|
+
elif isinstance(data, list):
|
805
|
+
# 处理字典列表
|
806
|
+
return [{self._validate_identifier(k): v for k, v in item.items()} for item in data]
|
807
|
+
return data
|
808
|
+
|
695
809
|
def _prepare_data(
|
696
810
|
self,
|
697
811
|
data: Union[Dict, List[Dict], pd.DataFrame],
|
@@ -701,11 +815,11 @@ class MySQLUploader:
|
|
701
815
|
"""
|
702
816
|
准备要上传的数据,验证并转换数据类型
|
703
817
|
|
704
|
-
:param data:
|
818
|
+
:param data: 输入数据,可以是字典、字典列表或DataFrame
|
705
819
|
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
706
820
|
:param allow_null: 是否允许空值
|
707
|
-
:return:
|
708
|
-
:raises ValueError:
|
821
|
+
:return: 元组(准备好的数据列表, 过滤后的列类型字典)
|
822
|
+
:raises ValueError: 当数据验证失败时抛出
|
709
823
|
"""
|
710
824
|
# 统一数据格式为字典列表
|
711
825
|
if isinstance(data, pd.DataFrame):
|
@@ -726,6 +840,9 @@ class MySQLUploader:
|
|
726
840
|
logger.error(error_msg)
|
727
841
|
raise ValueError(error_msg)
|
728
842
|
|
843
|
+
# 统一处理原始数据中列名的特殊字符
|
844
|
+
data = self.normalize_column_names(data)
|
845
|
+
|
729
846
|
# 将set_typ的键转为小写
|
730
847
|
set_typ = {k.lower(): v for k, v in set_typ.items()}
|
731
848
|
|
@@ -745,11 +862,11 @@ class MySQLUploader:
|
|
745
862
|
if sample_values:
|
746
863
|
inferred_type = self._infer_data_type(sample_values[0])
|
747
864
|
filtered_set_typ[col] = inferred_type
|
748
|
-
logger.debug(f"自动推断列
|
865
|
+
logger.debug(f"自动推断列 `{col}` 的数据类型为: {inferred_type}")
|
749
866
|
else:
|
750
867
|
# 没有样本值,使用默认类型
|
751
868
|
filtered_set_typ[col] = 'VARCHAR(255)'
|
752
|
-
logger.debug(f"
|
869
|
+
logger.debug(f"列 `{col}` 使用默认数据类型: VARCHAR(255)")
|
753
870
|
|
754
871
|
prepared_data = []
|
755
872
|
for row_idx, row in enumerate(data, 1):
|
@@ -761,15 +878,15 @@ class MySQLUploader:
|
|
761
878
|
|
762
879
|
if col_name not in row:
|
763
880
|
if not allow_null:
|
764
|
-
error_msg = f"
|
881
|
+
error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`"
|
765
882
|
logger.error(error_msg)
|
766
883
|
raise ValueError(error_msg)
|
767
884
|
prepared_row[col_name] = None
|
768
885
|
else:
|
769
886
|
try:
|
770
|
-
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name])
|
887
|
+
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null)
|
771
888
|
except ValueError as e:
|
772
|
-
error_msg = f"
|
889
|
+
error_msg = f"行号:{row_idx}, 列名:`{col_name}`-> 报错: {str(e)}"
|
773
890
|
logger.error(error_msg)
|
774
891
|
raise ValueError(error_msg)
|
775
892
|
prepared_data.append(prepared_row)
|
@@ -790,10 +907,26 @@ class MySQLUploader:
|
|
790
907
|
partition_by: Optional[str] = None,
|
791
908
|
partition_date_column: str = '日期',
|
792
909
|
auto_create: bool = True,
|
793
|
-
indexes: Optional[List[str]] = None
|
910
|
+
indexes: Optional[List[str]] = None,
|
911
|
+
update_on_duplicate: bool = False
|
794
912
|
):
|
795
913
|
"""
|
796
|
-
|
914
|
+
上传数据到数据库的主入口方法
|
915
|
+
|
916
|
+
:param db_name: 数据库名
|
917
|
+
:param table_name: 表名
|
918
|
+
:param data: 要上传的数据
|
919
|
+
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
920
|
+
:param primary_keys: 主键列列表,可选
|
921
|
+
:param check_duplicate: 是否检查重复数据,默认为False
|
922
|
+
:param duplicate_columns: 用于检查重复的列,可选
|
923
|
+
:param allow_null: 是否允许空值,默认为False
|
924
|
+
:param partition_by: 分表方式('year'或'month'),可选
|
925
|
+
:param partition_date_column: 用于分表的日期列名,默认为'日期'
|
926
|
+
:param auto_create: 表不存在时是否自动创建,默认为True
|
927
|
+
:param indexes: 需要创建索引的列列表,可选
|
928
|
+
:param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
|
929
|
+
:raises: 可能抛出各种验证和数据库相关异常
|
797
930
|
"""
|
798
931
|
upload_start = time.time()
|
799
932
|
initial_row_count = len(data) if hasattr(data, '__len__') else 1
|
@@ -802,26 +935,28 @@ class MySQLUploader:
|
|
802
935
|
success_flag = False
|
803
936
|
|
804
937
|
logger.info("开始上传数据", {
|
805
|
-
'
|
806
|
-
'
|
807
|
-
'
|
808
|
-
'
|
809
|
-
'
|
810
|
-
'
|
811
|
-
'
|
938
|
+
'批次号': batch_id,
|
939
|
+
'库': db_name,
|
940
|
+
'表': table_name,
|
941
|
+
'分表方式': partition_by,
|
942
|
+
'排重': check_duplicate,
|
943
|
+
'总计行数': len(data) if hasattr(data, '__len__') else 1,
|
944
|
+
'自动建表': auto_create
|
812
945
|
})
|
813
946
|
|
814
947
|
try:
|
815
|
-
# 验证参数
|
816
|
-
if not set_typ:
|
817
|
-
|
818
|
-
|
819
|
-
|
948
|
+
# # 验证参数
|
949
|
+
# if not set_typ:
|
950
|
+
# error_msg = "列的数据类型缺失"
|
951
|
+
# logger.error(error_msg)
|
952
|
+
# raise ValueError(error_msg)
|
820
953
|
|
821
|
-
if partition_by
|
822
|
-
|
823
|
-
|
824
|
-
|
954
|
+
if partition_by:
|
955
|
+
partition_by = str(partition_by).lower()
|
956
|
+
if partition_by not in ['year', 'month']:
|
957
|
+
error_msg = "分表方式必须是 'year' 或 'month'"
|
958
|
+
logger.error(error_msg)
|
959
|
+
raise ValueError(error_msg)
|
825
960
|
|
826
961
|
# 准备数据
|
827
962
|
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null)
|
@@ -867,7 +1002,7 @@ class MySQLUploader:
|
|
867
1002
|
db_name, part_table, part_data, filtered_set_typ,
|
868
1003
|
primary_keys, check_duplicate, duplicate_columns,
|
869
1004
|
allow_null, auto_create, partition_date_column,
|
870
|
-
indexes, batch_id
|
1005
|
+
indexes, batch_id, update_on_duplicate
|
871
1006
|
)
|
872
1007
|
except Exception as e:
|
873
1008
|
logger.error("分表上传失败", {
|
@@ -881,7 +1016,7 @@ class MySQLUploader:
|
|
881
1016
|
db_name, table_name, prepared_data, filtered_set_typ,
|
882
1017
|
primary_keys, check_duplicate, duplicate_columns,
|
883
1018
|
allow_null, auto_create, partition_date_column,
|
884
|
-
indexes, batch_id
|
1019
|
+
indexes, batch_id, update_on_duplicate
|
885
1020
|
)
|
886
1021
|
|
887
1022
|
success_flag = True
|
@@ -892,12 +1027,12 @@ class MySQLUploader:
|
|
892
1027
|
'error_type': type(e).__name__
|
893
1028
|
})
|
894
1029
|
finally:
|
895
|
-
elapsed = time.time() - upload_start
|
1030
|
+
elapsed = round(time.time() - upload_start, 2)
|
896
1031
|
logger.info("上传处理完成", {
|
897
|
-
'
|
1032
|
+
'批次号': batch_id,
|
898
1033
|
'success': success_flag,
|
899
|
-
'
|
900
|
-
'
|
1034
|
+
'耗时': elapsed,
|
1035
|
+
'数据行': initial_row_count
|
901
1036
|
})
|
902
1037
|
|
903
1038
|
def _insert_data(
|
@@ -909,20 +1044,21 @@ class MySQLUploader:
|
|
909
1044
|
check_duplicate: bool = False,
|
910
1045
|
duplicate_columns: Optional[List[str]] = None,
|
911
1046
|
batch_size: int = 1000,
|
912
|
-
batch_id: Optional[str] = None
|
1047
|
+
batch_id: Optional[str] = None,
|
1048
|
+
update_on_duplicate: bool = False
|
913
1049
|
):
|
914
1050
|
"""
|
915
|
-
|
916
|
-
|
917
|
-
|
918
|
-
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
batch_id: 批次ID
|
1051
|
+
实际执行数据插入的方法
|
1052
|
+
|
1053
|
+
:param db_name: 数据库名
|
1054
|
+
:param table_name: 表名
|
1055
|
+
:param data: 要插入的数据列表
|
1056
|
+
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
1057
|
+
:param check_duplicate: 是否检查重复数据,默认为False
|
1058
|
+
:param duplicate_columns: 用于检查重复的列,可选
|
1059
|
+
:param batch_size: 批量插入大小,默认为1000
|
1060
|
+
:param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
|
1061
|
+
:param batch_id: 批次ID用于日志追踪,可选
|
926
1062
|
"""
|
927
1063
|
if not data:
|
928
1064
|
return
|
@@ -954,8 +1090,21 @@ class MySQLUploader:
|
|
954
1090
|
|
955
1091
|
where_clause = " AND ".join(conditions)
|
956
1092
|
|
957
|
-
|
1093
|
+
if update_on_duplicate:
|
1094
|
+
# 更新模式 - 使用ON DUPLICATE KEY UPDATE语法
|
1095
|
+
update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in all_columns])
|
1096
|
+
sql = f"""
|
958
1097
|
INSERT INTO `{db_name}`.`{table_name}`
|
1098
|
+
(`{'`,`'.join(safe_columns)}`)
|
1099
|
+
VALUES ({placeholders})
|
1100
|
+
ON DUPLICATE KEY UPDATE {update_clause}
|
1101
|
+
"""
|
1102
|
+
|
1103
|
+
# 注意:在update_on_duplicate模式下,row_values只需要插入数据,不需要排重列值
|
1104
|
+
def prepare_values(row):
|
1105
|
+
return [row.get(col) for col in all_columns]
|
1106
|
+
else:
|
1107
|
+
sql = f"""INSERT INTO `{db_name}`.`{table_name}`
|
959
1108
|
(`{'`,`'.join(safe_columns)}`)
|
960
1109
|
SELECT {placeholders}
|
961
1110
|
FROM DUAL
|
@@ -964,6 +1113,10 @@ class MySQLUploader:
|
|
964
1113
|
WHERE {where_clause}
|
965
1114
|
)
|
966
1115
|
"""
|
1116
|
+
|
1117
|
+
# 在check_duplicate模式下,row_values需要插入数据+排重列值
|
1118
|
+
def prepare_values(row):
|
1119
|
+
return [row.get(col) for col in all_columns] + [row.get(col) for col in duplicate_columns]
|
967
1120
|
else:
|
968
1121
|
sql = f"""
|
969
1122
|
INSERT INTO `{db_name}`.`{table_name}`
|
@@ -971,6 +1124,10 @@ class MySQLUploader:
|
|
971
1124
|
VALUES ({placeholders})
|
972
1125
|
"""
|
973
1126
|
|
1127
|
+
# 普通模式下,row_values只需要插入数据
|
1128
|
+
def prepare_values(row):
|
1129
|
+
return [row.get(col) for col in all_columns]
|
1130
|
+
|
974
1131
|
total_inserted = 0
|
975
1132
|
total_skipped = 0
|
976
1133
|
total_failed = 0 # 失败计数器
|
@@ -986,11 +1143,7 @@ class MySQLUploader:
|
|
986
1143
|
for row in batch:
|
987
1144
|
try:
|
988
1145
|
# 准备参数
|
989
|
-
row_values =
|
990
|
-
# 如果是排重检查,添加排重列值
|
991
|
-
if check_duplicate:
|
992
|
-
row_values += [row.get(col) for col in duplicate_columns]
|
993
|
-
|
1146
|
+
row_values = prepare_values(row)
|
994
1147
|
cursor.execute(sql, row_values)
|
995
1148
|
successful_rows += 1
|
996
1149
|
conn.commit() # 每次成功插入后提交
|
@@ -1001,14 +1154,14 @@ class MySQLUploader:
|
|
1001
1154
|
|
1002
1155
|
# 记录失败行详细信息
|
1003
1156
|
error_details = {
|
1004
|
-
'
|
1005
|
-
'
|
1006
|
-
'
|
1157
|
+
'批次号': batch_id,
|
1158
|
+
'库': db_name,
|
1159
|
+
'表': table_name,
|
1007
1160
|
'error_type': type(e).__name__,
|
1008
1161
|
'error_message': str(e),
|
1009
|
-
'
|
1010
|
-
'
|
1011
|
-
'
|
1162
|
+
'数据类型': set_typ,
|
1163
|
+
'是否排重': check_duplicate,
|
1164
|
+
'排重列': duplicate_columns
|
1012
1165
|
}
|
1013
1166
|
logger.error(f"单行插入失败: {error_details}")
|
1014
1167
|
continue # 跳过当前行,继续处理下一行
|
@@ -1022,28 +1175,31 @@ class MySQLUploader:
|
|
1022
1175
|
else:
|
1023
1176
|
total_inserted += successful_rows
|
1024
1177
|
|
1025
|
-
batch_elapsed = time.time() - batch_start
|
1178
|
+
batch_elapsed = round(time.time() - batch_start, 2)
|
1026
1179
|
batch_info = {
|
1027
|
-
'
|
1180
|
+
'批次号': batch_id,
|
1028
1181
|
'batch_index': i // batch_size + 1,
|
1029
1182
|
'total_batches': (len(data) + batch_size - 1) // batch_size,
|
1030
1183
|
'batch_size': len(batch),
|
1031
1184
|
'successful_rows': successful_rows,
|
1032
1185
|
'failed_rows': len(batch) - successful_rows,
|
1033
|
-
'
|
1186
|
+
'耗时': batch_elapsed,
|
1034
1187
|
'rows_per_second': successful_rows / batch_elapsed if batch_elapsed > 0 else 0
|
1035
1188
|
}
|
1036
1189
|
logger.debug(f"批次处理完成 {batch_info}")
|
1037
1190
|
|
1038
1191
|
logger.info("数据插入完成", {
|
1039
|
-
'
|
1040
|
-
'
|
1041
|
-
'
|
1042
|
-
'
|
1192
|
+
'总数据行': len(data),
|
1193
|
+
'插入行数': total_inserted,
|
1194
|
+
'跳过行数': total_skipped,
|
1195
|
+
'失败行数': total_failed
|
1043
1196
|
})
|
1044
1197
|
|
1045
1198
|
def close(self):
|
1046
|
-
"""
|
1199
|
+
"""
|
1200
|
+
关闭连接池并清理资源
|
1201
|
+
:raises: 可能抛出关闭连接时的异常
|
1202
|
+
"""
|
1047
1203
|
close_start = time.time()
|
1048
1204
|
|
1049
1205
|
try:
|
@@ -1060,18 +1216,22 @@ class MySQLUploader:
|
|
1060
1216
|
|
1061
1217
|
elapsed = round(time.time() - close_start, 2)
|
1062
1218
|
logger.info("连接池已关闭", {
|
1063
|
-
'
|
1219
|
+
'耗时': elapsed
|
1064
1220
|
})
|
1065
1221
|
except Exception as e:
|
1066
1222
|
elapsed = round(time.time() - close_start, 2)
|
1067
1223
|
logger.error("关闭连接池失败", {
|
1068
1224
|
'error': str(e),
|
1069
|
-
'
|
1225
|
+
'耗时': elapsed
|
1070
1226
|
})
|
1071
1227
|
raise
|
1072
1228
|
|
1073
1229
|
def _check_pool_health(self):
|
1074
|
-
"""
|
1230
|
+
"""
|
1231
|
+
检查连接池健康状态
|
1232
|
+
|
1233
|
+
:return: 连接池健康返回True,否则返回False
|
1234
|
+
"""
|
1075
1235
|
try:
|
1076
1236
|
conn = self.pool.connection()
|
1077
1237
|
conn.ping(reconnect=True)
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=83jYP6xnYylgp029cctX2BP7k_exd-phUiwATgIjhH0,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
|
5
5
|
mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
|
@@ -9,10 +9,10 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
9
9
|
mdbq/log/mylogger.py,sha256=jHCVO7KPQrg2kcCaIrakHivZmFBJyy-24sIn2rsbK4Y,24440
|
10
10
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
11
11
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
12
|
-
mdbq/mysql/deduplicator.py,sha256=
|
12
|
+
mdbq/mysql/deduplicator.py,sha256=brhX3eyE8-kn3nAYweKfBbAkXiNcyw_pL4CTyPqmPBg,21983
|
13
13
|
mdbq/mysql/mysql.py,sha256=jTcizvUtRdwMhWK2i_LA9yDPmcifLjUzVhwTbC3wfJk,119785
|
14
14
|
mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
|
15
|
-
mdbq/mysql/uploader.py,sha256=
|
15
|
+
mdbq/mysql/uploader.py,sha256=V23PAzT59lMUqijkUiwV6a1qNwk9T76k8HKxY8fYW9w,52140
|
16
16
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
17
17
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
18
18
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=OhyEv1VyAKTOHjLDM37iNDQeRg5OnrNoKODoG2VxHes,19806
|
28
|
-
mdbq-3.9.
|
29
|
-
mdbq-3.9.8.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
-
mdbq-3.9.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
-
mdbq-3.9.8.dist-info/RECORD,,
|
28
|
+
mdbq-3.9.10.dist-info/METADATA,sha256=Ln51lgeqZn0zAjgLUKXaMNJ5ZXCkX3Eyu0iao37_IQw,364
|
29
|
+
mdbq-3.9.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-3.9.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-3.9.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|