mdbq 3.9.8__py3-none-any.whl → 3.9.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +5 -8
- mdbq/mysql/uploader.py +181 -80
- {mdbq-3.9.8.dist-info → mdbq-3.9.9.dist-info}/METADATA +1 -1
- {mdbq-3.9.8.dist-info → mdbq-3.9.9.dist-info}/RECORD +7 -7
- {mdbq-3.9.8.dist-info → mdbq-3.9.9.dist-info}/WHEEL +0 -0
- {mdbq-3.9.8.dist-info → mdbq-3.9.9.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.9.8'
|
1
|
+
VERSION = '3.9.9'
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -17,14 +17,14 @@ from collections import defaultdict
|
|
17
17
|
warnings.filterwarnings('ignore')
|
18
18
|
logger = mylogger.MyLogger(
|
19
19
|
name='deduplicator',
|
20
|
-
logging_mode='
|
21
|
-
log_level='
|
20
|
+
logging_mode='both',
|
21
|
+
log_level='info',
|
22
22
|
log_file='deduplicator.log',
|
23
23
|
log_format='json',
|
24
24
|
max_log_size=50,
|
25
25
|
backup_count=5,
|
26
26
|
enable_async=False, # 是否启用异步日志
|
27
|
-
sample_rate=
|
27
|
+
sample_rate=1, # 采样50%的DEBUG/INFO日志
|
28
28
|
sensitive_fields=[], # 敏感字段列表
|
29
29
|
)
|
30
30
|
|
@@ -116,10 +116,7 @@ class MySQLDeduplicator:
|
|
116
116
|
self._processing_tables = set() # 正在处理的表集合
|
117
117
|
|
118
118
|
# 系统数据库列表
|
119
|
-
self.SYSTEM_DATABASES = {
|
120
|
-
'information_schema', 'mysql',
|
121
|
-
'performance_schema', 'sys'
|
122
|
-
}
|
119
|
+
self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
|
123
120
|
|
124
121
|
def _get_connection(self):
|
125
122
|
"""从连接池获取连接"""
|
@@ -580,7 +577,7 @@ class MySQLDeduplicator:
|
|
580
577
|
def main():
|
581
578
|
deduplicator = MySQLDeduplicator(
|
582
579
|
username='root',
|
583
|
-
password='
|
580
|
+
password='188988yang188',
|
584
581
|
host='localhost',
|
585
582
|
port=3306
|
586
583
|
)
|
mdbq/mysql/uploader.py
CHANGED
@@ -17,20 +17,26 @@ from collections import OrderedDict
|
|
17
17
|
warnings.filterwarnings('ignore')
|
18
18
|
logger = mylogger.MyLogger(
|
19
19
|
name='uploader',
|
20
|
-
logging_mode='
|
21
|
-
log_level='
|
20
|
+
logging_mode='both',
|
21
|
+
log_level='info',
|
22
22
|
log_file='uploader.log',
|
23
23
|
log_format='json',
|
24
24
|
max_log_size=50,
|
25
25
|
backup_count=5,
|
26
26
|
enable_async=False, # 是否启用异步日志
|
27
|
-
sample_rate=
|
27
|
+
sample_rate=1, # 采样50%的DEBUG/INFO日志
|
28
28
|
sensitive_fields=[], # 敏感字段列表
|
29
29
|
)
|
30
30
|
|
31
31
|
|
32
32
|
def count_decimal_places(num_str):
|
33
|
-
"""
|
33
|
+
"""
|
34
|
+
计算数字字符串的小数位数,支持科学计数法
|
35
|
+
|
36
|
+
:param num_str: 数字字符串
|
37
|
+
:return: 返回元组(整数位数, 小数位数)
|
38
|
+
:raises: 无显式抛出异常,但正则匹配失败时返回(0, 0)
|
39
|
+
"""
|
34
40
|
match = re.match(r'^[-+]?\d+(\.\d+)?([eE][-+]?\d+)?$', str(num_str))
|
35
41
|
if match:
|
36
42
|
# 如果是科学计数法
|
@@ -53,8 +59,13 @@ def count_decimal_places(num_str):
|
|
53
59
|
|
54
60
|
|
55
61
|
class StatementCache(OrderedDict):
|
56
|
-
"""LRU
|
62
|
+
"""基于OrderedDict实现的LRU缓存策略,用于缓存SQL语句"""
|
57
63
|
def __init__(self, maxsize=100):
|
64
|
+
"""
|
65
|
+
初始化缓存
|
66
|
+
|
67
|
+
:param maxsize: 最大缓存大小,默认为100
|
68
|
+
"""
|
58
69
|
super().__init__()
|
59
70
|
self.maxsize = maxsize
|
60
71
|
|
@@ -119,7 +130,12 @@ class MySQLUploader:
|
|
119
130
|
self.pool = self._create_connection_pool()
|
120
131
|
|
121
132
|
def _create_connection_pool(self) -> PooledDB:
|
122
|
-
"""
|
133
|
+
"""
|
134
|
+
创建数据库连接池
|
135
|
+
|
136
|
+
:return: PooledDB连接池实例
|
137
|
+
:raises ConnectionError: 当连接池创建失败时抛出
|
138
|
+
"""
|
123
139
|
if hasattr(self, 'pool') and self.pool is not None and self._check_pool_health():
|
124
140
|
return self.pool
|
125
141
|
|
@@ -157,22 +173,29 @@ class MySQLUploader:
|
|
157
173
|
|
158
174
|
try:
|
159
175
|
pool = PooledDB(**pool_params)
|
160
|
-
elapsed = time.time() - start_time
|
176
|
+
elapsed = round(time.time() - start_time, 2)
|
161
177
|
logger.info("连接池创建成功", {
|
162
178
|
'pool_size': self.pool_size,
|
163
|
-
'
|
179
|
+
'耗时': elapsed
|
164
180
|
})
|
165
181
|
return pool
|
166
182
|
except Exception as e:
|
167
|
-
elapsed = time.time() - start_time
|
183
|
+
elapsed = round(time.time() - start_time, 2)
|
168
184
|
self.pool = None
|
169
185
|
logger.error("连接池创建失败", {
|
170
186
|
'error': str(e),
|
171
|
-
'
|
187
|
+
'耗时': elapsed
|
172
188
|
})
|
173
189
|
raise ConnectionError(f"连接池创建失败: {str(e)}")
|
174
190
|
|
175
191
|
def _execute_with_retry(self, func):
|
192
|
+
"""
|
193
|
+
带重试机制的装饰器,用于数据库操作
|
194
|
+
|
195
|
+
:param func: 被装饰的函数
|
196
|
+
:return: 装饰后的函数
|
197
|
+
:raises: 可能抛出原始异常或最后一次重试的异常
|
198
|
+
"""
|
176
199
|
@wraps(func)
|
177
200
|
def wrapper(*args, **kwargs):
|
178
201
|
last_exception = None
|
@@ -187,18 +210,18 @@ class MySQLUploader:
|
|
187
210
|
for attempt in range(self.max_retries):
|
188
211
|
try:
|
189
212
|
result = func(*args, **kwargs)
|
190
|
-
elapsed = time.time() - start_time
|
213
|
+
elapsed = round(time.time() - start_time, 2)
|
191
214
|
|
192
215
|
if attempt > 0:
|
193
216
|
logger.info("操作成功(重试后)", {
|
194
217
|
'operation': operation,
|
195
218
|
'attempts': attempt + 1,
|
196
|
-
'
|
219
|
+
'耗时': elapsed
|
197
220
|
})
|
198
221
|
else:
|
199
222
|
logger.debug("操作成功", {
|
200
223
|
'operation': operation,
|
201
|
-
'
|
224
|
+
'耗时': elapsed
|
202
225
|
})
|
203
226
|
|
204
227
|
return result
|
@@ -230,15 +253,15 @@ class MySQLUploader:
|
|
230
253
|
'error': str(reconnect_error)
|
231
254
|
})
|
232
255
|
else:
|
233
|
-
elapsed = time.time() - start_time
|
234
|
-
error_details['
|
256
|
+
elapsed = round(time.time() - start_time, 2)
|
257
|
+
error_details['耗时'] = elapsed
|
235
258
|
logger.error(f"操作最终失败 {error_details}")
|
236
259
|
|
237
260
|
except pymysql.IntegrityError as e:
|
238
|
-
elapsed = time.time() - start_time
|
261
|
+
elapsed = round(time.time() - start_time, 2)
|
239
262
|
logger.error("完整性约束错误", {
|
240
263
|
'operation': operation,
|
241
|
-
'
|
264
|
+
'耗时': elapsed,
|
242
265
|
'error_code': e.args[0] if e.args else None,
|
243
266
|
'error_message': e.args[1] if len(e.args) > 1 else None
|
244
267
|
})
|
@@ -246,10 +269,10 @@ class MySQLUploader:
|
|
246
269
|
|
247
270
|
except Exception as e:
|
248
271
|
last_exception = e
|
249
|
-
elapsed = time.time() - start_time
|
272
|
+
elapsed = round(time.time() - start_time, 2)
|
250
273
|
logger.error("发生意外错误", {
|
251
274
|
'operation': operation,
|
252
|
-
'
|
275
|
+
'耗时': elapsed,
|
253
276
|
'error_type': type(e).__name__,
|
254
277
|
'error_message': str(e),
|
255
278
|
'error_args': e.args if hasattr(e, 'args') else None
|
@@ -261,7 +284,12 @@ class MySQLUploader:
|
|
261
284
|
return wrapper
|
262
285
|
|
263
286
|
def _get_connection(self):
|
264
|
-
"""
|
287
|
+
"""
|
288
|
+
从连接池获取数据库连接
|
289
|
+
|
290
|
+
:return: 数据库连接对象
|
291
|
+
:raises ConnectionError: 当获取连接失败时抛出
|
292
|
+
"""
|
265
293
|
try:
|
266
294
|
conn = self.pool.connection()
|
267
295
|
logger.debug("获取数据库连接")
|
@@ -271,7 +299,13 @@ class MySQLUploader:
|
|
271
299
|
raise ConnectionError(f"连接数据库失败: {str(e)}")
|
272
300
|
|
273
301
|
def _check_database_exists(self, db_name: str) -> bool:
|
274
|
-
"""
|
302
|
+
"""
|
303
|
+
检查数据库是否存在
|
304
|
+
|
305
|
+
:param db_name: 数据库名称
|
306
|
+
:return: 存在返回True,否则返回False
|
307
|
+
:raises: 可能抛出数据库相关异常
|
308
|
+
"""
|
275
309
|
db_name = self._validate_identifier(db_name)
|
276
310
|
sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
|
277
311
|
|
@@ -287,7 +321,12 @@ class MySQLUploader:
|
|
287
321
|
raise
|
288
322
|
|
289
323
|
def _create_database(self, db_name: str):
|
290
|
-
"""
|
324
|
+
"""
|
325
|
+
创建数据库
|
326
|
+
|
327
|
+
:param db_name: 要创建的数据库名称
|
328
|
+
:raises: 可能抛出数据库相关异常
|
329
|
+
"""
|
291
330
|
db_name = self._validate_identifier(db_name)
|
292
331
|
sql = f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET {self.charset} COLLATE {self.collation}"
|
293
332
|
|
@@ -329,18 +368,17 @@ class MySQLUploader:
|
|
329
368
|
elif partition_by == 'month':
|
330
369
|
return f"{table_name}_{date_obj.year}_{date_obj.month:02d}"
|
331
370
|
else:
|
332
|
-
error_msg = "
|
371
|
+
error_msg = "分表方式必须是 'year' 或 'month'"
|
333
372
|
logger.error(error_msg)
|
334
373
|
raise ValueError(error_msg)
|
335
374
|
|
336
375
|
def _validate_identifier(self, identifier: str) -> str:
|
337
376
|
"""
|
338
|
-
验证并清理数据库标识符(
|
339
|
-
防止SQL注入和非法字符
|
377
|
+
验证并清理数据库标识符(表名、列名等)
|
340
378
|
|
341
379
|
:param identifier: 要验证的标识符
|
342
380
|
:return: 清理后的安全标识符
|
343
|
-
:raises ValueError:
|
381
|
+
:raises ValueError: 当标识符无效时抛出
|
344
382
|
"""
|
345
383
|
if not identifier or not isinstance(identifier, str):
|
346
384
|
error_msg = f"无效的标识符: {identifier}"
|
@@ -366,7 +404,14 @@ class MySQLUploader:
|
|
366
404
|
return cleaned
|
367
405
|
|
368
406
|
def _check_table_exists(self, db_name: str, table_name: str) -> bool:
|
369
|
-
"""
|
407
|
+
"""
|
408
|
+
检查表是否存在
|
409
|
+
|
410
|
+
:param db_name: 数据库名
|
411
|
+
:param table_name: 表名
|
412
|
+
:return: 存在返回True,否则返回False
|
413
|
+
:raises: 可能抛出数据库相关异常
|
414
|
+
"""
|
370
415
|
cache_key = f"{db_name}.{table_name}"
|
371
416
|
if cache_key in self._table_metadata_cache:
|
372
417
|
cached_time, result = self._table_metadata_cache[cache_key]
|
@@ -410,9 +455,11 @@ class MySQLUploader:
|
|
410
455
|
:param db_name: 数据库名
|
411
456
|
:param table_name: 表名
|
412
457
|
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
413
|
-
:param primary_keys:
|
414
|
-
:param date_column:
|
415
|
-
:param indexes:
|
458
|
+
:param primary_keys: 主键列列表,可选
|
459
|
+
:param date_column: 日期列名,可选,如果存在将设置为索引
|
460
|
+
:param indexes: 需要创建索引的列列表,可选
|
461
|
+
:param allow_null: 是否允许空值,默认为False
|
462
|
+
:raises: 可能抛出数据库相关异常
|
416
463
|
"""
|
417
464
|
db_name = self._validate_identifier(db_name)
|
418
465
|
table_name = self._validate_identifier(table_name)
|
@@ -501,7 +548,14 @@ class MySQLUploader:
|
|
501
548
|
raise
|
502
549
|
|
503
550
|
def _validate_datetime(self, value, date_type=False):
|
504
|
-
"""
|
551
|
+
"""
|
552
|
+
验证并标准化日期时间格式
|
553
|
+
|
554
|
+
:param value: 日期时间值
|
555
|
+
:param date_type: 是否返回日期类型(True)或字符串(False)
|
556
|
+
:return: 标准化后的日期时间字符串或日期对象
|
557
|
+
:raises ValueError: 当日期格式无效时抛出
|
558
|
+
"""
|
505
559
|
formats = [
|
506
560
|
'%Y-%m-%d %H:%M:%S',
|
507
561
|
'%Y-%m-%d',
|
@@ -524,26 +578,42 @@ class MySQLUploader:
|
|
524
578
|
continue
|
525
579
|
raise ValueError(f"无效的日期格式2: {value}")
|
526
580
|
|
527
|
-
def _validate_value(self, value: Any, column_type: str) -> Any:
|
581
|
+
def _validate_value(self, value: Any, column_type: str, allow_null: bool) -> Any:
|
528
582
|
"""
|
529
|
-
|
583
|
+
根据列类型验证并转换数据值
|
530
584
|
|
531
585
|
:param value: 要验证的值
|
532
586
|
:param column_type: 列的数据类型
|
533
|
-
:
|
534
|
-
:
|
587
|
+
:param allow_null: 是否允许空值
|
588
|
+
:return: 转换后的值
|
589
|
+
:raises ValueError: 当值转换失败时抛出
|
535
590
|
"""
|
536
591
|
if value is None:
|
592
|
+
if not allow_null:
|
593
|
+
return 'none'
|
537
594
|
return None
|
538
595
|
|
539
596
|
try:
|
540
597
|
column_type_lower = column_type.lower()
|
541
598
|
|
599
|
+
# 处理百分比值
|
600
|
+
if isinstance(value, str) and '%' in value:
|
601
|
+
try:
|
602
|
+
# 移除百分号并转换为小数
|
603
|
+
percent_value = float(value.strip().replace('%', ''))
|
604
|
+
decimal_value = percent_value / 100
|
605
|
+
return decimal_value
|
606
|
+
except ValueError:
|
607
|
+
pass # 如果不是有效的百分比数字,继续正常处理
|
608
|
+
|
542
609
|
if 'int' in column_type_lower:
|
543
610
|
if isinstance(value, (str, bytes)) and not value.strip().isdigit():
|
544
611
|
raise ValueError("非数字字符串无法转换为整数")
|
545
612
|
return int(value)
|
546
613
|
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
614
|
+
if isinstance(value, str):
|
615
|
+
# 处理可能包含逗号的数字字符串
|
616
|
+
value = value.replace(',', '')
|
547
617
|
return float(value) if value is not None else None
|
548
618
|
elif '日期' in column_type_lower or 'time' in column_type_lower:
|
549
619
|
if isinstance(value, (datetime.datetime, pd.Timestamp)):
|
@@ -570,7 +640,14 @@ class MySQLUploader:
|
|
570
640
|
raise ValueError(error_msg)
|
571
641
|
|
572
642
|
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
|
573
|
-
"""
|
643
|
+
"""
|
644
|
+
获取表的列名和数据类型
|
645
|
+
|
646
|
+
:param db_name: 数据库名
|
647
|
+
:param table_name: 表名
|
648
|
+
:return: 列名和数据类型字典 {列名: 数据类型}
|
649
|
+
:raises: 可能抛出数据库相关异常
|
650
|
+
"""
|
574
651
|
db_name = self._validate_identifier(db_name)
|
575
652
|
table_name = self._validate_identifier(table_name)
|
576
653
|
sql = """
|
@@ -639,7 +716,7 @@ class MySQLUploader:
|
|
639
716
|
|
640
717
|
def _infer_data_type(self, value: Any) -> str:
|
641
718
|
"""
|
642
|
-
|
719
|
+
根据值推断合适的MySQL数据类型
|
643
720
|
|
644
721
|
:param value: 要推断的值
|
645
722
|
:return: MySQL数据类型字符串
|
@@ -647,6 +724,10 @@ class MySQLUploader:
|
|
647
724
|
if value is None:
|
648
725
|
return 'VARCHAR(255)' # 默认字符串类型
|
649
726
|
|
727
|
+
# 检查是否是百分比字符串
|
728
|
+
if isinstance(value, str) and '%' in value:
|
729
|
+
return 'DECIMAL(10,4)' # 百分比统一使用DECIMAL(10,4)
|
730
|
+
|
650
731
|
if isinstance(value, bool):
|
651
732
|
return 'TINYINT(1)'
|
652
733
|
elif isinstance(value, int):
|
@@ -701,11 +782,11 @@ class MySQLUploader:
|
|
701
782
|
"""
|
702
783
|
准备要上传的数据,验证并转换数据类型
|
703
784
|
|
704
|
-
:param data:
|
785
|
+
:param data: 输入数据,可以是字典、字典列表或DataFrame
|
705
786
|
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
706
787
|
:param allow_null: 是否允许空值
|
707
|
-
:return:
|
708
|
-
:raises ValueError:
|
788
|
+
:return: 元组(准备好的数据列表, 过滤后的列类型字典)
|
789
|
+
:raises ValueError: 当数据验证失败时抛出
|
709
790
|
"""
|
710
791
|
# 统一数据格式为字典列表
|
711
792
|
if isinstance(data, pd.DataFrame):
|
@@ -767,7 +848,7 @@ class MySQLUploader:
|
|
767
848
|
prepared_row[col_name] = None
|
768
849
|
else:
|
769
850
|
try:
|
770
|
-
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name])
|
851
|
+
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null)
|
771
852
|
except ValueError as e:
|
772
853
|
error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
|
773
854
|
logger.error(error_msg)
|
@@ -793,7 +874,21 @@ class MySQLUploader:
|
|
793
874
|
indexes: Optional[List[str]] = None
|
794
875
|
):
|
795
876
|
"""
|
796
|
-
|
877
|
+
上传数据到数据库的主入口方法
|
878
|
+
|
879
|
+
:param db_name: 数据库名
|
880
|
+
:param table_name: 表名
|
881
|
+
:param data: 要上传的数据
|
882
|
+
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
883
|
+
:param primary_keys: 主键列列表,可选
|
884
|
+
:param check_duplicate: 是否检查重复数据,默认为False
|
885
|
+
:param duplicate_columns: 用于检查重复的列,可选
|
886
|
+
:param allow_null: 是否允许空值,默认为False
|
887
|
+
:param partition_by: 分表方式('year'或'month'),可选
|
888
|
+
:param partition_date_column: 用于分表的日期列名,默认为'日期'
|
889
|
+
:param auto_create: 表不存在时是否自动创建,默认为True
|
890
|
+
:param indexes: 需要创建索引的列列表,可选
|
891
|
+
:raises: 可能抛出各种验证和数据库相关异常
|
797
892
|
"""
|
798
893
|
upload_start = time.time()
|
799
894
|
initial_row_count = len(data) if hasattr(data, '__len__') else 1
|
@@ -802,21 +897,21 @@ class MySQLUploader:
|
|
802
897
|
success_flag = False
|
803
898
|
|
804
899
|
logger.info("开始上传数据", {
|
805
|
-
'
|
900
|
+
'批次号': batch_id,
|
806
901
|
'database': db_name,
|
807
902
|
'table': table_name,
|
808
|
-
'
|
809
|
-
'
|
810
|
-
'
|
811
|
-
'
|
903
|
+
'分表方式': partition_by,
|
904
|
+
'是否排重': check_duplicate,
|
905
|
+
'总计行数': len(data) if hasattr(data, '__len__') else 1,
|
906
|
+
'自动建表': auto_create
|
812
907
|
})
|
813
908
|
|
814
909
|
try:
|
815
|
-
# 验证参数
|
816
|
-
if not set_typ:
|
817
|
-
|
818
|
-
|
819
|
-
|
910
|
+
# # 验证参数
|
911
|
+
# if not set_typ:
|
912
|
+
# error_msg = "列的数据类型缺失"
|
913
|
+
# logger.error(error_msg)
|
914
|
+
# raise ValueError(error_msg)
|
820
915
|
|
821
916
|
if partition_by and partition_by not in ['year', 'month']:
|
822
917
|
error_msg = "分表方式必须是 'year' 或 'month'"
|
@@ -892,12 +987,12 @@ class MySQLUploader:
|
|
892
987
|
'error_type': type(e).__name__
|
893
988
|
})
|
894
989
|
finally:
|
895
|
-
elapsed = time.time() - upload_start
|
990
|
+
elapsed = round(time.time() - upload_start, 2)
|
896
991
|
logger.info("上传处理完成", {
|
897
|
-
'
|
992
|
+
'批次号': batch_id,
|
898
993
|
'success': success_flag,
|
899
|
-
'
|
900
|
-
'
|
994
|
+
'耗时': elapsed,
|
995
|
+
'数据行': initial_row_count
|
901
996
|
})
|
902
997
|
|
903
998
|
def _insert_data(
|
@@ -912,17 +1007,16 @@ class MySQLUploader:
|
|
912
1007
|
batch_id: Optional[str] = None
|
913
1008
|
):
|
914
1009
|
"""
|
915
|
-
|
916
|
-
|
917
|
-
|
918
|
-
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
batch_id: 批次ID用于日志追踪
|
1010
|
+
实际执行数据插入的方法
|
1011
|
+
|
1012
|
+
:param db_name: 数据库名
|
1013
|
+
:param table_name: 表名
|
1014
|
+
:param data: 要插入的数据列表
|
1015
|
+
:param set_typ: 列名和数据类型字典 {列名: 数据类型}
|
1016
|
+
:param check_duplicate: 是否检查重复数据,默认为False
|
1017
|
+
:param duplicate_columns: 用于检查重复的列,可选
|
1018
|
+
:param batch_size: 批量插入大小,默认为1000
|
1019
|
+
:param batch_id: 批次ID用于日志追踪,可选
|
926
1020
|
"""
|
927
1021
|
if not data:
|
928
1022
|
return
|
@@ -1001,13 +1095,13 @@ class MySQLUploader:
|
|
1001
1095
|
|
1002
1096
|
# 记录失败行详细信息
|
1003
1097
|
error_details = {
|
1004
|
-
'
|
1098
|
+
'批次号': batch_id,
|
1005
1099
|
'database': db_name,
|
1006
1100
|
'table': table_name,
|
1007
1101
|
'error_type': type(e).__name__,
|
1008
1102
|
'error_message': str(e),
|
1009
1103
|
'column_types': set_typ,
|
1010
|
-
'
|
1104
|
+
'是否排重': check_duplicate,
|
1011
1105
|
'duplicate_columns': duplicate_columns
|
1012
1106
|
}
|
1013
1107
|
logger.error(f"单行插入失败: {error_details}")
|
@@ -1022,28 +1116,31 @@ class MySQLUploader:
|
|
1022
1116
|
else:
|
1023
1117
|
total_inserted += successful_rows
|
1024
1118
|
|
1025
|
-
batch_elapsed = time.time() - batch_start
|
1119
|
+
batch_elapsed = round(time.time() - batch_start, 2)
|
1026
1120
|
batch_info = {
|
1027
|
-
'
|
1121
|
+
'批次号': batch_id,
|
1028
1122
|
'batch_index': i // batch_size + 1,
|
1029
1123
|
'total_batches': (len(data) + batch_size - 1) // batch_size,
|
1030
1124
|
'batch_size': len(batch),
|
1031
1125
|
'successful_rows': successful_rows,
|
1032
1126
|
'failed_rows': len(batch) - successful_rows,
|
1033
|
-
'
|
1127
|
+
'耗时': batch_elapsed,
|
1034
1128
|
'rows_per_second': successful_rows / batch_elapsed if batch_elapsed > 0 else 0
|
1035
1129
|
}
|
1036
1130
|
logger.debug(f"批次处理完成 {batch_info}")
|
1037
1131
|
|
1038
1132
|
logger.info("数据插入完成", {
|
1039
|
-
'
|
1040
|
-
'
|
1041
|
-
'
|
1042
|
-
'
|
1133
|
+
'总数据行': len(data),
|
1134
|
+
'插入行数': total_inserted,
|
1135
|
+
'跳过行数': total_skipped,
|
1136
|
+
'失败行数': total_failed
|
1043
1137
|
})
|
1044
1138
|
|
1045
1139
|
def close(self):
|
1046
|
-
"""
|
1140
|
+
"""
|
1141
|
+
关闭连接池并清理资源
|
1142
|
+
:raises: 可能抛出关闭连接时的异常
|
1143
|
+
"""
|
1047
1144
|
close_start = time.time()
|
1048
1145
|
|
1049
1146
|
try:
|
@@ -1060,18 +1157,22 @@ class MySQLUploader:
|
|
1060
1157
|
|
1061
1158
|
elapsed = round(time.time() - close_start, 2)
|
1062
1159
|
logger.info("连接池已关闭", {
|
1063
|
-
'
|
1160
|
+
'耗时': elapsed
|
1064
1161
|
})
|
1065
1162
|
except Exception as e:
|
1066
1163
|
elapsed = round(time.time() - close_start, 2)
|
1067
1164
|
logger.error("关闭连接池失败", {
|
1068
1165
|
'error': str(e),
|
1069
|
-
'
|
1166
|
+
'耗时': elapsed
|
1070
1167
|
})
|
1071
1168
|
raise
|
1072
1169
|
|
1073
1170
|
def _check_pool_health(self):
|
1074
|
-
"""
|
1171
|
+
"""
|
1172
|
+
检查连接池健康状态
|
1173
|
+
|
1174
|
+
:return: 连接池健康返回True,否则返回False
|
1175
|
+
"""
|
1075
1176
|
try:
|
1076
1177
|
conn = self.pool.connection()
|
1077
1178
|
conn.ping(reconnect=True)
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=Z38j4uvZuqpFYiUEq0FTd82-1Y90RoVwpNEDWVHNTkk,17
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
|
5
5
|
mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
|
@@ -9,10 +9,10 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
9
9
|
mdbq/log/mylogger.py,sha256=jHCVO7KPQrg2kcCaIrakHivZmFBJyy-24sIn2rsbK4Y,24440
|
10
10
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
11
11
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
12
|
-
mdbq/mysql/deduplicator.py,sha256=
|
12
|
+
mdbq/mysql/deduplicator.py,sha256=brhX3eyE8-kn3nAYweKfBbAkXiNcyw_pL4CTyPqmPBg,21983
|
13
13
|
mdbq/mysql/mysql.py,sha256=jTcizvUtRdwMhWK2i_LA9yDPmcifLjUzVhwTbC3wfJk,119785
|
14
14
|
mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
|
15
|
-
mdbq/mysql/uploader.py,sha256=
|
15
|
+
mdbq/mysql/uploader.py,sha256=mIgUnV7MwIkrbG-dchMkMzWo_N-XrQROLWTGGGuD_ts,49171
|
16
16
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
17
17
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
18
18
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=OhyEv1VyAKTOHjLDM37iNDQeRg5OnrNoKODoG2VxHes,19806
|
28
|
-
mdbq-3.9.
|
29
|
-
mdbq-3.9.8.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
-
mdbq-3.9.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
-
mdbq-3.9.8.dist-info/RECORD,,
|
28
|
+
mdbq-3.9.9.dist-info/METADATA,sha256=F6RAyI8aGmpT-VLwVeY7jw13qemIce-PMH2Ri335GAE,363
|
29
|
+
mdbq-3.9.9.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-3.9.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-3.9.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|