mdbq 3.11.11__py3-none-any.whl → 3.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +127 -63
- mdbq/mysql/uploader.py +177 -83
- {mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/METADATA +1 -1
- {mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/RECORD +7 -7
- {mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/WHEEL +0 -0
- {mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '3.11.11'
+VERSION = '3.12.1'
mdbq/mysql/deduplicator.py
CHANGED
@@ -14,6 +14,7 @@ from collections import defaultdict
 import sys
 from datetime import datetime
 import uuid
+from contextlib import contextmanager
 
 
 warnings.filterwarnings('ignore')
@@ -47,8 +48,8 @@ class MySQLDeduplicator:
         batch_size: int = 1000,
         skip_system_dbs: bool = True,
         max_retries: int = 3,
-
-        pool_size: int =
+        retry_waiting_time: int = 5,
+        pool_size: int = 10,
         primary_key: str = 'id',
         date_range: Optional[List[str]] = None,
         recent_month: Optional[int] = None,
@@ -87,15 +88,30 @@ class MySQLDeduplicator:
             cursorclass=pymysql.cursors.DictCursor
         )
 
+        # 并发模式要将 pool_size 加大
+        MAX_POOL_SIZE = 200
+        MAX_WORKERS = 4
+        if max_workers > MAX_WORKERS:
+            logger.warning(f"max_workers({max_workers}) 超过最大建议值({MAX_WORKERS}),自动将 max_workers 调整为 {MAX_WORKERS}")
+            max_workers = MAX_WORKERS
+        expected_threads = max_workers * 10
+        if pool_size < expected_threads:
+            logger.warning(f"pool_size({pool_size}) < max_workers({max_workers}) * 10,自动将 pool_size 调整为 {expected_threads}")
+            pool_size = expected_threads
+        if pool_size > MAX_POOL_SIZE:
+            logger.warning(f"pool_size({pool_size}) 超过最大建议值({MAX_POOL_SIZE}),自动将 pool_size 调整为 {MAX_POOL_SIZE}")
+            pool_size = MAX_POOL_SIZE
+        self.max_workers = max_workers
+        self.pool_size = pool_size
+
         # 配置参数
-        self.max_workers = min(max(1, max_workers), pool_size)  # 限制最大线程数,不能超过连接池
         self.batch_size = batch_size
         self.skip_system_dbs = skip_system_dbs
         self.max_retries = max_retries
-        self.
+        self.retry_waiting_time = retry_waiting_time
         self.primary_key = primary_key
 
-        #
+        # 时间范围参数
         self.date_column = date_column
         self._dedup_start_date = None
         self._dedup_end_date = None
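Review note: the replaced min(max(1, max_workers), pool_size) coupling becomes explicit, deterministic sizing. A minimal standalone sketch of the same clamping rules (the helper function is illustrative, not part of mdbq):

    def clamp_pool(max_workers: int, pool_size: int,
                   workers_cap: int = 4, pool_cap: int = 200) -> tuple:
        # Cap the worker count first, then guarantee 10 pooled
        # connections per worker, bounded above by pool_cap.
        max_workers = min(max_workers, workers_cap)
        pool_size = max(pool_size, max_workers * 10)
        return max_workers, min(pool_size, pool_cap)

    assert clamp_pool(8, 10) == (4, 40)    # 8 workers capped to 4; pool raised to 4 * 10
    assert clamp_pool(4, 500) == (4, 200)  # oversized pool clamped to MAX_POOL_SIZE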
@@ -128,6 +144,9 @@ class MySQLDeduplicator:
                 year -= 1
             self._dedup_start_date = f"{year}-{month:02d}-01"
             self._dedup_end_date = today.strftime("%Y-%m-%d")
+
+        if self._dedup_start_date and self._dedup_end_date:
+            logger.info('去重日期范围', {'开始': self._dedup_start_date, '结束': self._dedup_end_date})
 
         # 排除列处理,直接合并去重
         self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))
@@ -164,6 +183,14 @@ class MySQLDeduplicator:
             logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
             raise ConnectionError(f"连接数据库失败: {str(e)}")
 
+    @contextmanager
+    def _conn_ctx(self):
+        conn = self._get_connection()
+        try:
+            yield conn
+        finally:
+            conn.close()
+
     @staticmethod
     def _retry_on_failure(func: Any) -> Any:
         """
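Review note: _conn_ctx is the standard contextlib pattern for guaranteeing that a pooled connection's close() (which, with DBUtils-style pools, returns the connection to the pool rather than closing the socket) runs even when the body raises. A self-contained sketch of the same shape, with a stub standing in for the pool:

    from contextlib import contextmanager

    class StubPool:
        # Stand-in for a DBUtils PooledDB, for illustration only.
        def connection(self):
            class Conn:
                def close(self):
                    print("connection returned to pool")
            return Conn()

    pool = StubPool()

    @contextmanager
    def conn_ctx():
        conn = pool.connection()
        try:
            yield conn
        finally:
            conn.close()  # runs on success and on exceptions alike

    with conn_ctx() as conn:
        pass  # "connection returned to pool" prints even if this block raised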
@@ -187,7 +214,7 @@ class MySQLDeduplicator:
                 except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                     last_exception = e
                     if attempt < self.max_retries:
-                        wait_time = self.
+                        wait_time = self.retry_waiting_time * (attempt + 1)
                         logger.warning(
                             f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
                             {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
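Review note: with the renamed retry_waiting_time the backoff is linear rather than exponential: the sleep before the next attempt is retry_waiting_time * (attempt + 1). For the deduplicator default of 5 seconds that means waits of 5s, 10s, 15s across three failed attempts:

    retry_waiting_time = 5  # deduplicator default in this diff
    waits = [retry_waiting_time * (attempt + 1) for attempt in range(3)]
    print(waits)  # [5, 10, 15]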
@@ -203,7 +230,6 @@ class MySQLDeduplicator:
                 raise Exception("未知错误")
         return wrapper
 
-    @_retry_on_failure
     def _get_databases(self) -> List[str]:
         """
         获取所有非系统数据库列表,排除 exclude_databases。
@@ -212,7 +238,7 @@ class MySQLDeduplicator:
             List[str]: 数据库名列表。
         """
         sql = "SHOW DATABASES"
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 all_dbs = [row['Database'] for row in cursor.fetchall()]
@@ -220,7 +246,6 @@ class MySQLDeduplicator:
         filtered = [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES and db.lower() not in self.exclude_databases] if self.skip_system_dbs else [db for db in all_dbs if db.lower() not in self.exclude_databases]
         return filtered
 
-    @_retry_on_failure
     def _get_tables(self, database: str) -> List[str]:
         """
         获取指定数据库的所有表名(排除 temp_ 前缀的临时表)。
@@ -231,15 +256,12 @@ class MySQLDeduplicator:
             List[str]: 表名列表。
         """
         sql = "SHOW TABLES"
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(f"USE `{database}`")
                 cursor.execute(sql)
-                # 严格过滤所有以'temp_'为前缀的表名(如temp_xxx、temp_xxx_dedup_...、temp_xxx_reorderid_...等)
                 return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]
 
-    @_retry_on_failure
     def _get_table_columns(self, database: str, table: str) -> List[str]:
         """
         获取指定表的所有列名(排除主键列)。
@@ -256,14 +278,12 @@ class MySQLDeduplicator:
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
             ORDER BY ORDINAL_POSITION
         """
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database, table))
                 return [row['COLUMN_NAME'] for row in cursor.fetchall()
                         if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
 
-    @_retry_on_failure
     def _ensure_index(self, database: str, table: str, date_column: str) -> None:
         """
         检查并为 date_column 自动创建索引(如果未存在)。
@@ -273,7 +293,7 @@ class MySQLDeduplicator:
             table (str): 表名。
             date_column (str): 需要检查的日期列名。
         """
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 # 检查索引是否已存在
                 cursor.execute(
@@ -295,7 +315,33 @@ class MySQLDeduplicator:
         except Exception as e:
             logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
 
-
+    def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
+        """
+        生成器:分批拉取表数据,避免一次性加载全部数据到内存。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            select_cols (str): 选择的列字符串。
+            select_where (str): where条件字符串。
+            batch_size (int): 每批拉取的行数。
+        Yields:
+            dict: 每行数据。
+        """
+        offset = 0
+        while True:
+            sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
+            with self._conn_ctx() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql)
+                    rows = cursor.fetchall()
+            if not rows:
+                break
+            for row in rows:
+                yield row
+            if len(rows) < batch_size:
+                break
+            offset += batch_size
+
     def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
         """
         获取表中所有不同的日期分区(按天)。
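Review note: the new _row_generator pages with LIMIT/OFFSET, which is simple but makes MySQL rescan an ever-growing prefix as the offset climbs, so a full-table sweep is roughly quadratic in scanned rows. A hedged alternative sketch using pymysql's server-side cursor, which streams one result set with constant client memory and no offsets (connection parameters and table name are placeholders):

    import pymysql
    import pymysql.cursors

    conn = pymysql.connect(host='localhost', user='root', password='pwd',
                           database='my_db',
                           cursorclass=pymysql.cursors.SSDictCursor)
    try:
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM `my_table`")
            while True:
                rows = cursor.fetchmany(10000)  # stream in batches
                if not rows:
                    break
                for row in rows:
                    pass  # process one dict per row
    finally:
        conn.close()

The OFFSET approach does keep one advantage here: each batch is a short query on its own pooled connection, so no connection is pinned for the whole scan.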
@@ -308,7 +354,7 @@ class MySQLDeduplicator:
             List[str]: 所有不同的日期(字符串)。
         """
         sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
@@ -367,7 +413,7 @@ class MySQLDeduplicator:
         pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
         where_sql = f"t.`{time_col}` = '{date_val}'"
         # 获取原始数据总量(只统计当天数据)
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 count_where = f"WHERE `{time_col}` = '{date_val}'"
                 count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
@@ -394,7 +440,7 @@ class MySQLDeduplicator:
                     del_ids.extend(ids[1:])
             affected_rows = 0
             if not dry_run and del_ids:
-                with self._get_connection() as conn:
+                with self._conn_ctx() as conn:
                     with conn.cursor() as cursor:
                         for i in range(0, len(del_ids), self.batch_size):
                             batch_ids = del_ids[i:i+self.batch_size]
@@ -418,7 +464,7 @@ class MySQLDeduplicator:
             GROUP BY {column_list}
             HAVING COUNT(*) > 1
         """
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 logger.debug('创建临时表SQL', {'sql': create_temp_sql})
                 cursor.execute(create_temp_sql)
@@ -484,7 +530,7 @@ class MySQLDeduplicator:
         pk = self.primary_key
         pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
         # 获取原始数据总量
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
                 logger.debug('执行SQL', {'sql': count_sql})
@@ -508,7 +554,7 @@ class MySQLDeduplicator:
                     del_ids.extend(ids[1:])
             affected_rows = 0
             if not dry_run and del_ids:
-                with self._get_connection() as conn:
+                with self._conn_ctx() as conn:
                     with conn.cursor() as cursor:
                         for i in range(0, len(del_ids), self.batch_size):
                             batch_ids = del_ids[i:i+self.batch_size]
@@ -529,7 +575,7 @@ class MySQLDeduplicator:
             GROUP BY {column_list}
             HAVING COUNT(*) > 1
         """
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 logger.debug('创建临时表SQL', {'sql': create_temp_sql})
                 cursor.execute(create_temp_sql)
@@ -584,7 +630,7 @@ class MySQLDeduplicator:
             logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
             if temp_table:
                 try:
-                    with self._get_connection() as conn:
+                    with self._conn_ctx() as conn:
                         with conn.cursor() as cursor:
                             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
                             cursor.execute(drop_temp_sql)
@@ -628,13 +674,14 @@ class MySQLDeduplicator:
             logger.info('单表开始', {
                 "库": database,
                 "表": table,
-                "参数": {
-
-
-
-
-
-
+                # "参数": {
+                #     "指定去重列": columns,
+                #     "去重方式": "Python" if use_python_dedup else "SQL",
+                #     "数据处理": self.duplicate_keep_mode,
+                #     "模拟运行": dry_run,
+                #     '排除列': self.exclude_columns,
+                # },
+            })
         all_columns = self._get_table_columns(database, table)
         all_columns_lower = [col.lower() for col in all_columns]
         time_col = self.date_column
@@ -680,7 +727,7 @@ class MySQLDeduplicator:
                     logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
                 total_dup += dup_count
                 total_del += affected_rows
-            logger.
+            logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
             # 自动重排id列(仅当有实际删除时且reorder_id为True)
             if reorder_id and total_del > 0:
                 try:
@@ -688,10 +735,12 @@ class MySQLDeduplicator:
                     logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
                 except Exception as e:
                     logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+            if affected_rows > 0:
+                logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
             return (total_dup, total_del)
         # 没有date_column,直接全表去重
         result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
-        logger.
+        logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
         dup_count, affected_rows = result
         if reorder_id and affected_rows > 0:
             try:
@@ -699,6 +748,8 @@ class MySQLDeduplicator:
                 logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
             except Exception as e:
                 logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+        if affected_rows > 0:
+            logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
         return result
     except Exception as e:
         logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
@@ -770,7 +821,11 @@ class MySQLDeduplicator:
                     results[table] = (dup_count, affected_rows)
             total_dup = sum(r[0] for r in results.values())
             total_del = sum(r[1] for r in results.values())
-            logger.
+            logger.debug('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
+            # 只显示有删除的详细结果
+            if total_del > 0:
+                filtered_results = {tbl: res for tbl, res in results.items() if res[1] > 0}
+                logger.info('库完成(仅显示有删除的结果)', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": filtered_results})
             return results
         except Exception as e:
             logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
@@ -819,7 +874,8 @@ class MySQLDeduplicator:
                 'use_python_dedup': use_python_dedup
             },
         })
-
+        # 如果parallel=True且库数量大于1,则只在外层并发,内层串行
+        if parallel and self.max_workers > 1 and len(target_dbs) > 1:
             with concurrent.futures.ThreadPoolExecutor(
                 max_workers=self.max_workers
             ) as executor:
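Review note: the new guard keeps parallelism one level deep: databases fan out across the thread pool while each submitted deduplicate_database call runs its tables serially (the hard-coded False in the submit call below, per the 内层强制串行 comment), so live workers stay bounded by max_workers instead of multiplying. A minimal sketch of the same fan-out shape:

    import concurrent.futures

    def process_db(db: str) -> str:
        # placeholder for deduplicate_database(db, ..., parallel=False)
        return f"{db}: done"

    target_dbs = ['db_a', 'db_b', 'db_c']
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(process_db, db): db for db in target_dbs}
        for fut in concurrent.futures.as_completed(futures):
            print(futures[fut], fut.result())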
@@ -827,6 +883,7 @@ class MySQLDeduplicator:
                 for db in target_dbs:
                     tables = tables_map.get(db) if tables_map else None
                     db_columns_map = columns_map.get(db) if columns_map else None
+                    # 内层强制串行
                     futures[executor.submit(
                         self.deduplicate_database,
                         db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
@@ -855,7 +912,7 @@ class MySQLDeduplicator:
                 r[1] for db in all_results.values()
                 for r in db.values()
             )
-            logger.
+            logger.debug('全局完成', {
                 "总重复组": total_dup,
                 "总删除行": total_del,
                 "参数": {
@@ -867,12 +924,30 @@ class MySQLDeduplicator:
                 },
                 "详细结果": dict(all_results)
             })
+            # 只显示有删除的详细结果
+            if total_del > 0:
+                filtered_results = {
+                    db: {tbl: res for tbl, res in tbls.items() if res[1] > 0}
+                    for db, tbls in all_results.items()
+                }
+                filtered_results = {db: tbls for db, tbls in filtered_results.items() if tbls}
+                logger.info('全局完成(仅显示有删除的结果)', {
+                    "总重复组": total_dup,
+                    "总删除行": total_del,
+                    "参数": {
+                        "模拟运行": dry_run,
+                        "并行处理": parallel,
+                        '排除列': self.exclude_columns,
+                        '重排id': reorder_id,
+                        'use_python_dedup': use_python_dedup
+                    },
+                    "详细结果": filtered_results
+                })
             return all_results
         except Exception as e:
             logger.error('异常', {"error": str(e), 'traceback': repr(e)})
             return all_results
 
-    @_retry_on_failure
     def _check_database_exists(self, database: str) -> bool:
         """
         检查数据库是否存在。
@@ -883,13 +958,11 @@ class MySQLDeduplicator:
             bool: 数据库是否存在。
         """
         sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database,))
                 return bool(cursor.fetchone())
 
-    @_retry_on_failure
     def _check_table_exists(self, database: str, table: str) -> bool:
         """
         检查表是否存在。
@@ -905,13 +978,11 @@ class MySQLDeduplicator:
             FROM INFORMATION_SCHEMA.TABLES
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
         """
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database, table))
                 return bool(cursor.fetchone())
 
-    @_retry_on_failure
     def _get_table_info(self, database: str, table: str, id_column: str = None):
         """
         获取表的所有列名、主键列名列表、指定id列是否为主键。
@@ -923,7 +994,7 @@ class MySQLDeduplicator:
             Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
         """
         id_column = id_column or self.primary_key
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute("""
                     SELECT COLUMN_NAME, COLUMN_KEY
@@ -1032,7 +1103,7 @@ class MySQLDeduplicator:
             logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
             return False
         # 检查外键约束
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
            with conn.cursor() as cursor:
                cursor.execute("""
                    SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
@@ -1042,7 +1113,7 @@ class MySQLDeduplicator:
             logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
             return False
         # 获取表结构
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
                 create_table_sql = cursor.fetchone()['Create Table']
@@ -1055,7 +1126,7 @@ class MySQLDeduplicator:
         backup_table = self._make_backup_table_name(table)
         backup_table_quoted = f"`{database}`.`{backup_table}`"
         try:
-            with self._get_connection() as conn:
+            with self._conn_ctx() as conn:
                 with conn.cursor() as cursor:
                     # 1. 创建临时表,结构同原表
                     try:
@@ -1116,7 +1187,7 @@ class MySQLDeduplicator:
                             logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
                             return False
                     logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
-                    # 5.
+                    # 5. 自动删除备份表
                     if auto_drop_backup:
                         try:
                             cursor.execute(f"DROP TABLE {backup_table_quoted}")
@@ -1127,7 +1198,7 @@ class MySQLDeduplicator:
         except Exception as e:
             logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
             # 回滚:如临时表存在则删掉,恢复原表结构
-            with self._get_connection() as conn:
+            with self._conn_ctx() as conn:
                 with conn.cursor() as cursor:
                     try:
                         cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
@@ -1135,7 +1206,7 @@ class MySQLDeduplicator:
                         logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
             # 恢复原表(如备份表存在)
             try:
-                with self._get_connection() as conn2:
+                with self._conn_ctx() as conn2:
                     with conn2.cursor() as cursor2:
                         if self._check_table_exists(database, backup_table):
                             cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
@@ -1227,23 +1298,16 @@ def main():
         batch_size=1000,
         skip_system_dbs=True,
         max_retries=3,
-
-        pool_size=
+        retry_waiting_time=5,
+        # pool_size=30,
         recent_month=1,
         # date_range=['2025-06-09', '2025-06-10'],
         date_column='日期',
-        exclude_columns=None,
         exclude_databases=['测试库4'],
         exclude_tables={
             '推广数据2': [
                 '地域报表_城市_2025_04',
-                '地域报表_城市_2025_05',
-                '地域报表_城市_2025_06',
                 # '地域报表_城市_2025_04_copy1',
-                # '地域报表_城市_2025_05_copy1',
-                # '地域报表_城市_2025_06_copy1',
-                '奥莱店_主体报表',
-                # '奥莱店_主体报表_copy1',
             ],
             "生意参谋3": [
                 "商品排行_2025",
@@ -1255,10 +1319,10 @@ def main():
     deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)
 
     # # 指定数据库去重(多线程)
-    # deduplicator.deduplicate_database('
+    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reorder_id=True)
 
     # # 指定表去重(使用特定列)
-    # deduplicator.deduplicate_table('
+    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'data'], dry_run=False, reorder_id=True)
 
     # # 重排id列
     # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
mdbq/mysql/uploader.py
CHANGED
@@ -23,8 +23,8 @@ logger = mylogger.MyLogger(
     max_log_size=50,
     backup_count=5,
     enable_async=False,  # 是否启用异步日志
-    sample_rate=1,  # 采样
-    sensitive_fields=[],  #
+    sample_rate=1,  # 采样DEBUG/INFO日志, 0.5表示50%的日志会被采样
+    sensitive_fields=[],  # 过滤敏感字段列表
 )
 
 
@@ -83,7 +83,7 @@ class MySQLUploader:
         charset: str = 'utf8mb4',
         collation: str = 'utf8mb4_0900_ai_ci',
         max_retries: int = 10,
-
+        retry_waiting_time: int = 10,
         pool_size: int = 5,
         connect_timeout: int = 10,
         read_timeout: int = 30,
@@ -100,7 +100,7 @@ class MySQLUploader:
         :param charset: 字符集,默认为utf8mb4
         :param collation: 排序规则,默认为utf8mb4_0900_ai_ci,对大小写不敏感,utf8mb4_0900_as_cs/utf8mb4_bin: 对大小写敏感
         :param max_retries: 最大重试次数,默认为10
-        :param
+        :param retry_waiting_time: 重试间隔(秒),默认为10
         :param pool_size: 连接池大小,默认为5
         :param connect_timeout: 连接超时(秒),默认为10
         :param read_timeout: 读取超时(秒),默认为30
@@ -114,7 +114,7 @@ class MySQLUploader:
         self.charset = charset
         self.collation = collation
         self.max_retries = max(max_retries, 1)
-        self.
+        self.retry_waiting_time = max(retry_waiting_time, 1)
         self.pool_size = max(pool_size, 1)
         self.connect_timeout = connect_timeout
         self.read_timeout = read_timeout
@@ -169,7 +169,7 @@ class MySQLUploader:
         }
         try:
             pool = PooledDB(**pool_params)
-            logger.
+            logger.debug('连接池创建成功', {'连接池': self.pool_size, 'host': self.host, 'port': self.port})
             return pool
         except Exception as e:
             self.pool = None
@@ -188,14 +188,11 @@ class MySQLUploader:
         def wrapper(self, *args, **kwargs):
             last_exception = None
             operation = func.__name__
-            logger.debug(f'开始执行操作: {operation}', {'max_retries': self.max_retries})
             for attempt in range(self.max_retries):
                 try:
                     result = func(self, *args, **kwargs)
                     if attempt > 0:
                         logger.info('操作成功(重试后)', {'operation': operation, 'attempts': attempt + 1})
-                    else:
-                        logger.debug('操作成功', {'operation': operation})
                     return result
                 except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
                     last_exception = e
@@ -207,7 +204,7 @@ class MySQLUploader:
                         'max_retries': self.max_retries
                     }
                     if attempt < self.max_retries - 1:
-                        wait_time = self.
+                        wait_time = self.retry_waiting_time * (attempt + 1)
                         error_details['wait_time'] = wait_time
                         logger.warning('数据库操作失败,准备重试', error_details)
                         time.sleep(wait_time)
@@ -218,13 +215,6 @@ class MySQLUploader:
                             logger.error('重连失败', {'error': str(reconnect_error)})
                     else:
                         logger.error('操作最终失败', error_details)
-                except pymysql.IntegrityError as e:
-                    logger.error('完整性约束错误', {
-                        'operation': operation,
-                        'error_code': e.args[0] if e.args else None,
-                        'error_message': e.args[1] if len(e.args) > 1 else None
-                    })
-                    raise e
                 except Exception as e:
                     last_exception = e
                     logger.error('发生意外错误', {
@@ -247,10 +237,9 @@ class MySQLUploader:
         """
         try:
             conn = self.pool.connection()
-            logger.debug('获取数据库连接', {'host': self.host, 'port': self.port})
            return conn
        except Exception as e:
-            logger.error('
+            logger.error('从连接池获取数据库连接失败', {'error': str(e)})
            raise ConnectionError(f'连接数据库失败: {str(e)}')
 
     @_execute_with_retry
@@ -392,7 +381,8 @@ class MySQLUploader:
         primary_keys: Optional[List[str]] = None,
         date_column: Optional[str] = None,
         indexes: Optional[List[str]] = None,
-        allow_null: bool = False
+        allow_null: bool = False,
+        unique_keys: Optional[List[List[str]]] = None
     ) -> None:
         """
         创建数据表,优化索引创建方式
@@ -402,39 +392,48 @@ class MySQLUploader:
         if not set_typ:
             logger.error('建表时未指定set_typ', {'库': db_name, '表': table_name})
             raise ValueError('set_typ 未指定')
+        # set_typ的键清洗
+        set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
         column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
         for col_name, col_type in set_typ.items():
-            if col_name
+            if col_name == 'id':
                 continue
-            safe_col_name = self.
+            safe_col_name = self._normalize_col(col_name)
             col_def = f"`{safe_col_name}` {col_type}"
             if not allow_null and not col_type.lower().startswith('json'):
                 col_def += " NOT NULL"
             column_defs.append(col_def)
-
-
-
+        # 主键处理逻辑调整
+        if primary_keys and len(primary_keys) > 0:
+            safe_primary_keys = [self._normalize_col(pk) for pk in primary_keys]
+            primary_key_sql = f"PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
         else:
-
-
-            primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
+            safe_primary_keys = [self._normalize_col('id')]
+            primary_key_sql = f"PRIMARY KEY (`id`)"
         # 索引统一在CREATE TABLE中定义
         index_defs = []
         if date_column and date_column in set_typ:
-            safe_date_col = self.
+            safe_date_col = self._normalize_col(date_column)
             index_defs.append(f"INDEX `idx_{safe_date_col}` (`{safe_date_col}`)")
         if indexes:
             for idx_col in indexes:
                 if idx_col in set_typ:
-                    safe_idx_col = self.
+                    safe_idx_col = self._normalize_col(idx_col)
                     index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
+        # UNIQUE KEY定义
+        unique_defs = []
+        if unique_keys:
+            for idx, unique_cols in enumerate(unique_keys):
+                if not unique_cols:
+                    continue
+                safe_unique_cols = [self._normalize_col(col) for col in unique_cols]
+                unique_name = f"uniq_{'_'.join(safe_unique_cols)}_{idx}"
+                unique_defs.append(f"UNIQUE KEY `{unique_name}` (`{'`,`'.join(safe_unique_cols)}`)")
         index_defs = list(set(index_defs))
-
+        all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
         sql = f"""
         CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
-            {','.join(
-            {primary_key_sql}
-            {index_sql}
+            {','.join(all_defs)}
         ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
         """
         conn = None
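Review note: folding unique_defs into all_defs means columns, primary key, secondary indexes and unique keys are all emitted in a single CREATE TABLE, avoiding follow-up ALTERs on first creation. For a hypothetical set_typ of {'日期': 'DATE', 'name': 'VARCHAR(255)'} with date_column='日期' and unique_keys=[['日期', 'name']], the assembled pieces would join to something like:

    CREATE TABLE IF NOT EXISTS `my_db`.`my_table` (
        `id` INT NOT NULL AUTO_INCREMENT,
        `日期` DATE NOT NULL,
        `name` VARCHAR(255) NOT NULL,
        PRIMARY KEY (`id`),
        INDEX `idx_日期` (`日期`),
        UNIQUE KEY `uniq_日期_name_0` (`日期`,`name`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci

(exact whitespace differs; ','.join(all_defs) produces a single comma-separated run).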
@@ -443,7 +442,7 @@ class MySQLUploader:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 conn.commit()
-                logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes})
+                logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
         except Exception as e:
             logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
             if conn is not None:
@@ -476,11 +475,9 @@ class MySQLUploader:
         try:
             if date_type:
                 result = pd.to_datetime(datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d'))
-                logger.debug('日期格式化成功', {'原始': value, '格式': fmt, '结果': str(result)})
                 return result
             else:
                 result = datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
-                logger.debug('日期格式化成功', {'原始': value, '格式': fmt, '结果': str(result)})
                 return result
         except ValueError:
             continue
@@ -613,7 +610,7 @@ class MySQLUploader:
                     cursor.execute(sql_check, (db_name, table_name, column))
                     exists = cursor.fetchone()
                     if exists and list(exists.values())[0] > 0:
-                        logger.debug('
+                        logger.debug('索引检查', {'库': db_name, '表': table_name, '索引列': column})
                         return
                     cursor.execute(sql_create)
                     conn.commit()
@@ -622,6 +619,49 @@ class MySQLUploader:
             logger.error('创建索引失败', {'库': db_name, '表': table_name, '列': column, '错误': str(e)})
             raise
 
+    def _get_existing_unique_keys(self, db_name: str, table_name: str) -> List[List[str]]:
+        """
+        获取表中所有UNIQUE KEY的列组合(不含主键)。
+        返回:[[col1, col2], ...]
+        """
+        db_name = self._validate_identifier(db_name)
+        table_name = self._validate_identifier(table_name)
+        sql = '''
+            SELECT INDEX_NAME, COLUMN_NAME
+            FROM INFORMATION_SCHEMA.STATISTICS
+            WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND NON_UNIQUE = 0 AND INDEX_NAME != 'PRIMARY'
+            ORDER BY INDEX_NAME, SEQ_IN_INDEX
+        '''
+        unique_map = {}
+        try:
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql, (db_name, table_name))
+                    for row in cursor.fetchall():
+                        idx = row['INDEX_NAME']
+                        col = row['COLUMN_NAME']
+                        unique_map.setdefault(idx, []).append(col)
+        except Exception as e:
+            logger.warning('获取UNIQUE KEY信息失败', {'库': db_name, '表': table_name, '错误': str(e)})
+        # 只返回列名组合,全部清洗小写
+        return [[self._normalize_col(c) for c in cols] for cols in unique_map.values() if cols]
+
+    def _add_unique_key(self, db_name: str, table_name: str, unique_cols: List[str]):
+        """
+        添加UNIQUE KEY
+        """
+        safe_cols = [self._normalize_col(col) for col in unique_cols]
+        unique_name = f"uniq_{'_'.join(safe_cols)}_{int(time.time()*1000)%100000}"
+        sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({','.join(f'`{col}`' for col in safe_cols)})'
+        try:
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql)
+                    conn.commit()
+            logger.info('添加唯一约束列成功', {'库': db_name, '表': table_name, '列': unique_cols})
+        except Exception as e:
+            logger.warning('唯一约束列添加失败', {'库': db_name, '表': table_name, '列': unique_cols, '错误': str(e)})
+
     def _upload_to_table(
         self,
         db_name: str,
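Review note: _get_existing_unique_keys reads INFORMATION_SCHEMA.STATISTICS with NON_UNIQUE = 0 and reassembles multi-column keys by grouping on INDEX_NAME in SEQ_IN_INDEX order, which is the right way to recover composite constraints. One portability caveat in _add_unique_key: the ALTER statement nests single quotes inside a single-quoted f-string ({','.join(...)}), which is a SyntaxError before PEP 701 (Python 3.12). A version-neutral equivalent builds the column list first (names below are placeholders):

    safe_cols = ['日期', 'name']
    unique_name = 'uniq_日期_name_12345'
    col_list = ','.join(f'`{col}`' for col in safe_cols)
    sql = (f'ALTER TABLE `my_db`.`my_table` '
           f'ADD UNIQUE KEY `{unique_name}` ({col_list})')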
@@ -637,14 +677,15 @@ class MySQLUploader:
         indexes: Optional[List[str]],
         batch_id: Optional[str] = None,
         update_on_duplicate: bool = False,
-        transaction_mode: str = "batch"
+        transaction_mode: str = "batch",
+        unique_keys: Optional[List[List[str]]] = None
     ):
         """实际执行表上传的方法"""
-
-        if not self._check_table_exists(db_name, table_name):
+        table_existed = self._check_table_exists(db_name, table_name)
+        if not table_existed:
             if auto_create:
                 self._create_table(db_name, table_name, set_typ, primary_keys, date_column, indexes,
-                                   allow_null=allow_null)
+                                   allow_null=allow_null, unique_keys=unique_keys)
             else:
                 logger.error('数据表不存在', {
                     '库': db_name,
@@ -652,8 +693,30 @@ class MySQLUploader:
                     'func': sys._getframe().f_code.co_name,
                 })
                 raise ValueError(f"数据表不存在: `{db_name}`.`{table_name}`")
-
-
+        if table_existed and unique_keys:
+            try:
+                exist_ukeys = self._get_existing_unique_keys(db_name, table_name)
+                exist_ukeys_norm = [sorted([c.lower() for c in uk]) for uk in exist_ukeys]
+                filtered_ukeys = [uk for uk in unique_keys if 1 <= len(uk) <= 20]
+                to_add = []
+                for uk in filtered_ukeys:
+                    norm_uk = sorted([c.lower() for c in uk])
+                    if norm_uk not in exist_ukeys_norm:
+                        to_add.append(uk)
+                max_unique_keys = 10
+                if len(exist_ukeys) + len(to_add) > max_unique_keys:
+                    logger.warning('unique_keys超限', {
+                        '库': db_name,
+                        '表': table_name,
+                        '已存在': exist_ukeys,
+                        '本次待添加': to_add,
+                        '最大数量': max_unique_keys
+                    })
+                    to_add = to_add[:max_unique_keys - len(exist_ukeys)]
+                for uk in to_add:
+                    self._add_unique_key(db_name, table_name, uk)
+            except Exception as e:
+                logger.warning('动态unique key处理异常', {'库': db_name, '表': table_name, '错误': str(e)})
         table_columns = self._get_table_columns(db_name, table_name)
         if not table_columns:
             logger.error('获取列失败', {
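Review note: requested and existing unique keys are compared as sorted, lower-cased column lists, so ['name', '日期'] and ['日期', 'name'] count as the same constraint and nothing is re-added. That is sensible dedup, but column order still matters to MySQL's leftmost-prefix index usage, so two orderings that are "equal" here are not interchangeable for query planning. The comparison in isolation:

    exist_ukeys = [['日期', 'name']]                  # already on the table
    requested = [['name', '日期'], ['日期', 'shop']]  # hypothetical upload request
    exist_norm = [sorted(c.lower() for c in uk) for uk in exist_ukeys]
    to_add = [uk for uk in requested
              if sorted(c.lower() for c in uk) not in exist_norm]
    print(to_add)  # [['日期', 'shop']]; the reordered duplicate is skipped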
@@ -663,8 +726,6 @@ class MySQLUploader:
                 'func': sys._getframe().f_code.co_name,
             })
             raise ValueError(f"获取列失败 `{db_name}`.`{table_name}`")
-
-        # 验证数据列与表列匹配
         for col in set_typ:
             if col not in table_columns:
                 logger.error('列不存在', {
@@ -674,22 +735,19 @@ class MySQLUploader:
                     'func': sys._getframe().f_code.co_name,
                 })
                 raise ValueError(f"列不存在: `{col}` -> `{db_name}`.`{table_name}`")
-
-        # 确保分表参考字段为索引
         if date_column and date_column in table_columns:
             try:
                 self._ensure_index(db_name, table_name, date_column)
             except Exception as e:
                 logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': date_column, '错误': str(e)})
-
-        # 插入数据
-        self._insert_data(
+        inserted, skipped, failed = self._insert_data(
             db_name, table_name, data, set_typ,
             check_duplicate, duplicate_columns,
             batch_id=batch_id,
             update_on_duplicate=update_on_duplicate,
             transaction_mode=transaction_mode
         )
+        return inserted, skipped, failed
 
     def _infer_data_type(self, value: Any, no_log: bool = False) -> str:
         """
@@ -817,11 +875,8 @@ class MySQLUploader:
         # 统一处理原始数据中列名的特殊字符
         data = self.normalize_column_names(data)
 
-        # set_typ
-        if self.case_sensitive:
-            set_typ = {k: v for k, v in set_typ.items()}
-        else:
-            set_typ = {k.lower(): v for k, v in set_typ.items()}
+        # set_typ的键清洗
+        set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
 
         # 获取数据中实际存在的列名
         data_columns = set()
@@ -890,7 +945,8 @@ class MySQLUploader:
         auto_create: bool = True,
         indexes: Optional[List[str]] = None,
         update_on_duplicate: bool = False,
-        transaction_mode: str = "batch"
+        transaction_mode: str = "batch",
+        unique_keys: Optional[List[List[str]]] = None
     ):
         """
         上传数据到数据库的主入口方法,分表逻辑异常处理统计丢弃数据
@@ -912,6 +968,7 @@ class MySQLUploader:
             - 'row' : 逐行提交事务(错误隔离性好)
             - 'batch' : 整批提交事务(性能最优)
             - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
+        :param unique_keys: 唯一约束列表,每个元素为列名列表,支持多列组合唯一约束
         :raises: 可能抛出各种验证和数据库相关异常
         """
         # upload_start = time.time()
@@ -936,7 +993,8 @@ class MySQLUploader:
             # '自动建表': auto_create,
             '索引': indexes,
             '更新旧数据': update_on_duplicate,
-            '事务模式': transaction_mode
+            '事务模式': transaction_mode,
+            '唯一约束': unique_keys
         },
         # '数据样例': self._shorten_for_log(data, 2)
     })
@@ -1005,15 +1063,21 @@ class MySQLUploader:
                     continue
 
             # 对每个分表执行上传
+            total_inserted = 0
+            total_skipped = dropped_rows  # 分表异常丢弃
+            total_failed = 0
             for part_table, part_data in partitioned_data.items():
                 try:
-                    self._upload_to_table(
+                    inserted, skipped, failed = self._upload_to_table(
                         db_name, part_table, part_data, filtered_set_typ,
                         primary_keys, check_duplicate, duplicate_columns,
                         allow_null, auto_create, partition_date_column,
-                        indexes, batch_id, update_on_duplicate, transaction_mode
+                        indexes, batch_id, update_on_duplicate, transaction_mode,
+                        unique_keys
                     )
-
+                    total_inserted += inserted
+                    total_skipped += skipped
+                    total_failed += failed
                     if partition_date_column in filtered_set_typ:
                         try:
                             self._ensure_index(db_name, part_table, partition_date_column)
@@ -1031,13 +1095,16 @@ class MySQLUploader:
                     continue  # 跳过当前分表,继续处理其他分表
         else:
             # 不分表,直接上传
-            self._upload_to_table(
+            inserted, skipped, failed = self._upload_to_table(
                 db_name, table_name, prepared_data, filtered_set_typ,
                 primary_keys, check_duplicate, duplicate_columns,
                 allow_null, auto_create, partition_date_column,
-                indexes, batch_id, update_on_duplicate, transaction_mode
+                indexes, batch_id, update_on_duplicate, transaction_mode,
+                unique_keys
             )
-
+            total_inserted = inserted
+            total_skipped = skipped
+            total_failed = failed
             if partition_date_column in filtered_set_typ:
                 try:
                     self._ensure_index(db_name, table_name, partition_date_column)
@@ -1062,7 +1129,9 @@ class MySQLUploader:
             '批次': batch_id,
             'finish': success_flag,
             '数据行': initial_row_count,
-            '
+            '插入': total_inserted,
+            '跳过': total_skipped,
+            '失败': total_failed
         })
 
     @_execute_with_retry
@@ -1095,26 +1164,19 @@ class MySQLUploader:
             - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
         """
         if not data:
-            return
-
-        # 验证事务模式
+            return 0, 0, 0
         transaction_mode = self._validate_transaction_mode(transaction_mode)
-
-        # 准备SQL语句
         sql = self._prepare_insert_sql(
             db_name, table_name, set_typ,
             check_duplicate, duplicate_columns,
             update_on_duplicate
         )
-
-        # 执行批量插入
         total_inserted, total_skipped, total_failed = self._execute_batch_insert(
             db_name, table_name, data, set_typ,
             sql, check_duplicate, duplicate_columns,
             batch_id, transaction_mode,
             update_on_duplicate
         )
-
         logger.info('插入完成', {
             '库': db_name,
             '表': table_name,
@@ -1124,6 +1186,7 @@ class MySQLUploader:
             '失败': total_failed,
             '事务模式': transaction_mode,
         })
+        return total_inserted, total_skipped, total_failed
 
     def _validate_transaction_mode(self, mode: str) -> str:
         """验证并标准化事务模式"""
@@ -1266,6 +1329,7 @@ class MySQLUploader:
         update_on_duplicate: bool = False
     ) -> Tuple[int, int, int]:
         """执行批量插入操作,优化batch和hybrid模式"""
+        import pymysql  # 确保异常类型可用
         def get_optimal_batch_size(total_rows: int) -> int:
             if total_rows <= 100:
                 return total_rows
@@ -1295,7 +1359,13 @@ class MySQLUploader:
                 try:
                     cursor.executemany(sql, values_list)
                     conn.commit()
-
+                    inserted = cursor.rowcount if cursor.rowcount is not None else 0
+                    total_inserted += inserted
+                    total_skipped += len(batch) - inserted
+                except pymysql.err.IntegrityError as e:
+                    conn.rollback()
+                    total_skipped += len(batch)
+                    logger.debug('批量插入唯一约束冲突,全部跳过', {'库': db_name, '表': table_name, '错误': str(e)})
                 except Exception as e:
                     conn.rollback()
                     total_failed += len(batch)
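Review note: batch mode now derives skipped as len(batch) - rowcount, which is correct for a plain INSERT (rowcount counts inserted rows). Be aware, though, that MySQL reports 2 affected rows for each row updated by INSERT ... ON DUPLICATE KEY UPDATE, so when update_on_duplicate is combined with batch mode, rowcount can exceed len(batch) and push the skipped figure negative. A guarded variant of the same arithmetic (illustrative, not in the diff):

    batch_len, rowcount = 100, 130         # e.g. 30 rows hit the UPDATE branch
    inserted = min(rowcount, batch_len)    # clamp so accounting stays sane
    skipped = max(batch_len - rowcount, 0)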
@@ -1311,7 +1381,15 @@ class MySQLUploader:
                     dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
                     values += [row.get(col) for col in dup_cols]
                 cursor.execute(sql, values)
-
+                affected = cursor.rowcount if cursor.rowcount is not None else 0
+                if affected > 0:
+                    total_inserted += 1
+                else:
+                    total_skipped += 1
+            except pymysql.err.IntegrityError as e:
+                conn.rollback()
+                total_skipped += 1
+                logger.debug('hybrid单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
             except Exception as e:
                 conn.rollback()
                 total_failed += 1
@@ -1325,8 +1403,16 @@ class MySQLUploader:
                     dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
                     values += [row.get(col) for col in dup_cols]
                 cursor.execute(sql, values)
+                affected = cursor.rowcount if cursor.rowcount is not None else 0
+                if affected > 0:
+                    total_inserted += 1
+                else:
+                    total_skipped += 1
                 conn.commit()
-
+            except pymysql.err.IntegrityError as e:
+                conn.rollback()
+                total_skipped += 1
+                logger.debug('单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
             except Exception as e:
                 conn.rollback()
                 total_failed += 1
@@ -1347,9 +1433,9 @@ class MySQLUploader:
                 self.pool = None
             except Exception as e:
                 logger.warning('关闭连接池时出错', {'error': str(e)})
-            logger.
+            logger.debug('finished', {'uploader.py': '连接池关闭'})
         except Exception as e:
-            logger.error('关闭连接池失败', {'
+            logger.error('关闭连接池失败', {'uploader.py': str(e)})
             raise
 
     def _check_pool_health(self) -> bool:
@@ -1431,6 +1517,13 @@ class MySQLUploader:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
 
+    def _normalize_col(self, col: str) -> str:
+        """
+        列名自动清洗并转小写(如case_sensitive为False),保证和表结构一致。
+        """
+        safe = self._validate_identifier(col)
+        return safe if self.case_sensitive else safe.lower()
+
 
 def main():
     """
@@ -1443,7 +1536,7 @@ def main():
     """
     uploader = MySQLUploader(
         username='root',
-        password='
+        password='pwd',
         host='localhost',
         port=3306,
     )
@@ -1462,7 +1555,7 @@ def main():
         {'日期': '2023-01-8', 'name': 'JACk', 'AGE': '24', 'salary': 555.1545},
         {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 35, 'salary': '100'},
         {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 30, 'salary': 0.0},
-        {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75}
+        {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75},
     ]
 
     # 上传数据
@@ -1474,12 +1567,13 @@ def main():
         primary_keys=[],  # 创建唯一主键
         check_duplicate=False,  # 检查重复数据
        duplicate_columns=[],  # 指定排重的组合键
+        update_on_duplicate=False,  # 更新旧数据
        allow_null=False,  # 允许插入空值
-        partition_by='year',  #
+        partition_by='year',  # 分表方式
        partition_date_column='日期',  # 用于分表的日期列名,默认为'日期'
-
-        indexes=[],  # 指定索引列
+        indexes=[],  # 普通索引列
        transaction_mode='row',  # 事务模式
+        unique_keys=[['日期', 'name', 'age']]  # 唯一约束列表
    )
 
     uploader.close()
{mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=vHfePSxiigIQg58VIYYk2QYh_4AtpXtMsfV3nHXNUhg,18
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,10 +8,10 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
 mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=
+mdbq/mysql/deduplicator.py,sha256=KMJ_YyqAniaLVRqOHLgO92PgwknIDB-EgaOY7S6iMZ4,68599
 mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
 mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
-mdbq/mysql/uploader.py,sha256=
+mdbq/mysql/uploader.py,sha256=PD8gA2PixoK2ZH4vWTmz1kbNTab8VGUJLoepD024H5Q,70265
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
-mdbq-3.11.11.dist-info/METADATA,sha256=
-mdbq-3.11.11.dist-info/WHEEL,sha256=
-mdbq-3.11.11.dist-info/top_level.txt,sha256=
-mdbq-3.11.11.dist-info/RECORD,,
+mdbq-3.12.1.dist-info/METADATA,sha256=viVkeKnHLlpvAxthu_c50VYyla5Uc2COG99IigfDPmc,364
+mdbq-3.12.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.12.1.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.12.1.dist-info/RECORD,,

{mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/WHEEL
File without changes

{mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/top_level.txt
File without changes