mdbq 3.11.10__py3-none-any.whl → 3.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +480 -299
- {mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/METADATA +1 -1
- {mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/RECORD +6 -6
- {mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/WHEEL +0 -0
- {mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '3.11.10'
+VERSION = '3.12.0'
mdbq/mysql/deduplicator.py
CHANGED
@@ -14,6 +14,7 @@ from collections import defaultdict
 import sys
 from datetime import datetime
 import uuid
+from contextlib import contextmanager


 warnings.filterwarnings('ignore')
@@ -34,32 +35,6 @@ logger = mylogger.MyLogger(
 class MySQLDeduplicator:
     """
     MySQL数据去重
-
-    功能:
-    1. 自动检测并删除MySQL数据库中的重复数据
-    2. 支持全库扫描或指定表处理
-    3. 支持多线程/多进程安全处理
-    4. 完善的错误处理和日志记录
-
-    使用示例:
-    deduplicator = MySQLDeduplicator(
-        username='root',
-        password='password',
-        host='localhost',
-        port=3306
-    )
-
-    # 全库去重
-    deduplicator.deduplicate_all()
-
-    # 指定数据库去重(多线程)
-    deduplicator.deduplicate_database('my_db', parallel=True)
-
-    # 指定表去重(使用特定列)
-    deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
-
-    # 关闭连接
-    deduplicator.close()
     """

     def __init__(
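3.12.0 strips the long usage example from the class docstring. For readers skimming this diff, the removed example (still consistent with the `main()` demo at the bottom of the module) amounts to the sketch below; credentials and `my_db`/`my_table` are placeholders:

```python
from mdbq.mysql.deduplicator import MySQLDeduplicator

# Placeholder credentials; any reachable MySQL instance works.
deduplicator = MySQLDeduplicator(username='root', password='password',
                                 host='localhost', port=3306)

deduplicator.deduplicate_all()                             # full scan, all databases
deduplicator.deduplicate_database('my_db', parallel=True)  # one database, multithreaded
deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
deduplicator.close()                                       # release the connection pool
```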
@@ -69,12 +44,12 @@ class MySQLDeduplicator:
        host: str = 'localhost',
        port: int = 3306,
        charset: str = 'utf8mb4',
-        max_workers: int =
+        max_workers: int = 2,
        batch_size: int = 1000,
        skip_system_dbs: bool = True,
        max_retries: int = 3,
-
-        pool_size: int =
+        retry_waiting_time: int = 5,
+        pool_size: int = 10,
        primary_key: str = 'id',
        date_range: Optional[List[str]] = None,
        recent_month: Optional[int] = None,
@@ -113,35 +88,40 @@ class MySQLDeduplicator:
             cursorclass=pymysql.cursors.DictCursor
         )

+        # 并发模式要将 pool_size 加大
+        MAX_POOL_SIZE = 200
+        MAX_WORKERS = 4
+        if max_workers > MAX_WORKERS:
+            logger.warning(f"max_workers({max_workers}) 超过最大建议值({MAX_WORKERS}),自动将 max_workers 调整为 {MAX_WORKERS}")
+            max_workers = MAX_WORKERS
+        expected_threads = max_workers * 10
+        if pool_size < expected_threads:
+            logger.warning(f"pool_size({pool_size}) < max_workers({max_workers}) * 10,自动将 pool_size 调整为 {expected_threads}")
+            pool_size = expected_threads
+        if pool_size > MAX_POOL_SIZE:
+            logger.warning(f"pool_size({pool_size}) 超过最大建议值({MAX_POOL_SIZE}),自动将 pool_size 调整为 {MAX_POOL_SIZE}")
+            pool_size = MAX_POOL_SIZE
+        self.max_workers = max_workers
+        self.pool_size = pool_size
+
         # 配置参数
-        self.max_workers = min(max(1, max_workers), pool_size)  # 限制最大线程数,不能超过连接池
         self.batch_size = batch_size
         self.skip_system_dbs = skip_system_dbs
         self.max_retries = max_retries
-        self.
+        self.retry_waiting_time = retry_waiting_time
         self.primary_key = primary_key

         # 时间范围参数
-        self.date_range = date_range
-        self.recent_month = recent_month
         self.date_column = date_column
         self._dedup_start_date = None
         self._dedup_end_date = None
-
-        default_exclude = {'id'}
-        # exclude_columns 不传则排除: ['id', '更新时间']
-        if not exclude_columns:
-            self.exclude_columns = list(default_exclude | {'更新时间'})
-        else:
-            self.exclude_columns = list(set(exclude_columns) | default_exclude)
-        # 解析时间范围并智能校正date_range
-        if self.date_range and len(self.date_range) == 2:
+        if date_range and len(date_range) == 2:
             try:
-                start, end =
+                start, end = date_range
                 start_dt = datetime.strptime(start, "%Y-%m-%d")
                 end_dt = datetime.strptime(end, "%Y-%m-%d")
                 if start_dt > end_dt:
-                    logger.
+                    logger.debug(
                         "date_range顺序不正确,自动交换开始和结束日期。",
                         {"start": start, "end": end}
                     )
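The constructor now clamps its concurrency knobs instead of silently trusting the caller. Distilled into a standalone function (constants taken from the diff; the wrapper function itself is just for illustration):

```python
# Sizing rules added in 3.12.0: max_workers is capped at 4, pool_size is raised
# to max_workers * 10 when smaller, and capped at 200 overall.
def clamp_concurrency(max_workers: int, pool_size: int) -> tuple:
    MAX_POOL_SIZE, MAX_WORKERS = 200, 4
    max_workers = min(max_workers, MAX_WORKERS)
    pool_size = max(pool_size, max_workers * 10)  # ~10 pooled connections per worker
    pool_size = min(pool_size, MAX_POOL_SIZE)
    return max_workers, pool_size

assert clamp_concurrency(2, 10) == (2, 20)    # the new defaults end up as a pool of 20
assert clamp_concurrency(8, 500) == (4, 200)  # both inputs clamped to the hard limits
```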
@@ -151,30 +131,36 @@ class MySQLDeduplicator:
         except Exception as e:
             logger.error(
                 "date_range参数格式错误,应为['YYYY-MM-DD', 'YYYY-MM-DD'],已忽略时间范围。",
-                {"date_range":
+                {"date_range": date_range, "error": str(e)}
             )
             self._dedup_start_date = None
             self._dedup_end_date = None
-        elif
+        elif recent_month:
             today = datetime.today()
-            month = today.month -
+            month = today.month - recent_month
             year = today.year
             while month <= 0:
                 month += 12
                 year -= 1
             self._dedup_start_date = f"{year}-{month:02d}-01"
             self._dedup_end_date = today.strftime("%Y-%m-%d")
+
+        if self._dedup_start_date and self._dedup_end_date:
+            logger.info('去重日期范围', {'开始': self._dedup_start_date, '结束': self._dedup_end_date})
+
+        # 排除列处理,直接合并去重
+        self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))

         # 线程安全控制
         self._lock = threading.Lock()
         self._processing_tables = set()  # 正在处理的表集合

         # 系统数据库列表
-        self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
+        self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys', 'sakila'}

         # 排除数据库和表的逻辑
-        self.exclude_databases = set(
-        self.exclude_tables = {k.lower(): set(
+        self.exclude_databases = set(db.lower() for db in (exclude_databases or []))
+        self.exclude_tables = {k.lower(): set(t.lower() for t in v) for k, v in (exclude_tables or {}).items()}

         self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
|
|
197
183
|
logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
|
198
184
|
raise ConnectionError(f"连接数据库失败: {str(e)}")
|
199
185
|
|
186
|
+
@contextmanager
|
187
|
+
def _conn_ctx(self):
|
188
|
+
conn = self._get_connection()
|
189
|
+
try:
|
190
|
+
yield conn
|
191
|
+
finally:
|
192
|
+
conn.close()
|
193
|
+
|
200
194
|
@staticmethod
|
201
195
|
def _retry_on_failure(func: Any) -> Any:
|
202
196
|
"""
|
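The new `_conn_ctx` helper is why so many `with self._get_connection()` call sites change below: it guarantees `conn.close()` runs even when a query raises, which for a pooled connection returns it to the pool rather than leaking it. A standalone model of the pattern (assuming a DBUtils-style pool object; `pool.connection()` is that library's API, not mdbq's):

```python
from contextlib import contextmanager

@contextmanager
def conn_ctx(pool):
    conn = pool.connection()  # borrow a connection from the pool
    try:
        yield conn
    finally:
        conn.close()          # always give it back, even on error
```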
@@ -220,7 +214,7 @@ class MySQLDeduplicator:
             except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                 last_exception = e
                 if attempt < self.max_retries:
-                    wait_time = self.
+                    wait_time = self.retry_waiting_time * (attempt + 1)
                     logger.warning(
                         f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
                         {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
@@ -236,16 +230,15 @@ class MySQLDeduplicator:
             raise Exception("未知错误")
         return wrapper

-    @_retry_on_failure
     def _get_databases(self) -> List[str]:
         """
-        获取所有非系统数据库列表,排除exclude_databases。
+        获取所有非系统数据库列表,排除 exclude_databases。

         Returns:
             List[str]: 数据库名列表。
         """
         sql = "SHOW DATABASES"
-        with self.
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 all_dbs = [row['Database'] for row in cursor.fetchall()]
@@ -253,10 +246,9 @@ class MySQLDeduplicator:
         filtered = [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES and db.lower() not in self.exclude_databases] if self.skip_system_dbs else [db for db in all_dbs if db.lower() not in self.exclude_databases]
         return filtered

-    @_retry_on_failure
     def _get_tables(self, database: str) -> List[str]:
         """
-
+        获取指定数据库的所有表名(排除 temp_ 前缀的临时表)。

         Args:
             database (str): 数据库名。
@@ -264,15 +256,12 @@ class MySQLDeduplicator:
             List[str]: 表名列表。
         """
         sql = "SHOW TABLES"
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(f"USE `{database}`")
                 cursor.execute(sql)
-                # 严格过滤所有以'temp_'为前缀的表名(如temp_xxx、temp_xxx_dedup_...、temp_xxx_reorderid_...等)
                 return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]

-    @_retry_on_failure
     def _get_table_columns(self, database: str, table: str) -> List[str]:
         """
         获取指定表的所有列名(排除主键列)。
@@ -289,56 +278,22 @@ class MySQLDeduplicator:
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
             ORDER BY ORDINAL_POSITION
         """
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database, table))
                 return [row['COLUMN_NAME'] for row in cursor.fetchall()
                         if row['COLUMN_NAME'].lower() != self.primary_key.lower()]

-    def _acquire_table_lock(self, database: str, table: str) -> bool:
-        """
-        获取表处理锁,防止并发处理同一张表。
-
-        Args:
-            database (str): 数据库名。
-            table (str): 表名。
-        Returns:
-            bool: 是否成功获取锁。
-        """
-        key = f"{database}.{table}"
-
-        with self._lock:
-            if key in self._processing_tables:
-                logger.debug(f"表 {key} 正在被其他线程处理,跳过")
-                return False
-            self._processing_tables.add(key)
-            return True
-
-    def _release_table_lock(self, database: str, table: str) -> None:
-        """
-        释放表处理锁。
-
-        Args:
-            database (str): 数据库名。
-            table (str): 表名。
-        """
-        key = f"{database}.{table}"
-
-        with self._lock:
-            if key in self._processing_tables:
-                self._processing_tables.remove(key)
-
-    @_retry_on_failure
     def _ensure_index(self, database: str, table: str, date_column: str) -> None:
         """
-        检查并为date_column自动创建索引(如果未存在)。
+        检查并为 date_column 自动创建索引(如果未存在)。
+
         Args:
             database (str): 数据库名。
             table (str): 表名。
             date_column (str): 需要检查的日期列名。
         """
-        with self.
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 # 检查索引是否已存在
                 cursor.execute(
@@ -356,11 +311,9 @@ class MySQLDeduplicator:
                 try:
                     cursor.execute(f"CREATE INDEX `{safe_index_name}` ON `{database}`.`{table}` (`{date_column}`)")
                     conn.commit()
-                    logger.
+                    logger.debug('已自动为date_column创建索引', {"库": database, "表": table, "date_column": date_column, "索引名": safe_index_name})
                 except Exception as e:
                     logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
-            else:
-                logger.debug('date_column已存在索引', {"库": database, "表": table, "date_column": date_column})

     def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
         """
@@ -377,7 +330,7 @@ class MySQLDeduplicator:
         offset = 0
         while True:
             sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
-            with self.
+            with self._conn_ctx() as conn:
                 with conn.cursor() as cursor:
                     cursor.execute(sql)
                     rows = cursor.fetchall()
@@ -388,85 +341,184 @@ class MySQLDeduplicator:
             if len(rows) < batch_size:
                 break
             offset += batch_size
-
-    def _get_all_dates(self, database: str, table: str, date_column: str) ->
+
+    def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
         """
         获取表中所有不同的日期分区(按天)。
+
         Args:
             database (str): 数据库名。
             table (str): 表名。
             date_column (str): 日期列名。
         Returns:
-            List: 所有不同的日期(字符串)。
+            List[str]: 所有不同的日期(字符串)。
         """
         sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
-        with self.
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]

     def _deduplicate_table(
-
-
-
-
-
-
-
-
-        lock_table: bool = True
+        self,
+        database: str,
+        table: str,
+        columns: Optional[List[str]] = None,
+        dry_run: bool = False,
+        use_python_dedup: bool = False,
+        date_val: Optional[str] = None,
+        lock_table: bool = True
     ) -> Tuple[int, int]:
         """
-
-
-
-
+        执行单表单天去重。只处理 date_val 这一天的数据(如果有 date_column),否则全表。
+
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            columns (Optional[List[str]]): 指定去重列。
+            dry_run (bool): 是否为模拟运行。
+            use_python_dedup (bool): 是否用 Python 方式去重。
+            date_val (Optional[str]): 指定处理的日期(如有 date_column)。
+            lock_table (bool): 是否加表级锁。
+        Returns:
+            Tuple[int, int]: (重复组数, 实际删除行数)
         """
         if lock_table and not self._acquire_table_lock(database, table):
             return (0, 0)
         temp_table = None
         try:
-            # 获取实际列名
             all_columns = self._get_table_columns(database, table)
             all_columns_lower = [col.lower() for col in all_columns]
             exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
             time_col = self.date_column
             time_col_lower = time_col.lower() if time_col else None
-            # 1. 跳过date_column在exclude_columns的情况
             if time_col_lower and time_col_lower in exclude_columns_lower:
                 logger.warning('date_column在exclude_columns中,跳过该表', {"库": database, "表": table, "date_column": time_col, "exclude_columns": self.exclude_columns})
                 return (0, 0)
-            # 2. 判断表是否包含date_column
             has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
-
-
+
+            # 只要有date_column,始终分天处理(本函数只处理一天)
+            if has_time_col and date_val is not None:
                 self._ensure_index(database, table, time_col)
-                #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                if
-
-
-
-
-
+                # 获取去重列
+                use_columns = columns or all_columns
+                use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
+                invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
+                if invalid_columns:
+                    logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
+                if not use_columns:
+                    logger.error('没有有效的去重列', {"库": database, "表": table, "func": sys._getframe().f_code.co_name})
+                    return (0, 0)
+                pk = self.primary_key
+                pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
+                where_sql = f"t.`{time_col}` = '{date_val}'"
+                # 获取原始数据总量(只统计当天数据)
+                with self._conn_ctx() as conn:
+                    with conn.cursor() as cursor:
+                        count_where = f"WHERE `{time_col}` = '{date_val}'"
+                        count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+                        logger.debug('执行SQL', {'sql': count_sql})
+                        cursor.execute(count_sql)
+                        total_count_row = cursor.fetchone()
+                        total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
+                logger.debug('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name, "数据日期": date_val})
+                column_list = ', '.join([f'`{col}`' for col in use_columns])
+
+                # 用Python查找重复
+                if use_python_dedup:
+                    select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
+                    select_where = f"WHERE `{time_col}` = '{date_val}'"
+                    grouped = defaultdict(list)
+                    for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
+                        key = tuple(row[col] for col in use_columns)
+                        grouped[key].append(row[pk_real])
+                    dup_count = 0
+                    del_ids = []
+                    for ids in grouped.values():
+                        if len(ids) > 1:
+                            dup_count += 1
+                            del_ids.extend(ids[1:])
+                    affected_rows = 0
+                    if not dry_run and del_ids:
+                        with self._conn_ctx() as conn:
+                            with conn.cursor() as cursor:
+                                for i in range(0, len(del_ids), self.batch_size):
+                                    batch_ids = del_ids[i:i+self.batch_size]
+                                    del_ids_str = ','.join([str(i) for i in batch_ids])
+                                    delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                                    cursor.execute(delete_sql)
+                                    batch_deleted = cursor.rowcount
+                                    affected_rows += batch_deleted
+                                conn.commit()
+                    logger.debug('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode, "数据日期": date_val})
+                    return (dup_count, affected_rows)
+                # SQL方式查找重复
+                temp_table = self._make_temp_table_name(table)
+                drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+                create_temp_where = f"WHERE `{time_col}` = '{date_val}'"
+                create_temp_sql = f"""
+                    CREATE TABLE `{database}`.`{temp_table}` AS
+                    SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
+                    FROM `{database}`.`{table}`
+                    {create_temp_where}
+                    GROUP BY {column_list}
+                    HAVING COUNT(*) > 1
+                """
+                with self._conn_ctx() as conn:
+                    with conn.cursor() as cursor:
+                        logger.debug('创建临时表SQL', {'sql': create_temp_sql})
+                        cursor.execute(create_temp_sql)
+                        cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
+                        dup_count_row = cursor.fetchone()
+                        dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
+                        if dup_count == 0:
+                            logger.debug('没有重复数据', {"库": database, "表": table, "数据量": total_count, "数据日期": date_val})
+                            cursor.execute(drop_temp_sql)
+                            conn.commit()
+                            return (0, 0)
+                        affected_rows = 0
+                        if not dry_run:
+                            while True:
+                                where_clauses = []
+                                if self.duplicate_keep_mode == 'keep_one':
+                                    where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
+                                if where_sql.strip():
+                                    where_clauses.append(where_sql.strip())
+                                where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
+                                find_dup_ids_sql = f"""
+                                    SELECT t.`{pk_real}` as del_id
+                                    FROM `{database}`.`{table}` t
+                                    JOIN `{database}`.`{temp_table}` tmp
+                                    ON {' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
+                                    {where_full}
+                                    LIMIT {self.batch_size}
+                                """
+                                logger.debug('查找待删除重复id SQL', {'sql': find_dup_ids_sql})
+                                cursor.execute(find_dup_ids_sql)
+                                del_ids = [row['del_id'] for row in cursor.fetchall()]
+                                if not del_ids:
+                                    break
+                                del_ids_str = ','.join([str(i) for i in del_ids])
+                                delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                                logger.debug('按id批量删除SQL', {'sql': delete_sql, 'ids': del_ids})
+                                cursor.execute(delete_sql)
+                                batch_deleted = cursor.rowcount
+                                affected_rows += batch_deleted
+                                conn.commit()
+                                if batch_deleted == 0:
+                                    logger.warning('检测到未能删除任何数据,强制跳出循环,防止假死', {"库": database, "表": table})
+                                    break
+                                if batch_deleted < self.batch_size:
+                                    break
+                            logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode, "数据日期": date_val})
+                        else:
+                            logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组": dup_count})
+                            affected_rows = 0
+                        cursor.execute(drop_temp_sql)
+                        conn.commit()
+                        return (dup_count, affected_rows)
+            # 没有date_column,处理全表
+            # ...existing code for full-table deduplication (as before, but without recursion)...
             use_columns = columns or all_columns
             use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
             invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
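The heart of the new `use_python_dedup` path is ordinary dictionary grouping: rows are keyed by the tuple of dedup-column values, the first primary key in each group survives (`keep_one`), and the rest are deleted in batches. Reduced to pure data, with hypothetical names:

```python
from collections import defaultdict

def find_duplicate_ids(rows, key_cols, id_col='id'):
    # Group ids by the tuple of dedup-column values, as _deduplicate_table does.
    grouped = defaultdict(list)
    for row in rows:
        grouped[tuple(row[c] for c in key_cols)].append(row[id_col])
    dup_groups, del_ids = 0, []
    for ids in grouped.values():
        if len(ids) > 1:
            dup_groups += 1
            del_ids.extend(ids[1:])  # keep_one: the first id in each group survives
    return dup_groups, del_ids

rows = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'a'}, {'id': 3, 'name': 'b'}]
assert find_duplicate_ids(rows, ['name']) == (1, [2])
```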
@@ -477,70 +529,53 @@ class MySQLDeduplicator:
                 return (0, 0)
             pk = self.primary_key
             pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
-            #
-
-            if has_time_col and dedup_start_date and dedup_end_date:
-                where_sql = f"t.`{time_col}` >= '{dedup_start_date}' AND t.`{time_col}` <= '{dedup_end_date}'"
-            # 获取原始数据总量(只统计区间内数据)
-            with self._get_connection() as conn:
+            # 获取原始数据总量
+            with self._conn_ctx() as conn:
                 with conn.cursor() as cursor:
-
-                    count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+                    count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
                     logger.debug('执行SQL', {'sql': count_sql})
                     cursor.execute(count_sql)
                     total_count_row = cursor.fetchone()
                     total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
-            logger.
+            logger.debug('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
             column_list = ', '.join([f'`{col}`' for col in use_columns])
-
-            # 用Python查找重复
             if use_python_dedup:
-                # 1. 拉取所有数据(生成器分批拉取)
                 select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
-                select_where =
-                select_sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where}"
-                logger.debug('用Python查找重复,拉取数据SQL', {'sql': select_sql})
-                # 用生成器分批拉取
+                select_where = ''
                 grouped = defaultdict(list)
                 for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
                     key = tuple(row[col] for col in use_columns)
                     grouped[key].append(row[pk_real])
-                # 2. 统计重复组和待删除id
                 dup_count = 0
                 del_ids = []
                 for ids in grouped.values():
                     if len(ids) > 1:
                         dup_count += 1
-                            del_ids.extend(ids[1:])
+                        del_ids.extend(ids[1:])
                 affected_rows = 0
                 if not dry_run and del_ids:
-                    with self.
+                    with self._conn_ctx() as conn:
                         with conn.cursor() as cursor:
                             for i in range(0, len(del_ids), self.batch_size):
-
-                                del_ids_str = ','.join([str(i) for i in
+                                batch_ids = del_ids[i:i+self.batch_size]
+                                del_ids_str = ','.join([str(i) for i in batch_ids])
                                 delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
-                                logger.debug('用Python分批删除SQL', {'sql': delete_sql, 'ids': batch})
                                 cursor.execute(delete_sql)
                                 batch_deleted = cursor.rowcount
                                 affected_rows += batch_deleted
                             conn.commit()
-                logger.
+                logger.debug('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode})
                 return (dup_count, affected_rows)
-            # SQL方式查找重复
             temp_table = self._make_temp_table_name(table)
             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
-            # 创建临时表时加where条件
-            create_temp_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
             create_temp_sql = f"""
                 CREATE TABLE `{database}`.`{temp_table}` AS
                 SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
                 FROM `{database}`.`{table}`
-                {create_temp_where}
                 GROUP BY {column_list}
                 HAVING COUNT(*) > 1
             """
-            with self.
+            with self._conn_ctx() as conn:
                 with conn.cursor() as cursor:
                     logger.debug('创建临时表SQL', {'sql': create_temp_sql})
                     cursor.execute(create_temp_sql)
@@ -548,7 +583,7 @@ class MySQLDeduplicator:
                     dup_count_row = cursor.fetchone()
                     dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
                     if dup_count == 0:
-                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count
+                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count})
                         cursor.execute(drop_temp_sql)
                         conn.commit()
                         return (0, 0)
@@ -558,8 +593,6 @@ class MySQLDeduplicator:
                             where_clauses = []
                             if self.duplicate_keep_mode == 'keep_one':
                                 where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
-                            if where_sql.strip():
-                                where_clauses.append(where_sql.strip())
                             where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
                             find_dup_ids_sql = f"""
                                 SELECT t.`{pk_real}` as del_id
@@ -586,7 +619,7 @@ class MySQLDeduplicator:
                                 break
                             if batch_deleted < self.batch_size:
                                 break
-                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode
+                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode})
                     else:
                         logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组": dup_count})
                         affected_rows = 0
@@ -595,10 +628,9 @@ class MySQLDeduplicator:
                     return (dup_count, affected_rows)
         except Exception as e:
             logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
-            # 异常时也要清理临时表
             if temp_table:
                 try:
-                    with self.
+                    with self._conn_ctx() as conn:
                         with conn.cursor() as cursor:
                             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
                             cursor.execute(drop_temp_sql)
@@ -611,26 +643,26 @@ class MySQLDeduplicator:
             self._release_table_lock(database, table)

     def deduplicate_table(
-
-
-
-
-
-
-
+        self,
+        database: str,
+        table: str,
+        columns: Optional[List[str]] = None,
+        dry_run: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Tuple[int, int]:
         """
-
+        对指定表进行去重。始终按天分区(如有 date_column),否则全表。

         Args:
             database (str): 数据库名。
             table (str): 表名。
-            columns (Optional[List[str]]):
-            dry_run (bool):
-            reorder_id (bool):
-            use_python_dedup (bool): 是否用Python
+            columns (Optional[List[str]]): 指定去重列。
+            dry_run (bool): 是否为模拟运行。
+            reorder_id (bool): 去重后是否自动重排 id 列。
+            use_python_dedup (bool): 是否用 Python 方式去重。
         Returns:
-            Tuple[int, int]: (重复组数, 实际删除行数)
+            Tuple[int, int]: (重复组数, 实际删除行数)
         """
         if database.lower() in self.exclude_tables and table.lower() in self.exclude_tables[database.lower()]:
             logger.info('表被排除', {"库": database, "表": table, "操作": "跳过"})
@@ -639,10 +671,76 @@ class MySQLDeduplicator:
             if not self._check_table_exists(database, table):
                 logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
                 return (0, 0)
-            logger.info('单表开始', {
-
-
-
+            logger.info('单表开始', {
+                "库": database,
+                "表": table,
+                # "参数": {
+                #     "指定去重列": columns,
+                #     "去重方式": "Python" if use_python_dedup else "SQL",
+                #     "数据处理": self.duplicate_keep_mode,
+                #     "模拟运行": dry_run,
+                #     '排除列': self.exclude_columns,
+                # },
+            })
+            all_columns = self._get_table_columns(database, table)
+            all_columns_lower = [col.lower() for col in all_columns]
+            time_col = self.date_column
+            time_col_lower = time_col.lower() if time_col else None
+            has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
+            if has_time_col:
+                self._ensure_index(database, table, time_col)
+                all_dates = self._get_all_dates(database, table, time_col)
+                # 按date_range/recent_month筛选日期
+                start_date = self._dedup_start_date
+                end_date = self._dedup_end_date
+                if start_date and end_date:
+                    all_dates = [d for d in all_dates if str(start_date) <= str(d) <= str(end_date)]
+                if not all_dates:
+                    logger.info('无可处理日期', {"库": database, "表": table})
+                    return (0, 0)
+                total_dup = 0
+                total_del = 0
+                def process_date(date_val):
+                    try:
+                        logger.debug('按天分区去重', {"库": database, "表": table, "日期": date_val})
+                        dup_count, affected_rows = self._deduplicate_table(
+                            database, table, columns, dry_run, use_python_dedup,
+                            date_val=date_val, lock_table=False
+                        )
+                        return (dup_count, affected_rows, date_val, None)
+                    except Exception as e:
+                        logger.error('分区去重异常', {"库": database, "表": table, "日期": date_val, "异常": str(e), "func": sys._getframe().f_code.co_name})
+                        return (0, 0, date_val, str(e))
+                if self.max_workers > 1:
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                        future_to_date = {executor.submit(process_date, date_val): date_val for date_val in all_dates}
+                        for future in concurrent.futures.as_completed(future_to_date):
+                            dup_count, affected_rows, date_val, err = future.result()
+                            if err:
+                                logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
+                            total_dup += dup_count
+                            total_del += affected_rows
+                else:
+                    for date_val in all_dates:
+                        dup_count, affected_rows, _, err = process_date(date_val)
+                        if err:
+                            logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
+                        total_dup += dup_count
+                        total_del += affected_rows
+                logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
+                # 自动重排id列(仅当有实际删除时且reorder_id为True)
+                if reorder_id and total_del > 0:
+                    try:
+                        reorder_ok = self.reorder_id_column(database, table, id_column=self.primary_key, dry_run=dry_run)
+                        logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
+                    except Exception as e:
+                        logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+                if affected_rows > 0:
+                    logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
+                return (total_dup, total_del)
+            # 没有date_column,直接全表去重
+            result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
+            logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
             dup_count, affected_rows = result
             if reorder_id and affected_rows > 0:
                 try:
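`deduplicate_table` now fans out one `_deduplicate_table` call per distinct date and sums the results as futures complete. The pattern, stripped of logging and locking (the helper name and worker signature here are illustrative, not mdbq's API):

```python
import concurrent.futures

def dedup_by_day(all_dates, process_date, max_workers=2):
    # process_date(date) -> (dup_groups, deleted_rows); one task per distinct day.
    total_dup = total_del = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_date, d): d for d in all_dates}
        for future in concurrent.futures.as_completed(futures):
            dup, deleted = future.result()
            total_dup += dup
            total_del += deleted
    return total_dup, total_del
```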
@@ -650,34 +748,36 @@ class MySQLDeduplicator:
                     logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
                 except Exception as e:
                     logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+            if affected_rows > 0:
+                logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
             return result
         except Exception as e:
             logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
             return (0, 0)

     def deduplicate_database(
-
-
-
-
-
-
-
-
+        self,
+        database: str,
+        tables: Optional[List[str]] = None,
+        columns_map: Optional[Dict[str, List[str]]] = None,
+        dry_run: bool = False,
+        parallel: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Dict[str, Tuple[int, int]]:
         """
-
+        对指定数据库的所有表进行去重。调用 deduplicate_table,自动适配分天。

         Args:
             database (str): 数据库名。
-            tables (Optional[List[str]]):
-            columns_map (Optional[Dict[str, List[str]]]):
+            tables (Optional[List[str]]): 指定表名列表。
+            columns_map (Optional[Dict[str, List[str]]]): 每个表的去重列映射。
             dry_run (bool): 是否为模拟运行。
-            parallel (bool):
-            reorder_id (bool):
-            use_python_dedup (bool): 是否用Python
+            parallel (bool): 是否并行处理表。
+            reorder_id (bool): 去重后是否自动重排 id 列。
+            use_python_dedup (bool): 是否用 Python 方式去重。
         Returns:
-            Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}
+            Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}
         """
         results = {}
         try:
@@ -693,8 +793,6 @@ class MySQLDeduplicator:
                 return results
             logger.info('库统计', {"库": database, "表数量": len(target_tables), "表列表": target_tables})
             if parallel and self.max_workers > 1:
-                logger.debug('并行处理表', {'库': database, 'max_workers': self.max_workers})
-                # 使用线程池并行处理
                 with concurrent.futures.ThreadPoolExecutor(
                     max_workers=self.max_workers
                 ) as executor:
@@ -715,8 +813,6 @@ class MySQLDeduplicator:
                         logger.error('异常', {"库": database, "表": table, "error": str(e), 'traceback': repr(e)})
                         results[table] = (0, 0)
             else:
-                logger.debug('串行处理表', {'库': database})
-                # 串行处理
                 for table in target_tables:
                     columns = columns_map.get(table) if columns_map else None
                     dup_count, affected_rows = self.deduplicate_table(
@@ -725,35 +821,39 @@ class MySQLDeduplicator:
                     results[table] = (dup_count, affected_rows)
             total_dup = sum(r[0] for r in results.values())
             total_del = sum(r[1] for r in results.values())
-            logger.
+            logger.debug('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
+            # 只显示有删除的详细结果
+            if total_del > 0:
+                filtered_results = {tbl: res for tbl, res in results.items() if res[1] > 0}
+                logger.info('库完成(仅显示有删除的结果)', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": filtered_results})
             return results
         except Exception as e:
             logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
             return results

     def deduplicate_all(
-
-
-
-
-
-
-
-
+        self,
+        databases: Optional[List[str]] = None,
+        tables_map: Optional[Dict[str, List[str]]] = None,
+        columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
+        dry_run: bool = False,
+        parallel: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Dict[str, Dict[str, Tuple[int, int]]]:
         """
-
+        对所有数据库进行去重。调用 deduplicate_database,自动适配分天。

         Args:
-            databases (Optional[List[str]]):
-            tables_map (Optional[Dict[str, List[str]]]):
-            columns_map (Optional[Dict[str, Dict[str, List[str]]]]):
-            dry_run (bool):
-            parallel (bool):
-            reorder_id (bool):
-            use_python_dedup (bool): 是否用Python
+            databases (Optional[List[str]]): 指定数据库名列表。
+            tables_map (Optional[Dict[str, List[str]]]): 每个库的表名映射。
+            columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 每个库每个表的去重列映射。
+            dry_run (bool): 是否为模拟运行。
+            parallel (bool): 是否并行处理库。
+            reorder_id (bool): 去重后是否自动重排 id 列。
+            use_python_dedup (bool): 是否用 Python 方式去重。
         Returns:
-            Dict[str, Dict[str, Tuple[int, int]]]:
+            Dict[str, Dict[str, Tuple[int, int]]]: {库: {表: (重复组数, 实际删除行数)}}
         """
         all_results: Dict[str, Dict[str, Tuple[int, int]]] = defaultdict(dict)
         try:
@@ -763,9 +863,19 @@ class MySQLDeduplicator:
             if not target_dbs:
                 logger.warning('没有可处理的数据库')
                 return all_results
-            logger.info('全局开始', {
-
-
+            logger.info('全局开始', {
+                "数据库数量": len(target_dbs),
+                "数据库列表": target_dbs,
+                "参数": {
+                    "模拟运行": dry_run,
+                    "并行处理": parallel,
+                    '排除列': self.exclude_columns,
+                    '重排id': reorder_id,
+                    'use_python_dedup': use_python_dedup
+                },
+            })
+            # 如果parallel=True且库数量大于1,则只在外层并发,内层串行
+            if parallel and self.max_workers > 1 and len(target_dbs) > 1:
                 with concurrent.futures.ThreadPoolExecutor(
                     max_workers=self.max_workers
                 ) as executor:
@@ -773,6 +883,7 @@ class MySQLDeduplicator:
                     for db in target_dbs:
                         tables = tables_map.get(db) if tables_map else None
                         db_columns_map = columns_map.get(db) if columns_map else None
+                        # 内层强制串行
                         futures[executor.submit(
                             self.deduplicate_database,
                             db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
@@ -786,7 +897,6 @@ class MySQLDeduplicator:
                         logger.error('异常', {"库": db, "error": str(e), 'traceback': repr(e)})
                         all_results[db] = {}
             else:
-                # 串行处理数据库
                 for db in target_dbs:
                     tables = tables_map.get(db) if tables_map else None
                     db_columns_map = columns_map.get(db) if columns_map else None
@@ -802,13 +912,42 @@ class MySQLDeduplicator:
                 r[1] for db in all_results.values()
                 for r in db.values()
             )
-            logger.
+            logger.debug('全局完成', {
+                "总重复组": total_dup,
+                "总删除行": total_del,
+                "参数": {
+                    "模拟运行": dry_run,
+                    "并行处理": parallel,
+                    '排除列': self.exclude_columns,
+                    '重排id': reorder_id,
+                    'use_python_dedup': use_python_dedup
+                },
+                "详细结果": dict(all_results)
+            })
+            # 只显示有删除的详细结果
+            if total_del > 0:
+                filtered_results = {
+                    db: {tbl: res for tbl, res in tbls.items() if res[1] > 0}
+                    for db, tbls in all_results.items()
+                }
+                filtered_results = {db: tbls for db, tbls in filtered_results.items() if tbls}
+                logger.info('全局完成(仅显示有删除的结果)', {
+                    "总重复组": total_dup,
+                    "总删除行": total_del,
+                    "参数": {
+                        "模拟运行": dry_run,
+                        "并行处理": parallel,
+                        '排除列': self.exclude_columns,
+                        '重排id': reorder_id,
+                        'use_python_dedup': use_python_dedup
+                    },
+                    "详细结果": filtered_results
+                })
             return all_results
         except Exception as e:
             logger.error('异常', {"error": str(e), 'traceback': repr(e)})
             return all_results

-    @_retry_on_failure
     def _check_database_exists(self, database: str) -> bool:
         """
         检查数据库是否存在。
@@ -819,13 +958,11 @@ class MySQLDeduplicator:
             bool: 数据库是否存在。
         """
         sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database,))
                 return bool(cursor.fetchone())

-    @_retry_on_failure
     def _check_table_exists(self, database: str, table: str) -> bool:
         """
         检查表是否存在。
@@ -841,12 +978,35 @@ class MySQLDeduplicator:
             FROM INFORMATION_SCHEMA.TABLES
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
         """
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database, table))
                 return bool(cursor.fetchone())

+    def _get_table_info(self, database: str, table: str, id_column: str = None):
+        """
+        获取表的所有列名、主键列名列表、指定id列是否为主键。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            id_column (str): id列名,默认使用self.primary_key。
+        Returns:
+            Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
+        """
+        id_column = id_column or self.primary_key
+        with self._conn_ctx() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    SELECT COLUMN_NAME, COLUMN_KEY
+                    FROM INFORMATION_SCHEMA.COLUMNS
+                    WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
+                """, (database, table))
+                columns_info = cursor.fetchall()
+                columns = [row['COLUMN_NAME'] for row in columns_info]
+                pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
+                id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+                return columns, pk_cols, id_is_pk
+
     def close(self) -> None:
         """
         关闭连接池。
@@ -895,15 +1055,16 @@ class MySQLDeduplicator:
         auto_drop_backup: bool = True
     ) -> Any:
         """
-        安全重排指定表或指定库下所有表的id列为顺序自增(1,2,3...)。
+        安全重排指定表或指定库下所有表的 id 列为顺序自增(1,2,3...)。
+
         Args:
-            database (str):
-            table (Optional[str]): 表名,None
-            id_column (str): id列名,默认"id"
-            dry_run (bool):
-            auto_drop_backup (bool):
+            database (str): 数据库名。
+            table (Optional[str]): 表名,None 时批量处理该库所有表。
+            id_column (str): id 列名,默认 "id"。
+            dry_run (bool): 是否为模拟运行。
+            auto_drop_backup (bool): 校验通过后自动删除备份表。
         Returns:
-            bool 或 dict: 单表时bool,批量时{表名: bool}
+            bool 或 dict: 单表时 bool,批量时 {表名: bool}
         """
         if not table:
             # 批量模式,对库下所有表执行
@@ -942,7 +1103,7 @@ class MySQLDeduplicator:
             logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
             return False
         # 检查外键约束
-        with self.
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute("""
                     SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
@@ -952,7 +1113,7 @@ class MySQLDeduplicator:
                     logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
                     return False
         # 获取表结构
-        with self.
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
                 create_table_sql = cursor.fetchone()['Create Table']
|
|
965
1126
|
backup_table = self._make_backup_table_name(table)
|
966
1127
|
backup_table_quoted = f"`{database}`.`{backup_table}`"
|
967
1128
|
try:
|
968
|
-
with self.
|
1129
|
+
with self._conn_ctx() as conn:
|
969
1130
|
with conn.cursor() as cursor:
|
970
1131
|
# 1. 创建临时表,结构同原表
|
971
1132
|
try:
|
@@ -1026,7 +1187,7 @@ class MySQLDeduplicator:
                         logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
                         return False
                     logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
-                    # 5.
+                    # 5. 自动删除备份表
                     if auto_drop_backup:
                         try:
                             cursor.execute(f"DROP TABLE {backup_table_quoted}")
@@ -1037,7 +1198,7 @@ class MySQLDeduplicator:
         except Exception as e:
             logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
             # 回滚:如临时表存在则删掉,恢复原表结构
-            with self.
+            with self._conn_ctx() as conn:
                 with conn.cursor() as cursor:
                     try:
                         cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
@@ -1045,7 +1206,7 @@ class MySQLDeduplicator:
                         logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
                     # 恢复原表(如备份表存在)
                     try:
-                        with self.
+                        with self._conn_ctx() as conn2:
                             with conn2.cursor() as cursor2:
                                 if self._check_table_exists(database, backup_table):
                                     cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
|
|
1057
1218
|
finally:
|
1058
1219
|
self._release_table_lock(database, table)
|
1059
1220
|
|
1221
|
+
def _acquire_table_lock(self, database: str, table: str, timeout: int = 60) -> bool:
|
1222
|
+
"""
|
1223
|
+
获取表级锁,防止多线程/多进程并发操作同一张表。
|
1224
|
+
Args:
|
1225
|
+
database (str): 数据库名。
|
1226
|
+
table (str): 表名。
|
1227
|
+
timeout (int): 等待锁的超时时间(秒)。
|
1228
|
+
Returns:
|
1229
|
+
bool: 是否成功获取锁。
|
1230
|
+
"""
|
1231
|
+
key = f"{database.lower()}::{table.lower()}"
|
1232
|
+
start_time = time.time()
|
1233
|
+
while True:
|
1234
|
+
with self._lock:
|
1235
|
+
if key not in self._processing_tables:
|
1236
|
+
self._processing_tables.add(key)
|
1237
|
+
return True
|
1238
|
+
if time.time() - start_time > timeout:
|
1239
|
+
logger.warning('获取表级锁超时', {"库": database, "表": table, "timeout": timeout})
|
1240
|
+
return False
|
1241
|
+
time.sleep(0.2)
|
1242
|
+
|
1243
|
+
def _release_table_lock(self, database: str, table: str) -> None:
|
1244
|
+
"""
|
1245
|
+
释放表级锁。
|
1246
|
+
Args:
|
1247
|
+
database (str): 数据库名。
|
1248
|
+
table (str): 表名。
|
1249
|
+
Returns:
|
1250
|
+
None
|
1251
|
+
"""
|
1252
|
+
key = f"{database.lower()}::{table.lower()}"
|
1253
|
+
with self._lock:
|
1254
|
+
self._processing_tables.discard(key)
|
1255
|
+
|
1060
1256
|
@staticmethod
|
1061
1257
|
def _make_safe_table_name(base: str, prefix: str = '', suffix: str = '', max_length: int = 64) -> str:
|
1062
1258
|
"""
|
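The table lock also changes semantics: 3.11.x failed fast when another worker held the table, while 3.12.0 polls every 0.2 s until a 60 s timeout and normalizes the lock key to lower case. A standalone model of the new behaviour:

```python
import threading
import time

_lock = threading.Lock()
_processing = set()

def acquire(key, timeout=60):
    # Poll until the key is free or the timeout elapses
    # (the 3.11.x version returned False immediately instead).
    start = time.time()
    while True:
        with _lock:
            if key not in _processing:
                _processing.add(key)
                return True
        if time.time() - start > timeout:
            return False
        time.sleep(0.2)

def release(key):
    with _lock:
        _processing.discard(key)
```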
@@ -1077,30 +1273,6 @@ class MySQLDeduplicator:
             return (prefix + suffix)[:max_length]
         return f"{prefix}{base[:remain]}{suffix}"[:max_length]

-    def _get_table_info(self, database: str, table: str, id_column: str = None):
-        """
-        获取表的所有列名、主键列名列表、指定id列是否为主键。
-        Args:
-            database (str): 数据库名。
-            table (str): 表名。
-            id_column (str): id列名,默认使用self.primary_key。
-        Returns:
-            Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
-        """
-        id_column = id_column or self.primary_key
-        with self._get_connection() as conn:
-            with conn.cursor() as cursor:
-                cursor.execute("""
-                    SELECT COLUMN_NAME, COLUMN_KEY
-                    FROM INFORMATION_SCHEMA.COLUMNS
-                    WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
-                """, (database, table))
-                columns_info = cursor.fetchall()
-                columns = [row['COLUMN_NAME'] for row in columns_info]
-                pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
-                id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
-                return columns, pk_cols, id_is_pk
-
     def _make_temp_table_name(self, base: str) -> str:
         """
         生成临时表名,带有 temp_ 前缀和 _dedup_ 进程线程后缀。
@@ -1122,26 +1294,35 @@ def main():
         password='pwd',
         host='localhost',
         port=3306,
-
-
-
-
-
-
-
-
-
-
+        max_workers=2,
+        batch_size=1000,
+        skip_system_dbs=True,
+        max_retries=3,
+        retry_waiting_time=5,
+        # pool_size=30,
+        recent_month=1,
+        # date_range=['2025-06-09', '2025-06-10'],
+        date_column='日期',
+        exclude_databases=['测试库4'],
+        exclude_tables={
+            '推广数据2': [
+                '地域报表_城市_2025_04',
+                # '地域报表_城市_2025_04_copy1',
+            ],
+            "生意参谋3": [
+                "商品排行_2025",
+            ],
+        },
     )

     # 全库去重(单线程)
-    deduplicator.deduplicate_all(dry_run=
+    deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)

     # # 指定数据库去重(多线程)
-    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=
+    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reorder_id=True)

     # # 指定表去重(使用特定列)
-    # deduplicator.deduplicate_table('my_db', 'my_table', columns=[
+    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'data'], dry_run=False, reorder_id=True)

     # # 重排id列
     # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
{mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=W8WVhYkHLU0SBDlL9Q6XQVTqIrzYjc1kFBZgqzS_NEI,18
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
 mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=
+mdbq/mysql/deduplicator.py,sha256=KMJ_YyqAniaLVRqOHLgO92PgwknIDB-EgaOY7S6iMZ4,68599
 mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
 mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
 mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
-mdbq-3.11.10.dist-info/METADATA,sha256=
-mdbq-3.11.10.dist-info/WHEEL,sha256=
-mdbq-3.11.10.dist-info/top_level.txt,sha256=
-mdbq-3.11.10.dist-info/RECORD,,
+mdbq-3.12.0.dist-info/METADATA,sha256=Q6EyaC61H4okFva6YFV2a0Y3Iqun8L8mnpSkeVXcFdc,364
+mdbq-3.12.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.12.0.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.12.0.dist-info/RECORD,,
{mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/WHEEL
File without changes
{mdbq-3.11.10.dist-info → mdbq-3.12.0.dist-info}/top_level.txt
File without changes