mdbq 3.11.8__py3-none-any.whl → 3.11.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/log/mylogger.py +1 -1
- mdbq/mysql/deduplicator.py +305 -107
- mdbq/mysql/uploader.py +49 -1
- {mdbq-3.11.8.dist-info → mdbq-3.11.10.dist-info}/METADATA +1 -1
- {mdbq-3.11.8.dist-info → mdbq-3.11.10.dist-info}/RECORD +8 -8
- {mdbq-3.11.8.dist-info → mdbq-3.11.10.dist-info}/WHEEL +0 -0
- {mdbq-3.11.8.dist-info → mdbq-3.11.10.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
```diff
@@ -1 +1 @@
-VERSION = '3.11.8'
+VERSION = '3.11.10'
```
mdbq/log/mylogger.py
CHANGED
```diff
@@ -247,7 +247,7 @@ class MyLogger:
             if isinstance(log_data.get('message'), str):
                 log_data['message'] = log_data['message'].replace(field, '***')
 
-        return json.dumps(log_data, ensure_ascii=False)
+        return json.dumps(log_data, ensure_ascii=False, default=str)
 
         formatter = StructuredFormatter()
 
```
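The single change hardens the structured formatter against non-JSON-serializable payload fields (datetime, Decimal, set, ...): `default=str` makes `json.dumps` fall back to `str()` for anything it cannot encode instead of raising. A standalone illustration:

```python
import json
from datetime import datetime

log_data = {"event": "upload", "ts": datetime(2025, 5, 27, 12, 0)}

# Without default=str, non-serializable values raise TypeError
try:
    json.dumps(log_data, ensure_ascii=False)
except TypeError as e:
    print(e)  # Object of type datetime is not JSON serializable

# With default=str, unknown types are coerced to their str() form
print(json.dumps(log_data, ensure_ascii=False, default=str))
# {"event": "upload", "ts": "2025-05-27 12:00:00"}
```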
mdbq/mysql/deduplicator.py
CHANGED
```diff
@@ -37,7 +37,7 @@ class MySQLDeduplicator:
 
     功能:
     1. 自动检测并删除MySQL数据库中的重复数据
-    2.
+    2. 支持全库扫描或指定表处理
     3. 支持多线程/多进程安全处理
     4. 完善的错误处理和日志记录
 
```
```diff
@@ -114,7 +114,7 @@ class MySQLDeduplicator:
         )
 
         # 配置参数
-        self.max_workers = max(1, max_workers)
+        self.max_workers = min(max(1, max_workers), pool_size)  # 限制最大线程数,不能超过连接池
         self.batch_size = batch_size
         self.skip_system_dbs = skip_system_dbs
         self.max_retries = max_retries
```
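The new expression clamps the worker count into `[1, pool_size]`: at least one thread, and never more threads than pooled connections (extra threads would only block waiting for a connection checkout). A minimal sketch of the same clamp, with `max_workers` and `pool_size` assumed to be the constructor arguments used above:

```python
def clamp_workers(max_workers: int, pool_size: int) -> int:
    # inner max(): never fewer than one worker
    # outer min(): never more workers than pooled connections
    return min(max(1, max_workers), pool_size)

assert clamp_workers(0, 5) == 1   # floor at one
assert clamp_workers(8, 5) == 5   # capped by the pool
assert clamp_workers(3, 5) == 3   # sane values pass through
```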
```diff
@@ -215,7 +215,7 @@ class MySQLDeduplicator:
         last_exception = None
         for attempt in range(self.max_retries + 1):
             try:
-                logger.debug(f'调用{func.__name__},第{attempt+1}次连接')
+                logger.debug(f'调用{func.__name__},第{attempt+1}次连接', {'args': args, 'kwargs': kwargs})
                 return func(self, *args, **kwargs)
             except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                 last_exception = e
```
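This hunk sits inside the `_retry_on_failure` decorator that guards the connection-touching methods. The diff only shows the debug line, but the surrounding context implies a conventional retry loop over pymysql connection errors; a hedged sketch of that pattern (the backoff policy here is assumed, not taken from the package):

```python
import functools
import time

import pymysql

def retry_on_failure(func):
    """Retry a method on connection-level pymysql errors (sketch, not the package's exact code)."""
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        last_exception = None
        for attempt in range(self.max_retries + 1):
            try:
                return func(self, *args, **kwargs)
            except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                last_exception = e
                time.sleep(min(2 ** attempt, 10))  # simple capped backoff
        raise last_exception
    return wrapper
```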
```diff
@@ -269,7 +269,8 @@ class MySQLDeduplicator:
             with conn.cursor() as cursor:
                 cursor.execute(f"USE `{database}`")
                 cursor.execute(sql)
-                return [row[f'Tables_in_{database}'] for row in cursor.fetchall()]
+                # 严格过滤所有以'temp_'为前缀的表名(如temp_xxx、temp_xxx_dedup_...、temp_xxx_reorderid_...等)
+                return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]
 
     @_retry_on_failure
     def _get_table_columns(self, database: str, table: str) -> List[str]:
```
```diff
@@ -328,129 +329,267 @@ class MySQLDeduplicator:
             if key in self._processing_tables:
                 self._processing_tables.remove(key)
 
+    @_retry_on_failure
+    def _ensure_index(self, database: str, table: str, date_column: str) -> None:
+        """
+        检查并为date_column自动创建索引(如果未存在)。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            date_column (str): 需要检查的日期列名。
+        """
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                # 检查索引是否已存在
+                cursor.execute(
+                    """
+                    SELECT COUNT(1) as idx_count FROM INFORMATION_SCHEMA.STATISTICS
+                    WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s
+                    """,
+                    (database, table, date_column)
+                )
+                idx_count = cursor.fetchone()['idx_count']
+                if idx_count == 0:
+                    # 自动创建索引
+                    index_name = f"idx_{date_column}"
+                    safe_index_name = self._make_safe_table_name(index_name, prefix='', suffix='', max_length=64)
+                    try:
+                        cursor.execute(f"CREATE INDEX `{safe_index_name}` ON `{database}`.`{table}` (`{date_column}`)")
+                        conn.commit()
+                        logger.info('已自动为date_column创建索引', {"库": database, "表": table, "date_column": date_column, "索引名": safe_index_name})
+                    except Exception as e:
+                        logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
+                else:
+                    logger.debug('date_column已存在索引', {"库": database, "表": table, "date_column": date_column})
+
+    def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
+        """
+        生成器:分批拉取表数据,避免一次性加载全部数据到内存。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            select_cols (str): 选择的列字符串。
+            select_where (str): where条件字符串。
+            batch_size (int): 每批拉取的行数。
+        Yields:
+            dict: 每行数据。
+        """
+        offset = 0
+        while True:
+            sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql)
+                    rows = cursor.fetchall()
+                    if not rows:
+                        break
+                    for row in rows:
+                        yield row
+                    if len(rows) < batch_size:
+                        break
+            offset += batch_size
+
+    def _get_all_dates(self, database: str, table: str, date_column: str) -> list:
+        """
+        获取表中所有不同的日期分区(按天)。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            date_column (str): 日期列名。
+        Returns:
+            List: 所有不同的日期(字符串)。
+        """
+        sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute(sql)
+                return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
+
     def _deduplicate_table(
         self,
         database: str,
         table: str,
         columns: Optional[List[str]] = None,
-        dry_run: bool = False
+        dry_run: bool = False,
+        use_python_dedup: bool = False,
+        dedup_start_date: Optional[str] = None,
+        dedup_end_date: Optional[str] = None,
+        lock_table: bool = True
     ) -> Tuple[int, int]:
         """
         执行单表去重。
-
-            table (str): 表名。
-            columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
-            dry_run (bool): 是否为模拟运行(只统计不实际删除)。
-        Returns:
-            Tuple[int, int]: (重复组数, 实际删除行数)。
+        支持按天分批处理(如果表包含date_column),否则全表去重。
+        如果date_column在exclude_columns中,直接跳过该表。
+        优化:分批删除时用主键、避免重复建/删临时表、并发处理每天。
         """
-        if not self._acquire_table_lock(database, table):
+        if lock_table and not self._acquire_table_lock(database, table):
             return (0, 0)
         temp_table = None
         try:
-            # 获取原始数据总量
-            with self._get_connection() as conn:
-                with conn.cursor() as cursor:
-                    logger.debug('执行SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{table}`'})
-                    cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
-                    total_count_row = cursor.fetchone()
-                    total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
-            logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
             # 获取实际列名
             all_columns = self._get_table_columns(database, table)
-            logger.debug('获取表列', {'库': database, '表': table, 'all_columns': all_columns})
-            # 检查是否需要按时间范围过滤
-            use_time_filter = False
-            time_col = self.date_column
             all_columns_lower = [col.lower() for col in all_columns]
-            # 排除exclude_columns
             exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
-
+            time_col = self.date_column
+            time_col_lower = time_col.lower() if time_col else None
+            # 1. 跳过date_column在exclude_columns的情况
+            if time_col_lower and time_col_lower in exclude_columns_lower:
+                logger.warning('date_column在exclude_columns中,跳过该表', {"库": database, "表": table, "date_column": time_col, "exclude_columns": self.exclude_columns})
+                return (0, 0)
+            # 2. 判断表是否包含date_column
+            has_time_col = time_col_lower in all_columns_lower if time_col_lower else False
+            # 如果包含date_column,自动检查并创建索引
+            if has_time_col and dedup_start_date is None and dedup_end_date is None:
+                self._ensure_index(database, table, time_col)
+                # 按天分区多线程处理
+                all_dates = self._get_all_dates(database, table, time_col)
+                total_dup = 0
+                total_del = 0
+                def process_date(date_val):
+                    try:
+                        logger.debug('按天分区去重', {"库": database, "表": table, "日期": date_val})
+                        dup_count, affected_rows = self._deduplicate_table(
+                            database, table, columns, dry_run, use_python_dedup,
+                            dedup_start_date=date_val, dedup_end_date=date_val,
+                            lock_table=False
+                        )
+                        return (dup_count, affected_rows, date_val, None)
+                    except Exception as e:
+                        logger.error('分区去重异常', {"库": database, "表": table, "日期": date_val, "异常": str(e), "func": sys._getframe().f_code.co_name})
+                        return (0, 0, date_val, str(e))
+                with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                    future_to_date = {executor.submit(process_date, date_val): date_val for date_val in all_dates}
+                    for future in concurrent.futures.as_completed(future_to_date):
+                        dup_count, affected_rows, date_val, err = future.result()
+                        if err:
+                            logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
+                        total_dup += dup_count
+                        total_del += affected_rows
+                return (total_dup, total_del)
+            # 获取去重列
             use_columns = columns or all_columns
             use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
             invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
             if invalid_columns:
                 logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
             if not use_columns:
-                logger.error('没有有效的去重列', {"库": database, "表": table})
+                logger.error('没有有效的去重列', {"库": database, "表": table, "func": sys._getframe().f_code.co_name})
                 return (0, 0)
-            # 统一用反引号包裹
-            column_list = ', '.join([f'`{col}`' for col in use_columns])
-            temp_table = self._make_safe_table_name(table, prefix=f"temp_", suffix=f"_dedup_{os.getpid()}_{threading.get_ident()}")
             pk = self.primary_key
-            # 主键判断也用小写
-            if pk.lower() not in all_columns_lower and pk != 'id':
-                logger.error('', {"不存在主键列": database, "表": table, "主键列不存在": pk})
-                return (0, 0)
-            # 找到实际主键名
             pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
-            #
-
-            if
-
+            # 判断是否需要加日期区间条件
+            where_sql = ''
+            if has_time_col and dedup_start_date and dedup_end_date:
+                where_sql = f"t.`{time_col}` >= '{dedup_start_date}' AND t.`{time_col}` <= '{dedup_end_date}'"
+            # 获取原始数据总量(只统计区间内数据)
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    count_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
+                    count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
+                    logger.debug('执行SQL', {'sql': count_sql})
+                    cursor.execute(count_sql)
+                    total_count_row = cursor.fetchone()
+                    total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
+            logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name, "数据日期": dedup_end_date})
+            column_list = ', '.join([f'`{col}`' for col in use_columns])
+
+            # 用Python查找重复
+            if use_python_dedup:
+                # 1. 拉取所有数据(生成器分批拉取)
+                select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
+                select_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
+                select_sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where}"
+                logger.debug('用Python查找重复,拉取数据SQL', {'sql': select_sql})
+                # 用生成器分批拉取
+                grouped = defaultdict(list)
+                for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
+                    key = tuple(row[col] for col in use_columns)
+                    grouped[key].append(row[pk_real])
+                # 2. 统计重复组和待删除id
+                dup_count = 0
+                del_ids = []
+                for ids in grouped.values():
+                    if len(ids) > 1:
+                        dup_count += 1
+                        del_ids.extend(ids[1:])  # 只保留第一个
+                affected_rows = 0
+                if not dry_run and del_ids:
+                    with self._get_connection() as conn:
+                        with conn.cursor() as cursor:
+                            for i in range(0, len(del_ids), self.batch_size):
+                                batch = del_ids[i:i+self.batch_size]
+                                del_ids_str = ','.join([str(i) for i in batch])
+                                delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                                logger.debug('用Python分批删除SQL', {'sql': delete_sql, 'ids': batch})
+                                cursor.execute(delete_sql)
+                                batch_deleted = cursor.rowcount
+                                affected_rows += batch_deleted
+                        conn.commit()
+                logger.info('去重完成', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "Python", "数据处理": self.duplicate_keep_mode, "数据日期": dedup_end_date})
+                return (dup_count, affected_rows)
+            # SQL方式查找重复
+            temp_table = self._make_temp_table_name(table)
+            drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+            # 创建临时表时加where条件
+            create_temp_where = f"WHERE `{time_col}` >= '{dedup_start_date}' AND `{time_col}` <= '{dedup_end_date}'" if has_time_col and dedup_start_date and dedup_end_date else ''
             create_temp_sql = f"""
                 CREATE TABLE `{database}`.`{temp_table}` AS
                 SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
                 FROM `{database}`.`{table}`
-                {
+                {create_temp_where}
                 GROUP BY {column_list}
                 HAVING COUNT(*) > 1
             """
-            drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
             with self._get_connection() as conn:
                 with conn.cursor() as cursor:
                     logger.debug('创建临时表SQL', {'sql': create_temp_sql})
                     cursor.execute(create_temp_sql)
-                    logger.debug('统计临时表重复组SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`'})
                     cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
                     dup_count_row = cursor.fetchone()
                     dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
                     if dup_count == 0:
-                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "
-                        logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
+                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "数据日期": dedup_end_date})
                         cursor.execute(drop_temp_sql)
                         conn.commit()
                         return (0, 0)
                     affected_rows = 0
                     if not dry_run:
-                        # 分批删除,避免锁表
                         while True:
-
-                            cursor.execute(delete_dup_sql)
+                            where_clauses = []
+                            if self.duplicate_keep_mode == 'keep_one':
+                                where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
+                            if where_sql.strip():
+                                where_clauses.append(where_sql.strip())
+                            where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
+                            find_dup_ids_sql = f"""
+                                SELECT t.`{pk_real}` as del_id
+                                FROM `{database}`.`{table}` t
+                                JOIN `{database}`.`{temp_table}` tmp
+                                ON {' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
+                                {where_full}
+                                LIMIT {self.batch_size}
+                            """
+                            logger.debug('查找待删除重复id SQL', {'sql': find_dup_ids_sql})
+                            cursor.execute(find_dup_ids_sql)
+                            del_ids = [row['del_id'] for row in cursor.fetchall()]
+                            if not del_ids:
+                                break
+                            del_ids_str = ','.join([str(i) for i in del_ids])
+                            delete_sql = f"DELETE FROM `{database}`.`{table}` WHERE `{pk_real}` IN ({del_ids_str})"
+                            logger.debug('按id批量删除SQL', {'sql': delete_sql, 'ids': del_ids})
+                            cursor.execute(delete_sql)
                             batch_deleted = cursor.rowcount
                             affected_rows += batch_deleted
                             conn.commit()
+                            if batch_deleted == 0:
+                                logger.warning('检测到未能删除任何数据,强制跳出循环,防止假死', {"库": database, "表": table})
+                                break
                             if batch_deleted < self.batch_size:
                                 break
-                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "
+                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组": dup_count, "实际删除": affected_rows, "去重方式": "SQL", "数据处理": self.duplicate_keep_mode, "数据日期": dedup_end_date})
                     else:
-                        logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "
+                        logger.debug('dry_run模式,不执行删除', {"库": database, "表": table, "重复组": dup_count})
                         affected_rows = 0
-                    logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
                     cursor.execute(drop_temp_sql)
                     conn.commit()
                     return (dup_count, affected_rows)
```
```diff
@@ -468,14 +607,17 @@ class MySQLDeduplicator:
                 logger.error('异常时清理临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
             return (0, 0)
         finally:
-            self._release_table_lock(database, table)
+            if lock_table:
+                self._release_table_lock(database, table)
 
     def deduplicate_table(
         self,
         database: str,
         table: str,
         columns: Optional[List[str]] = None,
-        dry_run: bool = False
+        dry_run: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Tuple[int, int]:
         """
         对指定表进行去重。
```
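`lock_table=False` exists because the per-day branch re-enters `_deduplicate_table` recursively: the outer call owns the table lock, and the inner per-date calls must neither re-acquire nor release it. A minimal sketch of that protocol (names hypothetical, not the package's internals):

```python
import threading

_locks: dict[str, threading.Lock] = {}

def dedup(table: str, lock_table: bool = True) -> None:
    lock = _locks.setdefault(table, threading.Lock())
    # the outer call acquires; recursive per-day calls pass lock_table=False
    if lock_table and not lock.acquire(blocking=False):
        return  # another worker already owns this table
    try:
        ...  # dedup work; may recurse with lock_table=False for each day
    finally:
        if lock_table:
            lock.release()  # only the owner releases
```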
```diff
@@ -485,6 +627,8 @@ class MySQLDeduplicator:
             table (str): 表名。
             columns (Optional[List[str]]): 用于去重的列名列表(为None时使用所有列)。
             dry_run (bool): 是否为模拟运行(只统计不实际删除)。
+            reorder_id (bool): 去重后是否重排id。
+            use_python_dedup (bool): 是否用Python查找重复id。
         Returns:
             Tuple[int, int]: (重复组数, 实际删除行数)。
         """
```
```diff
@@ -495,9 +639,17 @@ class MySQLDeduplicator:
             if not self._check_table_exists(database, table):
                 logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
                 return (0, 0)
-            logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
-            result = self._deduplicate_table(database, table, columns, dry_run)
+            logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns, 'use_python_dedup': use_python_dedup}})
+            result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup)
             logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
+            # 自动重排id列(仅当有实际删除时且reorder_id为True)
+            dup_count, affected_rows = result
+            if reorder_id and affected_rows > 0:
+                try:
+                    reorder_ok = self.reorder_id_column(database, table, id_column=self.primary_key, dry_run=dry_run)
+                    logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
+                except Exception as e:
+                    logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
             return result
         except Exception as e:
             logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
```
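With the new parameters wired through, a public single-table call now looks like this (database, table, and column names are invented for illustration):

```python
dedup = MySQLDeduplicator(username='root', password='pwd', host='localhost', port=3306)

dup_groups, deleted = dedup.deduplicate_table(
    'my_db', 'my_table',
    columns=['name', 'date'],   # dedup key; None means all columns
    dry_run=True,               # count duplicates only, delete nothing
    reorder_id=False,           # skip id re-sequencing afterwards
    use_python_dedup=True,      # group rows in Python instead of a SQL temp table
)
print(dup_groups, deleted)
dedup.close()
```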
```diff
@@ -509,7 +661,9 @@ class MySQLDeduplicator:
         tables: Optional[List[str]] = None,
         columns_map: Optional[Dict[str, List[str]]] = None,
         dry_run: bool = False,
-        parallel: bool = False
+        parallel: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Dict[str, Tuple[int, int]]:
         """
         对指定数据库的所有表进行去重。
```
```diff
@@ -520,6 +674,8 @@ class MySQLDeduplicator:
             columns_map (Optional[Dict[str, List[str]]]): 各表使用的去重列 {表名: [列名]}。
             dry_run (bool): 是否为模拟运行。
             parallel (bool): 是否并行处理。
+            reorder_id (bool): 去重后是否重排id。
+            use_python_dedup (bool): 是否用Python查找重复id。
         Returns:
             Dict[str, Tuple[int, int]]: {表名: (重复组数, 实际删除行数)}。
         """
```
```diff
@@ -548,7 +704,7 @@ class MySQLDeduplicator:
                     logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
                     futures[executor.submit(
                         self.deduplicate_table,
-                        database, table, columns, dry_run
+                        database, table, columns, dry_run, reorder_id, use_python_dedup
                     )] = table
                 for future in concurrent.futures.as_completed(futures):
                     table = futures[future]
```
```diff
@@ -564,12 +720,12 @@ class MySQLDeduplicator:
                 for table in target_tables:
                     columns = columns_map.get(table) if columns_map else None
                     dup_count, affected_rows = self.deduplicate_table(
-                        database, table, columns, dry_run
+                        database, table, columns, dry_run, reorder_id, use_python_dedup
                     )
                     results[table] = (dup_count, affected_rows)
             total_dup = sum(r[0] for r in results.values())
             total_del = sum(r[1] for r in results.values())
-            logger.info('单库完成', {"库": database, "
+            logger.info('单库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
             return results
         except Exception as e:
             logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
```
```diff
@@ -581,7 +737,9 @@ class MySQLDeduplicator:
         tables_map: Optional[Dict[str, List[str]]] = None,
         columns_map: Optional[Dict[str, Dict[str, List[str]]]] = None,
         dry_run: bool = False,
-        parallel: bool = False
+        parallel: bool = False,
+        reorder_id: bool = False,
+        use_python_dedup: bool = True
     ) -> Dict[str, Dict[str, Tuple[int, int]]]:
         """
         对所有数据库进行去重。
```
```diff
@@ -592,6 +750,8 @@ class MySQLDeduplicator:
             columns_map (Optional[Dict[str, Dict[str, List[str]]]]): 指定每个表去重时使用的列,格式为 {数据库名: {表名: [列名, ...]}}。如果为 None,则使用所有列。
             dry_run (bool): 是否为模拟运行模式。为 True 时只统计重复行数,不实际删除。
             parallel (bool): 是否并行处理多个数据库。为 True 时使用线程池并发处理。
+            reorder_id (bool): 去重后是否重排id。
+            use_python_dedup (bool): 是否用Python查找重复id。
         Returns:
             Dict[str, Dict[str, Tuple[int, int]]]: 嵌套字典,格式为 {数据库名: {表名: (重复组数, 实际删除行数)}}。
         """
```
```diff
@@ -603,7 +763,7 @@ class MySQLDeduplicator:
             if not target_dbs:
                 logger.warning('没有可处理的数据库')
                 return all_results
-            logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns}})
+            logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns, 'use_python_dedup': use_python_dedup}})
             if parallel and self.max_workers > 1:
                 # 使用线程池并行处理多个数据库
                 with concurrent.futures.ThreadPoolExecutor(
```
```diff
@@ -615,7 +775,7 @@ class MySQLDeduplicator:
                         db_columns_map = columns_map.get(db) if columns_map else None
                         futures[executor.submit(
                             self.deduplicate_database,
-                            db, tables, db_columns_map, dry_run, False
+                            db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
                         )] = db
                     for future in concurrent.futures.as_completed(futures):
                         db = futures[future]
```
```diff
@@ -631,7 +791,7 @@ class MySQLDeduplicator:
                     tables = tables_map.get(db) if tables_map else None
                     db_columns_map = columns_map.get(db) if columns_map else None
                     db_results = self.deduplicate_database(
-                        db, tables, db_columns_map, dry_run, parallel
+                        db, tables, db_columns_map, dry_run, parallel, reorder_id, use_python_dedup
                     )
                     all_results[db] = db_results
             total_dup = sum(
```
```diff
@@ -642,7 +802,7 @@ class MySQLDeduplicator:
                 r[1] for db in all_results.values()
                 for r in db.values()
             )
-            logger.info('全局完成', {"
+            logger.info('全局完成', {"总重复组": total_dup, "总删除行": total_del, "详细结果": dict(all_results)})
             return all_results
         except Exception as e:
             logger.error('异常', {"error": str(e), 'traceback': repr(e)})
```
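The same two flags are threaded through `deduplicate_database` and `deduplicate_all`, so a whole-server pass can opt into the new behavior in one call; hypothetical usage:

```python
dedup = MySQLDeduplicator(username='root', password='pwd', host='localhost', port=3306)

results = dedup.deduplicate_all(
    dry_run=True,           # report only
    parallel=True,          # databases fan out onto the thread pool
    reorder_id=False,
    use_python_dedup=True,
)
# nested result: {database: {table: (duplicate_groups, deleted_rows)}}
for db, tables in results.items():
    for table, (dup_groups, deleted) in tables.items():
        print(db, table, dup_groups, deleted)
dedup.close()
```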
```diff
@@ -772,22 +932,12 @@ class MySQLDeduplicator:
         if not self._check_table_exists(database, table):
             logger.warning('表不存在,跳过id重排', {"库": database, "表": table})
             return False
-        # 检查id
-
-        with conn.cursor() as cursor:
-            cursor.execute("""
-                SELECT COLUMN_NAME, COLUMN_KEY
-                FROM INFORMATION_SCHEMA.COLUMNS
-                WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
-            """, (database, table))
-            columns_info = cursor.fetchall()
-            columns = [row['COLUMN_NAME'] for row in columns_info]
-            id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+        # 检查id列、主键信息(用_get_table_info)
+        columns, pk_cols, id_is_pk = self._get_table_info(database, table, id_column)
         if id_column not in columns:
             logger.warning('表无id列,跳过id重排', {"库": database, "表": table})
             return False
         # 检查主键是否为单列id
-        pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
         if len(pk_cols) != 1 or pk_cols[0].lower() != id_column.lower():
             logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
             return False
```
```diff
@@ -806,13 +956,13 @@ class MySQLDeduplicator:
             with conn.cursor() as cursor:
                 cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
                 create_table_sql = cursor.fetchone()['Create Table']
-        logger.info('开始id重排', {"库": database, "表": table, "重排列": id_column, "
+        logger.info('开始id重排', {"库": database, "表": table, "重排列": id_column, "试运行": dry_run, "DDL警告": "MySQL DDL操作不可回滚,建议提前备份!"})
         if dry_run:
             logger.info('dry_run模式,打印原表结构', {"库": database, "表": table, "建表语句": create_table_sql})
             return True
-        temp_table = self.
+        temp_table = self._make_temp_table_name(table)
         temp_table_quoted = f"`{database}`.`{temp_table}`"
-        backup_table = self.
+        backup_table = self._make_backup_table_name(table)
         backup_table_quoted = f"`{database}`.`{backup_table}`"
         try:
             with self._get_connection() as conn:
```
```diff
@@ -927,23 +1077,71 @@ class MySQLDeduplicator:
             return (prefix + suffix)[:max_length]
         return f"{prefix}{base[:remain]}{suffix}"[:max_length]
 
+    def _get_table_info(self, database: str, table: str, id_column: str = None):
+        """
+        获取表的所有列名、主键列名列表、指定id列是否为主键。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            id_column (str): id列名,默认使用self.primary_key。
+        Returns:
+            Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
+        """
+        id_column = id_column or self.primary_key
+        with self._get_connection() as conn:
+            with conn.cursor() as cursor:
+                cursor.execute("""
+                    SELECT COLUMN_NAME, COLUMN_KEY
+                    FROM INFORMATION_SCHEMA.COLUMNS
+                    WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
+                """, (database, table))
+                columns_info = cursor.fetchall()
+        columns = [row['COLUMN_NAME'] for row in columns_info]
+        pk_cols = [row['COLUMN_NAME'] for row in columns_info if row['COLUMN_KEY'] == 'PRI']
+        id_is_pk = any(row['COLUMN_NAME'].lower() == id_column.lower() and row['COLUMN_KEY'] in ('PRI', 'UNI') for row in columns_info)
+        return columns, pk_cols, id_is_pk
+
+    def _make_temp_table_name(self, base: str) -> str:
+        """
+        生成临时表名,带有 temp_ 前缀和 _dedup_ 进程线程后缀。
+        """
+        suffix = f"_dedup_{os.getpid()}_{threading.get_ident()}"
+        return self._make_safe_table_name(base, prefix="temp_", suffix=suffix)
+
+    def _make_backup_table_name(self, base: str) -> str:
+        """
+        生成备份表名,带有 backup_ 前缀和时间戳+uuid后缀。
+        """
+        suffix = f"_{int(time.time())}_{uuid.uuid4().hex[:8]}"
+        return self._make_safe_table_name(base, prefix="backup_", suffix=suffix)
+
 
 def main():
     deduplicator = MySQLDeduplicator(
         username='root',
         password='pwd',
         host='localhost',
-        port=3306
+        port=3306,
+        # date_range=['2025-05-27', '2025-05-28'],
+        exclude_tables={'推广数据2': [
+            # '地域报表_城市_2025_04',
+            # '地域报表_城市_2025_05',
+            # '地域报表_城市_2025_06',
+            '地域报表_城市_2025_04_copy1',
+            '地域报表_城市_2025_05_copy1',
+            '地域报表_城市_2025_06_copy1',
+            '主体报表_2025_copy1'
+        ]}
     )
 
     # 全库去重(单线程)
-    deduplicator.deduplicate_all(dry_run=
+    deduplicator.deduplicate_all(dry_run=True, parallel=True, reorder_id=True)
 
     # # 指定数据库去重(多线程)
-    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False)
+    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=False, reorder_id=True)
 
     # # 指定表去重(使用特定列)
-    # deduplicator.deduplicate_table('my_db', 'my_table', columns=[
+    # deduplicator.deduplicate_table('my_db', 'my_table', columns=["name", "date"], dry_run=False, reorder_id=False)
 
     # # 重排id列
     # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
```
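The two naming helpers added in this hunk centralize what used to be inlined: scratch tables are keyed by pid and thread id so concurrent workers cannot collide, and backups by timestamp plus a uuid fragment so repeated reorders never overwrite each other. A standalone sketch of the scheme; the real methods route through `_make_safe_table_name`, which trims more carefully to MySQL's 64-character identifier limit:

```python
import os
import threading
import time
import uuid

def make_temp_table_name(base: str) -> str:
    # unique per process and thread, so parallel dedup runs cannot collide
    return f"temp_{base}_dedup_{os.getpid()}_{threading.get_ident()}"[:64]

def make_backup_table_name(base: str) -> str:
    # unique per invocation, so successive reorders keep distinct backups
    return f"backup_{base}_{int(time.time())}_{uuid.uuid4().hex[:8]}"[:64]

print(make_temp_table_name('orders'))   # e.g. temp_orders_dedup_4242_140213...
print(make_backup_table_name('orders')) # e.g. backup_orders_1748300000_1a2b3c4d
```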
```diff
@@ -952,5 +1150,5 @@ def main():
     deduplicator.close()
 
 if __name__ == '__main__':
-    main()
+    # main()
     pass
```
mdbq/mysql/uploader.py
CHANGED
```diff
@@ -428,6 +428,7 @@ class MySQLUploader:
             if idx_col in set_typ:
                 safe_idx_col = self._validate_identifier(idx_col)
                 index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
+        index_defs = list(set(index_defs))
         index_sql = (',' + ','.join(index_defs)) if index_defs else ''
         sql = f"""
         CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
```
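The added `list(set(index_defs))` drops INDEX clauses that were generated twice (for example, a column listed both in `indexes` and as the partition date column), at the cost of clause order, which CREATE TABLE does not care about. In isolation:

```python
index_defs = [
    "INDEX `idx_date` (`date`)",
    "INDEX `idx_shop` (`shop`)",
    "INDEX `idx_date` (`date`)",   # duplicate clause from two configuration paths
]
index_defs = list(set(index_defs))  # dedup; ordering is not guaranteed afterwards
index_sql = (',' + ','.join(index_defs)) if index_defs else ''
assert index_sql.count('idx_date') == 1
```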
```diff
@@ -593,6 +594,34 @@ class MySQLUploader:
             logger.error('无法获取表列信息', {'库': db_name, '表': table_name, '错误': str(e)})
             raise
 
+    def _ensure_index(self, db_name: str, table_name: str, column: str):
+        """
+        确保某列有索引,如果没有则创建。
+        """
+        db_name = self._validate_identifier(db_name)
+        table_name = self._validate_identifier(table_name)
+        column = self._validate_identifier(column)
+        # 检查索引是否已存在
+        sql_check = '''
+            SELECT COUNT(1) FROM INFORMATION_SCHEMA.STATISTICS
+            WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s
+        '''
+        sql_create = f'ALTER TABLE `{db_name}`.`{table_name}` ADD INDEX `idx_{column}` (`{column}`)'
+        try:
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql_check, (db_name, table_name, column))
+                    exists = cursor.fetchone()
+                    if exists and list(exists.values())[0] > 0:
+                        logger.debug('索引已存在', {'库': db_name, '表': table_name, '列': column})
+                        return
+                    cursor.execute(sql_create)
+                    conn.commit()
+                    logger.info('已为列创建索引', {'库': db_name, '表': table_name, '列': column})
+        except Exception as e:
+            logger.error('创建索引失败', {'库': db_name, '表': table_name, '列': column, '错误': str(e)})
+            raise
+
     def _upload_to_table(
         self,
         db_name: str,
```
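`_ensure_index` is idempotent because it probes INFORMATION_SCHEMA.STATISTICS before issuing the ALTER. The same probe can be reproduced standalone (connection parameters hypothetical; note the count is also non-zero when the column merely participates in a composite index):

```python
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='pwd',
                       cursorclass=pymysql.cursors.DictCursor)
with conn.cursor() as cursor:
    cursor.execute(
        "SELECT COUNT(1) AS n FROM INFORMATION_SCHEMA.STATISTICS "
        "WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s",
        ('my_db', 'my_table', '日期')  # hypothetical schema/table; '日期' is the default date column
    )
    print(cursor.fetchone()['n'] > 0)  # True if the column is already covered by an index
conn.close()
```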
```diff
@@ -646,6 +675,13 @@ class MySQLUploader:
                     })
                     raise ValueError(f"列不存在: `{col}` -> `{db_name}`.`{table_name}`")
 
+        # 确保分表参考字段为索引
+        if date_column and date_column in table_columns:
+            try:
+                self._ensure_index(db_name, table_name, date_column)
+            except Exception as e:
+                logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': date_column, '错误': str(e)})
+
         # 插入数据
         self._insert_data(
             db_name, table_name, data, set_typ,
```
```diff
@@ -868,7 +904,7 @@ class MySQLUploader:
         :param duplicate_columns: 用于检查重复的列,可选
         :param allow_null: 是否允许空值,默认为False
         :param partition_by: 分表方式('year'、'month'、'None'),可选
-        :param partition_date_column: 用于分表的日期列名,默认为'日期'
+        :param partition_date_column: 用于分表的日期列名,默认为'日期', 默认会添加为索引
         :param auto_create: 表不存在时是否自动创建,默认为True
         :param indexes: 需要创建索引的列列表,可选
         :param update_on_duplicate: 遇到重复数据时是否更新旧数据,默认为False
```
```diff
@@ -977,6 +1013,12 @@ class MySQLUploader:
                         allow_null, auto_create, partition_date_column,
                         indexes, batch_id, update_on_duplicate, transaction_mode
                     )
+                    # 确保分表参考字段为索引
+                    if partition_date_column in filtered_set_typ:
+                        try:
+                            self._ensure_index(db_name, part_table, partition_date_column)
+                        except Exception as e:
+                            logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': part_table, '列': partition_date_column, '错误': str(e)})
                 except Exception as e:
                     logger.error('分表上传异常', {
                         '库': db_name,
```
```diff
@@ -995,6 +1037,12 @@ class MySQLUploader:
                 allow_null, auto_create, partition_date_column,
                 indexes, batch_id, update_on_duplicate, transaction_mode
             )
+            # 确保分表参考字段为索引
+            if partition_date_column in filtered_set_typ:
+                try:
+                    self._ensure_index(db_name, table_name, partition_date_column)
+                except Exception as e:
+                    logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': partition_date_column, '错误': str(e)})
 
         success_flag = True
```
{mdbq-3.11.8.dist-info → mdbq-3.11.10.dist-info}/RECORD
CHANGED
```diff
@@ -1,17 +1,17 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=L9HK2W1LgO8Zc5gpJgI1uJ5J0VRcUyMXHr1ZT-FeNOM,19
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
-mdbq/log/mylogger.py,sha256=
+mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
 mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=
+mdbq/mysql/deduplicator.py,sha256=w8etA5dAsY7g58bWU3SQt7n_OWnS9Y2TVh0D7m0MK9E,57961
 mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
 mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
-mdbq/mysql/uploader.py,sha256=
+mdbq/mysql/uploader.py,sha256=8Px_W2bYOr1wQgMXMK0DggNiuE6a6Ul4BlJake8LSo8,64469
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
-mdbq-3.11.8.dist-info/METADATA,sha256=
-mdbq-3.11.8.dist-info/WHEEL,sha256=
-mdbq-3.11.8.dist-info/top_level.txt,sha256=
-mdbq-3.11.8.dist-info/RECORD,,
+mdbq-3.11.10.dist-info/METADATA,sha256=dVhkC84iq1GWtV6onfsLj18CwfGnIo1bXXDa-TXUU1E,365
+mdbq-3.11.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.11.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.11.10.dist-info/RECORD,,
```
{mdbq-3.11.8.dist-info → mdbq-3.11.10.dist-info}/WHEEL
File without changes
{mdbq-3.11.8.dist-info → mdbq-3.11.10.dist-info}/top_level.txt
File without changes