mdbq 3.12.3__py3-none-any.whl → 3.12.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +54 -18
- mdbq/mysql/uploader.py +6 -4
- {mdbq-3.12.3.dist-info → mdbq-3.12.4.dist-info}/METADATA +1 -1
- {mdbq-3.12.3.dist-info → mdbq-3.12.4.dist-info}/RECORD +7 -7
- {mdbq-3.12.3.dist-info → mdbq-3.12.4.dist-info}/WHEEL +0 -0
- {mdbq-3.12.3.dist-info → mdbq-3.12.4.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.12.3'
|
1
|
+
VERSION = '3.12.4'
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -16,12 +16,11 @@ from datetime import datetime
|
|
16
16
|
import uuid
|
17
17
|
from contextlib import contextmanager
|
18
18
|
|
19
|
-
|
20
19
|
warnings.filterwarnings('ignore')
|
21
20
|
logger = mylogger.MyLogger(
|
22
21
|
name='deduplicator',
|
23
22
|
logging_mode='file',
|
24
|
-
log_level='
|
23
|
+
log_level='info',
|
25
24
|
log_file='deduplicator.log',
|
26
25
|
log_format='json',
|
27
26
|
max_log_size=50,
|
@@ -72,7 +71,8 @@ class MySQLDeduplicator:
|
|
72
71
|
exclude_columns: Optional[List[str]] = None,
|
73
72
|
exclude_databases: Optional[List[str]] = None,
|
74
73
|
exclude_tables: Optional[Dict[str, List[str]]] = None,
|
75
|
-
duplicate_keep_mode: str = 'keep_one'
|
74
|
+
duplicate_keep_mode: str = 'keep_one',
|
75
|
+
keep_order: str = 'min'
|
76
76
|
) -> None:
|
77
77
|
"""
|
78
78
|
初始化去重处理器
|
@@ -178,6 +178,7 @@ class MySQLDeduplicator:
|
|
178
178
|
self.exclude_tables = {k.lower(): set(t.lower() for t in v) for k, v in (exclude_tables or {}).items()}
|
179
179
|
|
180
180
|
self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
|
181
|
+
self.keep_order = keep_order if keep_order in ('min', 'max') else 'min'
|
181
182
|
|
182
183
|
def _get_connection(self) -> pymysql.connections.Connection:
|
183
184
|
"""
|
@@ -442,18 +443,31 @@ class MySQLDeduplicator:
|
|
442
443
|
|
443
444
|
# 用Python查找重复
|
444
445
|
if use_python_dedup:
|
446
|
+
# 判断分组字段是否有“更新时间”
|
447
|
+
has_update_time = any(col == '更新时间' for col in use_columns)
|
445
448
|
select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
|
446
|
-
|
449
|
+
if has_update_time:
|
450
|
+
select_cols += ',`更新时间`'
|
451
|
+
select_where = f"WHERE `{time_col}` = '{date_val}'" if date_val else ''
|
447
452
|
grouped = defaultdict(list)
|
448
453
|
for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
|
449
454
|
key = tuple(row[col] for col in use_columns)
|
450
|
-
grouped[key].append(row
|
455
|
+
grouped[key].append(row)
|
451
456
|
dup_count = 0
|
452
457
|
del_ids = []
|
453
458
|
for ids in grouped.values():
|
454
459
|
if len(ids) > 1:
|
455
460
|
dup_count += 1
|
456
|
-
|
461
|
+
if has_update_time:
|
462
|
+
# 按更新时间最大保留
|
463
|
+
keep_row = max(ids, key=lambda x: x.get('更新时间') or '')
|
464
|
+
else:
|
465
|
+
# 按id保留
|
466
|
+
if self.keep_order == 'max':
|
467
|
+
keep_row = max(ids, key=lambda x: x[pk_real])
|
468
|
+
else:
|
469
|
+
keep_row = min(ids, key=lambda x: x[pk_real])
|
470
|
+
del_ids.extend([r[pk_real] for r in ids if r[pk_real] != keep_row[pk_real]])
|
457
471
|
affected_rows = 0
|
458
472
|
if not dry_run and del_ids:
|
459
473
|
with self._conn_ctx() as conn:
|
@@ -469,9 +483,18 @@ class MySQLDeduplicator:
|
|
469
483
|
temp_table = self._make_temp_table_name(table)
|
470
484
|
drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
|
471
485
|
create_temp_where = f"WHERE `{time_col}` = '{date_val}'"
|
486
|
+
# 判断分组字段是否有“更新时间”
|
487
|
+
has_update_time = any(col == '更新时间' for col in use_columns)
|
488
|
+
if has_update_time:
|
489
|
+
keep_field = '更新时间'
|
490
|
+
keep_func = 'MAX'
|
491
|
+
else:
|
492
|
+
keep_field = pk_real
|
493
|
+
keep_func = 'MAX' if self.keep_order == 'max' else 'MIN'
|
494
|
+
keep_alias = 'keep_val'
|
472
495
|
create_temp_sql = f"""
|
473
496
|
CREATE TABLE `{database}`.`{temp_table}` AS
|
474
|
-
SELECT
|
497
|
+
SELECT {keep_func}(`{keep_field}`) as `{keep_alias}`, {column_list}, COUNT(*) as `dup_count`
|
475
498
|
FROM `{database}`.`{table}`
|
476
499
|
{create_temp_where}
|
477
500
|
GROUP BY {column_list}
|
@@ -494,7 +517,7 @@ class MySQLDeduplicator:
|
|
494
517
|
while True:
|
495
518
|
where_clauses = []
|
496
519
|
if self.duplicate_keep_mode == 'keep_one':
|
497
|
-
where_clauses.append(f"t.`{
|
520
|
+
where_clauses.append(f"t.`{keep_field}` <> tmp.`{keep_alias}`")
|
498
521
|
if where_sql.strip():
|
499
522
|
where_clauses.append(where_sql.strip())
|
500
523
|
where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
|
@@ -752,7 +775,7 @@ class MySQLDeduplicator:
|
|
752
775
|
logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
|
753
776
|
total_dup += dup_count
|
754
777
|
total_del += affected_rows
|
755
|
-
logger.
|
778
|
+
logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}", "唯一列": columns})
|
756
779
|
# 自动重排id列(仅当有实际删除时且reorder_id为True)
|
757
780
|
if reorder_id and total_del > 0:
|
758
781
|
try:
|
@@ -761,11 +784,11 @@ class MySQLDeduplicator:
|
|
761
784
|
except Exception as e:
|
762
785
|
logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
|
763
786
|
if affected_rows > 0:
|
764
|
-
logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
|
787
|
+
logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del, "唯一列": columns})
|
765
788
|
return (total_dup, total_del)
|
766
789
|
# 没有date_column,直接全表去重
|
767
790
|
result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
|
768
|
-
logger.
|
791
|
+
logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表', "唯一列": columns})
|
769
792
|
dup_count, affected_rows = result
|
770
793
|
if reorder_id and affected_rows > 0:
|
771
794
|
try:
|
@@ -774,7 +797,7 @@ class MySQLDeduplicator:
|
|
774
797
|
except Exception as e:
|
775
798
|
logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
|
776
799
|
if affected_rows > 0:
|
777
|
-
logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
|
800
|
+
logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows, "唯一列": columns})
|
778
801
|
return result
|
779
802
|
except Exception as e:
|
780
803
|
logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
|
@@ -1314,18 +1337,24 @@ class MySQLDeduplicator:
|
|
1314
1337
|
|
1315
1338
|
|
1316
1339
|
def main():
|
1340
|
+
from mdbq.config import config
|
1341
|
+
dir_path = os.path.expanduser("~")
|
1342
|
+
my_cont = config.read_config(file_path=os.path.join(dir_path, 'spd.txt'))
|
1343
|
+
username, password, host, port = my_cont['username'], my_cont['password'], my_cont['host'], int(my_cont['port'])
|
1344
|
+
# host = 'localhost'
|
1345
|
+
|
1317
1346
|
deduplicator = MySQLDeduplicator(
|
1318
|
-
username=
|
1319
|
-
password=
|
1320
|
-
host=
|
1321
|
-
port=
|
1347
|
+
username=username,
|
1348
|
+
password=password,
|
1349
|
+
host=host,
|
1350
|
+
port=port,
|
1322
1351
|
max_workers= 2,
|
1323
1352
|
batch_size=1000,
|
1324
1353
|
skip_system_dbs=True,
|
1325
1354
|
max_retries=3,
|
1326
1355
|
retry_waiting_time=5,
|
1327
1356
|
pool_size=30,
|
1328
|
-
recent_month=1,
|
1357
|
+
# recent_month=1,
|
1329
1358
|
# date_range=['2025-06-09', '2025-06-10'],
|
1330
1359
|
exclude_columns=['更新时间'],
|
1331
1360
|
# exclude_databases=['测试库4'],
|
@@ -1338,6 +1367,7 @@ def main():
|
|
1338
1367
|
# "商品排行_2025",
|
1339
1368
|
# ],
|
1340
1369
|
# },
|
1370
|
+
keep_order='MAX', # 保留重复组中指定列的最大值
|
1341
1371
|
)
|
1342
1372
|
|
1343
1373
|
# 全库去重(单线程)
|
@@ -1347,7 +1377,13 @@ def main():
|
|
1347
1377
|
# deduplicator.deduplicate_database('数据引擎2', dry_run=False, parallel=True, reorder_id=True)
|
1348
1378
|
|
1349
1379
|
# # 指定表去重(使用特定列)
|
1350
|
-
deduplicator.deduplicate_table(
|
1380
|
+
deduplicator.deduplicate_table(
|
1381
|
+
'京东数据3',
|
1382
|
+
'推广数据_京准通_2024',
|
1383
|
+
columns=['日期', '店铺名称', '产品线', '触发sku_id', '跟单sku_id', 'spu_id', '花费', '展现数', '点击数'],
|
1384
|
+
dry_run=False,
|
1385
|
+
reorder_id=True,
|
1386
|
+
)
|
1351
1387
|
|
1352
1388
|
# # 重排id列
|
1353
1389
|
# deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
|
mdbq/mysql/uploader.py
CHANGED
@@ -323,7 +323,7 @@ class MySQLUploader:
|
|
323
323
|
logger.error('无效的标识符', {'标识符': identifier})
|
324
324
|
raise ValueError(f"无效的标识符: `{identifier}`")
|
325
325
|
# 始终做特殊字符清理
|
326
|
-
cleaned = re.sub(r'[
|
326
|
+
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
|
327
327
|
cleaned = re.sub(r'_+', '_', cleaned).strip('_')
|
328
328
|
if not cleaned:
|
329
329
|
logger.error('无法清理异常标识符', {'原始标识符': identifier})
|
@@ -332,6 +332,8 @@ class MySQLUploader:
|
|
332
332
|
'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
|
333
333
|
'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
|
334
334
|
}
|
335
|
+
if len(cleaned) > 64:
|
336
|
+
cleaned = cleaned[:64]
|
335
337
|
if cleaned.lower() in mysql_keywords:
|
336
338
|
logger.debug('存在MySQL保留字', {'标识符': cleaned})
|
337
339
|
return f"`{cleaned}`"
|
@@ -423,11 +425,11 @@ class MySQLUploader:
|
|
423
425
|
# UNIQUE KEY定义
|
424
426
|
unique_defs = []
|
425
427
|
if unique_keys:
|
426
|
-
for
|
428
|
+
for unique_cols in unique_keys:
|
427
429
|
if not unique_cols:
|
428
430
|
continue
|
429
431
|
safe_unique_cols = [self._normalize_col(col) for col in unique_cols]
|
430
|
-
unique_name = f"uniq_{'_'.join(safe_unique_cols)}
|
432
|
+
unique_name = f"uniq_{'_'.join(safe_unique_cols)}"
|
431
433
|
unique_defs.append(f"UNIQUE KEY `{unique_name}` (`{'`,`'.join(safe_unique_cols)}`)")
|
432
434
|
index_defs = list(set(index_defs))
|
433
435
|
all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
|
@@ -651,7 +653,7 @@ class MySQLUploader:
|
|
651
653
|
添加UNIQUE KEY
|
652
654
|
"""
|
653
655
|
safe_cols = [self._normalize_col(col) for col in unique_cols]
|
654
|
-
unique_name = f"uniq_{'_'.join(safe_cols)}
|
656
|
+
unique_name = f"uniq_{'_'.join(safe_cols)}"
|
655
657
|
sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({','.join(f'`{col}`' for col in safe_cols)})'
|
656
658
|
try:
|
657
659
|
with self._get_connection() as conn:
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=V3m3JOLEsFbctTVRE9dNc1iuNQahT_FbWxcHtAoWWqc,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -8,10 +8,10 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
|
8
8
|
mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
|
9
9
|
mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
|
-
mdbq/mysql/deduplicator.py,sha256=
|
11
|
+
mdbq/mysql/deduplicator.py,sha256=MYcNe0rwWOFS2Bqac8yGwwothlz8H--cOi6yuZf3qIs,72602
|
12
12
|
mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
|
13
13
|
mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
|
14
|
-
mdbq/mysql/uploader.py,sha256=
|
14
|
+
mdbq/mysql/uploader.py,sha256=ekpPaJypnuwxi2v42e-khqwT_eZ5LRl1ylQP492xbkk,70271
|
15
15
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
16
16
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
17
17
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
24
24
|
mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
|
25
25
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
26
26
|
mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
|
27
|
-
mdbq-3.12.
|
28
|
-
mdbq-3.12.
|
29
|
-
mdbq-3.12.
|
30
|
-
mdbq-3.12.
|
27
|
+
mdbq-3.12.4.dist-info/METADATA,sha256=bptLkLuByUNJJnZ_ruWRjxeG2LOFvaqK26M3MtQkf78,364
|
28
|
+
mdbq-3.12.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
29
|
+
mdbq-3.12.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
30
|
+
mdbq-3.12.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|