mdbq 3.12.3__py3-none-any.whl → 3.12.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +68 -20
- mdbq/mysql/unique_.py +379 -0
- mdbq/mysql/uploader.py +6 -4
- mdbq/spider/aikucun.py +1 -0
- {mdbq-3.12.3.dist-info → mdbq-3.12.5.dist-info}/METADATA +1 -1
- {mdbq-3.12.3.dist-info → mdbq-3.12.5.dist-info}/RECORD +9 -8
- {mdbq-3.12.3.dist-info → mdbq-3.12.5.dist-info}/WHEEL +0 -0
- {mdbq-3.12.3.dist-info → mdbq-3.12.5.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '3.12.3'
+VERSION = '3.12.5'
mdbq/mysql/deduplicator.py
CHANGED
@@ -16,12 +16,11 @@ from datetime import datetime
 import uuid
 from contextlib import contextmanager

-
 warnings.filterwarnings('ignore')
 logger = mylogger.MyLogger(
     name='deduplicator',
-    logging_mode='
-    log_level='
+    logging_mode='both',
+    log_level='info',
     log_file='deduplicator.log',
     log_format='json',
     max_log_size=50,
@@ -72,7 +71,8 @@ class MySQLDeduplicator:
         exclude_columns: Optional[List[str]] = None,
         exclude_databases: Optional[List[str]] = None,
         exclude_tables: Optional[Dict[str, List[str]]] = None,
-        duplicate_keep_mode: str = 'keep_one'
+        duplicate_keep_mode: str = 'keep_one',
+        keep_order: str = 'min'
     ) -> None:
         """
         Initialize the deduplicator
@@ -178,6 +178,7 @@ class MySQLDeduplicator:
         self.exclude_tables = {k.lower(): set(t.lower() for t in v) for k, v in (exclude_tables or {}).items()}

         self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
+        self.keep_order = keep_order if keep_order in ('min', 'max') else 'min'

     def _get_connection(self) -> pymysql.connections.Connection:
         """
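The new keep_order option decides which row of each duplicate group survives when the grouping columns do not include 更新时间. A minimal usage sketch (connection values are placeholders, not taken from this diff):

    from mdbq.mysql.deduplicator import MySQLDeduplicator

    deduplicator = MySQLDeduplicator(
        username='user', password='***', host='127.0.0.1', port=3306,
        duplicate_keep_mode='keep_one',   # or 'remove_all'
        keep_order='max',                 # keep the row with the largest primary key; anything else falls back to 'min'
    )
    deduplicator.deduplicate_table('my_db', 'my_table', columns=['日期', '商品id'], dry_run=True)
    deduplicator.close()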
@@ -442,18 +443,31 @@ class MySQLDeduplicator:

         # Find duplicates in Python
         if use_python_dedup:
+            # Check whether the grouping columns include '更新时间'
+            has_update_time = any(col == '更新时间' for col in use_columns)
             select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
-
+            if has_update_time:
+                select_cols += ',`更新时间`'
+            select_where = f"WHERE `{time_col}` = '{date_val}'" if date_val else ''
             grouped = defaultdict(list)
             for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
                 key = tuple(row[col] for col in use_columns)
-                grouped[key].append(row
+                grouped[key].append(row)
             dup_count = 0
             del_ids = []
             for ids in grouped.values():
                 if len(ids) > 1:
                     dup_count += 1
-
+                    if has_update_time:
+                        # Keep the row with the largest 更新时间
+                        keep_row = max(ids, key=lambda x: x.get('更新时间') or '')
+                    else:
+                        # Keep by id
+                        if self.keep_order == 'max':
+                            keep_row = max(ids, key=lambda x: x[pk_real])
+                        else:
+                            keep_row = min(ids, key=lambda x: x[pk_real])
+                    del_ids.extend([r[pk_real] for r in ids if r[pk_real] != keep_row[pk_real]])
             affected_rows = 0
             if not dry_run and del_ids:
                 with self._conn_ctx() as conn:
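To make the new keep rules concrete, a small self-contained illustration of how one duplicate group is resolved (rows and column names are made up; the primary key column stands in for pk_real):

    rows = [
        {'id': 1, '日期': '2025-06-09', '商品id': 'A1', '更新时间': '2025-06-09 08:00:00'},
        {'id': 2, '日期': '2025-06-09', '商品id': 'A1', '更新时间': '2025-06-10 09:30:00'},
    ]
    # When an '更新时间' column is present, the freshest row wins regardless of keep_order.
    keep_row = max(rows, key=lambda x: x.get('更新时间') or '')
    del_ids = [r['id'] for r in rows if r['id'] != keep_row['id']]
    print(keep_row['id'], del_ids)   # 2 [1]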
@@ -469,9 +483,18 @@
             temp_table = self._make_temp_table_name(table)
             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
             create_temp_where = f"WHERE `{time_col}` = '{date_val}'"
+            # Check whether the grouping columns include '更新时间'
+            has_update_time = any(col == '更新时间' for col in use_columns)
+            if has_update_time:
+                keep_field = '更新时间'
+                keep_func = 'MAX'
+            else:
+                keep_field = pk_real
+                keep_func = 'MAX' if self.keep_order == 'max' else 'MIN'
+            keep_alias = 'keep_val'
             create_temp_sql = f"""
                 CREATE TABLE `{database}`.`{temp_table}` AS
-                SELECT
+                SELECT {keep_func}(`{keep_field}`) as `{keep_alias}`, {column_list}, COUNT(*) as `dup_count`
                 FROM `{database}`.`{table}`
                 {create_temp_where}
                 GROUP BY {column_list}
@@ -494,7 +517,7 @@
             while True:
                 where_clauses = []
                 if self.duplicate_keep_mode == 'keep_one':
-                    where_clauses.append(f"t.`{
+                    where_clauses.append(f"t.`{keep_field}` <> tmp.`{keep_alias}`")
                 if where_sql.strip():
                     where_clauses.append(where_sql.strip())
                 where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
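Together, these two hunks keep one representative row per group: the temp table records MAX/MIN of the keep column, and rows whose keep column differs from that value are the ones deleted. A sketch of how the temp-table statement renders for hypothetical values (database, table, and columns are made up, not taken verbatim from the file):

    database, table, temp_table = 'mydb', 'orders', 'tmp_dedup_xxx'
    column_list = '`日期`,`商品id`'
    keep_func, keep_field, keep_alias = 'MIN', 'id', 'keep_val'
    create_temp_where = "WHERE `日期` = '2025-06-09'"
    create_temp_sql = f"""
        CREATE TABLE `{database}`.`{temp_table}` AS
        SELECT {keep_func}(`{keep_field}`) as `{keep_alias}`, {column_list}, COUNT(*) as `dup_count`
        FROM `{database}`.`{table}`
        {create_temp_where}
        GROUP BY {column_list}
    """
    print(create_temp_sql)   # the keep-one delete condition then becomes: t.`id` <> tmp.`keep_val`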
@@ -705,6 +728,18 @@ class MySQLDeduplicator:
             })
         all_columns = self._get_table_columns(database, table)
         all_columns_lower = [col.lower() for col in all_columns]
+        # Validate the requested columns
+        if columns:
+            invalid_columns = [col for col in columns if col.lower() not in all_columns_lower]
+            if invalid_columns:
+                logger.warning('columns中存在表字段不存在的列,跳过该表', {
+                    "库": database,
+                    "表": table,
+                    "columns": columns,
+                    "实际表字段": all_columns,
+                    "缺失字段": invalid_columns
+                })
+                return (0, 0)
         time_col = self.date_column
         time_col_lower = time_col.lower() if time_col else None
         # If columns is provided and does not include date_column, skip the per-day split and dedupe the whole table directly
@@ -752,7 +787,7 @@
                     logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
                 total_dup += dup_count
                 total_del += affected_rows
-            logger.
+            logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}", "唯一列": columns})
             # Automatically reorder the id column (only when rows were actually deleted and reorder_id is True)
             if reorder_id and total_del > 0:
                 try:
@@ -761,11 +796,11 @@
                 except Exception as e:
                     logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
             if affected_rows > 0:
-                logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
+                logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del, "唯一列": columns})
             return (total_dup, total_del)
         # No date_column: dedupe the whole table directly
         result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
-        logger.
+        logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表', "唯一列": columns})
         dup_count, affected_rows = result
         if reorder_id and affected_rows > 0:
             try:
@@ -774,7 +809,7 @@ class MySQLDeduplicator:
             except Exception as e:
                 logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
             if affected_rows > 0:
-                logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
+                logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows, "唯一列": columns})
             return result
         except Exception as e:
             logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
@@ -1314,18 +1349,24 @@ class MySQLDeduplicator:


 def main():
+    from mdbq.config import config
+    dir_path = os.path.expanduser("~")
+    my_cont = config.read_config(file_path=os.path.join(dir_path, 'spd.txt'))
+    username, password, host, port = my_cont['username'], my_cont['password'], my_cont['host'], int(my_cont['port'])
+    # host = 'localhost'
+
     deduplicator = MySQLDeduplicator(
-        username=
-        password=
-        host=
-        port=
+        username=username,
+        password=password,
+        host=host,
+        port=port,
         max_workers= 2,
         batch_size=1000,
         skip_system_dbs=True,
         max_retries=3,
         retry_waiting_time=5,
         pool_size=30,
-        recent_month=1,
+        # recent_month=1,
         # date_range=['2025-06-09', '2025-06-10'],
         exclude_columns=['更新时间'],
         # exclude_databases=['测试库4'],
@@ -1338,6 +1379,7 @@ def main():
         #     "商品排行_2025",
         #     ],
         # },
+        keep_order='MAX',  # keep the largest value of the keep column within each duplicate group
     )

     # Deduplicate all databases (single-threaded)
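Note on the 'MAX' value used here: the __init__ hunk earlier in this diff only recognises the lower-case strings, so anything else silently falls back to 'min':

    keep_order = 'MAX'
    normalized = keep_order if keep_order in ('min', 'max') else 'min'   # same check as in __init__
    print(normalized)   # 'min' — pass keep_order='max' (lower-case) to actually keep the largest id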
@@ -1347,7 +1389,13 @@ def main():
     # deduplicator.deduplicate_database('数据引擎2', dry_run=False, parallel=True, reorder_id=True)

     # # Dedupe a specific table (using specific columns)
-    deduplicator.deduplicate_table(
+    # deduplicator.deduplicate_table(
+    #     '达摩盘3',
+    #     '货品洞察_全店单品_2024_11',
+    #     columns=['日期', '店铺名称', '数据周期', '商品id'],
+    #     dry_run=False,
+    #     reorder_id=True,
+    # )

     # # Reorder the id column
     # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
@@ -1356,5 +1404,5 @@ def main():
     deduplicator.close()


 if __name__ == '__main__':
-
+    main()
     pass
mdbq/mysql/unique_.py
ADDED
@@ -0,0 +1,379 @@
+import re
+import pymysql
+from typing import List, Dict, Any, Tuple
+from mdbq.log import mylogger
+from mdbq.config import config
+from dbutils.pooled_db import PooledDB
+import os
+
+logger = mylogger.MyLogger(
+    name='unique_',
+    logging_mode='file',
+    log_level='debug',
+    log_file='unique_.log',
+    log_format='json',
+    max_log_size=50,
+    backup_count=5,
+    enable_async=False,  # whether to enable asynchronous logging
+    sample_rate=1,  # sampling for DEBUG/INFO logs; 0.5 means 50% of logs are sampled
+    sensitive_fields=[],  # list of sensitive fields
+    enable_metrics=False,  # whether to enable performance metrics
+)
+
+class UniqueManager:
+    """
+    Batch tool for adding MySQL unique constraints
+    """
+    def __init__(self, username: str, password: str, host: str, port: int = 3306):
+        """
+        Initialize the MySQL connection parameters and logging, and create the connection pool
+        """
+        self.username = username
+        self.password = password
+        self.host = host
+        self.port = port
+        self.pool = PooledDB(
+            creator=pymysql,
+            maxconnections=10,
+            mincached=2,
+            maxcached=5,
+            blocking=True,
+            host=self.host,
+            user=self.username,
+            password=self.password,
+            port=self.port,
+            charset='utf8mb4',
+            autocommit=True
+        )
+
+    def add_unique(self, my_databases: List[Dict[str, Any]]) -> None:
+        """
+        Main entry point: walk every database and table and add the unique constraints in batch
+        """
+        total_databases, success_cnt, fail_cnt, skip_cnt, detail_results = 0, 0, 0, 0, []
+        for db_group in my_databases:
+            for db_name, tables in db_group.items():
+                total_databases += 1
+                db_result = self._process_database(db_name, tables)
+                success_cnt += db_result['success_cnt']
+                fail_cnt += db_result['fail_cnt']
+                skip_cnt += db_result['skip_cnt']
+                detail_results.extend(db_result['details'])
+        # Group the detailed results
+        success_list = [d for d in detail_results if d.get('result') == '成功']
+        fail_list = [d for d in detail_results if d.get('result') == '失败']
+        skip_list = [d for d in detail_results if d.get('result') == '跳过']
+        total_tables = len(success_list) + len(fail_list) + len(skip_list)  # number of tables processed
+        if success_list:
+            logger.info('成功表', {
+                '数量': len(success_list),
+                '详情': success_list
+            })
+        if fail_list:
+            logger.error('失败表', {
+                '数量': len(fail_list),
+                '详情': fail_list
+            })
+        if skip_list:
+            logger.info('跳过表', {
+                '数量': len(skip_list),
+                '详情': skip_list
+            })
+        logger.info('全部执行完成', {
+            '库统计': total_databases,
+            '表统计': total_tables,
+            '成功': success_cnt,
+            '失败': fail_cnt,
+            '跳过': skip_cnt
+        })
+
+    def _process_database(self, db_name: str, tables: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Process every table in a single database; table names support fuzzy matching, scoped to the current database
+        """
+        # Track every table name that actually gets processed
+        processed_tables = set()
+        success_cnt, fail_cnt, skip_cnt = 0, 0, 0
+        details = []
+        # Fetch all table names in the current database
+        conn = self.pool.connection()
+        try:
+            with conn.cursor() as cursor:
+                cursor.execute(f"USE `{db_name}`")
+                cursor.execute("SHOW TABLES")
+                all_tables = [row[0] for row in cursor.fetchall()]
+        finally:
+            conn.close()
+        all_table_count = len(all_tables)  # total number of tables in this database
+        # Fuzzy matching is done only within the current db_name
+        for table_pattern, unique_keys_list in tables.items():
+            # Support the wildcards * and ? by converting the pattern to a regex
+            if '*' in table_pattern or '?' in table_pattern:
+                regex = re.compile('^' + table_pattern.replace('*', '.*').replace('?', '.') + '$')
+                matched_tables = [t for t in all_tables if regex.match(t)]
+            else:
+                # Partial matches (e.g. '明细') are also supported
+                matched_tables = [t for t in all_tables if table_pattern in t]
+            if table_pattern in all_tables:
+                matched_tables.append(table_pattern)
+            matched_tables = list(set(matched_tables))
+            if not matched_tables:
+                logger.warning('未找到匹配的数据表', {'库': db_name, '表模式': table_pattern})
+                skip_cnt += 1
+                details.append({'库': db_name, '表': table_pattern, 'result': '跳过'})
+                continue
+            for real_table in matched_tables:
+                processed_tables.add(real_table)
+                try:
+                    res = self._process_table(db_name, real_table, unique_keys_list)
+                    success_cnt += res['success_cnt']
+                    fail_cnt += res['fail_cnt']
+                    skip_cnt += res['skip_cnt']
+                    details.extend(res['details'])
+                except Exception as e:
+                    logger.error('唯一约束失败', {'库': db_name, '表': real_table, 'error': str(e)})
+                    fail_cnt += 1
+                    details.append({'库': db_name, '表': real_table, 'result': '失败'})
+        table_count = len(processed_tables)
+        return {'table_count': table_count, 'all_table_count': all_table_count, 'success_cnt': success_cnt, 'fail_cnt': fail_cnt, 'skip_cnt': skip_cnt, 'details': details}
+
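Table patterns may be wildcards or plain substrings; a quick illustration of the wildcard-to-regex conversion used above (the pattern and table names are made up):

    import re

    table_pattern = '市场排行_*'
    regex = re.compile('^' + table_pattern.replace('*', '.*').replace('?', '.') + '$')
    all_tables = ['市场排行_2024', '市场排行_2025', '商品排行_2025']
    print([t for t in all_tables if regex.match(t)])   # ['市场排行_2024', '市场排行_2025']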
+    def _process_table(self, db_name: str, table_name: str, unique_keys_list: List[List[str]]) -> Dict[str, Any]:
+        """
+        Process every unique constraint for a single table and return its success/failure/skip counts plus details.
+        Also fixes the issue where the original constraint was not dropped after a unique constraint was renamed.
+        """
+        success_cnt, fail_cnt, skip_cnt = 0, 0, 0
+        details = []
+        conn = self.pool.connection()
+        try:
+            with conn.cursor() as cursor:
+                cursor.execute(f"USE `{db_name}`")
+                # Fetch information about all unique indexes
+                cursor.execute(f"SHOW INDEX FROM `{table_name}` WHERE Non_unique=0")
+                indexes = cursor.fetchall()
+                from collections import defaultdict
+                key_columns = defaultdict(list)
+                key_names = set()
+                for idx in indexes:
+                    key_name = idx[2]
+                    col_name = idx[4]
+                    seq_in_index = idx[3]
+                    key_columns[key_name].append((seq_in_index, col_name))  # SEQ_IN_INDEX, COLUMN_NAME
+                    key_names.add(key_name)
+                # Count the unique indexes
+                unique_count = len(key_columns)
+                if unique_count >= 20:
+                    logger.warning('唯一索引数量超限,跳过全部', {'库': db_name, '表': table_name, '唯一索引数': unique_count})
+                    for unique_cols in unique_keys_list:
+                        clean_cols = [self._clean_column_name(col) for col in unique_cols]
+                        details.append({'库': db_name, '表': table_name, '唯一约束': clean_cols, 'result': '跳过', '原因': '唯一索引数量超限'})
+                        skip_cnt += 1
+                    return {'success_cnt': success_cnt, 'fail_cnt': fail_cnt, 'skip_cnt': skip_cnt, 'details': details}
+                for idx, unique_cols in enumerate(unique_keys_list):
+                    clean_cols = [self._clean_column_name(col) for col in unique_cols]
+                    target_name = self._gen_constraint_name(table_name, clean_cols, idx)
+                    # Check whether a unique index with the same column combination already exists (order must match)
+                    found = False
+                    found_key_name = None
+                    for kname, col_seq_list in key_columns.items():
+                        sorted_cols = [col for _, col in sorted(col_seq_list)]
+                        if sorted_cols == clean_cols:
+                            found = True
+                            found_key_name = kname
+                            break
+                    if found:
+                        if found_key_name == target_name:
+                            # Same name and same columns: skip
+                            skip_cnt += 1
+                            details.append({'库': db_name, '表': table_name, '唯一约束': clean_cols, 'result': '跳过', '原因': '名称和字段都相同'})
+                        else:
+                            # Same columns but a different name: rename (drop first, then add, so the original unique constraint really is removed)
+                            try:
+                                cursor.execute(f"ALTER TABLE `{table_name}` DROP INDEX `{found_key_name}`")
+                                # Refresh the index info so later checks are not misled
+                                cursor.execute(f"SHOW INDEX FROM `{table_name}` WHERE Non_unique=0")
+                                # Then add the new unique constraint
+                                self._add_unique(cursor, table_name, clean_cols, target_name)
+                                logger.info('唯一约束重命名成功', {'库': db_name, '表': table_name, '唯一约束': clean_cols, '原名': found_key_name, '新名': target_name})
+                                success_cnt += 1
+                                details.append({'库': db_name, '表': table_name, '唯一约束': clean_cols, 'result': '成功', '操作': '重命名', '原名': found_key_name, '新名': target_name})
+                            except Exception as e:
+                                logger.error('唯一约束重命名失败', {'库': db_name, '表': table_name, '唯一约束': clean_cols, '原名': found_key_name, '新名': target_name, 'error': str(e)})
+                                fail_cnt += 1
+                                details.append({'库': db_name, '表': table_name, '唯一约束': clean_cols, 'result': '失败', '操作': '重命名', '原名': found_key_name, '新名': target_name, 'error': str(e)})
+                    else:
+                        # The column combination does not exist yet: add it directly
+                        try:
+                            self._add_unique(cursor, table_name, clean_cols, target_name)
+                            logger.info('添加唯一约束成功', {'库': db_name, '表': table_name, '唯一约束': clean_cols})
+                            success_cnt += 1
+                            details.append({'库': db_name, '表': table_name, '唯一约束': clean_cols, 'result': '成功', '操作': '添加'})
+                        except Exception as e:
+                            err_str = str(e)
+                            if 'Duplicate key name' in err_str:
+                                skip_cnt += 1
+                                details.append({'库': db_name, '表': table_name, '唯一约束': clean_cols, 'result': '跳过', '原因': '唯一约束名已存在'})
+                                logger.info('唯一约束名已存在,跳过', {'库': db_name, '表': table_name, '唯一约束': clean_cols, 'error': err_str})
+                            else:
+                                logger.error('添加唯一约束失败', {'库': db_name, '表': table_name, '唯一约束': clean_cols, 'error': err_str})
+                                fail_cnt += 1
+                                details.append({'库': db_name, '表': table_name, '唯一约束': clean_cols, 'result': '失败', '操作': '添加', 'error': err_str})
+        finally:
+            conn.close()
+        return {'success_cnt': success_cnt, 'fail_cnt': fail_cnt, 'skip_cnt': skip_cnt, 'details': details}
+
+    def _clean_column_name(self, col: str) -> str:
+        """
+        Supports Chinese and English column names: strips illegal characters, keeping only Chinese/English letters, digits and underscores, and lower-cases the result
+        """
+        col = col.strip()
+        col = re.sub(r'[^\w\u4e00-\u9fff$]', '_', col)
+        col = re.sub(r'_+', '_', col).strip('_')
+        col = col.lower()
+        if len(col) > 64:
+            col = col[:64]
+        return col
+
+    def _gen_constraint_name(self, table: str, cols: List[str], idx: int) -> str:
+        """
+        Generate the unique constraint name (at most 64 characters); all column names are normalized first so the name matches the actual index columns
+        """
+        base = f"uniq"
+        for col in cols:
+            clean_col = self._clean_column_name(col)
+            base += f"_{clean_col}"
+        if len(base) > 64:
+            base = base[:63] + 'x'
+        return base
+
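A quick, runnable illustration of what this normalization does to a column name (the input is made up):

    import re

    col = ' 商品 ID '.strip()
    col = re.sub(r'[^\w\u4e00-\u9fff$]', '_', col)    # keeps Chinese, \w characters and '$'; everything else becomes '_'
    col = re.sub(r'_+', '_', col).strip('_').lower()
    print(col)                                        # 商品_id — the constraint name would then start with 'uniq_商品_id'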
+    def _unique_exists(self, cursor, table: str, cols: List[str]) -> bool:
+        """
+        Check whether a unique constraint already exists; multi-column unique constraints are supported
+        """
+        sql = f"SHOW INDEX FROM `{table}` WHERE Non_unique=0"
+        cursor.execute(sql)
+        indexes = cursor.fetchall()
+        # MySQL returns one row per column, so group by Key_name and collect every column of each unique index
+        from collections import defaultdict
+        key_columns = defaultdict(list)
+        for idx in indexes:
+            key_name = idx[2]  # Key_name
+            col_name = idx[4]  # Column_name
+            key_columns[key_name].append(col_name)
+        for col_list in key_columns.values():
+            if set(col_list) == set(cols) and len(col_list) == len(cols):
+                return True
+        return False
+
+    def _add_unique(self, cursor, table: str, cols: List[str], constraint_name: str) -> None:
+        """
+        Add a unique constraint
+        """
+        cols_sql = ','.join([f'`{c}`' for c in cols])
+        sql = f"ALTER TABLE `{table}` ADD CONSTRAINT `{constraint_name}` UNIQUE ({cols_sql})"
+        cursor.execute(sql)
+
+
+def main():
+    dir_path = os.path.expanduser("~")
+    my_cont = config.read_config(file_path=os.path.join(dir_path, 'spd.txt'))
+    username, password, host, port = my_cont['username'], my_cont['password'], my_cont['host'], int(my_cont['port'])
+    # host = 'localhost'
+
+    my_databases = [
+        {
+            # '京东数据3': {
+            #     "u_商品明细": [['日期', '店铺名称', '商品id', '访客数', '浏览量']],
+            #     "商智_店铺来源": [['日期', '店铺名称', '一级来源', '二级来源', '三级来源', '访客数', '浏览量']],
+            #     '推广数据_京准通': [['日期', '店铺名称', '产品线', '触发sku_id', '跟单sku_id', 'spu_id', '花费', '展现数', '点击数']],
+            #     '推广数据_关键词报表': [['日期', '店铺名称', '产品线', '计划id', '搜索词', '关键词', '花费', '展现数', '点击数']],
+            #     '推广数据_搜索词报表': [['日期', '店铺名称', '产品线', '搜索词', '花费', '展现数', '点击数']],
+            #     '推广数据_全站营销': [['日期', '店铺名称', '产品线', '花费']],
+            # },
+            # "人群画像2": {
+            #     "*": [['日期', '账户id', '人群id', '画像id', '标签id']],
+            # },
+            # "属性设置3": {
+            #     "京东商品属性": [['sku_id']],
+            #     "商品sku属性": [['日期', 'sku_id']],
+            #     "商品主图视频": [['日期', '商品主图', '750主图', '商品视频']],
+            #     "商品类目属性": [['日期', '商品id']],
+            #     "商品素材中心": [['商品id']],
+            #     "商品索引表_主推排序调用": [['商品id']],
+            #     "地理区域": [['省份']],
+            #     "城市等级": [['城市']],
+            #     "货品年份基准": [['平台', '上市年份']],
+            # },
+            # "市场数据3": {
+            #     "京东_商家榜单": [['日期', '分类', '类型', '店铺名称', '成交金额指数']],
+            #     "市场排行_2025": [['日期', '接口类型', '类目等级', '类目名称', '商品id']],
+            #     "搜索流失_细分单品": [['日期', '店铺名称', '分类', '商品id', '竞品id', '竞店id', '统计周期']],
+            #     "搜索流失榜单": [['日期', '店铺名称', '分类', '商品id', '统计周期']],
+            #     "浏览流失_细分单品": [['日期', '店铺名称', '分类', '商品id', '竞品id', '竞店id', '统计周期']],
+            #     "浏览流失榜单": [['日期', '店铺名称', '分类', '商品id', '统计周期']],
+            #     "淘宝店铺数据": [['日期', '店铺id', '商品id']],
+            #     "竞店流失": [['日期', '店铺名称', '竞店商家id']],
+            # },
+            # "数据引擎2": {
+            #     "供给投入": [['日期', '报告id', '品牌ID', '类目Id', '指标名称', '父级指标']],
+            #     "新老客贡献": [['日期', '报告id', '品牌ID', '类目Id']],
+            #     "进店搜索词": [['日期', '报告id', '品牌ID', '搜索词', '类目Id']],
+            # },
+            # "爱库存2": {
+            #     "sku榜单": [['日期', '平台', '店铺名称', '条码']],
+            #     "spu榜单": [['日期', '平台', '店铺名称', '商品款号', '访客量']],
+            # },
+            # "生意参谋3": {
+            #     "crm成交客户": [['客户id']],
+            #     "商品排行": [['日期', '店铺名称', '商品id']],
+            #     "流量来源构成": [['日期', '店铺名称', '来源构成', '类别', '一级来源', '二级来源', '三级来源']],
+            #     "手淘搜索": [['日期', '店铺名称', '搜索词', '词类型', '访客数']],
+            #     "新品追踪": [['日期', '店铺名称', '商品id']],
+            #     "直播分场次效果": [['场次id']],
+            # },
+            # "生意经3": {
+            #     "sku销量_按名称": [['日期', '店铺名称', '宝贝id', 'sku名称', '销售额']],
+            #     "sku销量_按商家编码": [['日期', '店铺名称', '宝贝id', 'sku编码', '销售额']],
+            #     "地域分析_城市": [['日期', '店铺名称', '城市', '销售额']],
+            #     "地域分析_省份": [['日期', '店铺名称', '省份', '销售额']],
+            #     "宝贝指标": [['日期', '店铺名称', '宝贝id', '销售额']],
+            #     "店铺销售指标": [['日期', '店铺名称', '销售额']],
+            #     "订单数据": [['日期', '店铺名称', '订单号', '商品链接', '净销售额_已扣退款_分摊邮费优惠等', '退款额']],
+            # },
+            # "达摩盘3": {
+            #     "dmp人群报表": [['日期', '店铺名称', '人群id', '推广单元信息', '消耗_元', '展现量']],
+            #     "全域洞察": [['日期', '起始日期', '店铺名称', '场景id', '父渠道id', '展现量', '花费']],
+            #     "关键词_人群画像_关联购买类目": [['日期', '数据周期', '店铺名称', '关键词', '关联类目id']],
+            #     "关键词_人群画像_性别": [['日期', '数据周期', '店铺名称', '关键词', '词']],
+            #     "关键词_人群画像_消费层级": [['日期', '数据周期', '店铺名称', '关键词', '层级id', '层级值', '标签分类']],
+            #     "关键词_市场总结": [['日期', '关键词', '数据周期', '板块']],
+            #     "关键词_市场趋势": [['日期', '关键词']],
+            #     "关键词_竞争透视_地域分布": [['日期', '数据周期', '店铺名称', '关键词', '省份id']],
+            #     "关键词_竞争透视_搜索时段分布": [['日期', '数据周期', '店铺名称', '关键词', '时段']],
+            #     "关键词_竞争透视_搜索资源位": [['日期', '数据周期', '店铺名称', '关键词', '渠道id']],
+            #     "关键词_竞争透视_竞争度": [['日期', '数据周期', '店铺名称', '关键词', '出价区间']],
+            #     "店铺deeplink人群洞察": [['日期', '店铺名称', '人群类型', '人群规模', '人群总计']],
+            #     "我的人群属性": [['日期', '人群id']],
+            #     "货品_潜品加速": [['日期', '店铺名称', '商品id']],
+            #     "货品洞察_全店单品": [['日期', '店铺名称', '数据周期', '商品id']],
+            #     "货品洞察_品类洞察": [['日期', '店铺名称', '数据周期', '叶子类目名称']],
+            # },
+
+            # "聚合数据": {
+            #     "多店推广场景_按日聚合": [["日期", "店铺名称", "营销场景", "花费"]],
+            #     "天猫_主体报表": [['日期', '推广渠道', '店铺名称', '营销场景', '商品id', '花费']],
+            # }
+        }
+    ]
+    manager = UniqueManager(
+        username=username,
+        password=password,
+        host=host,
+        port=port
+    )
+    manager.add_unique(my_databases)
+
+
+if __name__ == "__main__":
+    main()
+    pass
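For reference, add_unique() expects a list of {database: {table_pattern: [unique_key_columns, ...]}} mappings, as the commented-out block above shows. A minimal uncommented sketch (database, table, and column names are placeholders):

    from mdbq.mysql.unique_ import UniqueManager

    my_databases = [
        {
            '某业务库': {
                '订单明细*': [['日期', '订单号']],               # wildcard pattern, one two-column unique key
                '商品索引表': [['商品id'], ['日期', '商品id']],   # exact name, two separate unique keys
            },
        },
    ]
    manager = UniqueManager(username='user', password='***', host='127.0.0.1', port=3306)
    manager.add_unique(my_databases)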
mdbq/mysql/uploader.py
CHANGED
@@ -323,7 +323,7 @@ class MySQLUploader:
             logger.error('无效的标识符', {'标识符': identifier})
             raise ValueError(f"无效的标识符: `{identifier}`")
         # Always clean special characters
-        cleaned = re.sub(r'[
+        cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
         cleaned = re.sub(r'_+', '_', cleaned).strip('_')
         if not cleaned:
             logger.error('无法清理异常标识符', {'原始标识符': identifier})
@@ -332,6 +332,8 @@ class MySQLUploader:
             'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
             'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
         }
+        if len(cleaned) > 64:
+            cleaned = cleaned[:64]
         if cleaned.lower() in mysql_keywords:
             logger.debug('存在MySQL保留字', {'标识符': cleaned})
             return f"`{cleaned}`"
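Effect of the cleaning regex plus the new 64-character cap on a hypothetical identifier:

    import re

    identifier = '销售明细(2025)!'
    cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')
    if len(cleaned) > 64:          # the new cap keeps identifiers within MySQL's 64-character limit
        cleaned = cleaned[:64]
    print(cleaned)                 # 销售明细_2025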
@@ -423,11 +425,11 @@ class MySQLUploader:
         # UNIQUE KEY definitions
         unique_defs = []
         if unique_keys:
-            for
+            for unique_cols in unique_keys:
                 if not unique_cols:
                     continue
                 safe_unique_cols = [self._normalize_col(col) for col in unique_cols]
-                unique_name = f"uniq_{'_'.join(safe_unique_cols)}
+                unique_name = f"uniq_{'_'.join(safe_unique_cols)}"
                 unique_defs.append(f"UNIQUE KEY `{unique_name}` (`{'`,`'.join(safe_unique_cols)}`)")
         index_defs = list(set(index_defs))
         all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
@@ -651,7 +653,7 @@ class MySQLUploader:
         Add a UNIQUE KEY
         """
         safe_cols = [self._normalize_col(col) for col in unique_cols]
-        unique_name = f"uniq_{'_'.join(safe_cols)}
+        unique_name = f"uniq_{'_'.join(safe_cols)}"
         sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({','.join(f'`{col}`' for col in safe_cols)})'
         try:
             with self._get_connection() as conn:
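What the generated key name and ALTER statement look like for a hypothetical table and columns (names are made up; the outer quoting is changed so the sketch runs on any recent Python):

    db_name, table_name = 'mydb', 'orders'
    safe_cols = ['日期', '商品id']
    unique_name = f"uniq_{'_'.join(safe_cols)}"
    cols_sql = ','.join(f'`{col}`' for col in safe_cols)
    sql = f"ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({cols_sql})"
    print(sql)   # ALTER TABLE `mydb`.`orders` ADD UNIQUE KEY `uniq_日期_商品id` (`日期`,`商品id`)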
mdbq/spider/aikucun.py
CHANGED

{mdbq-3.12.3.dist-info → mdbq-3.12.5.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=WZxQ0Ff6Xa3xY0bl0b3TrdtEVmfNdVpKYRR2LH8QeFg,18
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,10 +8,11 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
 mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=
+mdbq/mysql/deduplicator.py,sha256=Sz-Xg7XBvACTQC3WHqOxhSF4d2a6F535v70RDxTdjvg,73138
 mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
 mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
-mdbq/mysql/
+mdbq/mysql/unique_.py,sha256=eygkSlRda786iwpR1Q-ofnrhDqhZUE4Z0yVZ9LR4EEU,21158
+mdbq/mysql/uploader.py,sha256=ekpPaJypnuwxi2v42e-khqwT_eZ5LRl1ylQP492xbkk,70271
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -23,8 +24,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
 mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq/spider/aikucun.py,sha256=
-mdbq-3.12.
-mdbq-3.12.
-mdbq-3.12.
-mdbq-3.12.
+mdbq/spider/aikucun.py,sha256=GaekqY55pDEgVxbeQzHHshnQMC2YDv3v4mA7cQwjli4,21019
+mdbq-3.12.5.dist-info/METADATA,sha256=pbRybOVLfVrNE7kj93JD-pVbhJwSrAK7zewtJH6T7E8,364
+mdbq-3.12.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.12.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.12.5.dist-info/RECORD,,
{mdbq-3.12.3.dist-info → mdbq-3.12.5.dist-info}/WHEEL
File without changes

{mdbq-3.12.3.dist-info → mdbq-3.12.5.dist-info}/top_level.txt
File without changes