mdbq 3.10.7__py3-none-any.whl → 3.10.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/optimize.py +1 -0
- mdbq/aggregation/query_data.py +2 -0
- mdbq/log/mylogger.py +8 -19
- mdbq/mysql/deduplicator.py +30 -22
- mdbq/mysql/mysql.py +336 -280
- mdbq/mysql/s_query.py +159 -143
- mdbq/mysql/uploader.py +125 -52
- mdbq/redis/getredis.py +0 -2
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/METADATA +1 -1
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/RECORD +13 -13
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/WHEEL +0 -0
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.10.7'
|
1
|
+
VERSION = '3.10.9'
|
mdbq/aggregation/optimize.py
CHANGED
mdbq/aggregation/query_data.py
CHANGED
@@ -3995,6 +3995,7 @@ def main(days=150, months=3):
|
|
3995
3995
|
"推广数据2",
|
3996
3996
|
"推广数据_淘宝店",
|
3997
3997
|
"推广数据_奥莱店",
|
3998
|
+
"推广数据_圣积天猫店",
|
3998
3999
|
"爱库存2",
|
3999
4000
|
"生意参谋3",
|
4000
4001
|
"生意经3",
|
@@ -4003,6 +4004,7 @@ def main(days=150, months=3):
|
|
4003
4004
|
'商品人群画像2',
|
4004
4005
|
'市场数据3',
|
4005
4006
|
'回传数据',
|
4007
|
+
'数据引擎2',
|
4006
4008
|
]
|
4007
4009
|
# 使用 ThreadPoolExecutor 来并行运行
|
4008
4010
|
# with concurrent.futures.ThreadPoolExecutor() as executor:
|
mdbq/log/mylogger.py
CHANGED
@@ -239,8 +239,9 @@ class MyLogger:
|
|
239
239
|
log_data['异常'] = self.formatException(record.exc_info)
|
240
240
|
|
241
241
|
# 过滤敏感信息
|
242
|
-
if hasattr(record, '过滤'
|
243
|
-
|
242
|
+
if hasattr(record, 'extra_data') and '过滤' in record.extra_data:
|
243
|
+
sensitive_fields = record.extra_data['过滤']
|
244
|
+
for field in sensitive_fields:
|
244
245
|
if field in log_data:
|
245
246
|
log_data[field] = '***'
|
246
247
|
if isinstance(log_data.get('message'), str):
|
@@ -447,8 +448,11 @@ class MyLogger:
|
|
447
448
|
try:
|
448
449
|
self._log_queue.put((level, message, extra), timeout=0.1)
|
449
450
|
except queue.Full:
|
450
|
-
#
|
451
|
-
|
451
|
+
# 队列满时降级为同步日志,添加队列状态信息到extra
|
452
|
+
if extra is None:
|
453
|
+
extra = {}
|
454
|
+
extra['queue_status'] = 'full'
|
455
|
+
self._sync_log(level, message, extra)
|
452
456
|
else:
|
453
457
|
self._sync_log(level, message, extra)
|
454
458
|
|
@@ -509,21 +513,6 @@ class MyLogger:
|
|
509
513
|
if not extra:
|
510
514
|
extra = {}
|
511
515
|
|
512
|
-
# # 获取完整的异常堆栈
|
513
|
-
# tb = exc_info.__traceback__
|
514
|
-
# while tb.tb_next:
|
515
|
-
# tb = tb.tb_next # 获取最内层的堆栈帧
|
516
|
-
#
|
517
|
-
# extra.update({
|
518
|
-
# 'module': tb.tb_frame.f_globals.get('__name__', ''),
|
519
|
-
# 'function': tb.tb_frame.f_code.co_name,
|
520
|
-
# 'line': tb.tb_lineno,
|
521
|
-
# 'file': tb.tb_frame.f_code.co_filename,
|
522
|
-
# '异常': str(exc_info),
|
523
|
-
# '类型': exc_info.__class__.__name__,
|
524
|
-
# '堆栈': self._format_traceback(exc_info)
|
525
|
-
# })
|
526
|
-
|
527
516
|
# 使用inspect获取调用栈
|
528
517
|
frame = inspect.currentframe()
|
529
518
|
try:
|
mdbq/mysql/deduplicator.py
CHANGED
@@ -24,7 +24,7 @@ logger = mylogger.MyLogger(
|
|
24
24
|
max_log_size=50,
|
25
25
|
backup_count=5,
|
26
26
|
enable_async=False, # 是否启用异步日志
|
27
|
-
sample_rate=1, # 采样
|
27
|
+
sample_rate=1, # 采样DEBUG/INFO日志, 0.5表示50%的日志会被采样
|
28
28
|
sensitive_fields=[], # 敏感字段列表
|
29
29
|
)
|
30
30
|
|
@@ -72,8 +72,9 @@ class MySQLDeduplicator:
|
|
72
72
|
skip_system_dbs: bool = True,
|
73
73
|
max_retries: int = 3,
|
74
74
|
retry_interval: int = 5,
|
75
|
-
pool_size: int = 5
|
76
|
-
|
75
|
+
pool_size: int = 5,
|
76
|
+
primary_key: str = 'id'
|
77
|
+
) -> None:
|
77
78
|
"""
|
78
79
|
初始化去重处理器
|
79
80
|
|
@@ -88,6 +89,7 @@ class MySQLDeduplicator:
|
|
88
89
|
:param max_retries: 最大重试次数
|
89
90
|
:param retry_interval: 重试间隔(秒)
|
90
91
|
:param pool_size: 连接池大小
|
92
|
+
:param primary_key: 主键列名,默认为'id'
|
91
93
|
"""
|
92
94
|
# 连接池状态标志
|
93
95
|
self._closed = False
|
@@ -110,6 +112,7 @@ class MySQLDeduplicator:
|
|
110
112
|
self.skip_system_dbs = skip_system_dbs
|
111
113
|
self.max_retries = max_retries
|
112
114
|
self.retry_interval = retry_interval
|
115
|
+
self.primary_key = primary_key
|
113
116
|
|
114
117
|
# 线程安全控制
|
115
118
|
self._lock = threading.Lock()
|
@@ -118,7 +121,7 @@ class MySQLDeduplicator:
|
|
118
121
|
# 系统数据库列表
|
119
122
|
self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
|
120
123
|
|
121
|
-
def _get_connection(self):
|
124
|
+
def _get_connection(self) -> pymysql.connections.Connection:
|
122
125
|
"""从连接池获取连接"""
|
123
126
|
if self._closed:
|
124
127
|
raise ConnectionError("连接池已关闭")
|
@@ -131,7 +134,7 @@ class MySQLDeduplicator:
|
|
131
134
|
raise ConnectionError(f"连接数据库失败: {str(e)}")
|
132
135
|
|
133
136
|
@staticmethod
|
134
|
-
def _retry_on_failure(func):
|
137
|
+
def _retry_on_failure(func: Any) -> Any:
|
135
138
|
"""重试装饰器"""
|
136
139
|
|
137
140
|
@wraps(func)
|
@@ -187,7 +190,7 @@ class MySQLDeduplicator:
|
|
187
190
|
|
188
191
|
@_retry_on_failure
|
189
192
|
def _get_table_columns(self, database: str, table: str) -> List[str]:
|
190
|
-
"""获取表的列名(
|
193
|
+
"""获取表的列名(排除主键列)"""
|
191
194
|
sql = """
|
192
195
|
SELECT COLUMN_NAME
|
193
196
|
FROM INFORMATION_SCHEMA.COLUMNS
|
@@ -199,7 +202,7 @@ class MySQLDeduplicator:
|
|
199
202
|
with conn.cursor() as cursor:
|
200
203
|
cursor.execute(sql, (database, table))
|
201
204
|
return [row['COLUMN_NAME'] for row in cursor.fetchall()
|
202
|
-
if row['COLUMN_NAME'].lower() !=
|
205
|
+
if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
|
203
206
|
|
204
207
|
def _acquire_table_lock(self, database: str, table: str) -> bool:
|
205
208
|
"""获取表处理锁,防止并发处理同一张表"""
|
@@ -212,7 +215,7 @@ class MySQLDeduplicator:
|
|
212
215
|
self._processing_tables.add(key)
|
213
216
|
return True
|
214
217
|
|
215
|
-
def _release_table_lock(self, database: str, table: str):
|
218
|
+
def _release_table_lock(self, database: str, table: str) -> None:
|
216
219
|
"""释放表处理锁"""
|
217
220
|
key = f"{database}.{table}"
|
218
221
|
|
@@ -255,7 +258,7 @@ class MySQLDeduplicator:
|
|
255
258
|
if invalid_columns:
|
256
259
|
logger.warning(
|
257
260
|
f"表 {database}.{table} 中不存在以下列: {invalid_columns},使用有效列",
|
258
|
-
{'invalid_columns': invalid_columns}
|
261
|
+
{'invalid_columns': list(invalid_columns)}
|
259
262
|
)
|
260
263
|
use_columns = [col for col in use_columns if col in all_columns]
|
261
264
|
|
@@ -265,14 +268,17 @@ class MySQLDeduplicator:
|
|
265
268
|
|
266
269
|
# 构建去重SQL
|
267
270
|
column_list = ', '.join([f'`{col}`' for col in use_columns])
|
268
|
-
#
|
269
|
-
temp_table = f"temp_{table}_dedup_{os.getpid()}"
|
270
|
-
temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)
|
271
|
-
|
272
|
-
#
|
271
|
+
# 临时表名限制64字符以内
|
272
|
+
temp_table = f"temp_{table}_dedup_{os.getpid()}"
|
273
|
+
temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
|
274
|
+
pk = self.primary_key
|
275
|
+
# 校验主键列是否存在
|
276
|
+
if pk not in all_columns and pk != 'id':
|
277
|
+
logger.error(f"表 {database}.{table} 不存在主键列 {pk}")
|
278
|
+
return (0, 0)
|
273
279
|
create_temp_sql = f"""
|
274
280
|
CREATE TABLE `{database}`.`{temp_table}` AS
|
275
|
-
SELECT MIN(`
|
281
|
+
SELECT MIN(`{pk}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
|
276
282
|
FROM `{database}`.`{table}`
|
277
283
|
GROUP BY {column_list}
|
278
284
|
HAVING COUNT(*) > 1
|
@@ -280,7 +286,7 @@ class MySQLDeduplicator:
|
|
280
286
|
|
281
287
|
delete_dup_sql = f"""
|
282
288
|
DELETE FROM `{database}`.`{table}`
|
283
|
-
WHERE `
|
289
|
+
WHERE `{pk}` NOT IN (
|
284
290
|
SELECT `min_id` FROM `{database}`.`{temp_table}`
|
285
291
|
) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
|
286
292
|
"""
|
@@ -292,7 +298,8 @@ class MySQLDeduplicator:
|
|
292
298
|
# 创建临时表统计重复数据
|
293
299
|
cursor.execute(create_temp_sql)
|
294
300
|
cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
|
295
|
-
|
301
|
+
dup_count_row = cursor.fetchone()
|
302
|
+
dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
|
296
303
|
|
297
304
|
if dup_count == 0:
|
298
305
|
logger.info(f"表 {database}.{table} 没有重复数据")
|
@@ -557,7 +564,7 @@ class MySQLDeduplicator:
|
|
557
564
|
cursor.execute(sql, (database, table))
|
558
565
|
return bool(cursor.fetchone())
|
559
566
|
|
560
|
-
def close(self):
|
567
|
+
def close(self) -> None:
|
561
568
|
"""关闭连接池"""
|
562
569
|
try:
|
563
570
|
if hasattr(self, 'pool') and self.pool and not self._closed:
|
@@ -567,17 +574,17 @@ class MySQLDeduplicator:
|
|
567
574
|
except Exception as e:
|
568
575
|
logger.error(f"关闭连接池时出错: {str(e)}", {'error_type': type(e).__name__})
|
569
576
|
|
570
|
-
def __enter__(self):
|
577
|
+
def __enter__(self) -> 'MySQLDeduplicator':
|
571
578
|
return self
|
572
579
|
|
573
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
580
|
+
def __exit__(self, exc_type: Optional[type], exc_val: Optional[BaseException], exc_tb: Optional[Any]) -> None:
|
574
581
|
self.close()
|
575
582
|
|
576
583
|
|
577
584
|
def main():
|
578
585
|
deduplicator = MySQLDeduplicator(
|
579
586
|
username='root',
|
580
|
-
password='
|
587
|
+
password='pwd',
|
581
588
|
host='localhost',
|
582
589
|
port=3306
|
583
590
|
)
|
@@ -595,4 +602,5 @@ def main():
|
|
595
602
|
deduplicator.close()
|
596
603
|
|
597
604
|
if __name__ == '__main__':
|
598
|
-
main()
|
605
|
+
# main()
|
606
|
+
pass
|