PyPI - mdbq - Versions diffs - 3.10.8__py3-none-any.whl → 3.10.10__py3-none-any.whl - Mend

mdbq 3.10.8__py3-none-any.whl → 3.10.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

mdbq/__version__.py +1 -1
mdbq/aggregation/optimize.py +1 -0
mdbq/aggregation/query_data.py +2 -0
mdbq/mysql/deduplicator.py +171 -157
mdbq/mysql/mysql.py +336 -280
mdbq/mysql/s_query.py +159 -143
mdbq/redis/getredis.py +0 -2
{mdbq-3.10.8.dist-info → mdbq-3.10.10.dist-info}/METADATA +1 -1
{mdbq-3.10.8.dist-info → mdbq-3.10.10.dist-info}/RECORD +11 -11
{mdbq-3.10.8.dist-info → mdbq-3.10.10.dist-info}/WHEEL +0 -0
{mdbq-3.10.8.dist-info → mdbq-3.10.10.dist-info}/top_level.txt +0 -0

mdbq/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- VERSION = '3.10.8'
1	+ VERSION = '3.10.10'

mdbq/aggregation/optimize.py CHANGED Viewed

@@ -457,6 +457,7 @@ def main():
         '安全组',
         # '视频数据',
         # '聚合数据',
+        '数据引擎2'
     ]
     tables_list = op.get_table_in_databases(db_list=db_list, reset_id=False)
     op.deduplicate(

mdbq/aggregation/query_data.py CHANGED Viewed

@@ -3995,6 +3995,7 @@ def main(days=150, months=3):
         "推广数据2",
         "推广数据_淘宝店",
         "推广数据_奥莱店",
+        "推广数据_圣积天猫店",
         "爱库存2",
         "生意参谋3",
         "生意经3",
@@ -4003,6 +4004,7 @@ def main(days=150, months=3):
         '商品人群画像2',
         '市场数据3',
         '回传数据',
+        '数据引擎2',
     ]
     # 使用 ThreadPoolExecutor 来并行运行
     # with concurrent.futures.ThreadPoolExecutor() as executor:

mdbq/mysql/deduplicator.py CHANGED Viewed

@@ -1,5 +1,4 @@
 # -*- coding:utf-8 -*-
-import datetime
 import re
 import time
 from functools import wraps
@@ -7,11 +6,12 @@ import warnings
 import pymysql
 import os
 from mdbq.log import mylogger
-from typing import List, Dict, Optional, Any, Tuple, Set
+from typing import List, Dict, Optional, Any, Tuple
 from dbutils.pooled_db import PooledDB
 import threading
 import concurrent.futures
 from collections import defaultdict
+import sys
 warnings.filterwarnings('ignore')
@@ -24,7 +24,7 @@ logger = mylogger.MyLogger(
     max_log_size=50,
     backup_count=5,
     enable_async=False,  # 是否启用异步日志
-    sample_rate=1,  # 采样50%的DEBUG/INFO日志
+    sample_rate=1,  # 采样DEBUG/INFO日志, 0.5表示50%的日志会被采样
     sensitive_fields=[],  #  敏感字段列表
 )
@@ -72,26 +72,28 @@ class MySQLDeduplicator:
             skip_system_dbs: bool = True,
             max_retries: int = 3,
             retry_interval: int = 5,
-            pool_size: int = 5
-    ):
+            pool_size: int = 5,
+            primary_key: str = 'id',
+            date_range: Optional[List[str]] = None,
+            recent_month: Optional[int] = None,
+            date_column: str = '日期',
+            exclude_columns: Optional[List[str]] = None
+    ) -> None:
         """
         初始化去重处理器
-        :param username: 数据库用户名
-        :param password: 数据库密码
-        :param host: 数据库主机，默认为localhost
-        :param port: 数据库端口，默认为3306
-        :param charset: 字符集，默认为utf8mb4
-        :param max_workers: 最大工作线程数，默认为1(单线程)
-        :param batch_size: 批量处理大小，默认为1000
-        :param skip_system_dbs: 是否跳过系统数据库，默认为True
-        :param max_retries: 最大重试次数
-        :param retry_interval: 重试间隔(秒)
-        :param pool_size: 连接池大小
+        新增参数:
+        :param date_range: 指定去重的日期区间 [start_date, end_date]，格式'YYYY-MM-DD'
+        :param recent_month: 最近N个月的数据去重（与date_range互斥，优先生效）
+        :param date_column: 时间列名，默认为'日期'
+        :param exclude_columns: 去重时排除的列名列表，默认为['id', '更新时间']
         """
         # 连接池状态标志
         self._closed = False
+        logger.debug('初始化MySQLDeduplicator', {
+            'host': host, 'port': port, 'user': username, 'charset': charset,
+            'max_workers': max_workers, 'batch_size': batch_size, 'pool_size': pool_size,
+            'exclude_columns': exclude_columns
+        })
         # 初始化连接池
         self.pool = PooledDB(
             creator=pymysql,
@@ -110,6 +112,34 @@ class MySQLDeduplicator:
         self.skip_system_dbs = skip_system_dbs
         self.max_retries = max_retries
         self.retry_interval = retry_interval
+        self.primary_key = primary_key
+        # 时间范围参数
+        self.date_range = date_range
+        self.recent_month = recent_month
+        self.date_column = date_column
+        self._dedup_start_date = None
+        self._dedup_end_date = None
+        # 不管 exclude_columns 是否传入, 'id' 一定会被排除
+        default_exclude = {'id'}
+        # exclude_columns 不传则排除: ['id', '更新时间']
+        if not exclude_columns:
+            self.exclude_columns = list(default_exclude | {'更新时间'})
+        else:
+            self.exclude_columns = list(set(exclude_columns) | default_exclude)
+        # 解析时间范围
+        if self.date_range and len(self.date_range) == 2:
+            self._dedup_start_date, self._dedup_end_date = self.date_range
+        elif self.recent_month:
+            from datetime import datetime, timedelta
+            today = datetime.today()
+            month = today.month - self.recent_month
+            year = today.year
+            while month <= 0:
+                month += 12
+                year -= 1
+            self._dedup_start_date = f"{year}-{month:02d}-01"
+            self._dedup_end_date = today.strftime("%Y-%m-%d")
         # 线程安全控制
         self._lock = threading.Lock()
@@ -118,27 +148,28 @@ class MySQLDeduplicator:
         # 系统数据库列表
         self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
-    def _get_connection(self):
+    def _get_connection(self) -> pymysql.connections.Connection:
         """从连接池获取连接"""
         if self._closed:
+            logger.error('尝试获取连接但连接池已关闭')
             raise ConnectionError("连接池已关闭")
         try:
             conn = self.pool.connection()
             logger.debug("成功获取数据库连接")
             return conn
         except Exception as e:
-            logger.error(f"获取数据库连接失败: {str(e)}")
+            logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
             raise ConnectionError(f"连接数据库失败: {str(e)}")
     @staticmethod
-    def _retry_on_failure(func):
+    def _retry_on_failure(func: Any) -> Any:
         """重试装饰器"""
         @wraps(func)
         def wrapper(self, *args, **kwargs):
             last_exception = None
             for attempt in range(self.max_retries + 1):
                 try:
+                    logger.debug(f'调用{func.__name__}，第{attempt+1}次尝试', {'args': args, 'kwargs': kwargs})
                     return func(self, *args, **kwargs)
                 except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                     last_exception = e
@@ -146,18 +177,17 @@ class MySQLDeduplicator:
                         wait_time = self.retry_interval * (attempt + 1)
                         logger.warning(
                             f"数据库操作失败，准备重试 (尝试 {attempt + 1}/{self.max_retries})",
-                            {'error': str(e), 'wait_time': wait_time})
+                            {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
                         time.sleep(wait_time)
                         continue
                 except Exception as e:
                     last_exception = e
-                    logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__})
+                    logger.error(f"操作失败: {str(e)}", {'error_type': type(e).__name__, 'func': func.__name__})
                     break
             if last_exception:
+                logger.error('重试后依然失败', {'func': func.__name__, 'last_exception': str(last_exception)})
                 raise last_exception
             raise Exception("未知错误")
         return wrapper
     @_retry_on_failure
@@ -187,7 +217,7 @@ class MySQLDeduplicator:
     @_retry_on_failure
     def _get_table_columns(self, database: str, table: str) -> List[str]:
-        """获取表的列名(排除id列)"""
+        """获取表的列名(排除主键列)"""
         sql = """
         SELECT COLUMN_NAME
         FROM INFORMATION_SCHEMA.COLUMNS
@@ -199,7 +229,7 @@ class MySQLDeduplicator:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database, table))
                 return [row['COLUMN_NAME'] for row in cursor.fetchall()
-                        if row['COLUMN_NAME'].lower() != 'id']
+                        if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
     def _acquire_table_lock(self, database: str, table: str) -> bool:
         """获取表处理锁，防止并发处理同一张表"""
@@ -212,7 +242,7 @@ class MySQLDeduplicator:
             self._processing_tables.add(key)
             return True
-    def _release_table_lock(self, database: str, table: str):
+    def _release_table_lock(self, database: str, table: str) -> None:
         """释放表处理锁"""
         key = f"{database}.{table}"
@@ -238,100 +268,111 @@ class MySQLDeduplicator:
         """
         if not self._acquire_table_lock(database, table):
             return (0, 0)
+        temp_table = None
         try:
-            logger.info(f"开始处理表: {database}.{table}")
+            # 获取原始数据总量
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    logger.debug('执行SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{table}`'})
+                    cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`")
+                    total_count_row = cursor.fetchone()
+                    total_count = total_count_row['cnt'] if total_count_row and 'cnt' in total_count_row else 0
+            logger.info('执行', {"库": database, "表": table, "开始处理数据量": total_count, 'func': sys._getframe().f_code.co_name})
             # 获取实际列名
             all_columns = self._get_table_columns(database, table)
-            if not all_columns:
-                logger.warning(f"表 {database}.{table} 没有有效列(可能只有id列)，跳过")
-                return (0, 0)
-            # 使用指定列或所有列
+            logger.debug('获取表列', {'库': database, '表': table, 'all_columns': all_columns})
+            # 检查是否需要按时间范围过滤
+            use_time_filter = False
+            time_col = self.date_column
+            all_columns_lower = [col.lower() for col in all_columns]
+            # 排除exclude_columns
+            exclude_columns_lower = [col.lower() for col in getattr(self, 'exclude_columns', [])]
+            # 统一列名小写做判断
             use_columns = columns or all_columns
-            invalid_columns = set(use_columns) - set(all_columns)
+            use_columns = [col for col in use_columns if col.lower() in all_columns_lower and col.lower() not in exclude_columns_lower]
+            invalid_columns = set([col for col in (columns or []) if col.lower() not in all_columns_lower])
             if invalid_columns:
-                logger.warning(
-                    f"表 {database}.{table} 中不存在以下列: {invalid_columns}，使用有效列",
-                    {'invalid_columns': invalid_columns}
-                )
-                use_columns = [col for col in use_columns if col in all_columns]
+                logger.warning('不存在的列', {"库": database, "表": table, "不存在以下列": invalid_columns, 'func': sys._getframe().f_code.co_name})
             if not use_columns:
-                logger.error(f"表 {database}.{table} 没有有效的去重列")
+                logger.error('没有有效的去重列', {"库": database, "表": table})
                 return (0, 0)
-            # 构建去重SQL
+            # 统一用反引号包裹
             column_list = ', '.join([f'`{col}`' for col in use_columns])
-            # temp_table = f"temp_{table}_{int(time.time())}"
-            temp_table = f"temp_{table}_dedup_{os.getpid()}"  # 使用进程ID构建临时表
-            temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)  # 确保表名合法
-            # 使用临时表方案处理去重，避免锁表问题
+            temp_table = f"temp_{table}_dedup_{os.getpid()}_{threading.get_ident()}"
+            temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
+            pk = self.primary_key
+            # 主键判断也用小写
+            if pk.lower() not in all_columns_lower and pk != 'id':
+                logger.error('', {"不存在主键列": database, "表": table, "主键列不存在": pk})
+                return (0, 0)
+            # 找到实际主键名
+            pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
+            # 构造where条件
+            where_time = ''
+            if use_time_filter:
+                where_time = f"WHERE `{time_col}` >= '{self._dedup_start_date}' AND `{time_col}` <= '{self._dedup_end_date}'"
             create_temp_sql = f"""
             CREATE TABLE `{database}`.`{temp_table}` AS
-            SELECT MIN(`id`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
+            SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
             FROM `{database}`.`{table}`
+            {where_time}
             GROUP BY {column_list}
             HAVING COUNT(*) > 1
             """
-            delete_dup_sql = f"""
-            DELETE FROM `{database}`.`{table}`
-            WHERE `id` NOT IN (
-                SELECT `min_id` FROM `{database}`.`{temp_table}`
-            ) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
-            """
             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
             with self._get_connection() as conn:
                 with conn.cursor() as cursor:
-                    # 创建临时表统计重复数据
+                    logger.debug('创建临时表SQL', {'sql': create_temp_sql})
                     cursor.execute(create_temp_sql)
+                    logger.debug('统计临时表重复组SQL', {'sql': f'SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`'})
                     cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
-                    dup_count = cursor.fetchone()['cnt']
+                    dup_count_row = cursor.fetchone()
+                    dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
                     if dup_count == 0:
-                        logger.info(f"表 {database}.{table} 没有重复数据")
+                        logger.info('没有重复数据', {"库": database, "表": table, "数据量": total_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
+                        logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
                         cursor.execute(drop_temp_sql)
                         conn.commit()
                         return (0, 0)
-                    logger.info(
-                        f"表 {database}.{table} 发现 {dup_count} 组重复数据",
-                        {'columns': use_columns}
-                    )
+                    affected_rows = 0
                     if not dry_run:
-                        # 执行实际删除
-                        cursor.execute(delete_dup_sql)
-                        affected_rows = cursor.rowcount
-                        conn.commit()
-                        logger.info(
-                            f"表 {database}.{table} 已删除 {affected_rows} 行重复数据",
-                            {'columns': use_columns}
-                        )
+                        # 分批删除，避免锁表
+                        while True:
+                            delete_dup_sql = f"""
+                            DELETE FROM `{database}`.`{table}`
+                            WHERE `{pk_real}` NOT IN (
+                                SELECT `min_id` FROM `{database}`.`{temp_table}`
+                            ) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
+                            AND ({' AND '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
+                            LIMIT {self.batch_size}
+                            """
+                            logger.debug('执行删除重复数据SQL', {'sql': delete_dup_sql})
+                            cursor.execute(delete_dup_sql)
+                            batch_deleted = cursor.rowcount
+                            affected_rows += batch_deleted
+                            conn.commit()
+                            if batch_deleted < self.batch_size:
+                                break
+                        logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
                     else:
+                        logger.debug('dry_run模式，不执行删除', {"库": database, "表": table, "重复组数": dup_count, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None})
                         affected_rows = 0
-                        logger.info(
-                            f"[模拟运行] 表 {database}.{table} 将删除 {dup_count} 组重复数据",
-                            {'columns': use_columns}
-                        )
-                    # 清理临时表
+                    logger.debug('删除临时表SQL', {'sql': drop_temp_sql})
                     cursor.execute(drop_temp_sql)
                     conn.commit()
                     return (dup_count, affected_rows)
         except Exception as e:
-            logger.error(
-                f"处理表 {database}.{table} 时出错: {str(e)}",
-                {'error_type': type(e).__name__}
-            )
+            logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
+            # 异常时也要清理临时表
+            if temp_table:
+                try:
+                    with self._get_connection() as conn:
+                        with conn.cursor() as cursor:
+                            drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
+                            cursor.execute(drop_temp_sql)
+                            conn.commit()
+                except Exception as drop_e:
+                    logger.error('异常时清理临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
             return (0, 0)
         finally:
             self._release_table_lock(database, table)
@@ -353,17 +394,15 @@ class MySQLDeduplicator:
         :return: (重复行数, 删除行数)
         """
         try:
-            # 检查表是否存在
             if not self._check_table_exists(database, table):
-                logger.warning(f"表 {database}.{table} 不存在，跳过")
+                logger.warning('表不存在', {"库": database, "表": table, "warning": "跳过"})
                 return (0, 0)
-            return self._deduplicate_table(database, table, columns, dry_run)
+            logger.info('单表开始', {"库": database, "表": table, "参数": {"指定去重列": columns, "模拟运行": dry_run, '排除列': self.exclude_columns}})
+            result = self._deduplicate_table(database, table, columns, dry_run)
+            logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result})
+            return result
         except Exception as e:
-            logger.error(
-                f"处理表 {database}.{table} 时发生全局错误: {str(e)}",
-                {'error_type': type(e).__name__}
-            )
+            logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
             return (0, 0)
     def deduplicate_database(
@@ -385,49 +424,40 @@ class MySQLDeduplicator:
         :return: 字典 {表名: (重复行数, 删除行数)}
         """
         results = {}
         try:
-            # 检查数据库是否存在
             if not self._check_database_exists(database):
-                logger.warning(f"数据库 {database} 不存在，跳过")
+                logger.warning('数据库不存在', {"库": database})
                 return results
-            # 获取要处理的表
             target_tables = tables or self._get_tables(database)
+            logger.debug('获取目标表', {'库': database, 'tables': target_tables})
             if not target_tables:
-                logger.info(f"数据库 {database} 中没有表，跳过")
+                logger.info('数据库中没有表', {"库": database, "操作": "跳过"})
                 return results
-            logger.info(
-                f"开始处理数据库 {database} 中的 {len(target_tables)} 张表",
-                {'tables': target_tables}
-            )
+            logger.info('库统计', {"库": database, "表数量": len(target_tables), "表列表": target_tables})
             if parallel and self.max_workers > 1:
-                # 并行处理
+                logger.debug('并行处理表', {'库': database, 'max_workers': self.max_workers})
+                # 使用线程池并行处理
                 with concurrent.futures.ThreadPoolExecutor(
                         max_workers=self.max_workers
                 ) as executor:
                     futures = {}
                     for table in target_tables:
                         columns = columns_map.get(table) if columns_map else None
+                        logger.debug('提交表去重任务', {'库': database, '表': table, 'columns': columns})
                         futures[executor.submit(
                             self.deduplicate_table,
                             database, table, columns, dry_run
                         )] = table
                     for future in concurrent.futures.as_completed(futures):
                         table = futures[future]
                         try:
                             dup_count, affected_rows = future.result()
                             results[table] = (dup_count, affected_rows)
                         except Exception as e:
-                            logger.error(
-                                f"处理表 {database}.{table} 时出错: {str(e)}",
-                                {'error_type': type(e).__name__}
-                            )
+                            logger.error('异常', {"库": database, "表": table, "error": str(e), 'traceback': repr(e)})
                             results[table] = (0, 0)
             else:
+                logger.debug('串行处理表', {'库': database})
                 # 串行处理
                 for table in target_tables:
                     columns = columns_map.get(table) if columns_map else None
@@ -435,20 +465,12 @@ class MySQLDeduplicator:
                         database, table, columns, dry_run
                     )
                     results[table] = (dup_count, affected_rows)
-            # 统计结果
             total_dup = sum(r[0] for r in results.values())
             total_del = sum(r[1] for r in results.values())
-            logger.info(
-                f"数据库 {database} 处理完成 - 共发现 {total_dup} 组重复数据，删除 {total_del} 行",
-                {'results': results}
-            )
+            logger.info('单库完成', {"库": database, "重复组数": total_dup, "总删除行数": total_del, "详细结果": results})
             return results
         except Exception as e:
-            logger.error(f"处理数据库 {database} 时发生全局错误: {str(e)}", {'error_type': type(e).__name__})
+            logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
             return results
     def deduplicate_all(
@@ -470,18 +492,15 @@ class MySQLDeduplicator:
         :return: 嵌套字典 {数据库名: {表名: (重复行数, 删除行数)}}
         """
         all_results = defaultdict(dict)
         try:
-            # 获取要处理的数据库
             target_dbs = databases or self._get_databases()
+            logger.debug('获取目标数据库', {'databases': target_dbs})
             if not target_dbs:
-                logger.warning("没有可处理的数据库")
+                logger.warning('没有可处理的数据库')
                 return all_results
-            logger.info(f"开始处理 {len(target_dbs)} 个数据库", {'databases': target_dbs})
+            logger.info('全局开始', {"数据库数量": len(target_dbs), "数据库列表": target_dbs, "参数": {"模拟运行": dry_run, "并行处理": parallel, '排除列': self.exclude_columns}})
             if parallel and self.max_workers > 1:
-                # 并行处理数据库
+                # 使用线程池并行处理多个数据库
                 with concurrent.futures.ThreadPoolExecutor(
                         max_workers=self.max_workers
                 ) as executor:
@@ -493,14 +512,13 @@ class MySQLDeduplicator:
                             self.deduplicate_database,
                             db, tables, db_columns_map, dry_run, False
                         )] = db
                     for future in concurrent.futures.as_completed(futures):
                         db = futures[future]
                         try:
                             db_results = future.result()
                             all_results[db] = db_results
                         except Exception as e:
-                            logger.error(f"处理数据库 {db} 时出错: {str(e)}", {'error_type': type(e).__name__})
+                            logger.error('异常', {"库": db, "error": str(e), 'traceback': repr(e)})
                             all_results[db] = {}
             else:
                 # 串行处理数据库
@@ -511,8 +529,6 @@ class MySQLDeduplicator:
                         db, tables, db_columns_map, dry_run, parallel
                     )
                     all_results[db] = db_results
-            # 统计总体结果
             total_dup = sum(
                 r[0] for db in all_results.values()
                 for r in db.values()
@@ -521,16 +537,10 @@ class MySQLDeduplicator:
                 r[1] for db in all_results.values()
                 for r in db.values()
             )
-            logger.info(
-                f"所有数据库处理完成 - 共发现 {total_dup} 组重复数据，删除 {total_del} 行",
-                {'total_results': all_results}
-            )
+            logger.info('全局完成', {"总重复组数": total_dup, "总删除行数": total_del, "详细结果": dict(all_results)})
             return all_results
         except Exception as e:
-            logger.error(f"全局处理时发生错误: {str(e)}", {'error_type': type(e).__name__})
+            logger.error('异常', {"error": str(e), 'traceback': repr(e)})
             return all_results
     @_retry_on_failure
@@ -557,42 +567,46 @@ class MySQLDeduplicator:
                 cursor.execute(sql, (database, table))
                 return bool(cursor.fetchone())
-    def close(self):
+    def close(self) -> None:
         """关闭连接池"""
         try:
             if hasattr(self, 'pool') and self.pool and not self._closed:
                 self.pool.close()
                 self._closed = True
                 logger.info("数据库连接池已关闭")
+            else:
+                logger.info('连接池已关闭或不存在')
         except Exception as e:
-            logger.error(f"关闭连接池时出错: {str(e)}", {'error_type': type(e).__name__})
+            logger.error(f"关闭连接池时出错", {'error_type': type(e).__name__, 'error': str(e)})
-    def __enter__(self):
+    def __enter__(self) -> 'MySQLDeduplicator':
         return self
-    def __exit__(self, exc_type, exc_val, exc_tb):
+    def __exit__(self, exc_type: Optional[type], exc_val: Optional[BaseException], exc_tb: Optional[Any]) -> None:
         self.close()
 def main():
     deduplicator = MySQLDeduplicator(
         username='root',
-        password='188988yang188',
+        password='pwd',
         host='localhost',
         port=3306
     )
     # 全库去重(单线程)
-    deduplicator.deduplicate_all()
+    deduplicator.deduplicate_all(dry_run=False, parallel=False)
     # # 指定数据库去重(多线程)
-    # deduplicator.deduplicate_database('my_db', parallel=True)
+    # logger.info('调用deduplicate_database')
+    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True)
     # # 指定表去重(使用特定列)
-    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'])
+    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'date'], dry_run=False)
     # 关闭连接
     deduplicator.close()
 if __name__ == '__main__':
-    main()
+    # main()
+    pass

mdbq 3.10.8__py3-none-any.whl → 3.10.10__py3-none-any.whl

mdbq 3.10.8__py3-none-any.whl → 3.10.10__py3-none-any.whl