mdbq 3.11.11__py3-none-any.whl → 3.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/deduplicator.py +127 -63
- mdbq/mysql/uploader.py +177 -83
- {mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/METADATA +1 -1
- {mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/RECORD +7 -7
- {mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/WHEEL +0 -0
- {mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '3.11.11'
+VERSION = '3.12.1'
mdbq/mysql/deduplicator.py
CHANGED
@@ -14,6 +14,7 @@ from collections import defaultdict
 import sys
 from datetime import datetime
 import uuid
+from contextlib import contextmanager
 
 
 warnings.filterwarnings('ignore')
@@ -47,8 +48,8 @@ class MySQLDeduplicator:
         batch_size: int = 1000,
         skip_system_dbs: bool = True,
         max_retries: int = 3,
-
-        pool_size: int =
+        retry_waiting_time: int = 5,
+        pool_size: int = 10,
         primary_key: str = 'id',
         date_range: Optional[List[str]] = None,
         recent_month: Optional[int] = None,
@@ -87,15 +88,30 @@ class MySQLDeduplicator:
             cursorclass=pymysql.cursors.DictCursor
         )
 
+        # 并发模式要将 pool_size 加大
+        MAX_POOL_SIZE = 200
+        MAX_WORKERS = 4
+        if max_workers > MAX_WORKERS:
+            logger.warning(f"max_workers({max_workers}) 超过最大建议值({MAX_WORKERS}),自动将 max_workers 调整为 {MAX_WORKERS}")
+            max_workers = MAX_WORKERS
+        expected_threads = max_workers * 10
+        if pool_size < expected_threads:
+            logger.warning(f"pool_size({pool_size}) < max_workers({max_workers}) * 10,自动将 pool_size 调整为 {expected_threads}")
+            pool_size = expected_threads
+        if pool_size > MAX_POOL_SIZE:
+            logger.warning(f"pool_size({pool_size}) 超过最大建议值({MAX_POOL_SIZE}),自动将 pool_size 调整为 {MAX_POOL_SIZE}")
+            pool_size = MAX_POOL_SIZE
+        self.max_workers = max_workers
+        self.pool_size = pool_size
+
         # 配置参数
-        self.max_workers = min(max(1, max_workers), pool_size)  # 限制最大线程数,不能超过连接池
         self.batch_size = batch_size
         self.skip_system_dbs = skip_system_dbs
         self.max_retries = max_retries
-        self.
+        self.retry_waiting_time = retry_waiting_time
         self.primary_key = primary_key
 
-        #
+        # 时间范围参数
         self.date_column = date_column
         self._dedup_start_date = None
         self._dedup_end_date = None
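Review note: the replaced min(max(1, max_workers), pool_size) coupling becomes explicit, deterministic sizing. A minimal standalone sketch of the same clamping rules (the helper function is illustrative, not part of mdbq):

    def clamp_pool(max_workers: int, pool_size: int,
                   workers_cap: int = 4, pool_cap: int = 200) -> tuple:
        # Cap the worker count first, then guarantee 10 pooled
        # connections per worker, bounded above by pool_cap.
        max_workers = min(max_workers, workers_cap)
        pool_size = max(pool_size, max_workers * 10)
        return max_workers, min(pool_size, pool_cap)

    assert clamp_pool(8, 10) == (4, 40)    # 8 workers capped to 4; pool raised to 4 * 10
    assert clamp_pool(4, 500) == (4, 200)  # oversized pool clamped to MAX_POOL_SIZE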
@@ -128,6 +144,9 @@ class MySQLDeduplicator:
                 year -= 1
             self._dedup_start_date = f"{year}-{month:02d}-01"
             self._dedup_end_date = today.strftime("%Y-%m-%d")
+
+        if self._dedup_start_date and self._dedup_end_date:
+            logger.info('去重日期范围', {'开始': self._dedup_start_date, '结束': self._dedup_end_date})
 
         # 排除列处理,直接合并去重
         self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))
@@ -164,6 +183,14 @@ class MySQLDeduplicator:
             logger.error(f"获取数据库连接失败: {str(e)}", {'error_type': type(e).__name__})
             raise ConnectionError(f"连接数据库失败: {str(e)}")
 
+    @contextmanager
+    def _conn_ctx(self):
+        conn = self._get_connection()
+        try:
+            yield conn
+        finally:
+            conn.close()
+
     @staticmethod
     def _retry_on_failure(func: Any) -> Any:
         """
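Review note: _conn_ctx is the standard contextlib pattern for guaranteeing that a pooled connection's close() (which, with DBUtils-style pools, returns the connection to the pool rather than closing the socket) runs even when the body raises. A self-contained sketch of the same shape, with a stub standing in for the pool:

    from contextlib import contextmanager

    class StubPool:
        # Stand-in for a DBUtils PooledDB, for illustration only.
        def connection(self):
            class Conn:
                def close(self):
                    print("connection returned to pool")
            return Conn()

    pool = StubPool()

    @contextmanager
    def conn_ctx():
        conn = pool.connection()
        try:
            yield conn
        finally:
            conn.close()  # runs on success and on exceptions alike

    with conn_ctx() as conn:
        pass  # "connection returned to pool" prints even if this block raised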
@@ -187,7 +214,7 @@ class MySQLDeduplicator:
                 except (pymysql.OperationalError, pymysql.InterfaceError) as e:
                     last_exception = e
                     if attempt < self.max_retries:
-                        wait_time = self.
+                        wait_time = self.retry_waiting_time * (attempt + 1)
                         logger.warning(
                             f"数据库操作失败,准备重试 (尝试 {attempt + 1}/{self.max_retries})",
                             {'error': str(e), 'wait_time': wait_time, 'func': func.__name__})
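Review note: with the renamed retry_waiting_time the backoff is linear rather than exponential: the sleep before the next attempt is retry_waiting_time * (attempt + 1). For the deduplicator default of 5 seconds that means waits of 5s, 10s, 15s across three failed attempts:

    retry_waiting_time = 5  # deduplicator default in this diff
    waits = [retry_waiting_time * (attempt + 1) for attempt in range(3)]
    print(waits)  # [5, 10, 15]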
@@ -203,7 +230,6 @@ class MySQLDeduplicator:
                 raise Exception("未知错误")
         return wrapper
 
-    @_retry_on_failure
     def _get_databases(self) -> List[str]:
         """
         获取所有非系统数据库列表,排除 exclude_databases。
@@ -212,7 +238,7 @@ class MySQLDeduplicator:
             List[str]: 数据库名列表。
         """
         sql = "SHOW DATABASES"
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 all_dbs = [row['Database'] for row in cursor.fetchall()]
@@ -220,7 +246,6 @@ class MySQLDeduplicator:
         filtered = [db for db in all_dbs if db.lower() not in self.SYSTEM_DATABASES and db.lower() not in self.exclude_databases] if self.skip_system_dbs else [db for db in all_dbs if db.lower() not in self.exclude_databases]
         return filtered
 
-    @_retry_on_failure
     def _get_tables(self, database: str) -> List[str]:
         """
         获取指定数据库的所有表名(排除 temp_ 前缀的临时表)。
@@ -231,15 +256,12 @@ class MySQLDeduplicator:
             List[str]: 表名列表。
         """
         sql = "SHOW TABLES"
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(f"USE `{database}`")
                 cursor.execute(sql)
-                # 严格过滤所有以'temp_'为前缀的表名(如temp_xxx、temp_xxx_dedup_...、temp_xxx_reorderid_...等)
                 return [row[f'Tables_in_{database}'] for row in cursor.fetchall() if not re.match(r'^temp_.*', row[f'Tables_in_{database}'])]
 
-    @_retry_on_failure
     def _get_table_columns(self, database: str, table: str) -> List[str]:
         """
         获取指定表的所有列名(排除主键列)。
@@ -256,14 +278,12 @@ class MySQLDeduplicator:
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
             ORDER BY ORDINAL_POSITION
         """
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database, table))
                 return [row['COLUMN_NAME'] for row in cursor.fetchall()
                         if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
 
-    @_retry_on_failure
     def _ensure_index(self, database: str, table: str, date_column: str) -> None:
         """
         检查并为 date_column 自动创建索引(如果未存在)。
@@ -273,7 +293,7 @@ class MySQLDeduplicator:
             table (str): 表名。
             date_column (str): 需要检查的日期列名。
         """
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 # 检查索引是否已存在
                 cursor.execute(
@@ -295,7 +315,33 @@ class MySQLDeduplicator:
         except Exception as e:
             logger.error('自动创建date_column索引失败', {"库": database, "表": table, "date_column": date_column, "异常": str(e)})
 
-
+    def _row_generator(self, database, table, select_cols, select_where, batch_size=10000):
+        """
+        生成器:分批拉取表数据,避免一次性加载全部数据到内存。
+        Args:
+            database (str): 数据库名。
+            table (str): 表名。
+            select_cols (str): 选择的列字符串。
+            select_where (str): where条件字符串。
+            batch_size (int): 每批拉取的行数。
+        Yields:
+            dict: 每行数据。
+        """
+        offset = 0
+        while True:
+            sql = f"SELECT {select_cols} FROM `{database}`.`{table}` {select_where} LIMIT {batch_size} OFFSET {offset}"
+            with self._conn_ctx() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql)
+                    rows = cursor.fetchall()
+            if not rows:
+                break
+            for row in rows:
+                yield row
+            if len(rows) < batch_size:
+                break
+            offset += batch_size
+
     def _get_all_dates(self, database: str, table: str, date_column: str) -> List[str]:
         """
         获取表中所有不同的日期分区(按天)。
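Review note: the new _row_generator pages with LIMIT/OFFSET, which is simple but makes MySQL rescan an ever-growing prefix as the offset climbs, so a full-table sweep is roughly quadratic in scanned rows. A hedged alternative sketch using pymysql's server-side cursor, which streams one result set with constant client memory and no offsets (connection parameters and table name are placeholders):

    import pymysql
    import pymysql.cursors

    conn = pymysql.connect(host='localhost', user='root', password='pwd',
                           database='my_db',
                           cursorclass=pymysql.cursors.SSDictCursor)
    try:
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM `my_table`")
            while True:
                rows = cursor.fetchmany(10000)  # stream in batches
                if not rows:
                    break
                for row in rows:
                    pass  # process one dict per row
    finally:
        conn.close()

The OFFSET approach does keep one advantage here: each batch is a short query on its own pooled connection, so no connection is pinned for the whole scan.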
@@ -308,7 +354,7 @@ class MySQLDeduplicator:
             List[str]: 所有不同的日期(字符串)。
         """
         sql = f"SELECT DISTINCT `{date_column}` FROM `{database}`.`{table}` ORDER BY `{date_column}` ASC"
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 return [row[date_column] for row in cursor.fetchall() if row[date_column] is not None]
@@ -367,7 +413,7 @@ class MySQLDeduplicator:
         pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
         where_sql = f"t.`{time_col}` = '{date_val}'"
         # 获取原始数据总量(只统计当天数据)
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 count_where = f"WHERE `{time_col}` = '{date_val}'"
                 count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}` {count_where}"
@@ -394,7 +440,7 @@ class MySQLDeduplicator:
                     del_ids.extend(ids[1:])
             affected_rows = 0
             if not dry_run and del_ids:
-                with self._get_connection() as conn:
+                with self._conn_ctx() as conn:
                     with conn.cursor() as cursor:
                         for i in range(0, len(del_ids), self.batch_size):
                             batch_ids = del_ids[i:i+self.batch_size]
@@ -418,7 +464,7 @@ class MySQLDeduplicator:
             GROUP BY {column_list}
             HAVING COUNT(*) > 1
         """
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 logger.debug('创建临时表SQL', {'sql': create_temp_sql})
                 cursor.execute(create_temp_sql)
@@ -484,7 +530,7 @@ class MySQLDeduplicator:
         pk = self.primary_key
         pk_real = next((c for c in all_columns if c.lower() == pk.lower()), pk)
         # 获取原始数据总量
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 count_sql = f"SELECT COUNT(*) as cnt FROM `{database}`.`{table}`"
                 logger.debug('执行SQL', {'sql': count_sql})
@@ -508,7 +554,7 @@ class MySQLDeduplicator:
                     del_ids.extend(ids[1:])
             affected_rows = 0
             if not dry_run and del_ids:
-                with self._get_connection() as conn:
+                with self._conn_ctx() as conn:
                     with conn.cursor() as cursor:
                         for i in range(0, len(del_ids), self.batch_size):
                             batch_ids = del_ids[i:i+self.batch_size]
@@ -529,7 +575,7 @@ class MySQLDeduplicator:
             GROUP BY {column_list}
             HAVING COUNT(*) > 1
         """
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 logger.debug('创建临时表SQL', {'sql': create_temp_sql})
                 cursor.execute(create_temp_sql)
@@ -584,7 +630,7 @@ class MySQLDeduplicator:
             logger.error('异常', {"库": database, "表": table, "异常": str(e), 'func': sys._getframe().f_code.co_name, 'traceback': repr(e)})
             if temp_table:
                 try:
-                    with self._get_connection() as conn:
+                    with self._conn_ctx() as conn:
                         with conn.cursor() as cursor:
                             drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
                             cursor.execute(drop_temp_sql)
@@ -628,13 +674,14 @@ class MySQLDeduplicator:
             logger.info('单表开始', {
                 "库": database,
                 "表": table,
-                "参数": {
-
-
-
-
-
-
+                # "参数": {
+                #     "指定去重列": columns,
+                #     "去重方式": "Python" if use_python_dedup else "SQL",
+                #     "数据处理": self.duplicate_keep_mode,
+                #     "模拟运行": dry_run,
+                #     '排除列': self.exclude_columns,
+                # },
+            })
         all_columns = self._get_table_columns(database, table)
         all_columns_lower = [col.lower() for col in all_columns]
         time_col = self.date_column
@@ -680,7 +727,7 @@ class MySQLDeduplicator:
                     logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
                 total_dup += dup_count
                 total_del += affected_rows
-            logger.
+            logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
             # 自动重排id列(仅当有实际删除时且reorder_id为True)
             if reorder_id and total_del > 0:
                 try:
@@ -688,10 +735,12 @@ class MySQLDeduplicator:
                     logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
                 except Exception as e:
                     logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+            if affected_rows > 0:
+                logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
             return (total_dup, total_del)
         # 没有date_column,直接全表去重
         result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
-        logger.
+        logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
         dup_count, affected_rows = result
         if reorder_id and affected_rows > 0:
             try:
@@ -699,6 +748,8 @@ class MySQLDeduplicator:
                 logger.info('自动重排id列完成', {"库": database, "表": table, "结果": reorder_ok})
             except Exception as e:
                 logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
+        if affected_rows > 0:
+            logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
         return result
     except Exception as e:
         logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
@@ -770,7 +821,11 @@ class MySQLDeduplicator:
                     results[table] = (dup_count, affected_rows)
             total_dup = sum(r[0] for r in results.values())
             total_del = sum(r[1] for r in results.values())
-            logger.
+            logger.debug('库完成', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": results})
+            # 只显示有删除的详细结果
+            if total_del > 0:
+                filtered_results = {tbl: res for tbl, res in results.items() if res[1] > 0}
+                logger.info('库完成(仅显示有删除的结果)', {"库": database, "重复组": total_dup, "总删除行": total_del, "详细结果": filtered_results})
             return results
         except Exception as e:
             logger.error('发生全局错误', {"库": database, 'func': sys._getframe().f_code.co_name, "error": str(e), 'traceback': repr(e)})
@@ -819,7 +874,8 @@ class MySQLDeduplicator:
                 'use_python_dedup': use_python_dedup
             },
         })
-
+        # 如果parallel=True且库数量大于1,则只在外层并发,内层串行
+        if parallel and self.max_workers > 1 and len(target_dbs) > 1:
             with concurrent.futures.ThreadPoolExecutor(
                 max_workers=self.max_workers
             ) as executor:
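Review note: the new guard keeps parallelism one level deep: databases fan out across the thread pool while each submitted deduplicate_database call runs its tables serially (the hard-coded False in the submit call below, per the 内层强制串行 comment), so live workers stay bounded by max_workers instead of multiplying. A minimal sketch of the same fan-out shape:

    import concurrent.futures

    def process_db(db: str) -> str:
        # placeholder for deduplicate_database(db, ..., parallel=False)
        return f"{db}: done"

    target_dbs = ['db_a', 'db_b', 'db_c']
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(process_db, db): db for db in target_dbs}
        for fut in concurrent.futures.as_completed(futures):
            print(futures[fut], fut.result())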
@@ -827,6 +883,7 @@ class MySQLDeduplicator:
                 for db in target_dbs:
                     tables = tables_map.get(db) if tables_map else None
                     db_columns_map = columns_map.get(db) if columns_map else None
+                    # 内层强制串行
                     futures[executor.submit(
                         self.deduplicate_database,
                         db, tables, db_columns_map, dry_run, False, reorder_id, use_python_dedup
@@ -855,7 +912,7 @@ class MySQLDeduplicator:
                 r[1] for db in all_results.values()
                 for r in db.values()
             )
-            logger.
+            logger.debug('全局完成', {
                 "总重复组": total_dup,
                 "总删除行": total_del,
                 "参数": {
@@ -867,12 +924,30 @@ class MySQLDeduplicator:
                 },
                 "详细结果": dict(all_results)
             })
+            # 只显示有删除的详细结果
+            if total_del > 0:
+                filtered_results = {
+                    db: {tbl: res for tbl, res in tbls.items() if res[1] > 0}
+                    for db, tbls in all_results.items()
+                }
+                filtered_results = {db: tbls for db, tbls in filtered_results.items() if tbls}
+                logger.info('全局完成(仅显示有删除的结果)', {
+                    "总重复组": total_dup,
+                    "总删除行": total_del,
+                    "参数": {
+                        "模拟运行": dry_run,
+                        "并行处理": parallel,
+                        '排除列': self.exclude_columns,
+                        '重排id': reorder_id,
+                        'use_python_dedup': use_python_dedup
+                    },
+                    "详细结果": filtered_results
+                })
             return all_results
         except Exception as e:
             logger.error('异常', {"error": str(e), 'traceback': repr(e)})
             return all_results
 
-    @_retry_on_failure
     def _check_database_exists(self, database: str) -> bool:
         """
         检查数据库是否存在。
@@ -883,13 +958,11 @@ class MySQLDeduplicator:
             bool: 数据库是否存在。
         """
         sql = "SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = %s"
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database,))
                 return bool(cursor.fetchone())
 
-    @_retry_on_failure
     def _check_table_exists(self, database: str, table: str) -> bool:
         """
         检查表是否存在。
@@ -905,13 +978,11 @@ class MySQLDeduplicator:
             FROM INFORMATION_SCHEMA.TABLES
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
         """
-
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(sql, (database, table))
                 return bool(cursor.fetchone())
 
-    @_retry_on_failure
     def _get_table_info(self, database: str, table: str, id_column: str = None):
         """
         获取表的所有列名、主键列名列表、指定id列是否为主键。
@@ -923,7 +994,7 @@ class MySQLDeduplicator:
             Tuple[List[str], List[str], bool]: (所有列名, 主键列名, id列是否为主键)
         """
         id_column = id_column or self.primary_key
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute("""
                     SELECT COLUMN_NAME, COLUMN_KEY
@@ -1032,7 +1103,7 @@ class MySQLDeduplicator:
             logger.warning('主键不是单列id,跳过id重排', {"库": database, "表": table, "主键列": pk_cols})
             return False
         # 检查外键约束
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
            with conn.cursor() as cursor:
                cursor.execute("""
                    SELECT * FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
@@ -1042,7 +1113,7 @@ class MySQLDeduplicator:
             logger.warning('表存在外键约束,跳过id重排', {"库": database, "表": table})
             return False
         # 获取表结构
-        with self._get_connection() as conn:
+        with self._conn_ctx() as conn:
             with conn.cursor() as cursor:
                 cursor.execute(f"SHOW CREATE TABLE {table_quoted}")
                 create_table_sql = cursor.fetchone()['Create Table']
@@ -1055,7 +1126,7 @@ class MySQLDeduplicator:
         backup_table = self._make_backup_table_name(table)
         backup_table_quoted = f"`{database}`.`{backup_table}`"
         try:
-            with self._get_connection() as conn:
+            with self._conn_ctx() as conn:
                 with conn.cursor() as cursor:
                     # 1. 创建临时表,结构同原表
                     try:
@@ -1116,7 +1187,7 @@ class MySQLDeduplicator:
                             logger.error('回滚恢复原表失败', {"库": database, "表": table, "异常": str(e)})
                             return False
                     logger.info('id重排成功且数据量一致', {"库": database, "表": table, "新表": new_cnt, "备份表": old_cnt, "备份表名": backup_table})
-                    # 5.
+                    # 5. 自动删除备份表
                     if auto_drop_backup:
                         try:
                             cursor.execute(f"DROP TABLE {backup_table_quoted}")
@@ -1127,7 +1198,7 @@ class MySQLDeduplicator:
         except Exception as e:
             logger.error('id重排异常,准备回滚', {"库": database, "表": table, "异常": str(e)})
             # 回滚:如临时表存在则删掉,恢复原表结构
-            with self._get_connection() as conn:
+            with self._conn_ctx() as conn:
                 with conn.cursor() as cursor:
                     try:
                         cursor.execute(f"DROP TABLE IF EXISTS {temp_table_quoted}")
@@ -1135,7 +1206,7 @@ class MySQLDeduplicator:
                         logger.error('回滚时删除临时表失败', {"库": database, "表": table, "异常": str(drop_e)})
             # 恢复原表(如备份表存在)
             try:
-                with self._get_connection() as conn2:
+                with self._conn_ctx() as conn2:
                     with conn2.cursor() as cursor2:
                         if self._check_table_exists(database, backup_table):
                             cursor2.execute(f"DROP TABLE IF EXISTS {table_quoted}")
@@ -1227,23 +1298,16 @@ def main():
         batch_size=1000,
         skip_system_dbs=True,
         max_retries=3,
-
-        pool_size=
+        retry_waiting_time=5,
+        # pool_size=30,
         recent_month=1,
         # date_range=['2025-06-09', '2025-06-10'],
         date_column='日期',
-        exclude_columns=None,
         exclude_databases=['测试库4'],
         exclude_tables={
             '推广数据2': [
                 '地域报表_城市_2025_04',
-                '地域报表_城市_2025_05',
-                '地域报表_城市_2025_06',
                 # '地域报表_城市_2025_04_copy1',
-                # '地域报表_城市_2025_05_copy1',
-                # '地域报表_城市_2025_06_copy1',
-                '奥莱店_主体报表',
-                # '奥莱店_主体报表_copy1',
             ],
             "生意参谋3": [
                 "商品排行_2025",
@@ -1255,10 +1319,10 @@ def main():
     deduplicator.deduplicate_all(dry_run=False, parallel=True, reorder_id=True)
 
     # # 指定数据库去重(多线程)
-    # deduplicator.deduplicate_database('
+    # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reorder_id=True)
 
     # # 指定表去重(使用特定列)
-    # deduplicator.deduplicate_table('
+    # deduplicator.deduplicate_table('my_db', 'my_table', columns=['name', 'data'], dry_run=False, reorder_id=True)
 
     # # 重排id列
     # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
mdbq/mysql/uploader.py
CHANGED
@@ -23,8 +23,8 @@ logger = mylogger.MyLogger(
     max_log_size=50,
     backup_count=5,
     enable_async=False,  # 是否启用异步日志
-    sample_rate=1,  # 采样
-    sensitive_fields=[],  #
+    sample_rate=1,  # 采样DEBUG/INFO日志, 0.5表示50%的日志会被采样
+    sensitive_fields=[],  # 过滤敏感字段列表
 )
 
 
@@ -83,7 +83,7 @@ class MySQLUploader:
         charset: str = 'utf8mb4',
         collation: str = 'utf8mb4_0900_ai_ci',
         max_retries: int = 10,
-
+        retry_waiting_time: int = 10,
         pool_size: int = 5,
         connect_timeout: int = 10,
         read_timeout: int = 30,
@@ -100,7 +100,7 @@ class MySQLUploader:
         :param charset: 字符集,默认为utf8mb4
         :param collation: 排序规则,默认为utf8mb4_0900_ai_ci,对大小写不敏感,utf8mb4_0900_as_cs/utf8mb4_bin: 对大小写敏感
         :param max_retries: 最大重试次数,默认为10
-        :param
+        :param retry_waiting_time: 重试间隔(秒),默认为10
         :param pool_size: 连接池大小,默认为5
         :param connect_timeout: 连接超时(秒),默认为10
         :param read_timeout: 读取超时(秒),默认为30
@@ -114,7 +114,7 @@ class MySQLUploader:
         self.charset = charset
         self.collation = collation
         self.max_retries = max(max_retries, 1)
-        self.
+        self.retry_waiting_time = max(retry_waiting_time, 1)
         self.pool_size = max(pool_size, 1)
         self.connect_timeout = connect_timeout
         self.read_timeout = read_timeout
@@ -169,7 +169,7 @@ class MySQLUploader:
         }
         try:
             pool = PooledDB(**pool_params)
-            logger.
+            logger.debug('连接池创建成功', {'连接池': self.pool_size, 'host': self.host, 'port': self.port})
             return pool
         except Exception as e:
             self.pool = None
@@ -188,14 +188,11 @@ class MySQLUploader:
         def wrapper(self, *args, **kwargs):
             last_exception = None
             operation = func.__name__
-            logger.debug(f'开始执行操作: {operation}', {'max_retries': self.max_retries})
             for attempt in range(self.max_retries):
                 try:
                     result = func(self, *args, **kwargs)
                     if attempt > 0:
                         logger.info('操作成功(重试后)', {'operation': operation, 'attempts': attempt + 1})
-                    else:
-                        logger.debug('操作成功', {'operation': operation})
                     return result
                 except (pymysql.OperationalError, pymysql.err.MySQLError) as e:
                     last_exception = e
@@ -207,7 +204,7 @@ class MySQLUploader:
                         'max_retries': self.max_retries
                     }
                     if attempt < self.max_retries - 1:
-                        wait_time = self.
+                        wait_time = self.retry_waiting_time * (attempt + 1)
                         error_details['wait_time'] = wait_time
                         logger.warning('数据库操作失败,准备重试', error_details)
                         time.sleep(wait_time)
@@ -218,13 +215,6 @@ class MySQLUploader:
                             logger.error('重连失败', {'error': str(reconnect_error)})
                     else:
                         logger.error('操作最终失败', error_details)
-                except pymysql.IntegrityError as e:
-                    logger.error('完整性约束错误', {
-                        'operation': operation,
-                        'error_code': e.args[0] if e.args else None,
-                        'error_message': e.args[1] if len(e.args) > 1 else None
-                    })
-                    raise e
                 except Exception as e:
                     last_exception = e
                     logger.error('发生意外错误', {
@@ -247,10 +237,9 @@ class MySQLUploader:
         """
         try:
             conn = self.pool.connection()
-            logger.debug('获取数据库连接', {'host': self.host, 'port': self.port})
            return conn
        except Exception as e:
-            logger.error('
+            logger.error('从连接池获取数据库连接失败', {'error': str(e)})
            raise ConnectionError(f'连接数据库失败: {str(e)}')
 
     @_execute_with_retry
@@ -392,7 +381,8 @@ class MySQLUploader:
         primary_keys: Optional[List[str]] = None,
         date_column: Optional[str] = None,
         indexes: Optional[List[str]] = None,
-        allow_null: bool = False
+        allow_null: bool = False,
+        unique_keys: Optional[List[List[str]]] = None
     ) -> None:
         """
         创建数据表,优化索引创建方式
@@ -402,39 +392,48 @@ class MySQLUploader:
         if not set_typ:
             logger.error('建表时未指定set_typ', {'库': db_name, '表': table_name})
             raise ValueError('set_typ 未指定')
+        # set_typ的键清洗
+        set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
         column_defs = ["`id` INT NOT NULL AUTO_INCREMENT"]
         for col_name, col_type in set_typ.items():
-            if col_name
+            if col_name == 'id':
                 continue
-            safe_col_name = self.
+            safe_col_name = self._normalize_col(col_name)
             col_def = f"`{safe_col_name}` {col_type}"
             if not allow_null and not col_type.lower().startswith('json'):
                 col_def += " NOT NULL"
             column_defs.append(col_def)
-
-
-
+        # 主键处理逻辑调整
+        if primary_keys and len(primary_keys) > 0:
+            safe_primary_keys = [self._normalize_col(pk) for pk in primary_keys]
+            primary_key_sql = f"PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
         else:
-
-
-            primary_key_sql = f", PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
+            safe_primary_keys = [self._normalize_col('id')]
+            primary_key_sql = f"PRIMARY KEY (`id`)"
         # 索引统一在CREATE TABLE中定义
         index_defs = []
         if date_column and date_column in set_typ:
-            safe_date_col = self.
+            safe_date_col = self._normalize_col(date_column)
             index_defs.append(f"INDEX `idx_{safe_date_col}` (`{safe_date_col}`)")
         if indexes:
             for idx_col in indexes:
                 if idx_col in set_typ:
-                    safe_idx_col = self.
+                    safe_idx_col = self._normalize_col(idx_col)
                     index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
+        # UNIQUE KEY定义
+        unique_defs = []
+        if unique_keys:
+            for idx, unique_cols in enumerate(unique_keys):
+                if not unique_cols:
+                    continue
+                safe_unique_cols = [self._normalize_col(col) for col in unique_cols]
+                unique_name = f"uniq_{'_'.join(safe_unique_cols)}_{idx}"
+                unique_defs.append(f"UNIQUE KEY `{unique_name}` (`{'`,`'.join(safe_unique_cols)}`)")
         index_defs = list(set(index_defs))
-
+        all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
         sql = f"""
         CREATE TABLE IF NOT EXISTS `{db_name}`.`{table_name}` (
-            {','.join(
-            {primary_key_sql}
-            {index_sql}
+            {','.join(all_defs)}
         ) ENGINE=InnoDB DEFAULT CHARSET={self.charset} COLLATE={self.collation}
         """
         conn = None
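Review note: folding unique_defs into all_defs means columns, primary key, secondary indexes and unique keys are all emitted in a single CREATE TABLE, avoiding follow-up ALTERs on first creation. For a hypothetical set_typ of {'日期': 'DATE', 'name': 'VARCHAR(255)'} with date_column='日期' and unique_keys=[['日期', 'name']], the assembled pieces would join to something like:

    CREATE TABLE IF NOT EXISTS `my_db`.`my_table` (
        `id` INT NOT NULL AUTO_INCREMENT,
        `日期` DATE NOT NULL,
        `name` VARCHAR(255) NOT NULL,
        PRIMARY KEY (`id`),
        INDEX `idx_日期` (`日期`),
        UNIQUE KEY `uniq_日期_name_0` (`日期`,`name`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci

(exact whitespace differs; ','.join(all_defs) produces a single comma-separated run).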
@@ -443,7 +442,7 @@ class MySQLUploader:
             with conn.cursor() as cursor:
                 cursor.execute(sql)
                 conn.commit()
-                logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes})
+                logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
         except Exception as e:
             logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
             if conn is not None:
@@ -476,11 +475,9 @@ class MySQLUploader:
         try:
             if date_type:
                 result = pd.to_datetime(datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d'))
-                logger.debug('日期格式化成功', {'原始': value, '格式': fmt, '结果': str(result)})
                 return result
             else:
                 result = datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d %H:%M:%S')
-                logger.debug('日期格式化成功', {'原始': value, '格式': fmt, '结果': str(result)})
                 return result
         except ValueError:
             continue
@@ -613,7 +610,7 @@ class MySQLUploader:
                     cursor.execute(sql_check, (db_name, table_name, column))
                     exists = cursor.fetchone()
                     if exists and list(exists.values())[0] > 0:
-                        logger.debug('
+                        logger.debug('索引检查', {'库': db_name, '表': table_name, '索引列': column})
                         return
                     cursor.execute(sql_create)
                     conn.commit()
@@ -622,6 +619,49 @@ class MySQLUploader:
             logger.error('创建索引失败', {'库': db_name, '表': table_name, '列': column, '错误': str(e)})
             raise
 
+    def _get_existing_unique_keys(self, db_name: str, table_name: str) -> List[List[str]]:
+        """
+        获取表中所有UNIQUE KEY的列组合(不含主键)。
+        返回:[[col1, col2], ...]
+        """
+        db_name = self._validate_identifier(db_name)
+        table_name = self._validate_identifier(table_name)
+        sql = '''
+            SELECT INDEX_NAME, COLUMN_NAME
+            FROM INFORMATION_SCHEMA.STATISTICS
+            WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND NON_UNIQUE = 0 AND INDEX_NAME != 'PRIMARY'
+            ORDER BY INDEX_NAME, SEQ_IN_INDEX
+        '''
+        unique_map = {}
+        try:
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql, (db_name, table_name))
+                    for row in cursor.fetchall():
+                        idx = row['INDEX_NAME']
+                        col = row['COLUMN_NAME']
+                        unique_map.setdefault(idx, []).append(col)
+        except Exception as e:
+            logger.warning('获取UNIQUE KEY信息失败', {'库': db_name, '表': table_name, '错误': str(e)})
+        # 只返回列名组合,全部清洗小写
+        return [[self._normalize_col(c) for c in cols] for cols in unique_map.values() if cols]
+
+    def _add_unique_key(self, db_name: str, table_name: str, unique_cols: List[str]):
+        """
+        添加UNIQUE KEY
+        """
+        safe_cols = [self._normalize_col(col) for col in unique_cols]
+        unique_name = f"uniq_{'_'.join(safe_cols)}_{int(time.time()*1000)%100000}"
+        sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({','.join(f'`{col}`' for col in safe_cols)})'
+        try:
+            with self._get_connection() as conn:
+                with conn.cursor() as cursor:
+                    cursor.execute(sql)
+                    conn.commit()
+            logger.info('添加唯一约束列成功', {'库': db_name, '表': table_name, '列': unique_cols})
+        except Exception as e:
+            logger.warning('唯一约束列添加失败', {'库': db_name, '表': table_name, '列': unique_cols, '错误': str(e)})
+
     def _upload_to_table(
         self,
         db_name: str,
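Review note: _get_existing_unique_keys reads INFORMATION_SCHEMA.STATISTICS with NON_UNIQUE = 0 and reassembles multi-column keys by grouping on INDEX_NAME in SEQ_IN_INDEX order, which is the right way to recover composite constraints. One portability caveat in _add_unique_key: the ALTER statement nests single quotes inside a single-quoted f-string ({','.join(...)}), which is a SyntaxError before PEP 701 (Python 3.12). A version-neutral equivalent builds the column list first (names below are placeholders):

    safe_cols = ['日期', 'name']
    unique_name = 'uniq_日期_name_12345'
    col_list = ','.join(f'`{col}`' for col in safe_cols)
    sql = (f'ALTER TABLE `my_db`.`my_table` '
           f'ADD UNIQUE KEY `{unique_name}` ({col_list})')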
@@ -637,14 +677,15 @@ class MySQLUploader:
         indexes: Optional[List[str]],
         batch_id: Optional[str] = None,
         update_on_duplicate: bool = False,
-        transaction_mode: str = "batch"
+        transaction_mode: str = "batch",
+        unique_keys: Optional[List[List[str]]] = None
     ):
         """实际执行表上传的方法"""
-
-        if not self._check_table_exists(db_name, table_name):
+        table_existed = self._check_table_exists(db_name, table_name)
+        if not table_existed:
             if auto_create:
                 self._create_table(db_name, table_name, set_typ, primary_keys, date_column, indexes,
-                                   allow_null=allow_null)
+                                   allow_null=allow_null, unique_keys=unique_keys)
             else:
                 logger.error('数据表不存在', {
                     '库': db_name,
@@ -652,8 +693,30 @@ class MySQLUploader:
                     'func': sys._getframe().f_code.co_name,
                 })
                 raise ValueError(f"数据表不存在: `{db_name}`.`{table_name}`")
-
-
+        if table_existed and unique_keys:
+            try:
+                exist_ukeys = self._get_existing_unique_keys(db_name, table_name)
+                exist_ukeys_norm = [sorted([c.lower() for c in uk]) for uk in exist_ukeys]
+                filtered_ukeys = [uk for uk in unique_keys if 1 <= len(uk) <= 20]
+                to_add = []
+                for uk in filtered_ukeys:
+                    norm_uk = sorted([c.lower() for c in uk])
+                    if norm_uk not in exist_ukeys_norm:
+                        to_add.append(uk)
+                max_unique_keys = 10
+                if len(exist_ukeys) + len(to_add) > max_unique_keys:
+                    logger.warning('unique_keys超限', {
+                        '库': db_name,
+                        '表': table_name,
+                        '已存在': exist_ukeys,
+                        '本次待添加': to_add,
+                        '最大数量': max_unique_keys
+                    })
+                    to_add = to_add[:max_unique_keys - len(exist_ukeys)]
+                for uk in to_add:
+                    self._add_unique_key(db_name, table_name, uk)
+            except Exception as e:
+                logger.warning('动态unique key处理异常', {'库': db_name, '表': table_name, '错误': str(e)})
         table_columns = self._get_table_columns(db_name, table_name)
         if not table_columns:
             logger.error('获取列失败', {
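Review note: requested and existing unique keys are compared as sorted, lower-cased column lists, so ['name', '日期'] and ['日期', 'name'] count as the same constraint and nothing is re-added. That is sensible dedup, but column order still matters to MySQL's leftmost-prefix index usage, so two orderings that are "equal" here are not interchangeable for query planning. The comparison in isolation:

    exist_ukeys = [['日期', 'name']]                  # already on the table
    requested = [['name', '日期'], ['日期', 'shop']]  # hypothetical upload request
    exist_norm = [sorted(c.lower() for c in uk) for uk in exist_ukeys]
    to_add = [uk for uk in requested
              if sorted(c.lower() for c in uk) not in exist_norm]
    print(to_add)  # [['日期', 'shop']]; the reordered duplicate is skipped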
@@ -663,8 +726,6 @@ class MySQLUploader:
                 'func': sys._getframe().f_code.co_name,
             })
             raise ValueError(f"获取列失败 `{db_name}`.`{table_name}`")
-
-        # 验证数据列与表列匹配
         for col in set_typ:
             if col not in table_columns:
                 logger.error('列不存在', {
@@ -674,22 +735,19 @@ class MySQLUploader:
                     'func': sys._getframe().f_code.co_name,
                 })
                 raise ValueError(f"列不存在: `{col}` -> `{db_name}`.`{table_name}`")
-
-        # 确保分表参考字段为索引
         if date_column and date_column in table_columns:
             try:
                 self._ensure_index(db_name, table_name, date_column)
             except Exception as e:
                 logger.warning('分表参考字段索引创建失败', {'库': db_name, '表': table_name, '列': date_column, '错误': str(e)})
-
-        # 插入数据
-        self._insert_data(
+        inserted, skipped, failed = self._insert_data(
             db_name, table_name, data, set_typ,
             check_duplicate, duplicate_columns,
             batch_id=batch_id,
             update_on_duplicate=update_on_duplicate,
             transaction_mode=transaction_mode
         )
+        return inserted, skipped, failed
 
     def _infer_data_type(self, value: Any, no_log: bool = False) -> str:
         """
@@ -817,11 +875,8 @@ class MySQLUploader:
         # 统一处理原始数据中列名的特殊字符
         data = self.normalize_column_names(data)
 
-        # set_typ
-        if self.case_sensitive:
-            set_typ = {k: v for k, v in set_typ.items()}
-        else:
-            set_typ = {k.lower(): v for k, v in set_typ.items()}
+        # set_typ的键清洗
+        set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
 
         # 获取数据中实际存在的列名
         data_columns = set()
@@ -890,7 +945,8 @@ class MySQLUploader:
         auto_create: bool = True,
         indexes: Optional[List[str]] = None,
         update_on_duplicate: bool = False,
-        transaction_mode: str = "batch"
+        transaction_mode: str = "batch",
+        unique_keys: Optional[List[List[str]]] = None
     ):
         """
         上传数据到数据库的主入口方法,分表逻辑异常处理统计丢弃数据
@@ -912,6 +968,7 @@ class MySQLUploader:
             - 'row' : 逐行提交事务(错误隔离性好)
             - 'batch' : 整批提交事务(性能最优)
             - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
+        :param unique_keys: 唯一约束列表,每个元素为列名列表,支持多列组合唯一约束
         :raises: 可能抛出各种验证和数据库相关异常
         """
         # upload_start = time.time()
@@ -936,7 +993,8 @@ class MySQLUploader:
             # '自动建表': auto_create,
             '索引': indexes,
             '更新旧数据': update_on_duplicate,
-            '事务模式': transaction_mode
+            '事务模式': transaction_mode,
+            '唯一约束': unique_keys
         },
         # '数据样例': self._shorten_for_log(data, 2)
     })
@@ -1005,15 +1063,21 @@ class MySQLUploader:
                     continue
 
             # 对每个分表执行上传
+            total_inserted = 0
+            total_skipped = dropped_rows  # 分表异常丢弃
+            total_failed = 0
             for part_table, part_data in partitioned_data.items():
                 try:
-                    self._upload_to_table(
+                    inserted, skipped, failed = self._upload_to_table(
                         db_name, part_table, part_data, filtered_set_typ,
                         primary_keys, check_duplicate, duplicate_columns,
                         allow_null, auto_create, partition_date_column,
-                        indexes, batch_id, update_on_duplicate, transaction_mode
+                        indexes, batch_id, update_on_duplicate, transaction_mode,
+                        unique_keys
                     )
-
+                    total_inserted += inserted
+                    total_skipped += skipped
+                    total_failed += failed
                     if partition_date_column in filtered_set_typ:
                         try:
                             self._ensure_index(db_name, part_table, partition_date_column)
@@ -1031,13 +1095,16 @@ class MySQLUploader:
                     continue  # 跳过当前分表,继续处理其他分表
         else:
             # 不分表,直接上传
-            self._upload_to_table(
+            inserted, skipped, failed = self._upload_to_table(
                 db_name, table_name, prepared_data, filtered_set_typ,
                 primary_keys, check_duplicate, duplicate_columns,
                 allow_null, auto_create, partition_date_column,
-                indexes, batch_id, update_on_duplicate, transaction_mode
+                indexes, batch_id, update_on_duplicate, transaction_mode,
+                unique_keys
             )
-
+            total_inserted = inserted
+            total_skipped = skipped
+            total_failed = failed
             if partition_date_column in filtered_set_typ:
                 try:
                     self._ensure_index(db_name, table_name, partition_date_column)
@@ -1062,7 +1129,9 @@ class MySQLUploader:
             '批次': batch_id,
             'finish': success_flag,
             '数据行': initial_row_count,
-            '
+            '插入': total_inserted,
+            '跳过': total_skipped,
+            '失败': total_failed
         })
 
     @_execute_with_retry
@@ -1095,26 +1164,19 @@ class MySQLUploader:
             - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
         """
         if not data:
-            return
-
-        # 验证事务模式
+            return 0, 0, 0
         transaction_mode = self._validate_transaction_mode(transaction_mode)
-
-        # 准备SQL语句
         sql = self._prepare_insert_sql(
             db_name, table_name, set_typ,
             check_duplicate, duplicate_columns,
             update_on_duplicate
         )
-
-        # 执行批量插入
         total_inserted, total_skipped, total_failed = self._execute_batch_insert(
             db_name, table_name, data, set_typ,
             sql, check_duplicate, duplicate_columns,
             batch_id, transaction_mode,
             update_on_duplicate
         )
-
         logger.info('插入完成', {
             '库': db_name,
             '表': table_name,
@@ -1124,6 +1186,7 @@ class MySQLUploader:
             '失败': total_failed,
             '事务模式': transaction_mode,
         })
+        return total_inserted, total_skipped, total_failed
 
     def _validate_transaction_mode(self, mode: str) -> str:
         """验证并标准化事务模式"""
@@ -1266,6 +1329,7 @@ class MySQLUploader:
         update_on_duplicate: bool = False
     ) -> Tuple[int, int, int]:
         """执行批量插入操作,优化batch和hybrid模式"""
+        import pymysql  # 确保异常类型可用
         def get_optimal_batch_size(total_rows: int) -> int:
             if total_rows <= 100:
                 return total_rows
@@ -1295,7 +1359,13 @@ class MySQLUploader:
                 try:
                     cursor.executemany(sql, values_list)
                     conn.commit()
-
+                    inserted = cursor.rowcount if cursor.rowcount is not None else 0
+                    total_inserted += inserted
+                    total_skipped += len(batch) - inserted
+                except pymysql.err.IntegrityError as e:
+                    conn.rollback()
+                    total_skipped += len(batch)
+                    logger.debug('批量插入唯一约束冲突,全部跳过', {'库': db_name, '表': table_name, '错误': str(e)})
                 except Exception as e:
                     conn.rollback()
                     total_failed += len(batch)
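Review note: batch mode now derives skipped as len(batch) - rowcount, which is correct for a plain INSERT (rowcount counts inserted rows). Be aware, though, that MySQL reports 2 affected rows for each row updated by INSERT ... ON DUPLICATE KEY UPDATE, so when update_on_duplicate is combined with batch mode, rowcount can exceed len(batch) and push the skipped figure negative. A guarded variant of the same arithmetic (illustrative, not in the diff):

    batch_len, rowcount = 100, 130         # e.g. 30 rows hit the UPDATE branch
    inserted = min(rowcount, batch_len)    # clamp so accounting stays sane
    skipped = max(batch_len - rowcount, 0)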
@@ -1311,7 +1381,15 @@ class MySQLUploader:
                     dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
                     values += [row.get(col) for col in dup_cols]
                 cursor.execute(sql, values)
-
+                affected = cursor.rowcount if cursor.rowcount is not None else 0
+                if affected > 0:
+                    total_inserted += 1
+                else:
+                    total_skipped += 1
+            except pymysql.err.IntegrityError as e:
+                conn.rollback()
+                total_skipped += 1
+                logger.debug('hybrid单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
             except Exception as e:
                 conn.rollback()
                 total_failed += 1
@@ -1325,8 +1403,16 @@ class MySQLUploader:
                     dup_cols = duplicate_columns if duplicate_columns else [col for col in all_columns if col.lower() not in self.base_excute_col]
                     values += [row.get(col) for col in dup_cols]
                 cursor.execute(sql, values)
+                affected = cursor.rowcount if cursor.rowcount is not None else 0
+                if affected > 0:
+                    total_inserted += 1
+                else:
+                    total_skipped += 1
                 conn.commit()
-
+            except pymysql.err.IntegrityError as e:
+                conn.rollback()
+                total_skipped += 1
+                logger.debug('单行插入唯一约束冲突,跳过', {'库': db_name, '表': table_name, '错误': str(e)})
             except Exception as e:
                 conn.rollback()
                 total_failed += 1
@@ -1347,9 +1433,9 @@ class MySQLUploader:
                 self.pool = None
             except Exception as e:
                 logger.warning('关闭连接池时出错', {'error': str(e)})
-            logger.
+            logger.debug('finished', {'uploader.py': '连接池关闭'})
         except Exception as e:
-            logger.error('关闭连接池失败', {'
+            logger.error('关闭连接池失败', {'uploader.py': str(e)})
             raise
 
     def _check_pool_health(self) -> bool:
@@ -1431,6 +1517,13 @@ class MySQLUploader:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
 
+    def _normalize_col(self, col: str) -> str:
+        """
+        列名自动清洗并转小写(如case_sensitive为False),保证和表结构一致。
+        """
+        safe = self._validate_identifier(col)
+        return safe if self.case_sensitive else safe.lower()
+
 
 def main():
     """
@@ -1443,7 +1536,7 @@ def main():
     """
     uploader = MySQLUploader(
         username='root',
-        password='
+        password='pwd',
         host='localhost',
         port=3306,
     )
@@ -1462,7 +1555,7 @@ def main():
         {'日期': '2023-01-8', 'name': 'JACk', 'AGE': '24', 'salary': 555.1545},
         {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 35, 'salary': '100'},
         {'日期': '2023-01-15', 'name': 'Alice', 'AGE': 30, 'salary': 0.0},
-        {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75}
+        {'日期': '2023-02-20', 'name': 'Bob', 'AGE': 25, 'salary': 45000.75},
     ]
 
     # 上传数据
@@ -1474,12 +1567,13 @@ def main():
         primary_keys=[],  # 创建唯一主键
         check_duplicate=False,  # 检查重复数据
        duplicate_columns=[],  # 指定排重的组合键
+        update_on_duplicate=False,  # 更新旧数据
        allow_null=False,  # 允许插入空值
-        partition_by='year',  #
+        partition_by='year',  # 分表方式
        partition_date_column='日期',  # 用于分表的日期列名,默认为'日期'
-
-        indexes=[],  # 指定索引列
+        indexes=[],  # 普通索引列
        transaction_mode='row',  # 事务模式
+        unique_keys=[['日期', 'name', 'age']]  # 唯一约束列表
    )
 
     uploader.close()
{mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=vHfePSxiigIQg58VIYYk2QYh_4AtpXtMsfV3nHXNUhg,18
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,10 +8,10 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
 mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/deduplicator.py,sha256=
+mdbq/mysql/deduplicator.py,sha256=KMJ_YyqAniaLVRqOHLgO92PgwknIDB-EgaOY7S6iMZ4,68599
 mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
 mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
-mdbq/mysql/uploader.py,sha256=
+mdbq/mysql/uploader.py,sha256=PD8gA2PixoK2ZH4vWTmz1kbNTab8VGUJLoepD024H5Q,70265
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
-mdbq-3.11.11.dist-info/METADATA,sha256=
-mdbq-3.11.11.dist-info/WHEEL,sha256=
-mdbq-3.11.11.dist-info/top_level.txt,sha256=
-mdbq-3.11.11.dist-info/RECORD,,
+mdbq-3.12.1.dist-info/METADATA,sha256=viVkeKnHLlpvAxthu_c50VYyla5Uc2COG99IigfDPmc,364
+mdbq-3.12.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.12.1.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.12.1.dist-info/RECORD,,

{mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/WHEEL
File without changes

{mdbq-3.11.11.dist-info → mdbq-3.12.1.dist-info}/top_level.txt
File without changes