PyPI - mdbq - Versions diffs - 4.2.7__tar.gz → 4.2.9__tar.gz - Mend

mdbq 4.2.7.tar.gz → 4.2.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mdbq might be problematic. Click here for more details.

Files changed (46) hide show

{mdbq-4.2.7 → mdbq-4.2.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mdbq
-Version: 4.2.7
+Version: 4.2.9
 Home-page: https://pypi.org/project/mdbq
 Author: xigua,
 Author-email: 2587125111@qq.com

mdbq-4.2.9/mdbq/__version__.py ADDED Viewed

@@ -0,0 +1 @@
+VERSION = '4.2.9'

{mdbq-4.2.7 → mdbq-4.2.9}/mdbq/mysql/uploader.py RENAMED Viewed

@@ -11,8 +11,6 @@ from typing import Union, List, Dict, Optional, Any, Tuple, Iterator
 from functools import wraps
 from decimal import Decimal, InvalidOperation
 import math
-import concurrent.futures
-import threading
 import pymysql
 import pandas as pd
 import psutil
@@ -64,6 +62,17 @@ class DatabaseConnectionManager:
             'autocommit': False
         }
+        # 设置时区为北京时间，确保时间戳的一致性
+        if 'init_command' not in self.config:
+            pool_params['init_command'] = "SET time_zone = '+08:00'"
+        else:
+            # 如果用户已设置init_command，则追加时区设置
+            existing_commands = self.config['init_command']
+            if 'time_zone' not in existing_commands.lower():
+                pool_params['init_command'] = f"{existing_commands}; SET time_zone = '+08:00'"
+            else:
+                pool_params['init_command'] = existing_commands
         if self.config.get('ssl'):
             pool_params['ssl'] = self.config['ssl']
@@ -256,18 +265,26 @@ class DataTypeInferrer:
         # 采样数据进行类型推断
         sample_data = data[:sample_size] if len(data) > sample_size else data
+        # 首先收集所有列名
+        all_columns = set()
+        for row in sample_data:
+            for col in row.keys():
+                if col.lower() not in ['id', 'create_at', 'update_at']:
+                    all_columns.add(col)
+        # 为每个列初始化候选类型列表
+        for col in all_columns:
+            type_candidates[col] = []
         for row in sample_data:
             for col, value in row.items():
                 # 跳过系统列
                 if col.lower() in ['id', 'create_at', 'update_at']:
                     continue
-                if value is not None and str(value).strip():
-                    mysql_type = DataTypeInferrer.infer_mysql_type(value)
-                    if col not in type_candidates:
-                        type_candidates[col] = []
-                    type_candidates[col].append(mysql_type)
+                # 即使值为空，也要推断类型
+                mysql_type = DataTypeInferrer.infer_mysql_type(value)
+                type_candidates[col].append(mysql_type)
         # 为每列选择最合适的类型
         for col, types in type_candidates.items():
@@ -673,6 +690,19 @@ class TableManager:
         db_name = self._sanitize_identifier(db_name)
         table_name = self._sanitize_identifier(table_name)
+        # 验证columns不为空
+        if not columns:
+            raise ValueError(f"创建表失败：columns不能为空。数据库: {db_name}, 表: {table_name}")
+        # 验证unique_keys中的列是否存在于columns中
+        if unique_keys:
+            business_columns = {k.lower(): k for k in columns.keys() if k.lower() not in ['id', 'create_at', 'update_at']}
+            for i, uk in enumerate(unique_keys):
+                for col in uk:
+                    col_lower = col.lower()
+                    if col_lower not in business_columns and col not in columns:
+                        raise ValueError(f"唯一约束中的列 '{col}' 不存在于表定义中。可用列: {list(business_columns.keys())}")
         # 构建列定义
         column_defs = []
@@ -705,8 +735,15 @@ class TableManager:
                     safe_uk_parts = []
                     for col in filtered_uk:
                         safe_col_name = self._sanitize_identifier(col)
-                        # 检查是否需要前缀索引
-                        col_type = columns.get(col, 'varchar(255)').lower()
+                        # 检查是否需要前缀索引 - 优先使用原始列名，然后尝试小写
+                        col_lower = col.lower()
+                        if col in columns:
+                            col_type = columns[col].lower()
+                        elif col_lower in columns:
+                            col_type = columns[col_lower].lower()
+                        else:
+                            col_type = 'varchar(255)'
                         if 'varchar' in col_type:
                             # 提取varchar长度
                             match = re.search(r'varchar\((\d+)\)', col_type)
@@ -716,20 +753,11 @@ class TableManager:
                                 if length > 191:
                                     prefix_length = 191
                                     safe_uk_parts.append(f"`{safe_col_name}`({prefix_length})")
-                                    logger.debug('应用前缀索引', {
-                                        '列名': col,
-                                        '原始长度': length,
-                                        '前缀长度': prefix_length
-                                    })
                                 else:
                                     safe_uk_parts.append(f"`{safe_col_name}`")
                             else:
                                 # 如果没有指定长度，默认使用前缀索引
                                 safe_uk_parts.append(f"`{safe_col_name}`(191)")
-                                logger.debug('应用默认前缀索引', {
-                                    '列名': col,
-                                    '前缀长度': 191
-                                })
                         else:
                             # 非varchar字段保持原样
                             safe_uk_parts.append(f"`{safe_col_name}`")
@@ -749,9 +777,17 @@ class TableManager:
         with self.conn_mgr.get_connection() as conn:
             with conn.cursor() as cursor:
-                cursor.execute(sql)
-                conn.commit()
-                logger.debug('表已创建', {'database': db_name, 'table': table_name})
+                try:
+                    cursor.execute(sql)
+                    conn.commit()
+                    logger.debug('表已创建', {'database': db_name, 'table': table_name})
+                except Exception as e:
+                    logger.error('创建表失败', {
+                        'database': db_name,
+                        'table': table_name,
+                        'error': str(e)
+                    })
+                    raise
     def get_partition_table_name(self, base_name: str, date_value: str, partition_by: str) -> str:
         """获取分表名称"""
@@ -795,8 +831,6 @@ class TableManager:
         return cleaned
 class DataProcessor:
     """数据处理器"""
@@ -1063,6 +1097,12 @@ class MySQLUploader:
     - 支持自动建表、分表、数据类型推断
     - 高可用连接池管理和重试机制
     - 流式批量插入优化
+    - 自动设置数据库连接时区为北京时间(+08:00)，确保时间戳一致性
+    时区说明：
+    - 所有数据库连接会自动设置为北京时间(+08:00)
+    - create_at和update_at列使用CURRENT_TIMESTAMP，会按照连接时区记录时间
+    - 可使用check_timezone_settings()方法验证时区设置
     """
     def __init__(self, username: str, password: str, host: str = 'localhost',
@@ -1152,21 +1192,35 @@ class MySQLUploader:
             normalized_data = DataProcessor.normalize_data(data)
             # 推断或验证列类型
-            if set_typ is None:
+            if set_typ is None or not set_typ:
                 # 取第一个chunk进行类型推断
                 first_chunk = next(iter(normalized_data))
+                if not first_chunk:
+                    raise ValueError("数据为空，无法推断列类型")
                 set_typ = DataTypeInferrer.infer_types_from_data(first_chunk)
                 # 重新创建迭代器
                 normalized_data = DataProcessor.normalize_data(data)
                 logger.debug('自动推断数据类型', {'类型映射': set_typ})
+                # 验证推断结果
+                if not set_typ or not any(col for col in set_typ.keys() if col.lower() not in ['id', 'create_at', 'update_at']):
+                    raise ValueError(f"类型推断失败，无有效业务列。推断结果: {set_typ}")
             # 将set_typ的键统一转为小写
             set_typ = self.tran_set_typ_to_lower(set_typ)
+            # 最终验证：确保有业务列定义
+            business_columns = {k: v for k, v in set_typ.items() if k.lower() not in ['id', 'create_at', 'update_at']}
+            if not business_columns:
+                raise ValueError(f"没有有效的业务列定义。set_typ: {set_typ}")
             # 确保数据库存在
             self.table_mgr.ensure_database_exists(db_name)
             # 处理分表逻辑
             if partition_by:
                 upload_result = self._handle_partitioned_upload(
                     db_name, table_name, normalized_data, set_typ,
@@ -1372,6 +1426,16 @@ class MySQLUploader:
         main_result['failed_rows'] += partition_result['failed_rows']
         main_result['tables_created'].extend(partition_result['tables_created'])
+    def tran_set_typ_to_lower(self, set_typ: Dict[str, str]) -> Dict[str, str]:
+        if not isinstance(set_typ, dict) or set_typ is None:
+            return {}
+        set_typ_lower = {}
+        for key, value in set_typ.items():
+            set_typ_lower[key.lower()] = value
+        return set_typ_lower
     def close(self):
         """关闭连接"""
         if self.conn_mgr:
@@ -1389,178 +1453,6 @@ class MySQLUploader:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
-    def upload_data_concurrent(self, db_name: str, table_name: str,
-                              data: Union[Dict, List[Dict], pd.DataFrame],
-                              set_typ: Optional[Dict[str, str]] = None,
-                              allow_null: bool = False,
-                              partition_by: Optional[str] = None,
-                              partition_date_column: str = '日期',
-                              update_on_duplicate: bool = False,
-                              unique_keys: Optional[List[List[str]]] = None,
-                              max_workers: int = 3) -> Dict[str, Any]:
-        """
-        并发上传数据到MySQL数据库
-        :param max_workers: 最大并发工作线程数
-        :return: 上传结果详情
-        """
-        db_name = db_name.lower()
-        table_name = table_name.lower()
-        result = {
-            'success': False,
-            'inserted_rows': 0,
-            'skipped_rows': 0,
-            'failed_rows': 0,
-            'tables_created': []
-        }
-        try:
-            # 标准化数据为流式迭代器
-            normalized_data = DataProcessor.normalize_data(data, chunk_size=2000)  # 更小的chunk用于并发
-            # 推断或验证列类型
-            if set_typ is None:
-                first_chunk = next(iter(normalized_data))
-                set_typ = DataTypeInferrer.infer_types_from_data(first_chunk)
-                normalized_data = DataProcessor.normalize_data(data, chunk_size=2000)
-                logger.debug('自动推断数据类型', {'类型映射': set_typ})
-            # 将set_typ的键统一转为小写
-            set_typ = self.tran_set_typ_to_lower(set_typ)
-            # 确保数据库存在
-            self.table_mgr.ensure_database_exists(db_name)
-            # 创建线程锁用于表创建的线程安全
-            table_creation_lock = threading.Lock()
-            created_tables_set = set()
-            def process_chunk_worker(chunk_data):
-                """工作线程函数"""
-                try:
-                    if partition_by:
-                        # 分表处理
-                        partitioned_chunk = DataProcessor.partition_data_by_date(
-                            chunk_data, partition_date_column, partition_by
-                        )
-                        chunk_result = {
-                            'inserted_rows': 0,
-                            'skipped_rows': 0,
-                            'failed_rows': 0,
-                            'tables_created': []
-                        }
-                        for partition_suffix, partition_data in partitioned_chunk.items():
-                            partition_table_name = f"{table_name}_{partition_suffix}"
-                            table_key = f"{db_name}.{partition_table_name}"
-                            # 确保表存在（线程安全）
-                            with table_creation_lock:
-                                if table_key not in created_tables_set:
-                                    if not self.table_mgr.table_exists(db_name, partition_table_name):
-                                        self.table_mgr.create_table(db_name, partition_table_name, set_typ,
-                                                                   unique_keys=unique_keys, allow_null=allow_null)
-                                        chunk_result['tables_created'].append(table_key)
-                                    else:
-                                        self.table_mgr.ensure_system_columns(db_name, partition_table_name)
-                                    created_tables_set.add(table_key)
-                            # 准备并插入数据
-                            prepared_data = DataProcessor.prepare_data_for_insert(
-                                partition_data, set_typ, allow_null
-                            )
-                            inserted, skipped, failed = self.data_inserter.insert_data(
-                                db_name, partition_table_name, prepared_data, set_typ, update_on_duplicate
-                            )
-                            chunk_result['inserted_rows'] += inserted
-                            chunk_result['skipped_rows'] += skipped
-                            chunk_result['failed_rows'] += failed
-                    else:
-                        # 单表处理
-                        table_key = f"{db_name}.{table_name}"
-                        with table_creation_lock:
-                            if table_key not in created_tables_set:
-                                if not self.table_mgr.table_exists(db_name, table_name):
-                                    self.table_mgr.create_table(db_name, table_name, set_typ,
-                                                               unique_keys=unique_keys, allow_null=allow_null)
-                                    chunk_result = {'tables_created': [table_key]}
-                                else:
-                                    self.table_mgr.ensure_system_columns(db_name, table_name)
-                                    chunk_result = {'tables_created': []}
-                                created_tables_set.add(table_key)
-                            else:
-                                chunk_result = {'tables_created': []}
-                        prepared_chunk = DataProcessor.prepare_data_for_insert(
-                            chunk_data, set_typ, allow_null
-                        )
-                        inserted, skipped, failed = self.data_inserter.insert_data(
-                            db_name, table_name, prepared_chunk, set_typ, update_on_duplicate
-                        )
-                        chunk_result.update({
-                            'inserted_rows': inserted,
-                            'skipped_rows': skipped,
-                            'failed_rows': failed
-                        })
-                    return chunk_result
-                except Exception as e:
-                    logger.error('并发处理chunk失败', {'错误': str(e)})
-                    return {
-                        'inserted_rows': 0,
-                        'skipped_rows': 0,
-                        'failed_rows': len(chunk_data) if chunk_data else 0,
-                        'tables_created': []
-                    }
-            # 使用线程池执行并发处理
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                # 提交所有任务
-                future_to_chunk = {}
-                for chunk in normalized_data:
-                    if chunk:
-                        future = executor.submit(process_chunk_worker, chunk)
-                        future_to_chunk[future] = len(chunk)
-                # 收集结果
-                for future in concurrent.futures.as_completed(future_to_chunk):
-                    chunk_result = future.result()
-                    result['inserted_rows'] += chunk_result['inserted_rows']
-                    result['skipped_rows'] += chunk_result['skipped_rows']
-                    result['failed_rows'] += chunk_result['failed_rows']
-                    result['tables_created'].extend(chunk_result['tables_created'])
-            # 去重tables_created
-            result['tables_created'] = list(set(result['tables_created']))
-            result['success'] = result['failed_rows'] == 0
-        except Exception as e:
-            logger.error('并发数据上传失败', {
-                '数据库': db_name,
-                '表名': table_name,
-                '错误': str(e)
-            })
-            result['success'] = False
-        return result
-    def tran_set_typ_to_lower(self, set_typ: Dict[str, str]) -> Dict[str, str]:
-        if not isinstance(set_typ, dict):
-            return set_typ
-        set_typ_lower = {}
-        for key, value in set_typ.items():
-            set_typ_lower[key.lower()] = value
-        return set_typ_lower
 # 使用示例
 if __name__ == '__main__':

{mdbq-4.2.7 → mdbq-4.2.9}/mdbq.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mdbq
-Version: 4.2.7
+Version: 4.2.9
 Home-page: https://pypi.org/project/mdbq
 Author: xigua,
 Author-email: 2587125111@qq.com