mdbq 4.0.3__tar.gz → 4.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {mdbq-4.0.3 → mdbq-4.0.5}/PKG-INFO +1 -1
  2. mdbq-4.0.5/mdbq/__version__.py +1 -0
  3. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/aggregation/query_data.py +108 -76
  4. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/mysql/s_query.py +29 -8
  5. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/mysql/uploader.py +85 -47
  6. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq.egg-info/PKG-INFO +1 -1
  7. mdbq-4.0.3/mdbq/__version__.py +0 -1
  8. {mdbq-4.0.3 → mdbq-4.0.5}/README.txt +0 -0
  9. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/__init__.py +0 -0
  10. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/aggregation/__init__.py +0 -0
  11. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/config/__init__.py +0 -0
  12. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/config/config.py +0 -0
  13. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/log/__init__.py +0 -0
  14. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/log/mylogger.py +0 -0
  15. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/log/spider_logging.py +0 -0
  16. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/mysql/__init__.py +0 -0
  17. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/mysql/deduplicator.py +0 -0
  18. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/mysql/mysql.py +0 -0
  19. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/mysql/unique_.py +0 -0
  20. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/other/__init__.py +0 -0
  21. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/other/download_sku_picture.py +0 -0
  22. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/other/otk.py +0 -0
  23. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/other/pov_city.py +0 -0
  24. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/other/ua_sj.py +0 -0
  25. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/pbix/__init__.py +0 -0
  26. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/pbix/pbix_refresh.py +0 -0
  27. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/pbix/refresh_all.py +0 -0
  28. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/redis/__init__.py +0 -0
  29. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/redis/getredis.py +0 -0
  30. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/spider/__init__.py +0 -0
  31. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq/spider/aikucun.py +0 -0
  32. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq.egg-info/SOURCES.txt +0 -0
  33. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq.egg-info/dependency_links.txt +0 -0
  34. {mdbq-4.0.3 → mdbq-4.0.5}/mdbq.egg-info/top_level.txt +0 -0
  35. {mdbq-4.0.3 → mdbq-4.0.5}/setup.cfg +0 -0
  36. {mdbq-4.0.3 → mdbq-4.0.5}/setup.py +0 -0
{mdbq-4.0.3 → mdbq-4.0.5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mdbq
-Version: 4.0.3
+Version: 4.0.5
 Home-page: https://pypi.org/project/mdbq
 Author: xigua,
 Author-email: 2587125111@qq.com

mdbq-4.0.5/mdbq/__version__.py (new file)
@@ -0,0 +1 @@
+VERSION = '4.0.5'
{mdbq-4.0.3 → mdbq-4.0.5}/mdbq/aggregation/query_data.py
@@ -14,7 +14,8 @@ import platform
 import os
 import time
 import calendar
-import concurrent.futures
+from collections.abc import Mapping, Sequence
+import inspect

 dir_path = os.path.expanduser("~")
 config_file = os.path.join(dir_path, 'spd.txt')
@@ -36,6 +37,47 @@ logger = mylogger.MyLogger(
 )


+def reorder_columns(df: pd.DataFrame, set_type) -> pd.DataFrame:
+    """
+    Reorder the DataFrame's columns to follow the order given in set_type,
+    matching names case-insensitively; entries absent from df are skipped.
+    set_type may be a list or a dict (whose keys then define the order).
+    Data and dtypes are left untouched.
+    If set_type is None, an empty list, or an empty dict, df is returned unchanged.
+    """
+    # Cases that return the original df directly
+    if set_type is None:
+        return df
+    if isinstance(set_type, Mapping) and len(set_type) == 0:
+        return df
+    if isinstance(set_type, Sequence) and not isinstance(set_type, str) and len(set_type) == 0:
+        return df
+
+    # If set_type is a mapping, take its keys as the order
+    if isinstance(set_type, Mapping):
+        col_order = list(set_type.keys())
+    elif isinstance(set_type, Sequence) and not isinstance(set_type, str):
+        col_order = list(set_type)
+    else:
+        raise ValueError("set_type must be a list or a dict (or other mapping type)")
+
+    # Map lowercase names back to the original column names
+    col_map = {col.lower(): col for col in df.columns}
+    # Build the new column order (only columns present in df, in set_type order)
+    new_cols = []
+    used = set()
+    for col in col_order:
+        key = col.lower()
+        if key in col_map and key not in used:
+            new_cols.append(col_map[key])
+            used.add(key)
+    # Append the remaining columns not named in set_type, keeping their order
+    for col in df.columns:
+        if col.lower() not in used:
+            new_cols.append(col)
+    # Return the DataFrame with the new column order
+    return df[new_cols]
+
+
 def upload_data_decorator(**upload_kwargs):
     """
     Decorator that uploads the wrapped function's returned data
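
A quick behavioral sketch of the new helper (the frame and its column names are invented; the import path follows the file list above):

import pandas as pd
from mdbq.aggregation.query_data import reorder_columns

df = pd.DataFrame({'B': [1], 'a': [2], 'C': [3]})

# Dict keys define the target order; matching ignores case, and columns the
# spec never mentions ('C' here) are appended in their original order.
print(list(reorder_columns(df, {'A': 'int', 'b': 'int'}).columns))  # ['a', 'B', 'C']

# None or an empty spec returns the frame unchanged.
print(list(reorder_columns(df, None).columns))  # ['B', 'a', 'C']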
@@ -45,82 +87,90 @@ def upload_data_decorator(**upload_kwargs):
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
+            db_name = None
+            table_name = None
             try:
+                # Get the function signature and the bound call arguments
+                sig = inspect.signature(func)
+                bound_args = sig.bind(*args, **kwargs)
+                args_dict = bound_args.arguments
+
+                # Helper to fetch a parameter value
+                def get_param_value(param_name, alternatives=None):
+                    if alternatives is None:
+                        alternatives = [param_name]
+                    # Look the value up in kwargs, then in args_dict
+                    for key in alternatives:
+                        if key in kwargs:
+                            return kwargs[key]
+                        if key in args_dict:
+                            return args_dict[key]
+                    return None
+
+                # Fetch the parameter values used below
+                set_type = get_param_value('set_type', ['set_type', 'set_typ'])
+                db_name = get_param_value('db_name')
+                table_name = get_param_value('table_name')
+
                 # Call the original function
                 result = func(*args, **kwargs)

-                # If it returned None, pass that through
                 if result is None:
                     return None
-
-                # If it returned a DataFrame
+
+                # Handle a DataFrame result
                 if isinstance(result, pd.DataFrame):
-                    # Set defaults
-                    default_kwargs = {
+                    if set_type is not None:
+                        result = reorder_columns(result, set_type)
+
+                    # Merge parameters
+                    merged_kwargs = {
                         'check_duplicate': False,
                         'update_on_duplicate': True,
                         'allow_null': False,
-                        'transaction_mode': 'batch'
+                        'transaction_mode': 'batch',
+                        **upload_kwargs
                     }
-                    # Apply overrides; decorator arguments take precedence
-                    merged_kwargs = {**default_kwargs, **upload_kwargs}

-                    # Upload the data
-                    uld.upload_data(
-                        data=result,
-                        **merged_kwargs
-                    )
+                    uld.upload_data(data=result, **merged_kwargs)
                     return True
-
-                # If it returned a tuple
+
+                # Handle a tuple result
                 elif isinstance(result, tuple):
-                    # Check the tuple length
                     if len(result) < 2:
-                        logger.warning('函数返回的元组长度小于2,直接返回原结果,不执行上传', {'函数': func.__name__})
+                        logger.warning('函数返回的元组长度小于2,直接返回原结果,不执行上传', {'函数': func.__name__, '库': db_name, '表': table_name})
                         return result
-
-                    # Take the first two elements
+
                     df, extra_kwargs = result[0], result[1]

-                    # Check that the first element is a DataFrame
                     if not isinstance(df, pd.DataFrame):
-                        logger.warning('函数返回的元组第一个元素不是DataFrame,直接返回原结果,不执行上传', {'函数': func.__name__})
+                        logger.warning('函数返回的元组第一个元素不是DataFrame,直接返回原结果,不执行上传', {'函数': func.__name__, '库': db_name, '表': table_name})
                        return result
-
-                    # Merge decorator and function arguments
-                    merged_kwargs = {**upload_kwargs}
-                    merged_kwargs.update(extra_kwargs)
-
-                    # Set defaults
-                    default_kwargs = {
+
+                    if set_type is not None:
+                        df = reorder_columns(df, set_type)
+                        result = (df, extra_kwargs) + result[2:]
+
+                    # Merge parameters
+                    merged_kwargs = {
                         'check_duplicate': False,
                         'update_on_duplicate': True,
                         'allow_null': False,
-                        'transaction_mode': 'batch'
+                        'transaction_mode': 'batch',
+                        **upload_kwargs,
+                        **extra_kwargs
                     }
-                    # Fill in defaults only where not already set
-                    for key, value in default_kwargs.items():
-                        if key not in merged_kwargs:
-                            merged_kwargs[key] = value

-                    # Upload the data
-                    uld.upload_data(
-                        data=df,
-                        **merged_kwargs
-                    )
+                    uld.upload_data(data=df, **merged_kwargs)

-                    # If the tuple has more than 2 elements, return it whole
-                    if len(result) > 2:
-                        return result
-                    return True
-
-                # Otherwise return the result unchanged
+                    return result if len(result) > 2 else True
+
                 return result
-
+
             except Exception as e:
-                logger.error('数据上传失败', {'函数': func.__name__, '错误': str(e)})
+                logger.error('数据上传失败', {'函数': func.__name__, '库': db_name, '表': table_name, '错误': str(e)})
                 return False
-
+
         return wrapper
     return decorator

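For illustration, a hypothetical decorated function (the table name 'demo' and its columns are made up). Note that set_typ must appear among the function's own arguments for the decorator's signature binding, and hence the column reordering, to see it:

@upload_data_decorator()
def demo_query(db_name, table_name, set_typ=None):
    df = pd.DataFrame({'花费': [1.5], '日期': ['2025-01-01']})
    # Returning (df, extra_kwargs): the decorator reorders df to match
    # set_typ, merges its upload defaults with extra_kwargs, then calls
    # uld.upload_data(data=df, ...).
    return df, {'db_name': db_name, 'table_name': table_name, 'set_typ': set_typ}

ok = demo_query(db_name='聚合数据', table_name='demo',
                set_typ={'日期': 'date', '花费': 'decimal(12,2)'})
# ok is True after a successful upload, False if an exception was logged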
@@ -1951,7 +2001,7 @@ class MysqlDatasQuery:
             'partition_date_column': '日期',  # date column used for partitioned tables; defaults to '日期'
             'indexes': [],  # regular index columns
             'transaction_mode': 'batch',  # transaction mode
-            'unique_keys': [['日期', '店铺名称', '产品线', '触发sku_id', '跟单sku_id']],  # unique-key constraints
+            'unique_keys': [['日期', '店铺名称', '产品线', '触发sku_id', '跟单sku_id', '花费']],  # unique-key constraints
         }

     @try_except
@@ -2119,7 +2169,7 @@ class MysqlDatasQuery:
             'partition_date_column': '日期',  # date column used for partitioned tables; defaults to '日期'
             'indexes': [],  # regular index columns
             'transaction_mode': 'batch',  # transaction mode
-            'unique_keys': [['日期', '产品线', '搜索词', '计划id', '搜索词', '关键词']],  # unique-key constraints
+            'unique_keys': [['日期', '产品线', '计划id', '搜索词', '关键词']],  # unique-key constraints
         }

     @try_except
@@ -3032,7 +3082,7 @@ class MysqlDatasQuery:
             'partition_date_column': '日期',  # date column used for partitioned tables; defaults to '日期'
             'indexes': [],  # regular index columns
             'transaction_mode': 'batch',  # transaction mode
-            'unique_keys': [['日期', '店铺名称', '营销场景']],  # unique-key constraints
+            'unique_keys': [['日期', '店铺名称', '商品款号', 'spuid']],  # unique-key constraints
         }

     @upload_data_decorator()
@@ -3661,7 +3711,6 @@ def date_table():
     df = df.reset_index(drop=False)
     df.rename(columns={'index': 'id'}, inplace=True)
     df['id'] = df['id'].apply(lambda x: x + 1)
-
     set_typ = {
         '日期': 'date',
         '年': 'varchar(50)',
@@ -3676,6 +3725,7 @@ def date_table():
         '索引': 'int',
         '月索引': 'int',
     }
+
     return df, {
         'db_name': '聚合数据',
         'table_name': '日期表',
@@ -3693,12 +3743,7 @@ def date_table():
     }


-def query1(months=1, less_dict=None, download_manager=None):
-    if less_dict is None:
-        less_dict = []
-    if months == 0:
-        logger.info('months 不建议为 0')
-        return
+def query1(months=1, download_manager=None):
     sdq = MysqlDatasQuery(download_manager=download_manager)  # instantiate the data-processing class
     sdq.months = months  # set the data window; 1 means roughly the last 2 months

@@ -3725,12 +3770,7 @@ def query1(months=1, less_dict=None, download_manager=None):
     sdq.performance_concat(bb_tg=False, db_name='聚合数据', table_name='天猫_推广汇总')  # _推广商品销售


-def query2(months=1, less_dict=None, download_manager=None):
-    if less_dict is None:
-        less_dict = []
-    if months == 0:
-        logger.info('months 不建议为 0')
-        return
+def query2(months=1, download_manager=None):
     sdq = MysqlDatasQuery(download_manager=download_manager)  # instantiate the data-processing class
     sdq.months = months  # set the data window; 1 means roughly the last 2 months
     sdq.dplyd(db_name='聚合数据', table_name='店铺流量来源构成')
@@ -3743,18 +3783,13 @@ def query2(months=1, less_dict=None, download_manager=None):
     sdq.deeplink(db_name='聚合数据', table_name='达摩盘_deeplink人群洞察')


-def query3(months=1, less_dict=None, download_manager=None):
-    if less_dict is None:
-        less_dict = []
-    if months == 0:
-        logger.info('months 不建议为 0')
-        return
+def query3(months=1, download_manager=None):
     sdq = MysqlDatasQuery(download_manager=download_manager)  # instantiate the data-processing class
     sdq.months = months  # set the data window; 1 means roughly the last 2 months
     sdq.spph(db_name='聚合数据', table_name='天猫_商品排行')


-def main(days=150, months=3):
+def main(months=3):
     # 1. Refresh the date table and the goods-year baseline table (属性设置 3 - 货品年份基准)
     date_table()
@@ -3772,9 +3807,6 @@ def main(days=150, months=3):


 if __name__ == '__main__':
-    main(
-        days=150,  # how many days of aggregated data to clean
-        months=3   # how many months of aggregated data to generate
-    )
+    # main(months=3)

-    # date_table()
+    pass
{mdbq-4.0.3 → mdbq-4.0.5}/mdbq/mysql/s_query.py
@@ -762,6 +762,21 @@ class QueryDatas:
         finally:
             self.pool = None

+    def _adjust_page_size(self, last_duration, current_page_size, min_size=1000, max_size=10000, target_time=2.0):
+        """
+        Automatically adjust the next page_size based on how long the previous batch took.
+        - last_duration: how long the previous batch query took (seconds)
+        - current_page_size: the current batch size
+        - min_size, max_size: smallest / largest batch size allowed
+        - target_time: desired time per batch (seconds)
+        """
+        if last_duration < target_time / 2 and current_page_size < max_size:
+            return min(current_page_size * 2, max_size)
+        elif last_duration > target_time * 2 and current_page_size > min_size:
+            return max(current_page_size // 2, min_size)
+        else:
+            return current_page_size
+
     def data_to_df(
         self,
         db_name: str,
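
Since the method reads no instance state, its thresholds can be smoke-tested with an unbound call (a sketch; values picked to hit each branch under the default target_time=2.0):

from mdbq.mysql.s_query import QueryDatas

adjust = QueryDatas._adjust_page_size   # self is unused, so None will do
print(adjust(None, 0.5, 2000))   # 4000  - under target_time/2, double
print(adjust(None, 5.0, 2000))   # 1000  - over target_time*2, halve
print(adjust(None, 2.5, 2000))   # 2000  - inside the dead band, keep
print(adjust(None, 0.1, 8000))   # 10000 - doubling is clamped to max_size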
@@ -890,18 +905,20 @@ class QueryDatas:
             # Paged query
             offset = 0
             all_results = []
-
+            min_size, max_size = 1000, 10000
+            target_time = 1.0  # aim for one second per batch
+
             while offset < total_count:
+                start_time = time.time()
                 # Append the pagination parameters
                 page_sql = f"{base_sql} LIMIT %s OFFSET %s"
                 page_params = list(params) + [page_size, offset]
-
                 cursor.execute(page_sql, tuple(page_params))
                 page_results = cursor.fetchall()
-
+
                 if not page_results:
                     break
-
+
                 if return_format == 'list_dict':
                     all_results.extend(page_results)
                 else:
@@ -909,14 +926,18 @@ class QueryDatas:
                         all_results = pd.DataFrame(page_results)
                     else:
                         all_results = pd.concat([all_results, pd.DataFrame(page_results)], ignore_index=True)
-
-                offset += page_size
+
+                duration = time.time() - start_time
+                page_size = self._adjust_page_size(duration, page_size, min_size, max_size, target_time)
+                offset += len(page_results)
                 logger.debug('分页查询进度', {
                     '库': db_name,
                     '表': table_name,
-                    '当前偏移量': offset,
+                    # '当前偏移量': offset,
                     '总记录数': total_count,
-                    '已获取记录数': len(all_results) if return_format == 'list_dict' else len(all_results.index)
+                    '已获取记录数': len(all_results) if return_format == 'list_dict' else len(all_results.index),
+                    '本批耗时': f'{duration:.2f}',
+                    '下批page_size': page_size
                 })

             if return_format == 'df' and isinstance(all_results, pd.DataFrame) and not all_results.empty:
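
Taken together with the previous hunk, the paging loop now times each batch and resizes the next one. Stripped of the mdbq logging it is roughly this pattern (a sketch: cursor is any DB-API cursor, base_sql/params the query built earlier):

import time

def fetch_paged(cursor, base_sql, params, total_count, page_size=1000):
    """Fetch all rows in adaptively sized LIMIT/OFFSET batches."""
    offset, rows = 0, []
    while offset < total_count:
        started = time.time()
        cursor.execute(f"{base_sql} LIMIT %s OFFSET %s", (*params, page_size, offset))
        batch = cursor.fetchall()
        if not batch:
            break
        rows.extend(batch)
        # Advance by the rows actually fetched: page_size changes between
        # iterations, so 'offset += page_size' could skip or repeat rows.
        offset += len(batch)
        duration = time.time() - started
        if duration < 0.5:                        # target_time=1.0 / 2
            page_size = min(page_size * 2, 10000)
        elif duration > 2.0:                      # target_time=1.0 * 2
            page_size = max(page_size // 2, 1000)
    return rows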
{mdbq-4.0.3 → mdbq-4.0.5}/mdbq/mysql/uploader.py
@@ -14,6 +14,7 @@ from dbutils.pooled_db import PooledDB
 import json
 import sys
 from decimal import Decimal, InvalidOperation
+import math

 warnings.filterwarnings('ignore')
 logger = mylogger.MyLogger(
@@ -240,8 +241,16 @@ class MySQLUploader:
             conn = self.pool.connection()
             return conn
         except Exception as e:
-            logger.error('从连接池获取数据库连接失败', {'error': str(e)})
-            raise ConnectionError(f'连接数据库失败: {str(e)}')
+            logger.error('从连接池获取数据库连接失败,尝试重建连接池', {'error': str(e)})
+            # Force-rebuild the connection pool
+            try:
+                self.pool = self._create_connection_pool()
+                conn = self.pool.connection()
+                logger.info('重建连接池后获取连接成功')
+                return conn
+            except Exception as e2:
+                logger.error('重建连接池后依然获取连接失败', {'error': str(e2)})
+                raise ConnectionError(f'连接数据库失败: {str(e2)}')

     @_execute_with_retry
     def _check_database_exists(self, db_name: str) -> bool:
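
The new behavior condenses to a rebuild-once pattern: a stale pool (say, after a MySQL restart or an idle connection hitting wait_timeout) gets exactly one chance to be recreated before the error propagates. Schematically:

def get_connection(self):
    try:
        return self.pool.connection()
    except Exception:
        # Rebuild the pool and retry exactly once; a second failure is a
        # real outage and should surface as ConnectionError.
        self.pool = self._create_connection_pool()
        return self.pool.connection()

Retrying once keeps recovery cheap without risking a retry loop when the server is genuinely down.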
@@ -407,31 +416,36 @@ class MySQLUploader:
                 col_def += " NOT NULL"
             column_defs.append(col_def)
         # Adjusted primary-key handling
+        def _index_col_sql(col):
+            col_type = set_typ.get(col, '').lower()
+            if 'varchar' in col_type or 'text' in col_type:
+                return f"`{self._normalize_col(col)}`(100)"
+            return f"`{self._normalize_col(col)}`"
         if primary_keys and len(primary_keys) > 0:
-            safe_primary_keys = [self._normalize_col(pk) for pk in primary_keys]
-            primary_key_sql = f"PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
+            safe_primary_keys = [_index_col_sql(pk) for pk in primary_keys]
+            primary_key_sql = f"PRIMARY KEY ({','.join(safe_primary_keys)})"
         else:
-            safe_primary_keys = [self._normalize_col('id')]
+            safe_primary_keys = [_index_col_sql('id')]
             primary_key_sql = f"PRIMARY KEY (`id`)"
         # Indexes are defined together in CREATE TABLE
         index_defs = []
         if date_column and date_column in set_typ:
-            safe_date_col = self._normalize_col(date_column)
-            index_defs.append(f"INDEX `idx_{safe_date_col}` (`{safe_date_col}`)")
+            safe_date_col = _index_col_sql(date_column)
+            index_defs.append(f"INDEX `idx_{self._normalize_col(date_column)}` ({safe_date_col})")
         if indexes:
             for idx_col in indexes:
                 if idx_col in set_typ:
-                    safe_idx_col = self._normalize_col(idx_col)
-                    index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
+                    safe_idx_col = _index_col_sql(idx_col)
+                    index_defs.append(f"INDEX `idx_{self._normalize_col(idx_col)}` ({safe_idx_col})")
         # UNIQUE KEY definitions
         unique_defs = []
         if unique_keys:
             for unique_cols in unique_keys:
                 if not unique_cols:
                     continue
-                safe_unique_cols = [self._normalize_col(col) for col in unique_cols]
-                unique_name = f"uniq_{'_'.join(safe_unique_cols)}"
-                unique_defs.append(f"UNIQUE KEY `{unique_name}` (`{'`,`'.join(safe_unique_cols)}`)")
+                safe_unique_cols = [_index_col_sql(col) for col in unique_cols]
+                unique_name = f"uniq_{'_'.join([self._normalize_col(c) for c in unique_cols])}"
+                unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_unique_cols)})")
         index_defs = list(set(index_defs))
         all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
         sql = f"""
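
The point of _index_col_sql is MySQL's key-size rules: TEXT columns cannot be indexed at all without a prefix length, and long utf8mb4 VARCHAR keys can overrun InnoDB's index-size limit, so string columns are now indexed on their first 100 characters. A simplified mirror of the helper (identifier quoting via _normalize_col omitted; column names invented):

def index_col_sql(col, set_typ):
    col_type = set_typ.get(col, '').lower()
    if 'varchar' in col_type or 'text' in col_type:
        return f"`{col}`(100)"   # prefix index: first 100 characters only
    return f"`{col}`"

set_typ = {'商品标题': 'varchar(255)', '详情': 'text', '日期': 'date'}
print(index_col_sql('商品标题', set_typ))  # `商品标题`(100)
print(index_col_sql('详情', set_typ))      # `详情`(100)
print(index_col_sql('日期', set_typ))      # `日期` - non-string columns unchanged

One consequence worth noting: with a prefixed PRIMARY KEY or UNIQUE KEY, rows that agree on the first 100 characters of a string column collide even when the full values differ.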
@@ -447,7 +461,7 @@ class MySQLUploader:
             conn.commit()
             logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
         except Exception as e:
-            logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
+            logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e), '异常类型': type(e).__name__})
             if conn is not None:
                 conn.rollback()
             raise
@@ -491,34 +505,53 @@ class MySQLUploader:
     def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
         """
         Validate and convert a value according to its column type
-
-        :param value: the value to validate
-        :param column_type: the column's data type
-        :param allow_null: whether null values are allowed
-        :param db_name: database name (for logging)
-        :param table_name: table name (for logging)
-        :param col_name: column name (for logging)
-        :return: the converted value
-        :raises ValueError: raised when conversion fails
         """
+        column_type_lower = column_type.lower() if column_type else ''
+        # Treat None and NaN uniformly
+        is_nan = False
         if value is None:
+            is_nan = True
+        elif isinstance(value, float) and math.isnan(value):
+            is_nan = True
+        elif str(value).lower() in ['nan', 'none']:
+            is_nan = True
+        if is_nan:
             if not allow_null:
-                logger.warning('字段值为None但不允许空值, 已填充为none', {
-                    '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
-                })
-                return 'none'
+                if 'int' in column_type_lower:
+                    logger.debug('字段值为None/NaN但不允许空值, 已填充为0', {
+                        '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
+                    })
+                    return 0
+                elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
+                    logger.debug('字段值为None/NaN但不允许空值, 已填充为0.0', {
+                        '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
+                    })
+                    return 0.0
+                elif 'date' in column_type_lower or 'time' in column_type_lower:
+                    if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
+                        default_date = '2000-01-01 00:00:00'
+                    else:
+                        default_date = '2000-01-01'
+                    logger.debug('字段值为None/NaN但不允许空值, 已填充为默认日期', {
+                        '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type, '默认值': default_date
+                    })
+                    return default_date
+                else:
+                    logger.debug('字段值为None/NaN但不允许空值, 已填充为none字符串', {
+                        '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
+                    })
+                    return 'none'
             return None
         try:
-            column_type_lower = column_type.lower()
            if isinstance(value, str) and value.strip().endswith('%'):
-                try:
+                if re.match(r'^\d+(\.\d+)?%$', value.strip()):
                     percent_str = value.strip().replace('%', '')
                     percent_value = float(percent_str)
                     decimal_value = percent_value / 100
                     logger.debug('百分比字符串转小数', {'原始': value, '结果': decimal_value})
                     return decimal_value
-                except ValueError:
-                    logger.warning('百分比字符串转小数失败', {
+                else:
+                    logger.warning('百分比字符串不符合格式,跳过转换', {
                         '库': db_name, '表': table_name, '列': col_name, '原始': value
                     })
             elif 'int' in column_type_lower:
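
With allow_null=False the fill value now depends on the declared column type. The None/NaN branch touches no instance state before it returns, so the mapping can be sanity-checked with unbound calls (a sketch, assuming the import path from the file list):

from mdbq.mysql.uploader import MySQLUploader

v = MySQLUploader._validate_value                      # self is unused on this path
print(v(None, None, 'int', False))                     # 0
print(v(None, float('nan'), 'decimal(12,2)', False))   # 0.0
print(v(None, 'nan', 'date', False))                   # '2000-01-01'
print(v(None, None, 'datetime', False))                # '2000-01-01 00:00:00'
print(v(None, None, 'varchar(50)', False))             # 'none'
print(v(None, None, 'int', True))                      # None - nulls allowed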
@@ -548,10 +581,18 @@ class MySQLUploader:
                     })
                     raise ValueError(f"无效日期格式: `{value}` -> {str(e)}")
                 return str(value)
-            elif 'char' in column_type_lower or 'text' in column_type_lower:
+            elif 'varchar' in column_type_lower:
                 if isinstance(value, str):
                     return value.replace('\\', '\\\\').replace("'", "\\'")
-                return str(value)
+            elif 'text' in column_type_lower:
+                if isinstance(value, str):
+                    max_length = 65535
+                    if len(value) > max_length:
+                        logger.warning(f'TEXT字符串长度不允许超过 {max_length},已截断', {
+                            '库': db_name, '表': table_name, '列': col_name, '原始值': f'{value[:50]}...', '截断后值': f'{value[:50]}...'
+                        })
+                        value = value[:max_length]
+                    return value.replace('\\', '\\\\').replace("'", "\\'")
             elif 'json' in column_type_lower:
                 return json.dumps(value) if value is not None else None
             else:
@@ -881,22 +922,21 @@ class MySQLUploader:
         # Normalize the keys of set_typ
         set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}

-        # Collect the column names actually present in the data
-        data_columns = set()
-        if data and len(data) > 0:
-            data_columns = set(data[0].keys())
-
-        # Filter set_typ, keeping only columns present in the data
+        # New approach: filter strictly in set_typ order, then append columns present in data but not in set_typ
         filtered_set_typ = {}
-        for col in data_columns:
-            if col in set_typ:
+        data_columns = list(data[0].keys()) if data and len(data) > 0 else []
+        # First, in set_typ order
+        for col in set_typ:
+            if col in data_columns:
                 filtered_set_typ[col] = set_typ[col]
-            else:
-                # If the column is not in set_typ, sample several non-None values to infer a type
+        # Then append data columns missing from set_typ
+        for col in data_columns:
+            if col not in filtered_set_typ:
+                # Infer the type
                 sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
                 inferred_type = None
                 for val in sample_values:
-                    inferred_type = self._infer_data_type(val, no_log=True)  # don't log type inference, to avoid log noise
+                    inferred_type = self._infer_data_type(val, no_log=True)
                     if inferred_type:
                         break
                 if not inferred_type:
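
The practical effect: the declared set_typ order now drives column order instead of the arbitrary iteration order of a set built from the first row. A small replica of the two loops (the placeholder type stands in for the sampling-based inference the real code performs):

set_typ = {'日期': 'date', '花费': 'decimal(12,2)'}
data = [{'花费': 9.9, '日期': '2025-01-01', '备注': 'x'}]

filtered = {}
data_columns = list(data[0].keys()) if data else []
for col in set_typ:               # declared columns first, in set_typ order
    if col in data_columns:
        filtered[col] = set_typ[col]
for col in data_columns:          # then extras found only in the data
    if col not in filtered:
        filtered[col] = 'varchar(255)'   # real code infers this from samples

print(list(filtered))  # ['日期', '花费', '备注'] - stable, declaration-first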
@@ -1326,8 +1366,7 @@ class MySQLUploader:
         if cached:
             return cached
         # Collect all column names (excluding id)
-        all_columns = [col for col in set_typ.keys()
-                       if col.lower() != 'id']
+        all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
         if not check_duplicate:
             sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
                                                 update_on_duplicate)
@@ -1364,7 +1403,6 @@ class MySQLUploader:
        - Only a serious database error (e.g. every row conflicts on a unique key with no ON DUPLICATE KEY UPDATE) triggers a full rollback.
        - The return value is (rows inserted, rows skipped, rows failed).
        """
-        import pymysql  # ensure the exception classes are available
        def get_optimal_batch_size(total_rows: int) -> int:
            if total_rows <= 100:
                return total_rows
@@ -1612,5 +1650,5 @@ def main():


 if __name__ == '__main__':
-    main()
+    # main()
     pass
{mdbq-4.0.3 → mdbq-4.0.5}/mdbq.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mdbq
-Version: 4.0.3
+Version: 4.0.5
 Home-page: https://pypi.org/project/mdbq
 Author: xigua,
 Author-email: 2587125111@qq.com

mdbq-4.0.3/mdbq/__version__.py (removed)
@@ -1 +0,0 @@
-VERSION = '4.0.3'