mdbq 4.0.3__py3-none-any.whl → 4.0.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/query_data.py +108 -76
- mdbq/mysql/s_query.py +29 -8
- mdbq/mysql/uploader.py +85 -47
- {mdbq-4.0.3.dist-info → mdbq-4.0.5.dist-info}/METADATA +1 -1
- {mdbq-4.0.3.dist-info → mdbq-4.0.5.dist-info}/RECORD +8 -8
- {mdbq-4.0.3.dist-info → mdbq-4.0.5.dist-info}/WHEEL +0 -0
- {mdbq-4.0.3.dist-info → mdbq-4.0.5.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '4.0.3'
+VERSION = '4.0.5'
mdbq/aggregation/query_data.py
CHANGED
@@ -14,7 +14,8 @@ import platform
 import os
 import time
 import calendar
-import
+from collections.abc import Mapping, Sequence
+import inspect
 
 dir_path = os.path.expanduser("~")
 config_file = os.path.join(dir_path, 'spd.txt')
@@ -36,6 +37,47 @@ logger = mylogger.MyLogger(
 )
 
 
+def reorder_columns(df: pd.DataFrame, set_type) -> pd.DataFrame:
+    """
+    Reorder the DataFrame's columns to follow the order in set_type, case-insensitively; columns missing from set_type are skipped automatically.
+    set_type may be a list or a dict (in which case its keys define the order).
+    Neither the data nor the dtypes are changed.
+    If set_type is None, an empty list, or an empty dict, the original df is returned unchanged.
+    """
+    # Cases where the original df is returned as-is
+    if set_type is None:
+        return df
+    if isinstance(set_type, Mapping) and len(set_type) == 0:
+        return df
+    if isinstance(set_type, Sequence) and not isinstance(set_type, str) and len(set_type) == 0:
+        return df
+
+    # If set_type is a mapping, use its keys
+    if isinstance(set_type, Mapping):
+        col_order = list(set_type.keys())
+    elif isinstance(set_type, Sequence) and not isinstance(set_type, str):
+        col_order = list(set_type)
+    else:
+        raise ValueError("set_type must be a list or a dict (or other mapping type)")
+
+    # Map lowercased names to the original column names
+    col_map = {col.lower(): col for col in df.columns}
+    # Build the new column order (only columns present in df, ordered by set_type)
+    new_cols = []
+    used = set()
+    for col in col_order:
+        key = col.lower()
+        if key in col_map and key not in used:
+            new_cols.append(col_map[key])
+            used.add(key)
+    # Append the remaining columns not named in set_type, in their original order
+    for col in df.columns:
+        if col.lower() not in used:
+            new_cols.append(col)
+    # Return the DataFrame with the new column order
+    return df[new_cols]
+
+
 def upload_data_decorator(**upload_kwargs):
     """
     Data upload decorator
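For orientation, a minimal usage sketch of the new helper (assumes reorder_columns and pandas as in the hunk above; the sample frame is made up):

import pandas as pd

df = pd.DataFrame({'花费': [1.0], '日期': ['2025-01-01'], '店铺名称': ['demo']})
set_typ = {'日期': 'date', '店铺名称': 'varchar(100)'}  # a dict works like a list: its keys define the target order
out = reorder_columns(df, set_typ)
print(list(out.columns))  # ['日期', '店铺名称', '花费'] -- unmatched columns keep their original order at the end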
@@ -45,82 +87,90 @@ def upload_data_decorator(**upload_kwargs):
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
+            db_name = None
+            table_name = None
             try:
+                # Get the function signature and arguments
+                sig = inspect.signature(func)
+                bound_args = sig.bind(*args, **kwargs)
+                args_dict = bound_args.arguments
+
+                # Helper for fetching the parameters we need
+                def get_param_value(param_name, alternatives=None):
+                    if alternatives is None:
+                        alternatives = [param_name]
+                    # Look the parameter up in kwargs or in the bound arguments
+                    for key in alternatives:
+                        if key in kwargs:
+                            return kwargs[key]
+                        if key in args_dict:
+                            return args_dict[key]
+                    return None
+
+                # Read the parameter values
+                set_type = get_param_value('set_type', ['set_type', 'set_typ'])
+                db_name = get_param_value('db_name')
+                table_name = get_param_value('table_name')
+
                 # Run the original function
                 result = func(*args, **kwargs)
 
-                # If None was returned, pass it straight through
                 if result is None:
                     return None
-
-                #
+
+                # Handle a DataFrame result
                 if isinstance(result, pd.DataFrame):
-
-                    default_kwargs = {
+                    if set_type is not None:
+                        result = reorder_columns(result, set_type)
+
+                    # Merge the parameters
+                    merged_kwargs = {
                         'check_duplicate': False,
                         'update_on_duplicate': True,
                         'allow_null': False,
-                        'transaction_mode': 'batch'
+                        'transaction_mode': 'batch',
+                        **upload_kwargs
                     }
-                    # Update the parameters, giving the decorator's arguments priority
-                    merged_kwargs = {**default_kwargs, **upload_kwargs}
 
-
-                    uld.upload_data(
-                        data=result,
-                        **merged_kwargs
-                    )
+                    uld.upload_data(data=result, **merged_kwargs)
                     return True
-
-                #
+
+                # Handle a tuple result
                 elif isinstance(result, tuple):
-                    # Check the tuple length
                     if len(result) < 2:
-                        logger.warning('函数返回的元组长度小于2,直接返回原结果,不执行上传', {'函数': func.__name__})
+                        logger.warning('函数返回的元组长度小于2,直接返回原结果,不执行上传', {'函数': func.__name__, '库': db_name, '表': table_name})
                         return result
-
-                    # Take the first two elements
+
                     df, extra_kwargs = result[0], result[1]
 
-                    # Check that the first element is a DataFrame
                     if not isinstance(df, pd.DataFrame):
-                        logger.warning('函数返回的元组第一个元素不是DataFrame,直接返回原结果,不执行上传', {'函数': func.__name__})
+                        logger.warning('函数返回的元组第一个元素不是DataFrame,直接返回原结果,不执行上传', {'函数': func.__name__, '库': db_name, '表': table_name})
                         return result
-
-
-
-
-
-                    #
-                    merged_kwargs = {
+
+                    if set_type is not None:
+                        df = reorder_columns(df, set_type)
+                        result = (df, extra_kwargs) + result[2:]
+
+                    # Merge the parameters
+                    merged_kwargs = {
                         'check_duplicate': False,
                         'update_on_duplicate': True,
                         'allow_null': False,
-                        'transaction_mode': 'batch'
+                        'transaction_mode': 'batch',
+                        **upload_kwargs,
+                        **extra_kwargs
                     }
-                    # Update the parameters, giving the decorator's arguments priority
-                    for key, value in default_kwargs.items():
-                        if key not in merged_kwargs:
-                            merged_kwargs[key] = value
 
-
-                    uld.upload_data(
-                        data=df,
-                        **merged_kwargs
-                    )
+                    uld.upload_data(data=df, **merged_kwargs)
 
-
-
-                    return result
-                    return True
-
-                # Otherwise, return the result as-is
+                    return result if len(result) > 2 else True
+
                 return result
-
+
             except Exception as e:
-                logger.error('数据上传失败', {'函数': func.__name__, '错误': str(e)})
+                logger.error('数据上传失败', {'函数': func.__name__, '库': db_name, '表': table_name, '错误': str(e)})
                 return False
-
+
         return wrapper
     return decorator
 
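A hedged sketch of how a query method now flows through the revised decorator. The function, columns, and table here are hypothetical; passing db_name and table_name back through the extra-kwargs dict follows the pattern date_table() uses further down in this file:

@upload_data_decorator()
def demo_query(db_name, table_name, set_typ=None):
    df = pd.DataFrame({'日期': ['2025-01-01'], '销售额': [100]})
    # Returning (df, extra_kwargs): the wrapper reorders df by set_typ,
    # merges extra_kwargs over the defaults, and calls uld.upload_data.
    return df, {'db_name': db_name, 'table_name': table_name}

ok = demo_query(db_name='聚合数据', table_name='demo表', set_typ={'日期': 'date', '销售额': 'int'})
# ok is True if the upload succeeded, False if the wrapper caught an exception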
@@ -1951,7 +2001,7 @@ class MysqlDatasQuery:
             'partition_date_column': '日期',  # Date column used for partitioned tables; defaults to '日期'
             'indexes': [],  # Regular index columns
             'transaction_mode': 'batch',  # Transaction mode
-            'unique_keys': [['日期', '店铺名称', '产品线', '触发sku_id', '跟单sku_id']],  # List of unique constraints
+            'unique_keys': [['日期', '店铺名称', '产品线', '触发sku_id', '跟单sku_id', '花费']],  # List of unique constraints
         }
 
     @try_except
@@ -2119,7 +2169,7 @@ class MysqlDatasQuery:
             'partition_date_column': '日期',  # Date column used for partitioned tables; defaults to '日期'
             'indexes': [],  # Regular index columns
             'transaction_mode': 'batch',  # Transaction mode
-            'unique_keys': [['日期', '产品线', '
+            'unique_keys': [['日期', '产品线', '计划id', '搜索词', '关键词']],  # List of unique constraints
         }
 
     @try_except
@@ -3032,7 +3082,7 @@ class MysqlDatasQuery:
             'partition_date_column': '日期',  # Date column used for partitioned tables; defaults to '日期'
             'indexes': [],  # Regular index columns
             'transaction_mode': 'batch',  # Transaction mode
-            'unique_keys': [['日期', '店铺名称', '
+            'unique_keys': [['日期', '店铺名称', '商品款号', 'spuid']],  # List of unique constraints
         }
 
     @upload_data_decorator()
@@ -3661,7 +3711,6 @@ def date_table():
     df = df.reset_index(drop=False)
     df.rename(columns={'index': 'id'}, inplace=True)
     df['id'] = df['id'].apply(lambda x: x + 1)
-
     set_typ = {
         '日期': 'date',
         '年': 'varchar(50)',
@@ -3676,6 +3725,7 @@ def date_table():
         '索引': 'int',
         '月索引': 'int',
     }
+
     return df, {
         'db_name': '聚合数据',
        'table_name': '日期表',
@@ -3693,12 +3743,7 @@ def date_table():
     }
 
 
-def query1(months=1, less_dict=None, download_manager=None):
-    if less_dict is None:
-        less_dict = []
-    if months == 0:
-        logger.info('months 不建议为 0')
-        return
+def query1(months=1, download_manager=None):
     sdq = MysqlDatasQuery(download_manager=download_manager)  # Instantiate the data-processing class
     sdq.months = months  # Set the data window; 1 means roughly the last 2 months
 
@@ -3725,12 +3770,7 @@ def query1(months=1, less_dict=None, download_manager=None):
|
|
3725
3770
|
sdq.performance_concat(bb_tg=False, db_name='聚合数据', table_name='天猫_推广汇总') # _推广商品销售
|
3726
3771
|
|
3727
3772
|
|
3728
|
-
def query2(months=1,
|
3729
|
-
if less_dict is None:
|
3730
|
-
less_dict = []
|
3731
|
-
if months == 0:
|
3732
|
-
logger.info('months 不建议为 0')
|
3733
|
-
return
|
3773
|
+
def query2(months=1, download_manager=None):
|
3734
3774
|
sdq = MysqlDatasQuery(download_manager=download_manager) # 实例化数据处理类
|
3735
3775
|
sdq.months = months # 设置数据周期, 1 表示近 2 个月
|
3736
3776
|
sdq.dplyd(db_name='聚合数据', table_name='店铺流量来源构成')
|
@@ -3743,18 +3783,13 @@ def query2(months=1, less_dict=None, download_manager=None):
     sdq.deeplink(db_name='聚合数据', table_name='达摩盘_deeplink人群洞察')
 
 
-def query3(months=1, less_dict=None, download_manager=None):
-    if less_dict is None:
-        less_dict = []
-    if months == 0:
-        logger.info('months 不建议为 0')
-        return
+def query3(months=1, download_manager=None):
     sdq = MysqlDatasQuery(download_manager=download_manager)  # Instantiate the data-processing class
     sdq.months = months  # Set the data window; 1 means roughly the last 2 months
     sdq.spph(db_name='聚合数据', table_name='天猫_商品排行')
 
 
-def main(days=150, months=3):
+def main(months=3):
     # 1. Refresh the date table and the product-year baseline table; attribute setting 3 - product-year baseline
     date_table()
 
@@ -3772,9 +3807,6 @@ def main(days=150, months=3):
 
 
 if __name__ == '__main__':
-    main(
-        days=150,  # How far back to clean the aggregated data
-        months=3   # How many months of aggregated data to generate
-    )
+    # main(months=3)
 
-
+    pass
mdbq/mysql/s_query.py
CHANGED
@@ -762,6 +762,21 @@ class QueryDatas:
         finally:
             self.pool = None
 
+    def _adjust_page_size(self, last_duration, current_page_size, min_size=1000, max_size=10000, target_time=2.0):
+        """
+        Automatically adjust the next page_size based on how long the previous batch took.
+        - last_duration: duration of the previous batch query, in seconds
+        - current_page_size: current batch size
+        - min_size, max_size: smallest/largest allowed batch size
+        - target_time: desired time per batch, in seconds
+        """
+        if last_duration < target_time / 2 and current_page_size < max_size:
+            return min(current_page_size * 2, max_size)
+        elif last_duration > target_time * 2 and current_page_size > min_size:
+            return max(current_page_size // 2, min_size)
+        else:
+            return current_page_size
+
     def data_to_df(
         self,
         db_name: str,
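The adjustment rule is a simple multiplicative increase/decrease controller: double when batches are fast, halve when they are slow, within fixed bounds. A standalone restatement of the same logic so it can be run outside the class:

def adjust_page_size(last_duration, current_page_size, min_size=1000, max_size=10000, target_time=2.0):
    # Batch finished in under half the target time: double the page size.
    if last_duration < target_time / 2 and current_page_size < max_size:
        return min(current_page_size * 2, max_size)
    # Batch took more than twice the target: halve the page size.
    if last_duration > target_time * 2 and current_page_size > min_size:
        return max(current_page_size // 2, min_size)
    return current_page_size

size = 1000
for duration in [0.3, 0.4, 0.9, 5.0, 1.2]:  # sample batch timings in seconds
    size = adjust_page_size(duration, size)
    print(duration, '->', size)              # 2000, 4000, 8000, 4000, 4000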
@@ -890,18 +905,20 @@ class QueryDatas:
             # Paged query
             offset = 0
             all_results = []
-
+            min_size, max_size = 1000, 10000
+            target_time = 1.0  # Aim for about one second per batch
+
             while offset < total_count:
+                start_time = time.time()
                 # Add the paging parameters
                 page_sql = f"{base_sql} LIMIT %s OFFSET %s"
                 page_params = list(params) + [page_size, offset]
-
                 cursor.execute(page_sql, tuple(page_params))
                 page_results = cursor.fetchall()
-
+
                 if not page_results:
                     break
-
+
                 if return_format == 'list_dict':
                     all_results.extend(page_results)
                 else:
@@ -909,14 +926,18 @@ class QueryDatas:
                         all_results = pd.DataFrame(page_results)
                     else:
                         all_results = pd.concat([all_results, pd.DataFrame(page_results)], ignore_index=True)
-
-
+
+                duration = time.time() - start_time
+                page_size = self._adjust_page_size(duration, page_size, min_size, max_size, target_time)
+                offset += len(page_results)
                 logger.debug('分页查询进度', {
                     '库': db_name,
                     '表': table_name,
-                    '当前偏移量': offset,
+                    # '当前偏移量': offset,
                     '总记录数': total_count,
-                    '已获取记录数': len(all_results) if return_format == 'list_dict' else len(all_results.index)
+                    '已获取记录数': len(all_results) if return_format == 'list_dict' else len(all_results.index),
+                    '本批耗时': f'{duration:.2f}',
+                    '下批page_size': page_size
                 })
 
             if return_format == 'df' and isinstance(all_results, pd.DataFrame) and not all_results.empty:
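Putting the pieces together: each batch is timed, and the duration feeds the next LIMIT. A DB-free simulation of the control flow (fetch_page is a hypothetical stand-in for cursor.execute plus fetchall; adjust repeats the rule from _adjust_page_size above):

import time

def adjust(last, size, lo=1000, hi=10000, target=1.0):
    # Same doubling/halving rule as _adjust_page_size
    if last < target / 2 and size < hi:
        return min(size * 2, hi)
    if last > target * 2 and size > lo:
        return max(size // 2, lo)
    return size

def fetch_page(offset, limit, total=25000):
    time.sleep(limit * 0.00002)  # pretend each row costs a fixed time to fetch
    return [{'id': i} for i in range(offset, min(offset + limit, total))]

offset, page_size = 0, 1000
while True:
    start = time.time()
    rows = fetch_page(offset, page_size)
    if not rows:
        break
    offset += len(rows)
    page_size = adjust(time.time() - start, page_size)
    print(f'offset={offset} batch={len(rows)} next page_size={page_size}')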
mdbq/mysql/uploader.py
CHANGED
@@ -14,6 +14,7 @@ from dbutils.pooled_db import PooledDB
 import json
 import sys
 from decimal import Decimal, InvalidOperation
+import math
 
 warnings.filterwarnings('ignore')
 logger = mylogger.MyLogger(
@@ -240,8 +241,16 @@ class MySQLUploader:
             conn = self.pool.connection()
             return conn
         except Exception as e:
-            logger.error('
-
+            logger.error('从连接池获取数据库连接失败,尝试重建连接池', {'error': str(e)})
+            # Force-rebuild the connection pool
+            try:
+                self.pool = self._create_connection_pool()
+                conn = self.pool.connection()
+                logger.info('重建连接池后获取连接成功')
+                return conn
+            except Exception as e2:
+                logger.error('重建连接池后依然获取连接失败', {'error': str(e2)})
+                raise ConnectionError(f'连接数据库失败: {str(e2)}')
 
     @_execute_with_retry
     def _check_database_exists(self, db_name: str) -> bool:
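The new fallback follows a rebuild-once-then-fail pattern, which recovers from a stale pool (for example, after the server closed idle connections) without retrying forever. A self-contained sketch of that pattern with a stubbed pool (class and function names here are illustrative, not the module's API):

class FlakyPool:
    def __init__(self, healthy):
        self.healthy = healthy
    def connection(self):
        if not self.healthy:
            raise RuntimeError('stale pool')
        return 'conn'

def get_connection(pool, rebuild):
    try:
        return pool.connection()
    except Exception:
        pool = rebuild()  # one rebuild attempt
        try:
            return pool.connection()
        except Exception as e2:
            raise ConnectionError(f'连接数据库失败: {e2}')

print(get_connection(FlakyPool(False), lambda: FlakyPool(True)))  # 'conn' after one rebuild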
@@ -407,31 +416,36 @@ class MySQLUploader:
                 col_def += " NOT NULL"
             column_defs.append(col_def)
         # Primary-key handling, adjusted
+        def _index_col_sql(col):
+            col_type = set_typ.get(col, '').lower()
+            if 'varchar' in col_type or 'text' in col_type:
+                return f"`{self._normalize_col(col)}`(100)"
+            return f"`{self._normalize_col(col)}`"
         if primary_keys and len(primary_keys) > 0:
-            safe_primary_keys = [
-            primary_key_sql = f"PRIMARY KEY (
+            safe_primary_keys = [_index_col_sql(pk) for pk in primary_keys]
+            primary_key_sql = f"PRIMARY KEY ({','.join(safe_primary_keys)})"
         else:
-            safe_primary_keys = [
+            safe_primary_keys = [_index_col_sql('id')]
             primary_key_sql = f"PRIMARY KEY (`id`)"
         # Indexes are all defined inside CREATE TABLE
         index_defs = []
         if date_column and date_column in set_typ:
-            safe_date_col =
-            index_defs.append(f"INDEX `idx_{
+            safe_date_col = _index_col_sql(date_column)
+            index_defs.append(f"INDEX `idx_{self._normalize_col(date_column)}` ({safe_date_col})")
         if indexes:
             for idx_col in indexes:
                 if idx_col in set_typ:
-                    safe_idx_col =
-                    index_defs.append(f"INDEX `idx_{
+                    safe_idx_col = _index_col_sql(idx_col)
+                    index_defs.append(f"INDEX `idx_{self._normalize_col(idx_col)}` ({safe_idx_col})")
         # UNIQUE KEY definitions
         unique_defs = []
         if unique_keys:
             for unique_cols in unique_keys:
                 if not unique_cols:
                     continue
-                safe_unique_cols = [
-                unique_name = f"uniq_{'_'.join(
-                unique_defs.append(f"UNIQUE KEY `{unique_name}` (
+                safe_unique_cols = [_index_col_sql(col) for col in unique_cols]
+                unique_name = f"uniq_{'_'.join([self._normalize_col(c) for c in unique_cols])}"
+                unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_unique_cols)})")
         index_defs = list(set(index_defs))
         all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
         sql = f"""
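The net effect of _index_col_sql is that string-typed key columns get a 100-character prefix in every index and UNIQUE KEY, which presumably keeps wide composite keys under MySQL's index-length limit. A simplified illustration of the SQL it yields (the real helper also routes names through self._normalize_col, omitted here; the columns come from the unique_keys change above):

set_typ = {'日期': 'date', '店铺名称': 'varchar(100)', '产品线': 'varchar(100)'}

def index_col_sql(col):
    col_type = set_typ.get(col, '').lower()
    if 'varchar' in col_type or 'text' in col_type:
        return f"`{col}`(100)"  # prefix index for string columns
    return f"`{col}`"

cols = ['日期', '店铺名称', '产品线']
print(f"UNIQUE KEY `uniq_{'_'.join(cols)}` ({','.join(index_col_sql(c) for c in cols)})")
# UNIQUE KEY `uniq_日期_店铺名称_产品线` (`日期`,`店铺名称`(100),`产品线`(100))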
@@ -447,7 +461,7 @@ class MySQLUploader:
             conn.commit()
             logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
         except Exception as e:
-            logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
+            logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e), '异常类型': type(e).__name__})
             if conn is not None:
                 conn.rollback()
             raise
@@ -491,34 +505,53 @@ class MySQLUploader:
     def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
         """
         Validate and convert a data value according to the column type
-
-        :param value: the value to validate
-        :param column_type: the column's data type
-        :param allow_null: whether null values are allowed
-        :param db_name: database name (for logging)
-        :param table_name: table name (for logging)
-        :param col_name: column name (for logging)
-        :return: the converted value
-        :raises ValueError: raised when value conversion fails
         """
+        column_type_lower = column_type.lower() if column_type else ''
+        # Detect None/NaN uniformly
+        is_nan = False
         if value is None:
+            is_nan = True
+        elif isinstance(value, float) and math.isnan(value):
+            is_nan = True
+        elif str(value).lower() in ['nan', 'none']:
+            is_nan = True
+        if is_nan:
             if not allow_null:
-
-                '
-
-
+                if 'int' in column_type_lower:
+                    logger.debug('字段值为None/NaN但不允许空值, 已填充为0', {
+                        '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
+                    })
+                    return 0
+                elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
+                    logger.debug('字段值为None/NaN但不允许空值, 已填充为0.0', {
+                        '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
+                    })
+                    return 0.0
+                elif 'date' in column_type_lower or 'time' in column_type_lower:
+                    if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
+                        default_date = '2000-01-01 00:00:00'
+                    else:
+                        default_date = '2000-01-01'
+                    logger.debug('字段值为None/NaN但不允许空值, 已填充为默认日期', {
+                        '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type, '默认值': default_date
+                    })
+                    return default_date
+                else:
+                    logger.debug('字段值为None/NaN但不允许空值, 已填充为none字符串', {
+                        '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
+                    })
+                    return 'none'
             return None
         try:
-            column_type_lower = column_type.lower()
             if isinstance(value, str) and value.strip().endswith('%'):
-
+                if re.match(r'^\d+(\.\d+)?%$', value.strip()):
                     percent_str = value.strip().replace('%', '')
                     percent_value = float(percent_str)
                     decimal_value = percent_value / 100
                     logger.debug('百分比字符串转小数', {'原始': value, '结果': decimal_value})
                     return decimal_value
-
-            logger.warning('
+                else:
+                    logger.warning('百分比字符串不符合格式,跳过转换', {
                         '库': db_name, '表': table_name, '列': col_name, '原始': value
                     })
             elif 'int' in column_type_lower:
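In short, when allow_null is False the new branch substitutes a type-appropriate default instead of passing None through. A condensed sketch of the mapping (it hoists the datetime check but yields the same results as the branch order above):

import math

def null_default(column_type):
    t = (column_type or '').lower()
    if 'int' in t:
        return 0
    if any(k in t for k in ('float', 'double', 'decimal')):
        return 0.0
    if 'datetime' in t or 'timestamp' in t:
        return '2000-01-01 00:00:00'
    if 'date' in t or 'time' in t:
        return '2000-01-01'
    return 'none'

for typ in ('int', 'decimal(10,2)', 'datetime', 'date', 'varchar(50)'):
    print(typ, '->', null_default(typ))  # 0, 0.0, '2000-01-01 00:00:00', '2000-01-01', 'none'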
@@ -548,10 +581,18 @@ class MySQLUploader:
                 })
                 raise ValueError(f"无效日期格式: `{value}` -> {str(e)}")
             return str(value)
-        elif '
+            elif 'varchar' in column_type_lower:
                 if isinstance(value, str):
                     return value.replace('\\', '\\\\').replace("'", "\\'")
-
+            elif 'text' in column_type_lower:
+                if isinstance(value, str):
+                    max_length = 65535
+                    if len(value) > max_length:
+                        logger.warning(f'TEXT字符串长度不允许超过 {max_length},已截断', {
+                            '库': db_name, '表': table_name, '列': col_name, '原始值': f'{value[:50]}...', '截断后值': f'{value[:50]}...'
+                        })
+                        value = value[:max_length]
+                return value.replace('\\', '\\\\').replace("'", "\\'")
             elif 'json' in column_type_lower:
                 return json.dumps(value) if value is not None else None
             else:
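A quick check of the new TEXT handling; 65535 matches MySQL's TEXT capacity in bytes, though len() counts characters, so multi-byte text can in principle still exceed the byte limit after truncation:

value = 'x' * 70000
max_length = 65535
if len(value) > max_length:
    value = value[:max_length]
print(len(value))  # 65535
print("O'Brien".replace('\\', '\\\\').replace("'", "\\'"))  # O\'Brien -- same escaping as the varchar branch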
@@ -881,22 +922,21 @@ class MySQLUploader:
         # Normalize the keys of set_typ
         set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
 
-        #
-        data_columns = set()
-        if data and len(data) > 0:
-            data_columns = set(data[0].keys())
-
-        # Filter set_typ, keeping only columns present in the data
+        # New implementation: filter strictly in set_typ order, then append columns that appear in data but not in set_typ
         filtered_set_typ = {}
-
-
+        data_columns = list(data[0].keys()) if data and len(data) > 0 else []
+        # First, follow the order of set_typ
+        for col in set_typ:
+            if col in data_columns:
                 filtered_set_typ[col] = set_typ[col]
-
-
+        # Then add columns present in data but missing from set_typ
+        for col in data_columns:
+            if col not in filtered_set_typ:
+                # Infer the type
                 sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
                 inferred_type = None
                 for val in sample_values:
-                inferred_type = self._infer_data_type(val, no_log=True)
+                    inferred_type = self._infer_data_type(val, no_log=True)
                     if inferred_type:
                         break
                 if not inferred_type:
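The practical effect is that the column order sent to MySQL now follows set_typ rather than the incoming row dict. A small self-contained sketch (a constant stands in for the _infer_data_type call):

data = [{'花费': 1.5, '日期': '2025-01-01', '备注': 'x'}]
set_typ = {'日期': 'date', '花费': 'decimal(10,2)'}

filtered = {}
data_columns = list(data[0].keys()) if data else []
for col in set_typ:                     # first: set_typ order
    if col in data_columns:
        filtered[col] = set_typ[col]
for col in data_columns:                # then: columns only present in the data
    if col not in filtered:
        filtered[col] = 'varchar(255)'  # stand-in for the inferred type
print(list(filtered.keys()))            # ['日期', '花费', '备注']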
@@ -1326,8 +1366,7 @@ class MySQLUploader:
         if cached:
             return cached
         # Get all column names (excluding id)
-        all_columns = [col for col in set_typ.keys()
-                       if col.lower() != 'id']
+        all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
         if not check_duplicate:
             sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
                                                 update_on_duplicate)
@@ -1364,7 +1403,6 @@ class MySQLUploader:
         - Only on a serious database error (e.g. every row hits a unique-constraint conflict with no ON DUPLICATE KEY UPDATE) is the whole batch rolled back.
         - The return value is (rows inserted, rows skipped, rows failed).
         """
-        import pymysql  # Make sure the exception types are available
         def get_optimal_batch_size(total_rows: int) -> int:
             if total_rows <= 100:
                 return total_rows
@@ -1612,5 +1650,5 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    # main()
     pass
{mdbq-4.0.3.dist-info → mdbq-4.0.5.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=K0JdqT-aY_eW77ySyyxnpc599EoZ9CKOLZg_w5AvAnM,17
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
-mdbq/aggregation/query_data.py,sha256=
+mdbq/aggregation/query_data.py,sha256=3GBdX0HWKvQ-B3NiZE_hzWbJ7sqClzCd8KTvXpVPnZ4,170452
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
@@ -10,9 +10,9 @@ mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,16
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
 mdbq/mysql/deduplicator.py,sha256=8v3MC6TJ0YEiExWrTP9OXAxTYnL9XbpYL2vWaER1h2M,73099
 mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
-mdbq/mysql/s_query.py,sha256=
+mdbq/mysql/s_query.py,sha256=RnVCwMQ_n9PcAimbMWbHe9k8eil8shtCfa3LwLBZi6c,41909
 mdbq/mysql/unique_.py,sha256=Wgqq_PjAAD757JTa10wjYaJgssZ_C_ypU6DW56jbuyw,21074
-mdbq/mysql/uploader.py,sha256=
+mdbq/mysql/uploader.py,sha256=bYE_VGTeEigpRFYvZ9Ob3A9vxq21NuOdrXFkv8Bm_p8,74919
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
 mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=l3zBK7wrZl0oO42-_UGylyatnIp_SBw8wDDvof9fht4,23534
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
 mdbq/spider/aikucun.py,sha256=hPRzLQvFIF4ibN8aP3Dg_ru5meac90faPyzOB22cj-o,20965
-mdbq-4.0.3.dist-info/METADATA,sha256=
-mdbq-4.0.3.dist-info/WHEEL,sha256=
-mdbq-4.0.3.dist-info/top_level.txt,sha256=
-mdbq-4.0.3.dist-info/RECORD,,
+mdbq-4.0.5.dist-info/METADATA,sha256=boklJ7iCN4Uh-Czst1DiQlPrKKSawDIYknmipAd9w5A,363
+mdbq-4.0.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-4.0.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-4.0.5.dist-info/RECORD,,
{mdbq-4.0.3.dist-info → mdbq-4.0.5.dist-info}/WHEEL
File without changes
{mdbq-4.0.3.dist-info → mdbq-4.0.5.dist-info}/top_level.txt
File without changes