mdbq 3.9.9__py3-none-any.whl → 3.9.10__py3-none-any.whl
This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/uploader.py +107 -48
- {mdbq-3.9.9.dist-info → mdbq-3.9.10.dist-info}/METADATA +1 -1
- {mdbq-3.9.9.dist-info → mdbq-3.9.10.dist-info}/RECORD +6 -6
- {mdbq-3.9.9.dist-info → mdbq-3.9.10.dist-info}/WHEEL +0 -0
- {mdbq-3.9.9.dist-info → mdbq-3.9.10.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.9.9'
|
1
|
+
VERSION = '3.9.10'
|
mdbq/mysql/uploader.py
CHANGED
@@ -99,7 +99,6 @@ class MySQLUploader:
|
|
99
99
|
:param port: 数据库端口,默认为3306
|
100
100
|
:param charset: 字符集,默认为utf8mb4
|
101
101
|
:param collation: 排序规则,默认为utf8mb4_0900_ai_ci
|
102
|
-
|
103
102
|
:param max_retries: 最大重试次数,默认为10
|
104
103
|
:param retry_interval: 重试间隔(秒),默认为10
|
105
104
|
:param pool_size: 连接池大小,默认为5
|
@@ -122,7 +121,7 @@ class MySQLUploader:
|
|
122
121
|
self.write_timeout = write_timeout
|
123
122
|
self.ssl = ssl
|
124
123
|
self._prepared_statements = StatementCache(maxsize=100)
|
125
|
-
self._max_cached_statements = 100
|
124
|
+
self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
|
126
125
|
self._table_metadata_cache = {}
|
127
126
|
self.metadata_cache_ttl = 300 # 5分钟缓存时间
|
128
127
|
|
@@ -386,7 +385,11 @@ class MySQLUploader:
|
|
386
385
|
raise ValueError(error_msg)
|
387
386
|
|
388
387
|
# 移除非法字符,只保留字母、数字、下划线和美元符号
|
389
|
-
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
|
388
|
+
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
|
389
|
+
|
390
|
+
# 将多个连续的下划线替换为单个下划线, 移除开头和结尾的下划线
|
391
|
+
cleaned = re.sub(r'_+', '_', cleaned).strip('_')
|
392
|
+
|
390
393
|
if not cleaned:
|
391
394
|
error_msg = f"无法清理异常标识符: {identifier}"
|
392
395
|
logger.error(error_msg)
|
@@ -597,19 +600,26 @@ class MySQLUploader:
|
|
597
600
|
column_type_lower = column_type.lower()
|
598
601
|
|
599
602
|
# 处理百分比值
|
600
|
-
if isinstance(value, str)
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
603
|
+
if isinstance(value, str):
|
604
|
+
if value.endswith('%'):
|
605
|
+
try:
|
606
|
+
# 移除百分号并转换为小数
|
607
|
+
percent_value = float(value.strip().replace('%', ''))
|
608
|
+
decimal_value = percent_value / 100
|
609
|
+
return decimal_value
|
610
|
+
except ValueError:
|
611
|
+
pass # 如果不是有效的百分比数字,继续正常处理
|
612
|
+
|
613
|
+
elif 'int' in column_type_lower:
|
614
|
+
if isinstance(value, str):
|
615
|
+
# 移除可能的逗号和空格
|
616
|
+
value = value.replace(',', '').strip()
|
617
|
+
# 尝试转换为浮点数再转整数
|
618
|
+
try:
|
619
|
+
return int(float(value))
|
620
|
+
except ValueError:
|
621
|
+
raise ValueError(f"`{value}` 无法转为整数")
|
622
|
+
return int(value) if value is not None else None
|
613
623
|
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
614
624
|
if isinstance(value, str):
|
615
625
|
# 处理可能包含逗号的数字字符串
|
@@ -630,12 +640,11 @@ class MySQLUploader:
|
|
630
640
|
return value.replace('\\', '\\\\').replace("'", "\\'")
|
631
641
|
return str(value)
|
632
642
|
elif 'json' in column_type_lower:
|
633
|
-
import json
|
634
643
|
return json.dumps(value) if value is not None else None
|
635
644
|
else:
|
636
645
|
return value
|
637
646
|
except (ValueError, TypeError) as e:
|
638
|
-
error_msg = f"
|
647
|
+
error_msg = f"转换异常 -> 无法将 `{value}` 的数据类型转为: `{column_type}` -> {str(e)}"
|
639
648
|
logger.error(error_msg)
|
640
649
|
raise ValueError(error_msg)
|
641
650
|
|
@@ -681,7 +690,8 @@ class MySQLUploader:
|
|
681
690
|
auto_create: bool,
|
682
691
|
date_column: Optional[str],
|
683
692
|
indexes: Optional[List[str]],
|
684
|
-
batch_id: Optional[str] = None
|
693
|
+
batch_id: Optional[str] = None,
|
694
|
+
update_on_duplicate: bool = False
|
685
695
|
):
|
686
696
|
"""实际执行表上传的方法"""
|
687
697
|
# 检查表是否存在
|
@@ -711,7 +721,9 @@ class MySQLUploader:
|
|
711
721
|
# 插入数据
|
712
722
|
self._insert_data(
|
713
723
|
db_name, table_name, data, set_typ,
|
714
|
-
check_duplicate, duplicate_columns
|
724
|
+
check_duplicate, duplicate_columns,
|
725
|
+
batch_id=batch_id,
|
726
|
+
update_on_duplicate=update_on_duplicate
|
715
727
|
)
|
716
728
|
|
717
729
|
def _infer_data_type(self, value: Any) -> str:
|
@@ -721,12 +733,13 @@ class MySQLUploader:
|
|
721
733
|
:param value: 要推断的值
|
722
734
|
:return: MySQL数据类型字符串
|
723
735
|
"""
|
724
|
-
if value is None:
|
736
|
+
if value is None or str(value).lower() in ['', 'none', 'nan']:
|
725
737
|
return 'VARCHAR(255)' # 默认字符串类型
|
726
738
|
|
727
739
|
# 检查是否是百分比字符串
|
728
|
-
if isinstance(value, str)
|
729
|
-
|
740
|
+
if isinstance(value, str):
|
741
|
+
if value.endswith('%'):
|
742
|
+
return 'DECIMAL(10,4)' # 百分比统一使用DECIMAL(10,4)
|
730
743
|
|
731
744
|
if isinstance(value, bool):
|
732
745
|
return 'TINYINT(1)'
|
@@ -773,6 +786,26 @@ class MySQLUploader:
|
|
773
786
|
else:
|
774
787
|
return 'VARCHAR(255)'
|
775
788
|
|
789
|
+
def normalize_column_names(self, data: Union[pd.DataFrame, List[Dict[str, Any]]]) -> Union[
|
790
|
+
pd.DataFrame, List[Dict[str, Any]]]:
|
791
|
+
"""
|
792
|
+
1. pandas:规范化列名
|
793
|
+
2. 字典列表:规范化每个字典的键
|
794
|
+
|
795
|
+
参数:
|
796
|
+
data: 输入数据,支持两种类型:
|
797
|
+
- pandas.DataFrame:将规范化其列名
|
798
|
+
- List[Dict[str, Any]]:将规范化列表中每个字典的键
|
799
|
+
"""
|
800
|
+
if isinstance(data, pd.DataFrame):
|
801
|
+
# 处理DataFrame
|
802
|
+
data.columns = [self._validate_identifier(col) for col in data.columns]
|
803
|
+
return data
|
804
|
+
elif isinstance(data, list):
|
805
|
+
# 处理字典列表
|
806
|
+
return [{self._validate_identifier(k): v for k, v in item.items()} for item in data]
|
807
|
+
return data
|
808
|
+
|
776
809
|
def _prepare_data(
|
777
810
|
self,
|
778
811
|
data: Union[Dict, List[Dict], pd.DataFrame],
|
@@ -807,6 +840,9 @@ class MySQLUploader:
|
|
807
840
|
logger.error(error_msg)
|
808
841
|
raise ValueError(error_msg)
|
809
842
|
|
843
|
+
# 统一处理原始数据中列名的特殊字符
|
844
|
+
data = self.normalize_column_names(data)
|
845
|
+
|
810
846
|
# 将set_typ的键转为小写
|
811
847
|
set_typ = {k.lower(): v for k, v in set_typ.items()}
|
812
848
|
|
@@ -826,11 +862,11 @@ class MySQLUploader:
|
|
826
862
|
if sample_values:
|
827
863
|
inferred_type = self._infer_data_type(sample_values[0])
|
828
864
|
filtered_set_typ[col] = inferred_type
|
829
|
-
logger.debug(f"自动推断列
|
865
|
+
logger.debug(f"自动推断列 `{col}` 的数据类型为: {inferred_type}")
|
830
866
|
else:
|
831
867
|
# 没有样本值,使用默认类型
|
832
868
|
filtered_set_typ[col] = 'VARCHAR(255)'
|
833
|
-
logger.debug(f"
|
869
|
+
logger.debug(f"列 `{col}` 使用默认数据类型: VARCHAR(255)")
|
834
870
|
|
835
871
|
prepared_data = []
|
836
872
|
for row_idx, row in enumerate(data, 1):
|
@@ -842,7 +878,7 @@ class MySQLUploader:
|
|
842
878
|
|
843
879
|
if col_name not in row:
|
844
880
|
if not allow_null:
|
845
|
-
error_msg = f"
|
881
|
+
error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`"
|
846
882
|
logger.error(error_msg)
|
847
883
|
raise ValueError(error_msg)
|
848
884
|
prepared_row[col_name] = None
|
@@ -850,7 +886,7 @@ class MySQLUploader:
|
|
850
886
|
try:
|
851
887
|
prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null)
|
852
888
|
except ValueError as e:
|
853
|
-
error_msg = f"
|
889
|
+
error_msg = f"行号:{row_idx}, 列名:`{col_name}`-> 报错: {str(e)}"
|
854
890
|
logger.error(error_msg)
|
855
891
|
raise ValueError(error_msg)
|
856
892
|
prepared_data.append(prepared_row)
|
@@ -871,7 +907,8 @@ class MySQLUploader:
|
|
871
907
|
partition_by: Optional[str] = None,
|
872
908
|
partition_date_column: str = '日期',
|
873
909
|
auto_create: bool = True,
|
874
|
-
indexes: Optional[List[str]] = None
|
910
|
+
indexes: Optional[List[str]] = None,
|
911
|
+
update_on_duplicate: bool = False
|
875
912
|
):
|
876
913
|
"""
|
877
914
|
上传数据到数据库的主入口方法
|
@@ -888,6 +925,7 @@ class MySQLUploader:
|
|
888
925
|
:param partition_date_column: 用于分表的日期列名,默认为'日期'
|
889
926
|
:param auto_create: 表不存在时是否自动创建,默认为True
|
890
927
|
:param indexes: 需要创建索引的列列表,可选
|
928
|
+
:param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
|
891
929
|
:raises: 可能抛出各种验证和数据库相关异常
|
892
930
|
"""
|
893
931
|
upload_start = time.time()
|
@@ -898,10 +936,10 @@ class MySQLUploader:
|
|
898
936
|
|
899
937
|
logger.info("开始上传数据", {
|
900
938
|
'批次号': batch_id,
|
901
|
-
'
|
902
|
-
'
|
939
|
+
'库': db_name,
|
940
|
+
'表': table_name,
|
903
941
|
'分表方式': partition_by,
|
904
|
-
'
|
942
|
+
'排重': check_duplicate,
|
905
943
|
'总计行数': len(data) if hasattr(data, '__len__') else 1,
|
906
944
|
'自动建表': auto_create
|
907
945
|
})
|
@@ -913,10 +951,12 @@ class MySQLUploader:
|
|
913
951
|
# logger.error(error_msg)
|
914
952
|
# raise ValueError(error_msg)
|
915
953
|
|
916
|
-
if partition_by
|
917
|
-
|
918
|
-
|
919
|
-
|
954
|
+
if partition_by:
|
955
|
+
partition_by = str(partition_by).lower()
|
956
|
+
if partition_by not in ['year', 'month']:
|
957
|
+
error_msg = "分表方式必须是 'year' 或 'month'"
|
958
|
+
logger.error(error_msg)
|
959
|
+
raise ValueError(error_msg)
|
920
960
|
|
921
961
|
# 准备数据
|
922
962
|
prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null)
|
@@ -962,7 +1002,7 @@ class MySQLUploader:
|
|
962
1002
|
db_name, part_table, part_data, filtered_set_typ,
|
963
1003
|
primary_keys, check_duplicate, duplicate_columns,
|
964
1004
|
allow_null, auto_create, partition_date_column,
|
965
|
-
indexes, batch_id
|
1005
|
+
indexes, batch_id, update_on_duplicate
|
966
1006
|
)
|
967
1007
|
except Exception as e:
|
968
1008
|
logger.error("分表上传失败", {
|
@@ -976,7 +1016,7 @@ class MySQLUploader:
|
|
976
1016
|
db_name, table_name, prepared_data, filtered_set_typ,
|
977
1017
|
primary_keys, check_duplicate, duplicate_columns,
|
978
1018
|
allow_null, auto_create, partition_date_column,
|
979
|
-
indexes, batch_id
|
1019
|
+
indexes, batch_id, update_on_duplicate
|
980
1020
|
)
|
981
1021
|
|
982
1022
|
success_flag = True
|
@@ -1004,7 +1044,8 @@ class MySQLUploader:
|
|
1004
1044
|
check_duplicate: bool = False,
|
1005
1045
|
duplicate_columns: Optional[List[str]] = None,
|
1006
1046
|
batch_size: int = 1000,
|
1007
|
-
batch_id: Optional[str] = None
|
1047
|
+
batch_id: Optional[str] = None,
|
1048
|
+
update_on_duplicate: bool = False
|
1008
1049
|
):
|
1009
1050
|
"""
|
1010
1051
|
实际执行数据插入的方法
|
@@ -1016,6 +1057,7 @@ class MySQLUploader:
|
|
1016
1057
|
:param check_duplicate: 是否检查重复数据,默认为False
|
1017
1058
|
:param duplicate_columns: 用于检查重复的列,可选
|
1018
1059
|
:param batch_size: 批量插入大小,默认为1000
|
1060
|
+
:param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
|
1019
1061
|
:param batch_id: 批次ID用于日志追踪,可选
|
1020
1062
|
"""
|
1021
1063
|
if not data:
|
@@ -1048,8 +1090,21 @@ class MySQLUploader:
|
|
1048
1090
|
|
1049
1091
|
where_clause = " AND ".join(conditions)
|
1050
1092
|
|
1051
|
-
|
1093
|
+
if update_on_duplicate:
|
1094
|
+
# 更新模式 - 使用ON DUPLICATE KEY UPDATE语法
|
1095
|
+
update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in all_columns])
|
1096
|
+
sql = f"""
|
1052
1097
|
INSERT INTO `{db_name}`.`{table_name}`
|
1098
|
+
(`{'`,`'.join(safe_columns)}`)
|
1099
|
+
VALUES ({placeholders})
|
1100
|
+
ON DUPLICATE KEY UPDATE {update_clause}
|
1101
|
+
"""
|
1102
|
+
|
1103
|
+
# 注意:在update_on_duplicate模式下,row_values只需要插入数据,不需要排重列值
|
1104
|
+
def prepare_values(row):
|
1105
|
+
return [row.get(col) for col in all_columns]
|
1106
|
+
else:
|
1107
|
+
sql = f"""INSERT INTO `{db_name}`.`{table_name}`
|
1053
1108
|
(`{'`,`'.join(safe_columns)}`)
|
1054
1109
|
SELECT {placeholders}
|
1055
1110
|
FROM DUAL
|
@@ -1058,6 +1113,10 @@ class MySQLUploader:
|
|
1058
1113
|
WHERE {where_clause}
|
1059
1114
|
)
|
1060
1115
|
"""
|
1116
|
+
|
1117
|
+
# 在check_duplicate模式下,row_values需要插入数据+排重列值
|
1118
|
+
def prepare_values(row):
|
1119
|
+
return [row.get(col) for col in all_columns] + [row.get(col) for col in duplicate_columns]
|
1061
1120
|
else:
|
1062
1121
|
sql = f"""
|
1063
1122
|
INSERT INTO `{db_name}`.`{table_name}`
|
@@ -1065,6 +1124,10 @@ class MySQLUploader:
|
|
1065
1124
|
VALUES ({placeholders})
|
1066
1125
|
"""
|
1067
1126
|
|
1127
|
+
# 普通模式下,row_values只需要插入数据
|
1128
|
+
def prepare_values(row):
|
1129
|
+
return [row.get(col) for col in all_columns]
|
1130
|
+
|
1068
1131
|
total_inserted = 0
|
1069
1132
|
total_skipped = 0
|
1070
1133
|
total_failed = 0 # 失败计数器
|
@@ -1080,11 +1143,7 @@ class MySQLUploader:
|
|
1080
1143
|
for row in batch:
|
1081
1144
|
try:
|
1082
1145
|
# 准备参数
|
1083
|
-
row_values =
|
1084
|
-
# 如果是排重检查,添加排重列值
|
1085
|
-
if check_duplicate:
|
1086
|
-
row_values += [row.get(col) for col in duplicate_columns]
|
1087
|
-
|
1146
|
+
row_values = prepare_values(row)
|
1088
1147
|
cursor.execute(sql, row_values)
|
1089
1148
|
successful_rows += 1
|
1090
1149
|
conn.commit() # 每次成功插入后提交
|
@@ -1096,13 +1155,13 @@ class MySQLUploader:
|
|
1096
1155
|
# 记录失败行详细信息
|
1097
1156
|
error_details = {
|
1098
1157
|
'批次号': batch_id,
|
1099
|
-
'
|
1100
|
-
'
|
1158
|
+
'库': db_name,
|
1159
|
+
'表': table_name,
|
1101
1160
|
'error_type': type(e).__name__,
|
1102
1161
|
'error_message': str(e),
|
1103
|
-
'
|
1162
|
+
'数据类型': set_typ,
|
1104
1163
|
'是否排重': check_duplicate,
|
1105
|
-
'
|
1164
|
+
'排重列': duplicate_columns
|
1106
1165
|
}
|
1107
1166
|
logger.error(f"单行插入失败: {error_details}")
|
1108
1167
|
continue # 跳过当前行,继续处理下一行
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=83jYP6xnYylgp029cctX2BP7k_exd-phUiwATgIjhH0,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
|
5
5
|
mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
|
@@ -12,7 +12,7 @@ mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
|
12
12
|
mdbq/mysql/deduplicator.py,sha256=brhX3eyE8-kn3nAYweKfBbAkXiNcyw_pL4CTyPqmPBg,21983
|
13
13
|
mdbq/mysql/mysql.py,sha256=jTcizvUtRdwMhWK2i_LA9yDPmcifLjUzVhwTbC3wfJk,119785
|
14
14
|
mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
|
15
|
-
mdbq/mysql/uploader.py,sha256=
|
15
|
+
mdbq/mysql/uploader.py,sha256=V23PAzT59lMUqijkUiwV6a1qNwk9T76k8HKxY8fYW9w,52140
|
16
16
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
17
17
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
18
18
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=OhyEv1VyAKTOHjLDM37iNDQeRg5OnrNoKODoG2VxHes,19806
|
28
|
-
mdbq-3.9.
|
29
|
-
mdbq-3.9.
|
30
|
-
mdbq-3.9.
|
31
|
-
mdbq-3.9.
|
28
|
+
mdbq-3.9.10.dist-info/METADATA,sha256=Ln51lgeqZn0zAjgLUKXaMNJ5ZXCkX3Eyu0iao37_IQw,364
|
29
|
+
mdbq-3.9.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-3.9.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-3.9.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|