mdbq 4.1.7__py3-none-any.whl → 4.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mdbq might be problematic.
- mdbq/__version__.py +1 -1
- mdbq/mysql/uploader.py +149 -74
- {mdbq-4.1.7.dist-info → mdbq-4.1.8.dist-info}/METADATA +1 -1
- {mdbq-4.1.7.dist-info → mdbq-4.1.8.dist-info}/RECORD +6 -6
- {mdbq-4.1.7.dist-info → mdbq-4.1.8.dist-info}/WHEEL +0 -0
- {mdbq-4.1.7.dist-info → mdbq-4.1.8.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED

@@ -1 +1 @@
-VERSION = '4.1.7'
+VERSION = '4.1.8'
mdbq/mysql/uploader.py
CHANGED

@@ -784,11 +784,18 @@ class MySQLUploader:
             'char': 'none',
             'mediumtext': 'none',
             'longtext': 'none',
+            'enum': None,  # enum类型需要特殊处理,使用第一个可选值
+            'set': '',  # set类型默认为空字符串
         }
         fallback = 'none'
         for typ, val in fallback_map.items():
             if typ in column_type_lower:
-
+                if typ == 'enum' and val is None:
+                    # 对于enum类型,使用第一个可选值作为默认值
+                    enum_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
+                    fallback = enum_values[0] if enum_values else 'none'
+                else:
+                    fallback = val
                 break
         if not allow_null:
             logger.warning("该列不允许为空值", {"库": db_name, "表": table_name, "allow_null": allow_null, "列": col_name, "值": value, "兜底值": fallback})
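Note: the new 'enum' entry in the fallback map has no fixed default. When a value is missing for a column that disallows NULLs, the code pulls the first declared option out of the column definition with a regex. A minimal standalone sketch of that extraction (the function name is illustrative, not part of the uploader's API):

import re

def first_enum_option(column_type: str, default: str = 'none') -> str:
    # Pull every quoted token out of the column definition (same regex as the diff)
    # and return the first one; fall back to `default` when nothing is declared.
    options = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
    return options[0] if options else default

print(first_enum_option("enum('pending','done','failed')"))  # -> pending
print(first_enum_option("varchar(50)"))                      # -> none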
@@ -833,6 +840,33 @@ class MySQLUploader:
             except (ValueError, TypeError, InvalidOperation) as e:
                 logger.error(f"值 `{value}` 无法转换为数值类型: {e}", {"库": db_name, "表": table_name, "列": col_name})
                 raise ValueError(f"值 `{value}` 无法转换为数值类型: {e}")
+        # ENUM类型验证
+        elif 'enum' in column_type_lower:
+            # 提取enum的可选值,支持单引号和双引号
+            enum_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
+            str_value = str(value).strip()
+            if str_value not in enum_values:
+                logger.error(f"值 `{str_value}` 不在enum允许的值中: {enum_values}",
+                             {"库": db_name, "表": table_name, "列": col_name, "列类型": column_type})
+                raise ValueError(f"值 `{str_value}` 不在enum允许的值中: {enum_values}")
+            return str_value
+        # SET类型验证
+        elif 'set' in column_type_lower:
+            # 提取set的可选值,支持单引号和双引号
+            set_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
+            str_value = str(value).strip()
+            # SET类型可以是多个值的组合,用逗号分隔
+            if ',' in str_value:
+                input_values = [v.strip() for v in str_value.split(',')]
+            else:
+                input_values = [str_value]
+
+            for val in input_values:
+                if val and val not in set_values:
+                    logger.error(f"值 `{val}` 不在set允许的值中: {set_values}",
+                                 {"库": db_name, "表": table_name, "列": col_name, "列类型": column_type})
+                    raise ValueError(f"值 `{val}` 不在set允许的值中: {set_values}")
+            return str_value
         # 字符串类型验证
         elif 'varchar' in column_type_lower:
             str_value = str(value)
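Note: the ENUM branch accepts exactly one of the declared options, while the SET branch splits the incoming value on commas and checks each part against the declared options. A standalone sketch of the same membership check, without the uploader's logging (the function name is illustrative):

import re

def check_enum_set(value, column_type: str) -> str:
    # Validate `value` against the options declared in a MySQL ENUM/SET column
    # definition and return it as a stripped string, mirroring the diff's checks.
    options = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
    str_value = str(value).strip()
    if column_type.lower().startswith('enum'):
        if str_value not in options:
            raise ValueError(f"value `{str_value}` not in enum options: {options}")
    else:  # SET: the value may be a comma-separated combination of options
        for part in (v.strip() for v in str_value.split(',')):
            if part and part not in options:
                raise ValueError(f"value `{part}` not in set options: {options}")
    return str_value

print(check_enum_set('done', "enum('pending','done')"))  # -> done
print(check_enum_set('a,b', "set('a','b','c')"))         # -> a,b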
@@ -1134,21 +1168,27 @@ class MySQLUploader:
     ) -> Tuple[List[Dict], Dict[str, str]]:
         """
         准备要上传的数据,验证并转换数据类型
+        根据set_typ自动处理所有数据类型的列:补齐缺失的列并丢弃多余的列
         """
+        # set_typ的键清洗
+        if not set_typ:
+            set_typ = {}
+        normalized_set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
+
         # 统一数据格式为字典列表
         if isinstance(data, pd.DataFrame):
             try:
                 if self.case_sensitive:
-                    data.columns = [col for col in data.columns]
+                    data.columns = [self._validate_identifier(col) for col in data.columns]
                 else:
-                    data.columns = [col.lower() for col in data.columns]
+                    data.columns = [self._validate_identifier(col).lower() for col in data.columns]
                 data = data.replace({pd.NA: None}).to_dict('records')
             except Exception as e:
-                logger.error('
+                logger.error('DataFrame处理时发生错误', {
                     'error': str(e),
                     'data': self._shorten_for_log(data),
                 })
-                raise ValueError(f"
+                raise ValueError(f"DataFrame处理时发生错误: {e}")
         elif isinstance(data, dict):
             if self.case_sensitive:
                 data = [{k: v for k, v in data.items()}]
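Note: DataFrame column names are now run through the identifier validator before the optional lower-casing, and the frame is converted to records with pd.NA mapped to None. A rough sketch of that conversion step, using a simplified stand-in for _validate_identifier (the real method is not shown in this diff):

import re
import pandas as pd

def validate_identifier(name: str) -> str:
    # Simplified stand-in for the uploader's _validate_identifier:
    # keep word characters, replace everything else with '_'.
    return re.sub(r'[^\w$]', '_', str(name).strip())

case_sensitive = False
df = pd.DataFrame({'Order ID': [1, 2], 'Qty': [3, pd.NA]})

df.columns = [validate_identifier(c) if case_sensitive
              else validate_identifier(c).lower() for c in df.columns]
records = df.replace({pd.NA: None}).to_dict('records')
print(records)  # e.g. [{'order_id': 1, 'qty': 3}, {'order_id': 2, 'qty': None}]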
@@ -1168,34 +1208,65 @@ class MySQLUploader:
         # 统一处理原始数据中列名的特殊字符
         data = self.normalize_column_names(data)

-        if not
+        if not normalized_set_typ:
             logger.warning('set_typ为空, 将自动推断数据类型, 可能存在数据类型识别错误')
-        # set_typ的键清洗
-        if not set_typ:
-            set_typ = {}
-        set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}

-        #
+        # 根据set_typ处理所有数据的列:严格按set_typ定义的列进行过滤
         filtered_set_typ = {}
         data_columns = list(data[0].keys()) if data and len(data) > 0 else []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        if normalized_set_typ:
+            # 严格按照set_typ定义的列进行过滤,排除id列
+            for col in normalized_set_typ:
+                if (self.case_sensitive and col == 'id') or (not self.case_sensitive and col.lower() == 'id'):
+                    continue
+                filtered_set_typ[col] = normalized_set_typ[col]
+
+            # 对所有数据行进行列处理:补齐缺失列,丢弃多余列
+            processed_data = []
+            for row in data:
+                processed_row = {}
+                # 只保留set_typ中定义的列
+                for col in filtered_set_typ:
+                    if col in row:
+                        processed_row[col] = row[col]
+                    else:
+                        processed_row[col] = None  # 缺失列用None填充
+                processed_data.append(processed_row)
+            data = processed_data
+
+            # 检查是否有丢弃的列
+            dropped_columns = [col for col in data_columns if col not in filtered_set_typ]
+            if dropped_columns:
+                logger.warning('数据中存在set_typ未定义的列并已被丢弃', {
+                    '库': db_name,
+                    '表': table_name,
+                    '丢弃列': dropped_columns,
+                    # '保留列': list(filtered_set_typ.keys())
+                })
+
+            logger.debug('数据列处理完成', {
+                '库': db_name,
+                '表': table_name,
+                '原始列': data_columns,
+                '目标列': list(filtered_set_typ.keys()),
+                '丢弃列': dropped_columns
+            })
+        else:
+            # 如果set_typ为空,则推断所有数据列的类型
+            for col in data_columns:
+                if col not in filtered_set_typ:
+                    # 推断类型
+                    sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
+                    inferred_type = None
+                    for val in sample_values:
+                        inferred_type = self._infer_data_type(val, no_log=True)
+                        if inferred_type:
+                            break
+                    if not inferred_type:
+                        inferred_type = 'VARCHAR(255)'
+                    filtered_set_typ[col] = inferred_type
+                    logger.debug(f"自动推断列 `{col}` 的数据类型为: `{inferred_type}`")

         prepared_data = []
         for row_idx, row in enumerate(data, 1):
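Note: stripped of logging, the new column handling is a per-row projection onto the columns declared in set_typ: declared-but-missing columns are filled with None, undeclared columns are dropped and reported. A condensed standalone sketch (the helper name is hypothetical):

from typing import Any, Dict, List

def project_rows(data: List[Dict[str, Any]], target_cols: List[str]):
    # Keep only target_cols in every row (missing values become None) and
    # report which input columns were dropped along the way.
    source_cols = list(data[0].keys()) if data else []
    projected = [{col: row.get(col) for col in target_cols} for row in data]
    dropped = [col for col in source_cols if col not in target_cols]
    return projected, dropped

rows = [{'name': 'a', 'price': 1.5, 'extra': 'x'}, {'name': 'b'}]
projected, dropped = project_rows(rows, ['name', 'price'])
print(projected)  # [{'name': 'a', 'price': 1.5}, {'name': 'b', 'price': None}]
print(dropped)    # ['extra']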
@@ -1205,11 +1276,15 @@ class MySQLUploader:
             if (self.case_sensitive and col_name == 'id') or (not self.case_sensitive and col_name.lower() == 'id'):
                 continue
             if col_name not in row:
-
-
-
-
-
+                # 对于缺失的列,使用None作为默认值,在_validate_value中会根据allow_null和列类型进行进一步处理
+                try:
+                    prepared_row[col_name] = self._validate_value(None, filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
+                except ValueError as e:
+                    if not allow_null:
+                        error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`, 且不允许空值"
+                        logger.error(error_msg, {'row': self._shorten_for_log(row)})
+                        raise ValueError(error_msg)
+                    prepared_row[col_name] = None
             else:
                 try:
                     prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
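Note: for a declared column that is absent from a row, the new branch first lets _validate_value decide what None should become, and only surfaces an error when validation fails and the table forbids NULLs. A simplified sketch of that control flow, with the validator passed in as a plain callable (names are illustrative):

def fill_missing_column(validate, column_type, allow_null, col_name, row_idx):
    # Mirror of the control flow above: try the normal validation path with None,
    # surface the error only when the column truly cannot be NULL, else store None.
    try:
        return validate(None, column_type, allow_null)
    except ValueError:
        if not allow_null:
            raise ValueError(f"row {row_idx}: missing column `{col_name}` and NULL is not allowed")
        return None

def reject_none(value, column_type, allow_null):
    # Toy validator used only for this sketch; the real _validate_value also
    # applies type coercion and fallback values.
    if value is None:
        raise ValueError("None rejected")
    return value

print(fill_missing_column(reject_none, 'varchar(50)', True, 'remark', 3))  # None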
@@ -2296,46 +2371,46 @@ class MySQLUploader:
         })
         return validated_keys

-
-    def process_df_columns(
-
-
-
-    ) -> pd.DataFrame:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @staticmethod
+    def process_df_columns(
+            df: pd.DataFrame,
+            columns: List[str],
+            default_value: Any = 0
+    ) -> pd.DataFrame:
+        """
+        处理DataFrame的列,补齐缺失的列并丢弃多余的列
+
+        :param df: 要处理的DataFrame
+        :param columns: 所需的列名列表,注意不处理大小写
+        :param default_value: 缺失列的填充值,默认为None
+        :return: 处理后的DataFrame
+        """
+        if df is None or not isinstance(df, pd.DataFrame) or not isinstance(columns, list) or not columns:
+            return df
+
+        # 获取当前列名
+        current_columns = list(df.columns)
+
+        # 找出需要添加的列和需要删除的列
+        missing_columns = [col for col in columns if col not in current_columns]
+        extra_columns = [col for col in current_columns if col not in columns]
+
+        # 复制DataFrame
+        result_df = df.copy()
+
+        # 删除多余的列
+        if extra_columns:
+            result_df = result_df.drop(columns=extra_columns)
+
+        # 添加缺失的列
+        if missing_columns:
+            for col in missing_columns:
+                result_df[col] = default_value
+
+        # 按照指定顺序重新排列列
+        result_df = result_df.reindex(columns=columns)
+
+        return result_df


 def main():
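Note: process_df_columns is now a @staticmethod, so it can be called without instantiating the uploader. The signature defaults default_value to 0 even though the docstring still says None. A usage sketch based on the signature shown above (assumes mdbq 4.1.8 is installed):

import pandas as pd
from mdbq.mysql.uploader import MySQLUploader  # assumes mdbq 4.1.8 is installed

df = pd.DataFrame({'shop': ['a', 'b'], 'clicks': [10, 20], 'tmp': [1, 2]})

# 'tmp' is dropped, 'cost' is added and filled with default_value,
# and the result comes back in the requested column order.
aligned = MySQLUploader.process_df_columns(df, ['shop', 'clicks', 'cost'], default_value=0)
print(aligned.columns.tolist())  # ['shop', 'clicks', 'cost']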
{mdbq-4.1.7.dist-info → mdbq-4.1.8.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=Jd5sP6XMAzuwfeUQJhdmEnXOsZWP5LcueloV0VDjdww,17
 mdbq/auth/__init__.py,sha256=pnPMAt63sh1B6kEvmutUuro46zVf2v2YDAG7q-jV_To,24
 mdbq/auth/auth_backend.py,sha256=iLN7AqiSq7fQgFtNtge_TIlVOR1hrCSZXH6oId6uGX4,116924
 mdbq/auth/crypto.py,sha256=fcZRFCnrKVVdWDUx_zds51ynFYwS9DBvJOrRQVldrfM,15931
@@ -15,7 +15,7 @@ mdbq/mysql/deduplicator.py,sha256=2fugLyKs_xkvYvoG0C0hRYbJ_w8-4oa1FJ_vavoD7Qo,73
 mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
 mdbq/mysql/s_query.py,sha256=N2xHJf2CiUXjXIVBemdst-wamIP3908EGAJOFG13fCU,50475
 mdbq/mysql/unique_.py,sha256=MaztT-WIyEQUs-OOYY4pFulgHVcXR1BfCy3QUz0XM_U,21127
-mdbq/mysql/uploader.py,sha256=
+mdbq/mysql/uploader.py,sha256=VhI_VKpJ1fld6dzdDv_Q1gN56mrcPnDsXAdnDgFAqeg,117186
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=X66sVdvVgzoNzmgVJyPtd7bjEvctEKtLPblEPF65EWc,46940
 mdbq/other/error_handler.py,sha256=4p5haAXSY-P78stp4Xwo_MwAngWYqyKj5ogWIuYXMeY,12631
@@ -35,7 +35,7 @@ mdbq/route/routes.py,sha256=QVGfTvDgu0CpcKCvk1ra74H8uojgqTLUav1fnVAqLEA,29433
 mdbq/selenium/__init__.py,sha256=AKzeEceqZyvqn2dEDoJSzDQnbuENkJSHAlbHAD0u0ZI,10
 mdbq/selenium/get_driver.py,sha256=1NTlVUE6QsyjTrVVVqTO2LOnYf578ccFWlWnvIXGtic,20903
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq-4.1.
-mdbq-4.1.
-mdbq-4.1.
-mdbq-4.1.
+mdbq-4.1.8.dist-info/METADATA,sha256=rBRP7HOwmYzaA380L7fXLBlAFQn2xHgFYObmlMSLa2I,363
+mdbq-4.1.8.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-4.1.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-4.1.8.dist-info/RECORD,,
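Note: each RECORD entry has the form path,sha256=<urlsafe base64 of the SHA-256 digest, without '=' padding>,<size in bytes>. A small sketch for recomputing an entry from a local file, which can be used to check the hashes listed above:

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Rebuild a wheel RECORD line: urlsafe base64 of the SHA-256 digest with the
    # '=' padding stripped, followed by the file size in bytes.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=').decode()
    return f"{path},sha256={digest},{len(data)}"

# Running record_entry('mdbq/__version__.py') against the files unpacked from the
# 4.1.8 wheel should reproduce the corresponding line shown above.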
{mdbq-4.1.7.dist-info → mdbq-4.1.8.dist-info}/WHEEL
File without changes

{mdbq-4.1.7.dist-info → mdbq-4.1.8.dist-info}/top_level.txt
File without changes