mdbq-4.1.6-py3-none-any.whl → mdbq-4.1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mdbq might be problematic. Click here for more details.

mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '4.1.6'
1
+ VERSION = '4.1.8'
mdbq/mysql/uploader.py CHANGED
@@ -784,11 +784,18 @@ class MySQLUploader:
784
784
  'char': 'none',
785
785
  'mediumtext': 'none',
786
786
  'longtext': 'none',
787
+ 'enum': None, # enum类型需要特殊处理,使用第一个可选值
788
+ 'set': '', # set类型默认为空字符串
787
789
  }
788
790
  fallback = 'none'
789
791
  for typ, val in fallback_map.items():
790
792
  if typ in column_type_lower:
791
- fallback = val
793
+ if typ == 'enum' and val is None:
794
+ # 对于enum类型,使用第一个可选值作为默认值
795
+ enum_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
796
+ fallback = enum_values[0] if enum_values else 'none'
797
+ else:
798
+ fallback = val
792
799
  break
793
800
  if not allow_null:
794
801
  logger.warning("该列不允许为空值", {"库": db_name, "表": table_name, "allow_null": allow_null, "列": col_name, "值": value, "兜底值": fallback})
@@ -833,6 +840,33 @@ class MySQLUploader:
833
840
  except (ValueError, TypeError, InvalidOperation) as e:
834
841
  logger.error(f"值 `{value}` 无法转换为数值类型: {e}", {"库": db_name, "表": table_name, "列": col_name})
835
842
  raise ValueError(f"值 `{value}` 无法转换为数值类型: {e}")
843
+ # ENUM类型验证
844
+ elif 'enum' in column_type_lower:
845
+ # 提取enum的可选值,支持单引号和双引号
846
+ enum_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
847
+ str_value = str(value).strip()
848
+ if str_value not in enum_values:
849
+ logger.error(f"值 `{str_value}` 不在enum允许的值中: {enum_values}",
850
+ {"库": db_name, "表": table_name, "列": col_name, "列类型": column_type})
851
+ raise ValueError(f"值 `{str_value}` 不在enum允许的值中: {enum_values}")
852
+ return str_value
853
+ # SET类型验证
854
+ elif 'set' in column_type_lower:
855
+ # 提取set的可选值,支持单引号和双引号
856
+ set_values = re.findall(r"['\"]([^'\"]*)['\"]", column_type)
857
+ str_value = str(value).strip()
858
+ # SET类型可以是多个值的组合,用逗号分隔
859
+ if ',' in str_value:
860
+ input_values = [v.strip() for v in str_value.split(',')]
861
+ else:
862
+ input_values = [str_value]
863
+
864
+ for val in input_values:
865
+ if val and val not in set_values:
866
+ logger.error(f"值 `{val}` 不在set允许的值中: {set_values}",
867
+ {"库": db_name, "表": table_name, "列": col_name, "列类型": column_type})
868
+ raise ValueError(f"值 `{val}` 不在set允许的值中: {set_values}")
869
+ return str_value
836
870
  # 字符串类型验证
837
871
  elif 'varchar' in column_type_lower:
838
872
  str_value = str(value)
@@ -1134,21 +1168,27 @@ class MySQLUploader:
1134
1168
  ) -> Tuple[List[Dict], Dict[str, str]]:
1135
1169
  """
1136
1170
  准备要上传的数据,验证并转换数据类型
1171
+ 根据set_typ自动处理所有数据类型的列:补齐缺失的列并丢弃多余的列
1137
1172
  """
1173
+ # set_typ的键清洗
1174
+ if not set_typ:
1175
+ set_typ = {}
1176
+ normalized_set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
1177
+
1138
1178
  # 统一数据格式为字典列表
1139
1179
  if isinstance(data, pd.DataFrame):
1140
1180
  try:
1141
1181
  if self.case_sensitive:
1142
- data.columns = [col for col in data.columns]
1182
+ data.columns = [self._validate_identifier(col) for col in data.columns]
1143
1183
  else:
1144
- data.columns = [col.lower() for col in data.columns]
1184
+ data.columns = [self._validate_identifier(col).lower() for col in data.columns]
1145
1185
  data = data.replace({pd.NA: None}).to_dict('records')
1146
1186
  except Exception as e:
1147
- logger.error('数据转字典时发生错误', {
1187
+ logger.error('DataFrame处理时发生错误', {
1148
1188
  'error': str(e),
1149
1189
  'data': self._shorten_for_log(data),
1150
1190
  })
1151
- raise ValueError(f"数据转字典时发生错误: {e}")
1191
+ raise ValueError(f"DataFrame处理时发生错误: {e}")
1152
1192
  elif isinstance(data, dict):
1153
1193
  if self.case_sensitive:
1154
1194
  data = [{k: v for k, v in data.items()}]
@@ -1168,34 +1208,65 @@ class MySQLUploader:
1168
1208
  # 统一处理原始数据中列名的特殊字符
1169
1209
  data = self.normalize_column_names(data)
1170
1210
 
1171
- if not set_typ:
1211
+ if not normalized_set_typ:
1172
1212
  logger.warning('set_typ为空, 将自动推断数据类型, 可能存在数据类型识别错误')
1173
- # set_typ的键清洗
1174
- if not set_typ:
1175
- set_typ = {}
1176
- set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
1177
1213
 
1178
- # 新实现:严格按set_typ顺序过滤,后补充data中有但set_typ没有的列
1214
+ # 根据set_typ处理所有数据的列:严格按set_typ定义的列进行过滤
1179
1215
  filtered_set_typ = {}
1180
1216
  data_columns = list(data[0].keys()) if data and len(data) > 0 else []
1181
- # 先按set_typ顺序
1182
- for col in set_typ:
1183
- if col in data_columns:
1184
- filtered_set_typ[col] = set_typ[col]
1185
- # 再补充data中有但set_typ没有的列
1186
- for col in data_columns:
1187
- if col not in filtered_set_typ:
1188
- # 推断类型
1189
- sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
1190
- inferred_type = None
1191
- for val in sample_values:
1192
- inferred_type = self._infer_data_type(val, no_log=True)
1193
- if inferred_type:
1194
- break
1195
- if not inferred_type:
1196
- inferred_type = 'VARCHAR(255)'
1197
- filtered_set_typ[col] = inferred_type
1198
- logger.debug(f"自动推断列 `{col}` 的数据类型为: `{inferred_type}`")
1217
+
1218
+ if normalized_set_typ:
1219
+ # 严格按照set_typ定义的列进行过滤,排除id列
1220
+ for col in normalized_set_typ:
1221
+ if (self.case_sensitive and col == 'id') or (not self.case_sensitive and col.lower() == 'id'):
1222
+ continue
1223
+ filtered_set_typ[col] = normalized_set_typ[col]
1224
+
1225
+ # 对所有数据行进行列处理:补齐缺失列,丢弃多余列
1226
+ processed_data = []
1227
+ for row in data:
1228
+ processed_row = {}
1229
+ # 只保留set_typ中定义的列
1230
+ for col in filtered_set_typ:
1231
+ if col in row:
1232
+ processed_row[col] = row[col]
1233
+ else:
1234
+ processed_row[col] = None # 缺失列用None填充
1235
+ processed_data.append(processed_row)
1236
+ data = processed_data
1237
+
1238
+ # 检查是否有丢弃的列
1239
+ dropped_columns = [col for col in data_columns if col not in filtered_set_typ]
1240
+ if dropped_columns:
1241
+ logger.warning('数据中存在set_typ未定义的列并已被丢弃', {
1242
+ '库': db_name,
1243
+ '表': table_name,
1244
+ '丢弃列': dropped_columns,
1245
+ # '保留列': list(filtered_set_typ.keys())
1246
+ })
1247
+
1248
+ logger.debug('数据列处理完成', {
1249
+ '库': db_name,
1250
+ '表': table_name,
1251
+ '原始列': data_columns,
1252
+ '目标列': list(filtered_set_typ.keys()),
1253
+ '丢弃列': dropped_columns
1254
+ })
1255
+ else:
1256
+ # 如果set_typ为空,则推断所有数据列的类型
1257
+ for col in data_columns:
1258
+ if col not in filtered_set_typ:
1259
+ # 推断类型
1260
+ sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
1261
+ inferred_type = None
1262
+ for val in sample_values:
1263
+ inferred_type = self._infer_data_type(val, no_log=True)
1264
+ if inferred_type:
1265
+ break
1266
+ if not inferred_type:
1267
+ inferred_type = 'VARCHAR(255)'
1268
+ filtered_set_typ[col] = inferred_type
1269
+ logger.debug(f"自动推断列 `{col}` 的数据类型为: `{inferred_type}`")
1199
1270
 
1200
1271
  prepared_data = []
1201
1272
  for row_idx, row in enumerate(data, 1):
@@ -1205,11 +1276,15 @@ class MySQLUploader:
1205
1276
  if (self.case_sensitive and col_name == 'id') or (not self.case_sensitive and col_name.lower() == 'id'):
1206
1277
  continue
1207
1278
  if col_name not in row:
1208
- if not allow_null:
1209
- error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`"
1210
- logger.error(error_msg, {'row': self._shorten_for_log(row)})
1211
- raise ValueError(error_msg)
1212
- prepared_row[col_name] = None
1279
+ # 对于缺失的列,使用None作为默认值,在_validate_value中会根据allow_null和列类型进行进一步处理
1280
+ try:
1281
+ prepared_row[col_name] = self._validate_value(None, filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
1282
+ except ValueError as e:
1283
+ if not allow_null:
1284
+ error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`, 且不允许空值"
1285
+ logger.error(error_msg, {'row': self._shorten_for_log(row)})
1286
+ raise ValueError(error_msg)
1287
+ prepared_row[col_name] = None
1213
1288
  else:
1214
1289
  try:
1215
1290
  prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null, db_name, table_name, col_name)
@@ -2296,6 +2371,47 @@ class MySQLUploader:
2296
2371
  })
2297
2372
  return validated_keys
2298
2373
 
2374
+ @staticmethod
2375
+ def process_df_columns(
2376
+ df: pd.DataFrame,
2377
+ columns: List[str],
2378
+ default_value: Any = 0
2379
+ ) -> pd.DataFrame:
2380
+ """
2381
+ 处理DataFrame的列,补齐缺失的列并丢弃多余的列
2382
+
2383
+ :param df: 要处理的DataFrame
2384
+ :param columns: 所需的列名列表,注意不处理大小写
2385
+ :param default_value: 缺失列的填充值,默认为None
2386
+ :return: 处理后的DataFrame
2387
+ """
2388
+ if df is None or not isinstance(df, pd.DataFrame) or not isinstance(columns, list) or not columns:
2389
+ return df
2390
+
2391
+ # 获取当前列名
2392
+ current_columns = list(df.columns)
2393
+
2394
+ # 找出需要添加的列和需要删除的列
2395
+ missing_columns = [col for col in columns if col not in current_columns]
2396
+ extra_columns = [col for col in current_columns if col not in columns]
2397
+
2398
+ # 复制DataFrame
2399
+ result_df = df.copy()
2400
+
2401
+ # 删除多余的列
2402
+ if extra_columns:
2403
+ result_df = result_df.drop(columns=extra_columns)
2404
+
2405
+ # 添加缺失的列
2406
+ if missing_columns:
2407
+ for col in missing_columns:
2408
+ result_df[col] = default_value
2409
+
2410
+ # 按照指定顺序重新排列列
2411
+ result_df = result_df.reindex(columns=columns)
2412
+
2413
+ return result_df
2414
+
2299
2415
 
2300
2416
  def main():
2301
2417
  dir_path = os.path.expanduser("~")
{mdbq-4.1.6.dist-info → mdbq-4.1.8.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 4.1.6
3
+ Version: 4.1.8
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
{mdbq-4.1.6.dist-info → mdbq-4.1.8.dist-info}/RECORD CHANGED
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=7c62s32-DiiicOsmr_3thHK6QH89lYKau8J0SY1GH-g,17
2
+ mdbq/__version__.py,sha256=Jd5sP6XMAzuwfeUQJhdmEnXOsZWP5LcueloV0VDjdww,17
3
3
  mdbq/auth/__init__.py,sha256=pnPMAt63sh1B6kEvmutUuro46zVf2v2YDAG7q-jV_To,24
4
4
  mdbq/auth/auth_backend.py,sha256=iLN7AqiSq7fQgFtNtge_TIlVOR1hrCSZXH6oId6uGX4,116924
5
5
  mdbq/auth/crypto.py,sha256=fcZRFCnrKVVdWDUx_zds51ynFYwS9DBvJOrRQVldrfM,15931
@@ -15,7 +15,7 @@ mdbq/mysql/deduplicator.py,sha256=2fugLyKs_xkvYvoG0C0hRYbJ_w8-4oa1FJ_vavoD7Qo,73
15
15
  mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
16
16
  mdbq/mysql/s_query.py,sha256=N2xHJf2CiUXjXIVBemdst-wamIP3908EGAJOFG13fCU,50475
17
17
  mdbq/mysql/unique_.py,sha256=MaztT-WIyEQUs-OOYY4pFulgHVcXR1BfCy3QUz0XM_U,21127
18
- mdbq/mysql/uploader.py,sha256=FG_4btNwTjbCqZFeIigCfar7r-OOA7VkyuJsOOC9WLw,111539
18
+ mdbq/mysql/uploader.py,sha256=VhI_VKpJ1fld6dzdDv_Q1gN56mrcPnDsXAdnDgFAqeg,117186
19
19
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
20
20
  mdbq/other/download_sku_picture.py,sha256=X66sVdvVgzoNzmgVJyPtd7bjEvctEKtLPblEPF65EWc,46940
21
21
  mdbq/other/error_handler.py,sha256=4p5haAXSY-P78stp4Xwo_MwAngWYqyKj5ogWIuYXMeY,12631
@@ -35,7 +35,7 @@ mdbq/route/routes.py,sha256=QVGfTvDgu0CpcKCvk1ra74H8uojgqTLUav1fnVAqLEA,29433
35
35
  mdbq/selenium/__init__.py,sha256=AKzeEceqZyvqn2dEDoJSzDQnbuENkJSHAlbHAD0u0ZI,10
36
36
  mdbq/selenium/get_driver.py,sha256=1NTlVUE6QsyjTrVVVqTO2LOnYf578ccFWlWnvIXGtic,20903
37
37
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
38
- mdbq-4.1.6.dist-info/METADATA,sha256=2dRtmGxtsWYYcheH8sXkFexnBq687lwKffpmQIdWtrU,363
39
- mdbq-4.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
- mdbq-4.1.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
41
- mdbq-4.1.6.dist-info/RECORD,,
38
+ mdbq-4.1.8.dist-info/METADATA,sha256=rBRP7HOwmYzaA380L7fXLBlAFQn2xHgFYObmlMSLa2I,363
39
+ mdbq-4.1.8.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
40
+ mdbq-4.1.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
41
+ mdbq-4.1.8.dist-info/RECORD,,
{mdbq-4.1.6.dist-info → mdbq-4.1.8.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5