mdbq 3.10.2__py3-none-any.whl → 3.10.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.10.2'
1
+ VERSION = '3.10.4'
mdbq/mysql/uploader.py CHANGED
@@ -106,6 +106,8 @@ class MySQLUploader:
106
106
  :param connect_timeout: 连接超时(秒),默认为10
107
107
  :param read_timeout: 读取超时(秒),默认为30
108
108
  :param write_timeout: 写入超时(秒),默认为30
109
+ :param base_excute_col: # 排重插入数据时始终排除该列
110
+ :param case_sensitive: # 是否保持大小写敏感,默认为False(转为小写)
109
111
  :param ssl: SSL配置字典,默认为None
110
112
  """
111
113
  self.username = username
@@ -120,6 +122,8 @@ class MySQLUploader:
120
122
  self.connect_timeout = connect_timeout
121
123
  self.read_timeout = read_timeout
122
124
  self.write_timeout = write_timeout
125
+ self.base_excute_col = ['id', '更新时间'] # 排重插入数据时始终排除该列
126
+ self.case_sensitive = False # 是否保持大小写敏感,默认为False(转为小写)
123
127
  self.ssl = ssl
124
128
  self._prepared_statements = StatementCache(maxsize=100)
125
129
  self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
@@ -303,8 +307,8 @@ class MySQLUploader:
303
307
  return exists
304
308
  except Exception as e:
305
309
  logger.error(sys._getframe().f_code.co_name, {
310
+ '库': db_name,
306
311
  '检查数据库是否存在时出错': str(e),
307
- '库': db_name
308
312
  })
309
313
  raise
310
314
 
@@ -377,6 +381,10 @@ class MySQLUploader:
377
381
  })
378
382
  raise ValueError(f"无效的标识符: `{identifier}`")
379
383
 
384
+ # 统一转为小写(除非明确要求大小写敏感)
385
+ if not self.case_sensitive:
386
+ identifier = identifier.lower()
387
+
380
388
  # 移除非法字符,只保留字母、数字、下划线和美元符号
381
389
  cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
382
390
 
@@ -546,9 +554,9 @@ class MySQLUploader:
546
554
 
547
555
  except Exception as e:
548
556
  logger.error(sys._getframe().f_code.co_name, {
549
- '建表失败': str(e),
550
557
  '库': db_name,
551
558
  '表': table_name,
559
+ '建表失败': str(e),
552
560
  })
553
561
  conn.rollback()
554
562
  raise
@@ -673,14 +681,14 @@ class MySQLUploader:
673
681
  with self._get_connection() as conn:
674
682
  with conn.cursor() as cursor:
675
683
  cursor.execute(sql, (db_name, table_name))
676
- set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
684
+ set_typ = {row['COLUMN_NAME'].lower(): row['DATA_TYPE'] for row in cursor.fetchall()}
677
685
  logger.debug(f"`{db_name}`.`{table_name}`: 获取表的列信息: `{set_typ}`")
678
686
  return set_typ
679
687
  except Exception as e:
680
688
  logger.error(sys._getframe().f_code.co_name, {
681
- '无法获取表列信息': str(e),
682
689
  '库': db_name,
683
690
  '表': table_name,
691
+ '无法获取表列信息': str(e),
684
692
  })
685
693
  raise
686
694
 
@@ -709,9 +717,9 @@ class MySQLUploader:
709
717
  allow_null=allow_null)
710
718
  else:
711
719
  logger.error(sys._getframe().f_code.co_name, {
712
- '数据表不存在': table_name,
713
720
  '库': db_name,
714
721
  '表': table_name,
722
+ '数据表不存在': table_name,
715
723
  })
716
724
  raise ValueError(f"数据表不存在: `{db_name}`.`{table_name}`")
717
725
 
@@ -719,9 +727,9 @@ class MySQLUploader:
719
727
  table_columns = self._get_table_columns(db_name, table_name)
720
728
  if not table_columns:
721
729
  logger.error(sys._getframe().f_code.co_name, {
722
- '获取列失败': table_columns,
723
730
  '库': db_name,
724
731
  '表': table_name,
732
+ '获取列失败': table_columns,
725
733
  })
726
734
  raise ValueError(f"获取列失败 `{db_name}`.`{table_name}`")
727
735
 
@@ -729,9 +737,9 @@ class MySQLUploader:
729
737
  for col in set_typ:
730
738
  if col not in table_columns:
731
739
  logger.error(sys._getframe().f_code.co_name, {
732
- '列不存在': f'{col} -> {table_columns}',
733
740
  '库': db_name,
734
741
  '表': table_name,
742
+ '列不存在': f'{col} -> {table_columns}',
735
743
  })
736
744
  raise ValueError(f"列不存在: `{col}` -> `{db_name}`.`{table_name}`")
737
745
 
@@ -960,7 +968,7 @@ class MySQLUploader:
960
968
  默认值为 'batch'
961
969
  :raises: 可能抛出各种验证和数据库相关异常
962
970
  """
963
- upload_start = time.time()
971
+ # upload_start = time.time()
964
972
  initial_row_count = len(data) if hasattr(data, '__len__') else 1
965
973
 
966
974
  batch_id = f"batch_{int(time.time() * 1000)}"
@@ -972,7 +980,7 @@ class MySQLUploader:
972
980
  '批次': batch_id,
973
981
  '分表方式': partition_by,
974
982
  '排重': check_duplicate,
975
- '传入总计': len(data) if hasattr(data, '__len__') else 1,
983
+ '传入': len(data) if hasattr(data, '__len__') else 1,
976
984
  '自动建表': auto_create
977
985
  })
978
986
 
@@ -985,10 +993,10 @@ class MySQLUploader:
985
993
  partition_by = str(partition_by).lower()
986
994
  if partition_by not in ['year', 'month']:
987
995
  logger.error(sys._getframe().f_code.co_name, {
988
- '分表方式必须是 "year" 或 "month" 或 "None"': partition_by,
989
996
  '库': db_name,
990
997
  '表': table_name,
991
- '批次': batch_id
998
+ '批次': batch_id,
999
+ '分表方式必须是 "year" 或 "month" 或 "None"': partition_by,
992
1000
  })
993
1001
  raise ValueError("分表方式必须是 'year' 或 'month' 或 'None'")
994
1002
 
@@ -1029,10 +1037,10 @@ class MySQLUploader:
1029
1037
  partitioned_data[part_table].append(row)
1030
1038
  except Exception as e:
1031
1039
  logger.error(sys._getframe().f_code.co_name, {
1032
- 'row_data': row,
1033
- '分表处理失败': str(e),
1034
1040
  '库': db_name,
1035
1041
  '表': table_name,
1042
+ 'row_data': row,
1043
+ '分表处理失败': str(e),
1036
1044
  })
1037
1045
  continue # 跳过当前行
1038
1046
 
@@ -1047,10 +1055,10 @@ class MySQLUploader:
1047
1055
  )
1048
1056
  except Exception as e:
1049
1057
  logger.error(sys._getframe().f_code.co_name, {
1050
- '分表': part_table,
1051
- '分表上传失败': str(e),
1052
1058
  '库': db_name,
1053
1059
  '表': table_name,
1060
+ '分表': part_table,
1061
+ '分表上传失败': str(e),
1054
1062
  })
1055
1063
  continue # 跳过当前分表,继续处理其他分表
1056
1064
  else:
@@ -1066,10 +1074,10 @@ class MySQLUploader:
1066
1074
 
1067
1075
  except Exception as e:
1068
1076
  logger.error(sys._getframe().f_code.co_name, {
1069
- '上传过程发生全局错误': str(e),
1070
- 'error_type': type(e).__name__,
1071
1077
  '库': db_name,
1072
1078
  '表': table_name,
1079
+ '上传过程发生全局错误': str(e),
1080
+ 'error_type': type(e).__name__,
1073
1081
  })
1074
1082
  finally:
1075
1083
  logger.info("存储完成", {
@@ -1077,7 +1085,7 @@ class MySQLUploader:
1077
1085
  '表': table_name,
1078
1086
  '批次': batch_id,
1079
1087
  'success': success_flag,
1080
- '耗时': round(time.time() - upload_start, 2),
1088
+ # '耗时': round(time.time() - upload_start, 2),
1081
1089
  '数据行': initial_row_count
1082
1090
  })
1083
1091
 
@@ -1154,6 +1162,67 @@ class MySQLUploader:
1154
1162
  return 'batch'
1155
1163
  return mode.lower()
1156
1164
 
1165
def _build_simple_insert_sql(self, db_name, table_name, columns, update_on_duplicate):
    """
    Build a plain ``INSERT INTO ... VALUES (...)`` statement (no duplicate check).

    :param db_name: database name; interpolated as-is between backticks
        (presumably validated by the caller -- confirm)
    :param table_name: table name; interpolated as-is between backticks
    :param columns: column names to insert; each one is passed through
        ``self._validate_identifier`` and gets exactly one ``%s`` placeholder
    :param update_on_duplicate: accepted for signature compatibility only.
        It is intentionally ignored: without duplicate checking every row is
        inserted unconditionally.
    :return: SQL text with one ``%s`` placeholder per column, in column order
    """
    safe_columns = [self._validate_identifier(col) for col in columns]
    column_list = '`,`'.join(safe_columns)
    placeholders = ','.join(['%s'] * len(safe_columns))

    return f"""
        INSERT INTO `{db_name}`.`{table_name}`
        (`{column_list}`)
        VALUES ({placeholders})
        """
1182
+
1183
def _build_duplicate_check_sql(self, db_name, table_name, all_columns,
                               duplicate_columns, update_on_duplicate, set_typ):
    """
    Build the INSERT statement used when duplicate checking is enabled.

    :param db_name: database name; interpolated as-is between backticks
        (presumably validated by the caller -- confirm)
    :param table_name: table name; interpolated as-is between backticks
    :param all_columns: columns to insert; each one is passed through
        ``self._validate_identifier`` and gets one ``%s`` placeholder
    :param duplicate_columns: columns that define a duplicate row. ``None``
        or an empty list means "all inserted columns". Names listed in
        ``self.base_excute_col`` are never used for the comparison.
    :param update_on_duplicate: when true, emit
        ``INSERT ... ON DUPLICATE KEY UPDATE`` (the de-duplication columns
        are not used in that statement); otherwise emit
        ``INSERT ... SELECT ... FROM DUAL WHERE NOT EXISTS (...)``
    :param set_typ: mapping of column name to column type string; a type
        starting with ``decimal`` makes the comparison use ``ROUND``
    :return: SQL text with ``%s`` placeholders. The ``NOT EXISTS`` form has
        one placeholder per inserted column followed by one per
        de-duplication column, in order.
        NOTE(review): the code that binds parameters must supply values for
        exactly these de-duplication columns -- confirm against the caller.
    """
    excluded = {str(name).lower() for name in self.base_excute_col}
    safe_columns = [self._validate_identifier(col) for col in all_columns]
    column_list = '`,`'.join(safe_columns)
    placeholders = ','.join(['%s'] * len(safe_columns))

    if update_on_duplicate:
        # Use the validated names so the UPDATE clause always refers to the
        # same identifiers as the INSERT column list.
        update_clause = ", ".join(
            f"`{col}` = VALUES(`{col}`)" for col in safe_columns
        )
        return f"""
            INSERT INTO `{db_name}`.`{table_name}`
            (`{column_list}`)
            VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
            """

    def _comparable(cols):
        # Drop the always-excluded columns (case-insensitively).
        return [col for col in cols if str(col).lower() not in excluded]

    # `duplicate_columns` may be None. Prefer the requested columns, then all
    # inserted columns minus the excluded ones; as a last resort keep every
    # inserted column so the WHERE clause is never empty.
    dup_cols = (_comparable(duplicate_columns or [])
                or _comparable(all_columns)
                or list(all_columns))

    conditions = []
    for col in dup_cols:
        # Never interpolate a raw column name into the statement.
        safe_col = self._validate_identifier(col)
        col_type = str(set_typ.get(col) or set_typ.get(safe_col) or '').lower()
        if col_type.startswith('decimal'):
            # Compare decimals at the column's scale to avoid precision noise.
            scale = self._get_decimal_scale(col_type)
            conditions.append(f"ROUND(`{safe_col}`, {scale}) = ROUND(%s, {scale})")
        else:
            conditions.append(f"`{safe_col}` = %s")
    where_clause = ' AND '.join(conditions)

    return f"""
        INSERT INTO `{db_name}`.`{table_name}`
        (`{column_list}`)
        SELECT {placeholders}
        FROM DUAL
        WHERE NOT EXISTS (
            SELECT 1 FROM `{db_name}`.`{table_name}`
            WHERE {where_clause}
        )
        """
1225
+
1157
1226
  def _prepare_insert_sql(
1158
1227
  self,
1159
1228
  db_name: str,
@@ -1163,55 +1232,29 @@ class MySQLUploader:
1163
1232
  duplicate_columns: Optional[List[str]],
1164
1233
  update_on_duplicate: bool
1165
1234
  ) -> str:
1166
- """准备插入SQL语句"""
1167
- # 获取所有列名(排除id列)
1168
- all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
1169
- safe_columns = [self._validate_identifier(col) for col in all_columns]
1170
- placeholders = ','.join(['%s'] * len(safe_columns))
1235
+ """
1236
+ 准备插入SQL语句
1171
1237
 
1172
- if check_duplicate:
1173
- if not duplicate_columns:
1174
- duplicate_columns = all_columns
1175
- else:
1176
- duplicate_columns = [col for col in duplicate_columns if col != 'id']
1177
-
1178
- conditions = []
1179
- for col in duplicate_columns:
1180
- col_type = set_typ.get(col, '').lower()
1181
- if col_type.startswith('decimal'):
1182
- scale_match = re.search(r'decimal\(\d+,(\d+)\)', col_type)
1183
- scale = int(scale_match.group(1)) if scale_match else 2
1184
- conditions.append(f"ROUND(`{self._validate_identifier(col)}`, {scale}) = ROUND(%s, {scale})")
1185
- else:
1186
- conditions.append(f"`{self._validate_identifier(col)}` = %s")
1187
-
1188
- where_clause = " AND ".join(conditions)
1189
-
1190
- if update_on_duplicate:
1191
- # 更新模式 - 使用ON DUPLICATE KEY UPDATE语法
1192
- update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in all_columns])
1193
- return f"""
1194
- INSERT INTO `{db_name}`.`{table_name}`
1195
- (`{'`,`'.join(safe_columns)}`)
1196
- VALUES ({placeholders})
1197
- ON DUPLICATE KEY UPDATE {update_clause}
1198
- """
1199
- else:
1200
- return f"""INSERT INTO `{db_name}`.`{table_name}`
1201
- (`{'`,`'.join(safe_columns)}`)
1202
- SELECT {placeholders}
1203
- FROM DUAL
1204
- WHERE NOT EXISTS (
1205
- SELECT 1 FROM `{db_name}`.`{table_name}`
1206
- WHERE {where_clause}
1207
- )
1208
- """
1209
- else:
1210
- return f"""
1211
- INSERT INTO `{db_name}`.`{table_name}`
1212
- (`{'`,`'.join(safe_columns)}`)
1213
- VALUES ({placeholders})
1214
- """
1238
+ 1. check_duplicate=False 时,忽略 duplicate_columns 和 update_on_duplicate 参数,直接插入全部data。
1239
+ 2. check_duplicate=False 且 update_on_duplicate=True 时,由于 check_duplicate=False,直接插入全部data。
1240
+ 3. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=True 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
1241
+ 4. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=False 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
1242
+ 5. 当 check_duplicate=True duplicate_columns 指定了排重列且 update_on_duplicate=True 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
1243
+ 6. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=False 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
1244
+
1245
+ """
1246
+ # 获取所有列名(排除id和更新时间列)
1247
+ all_columns = [col for col in set_typ.keys()
1248
+ if col.lower() != 'id']
1249
+
1250
+ # 情况1-2:不检查重复
1251
+ if not check_duplicate:
1252
+ return self._build_simple_insert_sql(db_name, table_name, all_columns,
1253
+ update_on_duplicate)
1254
+
1255
+ # 情况3-6:检查重复
1256
+ return self._build_duplicate_check_sql(db_name, table_name, all_columns,
1257
+ duplicate_columns, update_on_duplicate, set_typ)
1215
1258
 
1216
1259
  def _execute_batch_insert(
1217
1260
  self,
@@ -1228,7 +1271,8 @@ class MySQLUploader:
1228
1271
  ) -> Tuple[int, int, int]:
1229
1272
  """执行批量插入操作"""
1230
1273
  # 获取所有列名(排除id列)
1231
- all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
1274
+ all_columns = [col for col in set_typ.keys()
1275
+ if col.lower() != 'id']
1232
1276
 
1233
1277
  total_inserted = 0
1234
1278
  total_skipped = 0
@@ -1277,6 +1321,7 @@ class MySQLUploader:
1277
1321
  try:
1278
1322
  for row_idx, row in enumerate(batch, 1):
1279
1323
  result = self._process_single_row(
1324
+ db_name, table_name,
1280
1325
  cursor, row, all_columns, sql,
1281
1326
  check_duplicate, duplicate_columns
1282
1327
  )
@@ -1308,6 +1353,7 @@ class MySQLUploader:
1308
1353
  for row_idx, row in enumerate(batch, 1):
1309
1354
  try:
1310
1355
  result = self._process_single_row(
1356
+ db_name, table_name,
1311
1357
  cursor, row, all_columns, sql,
1312
1358
  check_duplicate, duplicate_columns
1313
1359
  )
@@ -1360,6 +1406,8 @@ class MySQLUploader:
1360
1406
 
1361
1407
  def _process_single_row(
1362
1408
  self,
1409
+ db_name,
1410
+ table_name,
1363
1411
  cursor,
1364
1412
  row: Dict,
1365
1413
  all_columns: List[str],
@@ -1374,6 +1422,8 @@ class MySQLUploader:
1374
1422
  if check_duplicate:
1375
1423
  row_values += [row.get(col) for col in duplicate_columns]
1376
1424
 
1425
+ # logger.info(sql)
1426
+ # logger.info(row_values)
1377
1427
  cursor.execute(sql, row_values)
1378
1428
 
1379
1429
  if check_duplicate:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.10.2
3
+ Version: 3.10.4
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=tL5iFQ6j9Svg-3tbUuEZAgDFN3ipIhdJjFUPU6EHSRQ,18
2
+ mdbq/__version__.py,sha256=L-43kDdR8o3iwkH5IR35xUFgTPugEww0j_gk9jPlkCU,18
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
5
5
  mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
@@ -12,7 +12,7 @@ mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
12
12
  mdbq/mysql/deduplicator.py,sha256=brhX3eyE8-kn3nAYweKfBbAkXiNcyw_pL4CTyPqmPBg,21983
13
13
  mdbq/mysql/mysql.py,sha256=Fzaqbjg5g3HdNl50jInIrdurdzcgR2CCzdKLVImD1-Q,55339
14
14
  mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
15
- mdbq/mysql/uploader.py,sha256=XOSeGg74zN3qYFfWmLqr98H7tCj74dIMCS3C0cvS3kU,58994
15
+ mdbq/mysql/uploader.py,sha256=ElT1-Jq5nR6qg8re0rfs26YGNPHK6zsNGc3ni7TnWFA,61954
16
16
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
17
17
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
18
18
  mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
25
25
  mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
26
26
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
27
27
  mdbq/spider/aikucun.py,sha256=YyPWa_nOH1zs8wgTDcgzn5w8szGKWPyWzmWMVIPkFnU,21638
28
- mdbq-3.10.2.dist-info/METADATA,sha256=D9d_UixDPHEbrdRE1yjA4SHjo4tYoY60_R4cBGPF3ms,364
29
- mdbq-3.10.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
30
- mdbq-3.10.2.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
31
- mdbq-3.10.2.dist-info/RECORD,,
28
+ mdbq-3.10.4.dist-info/METADATA,sha256=z-9kwc0z6aVg7ugS9FPf2TZd1vfyNBYz0qpvsfW3b_w,364
29
+ mdbq-3.10.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
30
+ mdbq-3.10.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
31
+ mdbq-3.10.4.dist-info/RECORD,,
File without changes