mdbq 3.10.3__py3-none-any.whl → 3.10.5__py3-none-any.whl

This diff shows the differences between two publicly available versions of the package, each as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.10.3'
1
+ VERSION = '3.10.5'
mdbq/mysql/uploader.py CHANGED
@@ -106,6 +106,8 @@ class MySQLUploader:
106
106
  :param connect_timeout: 连接超时(秒),默认为10
107
107
  :param read_timeout: 读取超时(秒),默认为30
108
108
  :param write_timeout: 写入超时(秒),默认为30
109
+ :param base_excute_col: # 排重插入数据时始终排除该列
110
+ :param case_sensitive: # 是否保持大小写敏感,默认为False(转为小写)
109
111
  :param ssl: SSL配置字典,默认为None
110
112
  """
111
113
  self.username = username
@@ -120,6 +122,8 @@ class MySQLUploader:
120
122
  self.connect_timeout = connect_timeout
121
123
  self.read_timeout = read_timeout
122
124
  self.write_timeout = write_timeout
125
+ self.base_excute_col = ['id', '更新时间'] # 排重插入数据时始终排除该列
126
+ self.case_sensitive = False # 是否保持大小写敏感,默认为False(转为小写)
123
127
  self.ssl = ssl
124
128
  self._prepared_statements = StatementCache(maxsize=100)
125
129
  self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
@@ -377,6 +381,10 @@ class MySQLUploader:
377
381
  })
378
382
  raise ValueError(f"无效的标识符: `{identifier}`")
379
383
 
384
+ # 统一转为小写(除非明确要求大小写敏感)
385
+ if not self.case_sensitive:
386
+ identifier = identifier.lower()
387
+
380
388
  # 移除非法字符,只保留字母、数字、下划线和美元符号
381
389
  cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
382
390
 
@@ -673,7 +681,7 @@ class MySQLUploader:
673
681
  with self._get_connection() as conn:
674
682
  with conn.cursor() as cursor:
675
683
  cursor.execute(sql, (db_name, table_name))
676
- set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
684
+ set_typ = {row['COLUMN_NAME'].lower(): row['DATA_TYPE'] for row in cursor.fetchall()}
677
685
  logger.debug(f"`{db_name}`.`{table_name}`: 获取表的列信息: `{set_typ}`")
678
686
  return set_typ
679
687
  except Exception as e:
@@ -917,7 +925,6 @@ class MySQLUploader:
917
925
  raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
918
926
  prepared_data.append(prepared_row)
919
927
 
920
- logger.debug(f"已准备 {len(prepared_data)} 行数据")
921
928
  return prepared_data, filtered_set_typ
922
929
 
923
930
  def upload_data(
@@ -953,7 +960,7 @@ class MySQLUploader:
953
960
  :param auto_create: 表不存在时是否自动创建,默认为True
954
961
  :param indexes: 需要创建索引的列列表,可选
955
962
  :param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
956
- :param transaction_mode: 事务提交模式,可选值:
963
+ :param transaction_mode: 事务模式,可选值:
957
964
  - 'row' : 逐行提交事务(错误隔离性好)
958
965
  - 'batch' : 整批提交事务(性能最优)
959
966
  - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
@@ -973,14 +980,11 @@ class MySQLUploader:
973
980
  '分表方式': partition_by,
974
981
  '排重': check_duplicate,
975
982
  '传入': len(data) if hasattr(data, '__len__') else 1,
976
- '自动建表': auto_create
983
+ # '自动建表': auto_create
977
984
  })
978
985
 
979
986
  try:
980
987
  # 验证参数
981
- if not set_typ:
982
- logger.debug(f'set_typ 参数缺失,建表不指定数据类型字典,后续存储数据容易引发异常')
983
-
984
988
  if partition_by:
985
989
  partition_by = str(partition_by).lower()
986
990
  if partition_by not in ['year', 'month']:
@@ -1076,7 +1080,7 @@ class MySQLUploader:
1076
1080
  '库': db_name,
1077
1081
  '表': table_name,
1078
1082
  '批次': batch_id,
1079
- 'success': success_flag,
1083
+ 'finish': success_flag,
1080
1084
  # '耗时': round(time.time() - upload_start, 2),
1081
1085
  '数据行': initial_row_count
1082
1086
  })
@@ -1106,7 +1110,7 @@ class MySQLUploader:
1106
1110
  :param batch_size: 批量插入大小,默认为1000
1107
1111
  :param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
1108
1112
  :param batch_id: 批次ID用于日志追踪,可选
1109
- :param transaction_mode: 事务提交模式,可选值:
1113
+ :param transaction_mode: 事务模式,可选值:
1110
1114
  - 'row' : 逐行提交事务(错误隔离性好)
1111
1115
  - 'batch' : 整批提交事务(性能最优)
1112
1116
  - 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
@@ -1135,11 +1139,11 @@ class MySQLUploader:
1135
1139
  logger.info('插入完成', {
1136
1140
  '库': db_name,
1137
1141
  '表': table_name,
1138
- '完成总计': len(data),
1142
+ '总计': len(data),
1139
1143
  '插入': total_inserted,
1140
1144
  '跳过': total_skipped,
1141
1145
  '失败': total_failed,
1142
- '事务提交模式': transaction_mode,
1146
+ '事务模式': transaction_mode,
1143
1147
  })
1144
1148
 
1145
1149
  def _validate_transaction_mode(self, mode: str) -> str:
@@ -1154,6 +1158,67 @@ class MySQLUploader:
1154
1158
  return 'batch'
1155
1159
  return mode.lower()
1156
1160
 
1161
+ def _build_simple_insert_sql(self, db_name, table_name, columns, update_on_duplicate):
1162
+ safe_columns = [self._validate_identifier(col) for col in columns]
1163
+ placeholders = ','.join(['%s'] * len(safe_columns))
1164
+
1165
+ sql = f"""
1166
+ INSERT INTO `{db_name}`.`{table_name}`
1167
+ (`{'`,`'.join(safe_columns)}`)
1168
+ VALUES ({placeholders})
1169
+ """
1170
+
1171
+ # # 情况2:不检查重复但允许更新
1172
+ # if update_on_duplicate:
1173
+ # update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
1174
+ # for col in columns])
1175
+ # sql += f" ON DUPLICATE KEY UPDATE {update_clause}"
1176
+
1177
+ return sql
1178
+
1179
+ def _build_duplicate_check_sql(self, db_name, table_name, all_columns,
1180
+ duplicate_columns, update_on_duplicate, set_typ):
1181
+ duplicate_columns = [_item for _item in duplicate_columns if _item.lower() not in self.base_excute_col]
1182
+ safe_columns = [self._validate_identifier(col) for col in all_columns]
1183
+ placeholders = ','.join(['%s'] * len(safe_columns))
1184
+
1185
+ # 确定排重列(排除id和更新时间列)
1186
+ dup_cols = duplicate_columns if duplicate_columns else all_columns
1187
+
1188
+ # 构建排重条件
1189
+ conditions = []
1190
+ for col in dup_cols:
1191
+ col_type = set_typ.get(col, '').lower()
1192
+ if col_type.startswith('decimal'):
1193
+ scale = self._get_decimal_scale(col_type)
1194
+ conditions.append(f"ROUND(`{col}`, {scale}) = ROUND(%s, {scale})")
1195
+ else:
1196
+ conditions.append(f"`{col}` = %s")
1197
+
1198
+ # 情况3/5:允许更新
1199
+ if update_on_duplicate:
1200
+ update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
1201
+ for col in all_columns])
1202
+ sql = f"""
1203
+ INSERT INTO `{db_name}`.`{table_name}`
1204
+ (`{'`,`'.join(safe_columns)}`)
1205
+ VALUES ({placeholders})
1206
+ ON DUPLICATE KEY UPDATE {update_clause}
1207
+ """
1208
+ else:
1209
+ # 情况4/6:不允许更新
1210
+ sql = f"""
1211
+ INSERT INTO `{db_name}`.`{table_name}`
1212
+ (`{'`,`'.join(safe_columns)}`)
1213
+ SELECT {placeholders}
1214
+ FROM DUAL
1215
+ WHERE NOT EXISTS (
1216
+ SELECT 1 FROM `{db_name}`.`{table_name}`
1217
+ WHERE {' AND '.join(conditions)}
1218
+ )
1219
+ """
1220
+ return sql
1221
+
1157
1222
  def _prepare_insert_sql(
1158
1223
  self,
1159
1224
  db_name: str,
@@ -1163,55 +1228,29 @@ class MySQLUploader:
1163
1228
  duplicate_columns: Optional[List[str]],
1164
1229
  update_on_duplicate: bool
1165
1230
  ) -> str:
1166
- """准备插入SQL语句"""
1167
- # 获取所有列名(排除 `id`、`更新时间` 列)
1168
- all_columns = [col for col in set_typ.keys() if col.lower() not in ['id', '更新时间']]
1169
- safe_columns = [self._validate_identifier(col) for col in all_columns]
1170
- placeholders = ','.join(['%s'] * len(safe_columns))
1231
+ """
1232
+ 准备插入SQL语句
1171
1233
 
1172
- if check_duplicate:
1173
- if not duplicate_columns:
1174
- duplicate_columns = all_columns
1175
- else:
1176
- duplicate_columns = [col for col in duplicate_columns if col.lower() not in ['id', '更新时间']]
1177
-
1178
- conditions = []
1179
- for col in duplicate_columns:
1180
- col_type = set_typ.get(col, '').lower()
1181
- if col_type.startswith('decimal'):
1182
- scale_match = re.search(r'decimal\(\d+,(\d+)\)', col_type)
1183
- scale = int(scale_match.group(1)) if scale_match else 2
1184
- conditions.append(f"ROUND(`{self._validate_identifier(col)}`, {scale}) = ROUND(%s, {scale})")
1185
- else:
1186
- conditions.append(f"`{self._validate_identifier(col)}` = %s")
1187
-
1188
- where_clause = " AND ".join(conditions)
1189
-
1190
- if update_on_duplicate:
1191
- # 更新模式 - 使用ON DUPLICATE KEY UPDATE语法
1192
- update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in all_columns])
1193
- return f"""
1194
- INSERT INTO `{db_name}`.`{table_name}`
1195
- (`{'`,`'.join(safe_columns)}`)
1196
- VALUES ({placeholders})
1197
- ON DUPLICATE KEY UPDATE {update_clause}
1198
- """
1199
- else:
1200
- return f"""INSERT INTO `{db_name}`.`{table_name}`
1201
- (`{'`,`'.join(safe_columns)}`)
1202
- SELECT {placeholders}
1203
- FROM DUAL
1204
- WHERE NOT EXISTS (
1205
- SELECT 1 FROM `{db_name}`.`{table_name}`
1206
- WHERE {where_clause}
1207
- )
1208
- """
1209
- else:
1210
- return f"""
1211
- INSERT INTO `{db_name}`.`{table_name}`
1212
- (`{'`,`'.join(safe_columns)}`)
1213
- VALUES ({placeholders})
1214
- """
1234
+ 1. check_duplicate=False 时,忽略 duplicate_columns 和 update_on_duplicate 参数,直接插入全部data。
1235
+ 2. check_duplicate=False 且 update_on_duplicate=True 时,由于 check_duplicate=False,直接插入全部data。
1236
+ 3. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=True 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
1237
+ 4. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=False 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
1238
+ 5. 当 check_duplicate=True duplicate_columns 指定了排重列且 update_on_duplicate=True 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
1239
+ 6. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=False 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
1240
+
1241
+ """
1242
+ # 获取所有列名(排除id和更新时间列)
1243
+ all_columns = [col for col in set_typ.keys()
1244
+ if col.lower() != 'id']
1245
+
1246
+ # 情况1-2:不检查重复
1247
+ if not check_duplicate:
1248
+ return self._build_simple_insert_sql(db_name, table_name, all_columns,
1249
+ update_on_duplicate)
1250
+
1251
+ # 情况3-6:检查重复
1252
+ return self._build_duplicate_check_sql(db_name, table_name, all_columns,
1253
+ duplicate_columns, update_on_duplicate, set_typ)
1215
1254
 
1216
1255
  def _execute_batch_insert(
1217
1256
  self,
@@ -1228,7 +1267,8 @@ class MySQLUploader:
1228
1267
  ) -> Tuple[int, int, int]:
1229
1268
  """执行批量插入操作"""
1230
1269
  # 获取所有列名(排除id列)
1231
- all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
1270
+ all_columns = [col for col in set_typ.keys()
1271
+ if col.lower() != 'id']
1232
1272
 
1233
1273
  total_inserted = 0
1234
1274
  total_skipped = 0
@@ -1277,6 +1317,7 @@ class MySQLUploader:
1277
1317
  try:
1278
1318
  for row_idx, row in enumerate(batch, 1):
1279
1319
  result = self._process_single_row(
1320
+ db_name, table_name,
1280
1321
  cursor, row, all_columns, sql,
1281
1322
  check_duplicate, duplicate_columns
1282
1323
  )
@@ -1300,7 +1341,7 @@ class MySQLUploader:
1300
1341
  '批次': f'{batch_id} {batch_index + 1}/{total_data_length}',
1301
1342
  'error_type': type(e).__name__,
1302
1343
  '批量操作失败': str(e),
1303
- '事务提交模式': transaction_mode,
1344
+ '事务模式': transaction_mode,
1304
1345
  '处理方式': '整个批次回滚'
1305
1346
  })
1306
1347
 
@@ -1308,6 +1349,7 @@ class MySQLUploader:
1308
1349
  for row_idx, row in enumerate(batch, 1):
1309
1350
  try:
1310
1351
  result = self._process_single_row(
1352
+ db_name, table_name,
1311
1353
  cursor, row, all_columns, sql,
1312
1354
  check_duplicate, duplicate_columns
1313
1355
  )
@@ -1336,30 +1378,19 @@ class MySQLUploader:
1336
1378
  '数据类型': set_typ,
1337
1379
  '是否排重': check_duplicate,
1338
1380
  '排重列': duplicate_columns,
1339
- '事务提交模式': transaction_mode,
1381
+ '事务模式': transaction_mode,
1340
1382
  })
1341
1383
 
1342
1384
  # 混合模式最后统一提交
1343
1385
  if transaction_mode == 'hybrid':
1344
1386
  conn.commit()
1345
1387
 
1346
- logger.debug(sys._getframe().f_code.co_name, {
1347
- '库': db_name,
1348
- '表': table_name,
1349
- '批次': batch_id,
1350
- '批次处理完成': batch_index // len(batch) + 1,
1351
- '总批次': (total_data_length + len(batch) - 1) // len(batch),
1352
- '数据量': len(batch),
1353
- '插入': batch_inserted,
1354
- '跳过': batch_skipped,
1355
- '失败': batch_failed,
1356
- '事务提交模式': transaction_mode,
1357
- })
1358
-
1359
1388
  return batch_inserted, batch_skipped, batch_failed
1360
1389
 
1361
1390
  def _process_single_row(
1362
1391
  self,
1392
+ db_name,
1393
+ table_name,
1363
1394
  cursor,
1364
1395
  row: Dict,
1365
1396
  all_columns: List[str],
@@ -1374,6 +1405,8 @@ class MySQLUploader:
1374
1405
  if check_duplicate:
1375
1406
  row_values += [row.get(col) for col in duplicate_columns]
1376
1407
 
1408
+ # logger.info(sql)
1409
+ # logger.info(row_values)
1377
1410
  cursor.execute(sql, row_values)
1378
1411
 
1379
1412
  if check_duplicate:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.10.3
3
+ Version: 3.10.5
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=S1pYeTgXo5MtZqzwck9ASp8x1pB1PZ33oC1NI7fY9dI,18
2
+ mdbq/__version__.py,sha256=51aaiJO4QmLmJb4fxtT4uQtl5MznQsVENkLEBRAcEBc,18
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
5
5
  mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
@@ -12,7 +12,7 @@ mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
12
12
  mdbq/mysql/deduplicator.py,sha256=brhX3eyE8-kn3nAYweKfBbAkXiNcyw_pL4CTyPqmPBg,21983
13
13
  mdbq/mysql/mysql.py,sha256=Fzaqbjg5g3HdNl50jInIrdurdzcgR2CCzdKLVImD1-Q,55339
14
14
  mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
15
- mdbq/mysql/uploader.py,sha256=9wgFxsiFSAngdX2pWj57jElaspwqfPtadC-xQqvweUc,59066
15
+ mdbq/mysql/uploader.py,sha256=mQUcOFOuw1BeeJpPX1seDSKtJXuTzAG75GA8oQNWMT8,61195
16
16
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
17
17
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
18
18
  mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
25
25
  mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
26
26
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
27
27
  mdbq/spider/aikucun.py,sha256=YyPWa_nOH1zs8wgTDcgzn5w8szGKWPyWzmWMVIPkFnU,21638
28
- mdbq-3.10.3.dist-info/METADATA,sha256=TM8JAb8gTTte7N0agKbaDWZ14bmkl66dgCxIbLTqCbc,364
29
- mdbq-3.10.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
30
- mdbq-3.10.3.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
31
- mdbq-3.10.3.dist-info/RECORD,,
28
+ mdbq-3.10.5.dist-info/METADATA,sha256=I0ojjsBkeW8KpaITA6ImuAvlJt3dONcmuJzRpLNkWqU,364
29
+ mdbq-3.10.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
30
+ mdbq-3.10.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
31
+ mdbq-3.10.5.dist-info/RECORD,,
File without changes