mdbq 3.10.3__py3-none-any.whl → 3.10.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/uploader.py +108 -75
- {mdbq-3.10.3.dist-info → mdbq-3.10.5.dist-info}/METADATA +1 -1
- {mdbq-3.10.3.dist-info → mdbq-3.10.5.dist-info}/RECORD +6 -6
- {mdbq-3.10.3.dist-info → mdbq-3.10.5.dist-info}/WHEEL +0 -0
- {mdbq-3.10.3.dist-info → mdbq-3.10.5.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.10.3'
|
1
|
+
VERSION = '3.10.5'
|
mdbq/mysql/uploader.py
CHANGED
@@ -106,6 +106,8 @@ class MySQLUploader:
|
|
106
106
|
:param connect_timeout: 连接超时(秒),默认为10
|
107
107
|
:param read_timeout: 读取超时(秒),默认为30
|
108
108
|
:param write_timeout: 写入超时(秒),默认为30
|
109
|
+
:param base_excute_col: # 排重插入数据时始终排除该列
|
110
|
+
:param case_sensitive: # 是否保持大小写敏感,默认为False(转为小写)
|
109
111
|
:param ssl: SSL配置字典,默认为None
|
110
112
|
"""
|
111
113
|
self.username = username
|
@@ -120,6 +122,8 @@ class MySQLUploader:
|
|
120
122
|
self.connect_timeout = connect_timeout
|
121
123
|
self.read_timeout = read_timeout
|
122
124
|
self.write_timeout = write_timeout
|
125
|
+
self.base_excute_col = ['id', '更新时间'] # 排重插入数据时始终排除该列
|
126
|
+
self.case_sensitive = False # 是否保持大小写敏感,默认为False(转为小写)
|
123
127
|
self.ssl = ssl
|
124
128
|
self._prepared_statements = StatementCache(maxsize=100)
|
125
129
|
self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
|
@@ -377,6 +381,10 @@ class MySQLUploader:
|
|
377
381
|
})
|
378
382
|
raise ValueError(f"无效的标识符: `{identifier}`")
|
379
383
|
|
384
|
+
# 统一转为小写(除非明确要求大小写敏感)
|
385
|
+
if not self.case_sensitive:
|
386
|
+
identifier = identifier.lower()
|
387
|
+
|
380
388
|
# 移除非法字符,只保留字母、数字、下划线和美元符号
|
381
389
|
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
|
382
390
|
|
@@ -673,7 +681,7 @@ class MySQLUploader:
|
|
673
681
|
with self._get_connection() as conn:
|
674
682
|
with conn.cursor() as cursor:
|
675
683
|
cursor.execute(sql, (db_name, table_name))
|
676
|
-
set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
|
684
|
+
set_typ = {row['COLUMN_NAME'].lower(): row['DATA_TYPE'] for row in cursor.fetchall()}
|
677
685
|
logger.debug(f"`{db_name}`.`{table_name}`: 获取表的列信息: `{set_typ}`")
|
678
686
|
return set_typ
|
679
687
|
except Exception as e:
|
@@ -917,7 +925,6 @@ class MySQLUploader:
|
|
917
925
|
raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
|
918
926
|
prepared_data.append(prepared_row)
|
919
927
|
|
920
|
-
logger.debug(f"已准备 {len(prepared_data)} 行数据")
|
921
928
|
return prepared_data, filtered_set_typ
|
922
929
|
|
923
930
|
def upload_data(
|
@@ -953,7 +960,7 @@ class MySQLUploader:
|
|
953
960
|
:param auto_create: 表不存在时是否自动创建,默认为True
|
954
961
|
:param indexes: 需要创建索引的列列表,可选
|
955
962
|
:param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
|
956
|
-
:param transaction_mode:
|
963
|
+
:param transaction_mode: 事务模式,可选值:
|
957
964
|
- 'row' : 逐行提交事务(错误隔离性好)
|
958
965
|
- 'batch' : 整批提交事务(性能最优)
|
959
966
|
- 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
|
@@ -973,14 +980,11 @@ class MySQLUploader:
|
|
973
980
|
'分表方式': partition_by,
|
974
981
|
'排重': check_duplicate,
|
975
982
|
'传入': len(data) if hasattr(data, '__len__') else 1,
|
976
|
-
'自动建表': auto_create
|
983
|
+
# '自动建表': auto_create
|
977
984
|
})
|
978
985
|
|
979
986
|
try:
|
980
987
|
# 验证参数
|
981
|
-
if not set_typ:
|
982
|
-
logger.debug(f'set_typ 参数缺失,建表不指定数据类型字典,后续存储数据容易引发异常')
|
983
|
-
|
984
988
|
if partition_by:
|
985
989
|
partition_by = str(partition_by).lower()
|
986
990
|
if partition_by not in ['year', 'month']:
|
@@ -1076,7 +1080,7 @@ class MySQLUploader:
|
|
1076
1080
|
'库': db_name,
|
1077
1081
|
'表': table_name,
|
1078
1082
|
'批次': batch_id,
|
1079
|
-
'
|
1083
|
+
'finish': success_flag,
|
1080
1084
|
# '耗时': round(time.time() - upload_start, 2),
|
1081
1085
|
'数据行': initial_row_count
|
1082
1086
|
})
|
@@ -1106,7 +1110,7 @@ class MySQLUploader:
|
|
1106
1110
|
:param batch_size: 批量插入大小,默认为1000
|
1107
1111
|
:param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
|
1108
1112
|
:param batch_id: 批次ID用于日志追踪,可选
|
1109
|
-
:param transaction_mode:
|
1113
|
+
:param transaction_mode: 事务模式,可选值:
|
1110
1114
|
- 'row' : 逐行提交事务(错误隔离性好)
|
1111
1115
|
- 'batch' : 整批提交事务(性能最优)
|
1112
1116
|
- 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
|
@@ -1135,11 +1139,11 @@ class MySQLUploader:
|
|
1135
1139
|
logger.info('插入完成', {
|
1136
1140
|
'库': db_name,
|
1137
1141
|
'表': table_name,
|
1138
|
-
'
|
1142
|
+
'总计': len(data),
|
1139
1143
|
'插入': total_inserted,
|
1140
1144
|
'跳过': total_skipped,
|
1141
1145
|
'失败': total_failed,
|
1142
|
-
'
|
1146
|
+
'事务模式': transaction_mode,
|
1143
1147
|
})
|
1144
1148
|
|
1145
1149
|
def _validate_transaction_mode(self, mode: str) -> str:
|
@@ -1154,6 +1158,67 @@ class MySQLUploader:
|
|
1154
1158
|
return 'batch'
|
1155
1159
|
return mode.lower()
|
1156
1160
|
|
1161
|
+
def _build_simple_insert_sql(self, db_name, table_name, columns, update_on_duplicate):
|
1162
|
+
safe_columns = [self._validate_identifier(col) for col in columns]
|
1163
|
+
placeholders = ','.join(['%s'] * len(safe_columns))
|
1164
|
+
|
1165
|
+
sql = f"""
|
1166
|
+
INSERT INTO `{db_name}`.`{table_name}`
|
1167
|
+
(`{'`,`'.join(safe_columns)}`)
|
1168
|
+
VALUES ({placeholders})
|
1169
|
+
"""
|
1170
|
+
|
1171
|
+
# # 情况2:不检查重复但允许更新
|
1172
|
+
# if update_on_duplicate:
|
1173
|
+
# update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
|
1174
|
+
# for col in columns])
|
1175
|
+
# sql += f" ON DUPLICATE KEY UPDATE {update_clause}"
|
1176
|
+
|
1177
|
+
return sql
|
1178
|
+
|
1179
|
+
def _build_duplicate_check_sql(self, db_name, table_name, all_columns,
|
1180
|
+
duplicate_columns, update_on_duplicate, set_typ):
|
1181
|
+
duplicate_columns = [_item for _item in duplicate_columns if _item.lower() not in self.base_excute_col]
|
1182
|
+
safe_columns = [self._validate_identifier(col) for col in all_columns]
|
1183
|
+
placeholders = ','.join(['%s'] * len(safe_columns))
|
1184
|
+
|
1185
|
+
# 确定排重列(排除id和更新时间列)
|
1186
|
+
dup_cols = duplicate_columns if duplicate_columns else all_columns
|
1187
|
+
|
1188
|
+
# 构建排重条件
|
1189
|
+
conditions = []
|
1190
|
+
for col in dup_cols:
|
1191
|
+
col_type = set_typ.get(col, '').lower()
|
1192
|
+
if col_type.startswith('decimal'):
|
1193
|
+
scale = self._get_decimal_scale(col_type)
|
1194
|
+
conditions.append(f"ROUND(`{col}`, {scale}) = ROUND(%s, {scale})")
|
1195
|
+
else:
|
1196
|
+
conditions.append(f"`{col}` = %s")
|
1197
|
+
|
1198
|
+
# 情况3/5:允许更新
|
1199
|
+
if update_on_duplicate:
|
1200
|
+
update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
|
1201
|
+
for col in all_columns])
|
1202
|
+
sql = f"""
|
1203
|
+
INSERT INTO `{db_name}`.`{table_name}`
|
1204
|
+
(`{'`,`'.join(safe_columns)}`)
|
1205
|
+
VALUES ({placeholders})
|
1206
|
+
ON DUPLICATE KEY UPDATE {update_clause}
|
1207
|
+
"""
|
1208
|
+
else:
|
1209
|
+
# 情况4/6:不允许更新
|
1210
|
+
sql = f"""
|
1211
|
+
INSERT INTO `{db_name}`.`{table_name}`
|
1212
|
+
(`{'`,`'.join(safe_columns)}`)
|
1213
|
+
SELECT {placeholders}
|
1214
|
+
FROM DUAL
|
1215
|
+
WHERE NOT EXISTS (
|
1216
|
+
SELECT 1 FROM `{db_name}`.`{table_name}`
|
1217
|
+
WHERE {' AND '.join(conditions)}
|
1218
|
+
)
|
1219
|
+
"""
|
1220
|
+
return sql
|
1221
|
+
|
1157
1222
|
def _prepare_insert_sql(
|
1158
1223
|
self,
|
1159
1224
|
db_name: str,
|
@@ -1163,55 +1228,29 @@ class MySQLUploader:
|
|
1163
1228
|
duplicate_columns: Optional[List[str]],
|
1164
1229
|
update_on_duplicate: bool
|
1165
1230
|
) -> str:
|
1166
|
-
"""
|
1167
|
-
|
1168
|
-
all_columns = [col for col in set_typ.keys() if col.lower() not in ['id', '更新时间']]
|
1169
|
-
safe_columns = [self._validate_identifier(col) for col in all_columns]
|
1170
|
-
placeholders = ','.join(['%s'] * len(safe_columns))
|
1231
|
+
"""
|
1232
|
+
准备插入SQL语句
|
1171
1233
|
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
|
1183
|
-
|
1184
|
-
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in all_columns])
|
1193
|
-
return f"""
|
1194
|
-
INSERT INTO `{db_name}`.`{table_name}`
|
1195
|
-
(`{'`,`'.join(safe_columns)}`)
|
1196
|
-
VALUES ({placeholders})
|
1197
|
-
ON DUPLICATE KEY UPDATE {update_clause}
|
1198
|
-
"""
|
1199
|
-
else:
|
1200
|
-
return f"""INSERT INTO `{db_name}`.`{table_name}`
|
1201
|
-
(`{'`,`'.join(safe_columns)}`)
|
1202
|
-
SELECT {placeholders}
|
1203
|
-
FROM DUAL
|
1204
|
-
WHERE NOT EXISTS (
|
1205
|
-
SELECT 1 FROM `{db_name}`.`{table_name}`
|
1206
|
-
WHERE {where_clause}
|
1207
|
-
)
|
1208
|
-
"""
|
1209
|
-
else:
|
1210
|
-
return f"""
|
1211
|
-
INSERT INTO `{db_name}`.`{table_name}`
|
1212
|
-
(`{'`,`'.join(safe_columns)}`)
|
1213
|
-
VALUES ({placeholders})
|
1214
|
-
"""
|
1234
|
+
1. 当 check_duplicate=False 时,忽略 duplicate_columns 和 update_on_duplicate 参数,直接插入全部data。
|
1235
|
+
2. 当 check_duplicate=False 且 update_on_duplicate=True 时,由于 check_duplicate=False,直接插入全部data。
|
1236
|
+
3. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=True 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1237
|
+
4. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=False 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1238
|
+
5. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=True 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1239
|
+
6. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=False 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1240
|
+
|
1241
|
+
"""
|
1242
|
+
# 获取所有列名(排除id和更新时间列)
|
1243
|
+
all_columns = [col for col in set_typ.keys()
|
1244
|
+
if col.lower() != 'id']
|
1245
|
+
|
1246
|
+
# 情况1-2:不检查重复
|
1247
|
+
if not check_duplicate:
|
1248
|
+
return self._build_simple_insert_sql(db_name, table_name, all_columns,
|
1249
|
+
update_on_duplicate)
|
1250
|
+
|
1251
|
+
# 情况3-6:检查重复
|
1252
|
+
return self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
1253
|
+
duplicate_columns, update_on_duplicate, set_typ)
|
1215
1254
|
|
1216
1255
|
def _execute_batch_insert(
|
1217
1256
|
self,
|
@@ -1228,7 +1267,8 @@ class MySQLUploader:
|
|
1228
1267
|
) -> Tuple[int, int, int]:
|
1229
1268
|
"""执行批量插入操作"""
|
1230
1269
|
# 获取所有列名(排除id列)
|
1231
|
-
all_columns = [col for col in set_typ.keys()
|
1270
|
+
all_columns = [col for col in set_typ.keys()
|
1271
|
+
if col.lower() != 'id']
|
1232
1272
|
|
1233
1273
|
total_inserted = 0
|
1234
1274
|
total_skipped = 0
|
@@ -1277,6 +1317,7 @@ class MySQLUploader:
|
|
1277
1317
|
try:
|
1278
1318
|
for row_idx, row in enumerate(batch, 1):
|
1279
1319
|
result = self._process_single_row(
|
1320
|
+
db_name, table_name,
|
1280
1321
|
cursor, row, all_columns, sql,
|
1281
1322
|
check_duplicate, duplicate_columns
|
1282
1323
|
)
|
@@ -1300,7 +1341,7 @@ class MySQLUploader:
|
|
1300
1341
|
'批次': f'{batch_id} {batch_index + 1}/{total_data_length}',
|
1301
1342
|
'error_type': type(e).__name__,
|
1302
1343
|
'批量操作失败': str(e),
|
1303
|
-
'
|
1344
|
+
'事务模式': transaction_mode,
|
1304
1345
|
'处理方式': '整个批次回滚'
|
1305
1346
|
})
|
1306
1347
|
|
@@ -1308,6 +1349,7 @@ class MySQLUploader:
|
|
1308
1349
|
for row_idx, row in enumerate(batch, 1):
|
1309
1350
|
try:
|
1310
1351
|
result = self._process_single_row(
|
1352
|
+
db_name, table_name,
|
1311
1353
|
cursor, row, all_columns, sql,
|
1312
1354
|
check_duplicate, duplicate_columns
|
1313
1355
|
)
|
@@ -1336,30 +1378,19 @@ class MySQLUploader:
|
|
1336
1378
|
'数据类型': set_typ,
|
1337
1379
|
'是否排重': check_duplicate,
|
1338
1380
|
'排重列': duplicate_columns,
|
1339
|
-
'
|
1381
|
+
'事务模式': transaction_mode,
|
1340
1382
|
})
|
1341
1383
|
|
1342
1384
|
# 混合模式最后统一提交
|
1343
1385
|
if transaction_mode == 'hybrid':
|
1344
1386
|
conn.commit()
|
1345
1387
|
|
1346
|
-
logger.debug(sys._getframe().f_code.co_name, {
|
1347
|
-
'库': db_name,
|
1348
|
-
'表': table_name,
|
1349
|
-
'批次': batch_id,
|
1350
|
-
'批次处理完成': batch_index // len(batch) + 1,
|
1351
|
-
'总批次': (total_data_length + len(batch) - 1) // len(batch),
|
1352
|
-
'数据量': len(batch),
|
1353
|
-
'插入': batch_inserted,
|
1354
|
-
'跳过': batch_skipped,
|
1355
|
-
'失败': batch_failed,
|
1356
|
-
'事务提交模式': transaction_mode,
|
1357
|
-
})
|
1358
|
-
|
1359
1388
|
return batch_inserted, batch_skipped, batch_failed
|
1360
1389
|
|
1361
1390
|
def _process_single_row(
|
1362
1391
|
self,
|
1392
|
+
db_name,
|
1393
|
+
table_name,
|
1363
1394
|
cursor,
|
1364
1395
|
row: Dict,
|
1365
1396
|
all_columns: List[str],
|
@@ -1374,6 +1405,8 @@ class MySQLUploader:
|
|
1374
1405
|
if check_duplicate:
|
1375
1406
|
row_values += [row.get(col) for col in duplicate_columns]
|
1376
1407
|
|
1408
|
+
# logger.info(sql)
|
1409
|
+
# logger.info(row_values)
|
1377
1410
|
cursor.execute(sql, row_values)
|
1378
1411
|
|
1379
1412
|
if check_duplicate:
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=51aaiJO4QmLmJb4fxtT4uQtl5MznQsVENkLEBRAcEBc,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
|
5
5
|
mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
|
@@ -12,7 +12,7 @@ mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
|
12
12
|
mdbq/mysql/deduplicator.py,sha256=brhX3eyE8-kn3nAYweKfBbAkXiNcyw_pL4CTyPqmPBg,21983
|
13
13
|
mdbq/mysql/mysql.py,sha256=Fzaqbjg5g3HdNl50jInIrdurdzcgR2CCzdKLVImD1-Q,55339
|
14
14
|
mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
|
15
|
-
mdbq/mysql/uploader.py,sha256=
|
15
|
+
mdbq/mysql/uploader.py,sha256=mQUcOFOuw1BeeJpPX1seDSKtJXuTzAG75GA8oQNWMT8,61195
|
16
16
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
17
17
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
18
18
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=YyPWa_nOH1zs8wgTDcgzn5w8szGKWPyWzmWMVIPkFnU,21638
|
28
|
-
mdbq-3.10.
|
29
|
-
mdbq-3.10.
|
30
|
-
mdbq-3.10.
|
31
|
-
mdbq-3.10.
|
28
|
+
mdbq-3.10.5.dist-info/METADATA,sha256=I0ojjsBkeW8KpaITA6ImuAvlJt3dONcmuJzRpLNkWqU,364
|
29
|
+
mdbq-3.10.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-3.10.5.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-3.10.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|