mdbq 3.10.3__py3-none-any.whl → 3.10.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/uploader.py +100 -50
- {mdbq-3.10.3.dist-info → mdbq-3.10.4.dist-info}/METADATA +1 -1
- {mdbq-3.10.3.dist-info → mdbq-3.10.4.dist-info}/RECORD +6 -6
- {mdbq-3.10.3.dist-info → mdbq-3.10.4.dist-info}/WHEEL +0 -0
- {mdbq-3.10.3.dist-info → mdbq-3.10.4.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '3.10.3'
|
1
|
+
VERSION = '3.10.4'
|
mdbq/mysql/uploader.py
CHANGED
@@ -106,6 +106,8 @@ class MySQLUploader:
|
|
106
106
|
:param connect_timeout: 连接超时(秒),默认为10
|
107
107
|
:param read_timeout: 读取超时(秒),默认为30
|
108
108
|
:param write_timeout: 写入超时(秒),默认为30
|
109
|
+
:param base_excute_col: # 排重插入数据时始终排除该列
|
110
|
+
:param case_sensitive: # 是否保持大小写敏感,默认为False(转为小写)
|
109
111
|
:param ssl: SSL配置字典,默认为None
|
110
112
|
"""
|
111
113
|
self.username = username
|
@@ -120,6 +122,8 @@ class MySQLUploader:
|
|
120
122
|
self.connect_timeout = connect_timeout
|
121
123
|
self.read_timeout = read_timeout
|
122
124
|
self.write_timeout = write_timeout
|
125
|
+
self.base_excute_col = ['id', '更新时间'] # 排重插入数据时始终排除该列
|
126
|
+
self.case_sensitive = False # 是否保持大小写敏感,默认为False(转为小写)
|
123
127
|
self.ssl = ssl
|
124
128
|
self._prepared_statements = StatementCache(maxsize=100)
|
125
129
|
self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
|
@@ -377,6 +381,10 @@ class MySQLUploader:
|
|
377
381
|
})
|
378
382
|
raise ValueError(f"无效的标识符: `{identifier}`")
|
379
383
|
|
384
|
+
# 统一转为小写(除非明确要求大小写敏感)
|
385
|
+
if not self.case_sensitive:
|
386
|
+
identifier = identifier.lower()
|
387
|
+
|
380
388
|
# 移除非法字符,只保留字母、数字、下划线和美元符号
|
381
389
|
cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
|
382
390
|
|
@@ -673,7 +681,7 @@ class MySQLUploader:
|
|
673
681
|
with self._get_connection() as conn:
|
674
682
|
with conn.cursor() as cursor:
|
675
683
|
cursor.execute(sql, (db_name, table_name))
|
676
|
-
set_typ = {row['COLUMN_NAME']: row['DATA_TYPE'] for row in cursor.fetchall()}
|
684
|
+
set_typ = {row['COLUMN_NAME'].lower(): row['DATA_TYPE'] for row in cursor.fetchall()}
|
677
685
|
logger.debug(f"`{db_name}`.`{table_name}`: 获取表的列信息: `{set_typ}`")
|
678
686
|
return set_typ
|
679
687
|
except Exception as e:
|
@@ -1154,6 +1162,67 @@ class MySQLUploader:
|
|
1154
1162
|
return 'batch'
|
1155
1163
|
return mode.lower()
|
1156
1164
|
|
1165
|
+
def _build_simple_insert_sql(self, db_name, table_name, columns, update_on_duplicate):
|
1166
|
+
safe_columns = [self._validate_identifier(col) for col in columns]
|
1167
|
+
placeholders = ','.join(['%s'] * len(safe_columns))
|
1168
|
+
|
1169
|
+
sql = f"""
|
1170
|
+
INSERT INTO `{db_name}`.`{table_name}`
|
1171
|
+
(`{'`,`'.join(safe_columns)}`)
|
1172
|
+
VALUES ({placeholders})
|
1173
|
+
"""
|
1174
|
+
|
1175
|
+
# # 情况2:不检查重复但允许更新
|
1176
|
+
# if update_on_duplicate:
|
1177
|
+
# update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
|
1178
|
+
# for col in columns])
|
1179
|
+
# sql += f" ON DUPLICATE KEY UPDATE {update_clause}"
|
1180
|
+
|
1181
|
+
return sql
|
1182
|
+
|
1183
|
+
def _build_duplicate_check_sql(self, db_name, table_name, all_columns,
|
1184
|
+
duplicate_columns, update_on_duplicate, set_typ):
|
1185
|
+
duplicate_columns = [_item for _item in duplicate_columns if _item.lower() not in self.base_excute_col]
|
1186
|
+
safe_columns = [self._validate_identifier(col) for col in all_columns]
|
1187
|
+
placeholders = ','.join(['%s'] * len(safe_columns))
|
1188
|
+
|
1189
|
+
# 确定排重列(排除id和更新时间列)
|
1190
|
+
dup_cols = duplicate_columns if duplicate_columns else all_columns
|
1191
|
+
|
1192
|
+
# 构建排重条件
|
1193
|
+
conditions = []
|
1194
|
+
for col in dup_cols:
|
1195
|
+
col_type = set_typ.get(col, '').lower()
|
1196
|
+
if col_type.startswith('decimal'):
|
1197
|
+
scale = self._get_decimal_scale(col_type)
|
1198
|
+
conditions.append(f"ROUND(`{col}`, {scale}) = ROUND(%s, {scale})")
|
1199
|
+
else:
|
1200
|
+
conditions.append(f"`{col}` = %s")
|
1201
|
+
|
1202
|
+
# 情况3/5:允许更新
|
1203
|
+
if update_on_duplicate:
|
1204
|
+
update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)"
|
1205
|
+
for col in all_columns])
|
1206
|
+
sql = f"""
|
1207
|
+
INSERT INTO `{db_name}`.`{table_name}`
|
1208
|
+
(`{'`,`'.join(safe_columns)}`)
|
1209
|
+
VALUES ({placeholders})
|
1210
|
+
ON DUPLICATE KEY UPDATE {update_clause}
|
1211
|
+
"""
|
1212
|
+
else:
|
1213
|
+
# 情况4/6:不允许更新
|
1214
|
+
sql = f"""
|
1215
|
+
INSERT INTO `{db_name}`.`{table_name}`
|
1216
|
+
(`{'`,`'.join(safe_columns)}`)
|
1217
|
+
SELECT {placeholders}
|
1218
|
+
FROM DUAL
|
1219
|
+
WHERE NOT EXISTS (
|
1220
|
+
SELECT 1 FROM `{db_name}`.`{table_name}`
|
1221
|
+
WHERE {' AND '.join(conditions)}
|
1222
|
+
)
|
1223
|
+
"""
|
1224
|
+
return sql
|
1225
|
+
|
1157
1226
|
def _prepare_insert_sql(
|
1158
1227
|
self,
|
1159
1228
|
db_name: str,
|
@@ -1163,55 +1232,29 @@ class MySQLUploader:
|
|
1163
1232
|
duplicate_columns: Optional[List[str]],
|
1164
1233
|
update_on_duplicate: bool
|
1165
1234
|
) -> str:
|
1166
|
-
"""
|
1167
|
-
|
1168
|
-
all_columns = [col for col in set_typ.keys() if col.lower() not in ['id', '更新时间']]
|
1169
|
-
safe_columns = [self._validate_identifier(col) for col in all_columns]
|
1170
|
-
placeholders = ','.join(['%s'] * len(safe_columns))
|
1235
|
+
"""
|
1236
|
+
准备插入SQL语句
|
1171
1237
|
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
|
1183
|
-
|
1184
|
-
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in all_columns])
|
1193
|
-
return f"""
|
1194
|
-
INSERT INTO `{db_name}`.`{table_name}`
|
1195
|
-
(`{'`,`'.join(safe_columns)}`)
|
1196
|
-
VALUES ({placeholders})
|
1197
|
-
ON DUPLICATE KEY UPDATE {update_clause}
|
1198
|
-
"""
|
1199
|
-
else:
|
1200
|
-
return f"""INSERT INTO `{db_name}`.`{table_name}`
|
1201
|
-
(`{'`,`'.join(safe_columns)}`)
|
1202
|
-
SELECT {placeholders}
|
1203
|
-
FROM DUAL
|
1204
|
-
WHERE NOT EXISTS (
|
1205
|
-
SELECT 1 FROM `{db_name}`.`{table_name}`
|
1206
|
-
WHERE {where_clause}
|
1207
|
-
)
|
1208
|
-
"""
|
1209
|
-
else:
|
1210
|
-
return f"""
|
1211
|
-
INSERT INTO `{db_name}`.`{table_name}`
|
1212
|
-
(`{'`,`'.join(safe_columns)}`)
|
1213
|
-
VALUES ({placeholders})
|
1214
|
-
"""
|
1238
|
+
1. 当 check_duplicate=False 时,忽略 duplicate_columns 和 update_on_duplicate 参数,直接插入全部data。
|
1239
|
+
2. 当 check_duplicate=False 且 update_on_duplicate=True 时,由于 check_duplicate=False,直接插入全部data。
|
1240
|
+
3. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=True 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1241
|
+
4. 当 check_duplicate=True 且 duplicate_columns=[] 且 update_on_duplicate=False 时,获取数据库所有列(但排除`id`和`更新时间`列),按这些列(不含`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1242
|
+
5. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=True 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,遇到重复数据时更新旧数据。
|
1243
|
+
6. 当 check_duplicate=True 且 duplicate_columns 指定了排重列且 update_on_duplicate=False 时,按 duplicate_columns 指定的列(但排除`id`和`更新时间`)排重插入,不考虑是否更新旧数据。
|
1244
|
+
|
1245
|
+
"""
|
1246
|
+
# 获取所有列名(排除id和更新时间列)
|
1247
|
+
all_columns = [col for col in set_typ.keys()
|
1248
|
+
if col.lower() != 'id']
|
1249
|
+
|
1250
|
+
# 情况1-2:不检查重复
|
1251
|
+
if not check_duplicate:
|
1252
|
+
return self._build_simple_insert_sql(db_name, table_name, all_columns,
|
1253
|
+
update_on_duplicate)
|
1254
|
+
|
1255
|
+
# 情况3-6:检查重复
|
1256
|
+
return self._build_duplicate_check_sql(db_name, table_name, all_columns,
|
1257
|
+
duplicate_columns, update_on_duplicate, set_typ)
|
1215
1258
|
|
1216
1259
|
def _execute_batch_insert(
|
1217
1260
|
self,
|
@@ -1228,7 +1271,8 @@ class MySQLUploader:
|
|
1228
1271
|
) -> Tuple[int, int, int]:
|
1229
1272
|
"""执行批量插入操作"""
|
1230
1273
|
# 获取所有列名(排除id列)
|
1231
|
-
all_columns = [col for col in set_typ.keys()
|
1274
|
+
all_columns = [col for col in set_typ.keys()
|
1275
|
+
if col.lower() != 'id']
|
1232
1276
|
|
1233
1277
|
total_inserted = 0
|
1234
1278
|
total_skipped = 0
|
@@ -1277,6 +1321,7 @@ class MySQLUploader:
|
|
1277
1321
|
try:
|
1278
1322
|
for row_idx, row in enumerate(batch, 1):
|
1279
1323
|
result = self._process_single_row(
|
1324
|
+
db_name, table_name,
|
1280
1325
|
cursor, row, all_columns, sql,
|
1281
1326
|
check_duplicate, duplicate_columns
|
1282
1327
|
)
|
@@ -1308,6 +1353,7 @@ class MySQLUploader:
|
|
1308
1353
|
for row_idx, row in enumerate(batch, 1):
|
1309
1354
|
try:
|
1310
1355
|
result = self._process_single_row(
|
1356
|
+
db_name, table_name,
|
1311
1357
|
cursor, row, all_columns, sql,
|
1312
1358
|
check_duplicate, duplicate_columns
|
1313
1359
|
)
|
@@ -1360,6 +1406,8 @@ class MySQLUploader:
|
|
1360
1406
|
|
1361
1407
|
def _process_single_row(
|
1362
1408
|
self,
|
1409
|
+
db_name,
|
1410
|
+
table_name,
|
1363
1411
|
cursor,
|
1364
1412
|
row: Dict,
|
1365
1413
|
all_columns: List[str],
|
@@ -1374,6 +1422,8 @@ class MySQLUploader:
|
|
1374
1422
|
if check_duplicate:
|
1375
1423
|
row_values += [row.get(col) for col in duplicate_columns]
|
1376
1424
|
|
1425
|
+
# logger.info(sql)
|
1426
|
+
# logger.info(row_values)
|
1377
1427
|
cursor.execute(sql, row_values)
|
1378
1428
|
|
1379
1429
|
if check_duplicate:
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=L-43kDdR8o3iwkH5IR35xUFgTPugEww0j_gk9jPlkCU,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
|
5
5
|
mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
|
@@ -12,7 +12,7 @@ mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
|
12
12
|
mdbq/mysql/deduplicator.py,sha256=brhX3eyE8-kn3nAYweKfBbAkXiNcyw_pL4CTyPqmPBg,21983
|
13
13
|
mdbq/mysql/mysql.py,sha256=Fzaqbjg5g3HdNl50jInIrdurdzcgR2CCzdKLVImD1-Q,55339
|
14
14
|
mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
|
15
|
-
mdbq/mysql/uploader.py,sha256=
|
15
|
+
mdbq/mysql/uploader.py,sha256=ElT1-Jq5nR6qg8re0rfs26YGNPHK6zsNGc3ni7TnWFA,61954
|
16
16
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
17
17
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
18
18
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=YyPWa_nOH1zs8wgTDcgzn5w8szGKWPyWzmWMVIPkFnU,21638
|
28
|
-
mdbq-3.10.
|
29
|
-
mdbq-3.10.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
-
mdbq-3.10.3.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
-
mdbq-3.10.3.dist-info/RECORD,,
|
28
|
+
mdbq-3.10.4.dist-info/METADATA,sha256=z-9kwc0z6aVg7ugS9FPf2TZd1vfyNBYz0qpvsfW3b_w,364
|
29
|
+
mdbq-3.10.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-3.10.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-3.10.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|