mdbq 3.9.17__tar.gz → 3.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mdbq-3.9.17 → mdbq-3.10.0}/PKG-INFO +1 -1
- mdbq-3.10.0/mdbq/__version__.py +1 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/mysql/uploader.py +84 -54
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/spider/aikucun.py +55 -10
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq.egg-info/PKG-INFO +1 -1
- mdbq-3.9.17/mdbq/__version__.py +0 -1
- {mdbq-3.9.17 → mdbq-3.10.0}/README.txt +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/aggregation/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/aggregation/optimize.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/aggregation/query_data.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/config/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/config/config.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/log/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/log/mylogger.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/log/spider_logging.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/mysql/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/mysql/deduplicator.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/mysql/mysql.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/mysql/s_query.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/other/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/other/download_sku_picture.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/other/otk.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/other/pov_city.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/other/ua_sj.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/pbix/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/pbix/pbix_refresh.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/pbix/refresh_all.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/redis/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/redis/getredis.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq/spider/__init__.py +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq.egg-info/SOURCES.txt +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq.egg-info/dependency_links.txt +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/mdbq.egg-info/top_level.txt +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/setup.cfg +0 -0
- {mdbq-3.9.17 → mdbq-3.10.0}/setup.py +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
VERSION = '3.10.0'
|
@@ -698,7 +698,8 @@ class MySQLUploader:
|
|
698
698
|
date_column: Optional[str],
|
699
699
|
indexes: Optional[List[str]],
|
700
700
|
batch_id: Optional[str] = None,
|
701
|
-
update_on_duplicate: bool = False
|
701
|
+
update_on_duplicate: bool = False,
|
702
|
+
transaction_mode: str = "batch"
|
702
703
|
):
|
703
704
|
"""实际执行表上传的方法"""
|
704
705
|
# 检查表是否存在
|
@@ -739,7 +740,8 @@ class MySQLUploader:
|
|
739
740
|
db_name, table_name, data, set_typ,
|
740
741
|
check_duplicate, duplicate_columns,
|
741
742
|
batch_id=batch_id,
|
742
|
-
update_on_duplicate=update_on_duplicate
|
743
|
+
update_on_duplicate=update_on_duplicate,
|
744
|
+
transaction_mode=transaction_mode
|
743
745
|
)
|
744
746
|
|
745
747
|
def _infer_data_type(self, value: Any) -> str:
|
@@ -932,7 +934,8 @@ class MySQLUploader:
|
|
932
934
|
partition_date_column: str = '日期',
|
933
935
|
auto_create: bool = True,
|
934
936
|
indexes: Optional[List[str]] = None,
|
935
|
-
update_on_duplicate: bool = False
|
937
|
+
update_on_duplicate: bool = False,
|
938
|
+
transaction_mode: str = "batch"
|
936
939
|
):
|
937
940
|
"""
|
938
941
|
上传数据到数据库的主入口方法
|
@@ -950,6 +953,11 @@ class MySQLUploader:
|
|
950
953
|
:param auto_create: 表不存在时是否自动创建,默认为True
|
951
954
|
:param indexes: 需要创建索引的列列表,可选
|
952
955
|
:param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
|
956
|
+
:param transaction_mode: 事务提交模式,可选值:
|
957
|
+
- 'row' : 逐行提交事务(错误隔离性好)
|
958
|
+
- 'batch' : 整批提交事务(性能最优)
|
959
|
+
- 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
|
960
|
+
默认值为 'batch'
|
953
961
|
:raises: 可能抛出各种验证和数据库相关异常
|
954
962
|
"""
|
955
963
|
upload_start = time.time()
|
@@ -1035,7 +1043,7 @@ class MySQLUploader:
|
|
1035
1043
|
db_name, part_table, part_data, filtered_set_typ,
|
1036
1044
|
primary_keys, check_duplicate, duplicate_columns,
|
1037
1045
|
allow_null, auto_create, partition_date_column,
|
1038
|
-
indexes, batch_id, update_on_duplicate
|
1046
|
+
indexes, batch_id, update_on_duplicate, transaction_mode
|
1039
1047
|
)
|
1040
1048
|
except Exception as e:
|
1041
1049
|
logger.error(sys._getframe().f_code.co_name, {
|
@@ -1051,7 +1059,7 @@ class MySQLUploader:
|
|
1051
1059
|
db_name, table_name, prepared_data, filtered_set_typ,
|
1052
1060
|
primary_keys, check_duplicate, duplicate_columns,
|
1053
1061
|
allow_null, auto_create, partition_date_column,
|
1054
|
-
indexes, batch_id, update_on_duplicate
|
1062
|
+
indexes, batch_id, update_on_duplicate, transaction_mode
|
1055
1063
|
)
|
1056
1064
|
|
1057
1065
|
success_flag = True
|
@@ -1083,7 +1091,8 @@ class MySQLUploader:
|
|
1083
1091
|
duplicate_columns: Optional[List[str]] = None,
|
1084
1092
|
batch_size: int = 1000,
|
1085
1093
|
batch_id: Optional[str] = None,
|
1086
|
-
update_on_duplicate: bool = False
|
1094
|
+
update_on_duplicate: bool = False,
|
1095
|
+
transaction_mode: str = "batch"
|
1087
1096
|
):
|
1088
1097
|
"""
|
1089
1098
|
实际执行数据插入的方法
|
@@ -1097,15 +1106,37 @@ class MySQLUploader:
|
|
1097
1106
|
:param batch_size: 批量插入大小,默认为1000
|
1098
1107
|
:param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
|
1099
1108
|
:param batch_id: 批次ID用于日志追踪,可选
|
1109
|
+
:param transaction_mode: 事务提交模式,可选值:
|
1110
|
+
- 'row' : 逐行提交事务(错误隔离性好)
|
1111
|
+
- 'batch' : 整批提交事务(性能最优)
|
1112
|
+
- 'hybrid' : 混合模式(每N行提交,平衡性能与安全性)
|
1113
|
+
默认值为 'batch'
|
1100
1114
|
"""
|
1101
1115
|
if not data:
|
1102
1116
|
return
|
1103
1117
|
|
1118
|
+
valid_modes = ('row', 'batch', 'hybrid')
|
1119
|
+
if transaction_mode.lower() not in valid_modes:
|
1120
|
+
logger.error(sys._getframe().f_code.co_name, {
|
1121
|
+
'库': db_name,
|
1122
|
+
'表': table_name,
|
1123
|
+
'参数异常': f'transaction_mode -> {transaction_mode}',
|
1124
|
+
'可选值': valid_modes,
|
1125
|
+
'自动使用默认模式': 'batch'
|
1126
|
+
})
|
1127
|
+
transaction_mode = 'batch'
|
1128
|
+
# return
|
1129
|
+
|
1104
1130
|
# 获取所有列名(排除id列)
|
1105
1131
|
all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
|
1106
1132
|
safe_columns = [self._validate_identifier(col) for col in all_columns]
|
1107
1133
|
placeholders = ','.join(['%s'] * len(safe_columns))
|
1108
1134
|
|
1135
|
+
# 初始化统计变量
|
1136
|
+
total_inserted = 0
|
1137
|
+
total_skipped = 0
|
1138
|
+
total_failed = 0
|
1139
|
+
|
1109
1140
|
# 构建基础SQL语句
|
1110
1141
|
if check_duplicate:
|
1111
1142
|
if not duplicate_columns:
|
@@ -1116,10 +1147,7 @@ class MySQLUploader:
|
|
1116
1147
|
conditions = []
|
1117
1148
|
for col in duplicate_columns:
|
1118
1149
|
col_type = set_typ.get(col, '').lower()
|
1119
|
-
|
1120
|
-
# 处理DECIMAL类型,使用ROUND确保精度一致
|
1121
1150
|
if col_type.startswith('decimal'):
|
1122
|
-
# 提取小数位数,如DECIMAL(10,2)提取2
|
1123
1151
|
scale_match = re.search(r'decimal\(\d+,(\d+)\)', col_type)
|
1124
1152
|
scale = int(scale_match.group(1)) if scale_match else 2
|
1125
1153
|
conditions.append(f"ROUND(`{self._validate_identifier(col)}`, {scale}) = ROUND(%s, {scale})")
|
@@ -1137,10 +1165,6 @@ class MySQLUploader:
|
|
1137
1165
|
VALUES ({placeholders})
|
1138
1166
|
ON DUPLICATE KEY UPDATE {update_clause}
|
1139
1167
|
"""
|
1140
|
-
|
1141
|
-
# 注意:在update_on_duplicate模式下,row_values只需要插入数据,不需要排重列值
|
1142
|
-
def prepare_values(row):
|
1143
|
-
return [row.get(col) for col in all_columns]
|
1144
1168
|
else:
|
1145
1169
|
sql = f"""INSERT INTO `{db_name}`.`{table_name}`
|
1146
1170
|
(`{'`,`'.join(safe_columns)}`)
|
@@ -1151,10 +1175,6 @@ class MySQLUploader:
|
|
1151
1175
|
WHERE {where_clause}
|
1152
1176
|
)
|
1153
1177
|
"""
|
1154
|
-
|
1155
|
-
# 在check_duplicate模式下,row_values需要插入数据+排重列值
|
1156
|
-
def prepare_values(row):
|
1157
|
-
return [row.get(col) for col in all_columns] + [row.get(col) for col in duplicate_columns]
|
1158
1178
|
else:
|
1159
1179
|
sql = f"""
|
1160
1180
|
INSERT INTO `{db_name}`.`{table_name}`
|
@@ -1162,35 +1182,44 @@ class MySQLUploader:
|
|
1162
1182
|
VALUES ({placeholders})
|
1163
1183
|
"""
|
1164
1184
|
|
1165
|
-
# 普通模式下,row_values只需要插入数据
|
1166
|
-
def prepare_values(row):
|
1167
|
-
return [row.get(col) for col in all_columns]
|
1168
|
-
|
1169
|
-
total_inserted = 0
|
1170
|
-
total_skipped = 0
|
1171
|
-
total_failed = 0 # 失败计数器
|
1172
|
-
|
1173
1185
|
# 分批插入数据
|
1174
1186
|
with self._get_connection() as conn:
|
1175
1187
|
with conn.cursor() as cursor:
|
1176
1188
|
for i in range(0, len(data), batch_size):
|
1177
|
-
batch_start = time.time()
|
1178
1189
|
batch = data[i:i + batch_size]
|
1179
|
-
|
1190
|
+
batch_inserted = 0
|
1191
|
+
batch_skipped = 0
|
1192
|
+
batch_failed = 0
|
1180
1193
|
|
1181
|
-
for row in batch:
|
1194
|
+
for row_idx, row in enumerate(batch, 1):
|
1182
1195
|
try:
|
1183
1196
|
# 准备参数
|
1184
|
-
row_values = prepare_values(row)
|
1197
|
+
row_values = [row.get(col) for col in all_columns]
|
1198
|
+
if check_duplicate and not update_on_duplicate:
|
1199
|
+
row_values += [row.get(col) for col in duplicate_columns]
|
1200
|
+
|
1185
1201
|
cursor.execute(sql, row_values)
|
1186
|
-
successful_rows += 1
|
1187
|
-
conn.commit() # 每次成功插入后提交
|
1188
1202
|
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1203
|
+
if check_duplicate:
|
1204
|
+
# 检查是否实际插入了行
|
1205
|
+
if cursor.rowcount > 0:
|
1206
|
+
batch_inserted += 1
|
1207
|
+
else:
|
1208
|
+
batch_skipped += 1
|
1209
|
+
else:
|
1210
|
+
batch_inserted += 1
|
1211
|
+
|
1212
|
+
# 根据模式决定提交时机
|
1213
|
+
if transaction_mode == 'row':
|
1214
|
+
conn.commit() # 逐行提交
|
1215
|
+
elif transaction_mode == 'hybrid' and row_idx % 100 == 0:
|
1216
|
+
conn.commit() # 每100行提交一次
|
1192
1217
|
|
1193
|
-
|
1218
|
+
except Exception as e:
|
1219
|
+
# if transaction_mode == 'row':
|
1220
|
+
# conn.rollback()
|
1221
|
+
conn.rollback()
|
1222
|
+
batch_failed += 1
|
1194
1223
|
logger.error(sys._getframe().f_code.co_name, {
|
1195
1224
|
'库': db_name,
|
1196
1225
|
'表': table_name,
|
@@ -1199,31 +1228,30 @@ class MySQLUploader:
|
|
1199
1228
|
'单行插入失败': str(e),
|
1200
1229
|
'数据类型': set_typ,
|
1201
1230
|
'是否排重': check_duplicate,
|
1202
|
-
'排重列': duplicate_columns
|
1231
|
+
'排重列': duplicate_columns,
|
1232
|
+
'事务提交模式': transaction_mode,
|
1203
1233
|
})
|
1204
|
-
continue # 跳过当前行,继续处理下一行
|
1205
|
-
|
1206
|
-
# 更新统计信息
|
1207
|
-
if check_duplicate:
|
1208
|
-
cursor.execute("SELECT ROW_COUNT()")
|
1209
|
-
affected_rows = cursor.rowcount
|
1210
|
-
total_inserted += affected_rows
|
1211
|
-
total_skipped += len(batch) - affected_rows - (len(batch) - successful_rows)
|
1212
|
-
else:
|
1213
|
-
total_inserted += successful_rows
|
1214
1234
|
|
1215
|
-
|
1235
|
+
# 批量模式最后统一提交
|
1236
|
+
if transaction_mode in ('batch', 'hybrid'):
|
1237
|
+
conn.commit()
|
1238
|
+
|
1239
|
+
# 更新总统计
|
1240
|
+
total_inserted += batch_inserted
|
1241
|
+
total_skipped += batch_skipped
|
1242
|
+
total_failed += batch_failed
|
1243
|
+
|
1216
1244
|
logger.debug(sys._getframe().f_code.co_name, {
|
1217
1245
|
'库': db_name,
|
1218
1246
|
'表': table_name,
|
1219
1247
|
'批次': batch_id,
|
1220
1248
|
'批次处理完成': i // batch_size + 1,
|
1221
|
-
'
|
1222
|
-
'
|
1223
|
-
'
|
1224
|
-
'
|
1225
|
-
'
|
1226
|
-
'
|
1249
|
+
'总批次': (len(data) + batch_size - 1) // batch_size,
|
1250
|
+
'数据量': len(batch),
|
1251
|
+
'插入': batch_inserted,
|
1252
|
+
'跳过': batch_skipped,
|
1253
|
+
'失败': batch_failed,
|
1254
|
+
'事务提交模式': transaction_mode,
|
1227
1255
|
})
|
1228
1256
|
|
1229
1257
|
logger.info('插入完成', {
|
@@ -1232,7 +1260,8 @@ class MySQLUploader:
|
|
1232
1260
|
'完成总计': len(data),
|
1233
1261
|
'插入': total_inserted,
|
1234
1262
|
'跳过': total_skipped,
|
1235
|
-
'失败': total_failed
|
1263
|
+
'失败': total_failed,
|
1264
|
+
'事务提交模式': transaction_mode,
|
1236
1265
|
})
|
1237
1266
|
|
1238
1267
|
def close(self):
|
@@ -1343,6 +1372,7 @@ def main():
|
|
1343
1372
|
partition_date_column='日期', # 用于分表的日期列名,默认为'日期'
|
1344
1373
|
auto_create=True, # 表不存在时自动创建, 默认参数不要更改
|
1345
1374
|
indexes=[], # 指定索引列
|
1375
|
+
transaction_mode='row', # 事务模式
|
1346
1376
|
)
|
1347
1377
|
|
1348
1378
|
uploader.close()
|
@@ -28,10 +28,23 @@ config_file = os.path.join(dir_path, 'spd.txt')
|
|
28
28
|
content = config.read_config(file_path=config_file)
|
29
29
|
username, password, host, port = content['username'], content['password'], content['host'], content['port']
|
30
30
|
|
31
|
-
m_engine = mysql.MysqlUpload(username=username, password=password, host=host, port=port, charset='utf8mb4')
|
31
|
+
# m_engine = mysql.MysqlUpload(username=username, password=password, host=host, port=port, charset='utf8mb4')
|
32
|
+
uld = uploader.MySQLUploader(username=username, password=password, host=host, port=int(port), pool_size=10)
|
32
33
|
# 实例化一个数据查询类,用来获取 cookies 表数据
|
33
34
|
download = s_query.QueryDatas(username=username, password=password, host=host, port=port)
|
34
|
-
logger =
|
35
|
+
logger = mylogger.MyLogger(
|
36
|
+
name='aikucun',
|
37
|
+
logging_mode='file',
|
38
|
+
log_level='info',
|
39
|
+
log_file='aikucun.log',
|
40
|
+
log_format='json',
|
41
|
+
max_log_size=50,
|
42
|
+
backup_count=5,
|
43
|
+
enable_async=False, # 是否启用异步日志
|
44
|
+
sample_rate=1, # 采样50%的DEBUG/INFO日志
|
45
|
+
sensitive_fields=[], # 敏感字段列表
|
46
|
+
enable_metrics=False, # 是否启用性能指标
|
47
|
+
)
|
35
48
|
|
36
49
|
|
37
50
|
def keep_connect(_db_name, _config, max_try: int=10):
|
@@ -175,10 +188,26 @@ class AikuCun:
|
|
175
188
|
'更新时间': 'timestamp'
|
176
189
|
}
|
177
190
|
# 更新至数据库记录
|
178
|
-
m_engine.dict_to_mysql(
|
191
|
+
# m_engine.dict_to_mysql(
|
192
|
+
# db_name=self.db_name,
|
193
|
+
# table_name=self.table_name,
|
194
|
+
# dict_data=self.token,
|
195
|
+
# )
|
196
|
+
uld.upload_data(
|
179
197
|
db_name=self.db_name,
|
180
198
|
table_name=self.table_name,
|
181
|
-
|
199
|
+
data=self.token,
|
200
|
+
set_typ={},
|
201
|
+
primary_keys=[],
|
202
|
+
check_duplicate=False,
|
203
|
+
update_on_duplicate=False,
|
204
|
+
duplicate_columns=[],
|
205
|
+
allow_null=False,
|
206
|
+
partition_by=None,
|
207
|
+
partition_date_column='日期',
|
208
|
+
auto_create=True,
|
209
|
+
indexes=[],
|
210
|
+
transaction_mode='row', # 事务模式
|
182
211
|
)
|
183
212
|
|
184
213
|
def get_data_from_bbx(self, start_date=None, end_date=None, item_type='spu', page_num=1, page_size=300):
|
@@ -400,14 +429,30 @@ class AikuCun:
|
|
400
429
|
drop_dup = ['日期', '平台', '店铺名称', '商品款号', '访客量']
|
401
430
|
else:
|
402
431
|
drop_dup = ['日期', '平台', '店铺名称', '条码']
|
403
|
-
m_engine.insert_many_dict(
|
432
|
+
# m_engine.insert_many_dict(
|
433
|
+
# db_name=db_name,
|
434
|
+
# table_name=table_name,
|
435
|
+
# dict_data_list=_results,
|
436
|
+
# icm_update=drop_dup, # 唯一组合键
|
437
|
+
# # unique_main_key=['人群id'],
|
438
|
+
# set_typ=set_typ,
|
439
|
+
# allow_not_null=False, # 创建允许插入空值的列
|
440
|
+
# )
|
441
|
+
uld.upload_data(
|
404
442
|
db_name=db_name,
|
405
443
|
table_name=table_name,
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
444
|
+
data=_results,
|
445
|
+
set_typ=set_typ, # 定义列和数据类型
|
446
|
+
primary_keys=[], # 创建唯一主键
|
447
|
+
check_duplicate=True, # 检查重复数据
|
448
|
+
update_on_duplicate=False, # 遇到重复时更新数据,默认 False 跳过
|
449
|
+
duplicate_columns=drop_dup, # 指定排重的组合键
|
450
|
+
allow_null=False, # 允许插入空值
|
451
|
+
partition_by=None, # 按年/月分表
|
452
|
+
partition_date_column='日期', # 用于分表的日期列名,默认为'日期'
|
453
|
+
auto_create=True, # 表不存在时自动创建, 默认参数不要更改
|
454
|
+
indexes=[], # 指定索引列
|
455
|
+
transaction_mode='row', # 事务模式
|
411
456
|
)
|
412
457
|
|
413
458
|
def get_sign(self):
|
mdbq-3.9.17/mdbq/__version__.py
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
VERSION = '3.9.17'
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|