mdbq 4.0.8__py3-none-any.whl → 4.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/query_data.py +83 -62
- mdbq/mysql/s_query.py +183 -144
- mdbq/mysql/uploader.py +58 -9
- {mdbq-4.0.8.dist-info → mdbq-4.0.10.dist-info}/METADATA +1 -1
- {mdbq-4.0.8.dist-info → mdbq-4.0.10.dist-info}/RECORD +8 -8
- {mdbq-4.0.8.dist-info → mdbq-4.0.10.dist-info}/WHEEL +0 -0
- {mdbq-4.0.8.dist-info → mdbq-4.0.10.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '4.0.8'
|
1
|
+
VERSION = '4.0.10'
|
mdbq/aggregation/query_data.py
CHANGED
@@ -119,9 +119,16 @@ def upload_data_decorator(**upload_kwargs):
|
|
119
119
|
return None
|
120
120
|
|
121
121
|
# 处理 DataFrame 结果
|
122
|
-
if isinstance(result, pd.DataFrame):
|
122
|
+
if isinstance(result, (pd.DataFrame, list, dict)):
|
123
123
|
if set_type is not None:
|
124
|
-
|
124
|
+
if isinstance(result, pd.DataFrame):
|
125
|
+
result = reorder_columns(result, set_type)
|
126
|
+
elif isinstance(result, list):
|
127
|
+
# 如果是list,转换为DataFrame以调整列顺序
|
128
|
+
result = reorder_columns(pd.DataFrame(result), set_type)
|
129
|
+
elif isinstance(result, dict):
|
130
|
+
# 如果是dict,转换为DataFrame以调整列顺序
|
131
|
+
result = reorder_columns(pd.DataFrame([result]), set_type)
|
125
132
|
|
126
133
|
# 合并参数
|
127
134
|
merged_kwargs = {
|
@@ -143,12 +150,19 @@ def upload_data_decorator(**upload_kwargs):
|
|
143
150
|
|
144
151
|
df, extra_kwargs = result[0], result[1]
|
145
152
|
|
146
|
-
if not isinstance(df, pd.DataFrame):
|
147
|
-
logger.warning('函数返回的元组第一个元素不是DataFrame,直接返回原结果,不执行上传', {'函数': func.__name__, '库': db_name, '表': table_name})
|
153
|
+
if not isinstance(df, (pd.DataFrame, list, dict)):
|
154
|
+
logger.warning('函数返回的元组第一个元素不是DataFrame/list/dict,直接返回原结果,不执行上传', {'函数': func.__name__, '库': db_name, '表': table_name})
|
148
155
|
return result
|
149
156
|
|
150
157
|
if set_type is not None:
|
151
|
-
|
158
|
+
if isinstance(df, pd.DataFrame):
|
159
|
+
df = reorder_columns(df, set_type)
|
160
|
+
elif isinstance(df, list):
|
161
|
+
# 如果是list,转换为DataFrame以调整列顺序
|
162
|
+
df = reorder_columns(pd.DataFrame(df), set_type)
|
163
|
+
elif isinstance(df, dict):
|
164
|
+
# 如果是dict,转换为DataFrame以调整列顺序
|
165
|
+
df = reorder_columns(pd.DataFrame([df]), set_type)
|
152
166
|
result = (df, extra_kwargs) + result[2:]
|
153
167
|
|
154
168
|
# 合并参数
|
@@ -1543,7 +1557,6 @@ class MysqlDatasQuery:
|
|
1543
1557
|
'unique_keys': [['日期', '店铺id', '商品id']], # 唯一约束列表
|
1544
1558
|
}
|
1545
1559
|
|
1546
|
-
|
1547
1560
|
@upload_data_decorator()
|
1548
1561
|
def spph(self, db_name='聚合数据', table_name='天猫_商品排行'):
|
1549
1562
|
""" """
|
@@ -2370,61 +2383,57 @@ class MysqlDatasQuery:
|
|
2370
2383
|
'更新时间': 'timestamp',
|
2371
2384
|
}
|
2372
2385
|
logger.info('正在更新数据库', {'主机': f'{host}:{port}', '库': db_name, '表': table_name})
|
2373
|
-
|
2374
|
-
'日期'
|
2375
|
-
'店铺名称'
|
2376
|
-
'场次信息'
|
2377
|
-
'场次id'
|
2378
|
-
'直播开播时间'
|
2379
|
-
'开播时长'
|
2380
|
-
'封面图点击率'
|
2381
|
-
'观看人数'
|
2382
|
-
'观看次数'
|
2383
|
-
'新增粉丝数'
|
2384
|
-
'流量券消耗'
|
2385
|
-
'观看总时长'
|
2386
|
-
'人均观看时长'
|
2387
|
-
'次均观看时长'
|
2388
|
-
'商品点击人数'
|
2389
|
-
'商品点击次数'
|
2390
|
-
'商品点击率'
|
2391
|
-
'加购人数'
|
2392
|
-
'加购件数'
|
2393
|
-
'加购次数'
|
2394
|
-
'成交金额'
|
2395
|
-
'成交人数'
|
2396
|
-
'成交件数'
|
2397
|
-
'成交笔数'
|
2398
|
-
'成交转化率'
|
2399
|
-
'退款人数'
|
2400
|
-
'退款笔数'
|
2401
|
-
'退款件数'
|
2402
|
-
'退款金额'
|
2403
|
-
'预售定金支付金额'
|
2404
|
-
'预售预估总金额'
|
2405
|
-
|
2406
|
-
|
2407
|
-
for
|
2408
|
-
|
2409
|
-
|
2410
|
-
|
2411
|
-
|
2412
|
-
|
2413
|
-
|
2414
|
-
|
2415
|
-
|
2416
|
-
|
2417
|
-
|
2418
|
-
|
2419
|
-
|
2420
|
-
|
2421
|
-
|
2422
|
-
|
2423
|
-
|
2424
|
-
'unique_keys': [['场次id']], # 唯一约束列表
|
2425
|
-
}
|
2426
|
-
else:
|
2427
|
-
return None, None
|
2386
|
+
ordered_columns = [
|
2387
|
+
'日期',
|
2388
|
+
'店铺名称',
|
2389
|
+
'场次信息',
|
2390
|
+
'场次id',
|
2391
|
+
'直播开播时间',
|
2392
|
+
'开播时长',
|
2393
|
+
'封面图点击率',
|
2394
|
+
'观看人数',
|
2395
|
+
'观看次数',
|
2396
|
+
'新增粉丝数',
|
2397
|
+
'流量券消耗',
|
2398
|
+
'观看总时长',
|
2399
|
+
'人均观看时长',
|
2400
|
+
'次均观看时长',
|
2401
|
+
'商品点击人数',
|
2402
|
+
'商品点击次数',
|
2403
|
+
'商品点击率',
|
2404
|
+
'加购人数',
|
2405
|
+
'加购件数',
|
2406
|
+
'加购次数',
|
2407
|
+
'成交金额',
|
2408
|
+
'成交人数',
|
2409
|
+
'成交件数',
|
2410
|
+
'成交笔数',
|
2411
|
+
'成交转化率',
|
2412
|
+
'退款人数',
|
2413
|
+
'退款笔数',
|
2414
|
+
'退款件数',
|
2415
|
+
'退款金额',
|
2416
|
+
'预售定金支付金额',
|
2417
|
+
'预售预估总金额',
|
2418
|
+
]
|
2419
|
+
# 使用reindex重排列顺序,未定义的列会自动放在最后
|
2420
|
+
df = df.reindex(columns=[col for col in ordered_columns if col in df.columns] +
|
2421
|
+
[col for col in df.columns if col not in ordered_columns])
|
2422
|
+
return df, {
|
2423
|
+
'db_name': db_name,
|
2424
|
+
'table_name': table_name,
|
2425
|
+
'set_typ': set_typ,
|
2426
|
+
'primary_keys': [], # 创建唯一主键
|
2427
|
+
'check_duplicate': False, # 检查重复数据
|
2428
|
+
'duplicate_columns': [], # 指定排重的组合键
|
2429
|
+
'update_on_duplicate': True, # 更新旧数据
|
2430
|
+
'allow_null': False, # 允许插入空值
|
2431
|
+
'partition_by': None, # 分表方式
|
2432
|
+
'partition_date_column': '日期', # 用于分表的日期列名,默认为'日期'
|
2433
|
+
'indexes': [], # 普通索引列
|
2434
|
+
'transaction_mode': 'batch', # 事务模式
|
2435
|
+
'unique_keys': [['场次id']], # 唯一约束列表
|
2436
|
+
}
|
2428
2437
|
|
2429
2438
|
# @try_except
|
2430
2439
|
@upload_data_decorator()
|
@@ -3675,13 +3684,25 @@ def main(months=3):
|
|
3675
3684
|
password=password,
|
3676
3685
|
host=host,
|
3677
3686
|
port=port,
|
3678
|
-
maxconnections=
|
3687
|
+
maxconnections=20,
|
3679
3688
|
)
|
3680
3689
|
query1(download_manager=download_manager, months=months)
|
3681
3690
|
query2(download_manager=download_manager, months=months)
|
3682
3691
|
query3(download_manager=download_manager, months=months)
|
3692
|
+
logger.info('数据聚合完成')
|
3683
3693
|
|
3684
3694
|
|
3685
3695
|
if __name__ == '__main__':
|
3686
3696
|
main(months=3)
|
3687
3697
|
pass
|
3698
|
+
|
3699
|
+
# download_manager = s_query.QueryDatas(
|
3700
|
+
# username=username,
|
3701
|
+
# password=password,
|
3702
|
+
# host=host,
|
3703
|
+
# port=port,
|
3704
|
+
# maxconnections=10,
|
3705
|
+
# )
|
3706
|
+
# sdq = MysqlDatasQuery(download_manager=download_manager)
|
3707
|
+
# sdq.months = 3
|
3708
|
+
# sdq.zb_ccfx(db_name='聚合数据', table_name='生意参谋_直播场次分析')
|
mdbq/mysql/s_query.py
CHANGED
@@ -47,9 +47,9 @@ class QueryDatas:
|
|
47
47
|
host: 数据库主机
|
48
48
|
port: 数据库端口
|
49
49
|
charset: 字符集,默认utf8mb4
|
50
|
-
maxconnections:
|
51
|
-
mincached:
|
52
|
-
maxcached:
|
50
|
+
maxconnections: 最大活动连接数,默认20
|
51
|
+
mincached: 最小缓存连接数,空闲连接数量,默认2
|
52
|
+
maxcached: 最大缓存连接数,最大空闲连接数,默认5
|
53
53
|
connect_timeout: 连接超时时间,默认10秒
|
54
54
|
read_timeout: 读取超时时间,默认30秒
|
55
55
|
write_timeout: 写入超时时间,默认30秒
|
@@ -253,20 +253,8 @@ class QueryDatas:
|
|
253
253
|
|
254
254
|
# @_execute_with_retry
|
255
255
|
def _get_connection(self, db_name: Optional[str] = None) -> pymysql.connections.Connection:
|
256
|
-
"""
|
257
|
-
从连接池获取数据库连接
|
258
|
-
|
259
|
-
Args:
|
260
|
-
db_name: 可选的数据库名,如果提供则会在连接后选择该数据库
|
261
|
-
|
262
|
-
Returns:
|
263
|
-
数据库连接对象
|
264
|
-
|
265
|
-
Raises:
|
266
|
-
ConnectionError: 当获取连接失败时抛出
|
267
|
-
"""
|
256
|
+
"""从连接池获取数据库连接"""
|
268
257
|
try:
|
269
|
-
# 只在连续失败次数达到阈值时检查健康状态
|
270
258
|
if self._pool_stats['consecutive_failures'] >= self._pool_stats['max_consecutive_failures']:
|
271
259
|
if not self._check_pool_health():
|
272
260
|
logger.warning('连接池不健康,尝试重新创建')
|
@@ -282,66 +270,184 @@ class QueryDatas:
|
|
282
270
|
error_code = e.args[0] if e.args else None
|
283
271
|
if error_code in (2003, 2006, 2013):
|
284
272
|
logger.error('数据库连接错误', {
|
273
|
+
'库': db_name,
|
285
274
|
'错误代码': error_code,
|
286
275
|
'错误信息': str(e),
|
287
|
-
'数据库': db_name
|
288
276
|
})
|
289
277
|
self.pool = self._create_connection_pool(10, 2, 5)
|
290
278
|
self._pool_stats['consecutive_failures'] = 0
|
291
279
|
raise ConnectionError(f'数据库连接错误: {str(e)}')
|
292
|
-
|
293
|
-
raise
|
280
|
+
raise
|
294
281
|
except Exception as e:
|
295
282
|
logger.error('从连接池获取数据库连接失败', {
|
283
|
+
'库': db_name,
|
296
284
|
'错误': str(e),
|
297
|
-
'数据库': db_name
|
298
285
|
})
|
299
286
|
raise ConnectionError(f'连接数据库失败: {str(e)}')
|
300
287
|
|
301
288
|
# @_execute_with_retry
|
302
|
-
def _execute_query(self, sql: str, params: tuple = None, db_name: str = None
|
289
|
+
def _execute_query(self, sql: str, params: tuple = None, db_name: str = None,
|
290
|
+
fetch_all: bool = True, error_handling: bool = True) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
|
291
|
+
"""执行SQL查询的通用方法"""
|
292
|
+
try:
|
293
|
+
if sql.upper().startswith('SHOW DATABASES'):
|
294
|
+
with closing(self._get_connection()) as connection:
|
295
|
+
with closing(connection.cursor()) as cursor:
|
296
|
+
cursor.execute(sql, params)
|
297
|
+
return cursor.fetchall() if fetch_all else cursor.fetchone()
|
298
|
+
else:
|
299
|
+
with closing(self._get_connection(db_name)) as connection:
|
300
|
+
with closing(connection.cursor()) as cursor:
|
301
|
+
cursor.execute(sql, params)
|
302
|
+
return cursor.fetchall() if fetch_all else cursor.fetchone()
|
303
|
+
except pymysql.OperationalError as e:
|
304
|
+
error_code = e.args[0] if e.args else None
|
305
|
+
if error_handling:
|
306
|
+
if error_code in (1045, 1049): # 访问被拒绝或数据库不存在
|
307
|
+
logger.error('数据库访问错误', {
|
308
|
+
'SQL': sql,
|
309
|
+
'参数': params,
|
310
|
+
'库': db_name,
|
311
|
+
'错误代码': error_code,
|
312
|
+
'错误信息': str(e)
|
313
|
+
})
|
314
|
+
else:
|
315
|
+
logger.error('数据库操作错误', {
|
316
|
+
'库': db_name,
|
317
|
+
'SQL': sql,
|
318
|
+
'参数': params,
|
319
|
+
'错误代码': error_code,
|
320
|
+
'错误信息': str(e)
|
321
|
+
})
|
322
|
+
return None
|
323
|
+
raise
|
324
|
+
except Exception as e:
|
325
|
+
if error_handling:
|
326
|
+
logger.error('执行SQL查询失败', {
|
327
|
+
'库': db_name,
|
328
|
+
'SQL': sql,
|
329
|
+
'参数': params,
|
330
|
+
'错误类型': type(e).__name__,
|
331
|
+
'错误信息': str(e)
|
332
|
+
})
|
333
|
+
return None
|
334
|
+
raise
|
335
|
+
|
336
|
+
def _get_table_info(self, db_name: str, table_name: str, info_type: Literal['columns', 'dtypes', 'exists'] = 'exists') -> Union[bool, List[Dict[str, Any]], List[str]]:
|
303
337
|
"""
|
304
|
-
|
338
|
+
获取表信息的通用方法。
|
305
339
|
|
306
340
|
Args:
|
307
|
-
sql: SQL查询语句
|
308
|
-
params: 查询参数
|
309
341
|
db_name: 数据库名
|
310
|
-
|
342
|
+
table_name: 表名
|
343
|
+
info_type: 信息类型
|
344
|
+
- 'exists': 检查表是否存在(默认)
|
345
|
+
- 'columns': 获取列名列表
|
346
|
+
- 'dtypes': 获取列名和类型
|
347
|
+
|
311
348
|
Returns:
|
312
|
-
|
349
|
+
根据info_type返回不同类型的信息:
|
350
|
+
- 'exists': 返回bool,表示表是否存在
|
351
|
+
- 'columns': 返回列名列表
|
352
|
+
- 'dtypes': 返回列名和类型的列表
|
313
353
|
"""
|
314
354
|
try:
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
355
|
+
if info_type == 'exists':
|
356
|
+
result = self._execute_query("SHOW DATABASES LIKE %s", (db_name,))
|
357
|
+
if not result:
|
358
|
+
all_dbs = self._execute_query("SHOW DATABASES")
|
359
|
+
available_dbs = [db['Database'] for db in all_dbs] if all_dbs else []
|
360
|
+
logger.info('数据库不存在', {
|
361
|
+
'库': db_name,
|
362
|
+
'可用的数据库': available_dbs,
|
363
|
+
'可能的原因': '数据库名称错误或没有访问权限'
|
364
|
+
})
|
365
|
+
return False
|
366
|
+
|
367
|
+
result = self._execute_query("SHOW TABLES LIKE %s", (table_name,), db_name=db_name)
|
368
|
+
if not result:
|
369
|
+
all_tables = self._execute_query("SHOW TABLES", db_name=db_name)
|
370
|
+
available_tables = [table[f'Tables_in_{db_name}'] for table in all_tables] if all_tables else []
|
371
|
+
logger.info('表不存在', {
|
372
|
+
'库': db_name,
|
373
|
+
'表': table_name,
|
374
|
+
'可用的表': available_tables,
|
375
|
+
'可能的原因': '表名称错误或没有访问权限'
|
376
|
+
})
|
377
|
+
return False
|
378
|
+
return True
|
379
|
+
|
380
|
+
elif info_type == 'columns':
|
381
|
+
sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
382
|
+
result = self._execute_query(sql, (db_name, table_name))
|
383
|
+
return [col['COLUMN_NAME'] for col in result] if result else []
|
384
|
+
|
385
|
+
elif info_type == 'dtypes':
|
386
|
+
sql = 'SELECT COLUMN_NAME, COLUMN_TYPE FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
387
|
+
return self._execute_query(sql, (db_name, table_name)) or []
|
388
|
+
|
319
389
|
except Exception as e:
|
320
|
-
logger.error('
|
321
|
-
'
|
322
|
-
'
|
323
|
-
'
|
390
|
+
logger.error('获取表信息失败', {
|
391
|
+
'库': db_name,
|
392
|
+
'表': table_name,
|
393
|
+
'信息类型': info_type,
|
324
394
|
'错误类型': type(e).__name__,
|
325
395
|
'错误信息': str(e)
|
326
396
|
})
|
327
|
-
return
|
397
|
+
return [] if info_type != 'exists' else False
|
328
398
|
|
329
|
-
def
|
330
|
-
"""
|
331
|
-
|
399
|
+
def check_infos(self, db_name: str, table_name: str) -> bool:
|
400
|
+
"""检查数据库和数据表是否存在"""
|
401
|
+
return self._get_table_info(db_name, table_name, 'exists')
|
402
|
+
|
403
|
+
def _format_columns(self, columns: List[str]) -> str:
|
404
|
+
"""格式化列名列表为SQL语句"""
|
405
|
+
return ', '.join([f'`{col}`' for col in columns])
|
406
|
+
|
407
|
+
def columns_to_list(self, db_name: str, table_name: str, columns_name: List[str], where: str = None) -> List[Dict[str, Any]]:
|
408
|
+
"""获取数据表的指定列数据"""
|
409
|
+
if not self._get_table_info(db_name, table_name):
|
410
|
+
return []
|
332
411
|
|
333
|
-
|
334
|
-
db_name
|
335
|
-
|
336
|
-
condition: SQL条件字符串(不含WHERE)
|
337
|
-
columns: 查询字段字符串或以逗号分隔的字段名,默认'更新时间'
|
412
|
+
try:
|
413
|
+
existing_columns = self._get_table_info(db_name, table_name, 'columns')
|
414
|
+
columns_name = [col for col in columns_name if col in existing_columns]
|
338
415
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
416
|
+
if not columns_name:
|
417
|
+
logger.info('未找到匹配的列名', {'库': db_name, '表': table_name, '请求列': columns_name})
|
418
|
+
return []
|
419
|
+
|
420
|
+
sql = f"SELECT {self._format_columns(columns_name)} FROM `{db_name}`.`{table_name}`"
|
421
|
+
if where:
|
422
|
+
sql += f" WHERE {where}"
|
344
423
|
|
424
|
+
logger.debug('执行列查询', {'库': db_name, '表': table_name, 'SQL': sql})
|
425
|
+
return self._execute_query(sql, db_name=db_name) or []
|
426
|
+
|
427
|
+
except Exception as e:
|
428
|
+
logger.error('列查询失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
|
429
|
+
return []
|
430
|
+
|
431
|
+
def dtypes_to_list(self, db_name: str, table_name: str, columns_name: List[str] = None) -> List[Dict[str, Any]]:
|
432
|
+
"""获取数据表的列名和类型"""
|
433
|
+
if not self._get_table_info(db_name, table_name):
|
434
|
+
return []
|
435
|
+
|
436
|
+
try:
|
437
|
+
result = self._get_table_info(db_name, table_name, 'dtypes')
|
438
|
+
if columns_name:
|
439
|
+
columns_name = set(columns_name)
|
440
|
+
result = [row for row in result if row['COLUMN_NAME'] in columns_name]
|
441
|
+
return result
|
442
|
+
except Exception as e:
|
443
|
+
logger.error('获取列类型失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
|
444
|
+
return []
|
445
|
+
|
446
|
+
def check_condition(self, db_name: str, table_name: str, condition: str, columns: str = '更新时间') -> Optional[List[Dict[str, Any]]]:
|
447
|
+
"""按指定条件查询数据库表"""
|
448
|
+
if not self._get_table_info(db_name, table_name):
|
449
|
+
return None
|
450
|
+
|
345
451
|
sql = f"SELECT {columns} FROM `{table_name}` WHERE {condition}"
|
346
452
|
logger.debug('执行SQL查询', {'库': db_name, '表': table_name, 'SQL': sql})
|
347
453
|
return self._execute_query(sql, db_name=db_name)
|
@@ -598,98 +704,6 @@ class QueryDatas:
|
|
598
704
|
df[col] = df[col].astype(float)
|
599
705
|
return df
|
600
706
|
|
601
|
-
# @_execute_with_retry
|
602
|
-
def columns_to_list(self, db_name, table_name, columns_name, where: str = None) -> list:
|
603
|
-
"""
|
604
|
-
获取数据表的指定列, 支持where条件筛选, 返回列表字典。
|
605
|
-
:param db_name: 数据库名
|
606
|
-
:param table_name: 表名
|
607
|
-
:param columns_name: 需要获取的列名列表
|
608
|
-
:param where: 可选,SQL条件字符串(不含WHERE)
|
609
|
-
:return: [{列1:值, 列2:值, ...}, ...]
|
610
|
-
"""
|
611
|
-
if not self.check_infos(db_name, table_name):
|
612
|
-
return []
|
613
|
-
|
614
|
-
try:
|
615
|
-
with closing(self._get_connection(db_name)) as connection:
|
616
|
-
with closing(connection.cursor()) as cursor:
|
617
|
-
sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
618
|
-
cursor.execute(sql, (db_name, table_name))
|
619
|
-
cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()]
|
620
|
-
columns_name = [item for item in columns_name if item in cols_exist]
|
621
|
-
if not columns_name:
|
622
|
-
logger.info('未找到匹配的列名', {'库': db_name, '表': table_name, '请求列': columns_name})
|
623
|
-
return []
|
624
|
-
columns_in = ', '.join([f'`{col}`' for col in columns_name])
|
625
|
-
sql = f"SELECT {columns_in} FROM `{db_name}`.`{table_name}`"
|
626
|
-
if where:
|
627
|
-
sql += f" WHERE {where}"
|
628
|
-
logger.debug('执行列查询', {'库': db_name, '表': table_name, 'SQL': sql})
|
629
|
-
cursor.execute(sql)
|
630
|
-
column_values = cursor.fetchall()
|
631
|
-
return column_values
|
632
|
-
except Exception as e:
|
633
|
-
logger.error('列查询失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
|
634
|
-
return []
|
635
|
-
|
636
|
-
# @_execute_with_retry
|
637
|
-
def dtypes_to_list(self, db_name, table_name, columns_name=None) -> list:
|
638
|
-
"""
|
639
|
-
获取数据表的列名和类型, 支持只返回部分字段类型。
|
640
|
-
:param db_name: 数据库名
|
641
|
-
:param table_name: 表名
|
642
|
-
:param columns_name: 可选,字段名列表,仅返回这些字段的类型
|
643
|
-
:return: [{'COLUMN_NAME': ..., 'COLUMN_TYPE': ...}, ...]
|
644
|
-
"""
|
645
|
-
if not self.check_infos(db_name, table_name):
|
646
|
-
return []
|
647
|
-
|
648
|
-
try:
|
649
|
-
with closing(self._get_connection(db_name)) as connection:
|
650
|
-
with closing(connection.cursor()) as cursor:
|
651
|
-
sql = 'SELECT COLUMN_NAME, COLUMN_TYPE FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
652
|
-
cursor.execute(sql, (db_name, table_name))
|
653
|
-
column_name_and_type = cursor.fetchall()
|
654
|
-
if columns_name:
|
655
|
-
columns_name = set(columns_name)
|
656
|
-
column_name_and_type = [row for row in column_name_and_type if row['COLUMN_NAME'] in columns_name]
|
657
|
-
return column_name_and_type
|
658
|
-
except Exception as e:
|
659
|
-
logger.error('获取列类型失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
|
660
|
-
return []
|
661
|
-
|
662
|
-
# @_execute_with_retry
|
663
|
-
def check_infos(self, db_name, table_name) -> bool:
|
664
|
-
"""
|
665
|
-
检查数据库和数据表是否存在。
|
666
|
-
:param db_name: 数据库名
|
667
|
-
:param table_name: 表名
|
668
|
-
:return: 存在返回True,否则False
|
669
|
-
"""
|
670
|
-
try:
|
671
|
-
# 检查数据库是否存在
|
672
|
-
result = self._execute_query("SHOW DATABASES LIKE %s", (db_name,))
|
673
|
-
if not result:
|
674
|
-
logger.info('数据库不存在', {'库': db_name})
|
675
|
-
return False
|
676
|
-
|
677
|
-
# 检查表是否存在
|
678
|
-
result = self._execute_query("SHOW TABLES LIKE %s", (table_name,), db_name=db_name)
|
679
|
-
if not result:
|
680
|
-
logger.info('表不存在', {'库': db_name, '表': table_name})
|
681
|
-
return False
|
682
|
-
return True
|
683
|
-
|
684
|
-
except Exception as e:
|
685
|
-
logger.error('检查数据库或表失败', {
|
686
|
-
'库': db_name,
|
687
|
-
'表': table_name,
|
688
|
-
'错误类型': type(e).__name__,
|
689
|
-
'错误信息': str(e)
|
690
|
-
})
|
691
|
-
return False
|
692
|
-
|
693
707
|
def __enter__(self):
|
694
708
|
"""上下文管理器入口"""
|
695
709
|
return self
|
@@ -772,6 +786,8 @@ class QueryDatas:
|
|
772
786
|
- 当return_format='list_dict'时,返回列表字典
|
773
787
|
- 如果查询失败,返回空的DataFrame或空列表
|
774
788
|
"""
|
789
|
+
start_time = time.time()
|
790
|
+
|
775
791
|
if not db_name or not table_name:
|
776
792
|
logger.error('数据库名和表名不能为空', {'库': db_name, '表': table_name})
|
777
793
|
return [] if return_format == 'list_dict' else pd.DataFrame()
|
@@ -786,7 +802,7 @@ class QueryDatas:
|
|
786
802
|
start_date, end_date = self._validate_date_range(start_date, end_date, db_name, table_name)
|
787
803
|
|
788
804
|
# 检查数据库和表是否存在
|
789
|
-
if not self.check_infos(db_name, table_name):
|
805
|
+
if not self._get_table_info(db_name, table_name):
|
790
806
|
return [] if return_format == 'list_dict' else pd.DataFrame()
|
791
807
|
try:
|
792
808
|
with closing(self._get_connection(db_name)) as connection:
|
@@ -863,7 +879,7 @@ class QueryDatas:
|
|
863
879
|
target_time = 1.0 # 期望每批1秒
|
864
880
|
|
865
881
|
while offset < total_count:
|
866
|
-
|
882
|
+
_p_time = time.time()
|
867
883
|
# 添加分页参数
|
868
884
|
page_sql = f"{base_sql} LIMIT %s OFFSET %s"
|
869
885
|
page_params = list(params) + [page_size, offset]
|
@@ -881,7 +897,7 @@ class QueryDatas:
|
|
881
897
|
else:
|
882
898
|
all_results = pd.concat([all_results, pd.DataFrame(page_results)], ignore_index=True)
|
883
899
|
|
884
|
-
duration = time.time() -
|
900
|
+
duration = time.time() - _p_time
|
885
901
|
page_size = self._adjust_page_size(duration, page_size, min_size, max_size, target_time)
|
886
902
|
offset += len(page_results)
|
887
903
|
logger.debug('分页查询进度', {
|
@@ -896,6 +912,21 @@ class QueryDatas:
|
|
896
912
|
|
897
913
|
if return_format == 'df' and isinstance(all_results, pd.DataFrame) and not all_results.empty:
|
898
914
|
all_results = self._convert_decimal_columns(all_results)
|
915
|
+
logger.info('查询完成', {
|
916
|
+
'库': db_name,
|
917
|
+
'表': table_name,
|
918
|
+
'总记录数': total_count,
|
919
|
+
'已获取记录数': len(all_results) if return_format == 'list_dict' else len(all_results.index),
|
920
|
+
'查询耗时': f'{time.time() - start_time:.2f}s',
|
921
|
+
'查询参数': {
|
922
|
+
'开始日期': start_date,
|
923
|
+
'结束日期': end_date,
|
924
|
+
'日期字段': date_field,
|
925
|
+
'限制行数': limit,
|
926
|
+
'分页大小': page_size,
|
927
|
+
'返回数据格式': return_format,
|
928
|
+
}
|
929
|
+
})
|
899
930
|
return all_results
|
900
931
|
|
901
932
|
except Exception as e:
|
@@ -903,7 +934,15 @@ class QueryDatas:
|
|
903
934
|
'库': db_name,
|
904
935
|
'表': table_name,
|
905
936
|
'错误类型': type(e).__name__,
|
906
|
-
'错误信息': str(e)
|
937
|
+
'错误信息': str(e),
|
938
|
+
'查询参数': {
|
939
|
+
'开始日期': start_date,
|
940
|
+
'结束日期': end_date,
|
941
|
+
'日期字段': date_field,
|
942
|
+
'限制行数': limit,
|
943
|
+
'分页大小': page_size,
|
944
|
+
'返回数据格式': return_format,
|
945
|
+
}
|
907
946
|
})
|
908
947
|
return [] if return_format == 'list_dict' else pd.DataFrame()
|
909
948
|
|
mdbq/mysql/uploader.py
CHANGED
@@ -404,7 +404,15 @@ class MySQLUploader:
|
|
404
404
|
raise ValueError('set_typ 未指定')
|
405
405
|
# set_typ的键清洗
|
406
406
|
set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
|
407
|
-
|
407
|
+
|
408
|
+
# 处理id列和主键
|
409
|
+
column_defs = []
|
410
|
+
|
411
|
+
# 添加id列(仅在没有指定主键时)
|
412
|
+
if not primary_keys:
|
413
|
+
column_defs.append("`id` INT NOT NULL AUTO_INCREMENT")
|
414
|
+
|
415
|
+
# 添加其他列
|
408
416
|
for col_name, col_type in set_typ.items():
|
409
417
|
if col_name == 'id':
|
410
418
|
continue
|
@@ -413,18 +421,23 @@ class MySQLUploader:
|
|
413
421
|
if not allow_null and not col_type.lower().startswith('json'):
|
414
422
|
col_def += " NOT NULL"
|
415
423
|
column_defs.append(col_def)
|
424
|
+
|
416
425
|
# 主键处理逻辑调整
|
417
426
|
def _index_col_sql(col):
|
418
427
|
col_type = set_typ.get(col, '').lower()
|
419
428
|
if 'varchar' in col_type or 'text' in col_type:
|
420
429
|
return f"`{self._normalize_col(col)}`(100)"
|
421
430
|
return f"`{self._normalize_col(col)}`"
|
431
|
+
|
432
|
+
# 处理主键
|
422
433
|
if primary_keys and len(primary_keys) > 0:
|
434
|
+
# 如果指定了主键,直接使用指定的主键
|
423
435
|
safe_primary_keys = [_index_col_sql(pk) for pk in primary_keys]
|
424
436
|
primary_key_sql = f"PRIMARY KEY ({','.join(safe_primary_keys)})"
|
425
437
|
else:
|
426
|
-
|
438
|
+
# 如果没有指定主键,使用id作为主键
|
427
439
|
primary_key_sql = f"PRIMARY KEY (`id`)"
|
440
|
+
|
428
441
|
# 索引统一在CREATE TABLE中定义
|
429
442
|
index_defs = []
|
430
443
|
if date_column and date_column in set_typ:
|
@@ -435,15 +448,28 @@ class MySQLUploader:
|
|
435
448
|
if idx_col in set_typ:
|
436
449
|
safe_idx_col = _index_col_sql(idx_col)
|
437
450
|
index_defs.append(f"INDEX `idx_{self._normalize_col(idx_col)}` ({safe_idx_col})")
|
451
|
+
|
438
452
|
# UNIQUE KEY定义
|
439
453
|
unique_defs = []
|
440
454
|
if unique_keys:
|
441
455
|
for unique_cols in unique_keys:
|
442
456
|
if not unique_cols:
|
443
457
|
continue
|
458
|
+
# 检查唯一约束是否与主键冲突
|
459
|
+
if primary_keys:
|
460
|
+
# 如果唯一约束的列是主键的一部分,则跳过
|
461
|
+
if set(unique_cols).issubset(set(primary_keys)):
|
462
|
+
logger.warning('跳过与主键冲突的唯一约束', {
|
463
|
+
'库': db_name,
|
464
|
+
'表': table_name,
|
465
|
+
'唯一约束': unique_cols,
|
466
|
+
'主键': primary_keys
|
467
|
+
})
|
468
|
+
continue
|
444
469
|
safe_unique_cols = [_index_col_sql(col) for col in unique_cols]
|
445
470
|
unique_name = f"uniq_{'_'.join([self._normalize_col(c) for c in unique_cols])}"
|
446
471
|
unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_unique_cols)})")
|
472
|
+
|
447
473
|
index_defs = list(set(index_defs))
|
448
474
|
all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
|
449
475
|
sql = f"""
|
@@ -1437,11 +1463,22 @@ class MySQLUploader:
|
|
1437
1463
|
try:
|
1438
1464
|
cursor.executemany(sql, values_list)
|
1439
1465
|
conn.commit()
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1466
|
+
# 在batch模式下,affected_rows表示实际影响的行数
|
1467
|
+
# 如果update_on_duplicate为True,则affected_rows包含更新的行数
|
1468
|
+
# 如果update_on_duplicate为False,则affected_rows只包含插入的行数
|
1469
|
+
affected = cursor.rowcount if cursor.rowcount is not None else 0
|
1470
|
+
if update_on_duplicate:
|
1471
|
+
# 当启用更新时,affected_rows包含插入和更新的行数
|
1472
|
+
# 我们需要区分插入和更新的行数
|
1473
|
+
# 由于无法准确区分,我们假设所有行都是插入的
|
1474
|
+
total_inserted += len(batch)
|
1475
|
+
else:
|
1476
|
+
# 当不启用更新时,affected_rows只包含插入的行数
|
1477
|
+
total_inserted += affected
|
1478
|
+
total_skipped += len(batch) - affected
|
1443
1479
|
except pymysql.err.IntegrityError as e:
|
1444
1480
|
conn.rollback()
|
1481
|
+
# 在唯一约束冲突时,所有行都被跳过
|
1445
1482
|
total_skipped += len(batch)
|
1446
1483
|
logger.debug('批量插入唯一约束冲突,全部跳过', {'库': db_name, '表': table_name, '错误': str(e)})
|
1447
1484
|
except Exception as e:
|
@@ -1460,10 +1497,16 @@ class MySQLUploader:
|
|
1460
1497
|
values += [row.get(col) for col in dup_cols]
|
1461
1498
|
cursor.execute(sql, values)
|
1462
1499
|
affected = cursor.rowcount if cursor.rowcount is not None else 0
|
1463
|
-
if
|
1500
|
+
if update_on_duplicate:
|
1501
|
+
# 当启用更新时,affected_rows包含插入和更新的行数
|
1502
|
+
# 假设所有行都是插入的,因为无法区分插入和更新
|
1464
1503
|
total_inserted += 1
|
1465
1504
|
else:
|
1466
|
-
|
1505
|
+
# 当不启用更新时,affected_rows只包含插入的行数
|
1506
|
+
if affected > 0:
|
1507
|
+
total_inserted += 1
|
1508
|
+
else:
|
1509
|
+
total_skipped += 1
|
1467
1510
|
except pymysql.err.IntegrityError as e:
|
1468
1511
|
conn.rollback()
|
1469
1512
|
total_skipped += 1
|
@@ -1482,10 +1525,16 @@ class MySQLUploader:
|
|
1482
1525
|
values += [row.get(col) for col in dup_cols]
|
1483
1526
|
cursor.execute(sql, values)
|
1484
1527
|
affected = cursor.rowcount if cursor.rowcount is not None else 0
|
1485
|
-
if
|
1528
|
+
if update_on_duplicate:
|
1529
|
+
# 当启用更新时,affected_rows包含插入和更新的行数
|
1530
|
+
# 假设所有行都是插入的,因为无法区分插入和更新
|
1486
1531
|
total_inserted += 1
|
1487
1532
|
else:
|
1488
|
-
|
1533
|
+
# 当不启用更新时,affected_rows只包含插入的行数
|
1534
|
+
if affected > 0:
|
1535
|
+
total_inserted += 1
|
1536
|
+
else:
|
1537
|
+
total_skipped += 1
|
1489
1538
|
conn.commit()
|
1490
1539
|
except pymysql.err.IntegrityError as e:
|
1491
1540
|
conn.rollback()
|
@@ -1,7 +1,7 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=DkNrzYKkaEbmtMXcPR8H0qa5y2WJne8kPSNBfTO6mFo,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/query_data.py,sha256=
|
4
|
+
mdbq/aggregation/query_data.py,sha256=NdhsLJvt6NgZSMHpkMxnmEEXzDhUiR5tRwYwI-PfwIw,166732
|
5
5
|
mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
6
6
|
mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
|
7
7
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
@@ -10,9 +10,9 @@ mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,16
|
|
10
10
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
11
11
|
mdbq/mysql/deduplicator.py,sha256=8v3MC6TJ0YEiExWrTP9OXAxTYnL9XbpYL2vWaER1h2M,73099
|
12
12
|
mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
|
13
|
-
mdbq/mysql/s_query.py,sha256=
|
13
|
+
mdbq/mysql/s_query.py,sha256=jGBdGPE4mtB06vccfaWIEWpSAqdY-nWc1s9bzCUh8Gg,42916
|
14
14
|
mdbq/mysql/unique_.py,sha256=Wgqq_PjAAD757JTa10wjYaJgssZ_C_ypU6DW56jbuyw,21074
|
15
|
-
mdbq/mysql/uploader.py,sha256=
|
15
|
+
mdbq/mysql/uploader.py,sha256=wNQE7UjCEyAKri9CnQXO7d6EVXCaYqFze2i2tcGAVpw,81001
|
16
16
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
17
17
|
mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
|
18
18
|
mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=vpBuNc22uj9Vr-_Dh25_wpwWM1e-072EAAIBdB_IpL0,23494
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=hPRzLQvFIF4ibN8aP3Dg_ru5meac90faPyzOB22cj-o,20965
|
28
|
-
mdbq-4.0.
|
29
|
-
mdbq-4.0.8.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
-
mdbq-4.0.8.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
-
mdbq-4.0.8.dist-info/RECORD,,
|
28
|
+
mdbq-4.0.10.dist-info/METADATA,sha256=AJXVA5kCyLJQiObIW13jNvJAWsXdXJRFN2xCCSdDO78,364
|
29
|
+
mdbq-4.0.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-4.0.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-4.0.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|