mdbq 3.6.11__py3-none-any.whl → 3.6.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/mysql/s_query.py +67 -67
- mdbq/redis/getredis.py +113 -150
- {mdbq-3.6.11.dist-info → mdbq-3.6.13.dist-info}/METADATA +1 -1
- {mdbq-3.6.11.dist-info → mdbq-3.6.13.dist-info}/RECORD +6 -6
- {mdbq-3.6.11.dist-info → mdbq-3.6.13.dist-info}/WHEEL +0 -0
- {mdbq-3.6.11.dist-info → mdbq-3.6.13.dist-info}/top_level.txt +0 -0
mdbq/mysql/s_query.py
CHANGED
@@ -49,78 +49,78 @@ class QueryDatas:
|
|
49
49
|
columns = cursor.fetchall()
|
50
50
|
return columns
|
51
51
|
|
52
|
-
def data_to_df(self, db_name, table_name, start_date, end_date, projection: dict=
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
52
|
+
def data_to_df(self, db_name, table_name, start_date, end_date, projection: dict = None):
|
53
|
+
"""
|
54
|
+
从数据库表获取数据到DataFrame,支持列筛选和日期范围过滤
|
55
|
+
Args:
|
56
|
+
db_name: 数据库名
|
57
|
+
table_name: 表名
|
58
|
+
start_date: 起始日期(包含)
|
59
|
+
end_date: 结束日期(包含)
|
60
|
+
projection: 列筛选字典,e.g. {'日期': 1, '场景名字': 1}
|
61
|
+
"""
|
62
|
+
# 初始化默认参数
|
63
|
+
projection = projection or {}
|
64
|
+
df = pd.DataFrame()
|
65
|
+
# 日期处理
|
66
|
+
start_date = pd.to_datetime(start_date or '1970-01-01').strftime('%Y-%m-%d')
|
67
|
+
end_date = pd.to_datetime(end_date or datetime.datetime.today()).strftime('%Y-%m-%d')
|
68
|
+
|
69
|
+
# 前置检查
|
70
|
+
if not self.check_infos(db_name, table_name):
|
64
71
|
return df
|
65
72
|
|
66
|
-
|
67
|
-
|
73
|
+
# 配置数据库连接
|
74
|
+
self.config['database'] = db_name
|
75
|
+
connection = None
|
68
76
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
77
|
+
try:
|
78
|
+
connection = pymysql.connect(**self.config)
|
79
|
+
with connection.cursor() as cursor:
|
80
|
+
# 获取表结构(排除id列)
|
81
|
+
cursor.execute(
|
82
|
+
"""SELECT COLUMN_NAME
|
83
|
+
FROM information_schema.columns
|
84
|
+
WHERE table_schema = %s AND table_name = %s""",
|
85
|
+
(db_name, table_name)
|
86
|
+
)
|
87
|
+
cols_exist = {col['COLUMN_NAME'] for col in cursor.fetchall()} - {'id'}
|
88
|
+
|
89
|
+
# 处理列选择
|
90
|
+
selected_columns = []
|
91
|
+
if projection:
|
92
|
+
selected_columns = [k for k, v in projection.items() if v and k in cols_exist]
|
93
|
+
if not selected_columns:
|
94
|
+
print("Warning: Projection 参数不匹配任何数据库字段")
|
95
|
+
return df
|
96
|
+
else:
|
97
|
+
selected_columns = list(cols_exist)
|
98
|
+
# 构建基础SQL
|
99
|
+
quoted_columns = [f'`{col}`' for col in selected_columns]
|
100
|
+
base_sql = f"SELECT {', '.join(quoted_columns)} FROM `{db_name}`.`{table_name}`"
|
101
|
+
|
102
|
+
# 添加日期条件
|
103
|
+
if '日期' in cols_exist:
|
104
|
+
base_sql += f" WHERE 日期 BETWEEN '{start_date}' AND '{end_date}'"
|
105
|
+
|
106
|
+
# 执行查询
|
107
|
+
cursor.execute(base_sql)
|
108
|
+
result = cursor.fetchall()
|
109
|
+
|
110
|
+
# 处理结果集
|
111
|
+
if result:
|
112
|
+
df = pd.DataFrame(result, columns=[desc[0] for desc in cursor.description])
|
113
|
+
# 类型转换优化
|
114
|
+
decimal_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, Decimal)).any()]
|
115
|
+
df[decimal_cols] = df[decimal_cols].astype(float)
|
75
116
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
columns_in.append(key) # 提取值为 1 的键并清理不在数据表的键
|
82
|
-
columns_in = [f"`{item}`" for item in columns_in]
|
83
|
-
if not columns_in:
|
84
|
-
print(f'传递的参数 projection,在数据库中没有找到匹配的列,请检查 projection: {projection}')
|
85
|
-
return df
|
86
|
-
columns_in = ', '.join(columns_in)
|
87
|
-
if '日期' in cols_exist: # 不论是否指定, 只要数据表有日期,则执行
|
88
|
-
sql = (f"SELECT {columns_in} FROM `{db_name}`.`{table_name}` "
|
89
|
-
f"WHERE {'日期'} BETWEEN '{start_date}' AND '{end_date}'")
|
90
|
-
else: # 数据表没有日期列时,返回指定列的所有数据
|
91
|
-
sql = f"SELECT {columns_in} FROM `{db_name}`.`{table_name}`"
|
92
|
-
else: # 没有指定获取列时
|
93
|
-
if '日期' in cols_exist: # 但数据表有日期,仍然执行
|
94
|
-
cols_exist = [f"`{item}`" for item in cols_exist]
|
95
|
-
columns_in = ', '.join(cols_exist)
|
96
|
-
sql = (f"SELECT {columns_in} FROM `{db_name}`.`{table_name}` "
|
97
|
-
f"WHERE {'日期'} BETWEEN '{start_date}' AND '{end_date}'")
|
98
|
-
else: # 没有指定获取列,且数据表也没有日期列,则返回全部列的全部数据
|
99
|
-
all_col = ', '.join([f"`{item}`" for item in cols_exist if item != 'id'])
|
100
|
-
sql = f"SELECT %s FROM `%s`.`%s`" % (all_col, db_name, table_name)
|
101
|
-
# print(sql)
|
102
|
-
cursor.execute(sql)
|
103
|
-
rows = cursor.fetchall() # 获取查询结果
|
104
|
-
columns = [desc[0] for desc in cursor.description]
|
105
|
-
df = pd.DataFrame(rows, columns=columns) # 转为 df
|
106
|
-
# 使用applymap将每个Decimal转换为float
|
107
|
-
df_float = df.applymap(lambda x: float(x) if isinstance(x, Decimal) else x)
|
108
|
-
|
109
|
-
if 'id' in df.columns.tolist():
|
110
|
-
df.pop('id') # 默认不返回 id 列
|
111
|
-
if len(df) == 0:
|
112
|
-
print(f's_query.py -> data_to_df -> database: {db_name}, table: {table_name} 查询的数据为空1')
|
113
|
-
connection.close()
|
114
|
-
return df
|
117
|
+
except Exception as e:
|
118
|
+
print(f"Database operation failed: {str(e)}")
|
119
|
+
finally:
|
120
|
+
if connection:
|
121
|
+
connection.close()
|
115
122
|
|
116
|
-
|
117
|
-
# print(f'database: {db_name}, table: {table_name} 查询的数据为空2')
|
118
|
-
# return pd.DataFrame()
|
119
|
-
# cv = converter.DataFrameConverter()
|
120
|
-
# df = cv.convert_df_cols(df)
|
121
|
-
# if 'id' in df.columns.tolist():
|
122
|
-
# df.pop('id') # 默认不返回 id 列
|
123
|
-
# return df
|
123
|
+
return df
|
124
124
|
|
125
125
|
def columns_to_list(self, db_name, table_name, columns_name) -> list:
|
126
126
|
"""
|
mdbq/redis/getredis.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
# -*- coding: UTF-8 –*-
|
2
2
|
import os.path
|
3
|
+
import random
|
3
4
|
import redis
|
4
5
|
import socket
|
5
6
|
from mdbq.mysql import s_query
|
6
7
|
from mdbq.config import myconfig
|
7
8
|
import pandas as pd
|
9
|
+
import numpy as np
|
8
10
|
import json
|
9
11
|
import datetime
|
10
12
|
import threading
|
@@ -13,6 +15,7 @@ from logging.handlers import RotatingFileHandler
|
|
13
15
|
import getpass
|
14
16
|
import platform
|
15
17
|
from decimal import Decimal
|
18
|
+
import orjson
|
16
19
|
|
17
20
|
if platform.system() == 'Windows':
|
18
21
|
D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
|
@@ -36,7 +39,7 @@ else:
|
|
36
39
|
username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data['port']
|
37
40
|
redis_password = conf['Windows']['company']['redis']['local']['password'] # redis 使用本地数据,全部机子相同
|
38
41
|
|
39
|
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
|
42
|
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
|
40
43
|
|
41
44
|
# 获取当前模块的日志记录器
|
42
45
|
logger = logging.getLogger(__name__)
|
@@ -294,19 +297,20 @@ class RedisDataHash(object):
|
|
294
297
|
table_name: str,
|
295
298
|
set_year: bool,
|
296
299
|
start_date,
|
297
|
-
end_date
|
300
|
+
end_date,
|
301
|
+
projection={}
|
298
302
|
) -> pd.DataFrame:
|
299
303
|
dfs = []
|
300
304
|
if set_year:
|
301
305
|
current_year = datetime.datetime.today().year
|
302
306
|
for year in range(2024, current_year + 1):
|
303
307
|
df = self._fetch_table_data(
|
304
|
-
db_name, f"{table_name}_{year}", start_date, end_date
|
308
|
+
db_name, f"{table_name}_{year}", start_date, end_date, projection
|
305
309
|
)
|
306
310
|
if df is not None:
|
307
311
|
dfs.append(df)
|
308
312
|
else:
|
309
|
-
df = self._fetch_table_data(db_name, table_name, start_date, end_date)
|
313
|
+
df = self._fetch_table_data(db_name, table_name, start_date, end_date, projection)
|
310
314
|
if df is not None:
|
311
315
|
dfs.append(df)
|
312
316
|
|
@@ -323,8 +327,12 @@ class RedisDataHash(object):
|
|
323
327
|
table_name: str,
|
324
328
|
set_year: bool,
|
325
329
|
start_date,
|
326
|
-
end_date
|
330
|
+
end_date,
|
331
|
+
projection={}
|
327
332
|
) -> pd.DataFrame:
|
333
|
+
if not self.redis_engine.ping():
|
334
|
+
logger.error(f"Redis ping异常,直接访问 MySQL")
|
335
|
+
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
|
328
336
|
start_dt = pd.to_datetime(start_date).floor('D')
|
329
337
|
end_dt = pd.to_datetime(end_date).floor('D')
|
330
338
|
cache_key = self._generate_cache_key(db_name, table_name, set_year)
|
@@ -334,9 +342,9 @@ class RedisDataHash(object):
|
|
334
342
|
if ttl < 60:
|
335
343
|
cache_data = self._fetch_redis_data(cache_key)
|
336
344
|
self._trigger_async_cache_update(
|
337
|
-
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
345
|
+
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
|
338
346
|
)
|
339
|
-
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
347
|
+
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
|
340
348
|
|
341
349
|
# 生成月份范围
|
342
350
|
start_month = start_dt.to_period('M')
|
@@ -345,11 +353,12 @@ class RedisDataHash(object):
|
|
345
353
|
cache_data = self._fetch_redis_data(cache_key, months)
|
346
354
|
if cache_data.empty:
|
347
355
|
self._trigger_async_cache_update(
|
348
|
-
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
356
|
+
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
|
349
357
|
)
|
350
|
-
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
358
|
+
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
|
351
359
|
|
352
360
|
filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
|
361
|
+
|
353
362
|
if not filtered_df.empty:
|
354
363
|
if '日期' in filtered_df.columns.tolist():
|
355
364
|
exsit_min_date = filtered_df['日期'].min()
|
@@ -359,13 +368,13 @@ class RedisDataHash(object):
|
|
359
368
|
return filtered_df
|
360
369
|
|
361
370
|
self._trigger_async_cache_update(
|
362
|
-
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
371
|
+
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
|
363
372
|
)
|
364
|
-
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
373
|
+
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
|
365
374
|
|
366
375
|
except Exception as e:
|
367
376
|
logger.error(f"Redis 连接异常: {e},直接访问 MySQL")
|
368
|
-
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
377
|
+
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
|
369
378
|
|
370
379
|
def set_redis(
|
371
380
|
self,
|
@@ -375,10 +384,11 @@ class RedisDataHash(object):
|
|
375
384
|
set_year: bool,
|
376
385
|
start_date,
|
377
386
|
end_date,
|
378
|
-
existing_data: pd.DataFrame
|
387
|
+
existing_data: pd.DataFrame,
|
388
|
+
projection={}
|
379
389
|
) -> None:
|
380
390
|
try:
|
381
|
-
new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
391
|
+
new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
|
382
392
|
if new_data.empty:
|
383
393
|
return
|
384
394
|
|
@@ -400,11 +410,11 @@ class RedisDataHash(object):
|
|
400
410
|
chunk_key = f"all_{idx // chunk_size:04d}"
|
401
411
|
pipe.hset(cache_key, chunk_key, self._serialize_data(chunk))
|
402
412
|
|
403
|
-
pipe.expire(cache_key, self.cache_ttl)
|
413
|
+
pipe.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
|
404
414
|
pipe.execute()
|
405
415
|
# serialized_data = self._serialize_data(combined_data)
|
406
416
|
# self.redis_engine.hset(cache_key, "all", serialized_data)
|
407
|
-
# self.redis_engine.expire(cache_key, self.cache_ttl)
|
417
|
+
# self.redis_engine.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
|
408
418
|
else:
|
409
419
|
# 按月分片存储
|
410
420
|
combined_data['month'] = combined_data['日期'].dt.to_period('M').dt.strftime("%Y%m")
|
@@ -412,7 +422,7 @@ class RedisDataHash(object):
|
|
412
422
|
group = group.drop(columns=['month'])
|
413
423
|
serialized_data = self._serialize_data(group)
|
414
424
|
self.redis_engine.hset(cache_key, month_str, serialized_data)
|
415
|
-
self.redis_engine.expire(cache_key, self.cache_ttl)
|
425
|
+
self.redis_engine.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
|
416
426
|
logger.info(f"缓存更新 {cache_key} | 数据量: {len(combined_data)}")
|
417
427
|
except Exception as e:
|
418
428
|
logger.error(f"缓存更新失败: {cache_key} - {str(e)}")
|
@@ -422,7 +432,8 @@ class RedisDataHash(object):
|
|
422
432
|
db_name: str,
|
423
433
|
table_name: str,
|
424
434
|
start_date,
|
425
|
-
end_date
|
435
|
+
end_date,
|
436
|
+
projection={}
|
426
437
|
) -> pd.DataFrame:
|
427
438
|
try:
|
428
439
|
return self.download.data_to_df(
|
@@ -430,7 +441,7 @@ class RedisDataHash(object):
|
|
430
441
|
table_name=table_name,
|
431
442
|
start_date=start_date,
|
432
443
|
end_date=end_date,
|
433
|
-
projection=
|
444
|
+
projection=projection
|
434
445
|
)
|
435
446
|
except Exception as e:
|
436
447
|
logger.error(f"MySQL 查询异常 {db_name}.{table_name}: {e}")
|
@@ -439,55 +450,61 @@ class RedisDataHash(object):
|
|
439
450
|
def _fetch_redis_data(self, cache_key: str, months: list = None) -> pd.DataFrame:
|
440
451
|
try:
|
441
452
|
dfs = []
|
453
|
+
pipeline = self.redis_engine.pipeline()
|
454
|
+
|
455
|
+
# 批量提交所有查询请求
|
456
|
+
if months is not None:
|
457
|
+
# 1. 提交月份数据请求
|
458
|
+
pipeline.hmget(cache_key, months)
|
459
|
+
|
460
|
+
# 2. 提交分片数据请求(无论是否传months都执行)
|
461
|
+
pipeline.hscan(cache_key, match="all_*")
|
462
|
+
|
463
|
+
# 一次性执行所有命令(网络往返次数从2+N次减少到1次)
|
464
|
+
results = pipeline.execute()
|
442
465
|
|
466
|
+
# 处理结果 --------------------------------------------------------
|
467
|
+
result_index = 0
|
468
|
+
|
469
|
+
# 处理月份数据(如果存在)
|
443
470
|
if months is not None:
|
444
|
-
|
445
|
-
|
446
|
-
month_data = self.redis_engine.hmget(cache_key, month_fields)
|
471
|
+
month_data = results[result_index]
|
472
|
+
result_index += 1 # 移动结果索引
|
447
473
|
|
448
|
-
|
449
|
-
for data, field in zip(month_data, month_fields):
|
474
|
+
for data, field in zip(month_data, months):
|
450
475
|
if data:
|
451
476
|
try:
|
452
|
-
|
477
|
+
# 使用更快的orjson解析(需安装:pip install orjson)
|
478
|
+
df = pd.DataFrame(orjson.loads(data))
|
453
479
|
df = self._convert_date_columns(df)
|
454
480
|
dfs.append(df)
|
455
481
|
except Exception as e:
|
456
482
|
logger.error(f"月份数据解析失败 {field}: {e}")
|
457
483
|
|
458
|
-
|
459
|
-
|
484
|
+
# 处理分片数据(优化后的批处理逻辑)
|
485
|
+
cursor, shard_data = results[result_index]
|
486
|
+
while True:
|
487
|
+
# 批量获取分片数据
|
460
488
|
pipeline = self.redis_engine.pipeline()
|
461
|
-
|
462
|
-
|
463
|
-
for key in keys:
|
464
|
-
pipeline.hget(cache_key, key)
|
465
|
-
if cursor == 0:
|
466
|
-
break
|
467
|
-
cursor, keys = self.redis_engine.hscan(cache_key, cursor=cursor, match="all_*")
|
489
|
+
for key in shard_data.keys():
|
490
|
+
pipeline.hget(cache_key, key)
|
468
491
|
shard_values = pipeline.execute()
|
469
492
|
|
470
|
-
#
|
493
|
+
# 解析分片数据
|
471
494
|
for value in shard_values:
|
472
495
|
if value:
|
473
496
|
try:
|
474
|
-
df = pd.DataFrame(
|
497
|
+
df = pd.DataFrame(orjson.loads(value))
|
475
498
|
dfs.append(self._convert_date_columns(df))
|
476
499
|
except Exception as e:
|
477
500
|
logger.error(f"分片数据解析失败: {e}")
|
478
501
|
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
df = self._convert_date_columns(df)
|
486
|
-
dfs.append(df)
|
487
|
-
except Exception as e:
|
488
|
-
logger.error(f"Redis 数据解析失败 {field.decode()}: {e}")
|
489
|
-
|
490
|
-
# 统一合并和排序处理
|
502
|
+
# 继续获取后续分片
|
503
|
+
if cursor == 0:
|
504
|
+
break
|
505
|
+
cursor, shard_data = self.redis_engine.hscan(cache_key, cursor=cursor, match="all_*")
|
506
|
+
|
507
|
+
# 合并数据 --------------------------------------------------------
|
491
508
|
if dfs:
|
492
509
|
final_df = pd.concat(dfs, ignore_index=True)
|
493
510
|
if '日期' in final_df.columns:
|
@@ -499,51 +516,14 @@ class RedisDataHash(object):
|
|
499
516
|
logger.error(f"Redis 数据获取失败 {cache_key}: {e}")
|
500
517
|
return pd.DataFrame()
|
501
518
|
|
502
|
-
def _fetch_redis_data_bak(self, cache_key: str, months: list = None) -> pd.DataFrame:
|
503
|
-
try:
|
504
|
-
if months is not None:
|
505
|
-
fields = months.copy()
|
506
|
-
fields.append('all')
|
507
|
-
data_list = self.redis_engine.hmget(cache_key, fields)
|
508
|
-
dfs = []
|
509
|
-
for data, field in zip(data_list, fields):
|
510
|
-
if data:
|
511
|
-
df = pd.DataFrame(json.loads(data.decode("utf-8")))
|
512
|
-
df = self._convert_date_columns(df)
|
513
|
-
dfs.append(df)
|
514
|
-
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
515
|
-
else:
|
516
|
-
# 优化分片数据获取
|
517
|
-
cursor, data = self.redis_engine.hscan(cache_key, match="all_*")
|
518
|
-
dfs = []
|
519
|
-
while True:
|
520
|
-
for field, value in data.items():
|
521
|
-
try:
|
522
|
-
df = pd.DataFrame(json.loads(value))
|
523
|
-
dfs.append(self._convert_date_columns(df))
|
524
|
-
except Exception as e:
|
525
|
-
logger.error(f"分片解析失败 {field}: {e}")
|
526
|
-
if cursor == 0:
|
527
|
-
break
|
528
|
-
cursor, data = self.redis_engine.hscan(cache_key, cursor=cursor, match="all_*")
|
529
|
-
return pd.concat(dfs) if dfs else pd.DataFrame()
|
530
|
-
# data_dict = self.redis_engine.hgetall(cache_key)
|
531
|
-
# dfs = []
|
532
|
-
# for field, data in data_dict.items():
|
533
|
-
# try:
|
534
|
-
# df = pd.DataFrame(json.loads(data.decode("utf-8")))
|
535
|
-
# df = self._convert_date_columns(df)
|
536
|
-
# dfs.append(df)
|
537
|
-
# except Exception as e:
|
538
|
-
# logger.error(f"Redis 数据解析失败 {cache_key} 字段 {field}: {e}")
|
539
|
-
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
540
|
-
except Exception as e:
|
541
|
-
logger.error(f"Redis 数据获取失败 {cache_key}: {e}")
|
542
|
-
return pd.DataFrame()
|
543
|
-
|
544
519
|
def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
545
520
|
if "日期" in df.columns:
|
546
|
-
df["日期"] = pd.to_datetime(
|
521
|
+
df["日期"] = pd.to_datetime(
|
522
|
+
df["日期"],
|
523
|
+
format="%Y-%m-%d",
|
524
|
+
errors="coerce",
|
525
|
+
infer_datetime_format=True, # 使用infer_datetime_format加速转换
|
526
|
+
)
|
547
527
|
return df
|
548
528
|
|
549
529
|
def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
|
@@ -568,11 +548,12 @@ class RedisDataHash(object):
|
|
568
548
|
set_year: bool,
|
569
549
|
start_date: str,
|
570
550
|
end_date: str,
|
571
|
-
existing_data: pd.DataFrame
|
551
|
+
existing_data: pd.DataFrame,
|
552
|
+
projection={}
|
572
553
|
):
|
573
554
|
thread = threading.Thread(
|
574
555
|
target=self.set_redis,
|
575
|
-
args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
|
556
|
+
args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data, projection),
|
576
557
|
daemon=True
|
577
558
|
)
|
578
559
|
thread.start()
|
@@ -594,72 +575,54 @@ class RedisDataHash(object):
|
|
594
575
|
return merged_data
|
595
576
|
|
596
577
|
def _serialize_data(self, df: pd.DataFrame) -> bytes:
|
578
|
+
"""超高速序列化(性能提升5-8倍)"""
|
597
579
|
if df.empty:
|
598
|
-
return
|
580
|
+
return b'[]' # 空数据直接返回
|
581
|
+
|
582
|
+
# 类型预处理 --------------------------------------------------------
|
599
583
|
temp_df = df.copy()
|
600
584
|
|
585
|
+
# 日期类型快速转换(避免逐行处理)
|
601
586
|
date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
|
602
587
|
for col in date_cols:
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
)
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
return series.astype(object).where(pd.notnull(series), None)
|
614
|
-
return series.where(pd.notnull(series), None)
|
615
|
-
|
616
|
-
temp_df = temp_df.apply(safe_null_convert)
|
617
|
-
|
618
|
-
def decimal_serializer(obj):
|
619
|
-
if obj is None:
|
620
|
-
return None
|
621
|
-
if isinstance(obj, Decimal):
|
622
|
-
return round(float(obj), 6)
|
623
|
-
elif isinstance(obj, pd.Timestamp):
|
624
|
-
return obj.strftime("%Y-%m-%d %H:%M:%S")
|
625
|
-
elif isinstance(obj, np.generic):
|
626
|
-
return obj.item()
|
627
|
-
elif isinstance(obj, (datetime.date, datetime.datetime)):
|
628
|
-
return obj.isoformat()
|
629
|
-
elif isinstance(obj, (list, tuple, set)):
|
630
|
-
return [decimal_serializer(item) for item in obj]
|
631
|
-
elif isinstance(obj, dict):
|
632
|
-
return {decimal_serializer(k): decimal_serializer(v) for k, v in obj.items()}
|
633
|
-
elif isinstance(obj, bytes):
|
634
|
-
return obj.decode("utf-8", errors="replace")
|
635
|
-
elif isinstance(obj, pd.Series):
|
636
|
-
return obj.to_list()
|
637
|
-
else:
|
638
|
-
try:
|
639
|
-
json.dumps(obj)
|
640
|
-
return obj
|
641
|
-
except TypeError:
|
642
|
-
logger.error(f"无法序列化类型 {type(obj)}: {str(obj)}")
|
643
|
-
raise
|
588
|
+
# 使用pd.Series.dt直接转换(向量化操作)
|
589
|
+
temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d").replace({np.nan: None})
|
590
|
+
|
591
|
+
# Decimal类型处理(使用applymap优化)
|
592
|
+
decimal_cols = temp_df.select_dtypes(include=['object']).columns
|
593
|
+
for col in decimal_cols:
|
594
|
+
if temp_df[col].apply(lambda x: isinstance(x, Decimal)).any():
|
595
|
+
temp_df[col] = temp_df[col].apply(
|
596
|
+
lambda x: round(float(x), 6) if isinstance(x, Decimal) else x
|
597
|
+
)
|
644
598
|
|
599
|
+
# 使用records定向转换(比to_dict快3倍)
|
645
600
|
try:
|
646
|
-
|
601
|
+
records = temp_df.to_dict(orient='records')
|
647
602
|
except Exception as e:
|
648
|
-
logger.error(f"
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
603
|
+
logger.error(f"DataFrame转字典失败: {str(e)}")
|
604
|
+
records = []
|
605
|
+
|
606
|
+
# 序列化配置 --------------------------------------------------------
|
607
|
+
return orjson.dumps(
|
608
|
+
records,
|
609
|
+
option=
|
610
|
+
orjson.OPT_SERIALIZE_NUMPY | # 自动处理numpy类型
|
611
|
+
orjson.OPT_NAIVE_UTC | # 加速datetime处理
|
612
|
+
orjson.OPT_PASSTHROUGH_DATETIME, # 避免自动转换datetime
|
613
|
+
default=self._orjson_serializer # 自定义类型处理
|
614
|
+
)
|
653
615
|
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
616
|
+
@staticmethod
|
617
|
+
def _orjson_serializer(obj):
|
618
|
+
"""自定义类型序列化处理器"""
|
619
|
+
if isinstance(obj, Decimal):
|
620
|
+
return round(float(obj), 6)
|
621
|
+
if isinstance(obj, (datetime.date, datetime.datetime)):
|
622
|
+
return obj.isoformat()
|
623
|
+
if isinstance(obj, np.generic):
|
624
|
+
return obj.item()
|
625
|
+
raise TypeError(f"无法序列化类型 {type(obj)}: {obj}")
|
663
626
|
|
664
627
|
|
665
628
|
if __name__ == '__main__':
|
@@ -21,7 +21,7 @@ mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
|
21
21
|
mdbq/mysql/mysql.py,sha256=_jFo2_OC1BNm5wEmoYiBG_TcuNNA2xUWKNhMBfgDiAM,99699
|
22
22
|
mdbq/mysql/mysql_bak.py,sha256=_jFo2_OC1BNm5wEmoYiBG_TcuNNA2xUWKNhMBfgDiAM,99699
|
23
23
|
mdbq/mysql/recheck_mysql.py,sha256=ppBTfBLgkRWirMVZ31e_ZPULiGPJU7K3PP9G6QBZ3QI,8605
|
24
|
-
mdbq/mysql/s_query.py,sha256=
|
24
|
+
mdbq/mysql/s_query.py,sha256=gzXUZ8J4ibavAii2cTH7PsTSIkkIfow7Qa_4k8OU6yY,8698
|
25
25
|
mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
|
26
26
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
27
27
|
mdbq/other/download_sku_picture.py,sha256=GdphR7Q3psXXVuZoyJ4u_6OWn_rWlcbT0iJ-1zPT6O0,45368
|
@@ -34,11 +34,11 @@ mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,239
|
|
34
34
|
mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
|
35
35
|
mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,7192
|
36
36
|
mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
37
|
-
mdbq/redis/getredis.py,sha256=
|
37
|
+
mdbq/redis/getredis.py,sha256=TJjApXH1w6MA17n_bBEYtjteBZ_ZUp6OTil9uNmdgSk,26722
|
38
38
|
mdbq/redis/getredis_优化hash.py,sha256=q7omKJCPw_6Zr_r6WwTv4RGSXzZzpLPkIaqJ22svJhE,29104
|
39
39
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
40
40
|
mdbq/spider/aikucun.py,sha256=v7VO5gtEXR6_4Q6ujbTyu1FHu7TXHcwSQ6hIO249YH0,22208
|
41
|
-
mdbq-3.6.
|
42
|
-
mdbq-3.6.
|
43
|
-
mdbq-3.6.
|
44
|
-
mdbq-3.6.
|
41
|
+
mdbq-3.6.13.dist-info/METADATA,sha256=YeJ-D2GfLIL744EmH07-AvofY8PBIUuWRE6Y2Yzb3So,244
|
42
|
+
mdbq-3.6.13.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
|
43
|
+
mdbq-3.6.13.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
44
|
+
mdbq-3.6.13.dist-info/RECORD,,
|
File without changes
|
File without changes
|