mdbq 3.7.5-py3-none-any.whl → 3.7.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__init__.py +1 -0
- mdbq/__version__.py +3 -0
- mdbq/aggregation/__init__.py +4 -0
- mdbq/aggregation/aggregation_bak.py +1438 -0
- mdbq/aggregation/datashow_bak.py +1264 -0
- mdbq/aggregation/optimize_data.py +76 -0
- mdbq/aggregation/query_data.py +3869 -0
- mdbq/bdup/__init__.py +5 -0
- mdbq/bdup/bdup.py +111 -0
- mdbq/config/__init__.py +4 -0
- mdbq/config/default.py +131 -0
- mdbq/config/myconfig.py +32 -0
- mdbq/config/products.py +159 -0
- mdbq/config/set_support.py +25 -0
- mdbq/dataframe/__init__.py +4 -0
- mdbq/dataframe/converter.py +107 -0
- mdbq/log/__init__.py +4 -0
- mdbq/log/mylogger.py +66 -0
- mdbq/log/spider_logging.py +55 -0
- mdbq/mongo/__init__.py +4 -0
- mdbq/mongo/mongo.py +729 -0
- mdbq/mysql/__init__.py +4 -0
- mdbq/mysql/mysql.py +1784 -0
- mdbq/mysql/s_query.py +211 -0
- mdbq/mysql/year_month_day.py +38 -0
- mdbq/other/__init__.py +4 -0
- mdbq/other/download_sku_picture.py +985 -0
- mdbq/other/porxy.py +115 -0
- mdbq/other/pov_city.py +405 -0
- mdbq/other/sku_picture_bak.py +1081 -0
- mdbq/other/ua_sj.py +222 -0
- mdbq/pbix/__init__.py +4 -0
- mdbq/pbix/pbix_refresh.py +70 -0
- mdbq/pbix/refresh_all.py +158 -0
- mdbq/pbix/refresh_all_old.py +177 -0
- mdbq/redis/__init__.py +4 -0
- mdbq/redis/getredis.py +642 -0
- mdbq/spider/__init__.py +4 -0
- mdbq/spider/aikucun.py +494 -0
- {mdbq-3.7.5.dist-info → mdbq-3.7.6.dist-info}/METADATA +1 -1
- mdbq-3.7.6.dist-info/RECORD +43 -0
- mdbq-3.7.6.dist-info/top_level.txt +1 -0
- mdbq-3.7.5.dist-info/RECORD +0 -4
- mdbq-3.7.5.dist-info/top_level.txt +0 -1
- {mdbq-3.7.5.dist-info → mdbq-3.7.6.dist-info}/WHEEL +0 -0
mdbq/redis/getredis.py
ADDED
@@ -0,0 +1,642 @@
# -*- coding: UTF-8 -*-
import os.path
import random
import redis
import socket
from mdbq.mysql import s_query
from mdbq.config import default
import pandas as pd
import numpy as np
import json
import datetime
import threading
import logging
from logging.handlers import RotatingFileHandler
import getpass
import platform
from decimal import Decimal
import orjson

if platform.system() == 'Windows':
    D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
else:
    D_PATH = os.path.join(f'/Users/{getpass.getuser()}/Downloads')

m_engine, username, password, host, port = default.get_mysql_engine(platform='Windows', hostname='xigua_lx', sql='mysql', local='remoto', config_file=None)

# Get the logger for the current module
logger = logging.getLogger(__name__)

# Create a file handler that writes log records to a file
if not os.path.isdir(os.path.join(D_PATH, 'logfile')):
    os.makedirs(os.path.join(D_PATH, 'logfile'))
log_file = os.path.join(D_PATH, 'logfile', 'redis.log')
file_handler = RotatingFileHandler(log_file, maxBytes=3 * 1024 * 1024, backupCount=10, encoding='utf-8')  # keep 10 backup files
file_handler.setLevel(logging.INFO)  # log level for the file handler

# Create a formatter and assign it to the file handler
formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
file_handler.setFormatter(formatter)

# Attach the file handler to the logger
logger.addHandler(file_handler)


class RedisData(object):
    """
    Caches data as Redis strings.
    """
    def __init__(self, redis_engine, download, cache_ttl: int):
        self.redis_engine = redis_engine  # Redis client
        self.download = download  # MySQL query engine
        self.cache_ttl = cache_ttl * 60  # cache expiry, given in minutes, stored as seconds

    def get_from_mysql(
            self,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """
        Read data from MySQL and return a DataFrame

        Args:
            set_year: whether the table name carries a year suffix
        """
        dfs = []
        if set_year:
            current_year = datetime.datetime.today().year
            for year in range(2024, current_year + 1):
                df = self._fetch_table_data(
                    db_name, f"{table_name}_{year}", start_date, end_date
                )
                if df is not None:
                    dfs.append(df)
        else:
            df = self._fetch_table_data(db_name, table_name, start_date, end_date)
            if df is not None:
                dfs.append(df)

        combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
        if combined_df.empty:
            logger.info(f"Warning: no data read from {db_name}.{table_name}")
        else:
            combined_df = self._convert_date_columns(combined_df)
        return combined_df

    def get_from_redis(
            self,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """
        Fetch data from Redis; trigger an async update if the cache is expired or incomplete
        """
        start_dt = pd.to_datetime(start_date)
        end_dt = pd.to_datetime(end_date)
        cache_key = self._generate_cache_key(db_name, table_name, set_year)

        # Try to read the cache metadata
        try:
            ttl = self.redis_engine.ttl(cache_key)
            cache_data = self._fetch_redis_data(cache_key)
        except Exception as e:
            logger.error(f"Redis connection error: {e}, falling back to MySQL")
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

        # Cache-invalidation handling
        if ttl < 60 or cache_data.empty:
            self._trigger_async_cache_update(
                cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
            )
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

        # Serve valid cached data
        filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
        if not filtered_df.empty:
            return filtered_df

        # Cached data does not cover the requested range
        self._trigger_async_cache_update(
            cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
        )
        return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

    def set_redis(
            self,
            cache_key: str,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date,
            existing_data: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Asynchronously refresh the Redis cache, merging new and old data
        """
        try:
            # Fetch fresh data from MySQL
            new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
            if new_data.empty:
                return pd.DataFrame()

            # Merge with historical data
            combined_data = self._merge_data(new_data, existing_data)

            # Serialize and store in Redis
            serialized_data = self._serialize_data(combined_data)
            self.redis_engine.set(cache_key, serialized_data)
            self.redis_engine.expire(cache_key, self.cache_ttl)

            logger.info(f"Cache updated {cache_key} | rows: {len(combined_data)}")
            return combined_data

        except Exception as e:
            logger.error(f"Cache update failed: {cache_key} - {str(e)}")
            return pd.DataFrame()

    # Helper Methods ------------------------------------------------

    def _fetch_table_data(
            self,
            db_name: str,
            table_name: str,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """Wrap the MySQL fetch logic."""
        try:
            return self.download.data_to_df(
                db_name=db_name,
                table_name=table_name,
                start_date=start_date,
                end_date=end_date,
                projection={}
            )
        except Exception as e:
            logger.error(f"MySQL query error {db_name}.{table_name}: {e}")
            return pd.DataFrame()

    def _fetch_redis_data(self, cache_key: str) -> pd.DataFrame:
        """Fetch and parse data from Redis (date columns converted automatically)."""
        try:
            data = self.redis_engine.get(cache_key)
            if not data:
                return pd.DataFrame()
            # Deserialize the payload
            df = pd.DataFrame(json.loads(data.decode("utf-8")))
            return self._convert_date_columns(df)
        except Exception as e:
            logger.error(f"Failed to parse Redis data {cache_key}: {e}")
            return pd.DataFrame()

    def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize date-column conversion."""
        if "日期" in df.columns:
            df["日期"] = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
        return df

    def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
        """Build a normalized cache key."""
        return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"

    def _filter_by_date_range(
            self,
            df: pd.DataFrame,
            start_dt: datetime.datetime,
            end_dt: datetime.datetime
    ) -> pd.DataFrame:
        """Filter rows by date range."""
        if "日期" not in df.columns:
            return df
        date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
        return df[date_mask].copy()

    def _trigger_async_cache_update(
            self,
            cache_key: str,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date: str,
            end_date: str,
            existing_data: pd.DataFrame
    ):
        """Start the async cache-update thread."""
        thread = threading.Thread(
            target=self.set_redis,
            args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
            daemon=True
        )
        thread.start()

    def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
        """Merge the new and existing datasets."""
        if existing_data.empty or "日期" not in existing_data.columns:
            return new_data

        new_min = new_data["日期"].min()
        new_max = new_data["日期"].max()
        valid_historical = existing_data[
            (existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
        ]
        return pd.concat([new_data, valid_historical], ignore_index=True).drop_duplicates(subset=["日期"])
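    # Illustration: if the cached frame spans 2025-01-01..2025-01-31 and the
    # fresh MySQL frame spans 2025-01-15..2025-02-10, only the cached rows
    # outside the new range (2025-01-01..2025-01-14) survive as
    # valid_historical; drop_duplicates(subset=["日期"]) then keeps the first
    # row seen per date, so fresh rows win wherever the frames overlap.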
    def _serialize_data(self, df: pd.DataFrame) -> str:
        """Serialize the DataFrame, handling date types."""
        temp_df = df.copy()
        date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
        for col in date_cols:
            temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d")
        return temp_df.to_json(orient="records", force_ascii=False)

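A minimal usage sketch for RedisData, mirroring the commented example in the
__main__ block below (host, credentials and cache_ttl are illustrative
placeholders):

    r = redis.Redis(host='127.0.0.1', port=6379, db=0, password='...')
    d = s_query.QueryDatas(username=username, password=password, host=host, port=port)
    m = RedisData(redis_engine=r, download=d, cache_ttl=60)  # TTL in minutes
    df = m.get_from_redis(
        db_name='聚合数据',
        table_name='多店推广场景_按日聚合',
        set_year=False,
        start_date='2025-01-01',
        end_date='2025-01-31'
    )

Reads are served from the cached string when its TTL is healthy and it covers
the requested window; otherwise the call falls back to MySQL and refreshes the
cache on a daemon thread.
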
class RedisDataHash(object):
    """
    Caches data as Redis hashes.
    Combined Redis-cache / MySQL query handler.

    Features:
    - Supports MySQL tables sharded by year
    - Multi-level caching strategy (in-memory cache + Redis cache)
    - Asynchronous cache-update mechanism
    - Automatic date-range handling and data-type conversion
    """

    def __init__(self, redis_engine, download, cache_ttl: int):
        self.redis_engine = redis_engine
        self.download = download
        self.cache_ttl = cache_ttl * 60  # converted to seconds for storage

    def get_from_mysql(
            self,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date,
            projection={}
    ) -> pd.DataFrame:
        dfs = []
        if set_year:
            current_year = datetime.datetime.today().year
            for year in range(2024, current_year + 1):
                df = self._fetch_table_data(
                    db_name, f"{table_name}_{year}", start_date, end_date, projection
                )
                if df is not None:
                    dfs.append(df)
        else:
            df = self._fetch_table_data(db_name, table_name, start_date, end_date, projection)
            if df is not None:
                dfs.append(df)

        combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
        if combined_df.empty:
            logger.warning(f"Warning: no data read from {db_name}.{table_name}")
        else:
            combined_df = self._convert_date_columns(combined_df)
        return combined_df

    def get_from_redis(
            self,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date,
            projection={}
    ) -> pd.DataFrame:
        if not self.redis_engine.ping():
            logger.error("Redis ping failed, falling back to MySQL")
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
        start_dt = pd.to_datetime(start_date).floor('D')
        end_dt = pd.to_datetime(end_date).floor('D')
        cache_key = self._generate_cache_key(db_name, table_name, set_year)

        try:
            ttl = self.redis_engine.ttl(cache_key)
            if ttl < 60:
                cache_data = self._fetch_redis_data(cache_key)
                self._trigger_async_cache_update(
                    cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
                )
                return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)

            # Build the month range
            start_month = start_dt.to_period('M')
            end_month = end_dt.to_period('M')
            months = pd.period_range(start_month, end_month, freq='M').strftime("%Y%m").tolist()
            cache_data = self._fetch_redis_data(cache_key, months)
            if cache_data.empty:
                self._trigger_async_cache_update(
                    cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
                )
                return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)

            filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)

            if not filtered_df.empty:
                if '日期' in filtered_df.columns.tolist():
                    exist_min_date = filtered_df['日期'].min()
                    if exist_min_date <= start_dt:
                        return filtered_df
                else:
                    return filtered_df

            self._trigger_async_cache_update(
                cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
            )
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)

        except Exception as e:
            logger.error(f"Redis connection error: {e}, falling back to MySQL")
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)

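    # Read-path sketch: the cache key is f"{db_name}:{table_name}" (with a
    # "_haveyear" suffix for year-sharded tables); dated rows live in one hash
    # field per month ("202501", "202502", ...), undated rows in chunked
    # "all_NNNN" fields. A TTL under 60 s, an empty cache, or cached data that
    # starts after the requested start_date all trigger an async refresh plus
    # a direct MySQL read.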
    def set_redis(
            self,
            cache_key: str,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date,
            existing_data: pd.DataFrame,
            projection={}
    ) -> None:
        try:
            new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
            if new_data.empty:
                return

            combined_data = self._merge_data(new_data, existing_data)

            if not combined_data.empty:
                if '日期' not in combined_data.columns:
                    # Atomically replace the old shards
                    # and keep shard storage fast
                    chunk_size = 5000
                    with self.redis_engine.pipeline(transaction=False) as pipe:
                        # Batch-delete the old shards
                        for key in self.redis_engine.hscan_iter(cache_key, match="all_*"):
                            pipe.hdel(cache_key, key[0])

                        # Batch-write the new shards
                        for idx in range(0, len(combined_data), chunk_size):
                            chunk = combined_data.iloc[idx:idx + chunk_size]
                            chunk_key = f"all_{idx // chunk_size:04d}"
                            pipe.hset(cache_key, chunk_key, self._serialize_data(chunk))

                        pipe.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
                        pipe.execute()
                    # serialized_data = self._serialize_data(combined_data)
                    # self.redis_engine.hset(cache_key, "all", serialized_data)
                    # self.redis_engine.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
                else:
                    # Shard by month
                    combined_data['month'] = combined_data['日期'].dt.to_period('M').dt.strftime("%Y%m")
                    for month_str, group in combined_data.groupby('month'):
                        group = group.drop(columns=['month'])
                        serialized_data = self._serialize_data(group)
                        self.redis_engine.hset(cache_key, month_str, serialized_data)
                    self.redis_engine.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
                logger.info(f"Cache updated {cache_key} | rows: {len(combined_data)}")
        except Exception as e:
            logger.error(f"Cache update failed: {cache_key} - {str(e)}")

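    # Sizing illustration: with chunk_size = 5000, a 12,000-row frame without
    # a 日期 column is stored as fields all_0000 and all_0001 (5,000 rows each)
    # plus all_0002 (2,000 rows). The random 0-1800 s added to the TTL jitters
    # expiry so that keys refreshed together do not all lapse at once.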
    def _fetch_table_data(
            self,
            db_name: str,
            table_name: str,
            start_date,
            end_date,
            projection={}
    ) -> pd.DataFrame:
        try:
            return self.download.data_to_df(
                db_name=db_name,
                table_name=table_name,
                start_date=start_date,
                end_date=end_date,
                projection=projection
            )
        except Exception as e:
            logger.error(f"MySQL query error {db_name}.{table_name}: {e}")
            return pd.DataFrame()

    def _fetch_redis_data(self, cache_key: str, months: list = None) -> pd.DataFrame:
        try:
            dfs = []
            pipeline = self.redis_engine.pipeline()

            # Queue every read request in one batch
            if months is not None:
                # 1. Queue the per-month reads
                pipeline.hmget(cache_key, months)

            # 2. Queue the shard scan (runs whether or not months was given)
            pipeline.hscan(cache_key, match="all_*")

            # Execute all commands at once (network round-trips drop from 2+N to 1)
            results = pipeline.execute()

            # Process results --------------------------------------------------------
            result_index = 0

            # Handle the per-month data (if present)
            if months is not None:
                month_data = results[result_index]
                result_index += 1  # advance the result index

                for data, field in zip(month_data, months):
                    if data:
                        try:
                            # Parse with the faster orjson (requires: pip install orjson)
                            df = pd.DataFrame(orjson.loads(data))
                            df = self._convert_date_columns(df)
                            dfs.append(df)
                        except Exception as e:
                            logger.error(f"Failed to parse month data {field}: {e}")

            # Handle the shard data (batched fetch)
            cursor, shard_data = results[result_index]
            while True:
                # Batch-fetch the shard values
                pipeline = self.redis_engine.pipeline()
                for key in shard_data.keys():
                    pipeline.hget(cache_key, key)
                shard_values = pipeline.execute()

                # Parse the shard values
                for value in shard_values:
                    if value:
                        try:
                            df = pd.DataFrame(orjson.loads(value))
                            dfs.append(self._convert_date_columns(df))
                        except Exception as e:
                            logger.error(f"Failed to parse shard data: {e}")

                # Continue with the remaining shards
                if cursor == 0:
                    break
                cursor, shard_data = self.redis_engine.hscan(cache_key, cursor=cursor, match="all_*")

            # Merge --------------------------------------------------------
            if dfs:
                final_df = pd.concat(dfs, ignore_index=True)
                if '日期' in final_df.columns:
                    final_df = final_df.sort_values('日期', ascending=False)
                return final_df
            return pd.DataFrame()

        except Exception as e:
            logger.error(f"Redis data fetch failed {cache_key}: {e}")
            return pd.DataFrame()

    def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        if "日期" in df.columns:
            df["日期"] = pd.to_datetime(
                df["日期"],
                format="%Y-%m-%d",
                errors="coerce"
            )  # the explicit format keeps this fast; the deprecated infer_datetime_format flag is redundant here
        return df

    def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
        return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"

    def _filter_by_date_range(
            self,
            df: pd.DataFrame,
            start_dt: datetime.datetime,
            end_dt: datetime.datetime
    ) -> pd.DataFrame:
        if "日期" not in df.columns:
            return df
        date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
        return df[date_mask].copy()

    def _trigger_async_cache_update(
            self,
            cache_key: str,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date: str,
            end_date: str,
            existing_data: pd.DataFrame,
            projection={}
    ):
        thread = threading.Thread(
            target=self.set_redis,
            args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data, projection),
            daemon=True
        )
        thread.start()

    def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
        if existing_data.empty or "日期" not in existing_data.columns:
            return new_data
        new_data["日期"] = pd.to_datetime(new_data["日期"])
        existing_data["日期"] = pd.to_datetime(existing_data["日期"])

        new_min = new_data["日期"].min()
        new_max = new_data["日期"].max()

        valid_historical = existing_data[
            (existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
        ]
        merged_data = pd.concat([new_data, valid_historical], ignore_index=True)
        merged_data.sort_values(['日期'], ascending=[False], ignore_index=True, inplace=True)
        return merged_data

    def _serialize_data(self, df: pd.DataFrame) -> bytes:
        """Ultra-fast serialization (5-8x performance gain)."""
        if df.empty:
            return b'[]'  # short-circuit for empty data

        # Type preprocessing --------------------------------------------------------
        temp_df = df.copy()

        # Fast date conversion (avoids per-row handling)
        date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
        for col in date_cols:
            # Convert directly via pd.Series.dt (vectorized)
            temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d").replace({np.nan: None})

        # Decimal handling (column-wise apply)
        decimal_cols = temp_df.select_dtypes(include=['object']).columns
        for col in decimal_cols:
            if temp_df[col].apply(lambda x: isinstance(x, Decimal)).any():
                temp_df[col] = temp_df[col].apply(
                    lambda x: round(float(x), 6) if isinstance(x, Decimal) else x
                )

        # Targeted records conversion (about 3x faster than other to_dict orientations)
        try:
            records = temp_df.to_dict(orient='records')
        except Exception as e:
            logger.error(f"DataFrame-to-dict conversion failed: {str(e)}")
            records = []

        # Serialization options --------------------------------------------------------
        return orjson.dumps(
            records,
            option=
                orjson.OPT_SERIALIZE_NUMPY |  # handle numpy types automatically
                orjson.OPT_NAIVE_UTC |  # speed up datetime handling
                orjson.OPT_PASSTHROUGH_DATETIME,  # skip automatic datetime conversion
            default=self._orjson_serializer  # custom type handling
        )

    @staticmethod
    def _orjson_serializer(obj):
        """Custom type-serialization handler."""
        if isinstance(obj, Decimal):
            return round(float(obj), 6)
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(f"Cannot serialize type {type(obj)}: {obj}")

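A quick round-trip sketch of the serializer above (values are illustrative;
"花费" is a hypothetical column, the options and default handler are the ones
defined in _serialize_data):

    payload = orjson.dumps(
        [{"日期": datetime.date(2025, 1, 1), "花费": Decimal("12.3456789")}],
        option=orjson.OPT_SERIALIZE_NUMPY | orjson.OPT_PASSTHROUGH_DATETIME,
        default=RedisDataHash._orjson_serializer,
    )
    payload.decode()       # '[{"日期":"2025-01-01","花费":12.345679}]'
    orjson.loads(payload)  # [{'日期': '2025-01-01', '花费': 12.345679}]

OPT_PASSTHROUGH_DATETIME routes the date through _orjson_serializer (isoformat),
and the Decimal is rounded to six places as a float.
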
if __name__ == '__main__':
    # # ****************************************************
    # # This part is defined externally and only needs to be set up once -- begin
    # redis_config = {
    #     'host': '127.0.0.1',
    #     'port': 6379,  # default Redis port
    #     'db': 0,  # default Redis database index
    #     # 'username': 'default',
    #     'password': redis_password,
    # }
    # # Instantiate redis
    # r = redis.Redis(**redis_config)
    # # Instantiate mysql
    # d = s_query.QueryDatas(username=username, password=password, host=host, port=port)
    # # Hand both instances to the RedisData class and instantiate the data engine
    # m = RedisData(redis_engine=r, download=d, cache_ttl=60)
    # # ****************************************************
    #
    # # Fetch database data dynamically below
    # db_name = '聚合数据'
    # table_name = '多店推广场景_按日聚合'
    # set_year = False
    # df = m.get_from_redis(
    #     db_name=db_name,
    #     table_name=table_name,
    #     set_year=set_year,
    #     start_date='2025-01-01',
    #     end_date='2025-01-31'
    # )
    # logger.info(df)
    #

    logger.info(socket.gethostname())
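For the hash-backed variant, a minimal sketch along the same lines (connection
values are placeholders, and the projection shape is a hypothetical column
filter passed through to data_to_df):

    r = redis.Redis(host='127.0.0.1', port=6379, db=0, password='...')
    d = s_query.QueryDatas(username=username, password=password, host=host, port=port)
    h = RedisDataHash(redis_engine=r, download=d, cache_ttl=60)  # TTL in minutes
    df = h.get_from_redis(
        db_name='聚合数据',
        table_name='多店推广场景_按日聚合',
        set_year=False,
        start_date='2025-01-01',
        end_date='2025-01-31',
        projection={'日期': 1, '花费': 1},  # hypothetical projection
    )
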
mdbq/spider/__init__.py
ADDED