mdbq 3.6.9__py3-none-any.whl → 3.6.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/redis/getredis.py +130 -143
- mdbq/redis/getredis_/344/274/230/345/214/226hash.py +710 -0
- {mdbq-3.6.9.dist-info → mdbq-3.6.11.dist-info}/METADATA +1 -1
- {mdbq-3.6.9.dist-info → mdbq-3.6.11.dist-info}/RECORD +6 -5
- {mdbq-3.6.9.dist-info → mdbq-3.6.11.dist-info}/WHEEL +0 -0
- {mdbq-3.6.9.dist-info → mdbq-3.6.11.dist-info}/top_level.txt +0 -0
mdbq/redis/getredis.py
CHANGED
@@ -271,7 +271,6 @@ class RedisData(object):
|
|
271
271
|
temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d")
|
272
272
|
return temp_df.to_json(orient="records", force_ascii=False)
|
273
273
|
|
274
|
-
|
275
274
|
class RedisDataHash(object):
|
276
275
|
"""
|
277
276
|
存储 hash
|
@@ -285,13 +284,6 @@ class RedisDataHash(object):
|
|
285
284
|
"""
|
286
285
|
|
287
286
|
def __init__(self, redis_engine, download, cache_ttl: int):
|
288
|
-
"""
|
289
|
-
初始化缓存处理器
|
290
|
-
|
291
|
-
:param redis_engine: Redis连接实例
|
292
|
-
:param download: 数据下载处理器(需实现data_to_df方法)
|
293
|
-
:param cache_ttl: 缓存存活时间(单位:分钟,内部转换为秒存储)
|
294
|
-
"""
|
295
287
|
self.redis_engine = redis_engine
|
296
288
|
self.download = download
|
297
289
|
self.cache_ttl = cache_ttl * 60 # 转换为秒存储
|
@@ -304,20 +296,8 @@ class RedisDataHash(object):
|
|
304
296
|
start_date,
|
305
297
|
end_date
|
306
298
|
) -> pd.DataFrame:
|
307
|
-
"""
|
308
|
-
从MySQL直接获取数据的核心方法
|
309
|
-
|
310
|
-
处理逻辑:
|
311
|
-
1. 当启用年份分表时(set_year=True),自动遍历2024到当前年份的所有分表
|
312
|
-
2. 合并所有符合条件的数据表内容
|
313
|
-
3. 自动处理日期列格式转换
|
314
|
-
|
315
|
-
:return: 合并后的DataFrame(可能包含多个分表数据)
|
316
|
-
"""
|
317
|
-
# 原有实现保持不变
|
318
299
|
dfs = []
|
319
300
|
if set_year:
|
320
|
-
# 处理年份分表情况(例如 table_2024, table_2025...)
|
321
301
|
current_year = datetime.datetime.today().year
|
322
302
|
for year in range(2024, current_year + 1):
|
323
303
|
df = self._fetch_table_data(
|
@@ -326,12 +306,10 @@ class RedisDataHash(object):
|
|
326
306
|
if df is not None:
|
327
307
|
dfs.append(df)
|
328
308
|
else:
|
329
|
-
# 单表查询模式
|
330
309
|
df = self._fetch_table_data(db_name, table_name, start_date, end_date)
|
331
310
|
if df is not None:
|
332
311
|
dfs.append(df)
|
333
312
|
|
334
|
-
# 合并结果并处理空数据情况
|
335
313
|
combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
336
314
|
if combined_df.empty:
|
337
315
|
logger.warn(f"warning: {db_name}.{table_name} 未读取到数据")
|
@@ -347,69 +325,45 @@ class RedisDataHash(object):
|
|
347
325
|
start_date,
|
348
326
|
end_date
|
349
327
|
) -> pd.DataFrame:
|
350
|
-
"""
|
351
|
-
带缓存策略的数据获取主入口
|
352
|
-
|
353
|
-
执行流程:
|
354
|
-
1. 生成缓存键并检查TTL(存活时间)
|
355
|
-
2. 当TTL<60秒时触发异步更新,同时直接访问MySQL获取最新数据
|
356
|
-
3. 从Redis获取历史数据并进行日期过滤
|
357
|
-
4. 若缓存数据不完整,触发异步更新并降级到MySQL查询
|
358
|
-
5. 异常时自动降级到MySQL查询
|
359
|
-
|
360
|
-
设计特点:
|
361
|
-
- 缓存预热:首次访问时异步更新缓存
|
362
|
-
- 降级机制:任何异常自动切换直连MySQL
|
363
|
-
- 过时缓存:当TTL不足时并行更新缓存
|
364
|
-
"""
|
365
|
-
# 时分秒部分重置为 00:00:00 这是个巨坑,不可以省略
|
366
328
|
start_dt = pd.to_datetime(start_date).floor('D')
|
367
329
|
end_dt = pd.to_datetime(end_date).floor('D')
|
368
|
-
# 生成缓存键名
|
369
330
|
cache_key = self._generate_cache_key(db_name, table_name, set_year)
|
370
331
|
|
371
332
|
try:
|
372
|
-
# 检查缓存
|
373
333
|
ttl = self.redis_engine.ttl(cache_key)
|
374
|
-
if ttl < 60:
|
375
|
-
# 获取当前缓存
|
334
|
+
if ttl < 60:
|
376
335
|
cache_data = self._fetch_redis_data(cache_key)
|
377
|
-
# 异步更新缓存
|
378
336
|
self._trigger_async_cache_update(
|
379
337
|
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
380
338
|
)
|
381
|
-
# 立即降级返回MySQL查询
|
382
339
|
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
383
340
|
|
384
|
-
#
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
341
|
+
# 生成月份范围
|
342
|
+
start_month = start_dt.to_period('M')
|
343
|
+
end_month = end_dt.to_period('M')
|
344
|
+
months = pd.period_range(start_month, end_month, freq='M').strftime("%Y%m").tolist()
|
345
|
+
cache_data = self._fetch_redis_data(cache_key, months)
|
389
346
|
if cache_data.empty:
|
390
347
|
self._trigger_async_cache_update(
|
391
348
|
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
392
349
|
)
|
393
350
|
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
394
|
-
|
351
|
+
|
395
352
|
filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
|
396
353
|
if not filtered_df.empty:
|
397
354
|
if '日期' in filtered_df.columns.tolist():
|
398
|
-
# 缓存数据的日期在请求日期范围内时,直接返回缓存数据
|
399
355
|
exsit_min_date = filtered_df['日期'].min()
|
400
356
|
if exsit_min_date <= start_dt:
|
401
357
|
return filtered_df
|
402
358
|
else:
|
403
359
|
return filtered_df
|
404
|
-
|
360
|
+
|
405
361
|
self._trigger_async_cache_update(
|
406
362
|
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
407
363
|
)
|
408
|
-
# 立即降级返回MySQL查询
|
409
364
|
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
410
365
|
|
411
366
|
except Exception as e:
|
412
|
-
# 异常策略:立即返回MySQL查询,保障服务可用
|
413
367
|
logger.error(f"Redis 连接异常: {e},直接访问 MySQL")
|
414
368
|
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
415
369
|
|
@@ -423,45 +377,41 @@ class RedisDataHash(object):
|
|
423
377
|
end_date,
|
424
378
|
existing_data: pd.DataFrame
|
425
379
|
) -> None:
|
426
|
-
"""
|
427
|
-
异步缓存更新方法
|
428
|
-
|
429
|
-
核心逻辑:
|
430
|
-
1. 获取MySQL最新数据
|
431
|
-
2. 合并新旧数据(保留历史数据中不在新数据时间范围内的部分)
|
432
|
-
3. 智能存储策略:
|
433
|
-
- 无日期字段:全量存储到"all"字段
|
434
|
-
- 有日期字段:按年份分片存储(提升查询效率)
|
435
|
-
|
436
|
-
设计特点:
|
437
|
-
- 增量更新:仅合并必要数据,避免全量覆盖
|
438
|
-
- 数据分片:按年存储提升大数据的读取性能
|
439
|
-
- 容错处理:跳过无日期字段的异常情况
|
440
|
-
"""
|
441
380
|
try:
|
442
|
-
# 获取最新数据(使用最新查询条件)
|
443
381
|
new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
444
382
|
if new_data.empty:
|
445
383
|
return
|
446
384
|
|
447
|
-
# 合并缓存数据
|
448
385
|
combined_data = self._merge_data(new_data, existing_data)
|
449
386
|
|
450
387
|
if not combined_data.empty:
|
451
|
-
|
452
|
-
|
453
|
-
#
|
454
|
-
|
455
|
-
self.redis_engine.
|
456
|
-
|
388
|
+
if '日期' not in combined_data.columns:
|
389
|
+
# 原子化删除旧分片
|
390
|
+
# 优化分片存储性能
|
391
|
+
chunk_size = 5000
|
392
|
+
with self.redis_engine.pipeline(transaction=False) as pipe:
|
393
|
+
# 批量删除旧分片
|
394
|
+
for key in self.redis_engine.hscan_iter(cache_key, match="all_*"):
|
395
|
+
pipe.hdel(cache_key, key[0])
|
396
|
+
|
397
|
+
# 批量写入新分片
|
398
|
+
for idx in range(0, len(combined_data), chunk_size):
|
399
|
+
chunk = combined_data.iloc[idx:idx + chunk_size]
|
400
|
+
chunk_key = f"all_{idx // chunk_size:04d}"
|
401
|
+
pipe.hset(cache_key, chunk_key, self._serialize_data(chunk))
|
402
|
+
|
403
|
+
pipe.expire(cache_key, self.cache_ttl)
|
404
|
+
pipe.execute()
|
405
|
+
# serialized_data = self._serialize_data(combined_data)
|
406
|
+
# self.redis_engine.hset(cache_key, "all", serialized_data)
|
407
|
+
# self.redis_engine.expire(cache_key, self.cache_ttl)
|
457
408
|
else:
|
458
|
-
#
|
459
|
-
combined_data['
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
self.redis_engine.hset(cache_key, year_str, serialized_data)
|
409
|
+
# 按月分片存储
|
410
|
+
combined_data['month'] = combined_data['日期'].dt.to_period('M').dt.strftime("%Y%m")
|
411
|
+
for month_str, group in combined_data.groupby('month'):
|
412
|
+
group = group.drop(columns=['month'])
|
413
|
+
serialized_data = self._serialize_data(group)
|
414
|
+
self.redis_engine.hset(cache_key, month_str, serialized_data)
|
465
415
|
self.redis_engine.expire(cache_key, self.cache_ttl)
|
466
416
|
logger.info(f"缓存更新 {cache_key} | 数据量: {len(combined_data)}")
|
467
417
|
except Exception as e:
|
@@ -474,7 +424,6 @@ class RedisDataHash(object):
|
|
474
424
|
start_date,
|
475
425
|
end_date
|
476
426
|
) -> pd.DataFrame:
|
477
|
-
"""执行MySQL查询并返回DataFrame(带异常处理)"""
|
478
427
|
try:
|
479
428
|
return self.download.data_to_df(
|
480
429
|
db_name=db_name,
|
@@ -484,23 +433,77 @@ class RedisDataHash(object):
|
|
484
433
|
projection={}
|
485
434
|
)
|
486
435
|
except Exception as e:
|
487
|
-
logger.
|
436
|
+
logger.error(f"MySQL 查询异常 {db_name}.{table_name}: {e}")
|
488
437
|
return pd.DataFrame()
|
489
438
|
|
490
|
-
def _fetch_redis_data(self, cache_key: str,
|
491
|
-
|
492
|
-
|
439
|
+
def _fetch_redis_data(self, cache_key: str, months: list = None) -> pd.DataFrame:
|
440
|
+
try:
|
441
|
+
dfs = []
|
493
442
|
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
443
|
+
if months is not None:
|
444
|
+
# 1. 获取指定月份数据
|
445
|
+
month_fields = months.copy()
|
446
|
+
month_data = self.redis_engine.hmget(cache_key, month_fields)
|
447
|
+
|
448
|
+
# 处理月份数据
|
449
|
+
for data, field in zip(month_data, month_fields):
|
450
|
+
if data:
|
451
|
+
try:
|
452
|
+
df = pd.DataFrame(json.loads(data.decode("utf-8")))
|
453
|
+
df = self._convert_date_columns(df)
|
454
|
+
dfs.append(df)
|
455
|
+
except Exception as e:
|
456
|
+
logger.error(f"月份数据解析失败 {field}: {e}")
|
457
|
+
|
458
|
+
# 2. 获取所有分片数据
|
459
|
+
# 优化分片数据获取
|
460
|
+
pipeline = self.redis_engine.pipeline()
|
461
|
+
cursor, keys = self.redis_engine.hscan(cache_key, match="all_*")
|
462
|
+
while True:
|
463
|
+
for key in keys:
|
464
|
+
pipeline.hget(cache_key, key)
|
465
|
+
if cursor == 0:
|
466
|
+
break
|
467
|
+
cursor, keys = self.redis_engine.hscan(cache_key, cursor=cursor, match="all_*")
|
468
|
+
shard_values = pipeline.execute()
|
469
|
+
|
470
|
+
# 处理分片数据
|
471
|
+
for value in shard_values:
|
472
|
+
if value:
|
473
|
+
try:
|
474
|
+
df = pd.DataFrame(json.loads(value.decode("utf-8")))
|
475
|
+
dfs.append(self._convert_date_columns(df))
|
476
|
+
except Exception as e:
|
477
|
+
logger.error(f"分片数据解析失败: {e}")
|
478
|
+
|
479
|
+
else:
|
480
|
+
# 原有全量获取逻辑保持不变
|
481
|
+
data_dict = self.redis_engine.hgetall(cache_key)
|
482
|
+
for field, data in data_dict.items():
|
483
|
+
try:
|
484
|
+
df = pd.DataFrame(json.loads(data.decode("utf-8")))
|
485
|
+
df = self._convert_date_columns(df)
|
486
|
+
dfs.append(df)
|
487
|
+
except Exception as e:
|
488
|
+
logger.error(f"Redis 数据解析失败 {field.decode()}: {e}")
|
489
|
+
|
490
|
+
# 统一合并和排序处理
|
491
|
+
if dfs:
|
492
|
+
final_df = pd.concat(dfs, ignore_index=True)
|
493
|
+
if '日期' in final_df.columns:
|
494
|
+
final_df = final_df.sort_values('日期', ascending=False)
|
495
|
+
return final_df
|
496
|
+
return pd.DataFrame()
|
497
|
+
|
498
|
+
except Exception as e:
|
499
|
+
logger.error(f"Redis 数据获取失败 {cache_key}: {e}")
|
500
|
+
return pd.DataFrame()
|
501
|
+
|
502
|
+
def _fetch_redis_data_bak(self, cache_key: str, months: list = None) -> pd.DataFrame:
|
499
503
|
try:
|
500
|
-
if
|
501
|
-
|
502
|
-
fields
|
503
|
-
fields += ['all']
|
504
|
+
if months is not None:
|
505
|
+
fields = months.copy()
|
506
|
+
fields.append('all')
|
504
507
|
data_list = self.redis_engine.hmget(cache_key, fields)
|
505
508
|
dfs = []
|
506
509
|
for data, field in zip(data_list, fields):
|
@@ -510,29 +513,40 @@ class RedisDataHash(object):
|
|
510
513
|
dfs.append(df)
|
511
514
|
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
512
515
|
else:
|
513
|
-
#
|
514
|
-
|
516
|
+
# 优化分片数据获取
|
517
|
+
cursor, data = self.redis_engine.hscan(cache_key, match="all_*")
|
515
518
|
dfs = []
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
519
|
+
while True:
|
520
|
+
for field, value in data.items():
|
521
|
+
try:
|
522
|
+
df = pd.DataFrame(json.loads(value))
|
523
|
+
dfs.append(self._convert_date_columns(df))
|
524
|
+
except Exception as e:
|
525
|
+
logger.error(f"分片解析失败 {field}: {e}")
|
526
|
+
if cursor == 0:
|
527
|
+
break
|
528
|
+
cursor, data = self.redis_engine.hscan(cache_key, cursor=cursor, match="all_*")
|
529
|
+
return pd.concat(dfs) if dfs else pd.DataFrame()
|
530
|
+
# data_dict = self.redis_engine.hgetall(cache_key)
|
531
|
+
# dfs = []
|
532
|
+
# for field, data in data_dict.items():
|
533
|
+
# try:
|
534
|
+
# df = pd.DataFrame(json.loads(data.decode("utf-8")))
|
535
|
+
# df = self._convert_date_columns(df)
|
536
|
+
# dfs.append(df)
|
537
|
+
# except Exception as e:
|
538
|
+
# logger.error(f"Redis 数据解析失败 {cache_key} 字段 {field}: {e}")
|
523
539
|
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
524
540
|
except Exception as e:
|
525
|
-
logger.
|
541
|
+
logger.error(f"Redis 数据获取失败 {cache_key}: {e}")
|
526
542
|
return pd.DataFrame()
|
527
543
|
|
528
544
|
def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
529
|
-
"""统一日期列格式转换"""
|
530
545
|
if "日期" in df.columns:
|
531
546
|
df["日期"] = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
|
532
547
|
return df
|
533
548
|
|
534
549
|
def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
|
535
|
-
"""生成缓存键名"""
|
536
550
|
return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"
|
537
551
|
|
538
552
|
def _filter_by_date_range(
|
@@ -541,7 +555,6 @@ class RedisDataHash(object):
|
|
541
555
|
start_dt: datetime.datetime,
|
542
556
|
end_dt: datetime.datetime
|
543
557
|
) -> pd.DataFrame:
|
544
|
-
"""按日期范围精确过滤数据"""
|
545
558
|
if "日期" not in df.columns:
|
546
559
|
return df
|
547
560
|
date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
|
@@ -557,7 +570,6 @@ class RedisDataHash(object):
|
|
557
570
|
end_date: str,
|
558
571
|
existing_data: pd.DataFrame
|
559
572
|
):
|
560
|
-
"""启动异步线程执行缓存更新(不阻塞主流程)"""
|
561
573
|
thread = threading.Thread(
|
562
574
|
target=self.set_redis,
|
563
575
|
args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
|
@@ -566,17 +578,14 @@ class RedisDataHash(object):
|
|
566
578
|
thread.start()
|
567
579
|
|
568
580
|
def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
|
569
|
-
"""合并新旧数据集策略:保留现有数据中在新数据范围外的历史数据,并按日期排序"""
|
570
581
|
if existing_data.empty or "日期" not in existing_data.columns:
|
571
582
|
return new_data
|
572
583
|
new_data["日期"] = pd.to_datetime(new_data["日期"])
|
573
584
|
existing_data["日期"] = pd.to_datetime(existing_data["日期"])
|
574
585
|
|
575
|
-
# 计算新数据日期范围
|
576
586
|
new_min = new_data["日期"].min()
|
577
587
|
new_max = new_data["日期"].max()
|
578
588
|
|
579
|
-
# 保留现有数据中在新数据范围之外的部分
|
580
589
|
valid_historical = existing_data[
|
581
590
|
(existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
|
582
591
|
]
|
@@ -585,53 +594,35 @@ class RedisDataHash(object):
|
|
585
594
|
return merged_data
|
586
595
|
|
587
596
|
def _serialize_data(self, df: pd.DataFrame) -> bytes:
|
588
|
-
"""
|
589
|
-
高性能数据序列化方法
|
590
|
-
|
591
|
-
处理要点:
|
592
|
-
1. 日期类型转换为字符串
|
593
|
-
2. Decimal类型转换为浮点数
|
594
|
-
3. NaN值统一转换为None
|
595
|
-
4. 优化JSON序列化性能
|
596
|
-
"""
|
597
597
|
if df.empty:
|
598
598
|
return json.dumps([], ensure_ascii=False).encode("utf-8")
|
599
599
|
temp_df = df.copy()
|
600
600
|
|
601
|
-
# 处理日期类型列(安全转换)
|
602
601
|
date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
|
603
602
|
for col in date_cols:
|
604
|
-
# 处理全NaT列避免类型错误
|
605
603
|
if temp_df[col].isna().all():
|
606
|
-
temp_df[col] = temp_df[col].astype(object)
|
604
|
+
temp_df[col] = temp_df[col].astype(object)
|
607
605
|
temp_df[col] = (
|
608
606
|
temp_df[col]
|
609
|
-
.dt.strftime("%Y-%m-%d")
|
607
|
+
.dt.strftime("%Y-%m-%d")
|
610
608
|
.where(temp_df[col].notna(), None)
|
611
609
|
)
|
612
610
|
|
613
|
-
# 统一空值处理(保护全None列类型)
|
614
611
|
def safe_null_convert(series):
|
615
|
-
"""保留全None列的原始dtype"""
|
616
612
|
if series.isna().all():
|
617
613
|
return series.astype(object).where(pd.notnull(series), None)
|
618
614
|
return series.where(pd.notnull(series), None)
|
619
615
|
|
620
616
|
temp_df = temp_df.apply(safe_null_convert)
|
621
617
|
|
622
|
-
# 类型处理函数(增强嵌套结构处理)
|
623
618
|
def decimal_serializer(obj):
|
624
|
-
"""递归序列化处理"""
|
625
|
-
# 提前处理None值
|
626
619
|
if obj is None:
|
627
620
|
return None
|
628
|
-
|
629
|
-
# 按类型分发处理
|
630
621
|
if isinstance(obj, Decimal):
|
631
622
|
return round(float(obj), 6)
|
632
623
|
elif isinstance(obj, pd.Timestamp):
|
633
|
-
return obj.strftime("%Y-%m-%d %H:%M:%S")
|
634
|
-
elif isinstance(obj, np.generic):
|
624
|
+
return obj.strftime("%Y-%m-%d %H:%M:%S")
|
625
|
+
elif isinstance(obj, np.generic):
|
635
626
|
return obj.item()
|
636
627
|
elif isinstance(obj, (datetime.date, datetime.datetime)):
|
637
628
|
return obj.isoformat()
|
@@ -640,11 +631,10 @@ class RedisDataHash(object):
|
|
640
631
|
elif isinstance(obj, dict):
|
641
632
|
return {decimal_serializer(k): decimal_serializer(v) for k, v in obj.items()}
|
642
633
|
elif isinstance(obj, bytes):
|
643
|
-
return obj.decode("utf-8", errors="replace")
|
644
|
-
elif isinstance(obj, pd.Series):
|
634
|
+
return obj.decode("utf-8", errors="replace")
|
635
|
+
elif isinstance(obj, pd.Series):
|
645
636
|
return obj.to_list()
|
646
637
|
else:
|
647
|
-
# 尝试直接转换可序列化类型
|
648
638
|
try:
|
649
639
|
json.dumps(obj)
|
650
640
|
return obj
|
@@ -652,18 +642,15 @@ class RedisDataHash(object):
|
|
652
642
|
logger.error(f"无法序列化类型 {type(obj)}: {str(obj)}")
|
653
643
|
raise
|
654
644
|
|
655
|
-
# 序列化前防御性检查
|
656
645
|
try:
|
657
646
|
data_records = temp_df.to_dict(orient="records")
|
658
647
|
except Exception as e:
|
659
648
|
logger.error(f"数据转换字典失败: {str(e)}")
|
660
649
|
raise
|
661
650
|
|
662
|
-
# 空记录特殊处理
|
663
651
|
if not data_records:
|
664
652
|
return json.dumps([], ensure_ascii=False).encode("utf-8")
|
665
653
|
|
666
|
-
# 执行序列化
|
667
654
|
try:
|
668
655
|
return json.dumps(
|
669
656
|
data_records,
|
@@ -0,0 +1,710 @@
|
|
1
|
+
# -*- coding: UTF-8 –*-
|
2
|
+
import os.path
|
3
|
+
import redis
|
4
|
+
import socket
|
5
|
+
from mdbq.mysql import s_query
|
6
|
+
from mdbq.config import myconfig
|
7
|
+
import pandas as pd
|
8
|
+
import json
|
9
|
+
import datetime
|
10
|
+
import threading
|
11
|
+
import logging
|
12
|
+
from logging.handlers import RotatingFileHandler
|
13
|
+
import getpass
|
14
|
+
import platform
|
15
|
+
from decimal import Decimal
|
16
|
+
|
17
|
+
# Per-user download directory; the log directory is created beneath it.
# NOTE(review): the non-Windows branch hard-codes the macOS "/Users" layout.
if platform.system() == 'Windows':
    D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
else:
    D_PATH = os.path.join(f'/Users/{getpass.getuser()}/Downloads')

# Load the project configuration once; only the MySQL section depends on the host.
conf = myconfig.main()
if socket.gethostname() in ('company', 'Mac2.local'):
    conf_data = conf['Windows']['company']['mysql']['local']
else:
    # 'MacBookPro' and every other host read the same MySQL section.
    conf_data = conf['Windows']['xigua_lx']['mysql']['local']
username, password, host, port = (
    conf_data['username'],
    conf_data['password'],
    conf_data['host'],
    conf_data['port'],
)
# Redis uses the local instance; the setting is the same on every machine.
redis_password = conf['Windows']['company']['redis']['local']['password']

logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')

# Logger for this module.
logger = logging.getLogger(__name__)

# Rotating file handler that writes the log to <D_PATH>/logfile/redis.log.
# exist_ok=True avoids the check-then-create race when several processes start together.
log_dir = os.path.join(D_PATH, 'logfile')
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, 'redis.log')
file_handler = RotatingFileHandler(log_file, maxBytes=3 * 1024 * 1024, backupCount=10)  # keep 10 backup files
file_handler.setLevel(logging.INFO)

# Formatter used by the file handler.
formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
file_handler.setFormatter(formatter)

# Attach the file handler to the module logger.
logger.addHandler(file_handler)
|
58
|
+
|
59
|
+
|
60
|
+
class RedisData(object):
    """
    Cache-aside reader that stores each table as a single Redis string value.

    A request is answered from Redis when the cached entry has at least 60
    seconds of TTL left, is non-empty and contains rows inside the requested
    date range. Otherwise the data is read from MySQL and a background thread
    refreshes the cache.
    """

    def __init__(self, redis_engine, download, cache_ttl: int):
        """
        :param redis_engine: Redis engine used for cache reads and writes
        :param download: MySQL engine; must provide ``data_to_df``
        :param cache_ttl: cache lifetime in minutes (stored internally in seconds)
        """
        self.redis_engine = redis_engine
        self.download = download
        self.cache_ttl = cache_ttl * 60

    def get_from_mysql(
            self,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """
        Read data from MySQL and return it as a DataFrame.

        Args:
            set_year: whether the table name carries a year suffix. When true,
                ``{table_name}_{year}`` is queried for every year from 2024
                through the current year and the results are concatenated.

        Returns:
            The concatenated rows with the date column converted, or an empty
            DataFrame when nothing was read.
        """
        if set_year:
            this_year = datetime.datetime.today().year
            tables = [f"{table_name}_{year}" for year in range(2024, this_year + 1)]
        else:
            tables = [table_name]

        dfs = []
        for name in tables:
            df = self._fetch_table_data(db_name, name, start_date, end_date)
            if df is not None:
                dfs.append(df)

        combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
        if combined_df.empty:
            logger.warning(f"警告: {db_name}.{table_name} 未读取到数据")
        else:
            combined_df = self._convert_date_columns(combined_df)
        return combined_df

    def get_from_redis(
            self,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """
        Return data from Redis, falling back to MySQL when the cache is
        expiring, empty or has no rows in the requested range. Every fallback
        except a Redis failure also starts an asynchronous cache refresh.
        """
        start_dt = pd.to_datetime(start_date)
        end_dt = pd.to_datetime(end_date)
        cache_key = self._generate_cache_key(db_name, table_name, set_year)

        # Read the cache metadata and payload; any Redis failure goes straight to MySQL.
        try:
            ttl = self.redis_engine.ttl(cache_key)
            cache_data = self._fetch_redis_data(cache_key)
        except Exception as e:
            logger.error(f"Redis 连接异常: {e},直接访问 MySQL")
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

        # Serve from the cache only when it is fresh, non-empty and covers the range.
        if not (ttl < 60 or cache_data.empty):
            filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
            if not filtered_df.empty:
                return filtered_df

        self._trigger_async_cache_update(
            cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
        )
        return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

    def set_redis(
            self,
            cache_key: str,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date,
            existing_data: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Refresh the Redis entry by merging fresh MySQL rows with the cached rows.

        Returns the merged DataFrame, or an empty DataFrame when MySQL returned
        nothing or the update failed.
        """
        try:
            new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
            if new_data.empty:
                return pd.DataFrame()

            combined_data = self._merge_data(new_data, existing_data)
            serialized_data = self._serialize_data(combined_data)
            # One SET ... EX command stores the value and its TTL together, so a
            # failure between two separate commands cannot leave a key without expiry.
            self.redis_engine.set(cache_key, serialized_data, ex=self.cache_ttl)

            logger.info(f"缓存更新 {cache_key} | 数据量: {len(combined_data)}")
            return combined_data

        except Exception as e:
            logger.error(f"缓存更新失败: {cache_key} - {str(e)}")
            return pd.DataFrame()

    # Helper Methods ------------------------------------------------

    def _fetch_table_data(
            self,
            db_name: str,
            table_name: str,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """Query one MySQL table; return an empty DataFrame if the query raises."""
        try:
            return self.download.data_to_df(
                db_name=db_name,
                table_name=table_name,
                start_date=start_date,
                end_date=end_date,
                projection={}
            )
        except Exception as e:
            logger.error(f"MySQL 查询异常 {db_name}.{table_name}: {e}")
            return pd.DataFrame()

    def _fetch_redis_data(self, cache_key: str) -> pd.DataFrame:
        """Load and parse the cached JSON for ``cache_key``, converting the date column."""
        try:
            data = self.redis_engine.get(cache_key)
            if not data:
                return pd.DataFrame()
            df = pd.DataFrame(json.loads(data.decode("utf-8")))
            return self._convert_date_columns(df)
        except Exception as e:
            logger.error(f"Redis 数据解析失败 {cache_key}: {e}")
            return pd.DataFrame()

    def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Parse the ``日期`` column in place as ``%Y-%m-%d``; unparsable values become NaT."""
        if "日期" in df.columns:
            df["日期"] = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
        return df

    def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
        """Build the cache key; year-partitioned tables get a ``_haveyear`` suffix."""
        return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"

    def _filter_by_date_range(
            self,
            df: pd.DataFrame,
            start_dt: datetime.datetime,
            end_dt: datetime.datetime
    ) -> pd.DataFrame:
        """Return a copy of the rows whose ``日期`` lies in [start_dt, end_dt]; frames without that column are returned as is."""
        if "日期" not in df.columns:
            return df
        date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
        return df[date_mask].copy()

    def _trigger_async_cache_update(
            self,
            cache_key: str,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date: str,
            end_date: str,
            existing_data: pd.DataFrame
    ):
        """Start a daemon thread that runs ``set_redis`` with the given arguments."""
        thread = threading.Thread(
            target=self.set_redis,
            args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
            daemon=True
        )
        thread.start()

    def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
        """
        Merge fresh rows with cached rows.

        Cached rows are kept only when their date lies outside the
        [min, max] date span of ``new_data``; inside that span the fresh rows
        win. Rows that share a date are all preserved.
        """
        if existing_data.empty or "日期" not in existing_data.columns:
            return new_data
        if "日期" not in new_data.columns:
            # No date span to compare against; keep the fresh data only.
            return new_data

        new_min = new_data["日期"].min()
        new_max = new_data["日期"].max()
        outside_new_span = (existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
        # The kept history cannot overlap the fresh span, so no de-duplication
        # by date is needed; de-duplicating on the date alone would discard
        # legitimate rows that share a day.
        return pd.concat([new_data, existing_data[outside_new_span]], ignore_index=True)

    def _serialize_data(self, df: pd.DataFrame) -> str:
        """Serialize a DataFrame to a JSON records string, formatting datetime64[ns] columns as ``%Y-%m-%d``."""
        temp_df = df.copy()
        date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
        for col in date_cols:
            temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d")
        return temp_df.to_json(orient="records", force_ascii=False)
|
273
|
+
|
274
|
+
|
275
|
+
class RedisDataHash(object):
|
276
|
+
"""
|
277
|
+
存储 hash
|
278
|
+
Redis缓存与MySQL数据联合查询处理器
|
279
|
+
|
280
|
+
功能特性:
|
281
|
+
- 支持带年份分表的MySQL数据查询
|
282
|
+
- 多级缓存策略(内存缓存+Redis缓存)
|
283
|
+
- 异步缓存更新机制
|
284
|
+
- 自动处理日期范围和数据类型转换
|
285
|
+
"""
|
286
|
+
|
287
|
+
def __init__(self, redis_engine, download, cache_ttl: int):
|
288
|
+
"""
|
289
|
+
初始化缓存处理器
|
290
|
+
|
291
|
+
:param redis_engine: Redis连接实例
|
292
|
+
:param download: 数据下载处理器(需实现data_to_df方法)
|
293
|
+
:param cache_ttl: 缓存存活时间(单位:分钟,内部转换为秒存储)
|
294
|
+
"""
|
295
|
+
self.redis_engine = redis_engine
|
296
|
+
self.download = download
|
297
|
+
self.cache_ttl = cache_ttl * 60 # 转换为秒存储
|
298
|
+
|
299
|
+
def get_from_mysql(
        self,
        db_name: str,
        table_name: str,
        set_year: bool,
        start_date,
        end_date
) -> pd.DataFrame:
    """
    Read data directly from MySQL.

    Processing:
    1. With year partitioning enabled (``set_year=True``), every table
       ``{table_name}_{year}`` from 2024 through the current year is queried.
    2. The rows of all queried tables are concatenated.
    3. The date column of a non-empty result is converted.

    :return: the concatenated DataFrame (may span several year tables);
        empty when nothing was read
    """
    if set_year:
        # Year-partitioned tables, e.g. table_2024, table_2025, ...
        current_year = datetime.datetime.today().year
        table_names = [f"{table_name}_{year}" for year in range(2024, current_year + 1)]
    else:
        # Single-table mode
        table_names = [table_name]

    dfs = []
    for name in table_names:
        df = self._fetch_table_data(db_name, name, start_date, end_date)
        if df is not None:
            dfs.append(df)

    # Concatenate the results; an empty list yields an empty DataFrame.
    combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
    if combined_df.empty:
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning(f"warning: {db_name}.{table_name} 未读取到数据")
    else:
        combined_df = self._convert_date_columns(combined_df)
    return combined_df
|
341
|
+
|
342
|
+
def get_from_redis(
        self,
        db_name: str,
        table_name: str,
        set_year: bool,
        start_date,
        end_date
) -> pd.DataFrame:
    """
    Main entry point for cached reads.

    Flow:
    1. Build the cache key and check its TTL.
    2. If the TTL is below 60 seconds, start an asynchronous refresh and
       answer from MySQL.
    3. Otherwise load the cached years covering the request and filter them
       by date.
    4. If the cached data is empty or does not reach back to the requested
       start date, start an asynchronous refresh and answer from MySQL.
    5. On any exception, answer from MySQL.
    """
    # Truncate both bounds to 00:00:00. This step must not be dropped
    # (the original author flags it as a major pitfall).
    start_dt = pd.to_datetime(start_date).floor('D')
    end_dt = pd.to_datetime(end_date).floor('D')
    cache_key = self._generate_cache_key(db_name, table_name, set_year)
    query_args = (db_name, table_name, set_year, start_date, end_date)

    try:
        if self.redis_engine.ttl(cache_key) < 60:
            # Less than a minute left: refresh in the background using the
            # current cache content and answer from MySQL right away.
            stale_data = self._fetch_redis_data(cache_key)
            self._trigger_async_cache_update(cache_key, *query_args, stale_data)
            return self.get_from_mysql(*query_args)

        # Read only the years that cover the requested range.
        cache_data = self._fetch_redis_data(cache_key, start_dt.year, end_dt.year)
        if not cache_data.empty:
            # The cache is stored per year, so narrow it to the exact range.
            filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
            if not filtered_df.empty:
                if '日期' not in filtered_df.columns.tolist():
                    return filtered_df
                # Serve from cache only when it reaches back to the start date.
                earliest_cached = filtered_df['日期'].min()
                if earliest_cached <= start_dt:
                    return filtered_df

        # Cache miss or incomplete cache: refresh asynchronously, answer from MySQL.
        self._trigger_async_cache_update(cache_key, *query_args, cache_data)
        return self.get_from_mysql(*query_args)

    except Exception as e:
        # Any failure falls back to MySQL.
        logger.error(f"Redis 连接异常: {e},直接访问 MySQL")
        return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
415
|
+
|
416
|
+
def set_redis(
        self,
        cache_key: str,
        db_name: str,
        table_name: str,
        set_year: bool,
        start_date,
        end_date,
        existing_data: pd.DataFrame
) -> None:
    """
    Refresh the Redis hash stored under ``cache_key`` from MySQL.

    Steps:
    1. Query MySQL through ``self.get_from_mysql`` with the given filters;
       nothing is written when the query returns no rows.
    2. Merge the fresh rows with ``existing_data`` via ``self._merge_data``.
    3. Write the merged rows to the hash:
       - no "日期" column: the whole frame goes into the field "all";
       - with a "日期" column: one field per calendar year (e.g. "2024").
    The key's expiry is reset to ``self.cache_ttl`` after every write.

    Every exception is logged and swallowed, so a failed refresh never
    propagates to the caller.

    :param cache_key: Redis hash key to update
    :param db_name: database name forwarded to ``get_from_mysql``
    :param table_name: table name forwarded to ``get_from_mysql``
    :param set_year: forwarded to ``get_from_mysql``
    :param start_date: lower date bound forwarded to ``get_from_mysql``
    :param end_date: upper date bound forwarded to ``get_from_mysql``
    :param existing_data: rows currently held in the cache (may be empty)
    """
    try:
        new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
        if new_data.empty:
            return

        combined_data = self._merge_data(new_data, existing_data)
        if combined_data.empty:
            return

        if '日期' not in combined_data.columns:
            # No date column: the table cannot be sharded, store it whole.
            serialized_data = self._serialize_data(combined_data)
            self.redis_engine.hset(cache_key, "all", serialized_data)
            self.redis_engine.expire(cache_key, self.cache_ttl)
        else:
            # Group by an external key series instead of adding a helper
            # column, so a real column named "年份" is never overwritten or
            # dropped. to_datetime keeps this working when the dates are
            # still strings.
            year_keys = pd.to_datetime(combined_data['日期'], errors='coerce').dt.year.rename('year')
            # Rows whose date cannot be parsed have a NaN key and are left
            # out by groupby (as before).
            for year, group in combined_data.groupby(year_keys):
                # dt.year is float when any NaT is present; int() keeps the
                # field name "2024" rather than "2024.0", which the ranged
                # reader would never request.
                year_str = str(int(year))
                serialized_data = self._serialize_data(group)
                self.redis_engine.hset(cache_key, year_str, serialized_data)
                self.redis_engine.expire(cache_key, self.cache_ttl)
        logger.info(f"缓存更新 {cache_key} | 数据量: {len(combined_data)}")
    except Exception as e:
        logger.error(f"缓存更新失败: {cache_key} - {str(e)}")
|
469
|
+
|
470
|
+
def _fetch_table_data(
|
471
|
+
self,
|
472
|
+
db_name: str,
|
473
|
+
table_name: str,
|
474
|
+
start_date,
|
475
|
+
end_date
|
476
|
+
) -> pd.DataFrame:
|
477
|
+
"""执行MySQL查询并返回DataFrame(带异常处理)"""
|
478
|
+
try:
|
479
|
+
return self.download.data_to_df(
|
480
|
+
db_name=db_name,
|
481
|
+
table_name=table_name,
|
482
|
+
start_date=start_date,
|
483
|
+
end_date=end_date,
|
484
|
+
projection={}
|
485
|
+
)
|
486
|
+
except Exception as e:
|
487
|
+
logger.info(f"MySQL 查询异常 {db_name}.{table_name}: {e}")
|
488
|
+
return pd.DataFrame()
|
489
|
+
|
490
|
+
def _fetch_redis_data(self, cache_key: str, start_year: int = None, end_year: int = None) -> pd.DataFrame:
|
491
|
+
"""
|
492
|
+
从Redis哈希表读取数据
|
493
|
+
|
494
|
+
优化策略:
|
495
|
+
- 当指定年份范围时,仅获取相关字段(hmget)
|
496
|
+
- 未指定范围时全量获取(hgetall)
|
497
|
+
-- 从mysql过来的表,虽然没有日期列,但也指定了 start_year/end_year,再redis中存储的键名是"all",所以要把 all也加进去
|
498
|
+
"""
|
499
|
+
try:
|
500
|
+
if start_year is not None and end_year is not None:
|
501
|
+
# 按年份范围精确获取字段(提升性能)
|
502
|
+
fields = [str(y) for y in range(start_year, end_year + 1)]
|
503
|
+
fields += ['all']
|
504
|
+
data_list = self.redis_engine.hmget(cache_key, fields)
|
505
|
+
dfs = []
|
506
|
+
for data, field in zip(data_list, fields):
|
507
|
+
if data:
|
508
|
+
df = pd.DataFrame(json.loads(data.decode("utf-8")))
|
509
|
+
df = self._convert_date_columns(df)
|
510
|
+
dfs.append(df)
|
511
|
+
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
512
|
+
else:
|
513
|
+
# 全量获取模式
|
514
|
+
data_dict = self.redis_engine.hgetall(cache_key)
|
515
|
+
dfs = []
|
516
|
+
for field, data in data_dict.items():
|
517
|
+
try:
|
518
|
+
df = pd.DataFrame(json.loads(data.decode("utf-8")))
|
519
|
+
df = self._convert_date_columns(df)
|
520
|
+
dfs.append(df)
|
521
|
+
except Exception as e:
|
522
|
+
logger.info(f"Redis 数据解析失败 {cache_key} 字段 {field}: {e}")
|
523
|
+
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
524
|
+
except Exception as e:
|
525
|
+
logger.info(f"Redis 数据获取失败 {cache_key}: {e}")
|
526
|
+
return pd.DataFrame()
|
527
|
+
|
528
|
+
def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
529
|
+
"""统一日期列格式转换"""
|
530
|
+
if "日期" in df.columns:
|
531
|
+
df["日期"] = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
|
532
|
+
return df
|
533
|
+
|
534
|
+
def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
|
535
|
+
"""生成缓存键名"""
|
536
|
+
return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"
|
537
|
+
|
538
|
+
def _filter_by_date_range(
|
539
|
+
self,
|
540
|
+
df: pd.DataFrame,
|
541
|
+
start_dt: datetime.datetime,
|
542
|
+
end_dt: datetime.datetime
|
543
|
+
) -> pd.DataFrame:
|
544
|
+
"""按日期范围精确过滤数据"""
|
545
|
+
if "日期" not in df.columns:
|
546
|
+
return df
|
547
|
+
date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
|
548
|
+
return df[date_mask].copy()
|
549
|
+
|
550
|
+
def _trigger_async_cache_update(
|
551
|
+
self,
|
552
|
+
cache_key: str,
|
553
|
+
db_name: str,
|
554
|
+
table_name: str,
|
555
|
+
set_year: bool,
|
556
|
+
start_date: str,
|
557
|
+
end_date: str,
|
558
|
+
existing_data: pd.DataFrame
|
559
|
+
):
|
560
|
+
"""启动异步线程执行缓存更新(不阻塞主流程)"""
|
561
|
+
thread = threading.Thread(
|
562
|
+
target=self.set_redis,
|
563
|
+
args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
|
564
|
+
daemon=True
|
565
|
+
)
|
566
|
+
thread.start()
|
567
|
+
|
568
|
+
def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
|
569
|
+
"""合并新旧数据集策略:保留现有数据中在新数据范围外的历史数据,并按日期排序"""
|
570
|
+
if existing_data.empty or "日期" not in existing_data.columns:
|
571
|
+
return new_data
|
572
|
+
new_data["日期"] = pd.to_datetime(new_data["日期"])
|
573
|
+
existing_data["日期"] = pd.to_datetime(existing_data["日期"])
|
574
|
+
|
575
|
+
# 计算新数据日期范围
|
576
|
+
new_min = new_data["日期"].min()
|
577
|
+
new_max = new_data["日期"].max()
|
578
|
+
|
579
|
+
# 保留现有数据中在新数据范围之外的部分
|
580
|
+
valid_historical = existing_data[
|
581
|
+
(existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
|
582
|
+
]
|
583
|
+
merged_data = pd.concat([new_data, valid_historical], ignore_index=True)
|
584
|
+
merged_data.sort_values(['日期'], ascending=[False], ignore_index=True, inplace=True)
|
585
|
+
return merged_data
|
586
|
+
|
587
|
+
def _serialize_data(self, df: pd.DataFrame) -> bytes:
|
588
|
+
"""
|
589
|
+
高性能数据序列化方法
|
590
|
+
|
591
|
+
处理要点:
|
592
|
+
1. 日期类型转换为字符串
|
593
|
+
2. Decimal类型转换为浮点数
|
594
|
+
3. NaN值统一转换为None
|
595
|
+
4. 优化JSON序列化性能
|
596
|
+
"""
|
597
|
+
if df.empty:
|
598
|
+
return json.dumps([], ensure_ascii=False).encode("utf-8")
|
599
|
+
temp_df = df.copy()
|
600
|
+
|
601
|
+
# 处理日期类型列(安全转换)
|
602
|
+
date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
|
603
|
+
for col in date_cols:
|
604
|
+
# 处理全NaT列避免类型错误
|
605
|
+
if temp_df[col].isna().all():
|
606
|
+
temp_df[col] = temp_df[col].astype(object) # 转换为object类型避免NaT
|
607
|
+
temp_df[col] = (
|
608
|
+
temp_df[col]
|
609
|
+
.dt.strftime("%Y-%m-%d") # 安全使用dt访问器(因类型强制为datetime)
|
610
|
+
.where(temp_df[col].notna(), None)
|
611
|
+
)
|
612
|
+
|
613
|
+
# 统一空值处理(保护全None列类型)
|
614
|
+
def safe_null_convert(series):
|
615
|
+
"""保留全None列的原始dtype"""
|
616
|
+
if series.isna().all():
|
617
|
+
return series.astype(object).where(pd.notnull(series), None)
|
618
|
+
return series.where(pd.notnull(series), None)
|
619
|
+
|
620
|
+
temp_df = temp_df.apply(safe_null_convert)
|
621
|
+
|
622
|
+
# 类型处理函数(增强嵌套结构处理)
|
623
|
+
def decimal_serializer(obj):
|
624
|
+
"""递归序列化处理"""
|
625
|
+
# 提前处理None值
|
626
|
+
if obj is None:
|
627
|
+
return None
|
628
|
+
|
629
|
+
# 按类型分发处理
|
630
|
+
if isinstance(obj, Decimal):
|
631
|
+
return round(float(obj), 6)
|
632
|
+
elif isinstance(obj, pd.Timestamp):
|
633
|
+
return obj.strftime("%Y-%m-%d %H:%M:%S") # 兜底处理漏网之鱼
|
634
|
+
elif isinstance(obj, np.generic): # 处理所有numpy标量类型
|
635
|
+
return obj.item()
|
636
|
+
elif isinstance(obj, (datetime.date, datetime.datetime)):
|
637
|
+
return obj.isoformat()
|
638
|
+
elif isinstance(obj, (list, tuple, set)):
|
639
|
+
return [decimal_serializer(item) for item in obj]
|
640
|
+
elif isinstance(obj, dict):
|
641
|
+
return {decimal_serializer(k): decimal_serializer(v) for k, v in obj.items()}
|
642
|
+
elif isinstance(obj, bytes):
|
643
|
+
return obj.decode("utf-8", errors="replace") # 二进制安全处理
|
644
|
+
elif isinstance(obj, pd.Series): # 防止意外传入Series对象
|
645
|
+
return obj.to_list()
|
646
|
+
else:
|
647
|
+
# 尝试直接转换可序列化类型
|
648
|
+
try:
|
649
|
+
json.dumps(obj)
|
650
|
+
return obj
|
651
|
+
except TypeError:
|
652
|
+
logger.error(f"无法序列化类型 {type(obj)}: {str(obj)}")
|
653
|
+
raise
|
654
|
+
|
655
|
+
# 序列化前防御性检查
|
656
|
+
try:
|
657
|
+
data_records = temp_df.to_dict(orient="records")
|
658
|
+
except Exception as e:
|
659
|
+
logger.error(f"数据转换字典失败: {str(e)}")
|
660
|
+
raise
|
661
|
+
|
662
|
+
# 空记录特殊处理
|
663
|
+
if not data_records:
|
664
|
+
return json.dumps([], ensure_ascii=False).encode("utf-8")
|
665
|
+
|
666
|
+
# 执行序列化
|
667
|
+
try:
|
668
|
+
return json.dumps(
|
669
|
+
data_records,
|
670
|
+
ensure_ascii=False,
|
671
|
+
default=decimal_serializer
|
672
|
+
).encode("utf-8")
|
673
|
+
except TypeError as e:
|
674
|
+
logger.error(f"序列化失败,请检查未处理的数据类型: {str(e)}")
|
675
|
+
raise
|
676
|
+
|
677
|
+
|
678
|
+
if __name__ == '__main__':
    # # ****************************************************
    # # Define this part once, outside of this module -- begin
    # redis_config = {
    #     'host': '127.0.0.1',
    #     'port': 6379,  # default Redis port
    #     'db': 0,  # default Redis database index
    #     # 'username': 'default',
    #     'password': redis_password,
    # }
    # # create the redis client
    # r = redis.Redis(**redis_config)
    # # create the mysql query helper
    # d = s_query.QueryDatas(username=username, password=password, host=host, port=port)
    # # hand both instances to RedisData to build the data engine
    # m = RedisData(redis_engin=r, download=d)
    # # ****************************************************
    #
    # # Fetch database data dynamically
    # db_name = '聚合数据'
    # table_name = '多店推广场景_按日聚合'
    # set_year = False
    # df = m.get_from_redis(
    #     db_name=db_name,
    #     table_name=table_name,
    #     set_year=set_year,
    #     start_date='2025-01-01',
    #     end_date='2025-01-31'
    # )
    # logger.info(df)
    #

    # When run as a script, just log this machine's host name.
    host_name = socket.gethostname()
    logger.info(host_name)
|
@@ -34,10 +34,11 @@ mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,239
|
|
34
34
|
mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
|
35
35
|
mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,7192
|
36
36
|
mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
37
|
-
mdbq/redis/getredis.py,sha256=
|
37
|
+
mdbq/redis/getredis.py,sha256=QAiqkxgrQf6AHgWQdIKah3FKkM5HE8TqwJdTXrlyR6c,28427
|
38
|
+
mdbq/redis/getredis_优化hash.py,sha256=q7omKJCPw_6Zr_r6WwTv4RGSXzZzpLPkIaqJ22svJhE,29104
|
38
39
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
39
40
|
mdbq/spider/aikucun.py,sha256=v7VO5gtEXR6_4Q6ujbTyu1FHu7TXHcwSQ6hIO249YH0,22208
|
40
|
-
mdbq-3.6.
|
41
|
-
mdbq-3.6.
|
42
|
-
mdbq-3.6.
|
43
|
-
mdbq-3.6.
|
41
|
+
mdbq-3.6.11.dist-info/METADATA,sha256=nf9h8l9QqT6ZrZ-J4cassVWcqRi2r3Oicu9eicCtCaA,244
|
42
|
+
mdbq-3.6.11.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
|
43
|
+
mdbq-3.6.11.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
44
|
+
mdbq-3.6.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|