mdbq 3.6.9__py3-none-any.whl → 3.6.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/redis/getredis.py +28 -133
- mdbq/redis/getredis_/344/274/230/345/214/226hash.py +710 -0
- {mdbq-3.6.9.dist-info → mdbq-3.6.10.dist-info}/METADATA +1 -1
- {mdbq-3.6.9.dist-info → mdbq-3.6.10.dist-info}/RECORD +6 -5
- {mdbq-3.6.9.dist-info → mdbq-3.6.10.dist-info}/WHEEL +0 -0
- {mdbq-3.6.9.dist-info → mdbq-3.6.10.dist-info}/top_level.txt +0 -0
mdbq/redis/getredis.py
CHANGED
@@ -271,7 +271,6 @@ class RedisData(object):
|
|
271
271
|
temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d")
|
272
272
|
return temp_df.to_json(orient="records", force_ascii=False)
|
273
273
|
|
274
|
-
|
275
274
|
class RedisDataHash(object):
|
276
275
|
"""
|
277
276
|
存储 hash
|
@@ -285,13 +284,6 @@ class RedisDataHash(object):
|
|
285
284
|
"""
|
286
285
|
|
287
286
|
def __init__(self, redis_engine, download, cache_ttl: int):
|
288
|
-
"""
|
289
|
-
初始化缓存处理器
|
290
|
-
|
291
|
-
:param redis_engine: Redis连接实例
|
292
|
-
:param download: 数据下载处理器(需实现data_to_df方法)
|
293
|
-
:param cache_ttl: 缓存存活时间(单位:分钟,内部转换为秒存储)
|
294
|
-
"""
|
295
287
|
self.redis_engine = redis_engine
|
296
288
|
self.download = download
|
297
289
|
self.cache_ttl = cache_ttl * 60 # 转换为秒存储
|
@@ -304,20 +296,8 @@ class RedisDataHash(object):
|
|
304
296
|
start_date,
|
305
297
|
end_date
|
306
298
|
) -> pd.DataFrame:
|
307
|
-
"""
|
308
|
-
从MySQL直接获取数据的核心方法
|
309
|
-
|
310
|
-
处理逻辑:
|
311
|
-
1. 当启用年份分表时(set_year=True),自动遍历2024到当前年份的所有分表
|
312
|
-
2. 合并所有符合条件的数据表内容
|
313
|
-
3. 自动处理日期列格式转换
|
314
|
-
|
315
|
-
:return: 合并后的DataFrame(可能包含多个分表数据)
|
316
|
-
"""
|
317
|
-
# 原有实现保持不变
|
318
299
|
dfs = []
|
319
300
|
if set_year:
|
320
|
-
# 处理年份分表情况(例如 table_2024, table_2025...)
|
321
301
|
current_year = datetime.datetime.today().year
|
322
302
|
for year in range(2024, current_year + 1):
|
323
303
|
df = self._fetch_table_data(
|
@@ -326,12 +306,10 @@ class RedisDataHash(object):
|
|
326
306
|
if df is not None:
|
327
307
|
dfs.append(df)
|
328
308
|
else:
|
329
|
-
# 单表查询模式
|
330
309
|
df = self._fetch_table_data(db_name, table_name, start_date, end_date)
|
331
310
|
if df is not None:
|
332
311
|
dfs.append(df)
|
333
312
|
|
334
|
-
# 合并结果并处理空数据情况
|
335
313
|
combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
336
314
|
if combined_df.empty:
|
337
315
|
logger.warn(f"warning: {db_name}.{table_name} 未读取到数据")
|
@@ -347,69 +325,46 @@ class RedisDataHash(object):
|
|
347
325
|
start_date,
|
348
326
|
end_date
|
349
327
|
) -> pd.DataFrame:
|
350
|
-
"""
|
351
|
-
带缓存策略的数据获取主入口
|
352
|
-
|
353
|
-
执行流程:
|
354
|
-
1. 生成缓存键并检查TTL(存活时间)
|
355
|
-
2. 当TTL<60秒时触发异步更新,同时直接访问MySQL获取最新数据
|
356
|
-
3. 从Redis获取历史数据并进行日期过滤
|
357
|
-
4. 若缓存数据不完整,触发异步更新并降级到MySQL查询
|
358
|
-
5. 异常时自动降级到MySQL查询
|
359
|
-
|
360
|
-
设计特点:
|
361
|
-
- 缓存预热:首次访问时异步更新缓存
|
362
|
-
- 降级机制:任何异常自动切换直连MySQL
|
363
|
-
- 过时缓存:当TTL不足时并行更新缓存
|
364
|
-
"""
|
365
|
-
# 时分秒部分重置为 00:00:00 这是个巨坑,不可以省略
|
366
328
|
start_dt = pd.to_datetime(start_date).floor('D')
|
367
329
|
end_dt = pd.to_datetime(end_date).floor('D')
|
368
|
-
# 生成缓存键名
|
369
330
|
cache_key = self._generate_cache_key(db_name, table_name, set_year)
|
370
331
|
|
371
332
|
try:
|
372
|
-
# 检查缓存
|
373
333
|
ttl = self.redis_engine.ttl(cache_key)
|
374
|
-
if ttl < 60:
|
375
|
-
# 获取当前缓存
|
334
|
+
if ttl < 60:
|
376
335
|
cache_data = self._fetch_redis_data(cache_key)
|
377
|
-
# 异步更新缓存
|
378
336
|
self._trigger_async_cache_update(
|
379
337
|
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
380
338
|
)
|
381
|
-
# 立即降级返回MySQL查询
|
382
339
|
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
383
340
|
|
384
|
-
#
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
341
|
+
# 生成月份范围
|
342
|
+
start_month = start_dt.to_period('M')
|
343
|
+
end_month = end_dt.to_period('M')
|
344
|
+
months = pd.period_range(start_month, end_month, freq='M').strftime("%Y%m").tolist()
|
345
|
+
cache_data = self._fetch_redis_data(cache_key, months)
|
346
|
+
|
389
347
|
if cache_data.empty:
|
390
348
|
self._trigger_async_cache_update(
|
391
349
|
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
392
350
|
)
|
393
351
|
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
394
|
-
|
352
|
+
|
395
353
|
filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
|
396
354
|
if not filtered_df.empty:
|
397
355
|
if '日期' in filtered_df.columns.tolist():
|
398
|
-
# 缓存数据的日期在请求日期范围内时,直接返回缓存数据
|
399
356
|
exsit_min_date = filtered_df['日期'].min()
|
400
357
|
if exsit_min_date <= start_dt:
|
401
358
|
return filtered_df
|
402
359
|
else:
|
403
360
|
return filtered_df
|
404
|
-
|
361
|
+
|
405
362
|
self._trigger_async_cache_update(
|
406
363
|
cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
|
407
364
|
)
|
408
|
-
# 立即降级返回MySQL查询
|
409
365
|
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
410
366
|
|
411
367
|
except Exception as e:
|
412
|
-
# 异常策略:立即返回MySQL查询,保障服务可用
|
413
368
|
logger.error(f"Redis 连接异常: {e},直接访问 MySQL")
|
414
369
|
return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
415
370
|
|
@@ -423,45 +378,25 @@ class RedisDataHash(object):
|
|
423
378
|
end_date,
|
424
379
|
existing_data: pd.DataFrame
|
425
380
|
) -> None:
|
426
|
-
"""
|
427
|
-
异步缓存更新方法
|
428
|
-
|
429
|
-
核心逻辑:
|
430
|
-
1. 获取MySQL最新数据
|
431
|
-
2. 合并新旧数据(保留历史数据中不在新数据时间范围内的部分)
|
432
|
-
3. 智能存储策略:
|
433
|
-
- 无日期字段:全量存储到"all"字段
|
434
|
-
- 有日期字段:按年份分片存储(提升查询效率)
|
435
|
-
|
436
|
-
设计特点:
|
437
|
-
- 增量更新:仅合并必要数据,避免全量覆盖
|
438
|
-
- 数据分片:按年存储提升大数据的读取性能
|
439
|
-
- 容错处理:跳过无日期字段的异常情况
|
440
|
-
"""
|
441
381
|
try:
|
442
|
-
# 获取最新数据(使用最新查询条件)
|
443
382
|
new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
444
383
|
if new_data.empty:
|
445
384
|
return
|
446
385
|
|
447
|
-
# 合并缓存数据
|
448
386
|
combined_data = self._merge_data(new_data, existing_data)
|
449
387
|
|
450
388
|
if not combined_data.empty:
|
451
|
-
# 处理无日期字段的特殊情况
|
452
389
|
if '日期' not in combined_data.columns.tolist():
|
453
|
-
# 数据序列化
|
454
390
|
serialized_data = self._serialize_data(combined_data)
|
455
391
|
self.redis_engine.hset(cache_key, "all", serialized_data)
|
456
392
|
self.redis_engine.expire(cache_key, self.cache_ttl)
|
457
393
|
else:
|
458
|
-
#
|
459
|
-
combined_data['
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
self.redis_engine.hset(cache_key, year_str, serialized_data)
|
394
|
+
# 按月分片存储
|
395
|
+
combined_data['month'] = combined_data['日期'].dt.to_period('M').dt.strftime("%Y%m")
|
396
|
+
for month_str, group in combined_data.groupby('month'):
|
397
|
+
group = group.drop(columns=['month'])
|
398
|
+
serialized_data = self._serialize_data(group)
|
399
|
+
self.redis_engine.hset(cache_key, month_str, serialized_data)
|
465
400
|
self.redis_engine.expire(cache_key, self.cache_ttl)
|
466
401
|
logger.info(f"缓存更新 {cache_key} | 数据量: {len(combined_data)}")
|
467
402
|
except Exception as e:
|
@@ -474,7 +409,6 @@ class RedisDataHash(object):
|
|
474
409
|
start_date,
|
475
410
|
end_date
|
476
411
|
) -> pd.DataFrame:
|
477
|
-
"""执行MySQL查询并返回DataFrame(带异常处理)"""
|
478
412
|
try:
|
479
413
|
return self.download.data_to_df(
|
480
414
|
db_name=db_name,
|
@@ -484,23 +418,14 @@ class RedisDataHash(object):
|
|
484
418
|
projection={}
|
485
419
|
)
|
486
420
|
except Exception as e:
|
487
|
-
logger.
|
421
|
+
logger.error(f"MySQL 查询异常 {db_name}.{table_name}: {e}")
|
488
422
|
return pd.DataFrame()
|
489
423
|
|
490
|
-
def _fetch_redis_data(self, cache_key: str,
|
491
|
-
"""
|
492
|
-
从Redis哈希表读取数据
|
493
|
-
|
494
|
-
优化策略:
|
495
|
-
- 当指定年份范围时,仅获取相关字段(hmget)
|
496
|
-
- 未指定范围时全量获取(hgetall)
|
497
|
-
-- 从mysql过来的表,虽然没有日期列,但也指定了 start_year/end_year,再redis中存储的键名是"all",所以要把 all也加进去
|
498
|
-
"""
|
424
|
+
def _fetch_redis_data(self, cache_key: str, months: list = None) -> pd.DataFrame:
|
499
425
|
try:
|
500
|
-
if
|
501
|
-
|
502
|
-
fields
|
503
|
-
fields += ['all']
|
426
|
+
if months is not None:
|
427
|
+
fields = months.copy()
|
428
|
+
fields.append('all')
|
504
429
|
data_list = self.redis_engine.hmget(cache_key, fields)
|
505
430
|
dfs = []
|
506
431
|
for data, field in zip(data_list, fields):
|
@@ -510,7 +435,6 @@ class RedisDataHash(object):
|
|
510
435
|
dfs.append(df)
|
511
436
|
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
512
437
|
else:
|
513
|
-
# 全量获取模式
|
514
438
|
data_dict = self.redis_engine.hgetall(cache_key)
|
515
439
|
dfs = []
|
516
440
|
for field, data in data_dict.items():
|
@@ -519,20 +443,18 @@ class RedisDataHash(object):
|
|
519
443
|
df = self._convert_date_columns(df)
|
520
444
|
dfs.append(df)
|
521
445
|
except Exception as e:
|
522
|
-
logger.
|
446
|
+
logger.error(f"Redis 数据解析失败 {cache_key} 字段 {field}: {e}")
|
523
447
|
return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
524
448
|
except Exception as e:
|
525
|
-
logger.
|
449
|
+
logger.error(f"Redis 数据获取失败 {cache_key}: {e}")
|
526
450
|
return pd.DataFrame()
|
527
451
|
|
528
452
|
def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
|
529
|
-
"""统一日期列格式转换"""
|
530
453
|
if "日期" in df.columns:
|
531
454
|
df["日期"] = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
|
532
455
|
return df
|
533
456
|
|
534
457
|
def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
|
535
|
-
"""生成缓存键名"""
|
536
458
|
return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"
|
537
459
|
|
538
460
|
def _filter_by_date_range(
|
@@ -541,7 +463,6 @@ class RedisDataHash(object):
|
|
541
463
|
start_dt: datetime.datetime,
|
542
464
|
end_dt: datetime.datetime
|
543
465
|
) -> pd.DataFrame:
|
544
|
-
"""按日期范围精确过滤数据"""
|
545
466
|
if "日期" not in df.columns:
|
546
467
|
return df
|
547
468
|
date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
|
@@ -557,7 +478,6 @@ class RedisDataHash(object):
|
|
557
478
|
end_date: str,
|
558
479
|
existing_data: pd.DataFrame
|
559
480
|
):
|
560
|
-
"""启动异步线程执行缓存更新(不阻塞主流程)"""
|
561
481
|
thread = threading.Thread(
|
562
482
|
target=self.set_redis,
|
563
483
|
args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
|
@@ -566,17 +486,14 @@ class RedisDataHash(object):
|
|
566
486
|
thread.start()
|
567
487
|
|
568
488
|
def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
|
569
|
-
"""合并新旧数据集策略:保留现有数据中在新数据范围外的历史数据,并按日期排序"""
|
570
489
|
if existing_data.empty or "日期" not in existing_data.columns:
|
571
490
|
return new_data
|
572
491
|
new_data["日期"] = pd.to_datetime(new_data["日期"])
|
573
492
|
existing_data["日期"] = pd.to_datetime(existing_data["日期"])
|
574
493
|
|
575
|
-
# 计算新数据日期范围
|
576
494
|
new_min = new_data["日期"].min()
|
577
495
|
new_max = new_data["日期"].max()
|
578
496
|
|
579
|
-
# 保留现有数据中在新数据范围之外的部分
|
580
497
|
valid_historical = existing_data[
|
581
498
|
(existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
|
582
499
|
]
|
@@ -585,53 +502,35 @@ class RedisDataHash(object):
|
|
585
502
|
return merged_data
|
586
503
|
|
587
504
|
def _serialize_data(self, df: pd.DataFrame) -> bytes:
|
588
|
-
"""
|
589
|
-
高性能数据序列化方法
|
590
|
-
|
591
|
-
处理要点:
|
592
|
-
1. 日期类型转换为字符串
|
593
|
-
2. Decimal类型转换为浮点数
|
594
|
-
3. NaN值统一转换为None
|
595
|
-
4. 优化JSON序列化性能
|
596
|
-
"""
|
597
505
|
if df.empty:
|
598
506
|
return json.dumps([], ensure_ascii=False).encode("utf-8")
|
599
507
|
temp_df = df.copy()
|
600
508
|
|
601
|
-
# 处理日期类型列(安全转换)
|
602
509
|
date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
|
603
510
|
for col in date_cols:
|
604
|
-
# 处理全NaT列避免类型错误
|
605
511
|
if temp_df[col].isna().all():
|
606
|
-
temp_df[col] = temp_df[col].astype(object)
|
512
|
+
temp_df[col] = temp_df[col].astype(object)
|
607
513
|
temp_df[col] = (
|
608
514
|
temp_df[col]
|
609
|
-
.dt.strftime("%Y-%m-%d")
|
515
|
+
.dt.strftime("%Y-%m-%d")
|
610
516
|
.where(temp_df[col].notna(), None)
|
611
517
|
)
|
612
518
|
|
613
|
-
# 统一空值处理(保护全None列类型)
|
614
519
|
def safe_null_convert(series):
|
615
|
-
"""保留全None列的原始dtype"""
|
616
520
|
if series.isna().all():
|
617
521
|
return series.astype(object).where(pd.notnull(series), None)
|
618
522
|
return series.where(pd.notnull(series), None)
|
619
523
|
|
620
524
|
temp_df = temp_df.apply(safe_null_convert)
|
621
525
|
|
622
|
-
# 类型处理函数(增强嵌套结构处理)
|
623
526
|
def decimal_serializer(obj):
|
624
|
-
"""递归序列化处理"""
|
625
|
-
# 提前处理None值
|
626
527
|
if obj is None:
|
627
528
|
return None
|
628
|
-
|
629
|
-
# 按类型分发处理
|
630
529
|
if isinstance(obj, Decimal):
|
631
530
|
return round(float(obj), 6)
|
632
531
|
elif isinstance(obj, pd.Timestamp):
|
633
|
-
return obj.strftime("%Y-%m-%d %H:%M:%S")
|
634
|
-
elif isinstance(obj, np.generic):
|
532
|
+
return obj.strftime("%Y-%m-%d %H:%M:%S")
|
533
|
+
elif isinstance(obj, np.generic):
|
635
534
|
return obj.item()
|
636
535
|
elif isinstance(obj, (datetime.date, datetime.datetime)):
|
637
536
|
return obj.isoformat()
|
@@ -640,11 +539,10 @@ class RedisDataHash(object):
|
|
640
539
|
elif isinstance(obj, dict):
|
641
540
|
return {decimal_serializer(k): decimal_serializer(v) for k, v in obj.items()}
|
642
541
|
elif isinstance(obj, bytes):
|
643
|
-
return obj.decode("utf-8", errors="replace")
|
644
|
-
elif isinstance(obj, pd.Series):
|
542
|
+
return obj.decode("utf-8", errors="replace")
|
543
|
+
elif isinstance(obj, pd.Series):
|
645
544
|
return obj.to_list()
|
646
545
|
else:
|
647
|
-
# 尝试直接转换可序列化类型
|
648
546
|
try:
|
649
547
|
json.dumps(obj)
|
650
548
|
return obj
|
@@ -652,18 +550,15 @@ class RedisDataHash(object):
|
|
652
550
|
logger.error(f"无法序列化类型 {type(obj)}: {str(obj)}")
|
653
551
|
raise
|
654
552
|
|
655
|
-
# 序列化前防御性检查
|
656
553
|
try:
|
657
554
|
data_records = temp_df.to_dict(orient="records")
|
658
555
|
except Exception as e:
|
659
556
|
logger.error(f"数据转换字典失败: {str(e)}")
|
660
557
|
raise
|
661
558
|
|
662
|
-
# 空记录特殊处理
|
663
559
|
if not data_records:
|
664
560
|
return json.dumps([], ensure_ascii=False).encode("utf-8")
|
665
561
|
|
666
|
-
# 执行序列化
|
667
562
|
try:
|
668
563
|
return json.dumps(
|
669
564
|
data_records,
|
@@ -0,0 +1,710 @@
|
|
1
|
+
# -*- coding: UTF-8 –*-
|
2
|
+
import os.path
|
3
|
+
import redis
|
4
|
+
import socket
|
5
|
+
from mdbq.mysql import s_query
|
6
|
+
from mdbq.config import myconfig
|
7
|
+
import pandas as pd
|
8
|
+
import json
|
9
|
+
import datetime
|
10
|
+
import threading
|
11
|
+
import logging
|
12
|
+
from logging.handlers import RotatingFileHandler
|
13
|
+
import getpass
|
14
|
+
import platform
|
15
|
+
from decimal import Decimal
|
16
|
+
|
17
|
+
# Per-user Downloads directory; the rotating log file below lives under it.
if platform.system() == 'Windows':
    D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
else:
    D_PATH = os.path.join(f'/Users/{getpass.getuser()}/Downloads')


# Load the project configuration once and pick the MySQL profile by host name:
# 'company' / 'Mac2.local' use the 'company' profile, every other host uses 'xigua_lx'.
conf = myconfig.main()
if socket.gethostname() in ('company', 'Mac2.local'):
    conf_data = conf['Windows']['company']['mysql']['local']
else:
    conf_data = conf['Windows']['xigua_lx']['mysql']['local']
username = conf_data['username']
password = conf_data['password']
host = conf_data['host']
port = conf_data['port']
# Redis always uses the local 'company' entry, identical on every machine.
redis_password = conf['Windows']['company']['redis']['local']['password']

logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')

# Logger for this module.
logger = logging.getLogger(__name__)

# Make sure the log directory exists before attaching the file handler.
log_file = os.path.join(D_PATH, 'logfile', 'redis.log')
if not os.path.isdir(os.path.dirname(log_file)):
    os.makedirs(os.path.dirname(log_file))

# Rotating file handler: 3 MiB per file, 10 backups kept, INFO and above.
formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
file_handler = RotatingFileHandler(log_file, maxBytes=3 * 1024 * 1024, backupCount=10)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)

# Attach the file handler to the module logger.
logger.addHandler(file_handler)
|
58
|
+
|
59
|
+
|
60
|
+
class RedisData(object):
    """
    Cache MySQL query results in Redis as one string value per table.

    The cached value is a JSON array of records stored under the key built by
    ``_generate_cache_key``. Reads fall back to MySQL whenever the cached value
    is missing, close to expiry, or yields no rows for the requested dates.
    """

    def __init__(self, redis_engine, download, cache_ttl: int):
        """
        :param redis_engine: Redis client; ``ttl``, ``get``, ``set`` and ``expire`` are called on it
        :param download: MySQL reader; ``data_to_df`` is called on it
        :param cache_ttl: cache lifetime in minutes (kept internally in seconds)
        """
        self.redis_engine = redis_engine
        self.download = download
        self.cache_ttl = cache_ttl * 60

    def get_from_mysql(
            self,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """
        Read the requested range from MySQL and return it as one DataFrame.

        :param set_year: when True the data is split into ``<table_name>_<year>``
            tables, queried for every year from 2024 up to the current year
        :return: concatenated rows; an empty DataFrame when nothing was read
        """
        if set_year:
            current_year = datetime.datetime.today().year
            tables = [f"{table_name}_{year}" for year in range(2024, current_year + 1)]
        else:
            tables = [table_name]

        dfs = []
        for name in tables:
            df = self._fetch_table_data(db_name, name, start_date, end_date)
            if df is not None:
                dfs.append(df)

        combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
        if combined_df.empty:
            logger.warning(f"警告: {db_name}.{table_name} 未读取到数据")
        else:
            combined_df = self._convert_date_columns(combined_df)
        return combined_df

    def get_from_redis(
            self,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """
        Return cached rows for the date range, falling back to MySQL.

        A background refresh is started and MySQL is queried directly when the
        key's TTL is below 60 seconds, the cached value is empty, or the cached
        rows do not intersect the requested range.
        """
        # Compare at day granularity: cached dates are parsed from "%Y-%m-%d",
        # so a bound carrying a time of day would exclude rows of that same day.
        start_dt = self._floor_to_day(start_date)
        end_dt = self._floor_to_day(end_date)
        cache_key = self._generate_cache_key(db_name, table_name, set_year)

        try:
            ttl = self.redis_engine.ttl(cache_key)
            cache_data = self._fetch_redis_data(cache_key)
        except Exception as e:
            logger.error(f"Redis 连接异常: {e},直接访问 MySQL")
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

        if ttl >= 60 and not cache_data.empty:
            filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
            if not filtered_df.empty:
                return filtered_df

        # Cache is stale, empty, or does not cover the request: refresh it in
        # the background and answer this call from MySQL.
        self._trigger_async_cache_update(
            cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
        )
        return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

    def set_redis(
            self,
            cache_key: str,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date,
            end_date,
            existing_data: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Refresh the cache entry: query MySQL, merge with ``existing_data``, store.

        :return: the merged data that was written, or an empty DataFrame when
            MySQL returned nothing or the update failed
        """
        try:
            new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
            if new_data.empty:
                return pd.DataFrame()

            combined_data = self._merge_data(new_data, existing_data)

            serialized_data = self._serialize_data(combined_data)
            self.redis_engine.set(cache_key, serialized_data)
            self.redis_engine.expire(cache_key, self.cache_ttl)

            logger.info(f"缓存更新 {cache_key} | 数据量: {len(combined_data)}")
            return combined_data

        except Exception as e:
            logger.error(f"缓存更新失败: {cache_key} - {str(e)}")
            return pd.DataFrame()

    # Helper Methods ------------------------------------------------

    @staticmethod
    def _floor_to_day(value):
        """Parse ``value`` with pandas and truncate a real timestamp to midnight."""
        parsed = pd.to_datetime(value)
        # None / NaT pass through unchanged so missing bounds behave as before.
        return parsed.floor("D") if isinstance(parsed, pd.Timestamp) else parsed

    def _fetch_table_data(
            self,
            db_name: str,
            table_name: str,
            start_date,
            end_date
    ) -> pd.DataFrame:
        """Query one MySQL table; return an empty DataFrame when the query raises."""
        try:
            return self.download.data_to_df(
                db_name=db_name,
                table_name=table_name,
                start_date=start_date,
                end_date=end_date,
                projection={}
            )
        except Exception as e:
            logger.error(f"MySQL 查询异常 {db_name}.{table_name}: {e}")
            return pd.DataFrame()

    def _fetch_redis_data(self, cache_key: str) -> pd.DataFrame:
        """Read and decode the cached JSON; return an empty DataFrame on a miss or a parse error."""
        try:
            data = self.redis_engine.get(cache_key)
            if not data:
                return pd.DataFrame()
            df = pd.DataFrame(json.loads(data.decode("utf-8")))
            return self._convert_date_columns(df)
        except Exception as e:
            logger.error(f"Redis 数据解析失败 {cache_key}: {e}")
            return pd.DataFrame()

    def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Convert the "日期" column (if present) to datetime in place; bad values become NaT."""
        if "日期" in df.columns:
            df["日期"] = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
        return df

    def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
        """Build the Redis key; year-split tables get a ``_haveyear`` suffix."""
        return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"

    def _filter_by_date_range(
            self,
            df: pd.DataFrame,
            start_dt: datetime.datetime,
            end_dt: datetime.datetime
    ) -> pd.DataFrame:
        """Keep rows whose "日期" lies in [start_dt, end_dt]; frames without that column pass through."""
        if "日期" not in df.columns:
            return df
        date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
        return df[date_mask].copy()

    def _trigger_async_cache_update(
            self,
            cache_key: str,
            db_name: str,
            table_name: str,
            set_year: bool,
            start_date: str,
            end_date: str,
            existing_data: pd.DataFrame
    ):
        """Run ``set_redis`` on a daemon thread so the caller is not blocked."""
        thread = threading.Thread(
            target=self.set_redis,
            args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
            daemon=True
        )
        thread.start()

    def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
        """
        Combine fresh rows with cached rows that fall outside the fresh date range.

        Cached rows inside [min, max] of the new dates are replaced by the new
        rows. Rows that share a date are all kept.
        """
        if (
            existing_data.empty
            or "日期" not in existing_data.columns
            or "日期" not in new_data.columns
        ):
            return new_data

        new_min = new_data["日期"].min()
        new_max = new_data["日期"].max()
        valid_historical = existing_data[
            (existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
        ]
        # The two parts cover disjoint date ranges, so no de-duplication is needed;
        # de-duplicating on the date alone would keep a single row per day.
        return pd.concat([new_data, valid_historical], ignore_index=True)

    def _serialize_data(self, df: pd.DataFrame) -> str:
        """Serialize to a JSON records string, rendering datetime columns as YYYY-MM-DD."""
        temp_df = df.copy()
        date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
        for col in date_cols:
            temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d")
        return temp_df.to_json(orient="records", force_ascii=False)
|
273
|
+
|
274
|
+
|
275
|
+
class RedisDataHash(object):
|
276
|
+
"""
|
277
|
+
存储 hash
|
278
|
+
Redis缓存与MySQL数据联合查询处理器
|
279
|
+
|
280
|
+
功能特性:
|
281
|
+
- 支持带年份分表的MySQL数据查询
|
282
|
+
- 多级缓存策略(内存缓存+Redis缓存)
|
283
|
+
- 异步缓存更新机制
|
284
|
+
- 自动处理日期范围和数据类型转换
|
285
|
+
"""
|
286
|
+
|
287
|
+
def __init__(self, redis_engine, download, cache_ttl: int):
|
288
|
+
"""
|
289
|
+
初始化缓存处理器
|
290
|
+
|
291
|
+
:param redis_engine: Redis连接实例
|
292
|
+
:param download: 数据下载处理器(需实现data_to_df方法)
|
293
|
+
:param cache_ttl: 缓存存活时间(单位:分钟,内部转换为秒存储)
|
294
|
+
"""
|
295
|
+
self.redis_engine = redis_engine
|
296
|
+
self.download = download
|
297
|
+
self.cache_ttl = cache_ttl * 60 # 转换为秒存储
|
298
|
+
|
299
|
+
def get_from_mysql(
|
300
|
+
self,
|
301
|
+
db_name: str,
|
302
|
+
table_name: str,
|
303
|
+
set_year: bool,
|
304
|
+
start_date,
|
305
|
+
end_date
|
306
|
+
) -> pd.DataFrame:
|
307
|
+
"""
|
308
|
+
从MySQL直接获取数据的核心方法
|
309
|
+
|
310
|
+
处理逻辑:
|
311
|
+
1. 当启用年份分表时(set_year=True),自动遍历2024到当前年份的所有分表
|
312
|
+
2. 合并所有符合条件的数据表内容
|
313
|
+
3. 自动处理日期列格式转换
|
314
|
+
|
315
|
+
:return: 合并后的DataFrame(可能包含多个分表数据)
|
316
|
+
"""
|
317
|
+
# 原有实现保持不变
|
318
|
+
dfs = []
|
319
|
+
if set_year:
|
320
|
+
# 处理年份分表情况(例如 table_2024, table_2025...)
|
321
|
+
current_year = datetime.datetime.today().year
|
322
|
+
for year in range(2024, current_year + 1):
|
323
|
+
df = self._fetch_table_data(
|
324
|
+
db_name, f"{table_name}_{year}", start_date, end_date
|
325
|
+
)
|
326
|
+
if df is not None:
|
327
|
+
dfs.append(df)
|
328
|
+
else:
|
329
|
+
# 单表查询模式
|
330
|
+
df = self._fetch_table_data(db_name, table_name, start_date, end_date)
|
331
|
+
if df is not None:
|
332
|
+
dfs.append(df)
|
333
|
+
|
334
|
+
# 合并结果并处理空数据情况
|
335
|
+
combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
|
336
|
+
if combined_df.empty:
|
337
|
+
logger.warn(f"warning: {db_name}.{table_name} 未读取到数据")
|
338
|
+
else:
|
339
|
+
combined_df = self._convert_date_columns(combined_df)
|
340
|
+
return combined_df
|
341
|
+
|
342
|
+
def get_from_redis(
        self,
        db_name: str,
        table_name: str,
        set_year: bool,
        start_date,
        end_date
) -> pd.DataFrame:
    """
    Cache-aware entry point: serve from the Redis hash when possible, otherwise from MySQL.

    Flow:
    1. Build the cache key and read its TTL.
    2. TTL below 60 seconds: start a background cache refresh and return a MySQL query.
    3. Otherwise read the hash fields for the requested years and filter by date.
    4. Cached data empty or not reaching back to the start date: start a background
       refresh and return a MySQL query.
    5. Any exception raised inside the Redis path: log it and return a MySQL query.

    :param db_name: database name
    :param table_name: table name (base name when ``set_year`` is True)
    :param set_year: passed on to the key builder and the MySQL reader
    :param start_date: range start; parsed with ``pd.to_datetime``
    :param end_date: range end; parsed with ``pd.to_datetime``
    :return: cached rows when they satisfy the request, else the MySQL result
    """
    # Reset the time of day to 00:00:00. This is a nasty pitfall and must not be removed.
    start_dt = pd.to_datetime(start_date).floor('D')
    end_dt = pd.to_datetime(end_date).floor('D')
    # Build the cache key.
    cache_key = self._generate_cache_key(db_name, table_name, set_year)

    try:
        # Check the cache lifetime.
        ttl = self.redis_engine.ttl(cache_key)
        if ttl < 60:  # less than one minute left: refresh
            # Read the whole current cache so the refresh can merge with it.
            cache_data = self._fetch_redis_data(cache_key)
            # Refresh the cache in the background.
            self._trigger_async_cache_update(
                cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
            )
            # Answer this call from MySQL right away.
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

        # Read only the year fields covering the request (cheaper for large caches).
        start_year = start_dt.year
        end_year = end_dt.year
        cache_data = self._fetch_redis_data(cache_key, start_year, end_year)
        # Empty result: cache miss.
        if cache_data.empty:
            self._trigger_async_cache_update(
                cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
            )
            return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
        # Narrow the year-granular cache down to the requested range.
        filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
        if not filtered_df.empty:
            if '日期' in filtered_df.columns.tolist():
                # Serve from cache only when the earliest cached date is not later
                # than the requested start date.
                exsit_min_date = filtered_df['日期'].min()
                if exsit_min_date <= start_dt:
                    return filtered_df
            else:
                # No date column: nothing to check coverage against, return as is.
                # NOTE(review): pairing of this else with the column check is
                # reconstructed from context; the source view lost its indentation.
                return filtered_df
        # Cached data is incomplete: refresh the cache in the background.
        self._trigger_async_cache_update(
            cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
        )
        # Answer this call from MySQL right away.
        return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)

    except Exception as e:
        # On any failure fall back to MySQL so the caller still gets data.
        logger.error(f"Redis 连接异常: {e},直接访问 MySQL")
        return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
|
415
|
+
|
416
|
+
def set_redis(
        self,
        cache_key: str,
        db_name: str,
        table_name: str,
        set_year: bool,
        start_date,
        end_date,
        existing_data: pd.DataFrame
) -> None:
    """
    Refresh the Redis hash for ``cache_key`` (run on a background thread).

    Steps:
    1. Query MySQL for the requested range; stop when nothing is returned.
    2. Merge the new rows with ``existing_data`` via ``_merge_data``.
    3. Store the result:
       - no "日期" column: the whole frame goes into the hash field "all";
       - otherwise: one hash field per calendar year, named by the year.
    4. Reset the key's expiry to ``self.cache_ttl`` seconds.

    Any exception is logged and swallowed; nothing is returned.
    """
    try:
        new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
        if new_data.empty:
            return

        combined_data = self._merge_data(new_data, existing_data)
        if combined_data.empty:
            return

        if '日期' not in combined_data.columns.tolist():
            serialized_data = self._serialize_data(combined_data)
            self.redis_engine.hset(cache_key, "all", serialized_data)
        else:
            # Group on a separate Series rather than a temporary column, so a
            # real column of the data is never overwritten or dropped.
            years = combined_data['日期'].dt.year
            for year, group in combined_data.groupby(years):
                # .dt.year turns float when any date is NaT; int() keeps the
                # field name "2024" instead of "2024.0".
                serialized_data = self._serialize_data(group)
                self.redis_engine.hset(cache_key, str(int(year)), serialized_data)
        self.redis_engine.expire(cache_key, self.cache_ttl)
        logger.info(f"缓存更新 {cache_key} | 数据量: {len(combined_data)}")
    except Exception as e:
        logger.error(f"缓存更新失败: {cache_key} - {str(e)}")
|
469
|
+
|
470
|
+
def _fetch_table_data(
|
471
|
+
self,
|
472
|
+
db_name: str,
|
473
|
+
table_name: str,
|
474
|
+
start_date,
|
475
|
+
end_date
|
476
|
+
) -> pd.DataFrame:
|
477
|
+
"""执行MySQL查询并返回DataFrame(带异常处理)"""
|
478
|
+
try:
|
479
|
+
return self.download.data_to_df(
|
480
|
+
db_name=db_name,
|
481
|
+
table_name=table_name,
|
482
|
+
start_date=start_date,
|
483
|
+
end_date=end_date,
|
484
|
+
projection={}
|
485
|
+
)
|
486
|
+
except Exception as e:
|
487
|
+
logger.info(f"MySQL 查询异常 {db_name}.{table_name}: {e}")
|
488
|
+
return pd.DataFrame()
|
489
|
+
|
490
|
+
def _fetch_redis_data(self, cache_key: str, start_year: int = None, end_year: int = None) -> pd.DataFrame:
    """
    Read the cached shards stored in the Redis hash ``cache_key``.

    Read strategy:
    - When both ``start_year`` and ``end_year`` are given, only the hash
      fields for those years are requested (``hmget``). The ``"all"`` field
      is always requested as well, because tables without a date column are
      stored under ``"all"`` even though callers still pass a year range.
    - Otherwise every field of the hash is read (``hgetall``).

    Each shard is parsed independently: a shard that cannot be decoded or
    parsed is logged and skipped instead of discarding the shards that did
    parse. Payloads may be ``bytes`` or already-decoded ``str``.

    :param cache_key: name of the Redis hash
    :param start_year: first year (inclusive) to read, or None for a full read
    :param end_year: last year (inclusive) to read, or None for a full read
    :return: concatenation of all readable shards; an empty DataFrame when
        nothing is cached or the Redis call itself fails
    """
    try:
        if start_year is not None and end_year is not None:
            fields = [str(y) for y in range(start_year, end_year + 1)]
            fields.append('all')
            payloads = zip(fields, self.redis_engine.hmget(cache_key, fields))
        else:
            payloads = self.redis_engine.hgetall(cache_key).items()

        dfs = []
        for field, data in payloads:
            if not data:
                # hmget yields None for fields that are not in the hash.
                continue
            try:
                text = data.decode("utf-8") if isinstance(data, (bytes, bytearray)) else data
                df = pd.DataFrame(json.loads(text))
                dfs.append(self._convert_date_columns(df))
            except Exception as e:
                logger.info(f"Redis 数据解析失败 {cache_key} 字段 {field}: {e}")
        return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
    except Exception as e:
        logger.info(f"Redis 数据获取失败 {cache_key}: {e}")
        return pd.DataFrame()
|
527
|
+
|
528
|
+
def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Parse the "日期" column, if present, into datetimes in place.

    Values that do not match ``%Y-%m-%d`` become NaT. The same DataFrame
    object that was passed in is returned.
    """
    if "日期" not in df.columns:
        return df
    parsed_dates = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
    df["日期"] = parsed_dates
    return df
|
533
|
+
|
534
|
+
def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
    """
    Build the Redis key for a table: ``"<db_name>:<table_name>"``, with a
    ``"_haveyear"`` suffix appended when ``set_year`` is truthy.
    """
    suffix = "_haveyear" if set_year else ""
    return f"{db_name}:{table_name}{suffix}"
|
537
|
+
|
538
|
+
def _filter_by_date_range(
        self,
        df: pd.DataFrame,
        start_dt: datetime.datetime,
        end_dt: datetime.datetime
) -> pd.DataFrame:
    """
    Keep only the rows whose "日期" lies within ``[start_dt, end_dt]``.

    Both bounds are inclusive. A frame without a "日期" column is returned
    as-is (the same object); otherwise a copy of the matching rows is
    returned.
    """
    if "日期" not in df.columns:
        return df
    in_range = df["日期"].between(start_dt, end_dt)
    return df.loc[in_range].copy()
|
549
|
+
|
550
|
+
def _trigger_async_cache_update(
        self,
        cache_key: str,
        db_name: str,
        table_name: str,
        set_year: bool,
        start_date: str,
        end_date: str,
        existing_data: pd.DataFrame
):
    """
    Start ``self.set_redis`` on a daemon thread and return immediately.

    All arguments are forwarded positionally, in the order received, so the
    caller is not blocked while the cache refresh runs.
    """
    forwarded = (cache_key, db_name, table_name, set_year, start_date, end_date, existing_data)
    # daemon=True: the worker does not keep the interpreter alive at exit.
    worker = threading.Thread(target=self.set_redis, args=forwarded, daemon=True)
    worker.start()
|
567
|
+
|
568
|
+
def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
    """
    Merge freshly queried rows with the rows already cached.

    Cached rows whose "日期" falls inside the date span covered by
    ``new_data`` are superseded; cached rows outside that span are kept.
    The result is sorted by "日期" in descending order.

    Neither input frame is modified: the date columns are normalised on
    copies, because ``existing_data`` may still be in use by the caller
    while the cache refresh runs on another thread.

    :param new_data: rows just fetched from MySQL
    :param existing_data: rows previously read from the cache
    :return: ``new_data`` itself when there is no usable cached data,
        otherwise a new merged and sorted DataFrame
    :raises KeyError: if cached data has a "日期" column but ``new_data``
        does not
    """
    if existing_data.empty or "日期" not in existing_data.columns:
        return new_data

    # assign() returns new frames, leaving the caller's objects untouched.
    fresh = new_data.assign(**{"日期": pd.to_datetime(new_data["日期"])})
    cached = existing_data.assign(**{"日期": pd.to_datetime(existing_data["日期"])})

    new_min = fresh["日期"].min()
    new_max = fresh["日期"].max()

    if pd.isna(new_min) or pd.isna(new_max):
        # No valid date in the new data, so it covers no span and nothing in
        # the cache is superseded. Comparing against NaT would otherwise
        # drop every cached row.
        valid_historical = cached
    else:
        outside_span = (cached["日期"] < new_min) | (cached["日期"] > new_max)
        valid_historical = cached[outside_span]

    merged_data = pd.concat([fresh, valid_historical], ignore_index=True)
    return merged_data.sort_values(['日期'], ascending=[False], ignore_index=True)
|
586
|
+
|
587
|
+
def _serialize_data(self, df: pd.DataFrame) -> bytes:
    """
    Serialize a DataFrame to UTF-8 encoded JSON (a list of row objects).

    Conversion rules:
    1. Timezone-naive datetime columns (any resolution) are written as
       ``YYYY-MM-DD`` strings.
    2. ``Decimal`` values are written as floats rounded to 6 places.
    3. Missing values (NaN / NaT / None / pd.NA) are written as JSON
       ``null``, so the payload never contains a bare ``NaN`` token.
    4. Remaining non-JSON types go through a fallback converter.

    :param df: frame to serialize; it is not modified
    :return: UTF-8 encoded JSON; ``b"[]"`` for an empty frame
    :raises TypeError: if a value cannot be converted to JSON
    """
    if df.empty:
        return json.dumps([], ensure_ascii=False).encode("utf-8")
    temp_df = df.copy()

    # The .dt accessor works on a datetime column even when every value is
    # NaT, so no special-casing (and no cast to object) is needed first.
    for name in temp_df.columns:
        if pd.api.types.is_datetime64_dtype(temp_df[name]):
            temp_df[name] = temp_df[name].dt.strftime("%Y-%m-%d")

    # Replace missing values with None. The column is cast to object first:
    # on a numeric dtype, where(..., None) would put NaN back.
    for name in temp_df.columns:
        column = temp_df[name]
        missing = column.isna()
        if missing.any():
            temp_df[name] = column.astype(object).where(~missing, None)

    def decimal_serializer(obj):
        """Fallback for values the json module cannot encode natively."""
        if obj is None:
            return None

        if isinstance(obj, Decimal):
            return round(float(obj), 6)
        elif isinstance(obj, pd.Timestamp):
            # Timestamps that were not in a tz-naive datetime column.
            return obj.strftime("%Y-%m-%d %H:%M:%S")
        elif isinstance(obj, np.generic):
            return obj.item()
        elif isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        elif isinstance(obj, (list, tuple, set)):
            return [decimal_serializer(item) for item in obj]
        elif isinstance(obj, dict):
            return {decimal_serializer(k): decimal_serializer(v) for k, v in obj.items()}
        elif isinstance(obj, bytes):
            return obj.decode("utf-8", errors="replace")
        elif isinstance(obj, pd.Series):
            return obj.to_list()
        else:
            try:
                json.dumps(obj)
                return obj
            except TypeError:
                logger.error(f"无法序列化类型 {type(obj)}: {str(obj)}")
                raise

    try:
        data_records = temp_df.to_dict(orient="records")
    except Exception as e:
        logger.error(f"数据转换字典失败: {str(e)}")
        raise

    try:
        return json.dumps(
            data_records,
            ensure_ascii=False,
            default=decimal_serializer
        ).encode("utf-8")
    except TypeError as e:
        logger.error(f"序列化失败,请检查未处理的数据类型: {str(e)}")
        raise
|
676
|
+
|
677
|
+
|
678
|
+
if __name__ == '__main__':
    # Usage sketch, kept commented out. NOTE(review): the constructor call
    # below is reproduced as found; confirm the class name and arguments
    # against the current class definitions before enabling it.
    #
    # # --- one-time setup, done once by the caller ---
    # redis_config = {
    #     'host': '127.0.0.1',
    #     'port': 6379,  # default Redis port
    #     'db': 0,  # default Redis database index
    #     # 'username': 'default',
    #     'password': redis_password,
    # }
    # # Redis client
    # r = redis.Redis(**redis_config)
    # # MySQL query handler
    # d = s_query.QueryDatas(username=username, password=password, host=host, port=port)
    # # Hand both clients to the cache engine
    # m = RedisData(redis_engin=r, download=d)
    #
    # # --- fetch data, served from the cache when possible ---
    # db_name = '聚合数据'
    # table_name = '多店推广场景_按日聚合'
    # set_year = False
    # df = m.get_from_redis(
    #     db_name=db_name,
    #     table_name=table_name,
    #     set_year=set_year,
    #     start_date='2025-01-01',
    #     end_date='2025-01-31'
    # )
    # logger.info(df)

    # The only live statement: log the name of the host running the script.
    host_name = socket.gethostname()
    logger.info(host_name)
|
@@ -34,10 +34,11 @@ mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,239
|
|
34
34
|
mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
|
35
35
|
mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,7192
|
36
36
|
mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
37
|
-
mdbq/redis/getredis.py,sha256=
|
37
|
+
mdbq/redis/getredis.py,sha256=pBgRyUrRmOlW-oXry3Hat9GahZgljvidNEDZJFn-geU,23932
|
38
|
+
mdbq/redis/getredis_优化hash.py,sha256=q7omKJCPw_6Zr_r6WwTv4RGSXzZzpLPkIaqJ22svJhE,29104
|
38
39
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
39
40
|
mdbq/spider/aikucun.py,sha256=v7VO5gtEXR6_4Q6ujbTyu1FHu7TXHcwSQ6hIO249YH0,22208
|
40
|
-
mdbq-3.6.
|
41
|
-
mdbq-3.6.
|
42
|
-
mdbq-3.6.
|
43
|
-
mdbq-3.6.
|
41
|
+
mdbq-3.6.10.dist-info/METADATA,sha256=D8ooXZMsVBNM_wbcXjE4xq2wHJU200gXHbEPkRpKioA,244
|
42
|
+
mdbq-3.6.10.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
|
43
|
+
mdbq-3.6.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
44
|
+
mdbq-3.6.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|