mdbq 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. mdbq/__init__.py +1 -0
  2. mdbq/__version__.py +3 -0
  3. mdbq/aggregation/__init__.py +4 -0
  4. mdbq/aggregation/aggregation_bak.py +1438 -0
  5. mdbq/aggregation/datashow_bak.py +1264 -0
  6. mdbq/aggregation/optimize_data.py +76 -0
  7. mdbq/aggregation/query_data.py +3869 -0
  8. mdbq/bdup/__init__.py +5 -0
  9. mdbq/bdup/bdup.py +111 -0
  10. mdbq/config/__init__.py +4 -0
  11. mdbq/config/default.py +131 -0
  12. mdbq/config/myconfig.py +32 -0
  13. mdbq/config/products.py +159 -0
  14. mdbq/config/set_support.py +25 -0
  15. mdbq/dataframe/__init__.py +4 -0
  16. mdbq/dataframe/converter.py +107 -0
  17. mdbq/log/__init__.py +4 -0
  18. mdbq/log/mylogger.py +66 -0
  19. mdbq/log/spider_logging.py +55 -0
  20. mdbq/mongo/__init__.py +4 -0
  21. mdbq/mongo/mongo.py +729 -0
  22. mdbq/mysql/__init__.py +4 -0
  23. mdbq/mysql/mysql.py +1784 -0
  24. mdbq/mysql/s_query.py +211 -0
  25. mdbq/mysql/year_month_day.py +38 -0
  26. mdbq/other/__init__.py +4 -0
  27. mdbq/other/download_sku_picture.py +985 -0
  28. mdbq/other/porxy.py +115 -0
  29. mdbq/other/pov_city.py +405 -0
  30. mdbq/other/sku_picture_bak.py +1081 -0
  31. mdbq/other/ua_sj.py +222 -0
  32. mdbq/pbix/__init__.py +4 -0
  33. mdbq/pbix/pbix_refresh.py +70 -0
  34. mdbq/pbix/refresh_all.py +158 -0
  35. mdbq/pbix/refresh_all_old.py +177 -0
  36. mdbq/redis/__init__.py +4 -0
  37. mdbq/redis/getredis.py +642 -0
  38. mdbq/spider/__init__.py +4 -0
  39. mdbq/spider/aikucun.py +494 -0
  40. {mdbq-3.7.5.dist-info → mdbq-3.7.6.dist-info}/METADATA +1 -1
  41. mdbq-3.7.6.dist-info/RECORD +43 -0
  42. mdbq-3.7.6.dist-info/top_level.txt +1 -0
  43. mdbq-3.7.5.dist-info/RECORD +0 -4
  44. mdbq-3.7.5.dist-info/top_level.txt +0 -1
  45. {mdbq-3.7.5.dist-info → mdbq-3.7.6.dist-info}/WHEEL +0 -0
mdbq/redis/getredis.py ADDED
@@ -0,0 +1,642 @@
+ # -*- coding: UTF-8 -*-
+ import os.path
+ import random
+ import redis
+ import socket
+ from mdbq.mysql import s_query
+ from mdbq.config import default
+ import pandas as pd
+ import numpy as np
+ import json
+ import datetime
+ import threading
+ import logging
+ from logging.handlers import RotatingFileHandler
+ import getpass
+ import platform
+ from decimal import Decimal
+ import orjson
+
+ if platform.system() == 'Windows':
+     D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
+ else:
+     D_PATH = os.path.join(f'/Users/{getpass.getuser()}/Downloads')
+
+ m_engine, username, password, host, port = default.get_mysql_engine(platform='Windows', hostname='xigua_lx', sql='mysql', local='remoto', config_file=None)
+
+ # Logger for the current module
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)  # without this the logger inherits WARNING and the logger.info() calls below are dropped
+
+ # File handler that writes log records to a rotating file
+ if not os.path.isdir(os.path.join(D_PATH, 'logfile')):
+     os.makedirs(os.path.join(D_PATH, 'logfile'))
+ log_file = os.path.join(D_PATH, 'logfile', 'redis.log')
+ file_handler = RotatingFileHandler(log_file, maxBytes=3 * 1024 * 1024, backupCount=10, encoding='utf-8')  # keep 10 backup files
+ file_handler.setLevel(logging.INFO)  # log level for the file handler
+
+ # Formatter attached to the file handler
+ formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+ file_handler.setFormatter(formatter)
+
+ # Attach the file handler to the logger
+ logger.addHandler(file_handler)
+
+
+ class RedisData(object):
+     """
+     Stores data as Redis strings.
+     """
+     def __init__(self, redis_engine, download, cache_ttl: int):
+         self.redis_engine = redis_engine  # Redis data engine
+         self.download = download  # MySQL query engine
+         self.cache_ttl = cache_ttl * 60  # cache expiry (in seconds)
+
+     def get_from_mysql(
+             self,
+             db_name: str,
+             table_name: str,
+             set_year: bool,
+             start_date,
+             end_date
+     ) -> pd.DataFrame:
+         """
+         Read data from MySQL and return a DataFrame
+
+         Args:
+             set_year: whether the table name carries a year suffix
+         """
+         dfs = []
+         if set_year:
+             current_year = datetime.datetime.today().year
+             for year in range(2024, current_year + 1):
+                 df = self._fetch_table_data(
+                     db_name, f"{table_name}_{year}", start_date, end_date
+                 )
+                 if df is not None:
+                     dfs.append(df)
+         else:
+             df = self._fetch_table_data(db_name, table_name, start_date, end_date)
+             if df is not None:
+                 dfs.append(df)
+
+         combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
+         if combined_df.empty:
+             logger.info(f"Warning: no data read from {db_name}.{table_name}")
+         else:
+             combined_df = self._convert_date_columns(combined_df)
+         return combined_df
+
+     def get_from_redis(
+             self,
+             db_name: str,
+             table_name: str,
+             set_year: bool,
+             start_date,
+             end_date
+     ) -> pd.DataFrame:
+         """
+         Fetch data from Redis; trigger an async update when the cache is expired or incomplete
+         """
+         start_dt = pd.to_datetime(start_date)
+         end_dt = pd.to_datetime(end_date)
+         cache_key = self._generate_cache_key(db_name, table_name, set_year)
+
+         # Try to read the cache metadata
+         try:
+             ttl = self.redis_engine.ttl(cache_key)
+             cache_data = self._fetch_redis_data(cache_key)
+         except Exception as e:
+             logger.error(f"Redis connection error: {e}, falling back to MySQL")
+             return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
+
+         # Cache invalidation handling
+         if ttl < 60 or cache_data.empty:
+             self._trigger_async_cache_update(
+                 cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
+             )
+             return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
+
+         # Serve from the valid cache
+         filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
+         if not filtered_df.empty:
+             return filtered_df
+
+         # Cached data does not cover the requested range
+         self._trigger_async_cache_update(
+             cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
+         )
+         return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
+
+     def set_redis(
+             self,
+             cache_key: str,
+             db_name: str,
+             table_name: str,
+             set_year: bool,
+             start_date,
+             end_date,
+             existing_data: pd.DataFrame
+     ) -> pd.DataFrame:
+         """
+         Update the Redis cache asynchronously, merging new and existing data
+         """
+         try:
+             # Fetch fresh data from MySQL
+             new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
+             if new_data.empty:
+                 return pd.DataFrame()
+
+             # Merge with historical data
+             combined_data = self._merge_data(new_data, existing_data)
+
+             # Serialize and store in Redis
+             serialized_data = self._serialize_data(combined_data)
+             self.redis_engine.set(cache_key, serialized_data)
+             self.redis_engine.expire(cache_key, self.cache_ttl)
+
+             logger.info(f"Cache updated {cache_key} | rows: {len(combined_data)}")
+             return combined_data
+
+         except Exception as e:
+             logger.error(f"Cache update failed: {cache_key} - {str(e)}")
+             return pd.DataFrame()
+
+     # Helper Methods ------------------------------------------------
+
+     def _fetch_table_data(
+             self,
+             db_name: str,
+             table_name: str,
+             start_date,
+             end_date
+     ) -> pd.DataFrame:
+         """Wraps the MySQL fetch logic"""
+         try:
+             return self.download.data_to_df(
+                 db_name=db_name,
+                 table_name=table_name,
+                 start_date=start_date,
+                 end_date=end_date,
+                 projection={}
+             )
+         except Exception as e:
+             logger.error(f"MySQL query error {db_name}.{table_name}: {e}")
+             return pd.DataFrame()
+
+     def _fetch_redis_data(self, cache_key: str) -> pd.DataFrame:
+         """Fetch and parse data from Redis (date columns converted automatically)"""
+         try:
+             data = self.redis_engine.get(cache_key)
+             if not data:
+                 return pd.DataFrame()
+             # Deserialize the payload
+             df = pd.DataFrame(json.loads(data.decode("utf-8")))
+             return self._convert_date_columns(df)
+         except Exception as e:
+             logger.error(f"Failed to parse Redis data {cache_key}: {e}")
+             return pd.DataFrame()
+
+     def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
+         """Normalize date column conversion"""
+         if "日期" in df.columns:
+             df["日期"] = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
+         return df
+
+     def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
+         """Build a normalized cache key"""
+         return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"
+
+     def _filter_by_date_range(
+             self,
+             df: pd.DataFrame,
+             start_dt: datetime.datetime,
+             end_dt: datetime.datetime
+     ) -> pd.DataFrame:
+         """Filter rows by date range"""
+         if "日期" not in df.columns:
+             return df
+         date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
+         return df[date_mask].copy()
+
+     def _trigger_async_cache_update(
+             self,
+             cache_key: str,
+             db_name: str,
+             table_name: str,
+             set_year: bool,
+             start_date: str,
+             end_date: str,
+             existing_data: pd.DataFrame
+     ):
+         """Start a background thread to refresh the cache"""
+         thread = threading.Thread(
+             target=self.set_redis,
+             args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
+             daemon=True
+         )
+         thread.start()
+
+     def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
+         """Merge the new and existing datasets"""
+         if existing_data.empty or "日期" not in existing_data.columns:
+             return new_data
+
+         new_min = new_data["日期"].min()
+         new_max = new_data["日期"].max()
+         valid_historical = existing_data[
+             (existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
+         ]
+         return pd.concat([new_data, valid_historical], ignore_index=True).drop_duplicates(subset=["日期"])
+
+     def _serialize_data(self, df: pd.DataFrame) -> str:
+         """Serialize the DataFrame, converting date columns to strings"""
+         temp_df = df.copy()
+         date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
+         for col in date_cols:
+             temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d")
+         return temp_df.to_json(orient="records", force_ascii=False)
+
+ class RedisDataHash(object):
+     """
+     Stores data as Redis hashes.
+     Combined Redis-cache / MySQL query processor
+
+     Features:
+     - Supports MySQL tables sharded by year
+     - Multi-level cache strategy (in-memory cache + Redis cache)
+     - Asynchronous cache refresh
+     - Automatic handling of date ranges and type conversion
+     """
+
+     def __init__(self, redis_engine, download, cache_ttl: int):
+         self.redis_engine = redis_engine
+         self.download = download
+         self.cache_ttl = cache_ttl * 60  # stored in seconds
+
+     def get_from_mysql(
+             self,
+             db_name: str,
+             table_name: str,
+             set_year: bool,
+             start_date,
+             end_date,
+             projection={}
+     ) -> pd.DataFrame:
+         dfs = []
+         if set_year:
+             current_year = datetime.datetime.today().year
+             for year in range(2024, current_year + 1):
+                 df = self._fetch_table_data(
+                     db_name, f"{table_name}_{year}", start_date, end_date, projection
+                 )
+                 if df is not None:
+                     dfs.append(df)
+         else:
+             df = self._fetch_table_data(db_name, table_name, start_date, end_date, projection)
+             if df is not None:
+                 dfs.append(df)
+
+         combined_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
+         if combined_df.empty:
+             logger.warning(f"warning: no data read from {db_name}.{table_name}")
+         else:
+             combined_df = self._convert_date_columns(combined_df)
+         return combined_df
+
+     def get_from_redis(
+             self,
+             db_name: str,
+             table_name: str,
+             set_year: bool,
+             start_date,
+             end_date,
+             projection={}
+     ) -> pd.DataFrame:
+         if not self.redis_engine.ping():
+             logger.error("Redis ping failed, falling back to MySQL")
+             return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
+         start_dt = pd.to_datetime(start_date).floor('D')
+         end_dt = pd.to_datetime(end_date).floor('D')
+         cache_key = self._generate_cache_key(db_name, table_name, set_year)
+
+         try:
+             ttl = self.redis_engine.ttl(cache_key)
+             if ttl < 60:
+                 cache_data = self._fetch_redis_data(cache_key)
+                 self._trigger_async_cache_update(
+                     cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
+                 )
+                 return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
+
+             # Build the list of months covered by the query
+             start_month = start_dt.to_period('M')
+             end_month = end_dt.to_period('M')
+             months = pd.period_range(start_month, end_month, freq='M').strftime("%Y%m").tolist()
+             cache_data = self._fetch_redis_data(cache_key, months)
+             if cache_data.empty:
+                 self._trigger_async_cache_update(
+                     cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
+                 )
+                 return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
+
+             filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
+
+             if not filtered_df.empty:
+                 if '日期' in filtered_df.columns.tolist():
+                     exist_min_date = filtered_df['日期'].min()
+                     if exist_min_date <= start_dt:
+                         return filtered_df
+                 else:
+                     return filtered_df
+
+             self._trigger_async_cache_update(
+                 cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
+             )
+             return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
+
+         except Exception as e:
+             logger.error(f"Redis connection error: {e}, falling back to MySQL")
+             return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
+
+     def set_redis(
+             self,
+             cache_key: str,
+             db_name: str,
+             table_name: str,
+             set_year: bool,
+             start_date,
+             end_date,
+             existing_data: pd.DataFrame,
+             projection={}
+     ) -> None:
+         try:
+             new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
+             if new_data.empty:
+                 return
+
+             combined_data = self._merge_data(new_data, existing_data)
+
+             if not combined_data.empty:
+                 if '日期' not in combined_data.columns:
+                     # Replace the old shards atomically
+                     # and keep shard writes fast
+                     chunk_size = 5000
+                     with self.redis_engine.pipeline(transaction=False) as pipe:
+                         # Batch-delete the old shards
+                         for key in self.redis_engine.hscan_iter(cache_key, match="all_*"):
+                             pipe.hdel(cache_key, key[0])
+
+                         # Batch-write the new shards
+                         for idx in range(0, len(combined_data), chunk_size):
+                             chunk = combined_data.iloc[idx:idx + chunk_size]
+                             chunk_key = f"all_{idx // chunk_size:04d}"
+                             pipe.hset(cache_key, chunk_key, self._serialize_data(chunk))
+
+                         pipe.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
+                         pipe.execute()
+                     # serialized_data = self._serialize_data(combined_data)
+                     # self.redis_engine.hset(cache_key, "all", serialized_data)
+                     # self.redis_engine.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
+                 else:
+                     # Shard the data by month
+                     combined_data['month'] = combined_data['日期'].dt.to_period('M').dt.strftime("%Y%m")
+                     for month_str, group in combined_data.groupby('month'):
+                         group = group.drop(columns=['month'])
+                         serialized_data = self._serialize_data(group)
+                         self.redis_engine.hset(cache_key, month_str, serialized_data)
+                     self.redis_engine.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
+                 logger.info(f"Cache updated {cache_key} | rows: {len(combined_data)}")
+         except Exception as e:
+             logger.error(f"Cache update failed: {cache_key} - {str(e)}")
+
+     def _fetch_table_data(
+             self,
+             db_name: str,
+             table_name: str,
+             start_date,
+             end_date,
+             projection={}
+     ) -> pd.DataFrame:
+         try:
+             return self.download.data_to_df(
+                 db_name=db_name,
+                 table_name=table_name,
+                 start_date=start_date,
+                 end_date=end_date,
+                 projection=projection
+             )
+         except Exception as e:
+             logger.error(f"MySQL query error {db_name}.{table_name}: {e}")
+             return pd.DataFrame()
+
+     def _fetch_redis_data(self, cache_key: str, months: list = None) -> pd.DataFrame:
+         try:
+             dfs = []
+             pipeline = self.redis_engine.pipeline()
+
+             # Queue all read requests in one batch
+             if months is not None:
+                 # 1. Request the monthly shards
+                 pipeline.hmget(cache_key, months)
+
+             # 2. Request the chunked shards (runs whether or not months was given)
+             pipeline.hscan(cache_key, match="all_*")
+
+             # Execute every queued command at once (network round trips drop from 2+N to 1)
+             results = pipeline.execute()
+
+             # Process the results --------------------------------------------------------
+             result_index = 0
+
+             # Monthly shard data (if requested)
+             if months is not None:
+                 month_data = results[result_index]
+                 result_index += 1  # advance the result index
+
+                 for data, field in zip(month_data, months):
+                     if data:
+                         try:
+                             # orjson parses faster (install with: pip install orjson)
+                             df = pd.DataFrame(orjson.loads(data))
+                             df = self._convert_date_columns(df)
+                             dfs.append(df)
+                         except Exception as e:
+                             logger.error(f"Failed to parse monthly shard {field}: {e}")
+
+             # Chunked shard data (batched processing)
+             cursor, shard_data = results[result_index]
+             while True:
+                 # Fetch the shard values in one batch
+                 pipeline = self.redis_engine.pipeline()
+                 for key in shard_data.keys():
+                     pipeline.hget(cache_key, key)
+                 shard_values = pipeline.execute()
+
+                 # Parse the shard payloads
+                 for value in shard_values:
+                     if value:
+                         try:
+                             df = pd.DataFrame(orjson.loads(value))
+                             dfs.append(self._convert_date_columns(df))
+                         except Exception as e:
+                             logger.error(f"Failed to parse shard data: {e}")
+
+                 # Continue with the remaining shard pages
+                 if cursor == 0:
+                     break
+                 cursor, shard_data = self.redis_engine.hscan(cache_key, cursor=cursor, match="all_*")
+
+             # Merge --------------------------------------------------------
+             if dfs:
+                 final_df = pd.concat(dfs, ignore_index=True)
+                 if '日期' in final_df.columns:
+                     final_df = final_df.sort_values('日期', ascending=False)
+                 return final_df
+             return pd.DataFrame()
+
+         except Exception as e:
+             logger.error(f"Failed to fetch Redis data {cache_key}: {e}")
+             return pd.DataFrame()
+
+     def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
+         if "日期" in df.columns:
+             # An explicit format already gives the fast path; the old
+             # infer_datetime_format kwarg is deprecated in pandas 2.x and dropped here.
+             df["日期"] = pd.to_datetime(
+                 df["日期"],
+                 format="%Y-%m-%d",
+                 errors="coerce",
+             )
+         return df
+
+     def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
+         return f"{db_name}:{table_name}_haveyear" if set_year else f"{db_name}:{table_name}"
+
+     def _filter_by_date_range(
+             self,
+             df: pd.DataFrame,
+             start_dt: datetime.datetime,
+             end_dt: datetime.datetime
+     ) -> pd.DataFrame:
+         if "日期" not in df.columns:
+             return df
+         date_mask = (df["日期"] >= start_dt) & (df["日期"] <= end_dt)
+         return df[date_mask].copy()
+
+     def _trigger_async_cache_update(
+             self,
+             cache_key: str,
+             db_name: str,
+             table_name: str,
+             set_year: bool,
+             start_date: str,
+             end_date: str,
+             existing_data: pd.DataFrame,
+             projection={}
+     ):
+         thread = threading.Thread(
+             target=self.set_redis,
+             args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data, projection),
+             daemon=True
+         )
+         thread.start()
+
+     def _merge_data(self, new_data: pd.DataFrame, existing_data: pd.DataFrame) -> pd.DataFrame:
+         if existing_data.empty or "日期" not in existing_data.columns:
+             return new_data
+         new_data["日期"] = pd.to_datetime(new_data["日期"])
+         existing_data["日期"] = pd.to_datetime(existing_data["日期"])
+
+         new_min = new_data["日期"].min()
+         new_max = new_data["日期"].max()
+
+         valid_historical = existing_data[
+             (existing_data["日期"] < new_min) | (existing_data["日期"] > new_max)
+         ]
+         merged_data = pd.concat([new_data, valid_historical], ignore_index=True)
+         merged_data.sort_values(['日期'], ascending=[False], ignore_index=True, inplace=True)
+         return merged_data
+
+     def _serialize_data(self, df: pd.DataFrame) -> bytes:
+         """Fast serialization (roughly 5-8x faster than the JSON path)"""
+         if df.empty:
+             return b'[]'  # short-circuit for empty data
+
+         # Type preprocessing --------------------------------------------------------
+         temp_df = df.copy()
+
+         # Fast date conversion (avoids per-row handling)
+         date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
+         for col in date_cols:
+             # vectorized conversion via the pd.Series.dt accessor
+             temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d").replace({np.nan: None})
+
+         # Decimal handling in object columns
+         decimal_cols = temp_df.select_dtypes(include=['object']).columns
+         for col in decimal_cols:
+             if temp_df[col].apply(lambda x: isinstance(x, Decimal)).any():
+                 temp_df[col] = temp_df[col].apply(
+                     lambda x: round(float(x), 6) if isinstance(x, Decimal) else x
+                 )
+
+         # Convert straight to records (about 3x faster than a generic to_dict)
+         try:
+             records = temp_df.to_dict(orient='records')
+         except Exception as e:
+             logger.error(f"Failed to convert DataFrame to dict: {str(e)}")
+             records = []
+
+         # Serialization options --------------------------------------------------------
+         return orjson.dumps(
+             records,
+             option=
+                 orjson.OPT_SERIALIZE_NUMPY |      # handle numpy types automatically
+                 orjson.OPT_NAIVE_UTC |            # speed up datetime handling
+                 orjson.OPT_PASSTHROUGH_DATETIME,  # leave datetimes to the default hook
+             default=self._orjson_serializer      # custom type handler
+         )
+
+     @staticmethod
+     def _orjson_serializer(obj):
+         """Custom serializer for types orjson cannot handle natively"""
+         if isinstance(obj, Decimal):
+             return round(float(obj), 6)
+         if isinstance(obj, (datetime.date, datetime.datetime)):
+             return obj.isoformat()
+         if isinstance(obj, np.generic):
+             return obj.item()
+         raise TypeError(f"Cannot serialize type {type(obj)}: {obj}")
+
+
+ if __name__ == '__main__':
+     # # ****************************************************
+     # # This part is defined externally and only needs to be set up once -- start
+     # redis_config = {
+     #     'host': '127.0.0.1',
+     #     'port': 6379,  # default Redis port
+     #     'db': 0,  # default Redis database index
+     #     # 'username': 'default',
+     #     'password': redis_password,
+     # }
+     # # Instantiate the Redis client
+     # r = redis.Redis(**redis_config)
+     # # Instantiate the MySQL query engine
+     # d = s_query.QueryDatas(username=username, password=password, host=host, port=port)
+     # # Pass both instances to RedisData to build the data processing engine
+     # m = RedisData(redis_engine=r, download=d, cache_ttl=60)  # cache_ttl in minutes
+     # # ****************************************************
+     #
+     # # Fetch data from the database dynamically
+     # db_name = '聚合数据'
+     # table_name = '多店推广场景_按日聚合'
+     # set_year = False
+     # df = m.get_from_redis(
+     #     db_name=db_name,
+     #     table_name=table_name,
+     #     set_year=set_year,
+     #     start_date='2025-01-01',
+     #     end_date='2025-01-31'
+     # )
+     # logger.info(df)
+     #
+
+     logger.info(socket.gethostname())
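The `__main__` block only logs the hostname; the commented-out sample above it shows the intended wiring for `RedisData`. For the newer `RedisDataHash` class the wiring is the same. A minimal sketch, assuming a local Redis at 127.0.0.1:6379, a 60-minute TTL, and that the module-level credential lookup (`default.get_mysql_engine` at the top of the file) succeeds on import:

    import redis
    from mdbq.mysql import s_query
    from mdbq.redis.getredis import RedisDataHash, username, password, host, port

    # Assumed local Redis instance; adjust host/password for a real deployment.
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)
    d = s_query.QueryDatas(username=username, password=password, host=host, port=port)

    # cache_ttl is in minutes (multiplied by 60 inside the class).
    m = RedisDataHash(redis_engine=r, download=d, cache_ttl=60)

    # The first call typically misses the cache and falls through to MySQL while a
    # daemon thread warms the monthly hash shards; later calls are served from Redis.
    df = m.get_from_redis(
        db_name='聚合数据',
        table_name='多店推广场景_按日聚合',
        set_year=False,
        start_date='2025-01-01',
        end_date='2025-01-31',
    )
    print(df.head())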
mdbq/spider/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+
+
+ # data crawling
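
The shard layout that `RedisDataHash.set_redis` writes (monthly fields like `202501` when a `日期` column exists, otherwise 5000-row chunks named `all_0000`, `all_0001`, ...) can be inspected from any Redis client. A quick sketch, assuming the same local instance and a hypothetical already-warmed key:

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # assumed local instance

    # Key format from _generate_cache_key: "{db_name}:{table_name}"
    # (with a "_haveyear" suffix when set_year=True).
    cache_key = '聚合数据:多店推广场景_按日聚合'  # hypothetical warmed key

    for field in sorted(r.hkeys(cache_key)):
        # Each hash field holds one orjson-serialized list of row dicts.
        print(field.decode(), 'bytes:', r.hstrlen(cache_key, field))
    print('ttl seconds:', r.ttl(cache_key))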