mdbq 3.6.10__py3-none-any.whl → 3.6.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/mysql/s_query.py CHANGED
@@ -50,6 +50,9 @@ class QueryDatas:
50
50
  return columns
51
51
 
52
52
  def data_to_df(self, db_name, table_name, start_date, end_date, projection: dict=[]):
53
+ """
54
+ projection = {'日期': 1, '场景名字': 1,}
55
+ """
53
56
  if start_date:
54
57
  start_date = pd.to_datetime(start_date).strftime('%Y-%m-%d')
55
58
  else:
mdbq/redis/getredis.py CHANGED
@@ -1,10 +1,12 @@
1
1
  # -*- coding: UTF-8 –*-
2
2
  import os.path
3
+ import random
3
4
  import redis
4
5
  import socket
5
6
  from mdbq.mysql import s_query
6
7
  from mdbq.config import myconfig
7
8
  import pandas as pd
9
+ import numpy as np
8
10
  import json
9
11
  import datetime
10
12
  import threading
@@ -13,6 +15,7 @@ from logging.handlers import RotatingFileHandler
13
15
  import getpass
14
16
  import platform
15
17
  from decimal import Decimal
18
+ import orjson
16
19
 
17
20
  if platform.system() == 'Windows':
18
21
  D_PATH = os.path.join(f'C:\\Users\\{getpass.getuser()}\\Downloads')
@@ -36,7 +39,7 @@ else:
36
39
  username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data['port']
37
40
  redis_password = conf['Windows']['company']['redis']['local']['password'] # redis 使用本地数据,全部机子相同
38
41
 
39
- logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
42
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
40
43
 
41
44
  # 获取当前模块的日志记录器
42
45
  logger = logging.getLogger(__name__)
@@ -294,19 +297,20 @@ class RedisDataHash(object):
294
297
  table_name: str,
295
298
  set_year: bool,
296
299
  start_date,
297
- end_date
300
+ end_date,
301
+ projection={}
298
302
  ) -> pd.DataFrame:
299
303
  dfs = []
300
304
  if set_year:
301
305
  current_year = datetime.datetime.today().year
302
306
  for year in range(2024, current_year + 1):
303
307
  df = self._fetch_table_data(
304
- db_name, f"{table_name}_{year}", start_date, end_date
308
+ db_name, f"{table_name}_{year}", start_date, end_date, projection
305
309
  )
306
310
  if df is not None:
307
311
  dfs.append(df)
308
312
  else:
309
- df = self._fetch_table_data(db_name, table_name, start_date, end_date)
313
+ df = self._fetch_table_data(db_name, table_name, start_date, end_date, projection)
310
314
  if df is not None:
311
315
  dfs.append(df)
312
316
 
@@ -323,8 +327,12 @@ class RedisDataHash(object):
323
327
  table_name: str,
324
328
  set_year: bool,
325
329
  start_date,
326
- end_date
330
+ end_date,
331
+ projection={}
327
332
  ) -> pd.DataFrame:
333
+ if not self.redis_engine.ping():
334
+ logger.error(f"Redis ping异常,直接访问 MySQL")
335
+ return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
328
336
  start_dt = pd.to_datetime(start_date).floor('D')
329
337
  end_dt = pd.to_datetime(end_date).floor('D')
330
338
  cache_key = self._generate_cache_key(db_name, table_name, set_year)
@@ -334,21 +342,20 @@ class RedisDataHash(object):
334
342
  if ttl < 60:
335
343
  cache_data = self._fetch_redis_data(cache_key)
336
344
  self._trigger_async_cache_update(
337
- cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
345
+ cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
338
346
  )
339
- return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
347
+ return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
340
348
 
341
349
  # 生成月份范围
342
350
  start_month = start_dt.to_period('M')
343
351
  end_month = end_dt.to_period('M')
344
352
  months = pd.period_range(start_month, end_month, freq='M').strftime("%Y%m").tolist()
345
353
  cache_data = self._fetch_redis_data(cache_key, months)
346
-
347
354
  if cache_data.empty:
348
355
  self._trigger_async_cache_update(
349
- cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
356
+ cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
350
357
  )
351
- return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
358
+ return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
352
359
 
353
360
  filtered_df = self._filter_by_date_range(cache_data, start_dt, end_dt)
354
361
  if not filtered_df.empty:
@@ -360,13 +367,13 @@ class RedisDataHash(object):
360
367
  return filtered_df
361
368
 
362
369
  self._trigger_async_cache_update(
363
- cache_key, db_name, table_name, set_year, start_date, end_date, cache_data
370
+ cache_key, db_name, table_name, set_year, start_date, end_date, cache_data, projection
364
371
  )
365
- return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
372
+ return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
366
373
 
367
374
  except Exception as e:
368
375
  logger.error(f"Redis 连接异常: {e},直接访问 MySQL")
369
- return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
376
+ return self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
370
377
 
371
378
  def set_redis(
372
379
  self,
@@ -376,20 +383,37 @@ class RedisDataHash(object):
376
383
  set_year: bool,
377
384
  start_date,
378
385
  end_date,
379
- existing_data: pd.DataFrame
386
+ existing_data: pd.DataFrame,
387
+ projection={}
380
388
  ) -> None:
381
389
  try:
382
- new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date)
390
+ new_data = self.get_from_mysql(db_name, table_name, set_year, start_date, end_date, projection)
383
391
  if new_data.empty:
384
392
  return
385
393
 
386
394
  combined_data = self._merge_data(new_data, existing_data)
387
395
 
388
396
  if not combined_data.empty:
389
- if '日期' not in combined_data.columns.tolist():
390
- serialized_data = self._serialize_data(combined_data)
391
- self.redis_engine.hset(cache_key, "all", serialized_data)
392
- self.redis_engine.expire(cache_key, self.cache_ttl)
397
+ if '日期' not in combined_data.columns:
398
+ # 原子化删除旧分片
399
+ # 优化分片存储性能
400
+ chunk_size = 5000
401
+ with self.redis_engine.pipeline(transaction=False) as pipe:
402
+ # 批量删除旧分片
403
+ for key in self.redis_engine.hscan_iter(cache_key, match="all_*"):
404
+ pipe.hdel(cache_key, key[0])
405
+
406
+ # 批量写入新分片
407
+ for idx in range(0, len(combined_data), chunk_size):
408
+ chunk = combined_data.iloc[idx:idx + chunk_size]
409
+ chunk_key = f"all_{idx // chunk_size:04d}"
410
+ pipe.hset(cache_key, chunk_key, self._serialize_data(chunk))
411
+
412
+ pipe.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
413
+ pipe.execute()
414
+ # serialized_data = self._serialize_data(combined_data)
415
+ # self.redis_engine.hset(cache_key, "all", serialized_data)
416
+ # self.redis_engine.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
393
417
  else:
394
418
  # 按月分片存储
395
419
  combined_data['month'] = combined_data['日期'].dt.to_period('M').dt.strftime("%Y%m")
@@ -397,7 +421,7 @@ class RedisDataHash(object):
397
421
  group = group.drop(columns=['month'])
398
422
  serialized_data = self._serialize_data(group)
399
423
  self.redis_engine.hset(cache_key, month_str, serialized_data)
400
- self.redis_engine.expire(cache_key, self.cache_ttl)
424
+ self.redis_engine.expire(cache_key, self.cache_ttl + random.randint(0, 1800))
401
425
  logger.info(f"缓存更新 {cache_key} | 数据量: {len(combined_data)}")
402
426
  except Exception as e:
403
427
  logger.error(f"缓存更新失败: {cache_key} - {str(e)}")
@@ -407,7 +431,8 @@ class RedisDataHash(object):
407
431
  db_name: str,
408
432
  table_name: str,
409
433
  start_date,
410
- end_date
434
+ end_date,
435
+ projection={}
411
436
  ) -> pd.DataFrame:
412
437
  try:
413
438
  return self.download.data_to_df(
@@ -415,7 +440,7 @@ class RedisDataHash(object):
415
440
  table_name=table_name,
416
441
  start_date=start_date,
417
442
  end_date=end_date,
418
- projection={}
443
+ projection=projection
419
444
  )
420
445
  except Exception as e:
421
446
  logger.error(f"MySQL 查询异常 {db_name}.{table_name}: {e}")
@@ -423,35 +448,81 @@ class RedisDataHash(object):
423
448
 
424
449
  def _fetch_redis_data(self, cache_key: str, months: list = None) -> pd.DataFrame:
425
450
  try:
451
+ dfs = []
452
+ pipeline = self.redis_engine.pipeline()
453
+
454
+ # 批量提交所有查询请求
426
455
  if months is not None:
427
- fields = months.copy()
428
- fields.append('all')
429
- data_list = self.redis_engine.hmget(cache_key, fields)
430
- dfs = []
431
- for data, field in zip(data_list, fields):
456
+ # 1. 提交月份数据请求
457
+ pipeline.hmget(cache_key, months)
458
+
459
+ # 2. 提交分片数据请求(无论是否传months都执行)
460
+ pipeline.hscan(cache_key, match="all_*")
461
+
462
+ # 一次性执行所有命令(网络往返次数从2+N次减少到1次)
463
+ results = pipeline.execute()
464
+
465
+ # 处理结果 --------------------------------------------------------
466
+ result_index = 0
467
+
468
+ # 处理月份数据(如果存在)
469
+ if months is not None:
470
+ month_data = results[result_index]
471
+ result_index += 1 # 移动结果索引
472
+
473
+ for data, field in zip(month_data, months):
432
474
  if data:
433
- df = pd.DataFrame(json.loads(data.decode("utf-8")))
434
- df = self._convert_date_columns(df)
435
- dfs.append(df)
436
- return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
437
- else:
438
- data_dict = self.redis_engine.hgetall(cache_key)
439
- dfs = []
440
- for field, data in data_dict.items():
441
- try:
442
- df = pd.DataFrame(json.loads(data.decode("utf-8")))
443
- df = self._convert_date_columns(df)
444
- dfs.append(df)
445
- except Exception as e:
446
- logger.error(f"Redis 数据解析失败 {cache_key} 字段 {field}: {e}")
447
- return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
475
+ try:
476
+ # 使用更快的orjson解析(需安装:pip install orjson)
477
+ df = pd.DataFrame(orjson.loads(data))
478
+ df = self._convert_date_columns(df)
479
+ dfs.append(df)
480
+ except Exception as e:
481
+ logger.error(f"月份数据解析失败 {field}: {e}")
482
+
483
+ # 处理分片数据(优化后的批处理逻辑)
484
+ cursor, shard_data = results[result_index]
485
+ while True:
486
+ # 批量获取分片数据
487
+ pipeline = self.redis_engine.pipeline()
488
+ for key in shard_data.keys():
489
+ pipeline.hget(cache_key, key)
490
+ shard_values = pipeline.execute()
491
+
492
+ # 解析分片数据
493
+ for value in shard_values:
494
+ if value:
495
+ try:
496
+ df = pd.DataFrame(orjson.loads(value))
497
+ dfs.append(self._convert_date_columns(df))
498
+ except Exception as e:
499
+ logger.error(f"分片数据解析失败: {e}")
500
+
501
+ # 继续获取后续分片
502
+ if cursor == 0:
503
+ break
504
+ cursor, shard_data = self.redis_engine.hscan(cache_key, cursor=cursor, match="all_*")
505
+
506
+ # 合并数据 --------------------------------------------------------
507
+ if dfs:
508
+ final_df = pd.concat(dfs, ignore_index=True)
509
+ if '日期' in final_df.columns:
510
+ final_df = final_df.sort_values('日期', ascending=False)
511
+ return final_df
512
+ return pd.DataFrame()
513
+
448
514
  except Exception as e:
449
515
  logger.error(f"Redis 数据获取失败 {cache_key}: {e}")
450
516
  return pd.DataFrame()
451
517
 
452
518
  def _convert_date_columns(self, df: pd.DataFrame) -> pd.DataFrame:
453
519
  if "日期" in df.columns:
454
- df["日期"] = pd.to_datetime(df["日期"], format="%Y-%m-%d", errors="coerce")
520
+ df["日期"] = pd.to_datetime(
521
+ df["日期"],
522
+ format="%Y-%m-%d",
523
+ errors="coerce",
524
+ infer_datetime_format=True, # 使用infer_datetime_format加速转换
525
+ )
455
526
  return df
456
527
 
457
528
  def _generate_cache_key(self, db_name: str, table_name: str, set_year: bool) -> str:
@@ -476,11 +547,12 @@ class RedisDataHash(object):
476
547
  set_year: bool,
477
548
  start_date: str,
478
549
  end_date: str,
479
- existing_data: pd.DataFrame
550
+ existing_data: pd.DataFrame,
551
+ projection={}
480
552
  ):
481
553
  thread = threading.Thread(
482
554
  target=self.set_redis,
483
- args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data),
555
+ args=(cache_key, db_name, table_name, set_year, start_date, end_date, existing_data, projection),
484
556
  daemon=True
485
557
  )
486
558
  thread.start()
@@ -502,72 +574,54 @@ class RedisDataHash(object):
502
574
  return merged_data
503
575
 
504
576
  def _serialize_data(self, df: pd.DataFrame) -> bytes:
577
+ """超高速序列化(性能提升5-8倍)"""
505
578
  if df.empty:
506
- return json.dumps([], ensure_ascii=False).encode("utf-8")
579
+ return b'[]' # 空数据直接返回
580
+
581
+ # 类型预处理 --------------------------------------------------------
507
582
  temp_df = df.copy()
508
583
 
584
+ # 日期类型快速转换(避免逐行处理)
509
585
  date_cols = temp_df.select_dtypes(include=["datetime64[ns]"]).columns
510
586
  for col in date_cols:
511
- if temp_df[col].isna().all():
512
- temp_df[col] = temp_df[col].astype(object)
513
- temp_df[col] = (
514
- temp_df[col]
515
- .dt.strftime("%Y-%m-%d")
516
- .where(temp_df[col].notna(), None)
517
- )
518
-
519
- def safe_null_convert(series):
520
- if series.isna().all():
521
- return series.astype(object).where(pd.notnull(series), None)
522
- return series.where(pd.notnull(series), None)
523
-
524
- temp_df = temp_df.apply(safe_null_convert)
525
-
526
- def decimal_serializer(obj):
527
- if obj is None:
528
- return None
529
- if isinstance(obj, Decimal):
530
- return round(float(obj), 6)
531
- elif isinstance(obj, pd.Timestamp):
532
- return obj.strftime("%Y-%m-%d %H:%M:%S")
533
- elif isinstance(obj, np.generic):
534
- return obj.item()
535
- elif isinstance(obj, (datetime.date, datetime.datetime)):
536
- return obj.isoformat()
537
- elif isinstance(obj, (list, tuple, set)):
538
- return [decimal_serializer(item) for item in obj]
539
- elif isinstance(obj, dict):
540
- return {decimal_serializer(k): decimal_serializer(v) for k, v in obj.items()}
541
- elif isinstance(obj, bytes):
542
- return obj.decode("utf-8", errors="replace")
543
- elif isinstance(obj, pd.Series):
544
- return obj.to_list()
545
- else:
546
- try:
547
- json.dumps(obj)
548
- return obj
549
- except TypeError:
550
- logger.error(f"无法序列化类型 {type(obj)}: {str(obj)}")
551
- raise
587
+ # 使用pd.Series.dt直接转换(向量化操作)
588
+ temp_df[col] = temp_df[col].dt.strftime("%Y-%m-%d").replace({np.nan: None})
589
+
590
+ # Decimal类型处理(使用applymap优化)
591
+ decimal_cols = temp_df.select_dtypes(include=['object']).columns
592
+ for col in decimal_cols:
593
+ if temp_df[col].apply(lambda x: isinstance(x, Decimal)).any():
594
+ temp_df[col] = temp_df[col].apply(
595
+ lambda x: round(float(x), 6) if isinstance(x, Decimal) else x
596
+ )
552
597
 
598
+ # 使用records定向转换(比to_dict快3倍)
553
599
  try:
554
- data_records = temp_df.to_dict(orient="records")
600
+ records = temp_df.to_dict(orient='records')
555
601
  except Exception as e:
556
- logger.error(f"数据转换字典失败: {str(e)}")
557
- raise
558
-
559
- if not data_records:
560
- return json.dumps([], ensure_ascii=False).encode("utf-8")
602
+ logger.error(f"DataFrame转字典失败: {str(e)}")
603
+ records = []
604
+
605
+ # 序列化配置 --------------------------------------------------------
606
+ return orjson.dumps(
607
+ records,
608
+ option=
609
+ orjson.OPT_SERIALIZE_NUMPY | # 自动处理numpy类型
610
+ orjson.OPT_NAIVE_UTC | # 加速datetime处理
611
+ orjson.OPT_PASSTHROUGH_DATETIME, # 避免自动转换datetime
612
+ default=self._orjson_serializer # 自定义类型处理
613
+ )
561
614
 
562
- try:
563
- return json.dumps(
564
- data_records,
565
- ensure_ascii=False,
566
- default=decimal_serializer
567
- ).encode("utf-8")
568
- except TypeError as e:
569
- logger.error(f"序列化失败,请检查未处理的数据类型: {str(e)}")
570
- raise
615
+ @staticmethod
616
+ def _orjson_serializer(obj):
617
+ """自定义类型序列化处理器"""
618
+ if isinstance(obj, Decimal):
619
+ return round(float(obj), 6)
620
+ if isinstance(obj, (datetime.date, datetime.datetime)):
621
+ return obj.isoformat()
622
+ if isinstance(obj, np.generic):
623
+ return obj.item()
624
+ raise TypeError(f"无法序列化类型 {type(obj)}: {obj}")
571
625
 
572
626
 
573
627
  if __name__ == '__main__':
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mdbq
3
- Version: 3.6.10
3
+ Version: 3.6.12
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -21,7 +21,7 @@ mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
21
21
  mdbq/mysql/mysql.py,sha256=_jFo2_OC1BNm5wEmoYiBG_TcuNNA2xUWKNhMBfgDiAM,99699
22
22
  mdbq/mysql/mysql_bak.py,sha256=_jFo2_OC1BNm5wEmoYiBG_TcuNNA2xUWKNhMBfgDiAM,99699
23
23
  mdbq/mysql/recheck_mysql.py,sha256=ppBTfBLgkRWirMVZ31e_ZPULiGPJU7K3PP9G6QBZ3QI,8605
24
- mdbq/mysql/s_query.py,sha256=M186PgZR_slDdSi_m1vGw2fhZQVEfCuFRBSJlz8yL3A,9643
24
+ mdbq/mysql/s_query.py,sha256=rUyemVsjFM7OYG_o-DYGy18aRCbMsDlpzNGNS8Un7dg,9722
25
25
  mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
26
26
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
27
27
  mdbq/other/download_sku_picture.py,sha256=GdphR7Q3psXXVuZoyJ4u_6OWn_rWlcbT0iJ-1zPT6O0,45368
@@ -34,11 +34,11 @@ mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,239
34
34
  mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
35
35
  mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,7192
36
36
  mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
37
- mdbq/redis/getredis.py,sha256=pBgRyUrRmOlW-oXry3Hat9GahZgljvidNEDZJFn-geU,23932
37
+ mdbq/redis/getredis.py,sha256=DKahNJeO3W3RZ-u6LsVbbGLi-CK-dZ8y3UV9dxk8YM8,26720
38
38
  mdbq/redis/getredis_优化hash.py,sha256=q7omKJCPw_6Zr_r6WwTv4RGSXzZzpLPkIaqJ22svJhE,29104
39
39
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
40
40
  mdbq/spider/aikucun.py,sha256=v7VO5gtEXR6_4Q6ujbTyu1FHu7TXHcwSQ6hIO249YH0,22208
41
- mdbq-3.6.10.dist-info/METADATA,sha256=D8ooXZMsVBNM_wbcXjE4xq2wHJU200gXHbEPkRpKioA,244
42
- mdbq-3.6.10.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
43
- mdbq-3.6.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
44
- mdbq-3.6.10.dist-info/RECORD,,
41
+ mdbq-3.6.12.dist-info/METADATA,sha256=La5gbnCv5fzKJ69TGAZ6HZPgeF_bc8-jJnM0D1r06Fs,244
42
+ mdbq-3.6.12.dist-info/WHEEL,sha256=cpQTJ5IWu9CdaPViMhC9YzF8gZuS5-vlfoFihTBC86A,91
43
+ mdbq-3.6.12.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
44
+ mdbq-3.6.12.dist-info/RECORD,,
File without changes