mdbq 4.2.3__tar.gz → 4.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mdbq might be problematic. Click here for more details.

Files changed (46)
  1. {mdbq-4.2.3 → mdbq-4.2.5}/PKG-INFO +1 -1
  2. mdbq-4.2.5/mdbq/__version__.py +1 -0
  3. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/mysql/deduplicator.py +11 -12
  4. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/mysql/mysql.py +10 -4
  5. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/mysql/s_query.py +1 -1
  6. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/mysql/uploader.py +48 -2
  7. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/other/download_sku_picture.py +0 -1
  8. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq.egg-info/PKG-INFO +1 -1
  9. mdbq-4.2.3/mdbq/__version__.py +0 -1
  10. {mdbq-4.2.3 → mdbq-4.2.5}/README.txt +0 -0
  11. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/__init__.py +0 -0
  12. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/auth/__init__.py +0 -0
  13. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/auth/auth_backend.py +0 -0
  14. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/auth/crypto.py +0 -0
  15. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/auth/rate_limiter.py +0 -0
  16. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/js/__init__.py +0 -0
  17. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/js/jc.py +0 -0
  18. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/log/__init__.py +0 -0
  19. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/log/mylogger.py +0 -0
  20. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/myconf/__init__.py +0 -0
  21. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/myconf/myconf.py +0 -0
  22. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/mysql/__init__.py +0 -0
  23. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/mysql/unique_.py +0 -0
  24. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/other/__init__.py +0 -0
  25. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/other/error_handler.py +0 -0
  26. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/other/otk.py +0 -0
  27. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/other/pov_city.py +0 -0
  28. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/other/ua_sj.py +0 -0
  29. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/pbix/__init__.py +0 -0
  30. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/pbix/pbix_refresh.py +0 -0
  31. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/pbix/refresh_all.py +0 -0
  32. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/redis/__init__.py +0 -0
  33. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/redis/getredis.py +0 -0
  34. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/redis/redis_cache.py +0 -0
  35. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/route/__init__.py +0 -0
  36. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/route/analytics.py +0 -0
  37. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/route/monitor.py +0 -0
  38. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/route/routes.py +0 -0
  39. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/selenium/__init__.py +0 -0
  40. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/selenium/get_driver.py +0 -0
  41. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq/spider/__init__.py +0 -0
  42. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq.egg-info/SOURCES.txt +0 -0
  43. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq.egg-info/dependency_links.txt +0 -0
  44. {mdbq-4.2.3 → mdbq-4.2.5}/mdbq.egg-info/top_level.txt +0 -0
  45. {mdbq-4.2.3 → mdbq-4.2.5}/setup.cfg +0 -0
  46. {mdbq-4.2.3 → mdbq-4.2.5}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdbq
3
- Version: 4.2.3
3
+ Version: 4.2.5
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -0,0 +1 @@
1
+ VERSION = '4.2.5'
@@ -39,7 +39,7 @@ class MySQLDeduplicator:
39
39
 
40
40
  主要参数说明:
41
41
  - columns: 指定去重分组字段,控制唯一性分组行为。若 columns 有值且不包含 date_column,则全表去重,否则按天分区。
42
- - exclude_columns: 去重时排除的列名列表,自动合并 ['id', '更新时间'],在分组时排除。
42
+ - exclude_columns: 去重时排除的列名列表,自动合并 ['id', 'update_at'],在分组时排除。
43
43
  - date_column: 指定日期分区字段,默认为 '日期'。如表存在该字段且 columns 未排除,则按天分区去重。
44
44
  - duplicate_keep_mode: 'keep_one'(默认,重复组保留一条),'remove_all'(全部删除重复组)。
45
45
  - dry_run: 是否为模拟运行,不实际删除数据。
@@ -81,7 +81,7 @@ class MySQLDeduplicator:
81
81
  :param date_range: 指定去重的日期区间 [start_date, end_date],格式'YYYY-MM-DD'
82
82
  :param recent_month: 最近N个月的数据去重(与date_range互斥,优先生效)
83
83
  :param date_column: 时间列名,默认为'日期'
84
- :param exclude_columns: 去重时排除的列名列表,默认为['id', '更新时间']
84
+ :param exclude_columns: 去重时排除的列名列表,默认为['id', 'update_at']
85
85
  :param exclude_databases: 排除的数据库名列表
86
86
  :param exclude_tables: 排除的表名字典 {数据库名: [表名, ...]}
87
87
  :param duplicate_keep_mode: 'keep_one'(默认,重复组保留一条),'remove_all'(全部删除重复组)
@@ -171,7 +171,7 @@ class MySQLDeduplicator:
171
171
  logger.debug('去重日期范围', {'开始': self._dedup_start_date, '结束': self._dedup_end_date})
172
172
 
173
173
  # 排除列处理,直接合并去重
174
- self.exclude_columns = list(set((exclude_columns or []) + ['id', '更新时间']))
174
+ self.exclude_columns = list(set((exclude_columns or []) + ['id', 'update_at']))
175
175
 
176
176
  # 线程安全控制
177
177
  self._lock = threading.Lock()
@@ -450,11 +450,11 @@ class MySQLDeduplicator:
450
450
 
451
451
  # 用Python查找重复
452
452
  if use_python_dedup:
453
- # 判断分组字段是否有“更新时间”
454
- has_update_time = any(col == '更新时间' for col in use_columns)
453
+ # 判断分组字段是否有“update_at”
454
+ has_update_time = any(col == 'update_at' for col in use_columns)
455
455
  select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
456
456
  if has_update_time:
457
- select_cols += ',`更新时间`'
457
+ select_cols += ',`update_at`'
458
458
  select_where = f"WHERE `{time_col}` = '{date_val}'" if date_val else ''
459
459
  grouped = defaultdict(list)
460
460
  for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
@@ -466,8 +466,7 @@ class MySQLDeduplicator:
466
466
  if len(ids) > 1:
467
467
  dup_count += 1
468
468
  if has_update_time:
469
- # 按更新时间最大保留
470
- keep_row = max(ids, key=lambda x: x.get('更新时间') or '')
469
+ keep_row = max(ids, key=lambda x: x.get('update_at') or '')
471
470
  else:
472
471
  # 按id保留
473
472
  if self.keep_order == 'max':
@@ -490,10 +489,10 @@ class MySQLDeduplicator:
490
489
  temp_table = self._make_temp_table_name(table)
491
490
  drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
492
491
  create_temp_where = f"WHERE `{time_col}` = '{date_val}'"
493
- # 判断分组字段是否有“更新时间”
494
- has_update_time = any(col == '更新时间' for col in use_columns)
492
+
493
+ has_update_time = any(col == 'update_at' for col in use_columns)
495
494
  if has_update_time:
496
- keep_field = '更新时间'
495
+ keep_field = 'update_at'
497
496
  keep_func = 'MAX'
498
497
  else:
499
498
  keep_field = pk_real
@@ -1369,7 +1368,7 @@ def main():
1369
1368
  maxcached=5,
1370
1369
  # recent_month=1,
1371
1370
  # date_range=['2025-06-09', '2025-06-10'],
1372
- exclude_columns=['创建时间', '更新时间'],
1371
+ exclude_columns=['创建时间', '更新时间', "update_at", "create_at"],
1373
1372
  exclude_databases=['cookie文件', '日志', '视频数据', '云电影'],
1374
1373
  # exclude_tables={
1375
1374
  # '推广数据2': [
@@ -146,6 +146,8 @@ class MysqlUpload:
146
146
  __res_dict.update({k: 'DATE'})
147
147
  elif k == '更新时间':
148
148
  __res_dict.update({k: 'TIMESTAMP'})
149
+ elif k == 'update_at' or k == 'create_at':
150
+ __res_dict.update({k: 'TIMESTAMP'})
149
151
  elif result2:
150
152
  __res_dict.update({k: 'decimal(10,4)'})
151
153
  elif date_type == 1:
@@ -550,6 +552,8 @@ class MysqlUpload:
550
552
  __res_dict.update({k: 'DATE'})
551
553
  elif k == '更新时间':
552
554
  __res_dict.update({k: 'TIMESTAMP'})
555
+ elif k == 'update_at' or k == 'create_at':
556
+ __res_dict.update({k: 'TIMESTAMP'})
553
557
  elif result2: # 小数
554
558
  __res_dict.update({k: 'decimal(10,4)'})
555
559
  elif date_type == 1: # 纯日期
@@ -605,6 +609,8 @@ class MysqlUpload:
605
609
  __res_dict.update({k: 'date'})
606
610
  elif k == '更新时间':
607
611
  __res_dict.update({k: 'timestamp'})
612
+ elif k == 'update_at' or k == 'create_at':
613
+ __res_dict.update({k: 'timestamp'})
608
614
  elif v == 'int64':
609
615
  __res_dict.update({k: 'int'})
610
616
  elif v == 'float64':
@@ -936,10 +942,10 @@ class OptimizeDatas:
936
942
  self.db_name = db_name
937
943
  self.optimize()
938
944
 
939
- def optimize(self, except_key: list[str] = ['更新时间']) -> None:
945
+ def optimize(self, except_key: list[str] = ['update_at']) -> None:
940
946
  """
941
947
  优化当前数据库,移除冗余数据。
942
- :param except_key: 排除的字段名列表,默认['更新时间']
948
+ :param except_key: 排除的字段名列表,默认['update_at']
943
949
  """
944
950
  if not self.db_name:
945
951
  logger.info(f'尚未设置参数: self.db_name')
@@ -995,7 +1001,7 @@ class OptimizeDatas:
995
1001
  self.connection.close()
996
1002
  logger.info(f'mysql({self.host}: {self.port}) {self.db_name} 数据库优化完成!')
997
1003
 
998
- def delete_duplicate(self, table_name: str, date: pd.Timestamp, except_key: list[str] = ['更新时间']) -> None:
1004
+ def delete_duplicate(self, table_name: str, date: pd.Timestamp, except_key: list[str] = ['update_at']) -> None:
999
1005
  """
1000
1006
  删除指定表指定日期的冗余数据。
1001
1007
  :param table_name: 表名
@@ -1029,7 +1035,7 @@ class OptimizeDatas:
1029
1035
  logger.error(f'{self.db_name}/{table_name}, {e}')
1030
1036
  self.connection.rollback()
1031
1037
 
1032
- def delete_duplicate2(self, table_name: str, except_key: list[str] = ['更新时间']) -> None:
1038
+ def delete_duplicate2(self, table_name: str, except_key: list[str] = ['update_at']) -> None:
1033
1039
  """
1034
1040
  删除指定表(无日期列)的冗余数据。
1035
1041
  :param table_name: 表名
@@ -485,7 +485,7 @@ class QueryDatas:
485
485
  logger.error('获取列类型失败', {'库': db_name, '表': table_name, '列': columns_name, '错误': str(e)})
486
486
  return []
487
487
 
488
- def check_condition(self, db_name: str, table_name: str, condition: str, columns: str = '更新时间') -> Optional[List[Dict[str, Any]]]:
488
+ def check_condition(self, db_name: str, table_name: str, condition: str, columns: str = 'update_at') -> Optional[List[Dict[str, Any]]]:
489
489
  """按指定条件查询数据库表"""
490
490
  if not self._get_table_info(db_name, table_name):
491
491
  return None
@@ -694,14 +694,16 @@ class TableManager:
694
694
  # 主键定义(始终使用id作为主键)
695
695
  primary_key_def = "PRIMARY KEY (`id`)"
696
696
 
697
- # 唯一约束定义
697
+ # 唯一约束定义 - 使用前缀索引处理超长字段
698
698
  unique_defs = []
699
699
  if unique_keys:
700
700
  for i, uk in enumerate(unique_keys):
701
701
  # 过滤掉系统列
702
702
  filtered_uk = [col for col in uk if col.lower() not in ['id', 'create_at', 'update_at']]
703
703
  if filtered_uk:
704
- safe_uk = [f"`{self._sanitize_identifier(col)}`" for col in filtered_uk]
704
+ # 使用前缀索引处理超长字段
705
+ optimized_uk = self._apply_prefix_index_to_columns(filtered_uk, columns)
706
+ safe_uk = [f"`{self._sanitize_identifier(col)}`" for col in optimized_uk]
705
707
  unique_name = f"uniq_{i}"
706
708
  unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_uk)})")
707
709
 
@@ -764,6 +766,50 @@ class TableManager:
764
766
  return f"`{cleaned}`"
765
767
 
766
768
  return cleaned
769
+
770
+ def _apply_prefix_index_to_columns(self, columns: List[str], column_definitions: Dict[str, str]) -> List[str]:
771
+ """
772
+ 为超长的varchar字段应用前缀索引,防止索引键长度超限
773
+
774
+ :param columns: 列名列表
775
+ :param column_definitions: 列定义字典
776
+ :return: 应用前缀索引后的列名列表
777
+ """
778
+ optimized_columns = []
779
+
780
+ for col in columns:
781
+ col_type = column_definitions.get(col, 'varchar(255)').lower()
782
+
783
+ # 只对varchar字段应用前缀索引
784
+ if 'varchar' in col_type:
785
+ # 提取varchar长度
786
+ match = re.search(r'varchar\((\d+)\)', col_type)
787
+ if match:
788
+ length = int(match.group(1))
789
+ # 如果varchar长度超过191字符,使用前缀索引
790
+ # 191 * 4 = 764字节,在3072字节限制内比较安全
791
+ if length > 191:
792
+ prefix_length = 191 # 使用191字符作为前缀
793
+ optimized_columns.append(f"{col}({prefix_length})")
794
+ logger.info('应用前缀索引', {
795
+ '列名': col,
796
+ '原始长度': length,
797
+ '前缀长度': prefix_length
798
+ })
799
+ else:
800
+ optimized_columns.append(col)
801
+ else:
802
+ # 如果没有指定长度,默认使用前缀索引
803
+ optimized_columns.append(f"{col}(191)")
804
+ logger.info('应用默认前缀索引', {
805
+ '列名': col,
806
+ '前缀长度': 191
807
+ })
808
+ else:
809
+ # 非varchar字段保持原样
810
+ optimized_columns.append(col)
811
+
812
+ return optimized_columns
767
813
 
768
814
 
769
815
  class DataProcessor:
@@ -477,7 +477,6 @@ class SkuPicture:
477
477
  '推荐卖点': desc,
478
478
  '获取与下载': '已获取',
479
479
  '类目': leimu,
480
- '更新时间': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
481
480
  }
482
481
  )
483
482
  except Exception as e:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdbq
3
- Version: 4.2.3
3
+ Version: 4.2.5
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1 +0,0 @@
1
- VERSION = '4.2.3'
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes