mdbq 3.10.7__py3-none-any.whl → 3.10.9__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.10.7'
1
+ VERSION = '3.10.9'
@@ -457,6 +457,7 @@ def main():
457
457
  '安全组',
458
458
  # '视频数据',
459
459
  # '聚合数据',
460
+ '数据引擎2'
460
461
  ]
461
462
  tables_list = op.get_table_in_databases(db_list=db_list, reset_id=False)
462
463
  op.deduplicate(
@@ -3995,6 +3995,7 @@ def main(days=150, months=3):
3995
3995
  "推广数据2",
3996
3996
  "推广数据_淘宝店",
3997
3997
  "推广数据_奥莱店",
3998
+ "推广数据_圣积天猫店",
3998
3999
  "爱库存2",
3999
4000
  "生意参谋3",
4000
4001
  "生意经3",
@@ -4003,6 +4004,7 @@ def main(days=150, months=3):
4003
4004
  '商品人群画像2',
4004
4005
  '市场数据3',
4005
4006
  '回传数据',
4007
+ '数据引擎2',
4006
4008
  ]
4007
4009
  # 使用 ThreadPoolExecutor 来并行运行
4008
4010
  # with concurrent.futures.ThreadPoolExecutor() as executor:
mdbq/log/mylogger.py CHANGED
@@ -239,8 +239,9 @@ class MyLogger:
239
239
  log_data['异常'] = self.formatException(record.exc_info)
240
240
 
241
241
  # 过滤敏感信息
242
- if hasattr(record, '过滤'):
243
- for field in record.sensitive_fields:
242
+ if hasattr(record, 'extra_data') and '过滤' in record.extra_data:
243
+ sensitive_fields = record.extra_data['过滤']
244
+ for field in sensitive_fields:
244
245
  if field in log_data:
245
246
  log_data[field] = '***'
246
247
  if isinstance(log_data.get('message'), str):
@@ -447,8 +448,11 @@ class MyLogger:
447
448
  try:
448
449
  self._log_queue.put((level, message, extra), timeout=0.1)
449
450
  except queue.Full:
450
- # 队列满时降级为同步日志
451
- self._sync_log(level, f"[ASYNC QUEUE FULL] {message}", extra)
451
+ # 队列满时降级为同步日志,添加队列状态信息到extra
452
+ if extra is None:
453
+ extra = {}
454
+ extra['queue_status'] = 'full'
455
+ self._sync_log(level, message, extra)
452
456
  else:
453
457
  self._sync_log(level, message, extra)
454
458
 
@@ -509,21 +513,6 @@ class MyLogger:
509
513
  if not extra:
510
514
  extra = {}
511
515
 
512
- # # 获取完整的异常堆栈
513
- # tb = exc_info.__traceback__
514
- # while tb.tb_next:
515
- # tb = tb.tb_next # 获取最内层的堆栈帧
516
- #
517
- # extra.update({
518
- # 'module': tb.tb_frame.f_globals.get('__name__', ''),
519
- # 'function': tb.tb_frame.f_code.co_name,
520
- # 'line': tb.tb_lineno,
521
- # 'file': tb.tb_frame.f_code.co_filename,
522
- # '异常': str(exc_info),
523
- # '类型': exc_info.__class__.__name__,
524
- # '堆栈': self._format_traceback(exc_info)
525
- # })
526
-
527
516
  # 使用inspect获取调用栈
528
517
  frame = inspect.currentframe()
529
518
  try:
@@ -24,7 +24,7 @@ logger = mylogger.MyLogger(
24
24
  max_log_size=50,
25
25
  backup_count=5,
26
26
  enable_async=False, # 是否启用异步日志
27
- sample_rate=1, # 采样50%的DEBUG/INFO日志
27
+ sample_rate=1, # 采样DEBUG/INFO日志, 0.5表示50%的日志会被采样
28
28
  sensitive_fields=[], # 敏感字段列表
29
29
  )
30
30
 
@@ -72,8 +72,9 @@ class MySQLDeduplicator:
72
72
  skip_system_dbs: bool = True,
73
73
  max_retries: int = 3,
74
74
  retry_interval: int = 5,
75
- pool_size: int = 5
76
- ):
75
+ pool_size: int = 5,
76
+ primary_key: str = 'id'
77
+ ) -> None:
77
78
  """
78
79
  初始化去重处理器
79
80
 
@@ -88,6 +89,7 @@ class MySQLDeduplicator:
88
89
  :param max_retries: 最大重试次数
89
90
  :param retry_interval: 重试间隔(秒)
90
91
  :param pool_size: 连接池大小
92
+ :param primary_key: 主键列名,默认为'id'
91
93
  """
92
94
  # 连接池状态标志
93
95
  self._closed = False
@@ -110,6 +112,7 @@ class MySQLDeduplicator:
110
112
  self.skip_system_dbs = skip_system_dbs
111
113
  self.max_retries = max_retries
112
114
  self.retry_interval = retry_interval
115
+ self.primary_key = primary_key
113
116
 
114
117
  # 线程安全控制
115
118
  self._lock = threading.Lock()
@@ -118,7 +121,7 @@ class MySQLDeduplicator:
118
121
  # 系统数据库列表
119
122
  self.SYSTEM_DATABASES = {'information_schema', 'mysql', 'performance_schema', 'sys'}
120
123
 
121
- def _get_connection(self):
124
+ def _get_connection(self) -> pymysql.connections.Connection:
122
125
  """从连接池获取连接"""
123
126
  if self._closed:
124
127
  raise ConnectionError("连接池已关闭")
@@ -131,7 +134,7 @@ class MySQLDeduplicator:
131
134
  raise ConnectionError(f"连接数据库失败: {str(e)}")
132
135
 
133
136
  @staticmethod
134
- def _retry_on_failure(func):
137
+ def _retry_on_failure(func: Any) -> Any:
135
138
  """重试装饰器"""
136
139
 
137
140
  @wraps(func)
@@ -187,7 +190,7 @@ class MySQLDeduplicator:
187
190
 
188
191
  @_retry_on_failure
189
192
  def _get_table_columns(self, database: str, table: str) -> List[str]:
190
- """获取表的列名(排除id列)"""
193
+ """获取表的列名(排除主键列)"""
191
194
  sql = """
192
195
  SELECT COLUMN_NAME
193
196
  FROM INFORMATION_SCHEMA.COLUMNS
@@ -199,7 +202,7 @@ class MySQLDeduplicator:
199
202
  with conn.cursor() as cursor:
200
203
  cursor.execute(sql, (database, table))
201
204
  return [row['COLUMN_NAME'] for row in cursor.fetchall()
202
- if row['COLUMN_NAME'].lower() != 'id']
205
+ if row['COLUMN_NAME'].lower() != self.primary_key.lower()]
203
206
 
204
207
  def _acquire_table_lock(self, database: str, table: str) -> bool:
205
208
  """获取表处理锁,防止并发处理同一张表"""
@@ -212,7 +215,7 @@ class MySQLDeduplicator:
212
215
  self._processing_tables.add(key)
213
216
  return True
214
217
 
215
- def _release_table_lock(self, database: str, table: str):
218
+ def _release_table_lock(self, database: str, table: str) -> None:
216
219
  """释放表处理锁"""
217
220
  key = f"{database}.{table}"
218
221
 
@@ -255,7 +258,7 @@ class MySQLDeduplicator:
255
258
  if invalid_columns:
256
259
  logger.warning(
257
260
  f"表 {database}.{table} 中不存在以下列: {invalid_columns},使用有效列",
258
- {'invalid_columns': invalid_columns}
261
+ {'invalid_columns': list(invalid_columns)}
259
262
  )
260
263
  use_columns = [col for col in use_columns if col in all_columns]
261
264
 
@@ -265,14 +268,17 @@ class MySQLDeduplicator:
265
268
 
266
269
  # 构建去重SQL
267
270
  column_list = ', '.join([f'`{col}`' for col in use_columns])
268
- # temp_table = f"temp_{table}_{int(time.time())}"
269
- temp_table = f"temp_{table}_dedup_{os.getpid()}" # 使用进程ID构建临时表
270
- temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table) # 确保表名合法
271
-
272
- # 使用临时表方案处理去重,避免锁表问题
271
+ # 临时表名限制64字符以内
272
+ temp_table = f"temp_{table}_dedup_{os.getpid()}"
273
+ temp_table = re.sub(r'[^a-zA-Z0-9_]', '_', temp_table)[:60]
274
+ pk = self.primary_key
275
+ # 校验主键列是否存在
276
+ if pk not in all_columns and pk != 'id':
277
+ logger.error(f"表 {database}.{table} 不存在主键列 {pk}")
278
+ return (0, 0)
273
279
  create_temp_sql = f"""
274
280
  CREATE TABLE `{database}`.`{temp_table}` AS
275
- SELECT MIN(`id`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
281
+ SELECT MIN(`{pk}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
276
282
  FROM `{database}`.`{table}`
277
283
  GROUP BY {column_list}
278
284
  HAVING COUNT(*) > 1
@@ -280,7 +286,7 @@ class MySQLDeduplicator:
280
286
 
281
287
  delete_dup_sql = f"""
282
288
  DELETE FROM `{database}`.`{table}`
283
- WHERE `id` NOT IN (
289
+ WHERE `{pk}` NOT IN (
284
290
  SELECT `min_id` FROM `{database}`.`{temp_table}`
285
291
  ) AND ({' OR '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
286
292
  """
@@ -292,7 +298,8 @@ class MySQLDeduplicator:
292
298
  # 创建临时表统计重复数据
293
299
  cursor.execute(create_temp_sql)
294
300
  cursor.execute(f"SELECT COUNT(*) as cnt FROM `{database}`.`{temp_table}`")
295
- dup_count = cursor.fetchone()['cnt']
301
+ dup_count_row = cursor.fetchone()
302
+ dup_count = dup_count_row['cnt'] if dup_count_row and 'cnt' in dup_count_row else 0
296
303
 
297
304
  if dup_count == 0:
298
305
  logger.info(f"表 {database}.{table} 没有重复数据")
@@ -557,7 +564,7 @@ class MySQLDeduplicator:
557
564
  cursor.execute(sql, (database, table))
558
565
  return bool(cursor.fetchone())
559
566
 
560
- def close(self):
567
+ def close(self) -> None:
561
568
  """关闭连接池"""
562
569
  try:
563
570
  if hasattr(self, 'pool') and self.pool and not self._closed:
@@ -567,17 +574,17 @@ class MySQLDeduplicator:
567
574
  except Exception as e:
568
575
  logger.error(f"关闭连接池时出错: {str(e)}", {'error_type': type(e).__name__})
569
576
 
570
- def __enter__(self):
577
+ def __enter__(self) -> 'MySQLDeduplicator':
571
578
  return self
572
579
 
573
- def __exit__(self, exc_type, exc_val, exc_tb):
580
+ def __exit__(self, exc_type: Optional[type], exc_val: Optional[BaseException], exc_tb: Optional[Any]) -> None:
574
581
  self.close()
575
582
 
576
583
 
577
584
  def main():
578
585
  deduplicator = MySQLDeduplicator(
579
586
  username='root',
580
- password='188988yang188',
587
+ password='pwd',
581
588
  host='localhost',
582
589
  port=3306
583
590
  )
@@ -595,4 +602,5 @@ def main():
595
602
  deduplicator.close()
596
603
 
597
604
  if __name__ == '__main__':
598
- main()
605
+ # main()
606
+ pass