mdbq 3.12.3__py3-none-any.whl → 3.12.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.12.3'
1
+ VERSION = '3.12.4'
@@ -16,12 +16,11 @@ from datetime import datetime
16
16
  import uuid
17
17
  from contextlib import contextmanager
18
18
 
19
-
20
19
  warnings.filterwarnings('ignore')
21
20
  logger = mylogger.MyLogger(
22
21
  name='deduplicator',
23
22
  logging_mode='file',
24
- log_level='debug',
23
+ log_level='info',
25
24
  log_file='deduplicator.log',
26
25
  log_format='json',
27
26
  max_log_size=50,
@@ -72,7 +71,8 @@ class MySQLDeduplicator:
72
71
  exclude_columns: Optional[List[str]] = None,
73
72
  exclude_databases: Optional[List[str]] = None,
74
73
  exclude_tables: Optional[Dict[str, List[str]]] = None,
75
- duplicate_keep_mode: str = 'keep_one'
74
+ duplicate_keep_mode: str = 'keep_one',
75
+ keep_order: str = 'min'
76
76
  ) -> None:
77
77
  """
78
78
  初始化去重处理器
@@ -178,6 +178,7 @@ class MySQLDeduplicator:
178
178
  self.exclude_tables = {k.lower(): set(t.lower() for t in v) for k, v in (exclude_tables or {}).items()}
179
179
 
180
180
  self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
181
+ self.keep_order = keep_order if keep_order in ('min', 'max') else 'min'
181
182
 
182
183
  def _get_connection(self) -> pymysql.connections.Connection:
183
184
  """
@@ -442,18 +443,31 @@ class MySQLDeduplicator:
442
443
 
443
444
  # 用Python查找重复
444
445
  if use_python_dedup:
446
+ # 判断分组字段是否有“更新时间”
447
+ has_update_time = any(col == '更新时间' for col in use_columns)
445
448
  select_cols = f'`{pk_real}`,' + ','.join([f'`{col}`' for col in use_columns])
446
- select_where = f"WHERE `{time_col}` = '{date_val}'"
449
+ if has_update_time:
450
+ select_cols += ',`更新时间`'
451
+ select_where = f"WHERE `{time_col}` = '{date_val}'" if date_val else ''
447
452
  grouped = defaultdict(list)
448
453
  for row in self._row_generator(database, table, select_cols, select_where, self.batch_size):
449
454
  key = tuple(row[col] for col in use_columns)
450
- grouped[key].append(row[pk_real])
455
+ grouped[key].append(row)
451
456
  dup_count = 0
452
457
  del_ids = []
453
458
  for ids in grouped.values():
454
459
  if len(ids) > 1:
455
460
  dup_count += 1
456
- del_ids.extend(ids[1:])
461
+ if has_update_time:
462
+ # 按更新时间最大保留
463
+ keep_row = max(ids, key=lambda x: x.get('更新时间') or '')
464
+ else:
465
+ # 按id保留
466
+ if self.keep_order == 'max':
467
+ keep_row = max(ids, key=lambda x: x[pk_real])
468
+ else:
469
+ keep_row = min(ids, key=lambda x: x[pk_real])
470
+ del_ids.extend([r[pk_real] for r in ids if r[pk_real] != keep_row[pk_real]])
457
471
  affected_rows = 0
458
472
  if not dry_run and del_ids:
459
473
  with self._conn_ctx() as conn:
@@ -469,9 +483,18 @@ class MySQLDeduplicator:
469
483
  temp_table = self._make_temp_table_name(table)
470
484
  drop_temp_sql = f"DROP TABLE IF EXISTS `{database}`.`{temp_table}`"
471
485
  create_temp_where = f"WHERE `{time_col}` = '{date_val}'"
486
+ # 判断分组字段是否有“更新时间”
487
+ has_update_time = any(col == '更新时间' for col in use_columns)
488
+ if has_update_time:
489
+ keep_field = '更新时间'
490
+ keep_func = 'MAX'
491
+ else:
492
+ keep_field = pk_real
493
+ keep_func = 'MAX' if self.keep_order == 'max' else 'MIN'
494
+ keep_alias = 'keep_val'
472
495
  create_temp_sql = f"""
473
496
  CREATE TABLE `{database}`.`{temp_table}` AS
474
- SELECT MIN(`{pk_real}`) as `min_id`, {column_list}, COUNT(*) as `dup_count`
497
+ SELECT {keep_func}(`{keep_field}`) as `{keep_alias}`, {column_list}, COUNT(*) as `dup_count`
475
498
  FROM `{database}`.`{table}`
476
499
  {create_temp_where}
477
500
  GROUP BY {column_list}
@@ -494,7 +517,7 @@ class MySQLDeduplicator:
494
517
  while True:
495
518
  where_clauses = []
496
519
  if self.duplicate_keep_mode == 'keep_one':
497
- where_clauses.append(f"t.`{pk_real}` <> tmp.`min_id`")
520
+ where_clauses.append(f"t.`{keep_field}` <> tmp.`{keep_alias}`")
498
521
  if where_sql.strip():
499
522
  where_clauses.append(where_sql.strip())
500
523
  where_full = "WHERE " + " AND ".join(where_clauses) if where_clauses else ""
@@ -752,7 +775,7 @@ class MySQLDeduplicator:
752
775
  logger.warning('分区处理失败', {"库": database, "表": table, "日期": date_val, "异常": err, "func": sys._getframe().f_code.co_name})
753
776
  total_dup += dup_count
754
777
  total_del += affected_rows
755
- logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}"})
778
+ logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": (total_dup, total_del), '日期范围': f"{start_date} - {end_date}", "唯一列": columns})
756
779
  # 自动重排id列(仅当有实际删除时且reorder_id为True)
757
780
  if reorder_id and total_del > 0:
758
781
  try:
@@ -761,11 +784,11 @@ class MySQLDeduplicator:
761
784
  except Exception as e:
762
785
  logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
763
786
  if affected_rows > 0:
764
- logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del})
787
+ logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": total_dup, "实际删除": total_del, "唯一列": columns})
765
788
  return (total_dup, total_del)
766
789
  # 没有date_column,直接全表去重
767
790
  result = self._deduplicate_table(database, table, columns, dry_run, use_python_dedup, date_val=None)
768
- logger.debug('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表'})
791
+ logger.info('单表完成', {"库": database, "表": table, "结果[重复, 删除]": result, '日期范围': '全表', "唯一列": columns})
769
792
  dup_count, affected_rows = result
770
793
  if reorder_id and affected_rows > 0:
771
794
  try:
@@ -774,7 +797,7 @@ class MySQLDeduplicator:
774
797
  except Exception as e:
775
798
  logger.error('自动重排id列异常', {"库": database, "表": table, "异常": str(e)})
776
799
  if affected_rows > 0:
777
- logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows})
800
+ logger.info('单表完成(仅显示有删除的结果)', {"库": database, "表": table, "重复组": dup_count, "实际删除": affected_rows, "唯一列": columns})
778
801
  return result
779
802
  except Exception as e:
780
803
  logger.error('发生全局错误', {"库": database, "表": table, 'func': sys._getframe().f_code.co_name, "发生全局错误": str(e)})
@@ -1314,18 +1337,24 @@ class MySQLDeduplicator:
1314
1337
 
1315
1338
 
1316
1339
  def main():
1340
+ from mdbq.config import config
1341
+ dir_path = os.path.expanduser("~")
1342
+ my_cont = config.read_config(file_path=os.path.join(dir_path, 'spd.txt'))
1343
+ username, password, host, port = my_cont['username'], my_cont['password'], my_cont['host'], int(my_cont['port'])
1344
+ # host = 'localhost'
1345
+
1317
1346
  deduplicator = MySQLDeduplicator(
1318
- username='root',
1319
- password='pwd',
1320
- host='localhost',
1321
- port=3306,
1347
+ username=username,
1348
+ password=password,
1349
+ host=host,
1350
+ port=port,
1322
1351
  max_workers= 2,
1323
1352
  batch_size=1000,
1324
1353
  skip_system_dbs=True,
1325
1354
  max_retries=3,
1326
1355
  retry_waiting_time=5,
1327
1356
  pool_size=30,
1328
- recent_month=1,
1357
+ # recent_month=1,
1329
1358
  # date_range=['2025-06-09', '2025-06-10'],
1330
1359
  exclude_columns=['更新时间'],
1331
1360
  # exclude_databases=['测试库4'],
@@ -1338,6 +1367,7 @@ def main():
1338
1367
  # "商品排行_2025",
1339
1368
  # ],
1340
1369
  # },
1370
+ keep_order='MAX', # 保留重复组中指定列的最大值
1341
1371
  )
1342
1372
 
1343
1373
  # 全库去重(单线程)
@@ -1347,7 +1377,13 @@ def main():
1347
1377
  # deduplicator.deduplicate_database('数据引擎2', dry_run=False, parallel=True, reorder_id=True)
1348
1378
 
1349
1379
  # # 指定表去重(使用特定列)
1350
- deduplicator.deduplicate_table('安全组', '腾讯云cvm规则', columns=['平台', '本地主机', '端口范围', '授权ip'], dry_run=False, reorder_id=True)
1380
+ deduplicator.deduplicate_table(
1381
+ '京东数据3',
1382
+ '推广数据_京准通_2024',
1383
+ columns=['日期', '店铺名称', '产品线', '触发sku_id', '跟单sku_id', 'spu_id', '花费', '展现数', '点击数'],
1384
+ dry_run=False,
1385
+ reorder_id=True,
1386
+ )
1351
1387
 
1352
1388
  # # 重排id列
1353
1389
  # deduplicator.reorder_id_column('my_db', 'my_table', 'id', dry_run=False, auto_drop_backup=True)
mdbq/mysql/uploader.py CHANGED
@@ -323,7 +323,7 @@ class MySQLUploader:
323
323
  logger.error('无效的标识符', {'标识符': identifier})
324
324
  raise ValueError(f"无效的标识符: `{identifier}`")
325
325
  # 始终做特殊字符清理
326
- cleaned = re.sub(r'[^-\uFFFF\w\u4e00-\u9fff$]', '_', identifier)
326
+ cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
327
327
  cleaned = re.sub(r'_+', '_', cleaned).strip('_')
328
328
  if not cleaned:
329
329
  logger.error('无法清理异常标识符', {'原始标识符': identifier})
@@ -332,6 +332,8 @@ class MySQLUploader:
332
332
  'select', 'insert', 'update', 'delete', 'from', 'where', 'and', 'or',
333
333
  'not', 'like', 'in', 'is', 'null', 'true', 'false', 'between'
334
334
  }
335
+ if len(cleaned) > 64:
336
+ cleaned = cleaned[:64]
335
337
  if cleaned.lower() in mysql_keywords:
336
338
  logger.debug('存在MySQL保留字', {'标识符': cleaned})
337
339
  return f"`{cleaned}`"
@@ -423,11 +425,11 @@ class MySQLUploader:
423
425
  # UNIQUE KEY定义
424
426
  unique_defs = []
425
427
  if unique_keys:
426
- for idx, unique_cols in enumerate(unique_keys):
428
+ for unique_cols in unique_keys:
427
429
  if not unique_cols:
428
430
  continue
429
431
  safe_unique_cols = [self._normalize_col(col) for col in unique_cols]
430
- unique_name = f"uniq_{'_'.join(safe_unique_cols)}_{idx}"
432
+ unique_name = f"uniq_{'_'.join(safe_unique_cols)}"
431
433
  unique_defs.append(f"UNIQUE KEY `{unique_name}` (`{'`,`'.join(safe_unique_cols)}`)")
432
434
  index_defs = list(set(index_defs))
433
435
  all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
@@ -651,7 +653,7 @@ class MySQLUploader:
651
653
  添加UNIQUE KEY
652
654
  """
653
655
  safe_cols = [self._normalize_col(col) for col in unique_cols]
654
- unique_name = f"uniq_{'_'.join(safe_cols)}_{int(time.time()*1000)%100000}"
656
+ unique_name = f"uniq_{'_'.join(safe_cols)}"
655
657
  sql = f'ALTER TABLE `{db_name}`.`{table_name}` ADD UNIQUE KEY `{unique_name}` ({','.join(f'`{col}`' for col in safe_cols)})'
656
658
  try:
657
659
  with self._get_connection() as conn:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.12.3
3
+ Version: 3.12.4
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=9x3pJeqVdjIfn83Ln01YuTBiHKPCTeK3xkXQT4NcYno,18
2
+ mdbq/__version__.py,sha256=V3m3JOLEsFbctTVRE9dNc1iuNQahT_FbWxcHtAoWWqc,18
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
5
5
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,10 +8,10 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
8
8
  mdbq/log/mylogger.py,sha256=Crw6LwVo3I3IUbzIETu8f46Quza3CTCh-qYf4edbBPo,24139
9
9
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
10
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
11
- mdbq/mysql/deduplicator.py,sha256=uBRM2cBF-gzkFFrmBSKqBd_LLO-K67LYUJqpF9Fs928,70561
11
+ mdbq/mysql/deduplicator.py,sha256=MYcNe0rwWOFS2Bqac8yGwwothlz8H--cOi6yuZf3qIs,72602
12
12
  mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
13
13
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
14
- mdbq/mysql/uploader.py,sha256=szX6t4SObBF6fbHT2s5ixfh1-c288cigsJ66pFE02Qg,70266
14
+ mdbq/mysql/uploader.py,sha256=ekpPaJypnuwxi2v42e-khqwT_eZ5LRl1ylQP492xbkk,70271
15
15
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
16
16
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
17
17
  mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
24
24
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
25
25
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
26
26
  mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
27
- mdbq-3.12.3.dist-info/METADATA,sha256=kkujbmKfbA4mOHCloHBALjK3jI2sNoft1yyHmpbnmoI,364
28
- mdbq-3.12.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
- mdbq-3.12.3.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
- mdbq-3.12.3.dist-info/RECORD,,
27
+ mdbq-3.12.4.dist-info/METADATA,sha256=bptLkLuByUNJJnZ_ruWRjxeG2LOFvaqK26M3MtQkf78,364
28
+ mdbq-3.12.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
+ mdbq-3.12.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
+ mdbq-3.12.4.dist-info/RECORD,,
File without changes