mdbq 3.11.6__py3-none-any.whl → 3.11.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.11.6'
1
+ VERSION = '3.11.7'
@@ -80,7 +80,8 @@ class MySQLDeduplicator:
80
80
  date_column: str = '日期',
81
81
  exclude_columns: Optional[List[str]] = None,
82
82
  exclude_databases: Optional[List[str]] = None,
83
- exclude_tables: Optional[Dict[str, List[str]]] = None
83
+ exclude_tables: Optional[Dict[str, List[str]]] = None,
84
+ duplicate_keep_mode: str = 'keep_one' # 新增参数
84
85
  ) -> None:
85
86
  """
86
87
  初始化去重处理器
@@ -90,6 +91,7 @@ class MySQLDeduplicator:
90
91
  :param exclude_columns: 去重时排除的列名列表,默认为['id', '更新时间']
91
92
  :param exclude_databases: 排除的数据库名列表
92
93
  :param exclude_tables: 排除的表名字典 {数据库名: [表名, ...]}
94
+ :param duplicate_keep_mode: 'keep_one'(默认,重复组保留一条),'remove_all'(全部删除重复组)
93
95
  """
94
96
  # 连接池状态标志
95
97
  self._closed = False
@@ -173,6 +175,8 @@ class MySQLDeduplicator:
173
175
  self.exclude_databases = set([db.lower() for db in exclude_databases]) if exclude_databases else set()
174
176
  self.exclude_tables = {k.lower(): set([t.lower() for t in v]) for k, v in (exclude_tables or {}).items()}
175
177
 
178
+ self.duplicate_keep_mode = duplicate_keep_mode if duplicate_keep_mode in ('keep_one', 'remove_all') else 'keep_one'
179
+
176
180
  def _get_connection(self) -> pymysql.connections.Connection:
177
181
  """
178
182
  从连接池获取一个数据库连接。
@@ -416,14 +420,28 @@ class MySQLDeduplicator:
416
420
  if not dry_run:
417
421
  # 分批删除,避免锁表
418
422
  while True:
419
- delete_dup_sql = f"""
420
- DELETE FROM `{database}`.`{table}`
421
- WHERE `{pk_real}` NOT IN (
422
- SELECT `min_id` FROM `{database}`.`{temp_table}`
423
- ) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
424
- AND ({' AND '.join([f'`{col}` IS NOT NULL' for col in use_columns])})
425
- LIMIT {self.batch_size}
426
- """
423
+ if self.duplicate_keep_mode == 'remove_all':
424
+ # 删除所有重复组的所有记录
425
+ delete_dup_sql = f"""
426
+ DELETE FROM `{database}`.`{table}`
427
+ WHERE ({', '.join([f'`{col}`' for col in use_columns])}) IN (
428
+ SELECT {column_list} FROM `{database}`.`{temp_table}`
429
+ ) {'AND' if use_time_filter else ''} {f'`{time_col}` >= \'{self._dedup_start_date}\' AND `{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
430
+ LIMIT {self.batch_size}
431
+ """
432
+ else:
433
+ # 修正:只删除重复组中不是min_id的行,唯一数据不动
434
+ delete_dup_sql = f"""
435
+ DELETE FROM `{database}`.`{table}` t
436
+ WHERE EXISTS (
437
+ SELECT 1 FROM `{database}`.`{temp_table}` tmp
438
+ WHERE
439
+ {' AND '.join([f't.`{col}` <=> tmp.`{col}`' for col in use_columns])}
440
+ AND t.`{pk_real}` <> tmp.`min_id`
441
+ )
442
+ {'AND' if use_time_filter else ''} {f't.`{time_col}` >= \'{self._dedup_start_date}\' AND t.`{time_col}` <= \'{self._dedup_end_date}\'' if use_time_filter else ''}
443
+ LIMIT {self.batch_size}
444
+ """
427
445
  logger.debug('执行删除重复数据SQL', {'sql': delete_dup_sql})
428
446
  cursor.execute(delete_dup_sql)
429
447
  batch_deleted = cursor.rowcount
@@ -431,7 +449,7 @@ class MySQLDeduplicator:
431
449
  conn.commit()
432
450
  if batch_deleted < self.batch_size:
433
451
  break
434
- logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns})
452
+ logger.info('操作删除', {"库": database, "表": table, "数据量": total_count, "重复组数": dup_count, "实际删除": affected_rows, "时间范围": [self._dedup_start_date, self._dedup_end_date] if use_time_filter else None, "实际去重列": use_columns, "去重模式": self.duplicate_keep_mode})
435
453
  # 新增:去重后重排id
436
454
  if reset_id and affected_rows > 0:
437
455
  self._reset_id_column(database, table)
@@ -873,7 +891,6 @@ def main():
873
891
  deduplicator.deduplicate_all(dry_run=False, parallel=False, reset_id=False)
874
892
 
875
893
  # # 指定数据库去重(多线程)
876
- # logger.info('调用deduplicate_database')
877
894
  # deduplicator.deduplicate_database('my_db', dry_run=False, parallel=True, reset_id=False)
878
895
 
879
896
  # # 指定表去重(使用特定列)
@@ -883,5 +900,5 @@ def main():
883
900
  deduplicator.close()
884
901
 
885
902
  if __name__ == '__main__':
886
- main()
903
+ # main()
887
904
  pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.11.6
3
+ Version: 3.11.7
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=jUdj4-uaa03JUoNnXK_fTx_XQfDwjeFprE71R3ZenRY,18
2
+ mdbq/__version__.py,sha256=KXKzNBZD4M6L-jW29owhKjBycLSiUdGeTf_uNAYvyGI,18
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/query_data.py,sha256=nxL8hSy8yI1QLlqnkTNHHQSxRfo-6WKL5OA-N4xLB7c,179832
5
5
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -8,7 +8,7 @@ mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
8
8
  mdbq/log/mylogger.py,sha256=HuxLBCXjm6fZrxYE0rdpUCz359WGeqOX0vvg9jTuRY4,24126
9
9
  mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,1664
10
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
11
- mdbq/mysql/deduplicator.py,sha256=XSAgt6HqvzDyZSiv4mHli5fA3p3ePn5g3HupqI2cyVo,41444
11
+ mdbq/mysql/deduplicator.py,sha256=dq40YBLVd5ho28pYzVfwm5pA90YA3iN6l9xX4k0Ynds,42808
12
12
  mdbq/mysql/mysql.py,sha256=Kjpi-LL00WQUmTTOfhEBsNrmo4-4kFFJzrHbVKfqiBE,56770
13
13
  mdbq/mysql/s_query.py,sha256=dlnrVJ3-Vp1Suv9CNbPxyYSRqRJUHjOpF39tb2F-wBc,10190
14
14
  mdbq/mysql/uploader.py,sha256=LxPlAfSNhQbLu-or4wxa-vLjCw5_PIN3ZVoksWUJazQ,61701
@@ -24,7 +24,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
24
24
  mdbq/redis/getredis.py,sha256=YHgCKO8mEsslwet33K5tGss-nrDDwPnOSlhA9iBu0jY,24078
25
25
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
26
26
  mdbq/spider/aikucun.py,sha256=cqK-JRd_DHbToC7hyo83m8o97NZkJFqmB2xBtr6aAVU,20961
27
- mdbq-3.11.6.dist-info/METADATA,sha256=gEcCBqGoPLhugYleGDv1r2YV_lHPL3AnGHD_dHTxY3Q,364
28
- mdbq-3.11.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
- mdbq-3.11.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
- mdbq-3.11.6.dist-info/RECORD,,
27
+ mdbq-3.11.7.dist-info/METADATA,sha256=j_0kmOn4tTbk8TY8LqbEZ2OWmJz0-70sUQNwP_N0VCc,364
28
+ mdbq-3.11.7.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
29
+ mdbq-3.11.7.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
30
+ mdbq-3.11.7.dist-info/RECORD,,
File without changes