mdbq 4.0.2__py3-none-any.whl → 4.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/mysql/uploader.py CHANGED
@@ -14,6 +14,7 @@ from dbutils.pooled_db import PooledDB
14
14
  import json
15
15
  import sys
16
16
  from decimal import Decimal, InvalidOperation
17
+ import math
17
18
 
18
19
  warnings.filterwarnings('ignore')
19
20
  logger = mylogger.MyLogger(
@@ -240,8 +241,16 @@ class MySQLUploader:
240
241
  conn = self.pool.connection()
241
242
  return conn
242
243
  except Exception as e:
243
- logger.error('从连接池获取数据库连接失败', {'error': str(e)})
244
- raise ConnectionError(f'连接数据库失败: {str(e)}')
244
+ logger.error('从连接池获取数据库连接失败,尝试重建连接池', {'error': str(e)})
245
+ # 强制重建连接池
246
+ try:
247
+ self.pool = self._create_connection_pool()
248
+ conn = self.pool.connection()
249
+ logger.info('重建连接池后获取连接成功')
250
+ return conn
251
+ except Exception as e2:
252
+ logger.error('重建连接池后依然获取连接失败', {'error': str(e2)})
253
+ raise ConnectionError(f'连接数据库失败: {str(e2)}')
245
254
 
246
255
  @_execute_with_retry
247
256
  def _check_database_exists(self, db_name: str) -> bool:
@@ -407,31 +416,36 @@ class MySQLUploader:
407
416
  col_def += " NOT NULL"
408
417
  column_defs.append(col_def)
409
418
  # 主键处理逻辑调整
419
+ def _index_col_sql(col):
420
+ col_type = set_typ.get(col, '').lower()
421
+ if 'varchar' in col_type or 'text' in col_type:
422
+ return f"`{self._normalize_col(col)}`(100)"
423
+ return f"`{self._normalize_col(col)}`"
410
424
  if primary_keys and len(primary_keys) > 0:
411
- safe_primary_keys = [self._normalize_col(pk) for pk in primary_keys]
412
- primary_key_sql = f"PRIMARY KEY (`{'`,`'.join(safe_primary_keys)}`)"
425
+ safe_primary_keys = [_index_col_sql(pk) for pk in primary_keys]
426
+ primary_key_sql = f"PRIMARY KEY ({','.join(safe_primary_keys)})"
413
427
  else:
414
- safe_primary_keys = [self._normalize_col('id')]
428
+ safe_primary_keys = [_index_col_sql('id')]
415
429
  primary_key_sql = f"PRIMARY KEY (`id`)"
416
430
  # 索引统一在CREATE TABLE中定义
417
431
  index_defs = []
418
432
  if date_column and date_column in set_typ:
419
- safe_date_col = self._normalize_col(date_column)
420
- index_defs.append(f"INDEX `idx_{safe_date_col}` (`{safe_date_col}`)")
433
+ safe_date_col = _index_col_sql(date_column)
434
+ index_defs.append(f"INDEX `idx_{self._normalize_col(date_column)}` ({safe_date_col})")
421
435
  if indexes:
422
436
  for idx_col in indexes:
423
437
  if idx_col in set_typ:
424
- safe_idx_col = self._normalize_col(idx_col)
425
- index_defs.append(f"INDEX `idx_{safe_idx_col}` (`{safe_idx_col}`)")
438
+ safe_idx_col = _index_col_sql(idx_col)
439
+ index_defs.append(f"INDEX `idx_{self._normalize_col(idx_col)}` ({safe_idx_col})")
426
440
  # UNIQUE KEY定义
427
441
  unique_defs = []
428
442
  if unique_keys:
429
443
  for unique_cols in unique_keys:
430
444
  if not unique_cols:
431
445
  continue
432
- safe_unique_cols = [self._normalize_col(col) for col in unique_cols]
433
- unique_name = f"uniq_{'_'.join(safe_unique_cols)}"
434
- unique_defs.append(f"UNIQUE KEY `{unique_name}` (`{'`,`'.join(safe_unique_cols)}`)")
446
+ safe_unique_cols = [_index_col_sql(col) for col in unique_cols]
447
+ unique_name = f"uniq_{'_'.join([self._normalize_col(c) for c in unique_cols])}"
448
+ unique_defs.append(f"UNIQUE KEY `{unique_name}` ({','.join(safe_unique_cols)})")
435
449
  index_defs = list(set(index_defs))
436
450
  all_defs = column_defs + [primary_key_sql] + index_defs + unique_defs
437
451
  sql = f"""
@@ -447,7 +461,7 @@ class MySQLUploader:
447
461
  conn.commit()
448
462
  logger.info('数据表及索引已创建', {'库': db_name, '表': table_name, '索引': indexes, '唯一约束': unique_keys})
449
463
  except Exception as e:
450
- logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e)})
464
+ logger.error('建表失败', {'库': db_name, '表': table_name, '错误': str(e), '异常类型': type(e).__name__})
451
465
  if conn is not None:
452
466
  conn.rollback()
453
467
  raise
@@ -491,25 +505,45 @@ class MySQLUploader:
491
505
  def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
492
506
  """
493
507
  根据列类型验证并转换数据值
494
-
495
- :param value: 要验证的值
496
- :param column_type: 列的数据类型
497
- :param allow_null: 是否允许空值
498
- :param db_name: 数据库名(用于日志)
499
- :param table_name: 表名(用于日志)
500
- :param col_name: 列名(用于日志)
501
- :return: 转换后的值
502
- :raises ValueError: 当值转换失败时抛出
503
508
  """
509
+ column_type_lower = column_type.lower() if column_type else ''
510
+ # 统一判断None/NaN
511
+ is_nan = False
504
512
  if value is None:
513
+ is_nan = True
514
+ elif isinstance(value, float) and math.isnan(value):
515
+ is_nan = True
516
+ elif str(value).lower() in ['nan', 'none']:
517
+ is_nan = True
518
+ if is_nan:
505
519
  if not allow_null:
506
- logger.warning('字段值为None但不允许空值, 已填充为none', {
507
- '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
508
- })
509
- return 'none'
520
+ if 'int' in column_type_lower:
521
+ logger.debug('字段值为None/NaN但不允许空值, 已填充为0', {
522
+ '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
523
+ })
524
+ return 0
525
+ elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
526
+ logger.debug('字段值为None/NaN但不允许空值, 已填充为0.0', {
527
+ '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
528
+ })
529
+ return 0.0
530
+ elif 'date' in column_type_lower or 'time' in column_type_lower:
531
+ # 判断是date还是datetime/timestamp
532
+ if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
533
+ default_date = '2000-01-01 00:00:00'
534
+ else:
535
+ default_date = '2000-01-01'
536
+ logger.debug('字段值为None/NaN但不允许空值, 已填充为默认日期', {
537
+ '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type, '默认值': default_date
538
+ })
539
+ return default_date
540
+ else:
541
+ logger.debug('字段值为None/NaN但不允许空值, 已填充为none字符串', {
542
+ '库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
543
+ })
544
+ return 'none'
510
545
  return None
511
546
  try:
512
- column_type_lower = column_type.lower()
513
547
  if isinstance(value, str) and value.strip().endswith('%'):
514
548
  try:
515
549
  percent_str = value.strip().replace('%', '')
@@ -881,22 +915,21 @@ class MySQLUploader:
881
915
  # set_typ的键清洗
882
916
  set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
883
917
 
884
- # 获取数据中实际存在的列名
885
- data_columns = set()
886
- if data and len(data) > 0:
887
- data_columns = set(data[0].keys())
888
-
889
- # 过滤set_typ,只保留数据中存在的列
918
+ # 新实现:严格按set_typ顺序过滤,后补充data中有但set_typ没有的列
890
919
  filtered_set_typ = {}
891
- for col in data_columns:
892
- if col in set_typ:
920
+ data_columns = list(data[0].keys()) if data and len(data) > 0 else []
921
+ # 先按set_typ顺序
922
+ for col in set_typ:
923
+ if col in data_columns:
893
924
  filtered_set_typ[col] = set_typ[col]
894
- else:
895
- # 如果列不在set_typ中,采样多个非None值推断类型
925
+ # 再补充data中有但set_typ没有的列
926
+ for col in data_columns:
927
+ if col not in filtered_set_typ:
928
+ # 推断类型
896
929
  sample_values = [row[col] for row in data if col in row and row[col] is not None][:5]
897
930
  inferred_type = None
898
931
  for val in sample_values:
899
- inferred_type = self._infer_data_type(val, no_log=True) # 推断日期类型不记录日志, 避免日志噪音过多
932
+ inferred_type = self._infer_data_type(val, no_log=True)
900
933
  if inferred_type:
901
934
  break
902
935
  if not inferred_type:
@@ -1326,8 +1359,7 @@ class MySQLUploader:
1326
1359
  if cached:
1327
1360
  return cached
1328
1361
  # 获取所有列名(排除id)
1329
- all_columns = [col for col in set_typ.keys()
1330
- if col.lower() != 'id']
1362
+ all_columns = [col for col in set_typ.keys() if col.lower() != 'id']
1331
1363
  if not check_duplicate:
1332
1364
  sql = self._build_simple_insert_sql(db_name, table_name, all_columns,
1333
1365
  update_on_duplicate)
@@ -1364,7 +1396,6 @@ class MySQLUploader:
1364
1396
  - 只有遇到严重的数据库错误(如所有行都因唯一约束冲突且没有ON DUPLICATE KEY UPDATE),才会整体回滚。
1365
1397
  - 返回值为(插入行数, 跳过行数, 失败行数)。
1366
1398
  """
1367
- import pymysql # 确保异常类型可用
1368
1399
  def get_optimal_batch_size(total_rows: int) -> int:
1369
1400
  if total_rows <= 100:
1370
1401
  return total_rows
@@ -1612,5 +1643,5 @@ def main():
1612
1643
 
1613
1644
 
1614
1645
  if __name__ == '__main__':
1615
- main()
1646
+ # main()
1616
1647
  pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 4.0.2
3
+ Version: 4.0.4
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,7 +1,7 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=_fU0Mj16CzZHiwvkoGULFRC0vc6b0FxDy6MgQQON3Gw,17
2
+ mdbq/__version__.py,sha256=lAYjWBa6ThlPWc3_1b0lWM6fn_3Z9ckuALnMMZXhBbs,17
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
- mdbq/aggregation/query_data.py,sha256=U6dYK8_gEaNnsGKooEkzfAWnzNA8kt0uomec49e4olE,177536
4
+ mdbq/aggregation/query_data.py,sha256=_k6Jg60RaaT056sIaiSO6v84dEnOIOGq-nUJtSr65kI,171861
5
5
  mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
6
6
  mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
7
7
  mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
@@ -10,9 +10,9 @@ mdbq/log/spider_logging.py,sha256=-ozWWEGm3HVv604ozs_OOvVwumjokmUPwbaodesUrPY,16
10
10
  mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
11
11
  mdbq/mysql/deduplicator.py,sha256=8v3MC6TJ0YEiExWrTP9OXAxTYnL9XbpYL2vWaER1h2M,73099
12
12
  mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
13
- mdbq/mysql/s_query.py,sha256=tSBEbyuVQBeE6tckHSbguAQh9T07tvPhf4J6DlpUBP8,10508
13
+ mdbq/mysql/s_query.py,sha256=RnVCwMQ_n9PcAimbMWbHe9k8eil8shtCfa3LwLBZi6c,41909
14
14
  mdbq/mysql/unique_.py,sha256=Wgqq_PjAAD757JTa10wjYaJgssZ_C_ypU6DW56jbuyw,21074
15
- mdbq/mysql/uploader.py,sha256=n6K2rn7cGHGLGevX2JO7pKrUiQiGNluRx3CwnxARZGI,72503
15
+ mdbq/mysql/uploader.py,sha256=e49Gk09K766QXaus_p3VOMcH2VbexQzKsqDTCGrWoWQ,74419
16
16
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
17
17
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
18
18
  mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
25
25
  mdbq/redis/getredis.py,sha256=l3zBK7wrZl0oO42-_UGylyatnIp_SBw8wDDvof9fht4,23534
26
26
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
27
27
  mdbq/spider/aikucun.py,sha256=hPRzLQvFIF4ibN8aP3Dg_ru5meac90faPyzOB22cj-o,20965
28
- mdbq-4.0.2.dist-info/METADATA,sha256=EdCDmGum97Hjm9aKPV-2qAf9IoMkh-F0DE6ATMLJTX8,363
29
- mdbq-4.0.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
30
- mdbq-4.0.2.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
31
- mdbq-4.0.2.dist-info/RECORD,,
28
+ mdbq-4.0.4.dist-info/METADATA,sha256=VM2dtOiBJ74NlYhq9UWAFAPloayAXPX9bLKnvZJd7Xg,363
29
+ mdbq-4.0.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
30
+ mdbq-4.0.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
31
+ mdbq-4.0.4.dist-info/RECORD,,
File without changes