mdbq 3.9.9__py3-none-any.whl → 3.9.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/__version__.py CHANGED
@@ -1 +1 @@
1
- VERSION = '3.9.9'
1
+ VERSION = '3.9.10'
mdbq/mysql/uploader.py CHANGED
@@ -99,7 +99,6 @@ class MySQLUploader:
99
99
  :param port: 数据库端口,默认为3306
100
100
  :param charset: 字符集,默认为utf8mb4
101
101
  :param collation: 排序规则,默认为utf8mb4_0900_ai_ci
102
-
103
102
  :param max_retries: 最大重试次数,默认为10
104
103
  :param retry_interval: 重试间隔(秒),默认为10
105
104
  :param pool_size: 连接池大小,默认为5
@@ -122,7 +121,7 @@ class MySQLUploader:
122
121
  self.write_timeout = write_timeout
123
122
  self.ssl = ssl
124
123
  self._prepared_statements = StatementCache(maxsize=100)
125
- self._max_cached_statements = 100
124
+ self._max_cached_statements = 100 # 用于控制 StatementCache 类中缓存的 SQL 语句数量,最多缓存 100 条 SQL 语句
126
125
  self._table_metadata_cache = {}
127
126
  self.metadata_cache_ttl = 300 # 5分钟缓存时间
128
127
 
@@ -386,7 +385,11 @@ class MySQLUploader:
386
385
  raise ValueError(error_msg)
387
386
 
388
387
  # 移除非法字符,只保留字母、数字、下划线和美元符号
389
- cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '', identifier)
388
+ cleaned = re.sub(r'[^\w\u4e00-\u9fff$]', '_', identifier)
389
+
390
+ # 将多个连续的下划线替换为单个下划线, 移除开头和结尾的下划线
391
+ cleaned = re.sub(r'_+', '_', cleaned).strip('_')
392
+
390
393
  if not cleaned:
391
394
  error_msg = f"无法清理异常标识符: {identifier}"
392
395
  logger.error(error_msg)
@@ -597,19 +600,26 @@ class MySQLUploader:
597
600
  column_type_lower = column_type.lower()
598
601
 
599
602
  # 处理百分比值
600
- if isinstance(value, str) and '%' in value:
601
- try:
602
- # 移除百分号并转换为小数
603
- percent_value = float(value.strip().replace('%', ''))
604
- decimal_value = percent_value / 100
605
- return decimal_value
606
- except ValueError:
607
- pass # 如果不是有效的百分比数字,继续正常处理
608
-
609
- if 'int' in column_type_lower:
610
- if isinstance(value, (str, bytes)) and not value.strip().isdigit():
611
- raise ValueError("非数字字符串无法转换为整数")
612
- return int(value)
603
+ if isinstance(value, str):
604
+ if value.endswith('%'):
605
+ try:
606
+ # 移除百分号并转换为小数
607
+ percent_value = float(value.strip().replace('%', ''))
608
+ decimal_value = percent_value / 100
609
+ return decimal_value
610
+ except ValueError:
611
+ pass # 如果不是有效的百分比数字,继续正常处理
612
+
613
+ elif 'int' in column_type_lower:
614
+ if isinstance(value, str):
615
+ # 移除可能的逗号和空格
616
+ value = value.replace(',', '').strip()
617
+ # 尝试转换为浮点数再转整数
618
+ try:
619
+ return int(float(value))
620
+ except ValueError:
621
+ raise ValueError(f"`{value}` 无法转为整数")
622
+ return int(value) if value is not None else None
613
623
  elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
614
624
  if isinstance(value, str):
615
625
  # 处理可能包含逗号的数字字符串
@@ -630,12 +640,11 @@ class MySQLUploader:
630
640
  return value.replace('\\', '\\\\').replace("'", "\\'")
631
641
  return str(value)
632
642
  elif 'json' in column_type_lower:
633
- import json
634
643
  return json.dumps(value) if value is not None else None
635
644
  else:
636
645
  return value
637
646
  except (ValueError, TypeError) as e:
638
- error_msg = f"数据类型转换异常 {value} to type {column_type}: {str(e)}"
647
+ error_msg = f"转换异常 -> 无法将 `{value}` 的数据类型转为: `{column_type}` -> {str(e)}"
639
648
  logger.error(error_msg)
640
649
  raise ValueError(error_msg)
641
650
 
@@ -681,7 +690,8 @@ class MySQLUploader:
681
690
  auto_create: bool,
682
691
  date_column: Optional[str],
683
692
  indexes: Optional[List[str]],
684
- batch_id: Optional[str] = None
693
+ batch_id: Optional[str] = None,
694
+ update_on_duplicate: bool = False
685
695
  ):
686
696
  """实际执行表上传的方法"""
687
697
  # 检查表是否存在
@@ -711,7 +721,9 @@ class MySQLUploader:
711
721
  # 插入数据
712
722
  self._insert_data(
713
723
  db_name, table_name, data, set_typ,
714
- check_duplicate, duplicate_columns
724
+ check_duplicate, duplicate_columns,
725
+ batch_id=batch_id,
726
+ update_on_duplicate=update_on_duplicate
715
727
  )
716
728
 
717
729
  def _infer_data_type(self, value: Any) -> str:
@@ -721,12 +733,13 @@ class MySQLUploader:
721
733
  :param value: 要推断的值
722
734
  :return: MySQL数据类型字符串
723
735
  """
724
- if value is None:
736
+ if value is None or str(value).lower() in ['', 'none', 'nan']:
725
737
  return 'VARCHAR(255)' # 默认字符串类型
726
738
 
727
739
  # 检查是否是百分比字符串
728
- if isinstance(value, str) and '%' in value:
729
- return 'DECIMAL(10,4)' # 百分比统一使用DECIMAL(10,4)
740
+ if isinstance(value, str):
741
+ if value.endswith('%'):
742
+ return 'DECIMAL(10,4)' # 百分比统一使用DECIMAL(10,4)
730
743
 
731
744
  if isinstance(value, bool):
732
745
  return 'TINYINT(1)'
@@ -773,6 +786,26 @@ class MySQLUploader:
773
786
  else:
774
787
  return 'VARCHAR(255)'
775
788
 
789
+ def normalize_column_names(self, data: Union[pd.DataFrame, List[Dict[str, Any]]]) -> Union[
790
+ pd.DataFrame, List[Dict[str, Any]]]:
791
+ """
792
+ 1. pandas:规范化列名
793
+ 2. 字典列表:规范化每个字典的键
794
+
795
+ 参数:
796
+ data: 输入数据,支持两种类型:
797
+ - pandas.DataFrame:将规范化其列名
798
+ - List[Dict[str, Any]]:将规范化列表中每个字典的键
799
+ """
800
+ if isinstance(data, pd.DataFrame):
801
+ # 处理DataFrame
802
+ data.columns = [self._validate_identifier(col) for col in data.columns]
803
+ return data
804
+ elif isinstance(data, list):
805
+ # 处理字典列表
806
+ return [{self._validate_identifier(k): v for k, v in item.items()} for item in data]
807
+ return data
808
+
776
809
  def _prepare_data(
777
810
  self,
778
811
  data: Union[Dict, List[Dict], pd.DataFrame],
@@ -807,6 +840,9 @@ class MySQLUploader:
807
840
  logger.error(error_msg)
808
841
  raise ValueError(error_msg)
809
842
 
843
+ # 统一处理原始数据中列名的特殊字符
844
+ data = self.normalize_column_names(data)
845
+
810
846
  # 将set_typ的键转为小写
811
847
  set_typ = {k.lower(): v for k, v in set_typ.items()}
812
848
 
@@ -826,11 +862,11 @@ class MySQLUploader:
826
862
  if sample_values:
827
863
  inferred_type = self._infer_data_type(sample_values[0])
828
864
  filtered_set_typ[col] = inferred_type
829
- logger.debug(f"自动推断列'{col}'的数据类型为: {inferred_type}")
865
+ logger.debug(f"自动推断列 `{col}` 的数据类型为: {inferred_type}")
830
866
  else:
831
867
  # 没有样本值,使用默认类型
832
868
  filtered_set_typ[col] = 'VARCHAR(255)'
833
- logger.debug(f"为列'{col}'使用默认数据类型: VARCHAR(255)")
869
+ logger.debug(f"列 `{col}` 使用默认数据类型: VARCHAR(255)")
834
870
 
835
871
  prepared_data = []
836
872
  for row_idx, row in enumerate(data, 1):
@@ -842,7 +878,7 @@ class MySQLUploader:
842
878
 
843
879
  if col_name not in row:
844
880
  if not allow_null:
845
- error_msg = f"Row {row_idx}: Missing required column '{col_name}' in data"
881
+ error_msg = f"行号:{row_idx} -> 缺失列: `{col_name}`"
846
882
  logger.error(error_msg)
847
883
  raise ValueError(error_msg)
848
884
  prepared_row[col_name] = None
@@ -850,7 +886,7 @@ class MySQLUploader:
850
886
  try:
851
887
  prepared_row[col_name] = self._validate_value(row[col_name], filtered_set_typ[col_name], allow_null)
852
888
  except ValueError as e:
853
- error_msg = f"Row {row_idx}, column '{col_name}': {str(e)}"
889
+ error_msg = f"行号:{row_idx}, 列名:`{col_name}`-> 报错: {str(e)}"
854
890
  logger.error(error_msg)
855
891
  raise ValueError(error_msg)
856
892
  prepared_data.append(prepared_row)
@@ -871,7 +907,8 @@ class MySQLUploader:
871
907
  partition_by: Optional[str] = None,
872
908
  partition_date_column: str = '日期',
873
909
  auto_create: bool = True,
874
- indexes: Optional[List[str]] = None
910
+ indexes: Optional[List[str]] = None,
911
+ update_on_duplicate: bool = False
875
912
  ):
876
913
  """
877
914
  上传数据到数据库的主入口方法
@@ -888,6 +925,7 @@ class MySQLUploader:
888
925
  :param partition_date_column: 用于分表的日期列名,默认为'日期'
889
926
  :param auto_create: 表不存在时是否自动创建,默认为True
890
927
  :param indexes: 需要创建索引的列列表,可选
928
+ :param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
891
929
  :raises: 可能抛出各种验证和数据库相关异常
892
930
  """
893
931
  upload_start = time.time()
@@ -898,10 +936,10 @@ class MySQLUploader:
898
936
 
899
937
  logger.info("开始上传数据", {
900
938
  '批次号': batch_id,
901
- 'database': db_name,
902
- 'table': table_name,
939
+ '': db_name,
940
+ '': table_name,
903
941
  '分表方式': partition_by,
904
- '是否排重': check_duplicate,
942
+ '排重': check_duplicate,
905
943
  '总计行数': len(data) if hasattr(data, '__len__') else 1,
906
944
  '自动建表': auto_create
907
945
  })
@@ -913,10 +951,12 @@ class MySQLUploader:
913
951
  # logger.error(error_msg)
914
952
  # raise ValueError(error_msg)
915
953
 
916
- if partition_by and partition_by not in ['year', 'month']:
917
- error_msg = "分表方式必须是 'year' 或 'month'"
918
- logger.error(error_msg)
919
- raise ValueError(error_msg)
954
+ if partition_by:
955
+ partition_by = str(partition_by).lower()
956
+ if partition_by not in ['year', 'month']:
957
+ error_msg = "分表方式必须是 'year' 或 'month'"
958
+ logger.error(error_msg)
959
+ raise ValueError(error_msg)
920
960
 
921
961
  # 准备数据
922
962
  prepared_data, filtered_set_typ = self._prepare_data(data, set_typ, allow_null)
@@ -962,7 +1002,7 @@ class MySQLUploader:
962
1002
  db_name, part_table, part_data, filtered_set_typ,
963
1003
  primary_keys, check_duplicate, duplicate_columns,
964
1004
  allow_null, auto_create, partition_date_column,
965
- indexes, batch_id
1005
+ indexes, batch_id, update_on_duplicate
966
1006
  )
967
1007
  except Exception as e:
968
1008
  logger.error("分表上传失败", {
@@ -976,7 +1016,7 @@ class MySQLUploader:
976
1016
  db_name, table_name, prepared_data, filtered_set_typ,
977
1017
  primary_keys, check_duplicate, duplicate_columns,
978
1018
  allow_null, auto_create, partition_date_column,
979
- indexes, batch_id
1019
+ indexes, batch_id, update_on_duplicate
980
1020
  )
981
1021
 
982
1022
  success_flag = True
@@ -1004,7 +1044,8 @@ class MySQLUploader:
1004
1044
  check_duplicate: bool = False,
1005
1045
  duplicate_columns: Optional[List[str]] = None,
1006
1046
  batch_size: int = 1000,
1007
- batch_id: Optional[str] = None
1047
+ batch_id: Optional[str] = None,
1048
+ update_on_duplicate: bool = False
1008
1049
  ):
1009
1050
  """
1010
1051
  实际执行数据插入的方法
@@ -1016,6 +1057,7 @@ class MySQLUploader:
1016
1057
  :param check_duplicate: 是否检查重复数据,默认为False
1017
1058
  :param duplicate_columns: 用于检查重复的列,可选
1018
1059
  :param batch_size: 批量插入大小,默认为1000
1060
+ :param update_on_duplicate: 遇到重复数据时是否更新旧数据(默认为False)
1019
1061
  :param batch_id: 批次ID用于日志追踪,可选
1020
1062
  """
1021
1063
  if not data:
@@ -1048,8 +1090,21 @@ class MySQLUploader:
1048
1090
 
1049
1091
  where_clause = " AND ".join(conditions)
1050
1092
 
1051
- sql = f"""
1093
+ if update_on_duplicate:
1094
+ # 更新模式 - 使用ON DUPLICATE KEY UPDATE语法
1095
+ update_clause = ", ".join([f"`{col}` = VALUES(`{col}`)" for col in all_columns])
1096
+ sql = f"""
1052
1097
  INSERT INTO `{db_name}`.`{table_name}`
1098
+ (`{'`,`'.join(safe_columns)}`)
1099
+ VALUES ({placeholders})
1100
+ ON DUPLICATE KEY UPDATE {update_clause}
1101
+ """
1102
+
1103
+ # 注意:在update_on_duplicate模式下,row_values只需要插入数据,不需要排重列值
1104
+ def prepare_values(row):
1105
+ return [row.get(col) for col in all_columns]
1106
+ else:
1107
+ sql = f"""INSERT INTO `{db_name}`.`{table_name}`
1053
1108
  (`{'`,`'.join(safe_columns)}`)
1054
1109
  SELECT {placeholders}
1055
1110
  FROM DUAL
@@ -1058,6 +1113,10 @@ class MySQLUploader:
1058
1113
  WHERE {where_clause}
1059
1114
  )
1060
1115
  """
1116
+
1117
+ # 在check_duplicate模式下,row_values需要插入数据+排重列值
1118
+ def prepare_values(row):
1119
+ return [row.get(col) for col in all_columns] + [row.get(col) for col in duplicate_columns]
1061
1120
  else:
1062
1121
  sql = f"""
1063
1122
  INSERT INTO `{db_name}`.`{table_name}`
@@ -1065,6 +1124,10 @@ class MySQLUploader:
1065
1124
  VALUES ({placeholders})
1066
1125
  """
1067
1126
 
1127
+ # 普通模式下,row_values只需要插入数据
1128
+ def prepare_values(row):
1129
+ return [row.get(col) for col in all_columns]
1130
+
1068
1131
  total_inserted = 0
1069
1132
  total_skipped = 0
1070
1133
  total_failed = 0 # 失败计数器
@@ -1080,11 +1143,7 @@ class MySQLUploader:
1080
1143
  for row in batch:
1081
1144
  try:
1082
1145
  # 准备参数
1083
- row_values = [row.get(col) for col in all_columns]
1084
- # 如果是排重检查,添加排重列值
1085
- if check_duplicate:
1086
- row_values += [row.get(col) for col in duplicate_columns]
1087
-
1146
+ row_values = prepare_values(row)
1088
1147
  cursor.execute(sql, row_values)
1089
1148
  successful_rows += 1
1090
1149
  conn.commit() # 每次成功插入后提交
@@ -1096,13 +1155,13 @@ class MySQLUploader:
1096
1155
  # 记录失败行详细信息
1097
1156
  error_details = {
1098
1157
  '批次号': batch_id,
1099
- 'database': db_name,
1100
- 'table': table_name,
1158
+ '': db_name,
1159
+ '': table_name,
1101
1160
  'error_type': type(e).__name__,
1102
1161
  'error_message': str(e),
1103
- 'column_types': set_typ,
1162
+ '数据类型': set_typ,
1104
1163
  '是否排重': check_duplicate,
1105
- 'duplicate_columns': duplicate_columns
1164
+ '排重列': duplicate_columns
1106
1165
  }
1107
1166
  logger.error(f"单行插入失败: {error_details}")
1108
1167
  continue # 跳过当前行,继续处理下一行
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mdbq
3
- Version: 3.9.9
3
+ Version: 3.9.10
4
4
  Home-page: https://pypi.org/project/mdbq
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -1,5 +1,5 @@
1
1
  mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
2
- mdbq/__version__.py,sha256=Z38j4uvZuqpFYiUEq0FTd82-1Y90RoVwpNEDWVHNTkk,17
2
+ mdbq/__version__.py,sha256=83jYP6xnYylgp029cctX2BP7k_exd-phUiwATgIjhH0,18
3
3
  mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
4
4
  mdbq/aggregation/optimize.py,sha256=2oalzD9weZhDclUC22OLxYa8Zj7KnmsGUoUau_Jlyc4,19796
5
5
  mdbq/aggregation/query_data.py,sha256=5_OzjGR5Sq00q-EgAYmSE5V9i4Solw9y4hkldl4mvt8,179808
@@ -12,7 +12,7 @@ mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
12
12
  mdbq/mysql/deduplicator.py,sha256=brhX3eyE8-kn3nAYweKfBbAkXiNcyw_pL4CTyPqmPBg,21983
13
13
  mdbq/mysql/mysql.py,sha256=jTcizvUtRdwMhWK2i_LA9yDPmcifLjUzVhwTbC3wfJk,119785
14
14
  mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
15
- mdbq/mysql/uploader.py,sha256=mIgUnV7MwIkrbG-dchMkMzWo_N-XrQROLWTGGGuD_ts,49171
15
+ mdbq/mysql/uploader.py,sha256=V23PAzT59lMUqijkUiwV6a1qNwk9T76k8HKxY8fYW9w,52140
16
16
  mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
17
17
  mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
18
18
  mdbq/other/otk.py,sha256=iclBIFbQbhlqzUbcMMoePXBpcP1eZ06ZtjnhcA_EbmE,7241
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
25
25
  mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
26
26
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
27
27
  mdbq/spider/aikucun.py,sha256=OhyEv1VyAKTOHjLDM37iNDQeRg5OnrNoKODoG2VxHes,19806
28
- mdbq-3.9.9.dist-info/METADATA,sha256=F6RAyI8aGmpT-VLwVeY7jw13qemIce-PMH2Ri335GAE,363
29
- mdbq-3.9.9.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
30
- mdbq-3.9.9.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
31
- mdbq-3.9.9.dist-info/RECORD,,
28
+ mdbq-3.9.10.dist-info/METADATA,sha256=Ln51lgeqZn0zAjgLUKXaMNJ5ZXCkX3Eyu0iao37_IQw,364
29
+ mdbq-3.9.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
30
+ mdbq-3.9.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
31
+ mdbq-3.9.10.dist-info/RECORD,,
File without changes