mdbq 4.0.39__py3-none-any.whl → 4.0.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/uploader.py +80 -126
- {mdbq-4.0.39.dist-info → mdbq-4.0.41.dist-info}/METADATA +1 -1
- {mdbq-4.0.39.dist-info → mdbq-4.0.41.dist-info}/RECORD +6 -6
- {mdbq-4.0.39.dist-info → mdbq-4.0.41.dist-info}/WHEEL +0 -0
- {mdbq-4.0.39.dist-info → mdbq-4.0.41.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '4.0.39'
|
1
|
+
VERSION = '4.0.41'
|
mdbq/mysql/uploader.py
CHANGED
@@ -11,7 +11,6 @@ from mdbq.log import mylogger
|
|
11
11
|
from mdbq.myconf import myconf
|
12
12
|
from typing import Union, List, Dict, Optional, Any, Tuple, Set
|
13
13
|
from dbutils.pooled_db import PooledDB
|
14
|
-
import json
|
15
14
|
import sys
|
16
15
|
from decimal import Decimal, InvalidOperation
|
17
16
|
import math
|
@@ -526,8 +525,7 @@ class MySQLUploader:
|
|
526
525
|
'%Y-%m-%dT%H:%M:%S',
|
527
526
|
'%Y-%m-%d %H:%M:%S.%f',
|
528
527
|
'%Y/%-m/%-d', # 2023/1/8
|
529
|
-
'%Y
|
530
|
-
'%Y-%-m-%-d' # 2023-1-8
|
528
|
+
'%Y-%-m-%-d', # 2023-1-8
|
531
529
|
]
|
532
530
|
for fmt in formats:
|
533
531
|
try:
|
@@ -549,106 +547,70 @@ class MySQLUploader:
|
|
549
547
|
"""
|
550
548
|
column_type_lower = column_type.lower() if column_type else ''
|
551
549
|
# 统一判断None/NaN
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
elif isinstance(value, float) and math.isnan(value):
|
556
|
-
is_nan = True
|
557
|
-
elif str(value).lower() in ['nan', 'none']:
|
558
|
-
is_nan = True
|
559
|
-
elif value == '':
|
560
|
-
is_nan = True
|
561
|
-
if is_nan:
|
562
|
-
if not allow_null:
|
563
|
-
if 'int' in column_type_lower:
|
564
|
-
logger.debug('字段值为None/NaN但不允许空值, 已填充为0', {
|
565
|
-
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
566
|
-
})
|
567
|
-
return 0
|
568
|
-
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
569
|
-
logger.debug('字段值为None/NaN但不允许空值, 已填充为0.0', {
|
570
|
-
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
571
|
-
})
|
572
|
-
return 0.0
|
573
|
-
elif 'date' in column_type_lower or 'time' in column_type_lower:
|
574
|
-
if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
|
575
|
-
default_date = '2000-01-01 00:00:00'
|
576
|
-
else:
|
577
|
-
default_date = '2000-01-01'
|
578
|
-
logger.debug('字段值为None/NaN但不允许空值, 已填充为默认日期', {
|
579
|
-
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type, '默认值': default_date
|
580
|
-
})
|
581
|
-
return default_date
|
582
|
-
else:
|
583
|
-
logger.debug('字段值为None/NaN但不允许空值, 已填充为none字符串', {
|
584
|
-
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
585
|
-
})
|
586
|
-
return 'none'
|
587
|
-
return None
|
588
|
-
try:
|
589
|
-
if isinstance(value, str) and value.strip().endswith('%'):
|
590
|
-
if re.match(r'^\d+(\.\d+)?%$', value.strip()):
|
591
|
-
percent_str = value.strip().replace('%', '')
|
592
|
-
percent_value = float(percent_str)
|
593
|
-
decimal_value = percent_value / 100
|
594
|
-
logger.debug('百分比字符串转小数', {'原始': value, '结果': decimal_value})
|
595
|
-
return decimal_value
|
596
|
-
else:
|
597
|
-
logger.warning('百分比字符串不符合格式,跳过转换', {
|
598
|
-
'库': db_name, '表': table_name, '列': col_name, '原始': value
|
599
|
-
})
|
600
|
-
elif 'int' in column_type_lower:
|
601
|
-
if isinstance(value, str):
|
602
|
-
value = value.replace(',', '').strip()
|
603
|
-
try:
|
604
|
-
return int(float(value))
|
605
|
-
except ValueError:
|
606
|
-
logger.error('字符串转整数失败', {
|
607
|
-
'库': db_name, '表': table_name, '列': col_name, '值': value
|
608
|
-
})
|
609
|
-
raise ValueError(f"`{value}` -> 无法转为整数")
|
610
|
-
return int(value) if value is not None else None
|
611
|
-
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
612
|
-
if isinstance(value, str):
|
613
|
-
value = value.replace(',', '')
|
614
|
-
return float(value) if value is not None else None
|
615
|
-
elif 'date' in column_type_lower or 'time' in column_type_lower:
|
616
|
-
if isinstance(value, (datetime.datetime, pd.Timestamp)):
|
617
|
-
return value.strftime('%Y-%m-%d %H:%M:%S')
|
618
|
-
elif isinstance(value, str):
|
619
|
-
try:
|
620
|
-
return self._validate_datetime(value=value, date_type=False, no_log=False)
|
621
|
-
except ValueError as e:
|
622
|
-
logger.error('无效日期格式', {
|
623
|
-
'库': db_name, '表': table_name, '列': col_name, '值': value, '错误': str(e)
|
624
|
-
})
|
625
|
-
raise ValueError(f"无效日期格式: `{value}` -> {str(e)}")
|
626
|
-
return str(value)
|
627
|
-
elif 'varchar' in column_type_lower:
|
628
|
-
if isinstance(value, str):
|
629
|
-
return value.replace('\\', '\\\\').replace("'", "\\'")
|
630
|
-
else:
|
631
|
-
return str(value)
|
632
|
-
elif 'text' in column_type_lower:
|
633
|
-
if isinstance(value, str):
|
634
|
-
max_length = 65535
|
635
|
-
if len(value) > max_length:
|
636
|
-
logger.warning(f'TEXT字符串长度不允许超过 {max_length},已截断', {
|
637
|
-
'库': db_name, '表': table_name, '列': col_name, '原始值': f'{value[:50]}...', '截断后值': f'{value[:50]}...'
|
638
|
-
})
|
639
|
-
value = value[:max_length]
|
640
|
-
return value.replace('\\', '\\\\').replace("'", "\\'")
|
641
|
-
else:
|
642
|
-
return str(value)
|
643
|
-
elif 'json' in column_type_lower:
|
644
|
-
return json.dumps(value) if value is not None else None
|
550
|
+
if value == '' or pd.isna(value) or (isinstance(value, (float, Decimal)) and math.isinf(value)):
|
551
|
+
if allow_null:
|
552
|
+
return None
|
645
553
|
else:
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
554
|
+
logger.error("该列不允许为空值", {"库": db_name, "表": table_name, "列": col_name, "值": value})
|
555
|
+
raise ValueError("该列不允许为空值")
|
556
|
+
|
557
|
+
original_value = value
|
558
|
+
|
559
|
+
# 日期时间类型验证
|
560
|
+
if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
|
561
|
+
return self._validate_datetime(value, date_type=False, no_log=True)
|
562
|
+
elif 'date' in column_type_lower:
|
563
|
+
return self._validate_datetime(value, date_type=True, no_log=True)
|
564
|
+
# 数值类型验证
|
565
|
+
elif 'int' in column_type_lower:
|
566
|
+
try:
|
567
|
+
return int(value)
|
568
|
+
except (ValueError, TypeError):
|
569
|
+
logger.error(f"值 `{value}` 无法转换为整数", {"库": db_name, "表": table_name, "列": col_name})
|
570
|
+
raise ValueError(f"值 `{value}` 无法转换为整数")
|
571
|
+
elif any(t in column_type_lower for t in ['decimal', 'float', 'double']):
|
572
|
+
# 百分比字符串处理
|
573
|
+
if isinstance(value, str) and '%' in value:
|
574
|
+
try:
|
575
|
+
# 仅当值是'xx.xx%'格式时才转换
|
576
|
+
if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
|
577
|
+
value = float(value.strip().replace('%', '')) / 100
|
578
|
+
else:
|
579
|
+
# 不符合格式的百分比字符串,保留原始值
|
580
|
+
logger.warning("百分比字符串不符合格式,跳过转换", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
|
581
|
+
value = original_value
|
582
|
+
except (ValueError, TypeError):
|
583
|
+
logger.warning("百分比字符串转换失败,保留原始值", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
|
584
|
+
value = original_value
|
585
|
+
|
586
|
+
try:
|
587
|
+
# 检查是否为Decimal类型
|
588
|
+
if 'decimal' in column_type_lower:
|
589
|
+
precision, scale = self._get_decimal_scale(column_type)
|
590
|
+
value_decimal = Decimal(str(value))
|
591
|
+
# 检查整数部分长度
|
592
|
+
if len(value_decimal.as_tuple().digits) - abs(value_decimal.as_tuple().exponent) > precision - scale:
|
593
|
+
raise ValueError(f"整数部分超出范围")
|
594
|
+
return value_decimal
|
595
|
+
else: # float/double
|
596
|
+
return float(value)
|
597
|
+
except (ValueError, TypeError, InvalidOperation) as e:
|
598
|
+
logger.error(f"值 `{value}` 无法转换为数值类型: {e}", {"库": db_name, "表": table_name, "列": col_name})
|
599
|
+
raise ValueError(f"值 `{value}` 无法转换为数值类型: {e}")
|
600
|
+
# 字符串类型验证
|
601
|
+
elif 'varchar' in column_type_lower:
|
602
|
+
str_value = str(value)
|
603
|
+
try:
|
604
|
+
max_len = int(re.search(r'\((\d+)\)', column_type).group(1))
|
605
|
+
if len(str_value.encode('utf-8')) > max_len:
|
606
|
+
logger.warning(f"列`{col_name}`的值`{str_value}`长度({len(str_value.encode('utf-8'))})超出varchar({max_len})限制,将进行截断", {"库": db_name, "表": table_name})
|
607
|
+
return str_value.encode('utf-8')[:max_len].decode('utf-8', 'ignore')
|
608
|
+
except (AttributeError, IndexError):
|
609
|
+
# 没有找到长度定义,不截断
|
610
|
+
pass
|
611
|
+
return str_value
|
612
|
+
|
613
|
+
return value
|
652
614
|
|
653
615
|
@_execute_with_retry
|
654
616
|
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
|
@@ -853,8 +815,11 @@ class MySQLUploader:
|
|
853
815
|
|
854
816
|
# 检查是否是百分比字符串
|
855
817
|
if isinstance(value, str):
|
856
|
-
if
|
857
|
-
|
818
|
+
if '%' in value:
|
819
|
+
if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
|
820
|
+
return 'DECIMAL(10, 4)' # 百分比转为小数,使用DECIMAL
|
821
|
+
else:
|
822
|
+
return 'VARCHAR(255)' # 不符合格式的百分比,视为字符串
|
858
823
|
|
859
824
|
if isinstance(value, bool):
|
860
825
|
return 'TINYINT(1)'
|
@@ -898,8 +863,8 @@ class MySQLUploader:
|
|
898
863
|
return 'MEDIUMTEXT'
|
899
864
|
else:
|
900
865
|
return 'LONGTEXT'
|
901
|
-
|
902
|
-
|
866
|
+
|
867
|
+
return 'VARCHAR(255)' # 默认字符串类型
|
903
868
|
|
904
869
|
def normalize_column_names(self, data: Union[pd.DataFrame, List[Dict[str, Any]]]) -> Union[
|
905
870
|
pd.DataFrame, List[Dict[str, Any]]]:
|
@@ -967,6 +932,8 @@ class MySQLUploader:
|
|
967
932
|
data = self.normalize_column_names(data)
|
968
933
|
|
969
934
|
# set_typ的键清洗
|
935
|
+
if not set_typ:
|
936
|
+
set_typ = {}
|
970
937
|
set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
|
971
938
|
|
972
939
|
# 新实现:严格按set_typ顺序过滤,后补充data中有但set_typ没有的列
|
@@ -1017,7 +984,6 @@ class MySQLUploader:
|
|
1017
984
|
})
|
1018
985
|
raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
|
1019
986
|
prepared_data.append(prepared_row)
|
1020
|
-
|
1021
987
|
return prepared_data, filtered_set_typ
|
1022
988
|
|
1023
989
|
def upload_data(
|
@@ -1353,7 +1319,7 @@ class MySQLUploader:
|
|
1353
1319
|
for col in dup_cols:
|
1354
1320
|
col_type = set_typ.get(col, '').lower()
|
1355
1321
|
if col_type.startswith('decimal'):
|
1356
|
-
scale = self._get_decimal_scale(col_type)
|
1322
|
+
_, scale = self._get_decimal_scale(col_type)
|
1357
1323
|
conditions.append(f"ROUND(`{col}`, {scale}) = ROUND(%s, {scale})")
|
1358
1324
|
else:
|
1359
1325
|
conditions.append(f"`{col}` = %s")
|
@@ -1382,24 +1348,12 @@ class MySQLUploader:
|
|
1382
1348
|
"""
|
1383
1349
|
return sql
|
1384
1350
|
|
1385
|
-
def _get_decimal_scale(self, decimal_type: str) -> int:
|
1386
|
-
"""
|
1387
|
-
|
1388
|
-
|
1389
|
-
|
1390
|
-
|
1391
|
-
:raises: 无显式抛出异常,但解析失败时返回默认值2
|
1392
|
-
"""
|
1393
|
-
try:
|
1394
|
-
# 匹配DECIMAL类型中的精度和小数位数
|
1395
|
-
match = re.match(r'decimal\((\d+),\s*(\d+)\)', decimal_type.lower())
|
1396
|
-
if match:
|
1397
|
-
return int(match.group(2))
|
1398
|
-
except (ValueError, AttributeError, IndexError):
|
1399
|
-
pass
|
1400
|
-
|
1401
|
-
# 默认返回2位小数
|
1402
|
-
return 2
|
1351
|
+
def _get_decimal_scale(self, decimal_type: str) -> Tuple[int, int]:
|
1352
|
+
"""从DECIMAL类型字符串中提取精度和标度"""
|
1353
|
+
match = re.search(r'\((\d+)\s*,\s*(\d+)\)', decimal_type)
|
1354
|
+
if match:
|
1355
|
+
return int(match.group(1)), int(match.group(2))
|
1356
|
+
return 18, 2 # 默认值
|
1403
1357
|
|
1404
1358
|
def _prepare_insert_sql(
|
1405
1359
|
self,
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=X4VgvOxMoRU-YHIymW2NhCV8yy6pvwj9wlFGdVP8Di8,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/query_data.py,sha256=WtTFMN78jn43Y-nBTPAXhAK56w3wDuv_cj4YtzzGbZk,169797
|
5
5
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
@@ -11,7 +11,7 @@ mdbq/mysql/deduplicator.py,sha256=AB3gL7ZwhcmzGHSu4UY4M6YZVPFZ2wlAN3BCcwAhegQ,73
|
|
11
11
|
mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
|
12
12
|
mdbq/mysql/s_query.py,sha256=1wJ3HVjHEF6FA-bVeeesRlsf73CZSvVTEQ51CF1OsE4,46786
|
13
13
|
mdbq/mysql/unique_.py,sha256=MaztT-WIyEQUs-OOYY4pFulgHVcXR1BfCy3QUz0XM_U,21127
|
14
|
-
mdbq/mysql/uploader.py,sha256=
|
14
|
+
mdbq/mysql/uploader.py,sha256=PQKFohU32hRyUW3sAgWigbLnBh4h9ZydF4RNC3wNAyo,82640
|
15
15
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
16
16
|
mdbq/other/download_sku_picture.py,sha256=X66sVdvVgzoNzmgVJyPtd7bjEvctEKtLPblEPF65EWc,46940
|
17
17
|
mdbq/other/error_handler.py,sha256=4p5haAXSY-P78stp4Xwo_MwAngWYqyKj5ogWIuYXMeY,12631
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=vpBuNc22uj9Vr-_Dh25_wpwWM1e-072EAAIBdB_IpL0,23494
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=XptHjGzbout9IYzWAOQUpMMV5qEgLTU8pL1ZGt8oNEA,21868
|
28
|
-
mdbq-4.0.
|
29
|
-
mdbq-4.0.39.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
-
mdbq-4.0.39.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
-
mdbq-4.0.39.dist-info/RECORD,,
|
28
|
+
mdbq-4.0.41.dist-info/METADATA,sha256=FDyMAo_9iwFqvELSDiPqPM6GhkNj0htFROeZiZYNMcU,364
|
29
|
+
mdbq-4.0.41.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-4.0.41.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-4.0.41.dist-info/RECORD,,
|
File without changes
|
File without changes
|