mdbq 4.0.39__py3-none-any.whl → 4.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/uploader.py +65 -101
- {mdbq-4.0.39.dist-info → mdbq-4.0.40.dist-info}/METADATA +1 -1
- {mdbq-4.0.39.dist-info → mdbq-4.0.40.dist-info}/RECORD +6 -6
- {mdbq-4.0.39.dist-info → mdbq-4.0.40.dist-info}/WHEEL +0 -0
- {mdbq-4.0.39.dist-info → mdbq-4.0.40.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '4.0.39'
|
1
|
+
VERSION = '4.0.40'
|
mdbq/mysql/uploader.py
CHANGED
@@ -11,7 +11,6 @@ from mdbq.log import mylogger
|
|
11
11
|
from mdbq.myconf import myconf
|
12
12
|
from typing import Union, List, Dict, Optional, Any, Tuple, Set
|
13
13
|
from dbutils.pooled_db import PooledDB
|
14
|
-
import json
|
15
14
|
import sys
|
16
15
|
from decimal import Decimal, InvalidOperation
|
17
16
|
import math
|
@@ -549,106 +548,70 @@ class MySQLUploader:
|
|
549
548
|
"""
|
550
549
|
column_type_lower = column_type.lower() if column_type else ''
|
551
550
|
# 统一判断None/NaN
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
elif isinstance(value, float) and math.isnan(value):
|
556
|
-
is_nan = True
|
557
|
-
elif str(value).lower() in ['nan', 'none']:
|
558
|
-
is_nan = True
|
559
|
-
elif value == '':
|
560
|
-
is_nan = True
|
561
|
-
if is_nan:
|
562
|
-
if not allow_null:
|
563
|
-
if 'int' in column_type_lower:
|
564
|
-
logger.debug('字段值为None/NaN但不允许空值, 已填充为0', {
|
565
|
-
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
566
|
-
})
|
567
|
-
return 0
|
568
|
-
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
569
|
-
logger.debug('字段值为None/NaN但不允许空值, 已填充为0.0', {
|
570
|
-
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
571
|
-
})
|
572
|
-
return 0.0
|
573
|
-
elif 'date' in column_type_lower or 'time' in column_type_lower:
|
574
|
-
if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
|
575
|
-
default_date = '2000-01-01 00:00:00'
|
576
|
-
else:
|
577
|
-
default_date = '2000-01-01'
|
578
|
-
logger.debug('字段值为None/NaN但不允许空值, 已填充为默认日期', {
|
579
|
-
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type, '默认值': default_date
|
580
|
-
})
|
581
|
-
return default_date
|
582
|
-
else:
|
583
|
-
logger.debug('字段值为None/NaN但不允许空值, 已填充为none字符串', {
|
584
|
-
'库': db_name, '表': table_name, '列': col_name, '字段类型': column_type
|
585
|
-
})
|
586
|
-
return 'none'
|
587
|
-
return None
|
588
|
-
try:
|
589
|
-
if isinstance(value, str) and value.strip().endswith('%'):
|
590
|
-
if re.match(r'^\d+(\.\d+)?%$', value.strip()):
|
591
|
-
percent_str = value.strip().replace('%', '')
|
592
|
-
percent_value = float(percent_str)
|
593
|
-
decimal_value = percent_value / 100
|
594
|
-
logger.debug('百分比字符串转小数', {'原始': value, '结果': decimal_value})
|
595
|
-
return decimal_value
|
596
|
-
else:
|
597
|
-
logger.warning('百分比字符串不符合格式,跳过转换', {
|
598
|
-
'库': db_name, '表': table_name, '列': col_name, '原始': value
|
599
|
-
})
|
600
|
-
elif 'int' in column_type_lower:
|
601
|
-
if isinstance(value, str):
|
602
|
-
value = value.replace(',', '').strip()
|
603
|
-
try:
|
604
|
-
return int(float(value))
|
605
|
-
except ValueError:
|
606
|
-
logger.error('字符串转整数失败', {
|
607
|
-
'库': db_name, '表': table_name, '列': col_name, '值': value
|
608
|
-
})
|
609
|
-
raise ValueError(f"`{value}` -> 无法转为整数")
|
610
|
-
return int(value) if value is not None else None
|
611
|
-
elif any(t in column_type_lower for t in ['float', 'double', 'decimal']):
|
612
|
-
if isinstance(value, str):
|
613
|
-
value = value.replace(',', '')
|
614
|
-
return float(value) if value is not None else None
|
615
|
-
elif 'date' in column_type_lower or 'time' in column_type_lower:
|
616
|
-
if isinstance(value, (datetime.datetime, pd.Timestamp)):
|
617
|
-
return value.strftime('%Y-%m-%d %H:%M:%S')
|
618
|
-
elif isinstance(value, str):
|
619
|
-
try:
|
620
|
-
return self._validate_datetime(value=value, date_type=False, no_log=False)
|
621
|
-
except ValueError as e:
|
622
|
-
logger.error('无效日期格式', {
|
623
|
-
'库': db_name, '表': table_name, '列': col_name, '值': value, '错误': str(e)
|
624
|
-
})
|
625
|
-
raise ValueError(f"无效日期格式: `{value}` -> {str(e)}")
|
626
|
-
return str(value)
|
627
|
-
elif 'varchar' in column_type_lower:
|
628
|
-
if isinstance(value, str):
|
629
|
-
return value.replace('\\', '\\\\').replace("'", "\\'")
|
630
|
-
else:
|
631
|
-
return str(value)
|
632
|
-
elif 'text' in column_type_lower:
|
633
|
-
if isinstance(value, str):
|
634
|
-
max_length = 65535
|
635
|
-
if len(value) > max_length:
|
636
|
-
logger.warning(f'TEXT字符串长度不允许超过 {max_length},已截断', {
|
637
|
-
'库': db_name, '表': table_name, '列': col_name, '原始值': f'{value[:50]}...', '截断后值': f'{value[:50]}...'
|
638
|
-
})
|
639
|
-
value = value[:max_length]
|
640
|
-
return value.replace('\\', '\\\\').replace("'", "\\'")
|
641
|
-
else:
|
642
|
-
return str(value)
|
643
|
-
elif 'json' in column_type_lower:
|
644
|
-
return json.dumps(value) if value is not None else None
|
551
|
+
if value == '' or pd.isna(value) or (isinstance(value, (float, Decimal)) and math.isinf(value)):
|
552
|
+
if allow_null:
|
553
|
+
return None
|
645
554
|
else:
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
555
|
+
logger.error("该列不允许为空值", {"库": db_name, "表": table_name, "列": col_name, "值": value})
|
556
|
+
raise ValueError("该列不允许为空值")
|
557
|
+
|
558
|
+
original_value = value
|
559
|
+
|
560
|
+
# 日期时间类型验证
|
561
|
+
if 'datetime' in column_type_lower or 'timestamp' in column_type_lower:
|
562
|
+
return self._validate_datetime(value, date_type=False, no_log=True)
|
563
|
+
elif 'date' in column_type_lower:
|
564
|
+
return self._validate_datetime(value, date_type=True, no_log=True)
|
565
|
+
# 数值类型验证
|
566
|
+
elif 'int' in column_type_lower:
|
567
|
+
try:
|
568
|
+
return int(value)
|
569
|
+
except (ValueError, TypeError):
|
570
|
+
logger.error(f"值 `{value}` 无法转换为整数", {"库": db_name, "表": table_name, "列": col_name})
|
571
|
+
raise ValueError(f"值 `{value}` 无法转换为整数")
|
572
|
+
elif any(t in column_type_lower for t in ['decimal', 'float', 'double']):
|
573
|
+
# 百分比字符串处理
|
574
|
+
if isinstance(value, str) and '%' in value:
|
575
|
+
try:
|
576
|
+
# 仅当值是'xx.xx%'格式时才转换
|
577
|
+
if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
|
578
|
+
value = float(value.strip().replace('%', '')) / 100
|
579
|
+
else:
|
580
|
+
# 不符合格式的百分比字符串,保留原始值
|
581
|
+
logger.warning("百分比字符串不符合格式,跳过转换", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
|
582
|
+
value = original_value
|
583
|
+
except (ValueError, TypeError):
|
584
|
+
logger.warning("百分比字符串转换失败,保留原始值", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
|
585
|
+
value = original_value
|
586
|
+
|
587
|
+
try:
|
588
|
+
# 检查是否为Decimal类型
|
589
|
+
if 'decimal' in column_type_lower:
|
590
|
+
precision, scale = self._get_decimal_scale(column_type)
|
591
|
+
value_decimal = Decimal(str(value))
|
592
|
+
# 检查整数部分长度
|
593
|
+
if len(value_decimal.as_tuple().digits) - abs(value_decimal.as_tuple().exponent) > precision - scale:
|
594
|
+
raise ValueError(f"整数部分超出范围")
|
595
|
+
return value_decimal
|
596
|
+
else: # float/double
|
597
|
+
return float(value)
|
598
|
+
except (ValueError, TypeError, InvalidOperation) as e:
|
599
|
+
logger.error(f"值 `{value}` 无法转换为数值类型: {e}", {"库": db_name, "表": table_name, "列": col_name})
|
600
|
+
raise ValueError(f"值 `{value}` 无法转换为数值类型: {e}")
|
601
|
+
# 字符串类型验证
|
602
|
+
elif 'varchar' in column_type_lower:
|
603
|
+
str_value = str(value)
|
604
|
+
try:
|
605
|
+
max_len = int(re.search(r'\((\d+)\)', column_type).group(1))
|
606
|
+
if len(str_value.encode('utf-8')) > max_len:
|
607
|
+
logger.warning(f"列`{col_name}`的值`{str_value}`长度({len(str_value.encode('utf-8'))})超出varchar({max_len})限制,将进行截断", {"库": db_name, "表": table_name})
|
608
|
+
return str_value.encode('utf-8')[:max_len].decode('utf-8', 'ignore')
|
609
|
+
except (AttributeError, IndexError):
|
610
|
+
# 没有找到长度定义,不截断
|
611
|
+
pass
|
612
|
+
return str_value
|
613
|
+
|
614
|
+
return value
|
652
615
|
|
653
616
|
@_execute_with_retry
|
654
617
|
def _get_table_columns(self, db_name: str, table_name: str) -> Dict[str, str]:
|
@@ -967,6 +930,8 @@ class MySQLUploader:
|
|
967
930
|
data = self.normalize_column_names(data)
|
968
931
|
|
969
932
|
# set_typ的键清洗
|
933
|
+
if not set_typ:
|
934
|
+
set_typ = {}
|
970
935
|
set_typ = {self._normalize_col(k): v for k, v in set_typ.items()}
|
971
936
|
|
972
937
|
# 新实现:严格按set_typ顺序过滤,后补充data中有但set_typ没有的列
|
@@ -1017,7 +982,6 @@ class MySQLUploader:
|
|
1017
982
|
})
|
1018
983
|
raise ValueError(f"行:{row_idx}, 列:`{col_name}`-> 报错: {str(e)}")
|
1019
984
|
prepared_data.append(prepared_row)
|
1020
|
-
|
1021
985
|
return prepared_data, filtered_set_typ
|
1022
986
|
|
1023
987
|
def upload_data(
|
@@ -1,5 +1,5 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=YnJ4yS3LCjrBMdYJ8VXs6BM72GyNYWV1Zwk0iW2hB1k,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
4
|
mdbq/aggregation/query_data.py,sha256=WtTFMN78jn43Y-nBTPAXhAK56w3wDuv_cj4YtzzGbZk,169797
|
5
5
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
@@ -11,7 +11,7 @@ mdbq/mysql/deduplicator.py,sha256=AB3gL7ZwhcmzGHSu4UY4M6YZVPFZ2wlAN3BCcwAhegQ,73
|
|
11
11
|
mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
|
12
12
|
mdbq/mysql/s_query.py,sha256=1wJ3HVjHEF6FA-bVeeesRlsf73CZSvVTEQ51CF1OsE4,46786
|
13
13
|
mdbq/mysql/unique_.py,sha256=MaztT-WIyEQUs-OOYY4pFulgHVcXR1BfCy3QUz0XM_U,21127
|
14
|
-
mdbq/mysql/uploader.py,sha256=
|
14
|
+
mdbq/mysql/uploader.py,sha256=FOroXUIsxJaMCqBeepUuymCpdhJsid4yiC_Rs1BT1sw,82823
|
15
15
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
16
16
|
mdbq/other/download_sku_picture.py,sha256=X66sVdvVgzoNzmgVJyPtd7bjEvctEKtLPblEPF65EWc,46940
|
17
17
|
mdbq/other/error_handler.py,sha256=4p5haAXSY-P78stp4Xwo_MwAngWYqyKj5ogWIuYXMeY,12631
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=vpBuNc22uj9Vr-_Dh25_wpwWM1e-072EAAIBdB_IpL0,23494
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=XptHjGzbout9IYzWAOQUpMMV5qEgLTU8pL1ZGt8oNEA,21868
|
28
|
-
mdbq-4.0.
|
29
|
-
mdbq-4.0.39.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
-
mdbq-4.0.39.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
-
mdbq-4.0.39.dist-info/RECORD,,
|
28
|
+
mdbq-4.0.40.dist-info/METADATA,sha256=FUd8oXQ4zlMFv9neapfYa24uQl84PHgt3S0aAEbEjGo,364
|
29
|
+
mdbq-4.0.40.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-4.0.40.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-4.0.40.dist-info/RECORD,,
|
File without changes
|
File without changes
|