mdbq 4.0.42__py3-none-any.whl → 4.0.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/query_data.py +14 -10
- mdbq/mysql/uploader.py +103 -70
- {mdbq-4.0.42.dist-info → mdbq-4.0.44.dist-info}/METADATA +1 -1
- {mdbq-4.0.42.dist-info → mdbq-4.0.44.dist-info}/RECORD +7 -7
- {mdbq-4.0.42.dist-info → mdbq-4.0.44.dist-info}/WHEEL +0 -0
- {mdbq-4.0.42.dist-info → mdbq-4.0.44.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
VERSION = '4.0.42'
|
1
|
+
VERSION = '4.0.44'
|
mdbq/aggregation/query_data.py
CHANGED
@@ -972,6 +972,7 @@ class MysqlDatasQuery:
|
|
972
972
|
'用户年龄': 'varchar(100)',
|
973
973
|
'人群分类': 'varchar(100)',
|
974
974
|
}
|
975
|
+
df.fillna(0, inplace=True)
|
975
976
|
return df, {
|
976
977
|
'db_name': db_name,
|
977
978
|
'table_name': table_name,
|
@@ -1066,7 +1067,7 @@ class MysqlDatasQuery:
|
|
1066
1067
|
)
|
1067
1068
|
df.insert(loc=1, column='推广渠道', value='万相台无界版') # df中插入新列
|
1068
1069
|
df['是否品牌词'] = df['词名字_词包名字'].str.contains('万里马|wanlima', regex=True)
|
1069
|
-
df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '')
|
1070
|
+
df['是否品牌词'] = df['是否品牌词'].apply(lambda x: '品牌词' if x else '-')
|
1070
1071
|
dir_file = f'\\\\192.168.1.198\\时尚事业部\\01.运营部\\0-电商周报-每周五更新\\分类配置文件.xlsx'
|
1071
1072
|
dir_file2 = '/Volumes/时尚事业部/01.运营部/0-电商周报-每周五更新/分类配置文件.xlsx'
|
1072
1073
|
if not os.path.isfile(dir_file):
|
@@ -1514,8 +1515,6 @@ class MysqlDatasQuery:
|
|
1514
1515
|
projection=projection,
|
1515
1516
|
)
|
1516
1517
|
df['日期'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='ignore') # 转换日期列
|
1517
|
-
df = df[df['日期'] == pd.to_datetime('2024-12-12')]
|
1518
|
-
|
1519
1518
|
df_set['商品id'] = df_set['商品id'].astype('int64')
|
1520
1519
|
df['商品id'] = df['商品id'].astype('int64')
|
1521
1520
|
df_set.sort_values('商品id', ascending=False, ignore_index=True, inplace=True)
|
@@ -1888,11 +1887,15 @@ class MysqlDatasQuery:
|
|
1888
1887
|
end_date=end_date,
|
1889
1888
|
projection=projection,
|
1890
1889
|
)
|
1891
|
-
|
1890
|
+
if 'spu_id' in df.columns:
|
1891
|
+
df = df.drop(columns=['spu_id']) # 删除原有 spu_id,避免冲突
|
1892
1892
|
df = pd.merge(df, df_sku, how='left', left_on='跟单sku_id', right_on='sku_id')
|
1893
|
-
df.
|
1894
|
-
|
1895
|
-
|
1893
|
+
df = df.drop(columns=['sku_id']) # 删除 merge 进来的 sku_id
|
1894
|
+
df['spu_id'] = df['spu_id'].fillna(0) # 填充 spu_id 空值
|
1895
|
+
# 调整 spu_id 到第3列
|
1896
|
+
cols = list(df.columns)
|
1897
|
+
cols.insert(3, cols.pop(cols.index('spu_id')))
|
1898
|
+
df = df[cols]
|
1896
1899
|
set_typ = {
|
1897
1900
|
'日期': 'date',
|
1898
1901
|
'店铺名称': 'varchar(100)',
|
@@ -2049,9 +2052,9 @@ class MysqlDatasQuery:
|
|
2049
2052
|
)
|
2050
2053
|
df = pd.merge(df, df_lin, how='left', left_on='计划id', right_on='计划id')
|
2051
2054
|
df['k_是否品牌词'] = df['关键词'].str.contains('万里马|wanlima', regex=True)
|
2052
|
-
df['k_是否品牌词'] = df['k_是否品牌词'].apply(lambda x: '品牌词' if x else '')
|
2055
|
+
df['k_是否品牌词'] = df['k_是否品牌词'].apply(lambda x: '品牌词' if x else '-')
|
2053
2056
|
df['s_是否品牌词'] = df['搜索词'].str.contains('万里马|wanlima', regex=True)
|
2054
|
-
df['s_是否品牌词'] = df['s_是否品牌词'].apply(lambda x: '品牌词' if x else '')
|
2057
|
+
df['s_是否品牌词'] = df['s_是否品牌词'].apply(lambda x: '品牌词' if x else '-')
|
2055
2058
|
set_typ = {
|
2056
2059
|
'日期': 'date',
|
2057
2060
|
'产品线': 'varchar(100)',
|
@@ -3737,6 +3740,7 @@ def query3(months=1, download_manager=None):
|
|
3737
3740
|
sdq = MysqlDatasQuery(download_manager=download_manager) # 实例化数据处理类
|
3738
3741
|
sdq.months = months # 设置数据周期, 1 表示近 2 个月
|
3739
3742
|
sdq.spph(db_name='聚合数据', table_name='天猫_商品排行')
|
3743
|
+
sdq.item_up(db_name='聚合数据', table_name='淘宝店铺货品')
|
3740
3744
|
|
3741
3745
|
|
3742
3746
|
def main(months=3):
|
@@ -3774,4 +3778,4 @@ if __name__ == '__main__':
|
|
3774
3778
|
)
|
3775
3779
|
sdq = MysqlDatasQuery(download_manager=download_manager)
|
3776
3780
|
sdq.months = 3
|
3777
|
-
sdq.
|
3781
|
+
sdq.item_up(db_name='聚合数据', table_name='淘宝店铺货品')
|
mdbq/mysql/uploader.py
CHANGED
@@ -553,18 +553,112 @@ class MySQLUploader:
|
|
553
553
|
logger.error('无效的日期格式', {'值': value})
|
554
554
|
raise ValueError(f"无效的日期格式: `{value}`")
|
555
555
|
|
556
|
+
def _convert_to_int(self, value):
|
557
|
+
"""
|
558
|
+
尝试将value转换为int
|
559
|
+
"""
|
560
|
+
# 处理numpy/pandas标量
|
561
|
+
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
562
|
+
try:
|
563
|
+
value = value.item()
|
564
|
+
except Exception:
|
565
|
+
pass
|
566
|
+
elif hasattr(value, 'value') and not isinstance(value, str):
|
567
|
+
try:
|
568
|
+
extracted_value = value.value
|
569
|
+
if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').isdigit():
|
570
|
+
value = extracted_value
|
571
|
+
except Exception:
|
572
|
+
pass
|
573
|
+
try:
|
574
|
+
return int(value)
|
575
|
+
except (ValueError, TypeError):
|
576
|
+
try:
|
577
|
+
return int(float(value))
|
578
|
+
except (ValueError, TypeError):
|
579
|
+
raise
|
580
|
+
|
581
|
+
def _convert_to_float(self, value):
|
582
|
+
"""
|
583
|
+
尝试将value转换为float,兼容常见数值类型。
|
584
|
+
"""
|
585
|
+
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
586
|
+
try:
|
587
|
+
value = value.item()
|
588
|
+
except Exception:
|
589
|
+
pass
|
590
|
+
elif hasattr(value, 'value') and not isinstance(value, str):
|
591
|
+
try:
|
592
|
+
extracted_value = value.value
|
593
|
+
if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').replace('e', '').replace('E', '').isdigit():
|
594
|
+
value = extracted_value
|
595
|
+
except Exception:
|
596
|
+
pass
|
597
|
+
return float(value)
|
598
|
+
|
599
|
+
def _convert_to_decimal(self, value):
|
600
|
+
"""
|
601
|
+
尝试将value转换为Decimal,兼容常见数值类型。
|
602
|
+
"""
|
603
|
+
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
604
|
+
try:
|
605
|
+
value = value.item()
|
606
|
+
except Exception:
|
607
|
+
pass
|
608
|
+
elif hasattr(value, 'value') and not isinstance(value, str):
|
609
|
+
try:
|
610
|
+
extracted_value = value.value
|
611
|
+
if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').replace('e', '').replace('E', '').isdigit():
|
612
|
+
value = extracted_value
|
613
|
+
except Exception:
|
614
|
+
pass
|
615
|
+
return Decimal(str(value))
|
616
|
+
|
617
|
+
def _truncate_str(self, str_value, max_len):
|
618
|
+
"""
|
619
|
+
截断字符串到指定字节长度(utf-8)。
|
620
|
+
"""
|
621
|
+
return str_value.encode('utf-8')[:max_len].decode('utf-8', 'ignore')
|
622
|
+
|
556
623
|
def _validate_value(self, value: Any, column_type: str, allow_null: bool, db_name: str = None, table_name: str = None, col_name: str = None) -> Any:
|
557
624
|
"""
|
558
625
|
根据列类型验证并转换数据值
|
559
626
|
"""
|
560
627
|
column_type_lower = column_type.lower() if column_type else ''
|
561
628
|
# 统一判断None/NaN
|
629
|
+
if value == '':
|
630
|
+
if any(t in column_type_lower for t in ['varchar', 'text', 'char', 'mediumtext', 'longtext']):
|
631
|
+
return ""
|
562
632
|
if value == '' or pd.isna(value) or (isinstance(value, (float, Decimal)) and math.isinf(value)):
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
633
|
+
# 兜底填充值映射
|
634
|
+
fallback_map = {
|
635
|
+
'int': 0,
|
636
|
+
'bigint': 0,
|
637
|
+
'tinyint': 0,
|
638
|
+
'smallint': 0,
|
639
|
+
'mediumint': 0,
|
640
|
+
'decimal': 0.0,
|
641
|
+
'float': 0.0,
|
642
|
+
'double': 0.0,
|
643
|
+
'date': '1970-01-01',
|
644
|
+
'datetime': '1970-01-01 00:00:00',
|
645
|
+
'timestamp': '1970-01-01 00:00:00',
|
646
|
+
'json': '{}',
|
647
|
+
'varchar': 'none',
|
648
|
+
'text': 'none',
|
649
|
+
'char': 'none',
|
650
|
+
'mediumtext': 'none',
|
651
|
+
'longtext': 'none',
|
652
|
+
}
|
653
|
+
fallback = 'none'
|
654
|
+
for typ, val in fallback_map.items():
|
655
|
+
if typ in column_type_lower:
|
656
|
+
fallback = val
|
657
|
+
break
|
658
|
+
if not allow_null:
|
659
|
+
logger.warning("该列不允许为空值", {"库": db_name, "表": table_name, "allow_null": allow_null, "列": col_name, "值": value, "兜底值": fallback})
|
567
660
|
raise ValueError("该列不允许为空值")
|
661
|
+
return fallback
|
568
662
|
|
569
663
|
original_value = value
|
570
664
|
|
@@ -576,25 +670,7 @@ class MySQLUploader:
|
|
576
670
|
# 数值类型验证
|
577
671
|
elif 'int' in column_type_lower:
|
578
672
|
try:
|
579
|
-
|
580
|
-
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
581
|
-
# numpy 标量类型
|
582
|
-
try:
|
583
|
-
value = value.item()
|
584
|
-
except (ValueError, TypeError):
|
585
|
-
# 如果不是标量,保持原值
|
586
|
-
pass
|
587
|
-
elif hasattr(value, 'value') and not isinstance(value, str):
|
588
|
-
# pandas 或其他有 value 属性的对象
|
589
|
-
try:
|
590
|
-
extracted_value = value.value
|
591
|
-
# 验证提取的值是数值类型
|
592
|
-
if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').isdigit():
|
593
|
-
value = extracted_value
|
594
|
-
except (ValueError, TypeError, AttributeError):
|
595
|
-
# 如果提取失败,保持原值
|
596
|
-
pass
|
597
|
-
return int(value)
|
673
|
+
return self._convert_to_int(value)
|
598
674
|
except (ValueError, TypeError):
|
599
675
|
logger.error(f"值 `{value}` 无法转换为整数", {"库": db_name, "表": table_name, "列": col_name})
|
600
676
|
raise ValueError(f"值 `{value}` 无法转换为整数")
|
@@ -602,64 +678,23 @@ class MySQLUploader:
|
|
602
678
|
# 百分比字符串处理
|
603
679
|
if isinstance(value, str) and '%' in value:
|
604
680
|
try:
|
605
|
-
# 仅当值是'xx.xx%'格式时才转换
|
606
681
|
if re.match(r'^-?\d+(\.\d+)?%$', value.strip()):
|
607
682
|
value = float(value.strip().replace('%', '')) / 100
|
608
683
|
else:
|
609
|
-
# 不符合格式的百分比字符串,保留原始值
|
610
684
|
logger.warning("百分比字符串不符合格式,跳过转换", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
|
611
685
|
value = original_value
|
612
686
|
except (ValueError, TypeError):
|
613
687
|
logger.warning("百分比字符串转换失败,保留原始值", {"库": db_name, "表": table_name, "列": col_name, "原始": original_value})
|
614
688
|
value = original_value
|
615
|
-
|
616
689
|
try:
|
617
|
-
# 检查是否为Decimal类型
|
618
690
|
if 'decimal' in column_type_lower:
|
619
691
|
precision, scale = self._get_decimal_scale(column_type)
|
620
|
-
|
621
|
-
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
622
|
-
# numpy 标量类型
|
623
|
-
try:
|
624
|
-
value = value.item()
|
625
|
-
except (ValueError, TypeError):
|
626
|
-
# 如果不是标量,保持原值
|
627
|
-
pass
|
628
|
-
elif hasattr(value, 'value') and not isinstance(value, str):
|
629
|
-
# pandas 或其他有 value 属性的对象
|
630
|
-
try:
|
631
|
-
extracted_value = value.value
|
632
|
-
# 验证提取的值是数值类型
|
633
|
-
if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').replace('e', '').replace('E', '').isdigit():
|
634
|
-
value = extracted_value
|
635
|
-
except (ValueError, TypeError, AttributeError):
|
636
|
-
# 如果提取失败,保持原值
|
637
|
-
pass
|
638
|
-
value_decimal = Decimal(str(value))
|
639
|
-
# 检查整数部分长度
|
692
|
+
value_decimal = self._convert_to_decimal(value)
|
640
693
|
if len(value_decimal.as_tuple().digits) - abs(value_decimal.as_tuple().exponent) > precision - scale:
|
641
694
|
raise ValueError(f"整数部分超出范围")
|
642
695
|
return value_decimal
|
643
|
-
else:
|
644
|
-
|
645
|
-
if hasattr(value, 'item') and callable(getattr(value, 'item', None)):
|
646
|
-
# numpy 标量类型
|
647
|
-
try:
|
648
|
-
value = value.item()
|
649
|
-
except (ValueError, TypeError):
|
650
|
-
# 如果不是标量,保持原值
|
651
|
-
pass
|
652
|
-
elif hasattr(value, 'value') and not isinstance(value, str):
|
653
|
-
# pandas 或其他有 value 属性的对象
|
654
|
-
try:
|
655
|
-
extracted_value = value.value
|
656
|
-
# 验证提取的值是数值类型
|
657
|
-
if isinstance(extracted_value, (int, float, str)) and str(extracted_value).replace('.', '').replace('-', '').replace('e', '').replace('E', '').isdigit():
|
658
|
-
value = extracted_value
|
659
|
-
except (ValueError, TypeError, AttributeError):
|
660
|
-
# 如果提取失败,保持原值
|
661
|
-
pass
|
662
|
-
return float(value)
|
696
|
+
else: # float/double
|
697
|
+
return self._convert_to_float(value)
|
663
698
|
except (ValueError, TypeError, InvalidOperation) as e:
|
664
699
|
logger.error(f"值 `{value}` 无法转换为数值类型: {e}", {"库": db_name, "表": table_name, "列": col_name})
|
665
700
|
raise ValueError(f"值 `{value}` 无法转换为数值类型: {e}")
|
@@ -670,12 +705,10 @@ class MySQLUploader:
|
|
670
705
|
max_len = int(re.search(r'\((\d+)\)', column_type).group(1))
|
671
706
|
if len(str_value.encode('utf-8')) > max_len:
|
672
707
|
logger.warning(f"列`{col_name}`的值`{str_value}`长度({len(str_value.encode('utf-8'))})超出varchar({max_len})限制,将进行截断", {"库": db_name, "表": table_name})
|
673
|
-
return
|
708
|
+
return self._truncate_str(str_value, max_len)
|
674
709
|
except (AttributeError, IndexError):
|
675
|
-
# 没有找到长度定义,不截断
|
676
710
|
pass
|
677
711
|
return str_value
|
678
|
-
|
679
712
|
return value
|
680
713
|
|
681
714
|
@_execute_with_retry
|
@@ -1,7 +1,7 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
|
-
mdbq/__version__.py,sha256=
|
2
|
+
mdbq/__version__.py,sha256=Yq_JgKwKONwVexEyE66trDbripXgbesAkvt1eQ___20,18
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/query_data.py,sha256=
|
4
|
+
mdbq/aggregation/query_data.py,sha256=ZWLJghNiEtyA4rvgUqMCLorY0R4-Likd6i4mVMuOni0,170025
|
5
5
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
6
6
|
mdbq/log/mylogger.py,sha256=kPe3wsQNaB1slfX-Z7VMqzZoMoqPfc7ylYXZDBeFzzI,24945
|
7
7
|
mdbq/myconf/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -11,7 +11,7 @@ mdbq/mysql/deduplicator.py,sha256=AB3gL7ZwhcmzGHSu4UY4M6YZVPFZ2wlAN3BCcwAhegQ,73
|
|
11
11
|
mdbq/mysql/mysql.py,sha256=pDg771xBugCMSTWeskIFTi3pFLgaqgyG3smzf-86Wn8,56772
|
12
12
|
mdbq/mysql/s_query.py,sha256=1wJ3HVjHEF6FA-bVeeesRlsf73CZSvVTEQ51CF1OsE4,46786
|
13
13
|
mdbq/mysql/unique_.py,sha256=MaztT-WIyEQUs-OOYY4pFulgHVcXR1BfCy3QUz0XM_U,21127
|
14
|
-
mdbq/mysql/uploader.py,sha256=
|
14
|
+
mdbq/mysql/uploader.py,sha256=LPfYEj7ywoAynY2Nl9gg0IurgIWd_bTwkda2ifD1TeE,86443
|
15
15
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
16
16
|
mdbq/other/download_sku_picture.py,sha256=X66sVdvVgzoNzmgVJyPtd7bjEvctEKtLPblEPF65EWc,46940
|
17
17
|
mdbq/other/error_handler.py,sha256=4p5haAXSY-P78stp4Xwo_MwAngWYqyKj5ogWIuYXMeY,12631
|
@@ -25,7 +25,7 @@ mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
|
|
25
25
|
mdbq/redis/getredis.py,sha256=vpBuNc22uj9Vr-_Dh25_wpwWM1e-072EAAIBdB_IpL0,23494
|
26
26
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
27
27
|
mdbq/spider/aikucun.py,sha256=XptHjGzbout9IYzWAOQUpMMV5qEgLTU8pL1ZGt8oNEA,21868
|
28
|
-
mdbq-4.0.
|
29
|
-
mdbq-4.0.42.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
-
mdbq-4.0.42.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
-
mdbq-4.0.42.dist-info/RECORD,,
|
28
|
+
mdbq-4.0.44.dist-info/METADATA,sha256=6BCrO5mef08KVeODcA3rgk-gOsNtI_8_CTh0mghCKeE,364
|
29
|
+
mdbq-4.0.44.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
30
|
+
mdbq-4.0.44.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
31
|
+
mdbq-4.0.44.dist-info/RECORD,,
|
File without changes
|
File without changes
|