mdbq 3.8.9__py3-none-any.whl → 3.8.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/mysql/mysql.py +11 -134
- mdbq/mysql/s_query.py +0 -4
- mdbq/other/otk.py +72 -2
- mdbq/spider/aikucun.py +2 -13
- {mdbq-3.8.9.dist-info → mdbq-3.8.10.dist-info}/METADATA +1 -1
- {mdbq-3.8.9.dist-info → mdbq-3.8.10.dist-info}/RECORD +9 -18
- mdbq/bdup/__init__.py +0 -5
- mdbq/bdup/bdup.py +0 -111
- mdbq/config/set_support.py +0 -20
- mdbq/dataframe/__init__.py +0 -4
- mdbq/dataframe/converter.py +0 -107
- mdbq/log/mylogger.py +0 -66
- mdbq/mongo/__init__.py +0 -4
- mdbq/mysql/year_month_day.py +0 -38
- mdbq/other/porxy.py +0 -115
- {mdbq-3.8.9.dist-info → mdbq-3.8.10.dist-info}/WHEEL +0 -0
- {mdbq-3.8.9.dist-info → mdbq-3.8.10.dist-info}/top_level.txt +0 -0
mdbq/__version__.py
CHANGED
@@ -1 +1 @@
-VERSION = '3.8.9'
+VERSION = '3.8.10'
mdbq/mysql/mysql.py
CHANGED
@@ -1,6 +1,5 @@
 # -*- coding:utf-8 -*-
 import datetime
-import platform
 import re
 import time
 from functools import wraps
@@ -10,8 +9,8 @@ import numpy as np
 import pandas as pd
 from sqlalchemy import create_engine
 import os
-import calendar
 import logging
+from mdbq.other import otk

 warnings.filterwarnings('ignore')
 """
@@ -21,44 +20,6 @@ warnings.filterwarnings('ignore')
 logger = logging.getLogger(__name__)


-def is_valid_date(date_string):
-    """
-    判断是否是日期格式, 且允许没有前导零, 且允许带时间
-    纯日期格式: 返回 1
-    日期+时间: 返回 2
-    """
-    date_pattern = r"^(\d{4})-(0?[1-9]|1[0-2])-(0?[1-9]|[12]\d|3[01])$"
-    match = re.match(date_pattern, str(date_string))  # 判断纯日期格式:2024-11-09
-    if match is None:
-        date_pattern = r".*\d+:\d+:\d+$"
-        match = re.match(date_pattern, date_string)  # 判断日期+时间:2024-11-09 00:36:45
-        if match is not None:
-            return 2
-    else:
-        return 1
-
-
-def is_integer(int_str):
-    """ 判断是否整数, 允许包含千分位分隔符, 允许科学计数法 """
-    # 如果是科学计数法
-    match = re.findall(r'^[-+]?(\d+)\.(\d+)[eE][-+]?(\d+)$', str(int_str))
-    if match:
-        if len(match[0]) == 3:
-            if int(match[0][0]) == 0:  # 0 开头
-                if int(match[0][2]) > 10:  # 转换后整数长度超过 10 位
-                    return False
-            else:  # 不是 0 开头
-                if len(match[0][0]) + int(match[0][2]) > 10:  # 转换后整数长度超过 10 位
-                    return False
-        if int(match[0][2]) >= len(match[0][1]):
-            return True
-        else:
-            return False
-    # 如果是普通数字, 且允许千分符
-    __pattern = r'^[-+]?\d{1,3}(,\d{3}){0,3}$|^[-+]?\d{1,9}$'
-    return re.match(__pattern, str(int_str)) is not None
-
-
 def count_decimal_places(num_str):
     """ 计算小数位数, 允许科学计数法 """
     match = re.match(r'^[-+]?\d+(\.\d+)?([eE][-+]?\d+)?$', str(num_str))
@@ -144,8 +105,8 @@ class MysqlUpload:
            result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
            result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)

-            date_type = is_valid_date(v)  # 判断日期时间
-            int_num = is_integer(v)  # 判断整数
+            date_type = otk.is_valid_date(v)  # 判断日期时间
+            int_num = otk.is_integer(v)  # 判断整数
            count_int, count_float = count_decimal_places(v)  # 判断小数,返回小数位数
            if result1:  # 京东sku/spu商品信息
                __res_dict.update({k: 'varchar(100)'})
@@ -204,13 +165,7 @@ class MysqlUpload:
                database_exists = cursor.fetchone()
                if not database_exists:
                    # 如果数据库不存在,则新建
-
-                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
-                    self.config.update({'charset': 'utf8mb4_unicode_ci'})
-                    if '192.168.1.100' in str(self.host):
-                        sql = f"CREATE DATABASE `{db_name}`"
-                    else:
-                        sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
+                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
                    cursor.execute(sql)
                    connection.commit()
                    logger.info(f"创建Database: {db_name}")
@@ -369,13 +324,8 @@ class MysqlUpload:
                database_exists = cursor.fetchone()
                if not database_exists:
                    # 如果数据库不存在,则新建
-
-
-                    self.config.update({'charset': 'utf8mb4_unicode_ci'})
-                    if '192.168.1.100' in str(self.host):
-                        sql = f"CREATE DATABASE `{db_name}`"
-                    else:
-                        sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
+
+                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
                    cursor.execute(sql)
                    connection.commit()
                    logger.info(f"创建Database: {db_name}")
@@ -580,13 +530,7 @@ class MysqlUpload:
                database_exists = cursor.fetchone()
                if not database_exists:
                    # 如果数据库不存在,则新建
-
-                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
-                    self.config.update({'charset': 'utf8mb4_unicode_ci'})
-                    if '192.168.1.100' in str(self.host):
-                        sql = f"CREATE DATABASE `{db_name}`"
-                    else:
-                        sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
+                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
                    cursor.execute(sql)
                    connection.commit()
                    logger.info(f"创建Database: {db_name}")
@@ -770,8 +714,8 @@ class MysqlUpload:
            result3 = re.findall(r'同比$|环比$', k, re.IGNORECASE)
            result4 = re.findall(r'花费$|消耗$|金额$', k, re.IGNORECASE)

-            date_type = is_valid_date(v)  # 判断日期时间
-            int_num = is_integer(v)  # 判断整数
+            date_type = otk.is_valid_date(v)  # 判断日期时间
+            int_num = otk.is_integer(v)  # 判断整数
            count_int, count_float = count_decimal_places(v)  # 判断小数,返回小数位数
            if result1:  # 京东sku/spu商品信息
                __res_dict.update({k: 'varchar(100)'})
@@ -806,44 +750,9 @@ class MysqlUpload:
            new_dict_data.update({k: v})
        return __res_dict, new_dict_data

-    def cover_df(self, df):
-        """ 清理 df 的值和列名 """
-        df.replace([np.inf, -np.inf], '0', inplace=True)  # 清理一些非法值
-        # df.replace(to_replace=['\\N', '-', '--', '', 'nan', 'NAN'], value='0', regex=False, inplace=True)  # 替换掉特殊字符
-        df.replace(to_replace=['\\N', '', 'nan', 'NAN'], value='0', regex=False, inplace=True)  # 替换掉特殊字符
-        # df.replace(to_replace=[','], value='', regex=True, inplace=True)
-        df.replace(to_replace=['="'], value='', regex=True, inplace=True)  # ="和"不可以放在一起清洗, 因为有: id=86785565
-        df.replace(to_replace=['"'], value='', regex=True, inplace=True)
-        cols = df.columns.tolist()
-        for col in cols:
-            if col == 'id':
-                df.pop('id')
-                continue
-            df[col] = df[col].apply(lambda x: float(re.sub(r'%$', '', str(x))) / 100 if (
-                str(x) != '' and str(x).endswith('%')) and not re.findall('[\\u4e00-\\u9fa5]', str(x)) else '0.0' if str(x) == '0%' else x)
-            try:
-                # 不能直接使用 int() ,对于大数,可能转为uint64,导致数据库入库可能异常
-                df[col] = df[col].apply(
-                    lambda x: np.int64(str(x)) if '_' not in str(x) and '.' not in str(x) else x)  # 不含小数点尝试转整数
-            except:
-                pass
-            try:
-                if df[col].dtype == 'object':  # 有些列没有被 pandas 识别数据类型,会没有 dtype 属性
-                    df[col] = df[col].apply(lambda x: float(x) if '.' in str(x) and '_' not in str(x) else x)
-            except:
-                pass
-            new_col = col.lower()
-            new_col = re.sub(r'[()\-,,&~^、 ()\"\'“”=·/。》《><!!`]', '_', new_col, re.IGNORECASE)
-            new_col = new_col.replace(')', '')
-            new_col = re.sub(r'_{2,}', '_', new_col)
-            new_col = re.sub(r'_+$', '', new_col)
-            df.rename(columns={col: new_col}, inplace=True)
-        df.fillna(0, inplace=True)
-        return df
-
    def convert_df_dtypes(self, df: pd.DataFrame):
        """ 清理 df 的值和列名,并转换数据类型 """
-        df =
+        df = otk.cover_df(df=df)  # 清理 df 的值和列名
        [pd.to_numeric(df[col], errors='ignore') for col in df.columns.tolist()]
        dtypes = df.dtypes.to_dict()
        __res_dict = {}
@@ -949,13 +858,7 @@ class MysqlUpload:
                database_exists = cursor.fetchone()
                if not database_exists:
                    # 如果数据库不存在,则新建
-
-                    # sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
-                    # self.config.update({'charset': 'utf8mb4_unicode_ci'})
-                    if '192.168.1.100' in str(self.host):
-                        sql = f"CREATE DATABASE `{db_name}`"
-                    else:
-                        sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
+                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
                    cursor.execute(sql)
                    connection.commit()
                    logger.info(f"创建Database: {db_name}")
@@ -1705,31 +1608,5 @@ class OptimizeDatas:
        self.connection.close()


-def year_month_day_bak(start_date, end_date):
-    """
-    使用date_range函数和DataFrame来获取从start_date至end_date之间的所有年月日
-    calendar.monthrange: 获取当月第一个工作日的星期值(0,6) 以及当月天数
-    """
-    # 替换年月日中的日, 以便即使传入当月日期也有返回值
-    try:
-        start_date = f'{pd.to_datetime(start_date).year}-{pd.to_datetime(start_date).month}-01'
-    except Exception as e:
-        logger.error(e)
-        return []
-    # 使用pandas的date_range创建一个日期范围,频率为'MS'代表每月开始
-    date_range = pd.date_range(start=start_date, end=end_date, freq='MS')
-    # 转换格式
-    year_months = date_range.strftime('%Y-%m').drop_duplicates().sort_values()
-
-    results = []
-    for year_month in year_months:
-        year = re.findall(r'(\d{4})', year_month)[0]
-        month = re.findall(r'\d{4}-(\d{2})', year_month)[0]
-        s, d = calendar.monthrange(int(year), int(month))
-        results.append({'起始日期': f'{year_month}-01', '结束日期': f'{year_month}-{d}'})
-
-    return results  # start_date至end_date之间的所有年月日
-
-
 if __name__ == '__main__':
     pass
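The net effect of the hunks above is a refactor, not a behaviour change: is_valid_date, is_integer and cover_df were removed from mysql.py and are now imported from mdbq.other.otk (added in the otk.py diff below), and the host-specific CREATE DATABASE branches collapse to a single utf8mb4_0900_ai_ci statement. A minimal sketch of the new call sites, assuming only the package layout shown in this diff; the expected results follow from the docstrings and regexes quoted above, the sample values are illustrative:

from mdbq.other import otk

date_type = otk.is_valid_date('2024-11-09 00:36:45')  # returns 2 for date+time, 1 for a plain date
int_num = otk.is_integer('1,234,567')                 # True, thousands separators are accepted
# inside MysqlUpload.convert_df_dtypes the cleanup is now delegated:
# df = otk.cover_df(df=df)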
mdbq/mysql/s_query.py
CHANGED
@@ -1,16 +1,12 @@
 # -*- coding:utf-8 -*-
 import datetime
-import platform
 import re
 import time
-from functools import wraps
 import warnings
 import pymysql
 import numpy as np
 import pandas as pd
-from sqlalchemy import create_engine
 import os
-import calendar
 from decimal import Decimal
 import logging

mdbq/other/otk.py
CHANGED
@@ -38,7 +38,7 @@ def dates_between(start_date, end_date, fm=None) -> list:
     fm: 日期输出格式
     """
     if not fm:
-        fm ='%Y
+        fm ='%Y-%m-%d'
     start_date = pd.to_datetime(start_date)
     end_date = pd.to_datetime(end_date)
     dates = []
@@ -57,9 +57,37 @@ def cover_df(df):
     df.replace(to_replace=['"'], value='', regex=True, inplace=True)
     cols = df.columns.tolist()
     for col in cols:
+        if col == 'id':
+            df.pop('id')
+            continue
+        # df[col] = df[col].apply(
+        #     lambda x: float(float((str(x).rstrip("%"))) / 100) if re.findall(r'^\d+\.?\d*%$', str(x)) else x)
+        # df[col] = df[col].apply(lambda x:
+        #     float(re.sub(r'%$', '', str(x))) / 100
+        #     if (str(x) != '' and str(x).endswith('%')) and not re.findall(
+        #         '[\\u4e00-\\u9fa5]', str(x)) else '0.0' if str(x) == '0%' else x)
         df[col] = df[col].apply(
-            lambda x: float(
+            lambda x: float(str(x).rstrip("%")) / 100
+            if (
+                re.fullmatch(r'^\d+\.?\d*%$', str(x))  # 匹配数字加%格式
+                and not re.search(r'[\u4e00-\u9fa5]', str(x))  # 排除含中文的情况
+            )
+            else (
+                '0.0' if str(x) == '0%' else x  # 处理 "0%"
+            )
+        )

+        try:
+            # 不能直接使用 int() ,对于大数,可能转为uint64,导致数据库入库可能异常
+            df[col] = df[col].apply(
+                lambda x: np.int64(str(x)) if '_' not in str(x) and '.' not in str(x) else x)  # 不含小数点尝试转整数
+        except:
+            pass
+        try:
+            if df[col].dtype == 'object':  # 有些列没有被 pandas 识别数据类型,会没有 dtype 属性
+                df[col] = df[col].apply(lambda x: float(x) if '.' in str(x) and '_' not in str(x) else x)
+        except:
+            pass
         new_col = col.lower()
         new_col = re.sub(r'[()\-,,&~^、 ()\"\'“”=·/。》《><!!`]', '_', new_col, re.IGNORECASE)
         new_col = new_col.replace(')', '')
@@ -77,5 +105,47 @@ def translate_keys(original_dict:dict, translation_dict:dict) -> dict:
     return {translation_dict.get(k, k): v for k, v in original_dict.items()}


+def is_valid_date(date_string):
+    """
+    mysql调用
+    判断是否是日期格式, 且允许没有前导零, 且允许带时间
+    纯日期格式: 返回 1
+    日期+时间: 返回 2
+    """
+    date_pattern = r"^(\d{4})-(0?[1-9]|1[0-2])-(0?[1-9]|[12]\d|3[01])$"
+    match = re.match(date_pattern, str(date_string))  # 判断纯日期格式:2024-11-09
+    if match is None:
+        date_pattern = r".*\d+:\d+:\d+$"
+        match = re.match(date_pattern, date_string)  # 判断日期+时间:2024-11-09 00:36:45
+        if match is not None:
+            return 2
+    else:
+        return 1
+
+
+def is_integer(int_str):
+    """
+    mysql调用
+    判断是否整数, 允许包含千分位分隔符, 允许科学计数法
+    """
+    # 如果是科学计数法
+    match = re.findall(r'^[-+]?(\d+)\.(\d+)[eE][-+]?(\d+)$', str(int_str))
+    if match:
+        if len(match[0]) == 3:
+            if int(match[0][0]) == 0:  # 0 开头
+                if int(match[0][2]) > 10:  # 转换后整数长度超过 10 位
+                    return False
+            else:  # 不是 0 开头
+                if len(match[0][0]) + int(match[0][2]) > 10:  # 转换后整数长度超过 10 位
+                    return False
+        if int(match[0][2]) >= len(match[0][1]):
+            return True
+        else:
+            return False
+    # 如果是普通数字, 且允许千分符
+    __pattern = r'^[-+]?\d{1,3}(,\d{3}){0,3}$|^[-+]?\d{1,9}$'
+    return re.match(__pattern, str(int_str)) is not None
+
+
 if __name__ == '__main__':
     pass
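The two user-visible fixes in otk.py are the default output format of dates_between and the id-column and percent-string handling that cover_df absorbed from mysql.py. A small usage sketch, assuming the function bodies shown in this diff and that dates_between formats each day with the fm parameter, as the copy removed from aikucun.py did; the sample DataFrame and values are illustrative only:

import pandas as pd
from mdbq.other import otk

otk.dates_between('2025-05-30', '2025-06-02')
# expected: ['2025-05-30', '2025-05-31', '2025-06-01', '2025-06-02'] with the new '%Y-%m-%d' default

df = pd.DataFrame({'id': [1], '转化率': ['1540%']})
df = otk.cover_df(df=df)  # drops the 'id' column and converts '1540%' to 15.4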
mdbq/spider/aikucun.py
CHANGED
@@ -21,6 +21,7 @@ from mdbq.mysql import mysql
 from mdbq.mysql import s_query
 from mdbq.config import config
 from mdbq.other import ua_sj
+from mdbq.other import otk

 dir_path = os.path.expanduser("~")
 config_file = os.path.join(dir_path, 'spd.txt')
@@ -47,18 +48,6 @@ def keep_connect(_db_name, _config, max_try: int=10):
     return None


-def dates_between(start_date, end_date) -> list:
-    """ 获取两个日期之间的所有日期, 返回 list """
-    start_date = pd.to_datetime(start_date)
-    end_date = pd.to_datetime(end_date)
-    dates = []
-    current_date = start_date
-    while current_date <= end_date:
-        dates.append(current_date.strftime('%Y-%m-%d'))
-        current_date += datetime.timedelta(days=1)
-    return dates
-
-
 class AikuCun:
     def __init__(self):
         self.url = 'https://gray-merc.aikucun.com/index.html'
@@ -204,7 +193,7 @@ class AikuCun:
        self.start_date = start_date
        if end_date:
            self.end_date = end_date
-        date_list = dates_between(start_date=self.start_date, end_date=self.end_date)
+        date_list = otk.dates_between(start_date=self.start_date, end_date=self.end_date)

        df = download.data_to_df(
            db_name=self.db_name,
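The only functional change in aikucun.py is dropping the module-level dates_between copy in favour of the shared helper, so the crawl loop builds its date list the same way as before. A sketch of the equivalent call, assuming start and end dates are set as in the hunk above; the dates shown are illustrative:

from mdbq.other import otk

date_list = otk.dates_between(start_date='2025-03-01', end_date='2025-03-03')
# expected: ['2025-03-01', '2025-03-02', '2025-03-03'], matching the removed local helper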
{mdbq-3.8.9.dist-info → mdbq-3.8.10.dist-info}/RECORD
CHANGED
@@ -1,26 +1,17 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
-mdbq/__version__.py,sha256=
+mdbq/__version__.py,sha256=30BOEMWMMdvugdYm1n90xiBvxiQzusLf7XtVO4-Zjr8,18
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/query_data.py,sha256=-4HWC1HZmgqUAuvcRiHZU4FLtI70nRq_Hp3eXVZTyH8,185843
-mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
-mdbq/bdup/bdup.py,sha256=hJs815hGFwm_X5bP2i9XugG2w2ZY_F0n3-Q0hVpIPPw,4892
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/config/config.py,sha256=eaTfrfXQ65xLqjr5I8-HkZd_jEY1JkGinEgv3TSLeoQ,3170
-mdbq/config/set_support.py,sha256=7C7NFy7Em_uC7lig54qQlIlKG_AJeMCskxzK87anGkM,462
-mdbq/dataframe/__init__.py,sha256=2HtCN8AdRj53teXDqzysC1h8aPL-mMFy561ESmhehGQ,22
-mdbq/dataframe/converter.py,sha256=lETYhT7KXlWzWwqguqhk6vI6kj4rnOBEW1lhqKy2Abc,5035
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
-mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
 mdbq/log/spider_logging.py,sha256=KX9TTUn9naZNBACCEFhyTktnWhr5JaSNQLppLGyrm9Y,1645
-mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/mysql.py,sha256=
-mdbq/mysql/s_query.py,sha256=
-mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
+mdbq/mysql/mysql.py,sha256=2cPuqX4zq2b6ghFWxTylr52DPZGE2WNrCdFV0RcF6LY,89048
+mdbq/mysql/s_query.py,sha256=X055aLRAgxVvueXx4NbfNjp6MyBI02_XBb1pTKw09L0,8660
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
 mdbq/other/download_sku_picture.py,sha256=YU8DxKMXbdeE1OOKEA848WVp62jYHw5O4tXTjUdq9H0,44832
-mdbq/other/otk.py,sha256=
-mdbq/other/porxy.py,sha256=UHfgEyXugogvXgsG68a7QouUCKaohTKKkI4RN-kYSdQ,4961
+mdbq/other/otk.py,sha256=amIFeLDNUJpSi0U6hXbnqXeGTbYL-8-5U5yAATzSM3Y,5947
 mdbq/other/pov_city.py,sha256=AEOmCOzOwyjHi9LLZWPKi6DUuSC-_M163664I52u9qw,21050
 mdbq/other/ua_sj.py,sha256=JuVYzc_5QZ9s_oQSrTHVKkQv4S_7-CWx4oIKOARn_9U,22178
 mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
@@ -29,8 +20,8 @@ mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
 mdbq/redis/__init__.py,sha256=YtgBlVSMDphtpwYX248wGge1x-Ex_mMufz4-8W0XRmA,12
 mdbq/redis/getredis.py,sha256=Uk8-cOWT0JU1qRyIVqdbYokSLvkDIAfcokmYj1ebw8k,24104
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq/spider/aikucun.py,sha256=
-mdbq-3.8.
-mdbq-3.8.
-mdbq-3.8.
-mdbq-3.8.
+mdbq/spider/aikucun.py,sha256=YLRTDgOKPGDyNB-z5dPOJhBoTzM6Rmbjy1Qng_KyJQc,19906
+mdbq-3.8.10.dist-info/METADATA,sha256=bYHqcx9saoyDq-BQSXUtUQWTcDNICdZeF5jaKP6Dlyc,364
+mdbq-3.8.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+mdbq-3.8.10.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.8.10.dist-info/RECORD,,
mdbq/bdup/bdup.py
DELETED
@@ -1,111 +0,0 @@
-# -*- coding: UTF-8 –*-
-import os
-import platform
-import subprocess
-from concurrent.futures import ThreadPoolExecutor
-from bypy import ByPy
-
-
-class BaiDu:
-    """
-    如果通过调用命令行终端运行, 云端路径必须使用linux格式,不要使用windows格式,否则在windows系统里面会上传失败(无法在云端创建文件)
-    """
-    def __init__(self):
-        self.local_path = None
-        self.remote_path = None
-        self.skip:list = []
-        self.delete_remote_files:list = []
-        self.bp = ByPy()
-        self.count = 0
-        self.total = 0
-
-    def upload_dir(self, local_path, remote_path):
-        """
-        上传整个文件夹,执行完后删除指定文件, 指定 self.delete_remote_files
-        如果通过调用命令行终端运行, 《云端路径!!》必须使用linux格式,不要使用反斜杆,否则在windows系统里面会上传失败
-        """
-        self.local_path = local_path
-        self.remote_path = remote_path.replace('\\', '/')
-        if not os.path.exists(self.local_path):
-            print(f'{self.local_path}: 本地目录不存在,没有什么可传的')
-            return
-
-        if platform.system() == 'Windows':
-            self.bp.upload(localpath=self.local_path, remotepath=self.remote_path.replace('\\', '/'))  # 上传文件到百度云
-        else:
-            command = f'bypy upload "{self.local_path}" "{self.remote_path}" --on-dup skip'  # 相同文件跳过
-            try:
-                subprocess.run(command, shell=True)
-            except Exception as e:
-                print(e)
-        self.delete_files()  # 最好是在内部执行删除, 避免路径异常
-
-    def upload_file(self, local_path, remote_path, processes=False):
-        """
-        上传文件夹,按单个文件上传,可以跳过指定文件/文件夹, 指定 self.skip
-        《云端路径!!》必须使用linux格式
-        """
-        if not isinstance(self.skip, list):
-            raise TypeError('skip must be a list')
-        self.skip += ['.DS_Store', '.localized', 'desktop.ini', '$RECYCLE.BIN', 'Icon']
-        self.local_path = local_path
-        self.remote_path = remote_path.replace('\\', '/')
-        if not os.path.exists(self.local_path):
-            print(f'{self.local_path}: 本地目录不存在,没有什么可传的')
-            return
-
-        local_files = os.listdir(self.local_path)
-
-        local_file_list = []
-        for file in local_files:
-            if file in self.skip:  # 跳过指定文件/文件夹
-                continue
-            local_p = os.path.join(self.local_path, file)
-            if os.path.isfile(local_p):
-                rt_path = os.path.join(self.remote_path, file).replace('\\', '/')
-                self.total += 1
-                local_file_list.append({local_p: rt_path})
-            elif os.path.isdir(local_p):
-                for root, dirs, files in os.walk(local_p, topdown=False):
-                    for name in files:
-                        if name in self.skip:  # 从子文件夹内跳过指定文件
-                            continue
-                        lc_path = os.path.join(root, name)
-                        rt_path = lc_path.replace(self.local_path, self.remote_path).replace('\\', '/')
-                        self.total += 1
-                        local_file_list.append({lc_path: rt_path})
-        if processes:
-            # 不指定 max_workers 参数,默认值是 os.cpu_count() * 5
-            with ThreadPoolExecutor() as executor:
-                executor.map(self.up_one_file, local_file_list)
-        else:
-            for item in local_file_list:
-                self.up_one_file(file_dict=item)
-
-    def up_one_file(self, file_dict:dict):
-        if not isinstance(file_dict, dict):
-            raise TypeError('file_dict must be a dict')
-        for k, v in file_dict.items():
-            self.count += 1
-            print(f'上传: {self.count}/{self.total} {k}')
-            self.bp.upload(localpath=k, remotepath=v)  # 上传文件到百度云
-
-    def delete_files(self):
-        """ 移除云端文件,位于 self.remote_path 文件夹下的子文件 """
-        self.delete_remote_files += ['.DS_Store', '.localized', 'desktop.ini', '$RECYCLE.BIN', 'Icon']
-        for delete_file in self.delete_remote_files:
-            self.bp.remove(remotepath=f'{self.remote_path.replace('\\', '/')}/{delete_file}')  # 移除文件
-
-    def download_dir(self, local_path, remote_path):
-        """ 下载文件夹到本地 """
-        self.local_path = local_path
-        self.remote_path = remote_path.replace('\\', '/')
-        if not os.path.exists(self.local_path):
-            os.mkdir(self.local_path)
-
-        self.bp.download(localpath=f'{self.local_path}', remotepath=f'{self.remote_path.replace('\\', '/')}')
-
-
-if __name__ == '__main__':
-    bp = ByPy()
-    bp.list()
mdbq/config/set_support.py
DELETED
@@ -1,20 +0,0 @@
-# -*- coding: UTF-8 –*-
-import platform
-import getpass
-import os
-import sys
-
-"""
-专门用来设置 support 文件夹路径
-support 文件夹包含很多配置类文件,是程序必不可少的依赖
-"""
-
-
-class SetSupport:
-    def __init__(self, dirname):
-        self.dirname = os.path.join(os.path.realpath(os.path.dirname(sys.argv[0])), dirname)
-
-
-if __name__ == '__main__':
-    s = SetSupport(dirname='support').dirname
-    print(s)
mdbq/dataframe/__init__.py
DELETED
mdbq/dataframe/converter.py
DELETED
@@ -1,107 +0,0 @@
-# -*- coding:utf-8 -*-
-import pandas as pd
-import numpy as np
-from decimal import Decimal
-import re
-
-
-class DataFrameConverter(object):
-    def __init__(self, df=pd.DataFrame({})):
-        self.df = df
-
-    def convert_df_cols(self, df=pd.DataFrame({})):
-        """
-        清理 dataframe 非法值
-        对数据类型进行转换(尝试将 object 类型转为 int 或 float)
-        """
-        if len(df) == 0:
-            df = self.df
-        if len(df) == 0:
-            return
-
-        def find_longest_decimal_value(number_list):
-            # 取列表中小数位数最长的值
-            longest_value = None
-            max_decimals = 0
-            for num in number_list:
-                decimal_places = len(str(num).split('.')[1])
-                if decimal_places > max_decimals:
-                    max_decimals = decimal_places
-                    longest_value = num
-            return longest_value
-
-        # dtypes = df.dtypes.apply(str).to_dict()  # 将 dataframe 数据类型转为字典形式
-        df.replace([np.inf, -np.inf], '0', inplace=True)  # 清理一些非法值
-        # df.replace(to_replace=['\\N', '-', '--', '', 'nan', 'NAN'], value='0', regex=False, inplace=True)  # 替换掉特殊字符
-        df.replace(to_replace=['\\N', '', 'nan', 'NAN'], value='0', regex=False, inplace=True)  # 替换掉特殊字符
-        # df.replace(to_replace=[','], value='', regex=True, inplace=True)
-        df.replace(to_replace=['="'], value='', regex=True, inplace=True)  # ="和"不可以放在一起清洗, 因为有: id=86785565
-        df.replace(to_replace=['"'], value='', regex=True, inplace=True)
-        cols = df.columns.tolist()
-
-        df.reset_index(inplace=True, drop=True)  # 重置索引,避免下面的 df.loc[0, col] 会出错
-
-        for col in cols:
-            if col.lower() == 'id':
-                df.pop(col)  # 等待插入的 df 不能包含 id 列,否则可能跟现有 id 主键冲突
-                continue
-
-            try:
-                # 百分比在某些数据库中不兼容, 转换百分比为小数, # 转百分比的列不能含有中文或特殊字符
-                df[col] = df[col].apply(
-                    lambda x: float(float((str(x).rstrip("%"))) / 100) if re.findall(r'^\d+\.?\d*%$', str(x)) else x)
-            except Exception as e:
-                print(f'留意错误信息: 位于列 -> {col} -> {e}')
-
-            if (col.endswith('占比') or col.endswith('率') or col.endswith('同比')
-                    or col.endswith('环比') or col.lower().endswith('roi')
-                    or col.endswith('产出比')):
-                df = df.astype({col: 'float64'}, errors='raise')
-
-            # 尝试转换合适的数据类型
-            if df[col].dtype == 'object':
-                # "_"符号会被错误识别
-                try:
-                    # 不能直接使用 int() ,对于大数,可能转为uint64,导致数据库入库可能异常
-                    df[col] = df[col].apply(
-                        lambda x: np.int64(str(x)) if '_' not in str(x) and '.' not in str(x) else x)  # 不含小数点尝试转整数
-                    # df[col] = df[col].apply(lambda x: int(x) if '_' not in str(x) and '.' not in str(x) else x)  # 不含小数点尝试转整数
-                except:
-                    pass
-            if df[col].dtype == 'object':
-                try:
-                    df[col] = df[col].apply(lambda x: float(x) if '.' in str(x) and '_' not in str(x) else x)
-                except:
-                    pass
-            if df[col].dtype == 'float' or df[col].dtype == 'float64':  # 对于小数类型, 保留 6 位小数
-                df[col] = df[col].fillna(0.0).apply(lambda x: round(x, 6))
-
-            # 转换日期样式的列为日期类型
-            value = df.loc[0, col]
-            if value:
-                res = re.match(r'\d{4}-\d{2}-\d{2}|\d{4}-\d{2}-\d{2} |\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
-                               r'|\d{4}/\d{1}/\d{1}|\d{4}/\d{1}/\d{2}|\d{4}/\d{2}/\d{1}|\d{4}/\d{2}/\d{2}', str(value))
-                if res:
-                    try:
-                        df[col] = df[col].apply(lambda x: pd.to_datetime(x))
-                    except:
-                        pass
-            new_col = col.lower()
-            new_col = re.sub(r'[()\-,,&~^、 ()\"\'“”=·/。》《><!!`]', '_', new_col, re.IGNORECASE)
-            new_col = new_col.replace(')', '')
-            new_col = re.sub(r'_{2,}', '_', new_col)
-            new_col = re.sub(r'_+$', '', new_col)
-            df.rename(columns={col: new_col}, inplace=True)
-        df.fillna(0, inplace=True)
-        return df
-
-
-if __name__ == '__main__':
-    # df = pd.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])
-    # converter = DataFrameConverter()
-    # df = converter.convert_df_cols(df)
-    # print(df['a'].dtype)
-    # print(df)
-    pattern = '1540%'
-    pattern = re.findall(r'^\d+\.?\d*%$', pattern)
-    print(pattern)
mdbq/log/mylogger.py
DELETED
@@ -1,66 +0,0 @@
-import logging
-from logging import Logger
-from logging import handlers
-
-
-class MyLogger(Logger):
-    """
-    从Logger类中继承,实例化一个日志器
-    """
-    def __init__(self, logger_name, level='INFO', is_stream_handler=True, file=None, debug_file=None,
-                 max_bytes=False, back_count=10, when=None):
-        """
-        :param logger_name: 日志器的名字
-        :param level: 日志级别 # DEBUG INFO WARNING ERROR CRITICAL
-        :param is_stream_handler: 默认True输出到控制台
-        :param file: 传入文件名,默认None不输出到 file
-        param debug_file: 传入文件名,记录详细debug时使用,默认None不输出, 尽量不要和file同时使用,会重复写
-        :param when: 按周期分割日志,默认不分割,除非指定其他值
-        :param max_bytes: 按文件大小分割日志
-        :param back_count: 保留日志的数量, 值从0开始
-        """
-        # 设置日志器名字、级别
-        super().__init__(logger_name, level)
-
-        # 定义日志格式, 使用Formatter类实例化一个日志类
-        fmt_stream = "%(asctime)s %(levelname)s %(name)s: %(message)s"
-        fmt_file = "%(asctime)s %(name)s: %(message)s"
-        fmt_debug_file = "%(asctime)s %(levelname)s %(name)s %(funcName)s: %(message)s"
-        formatter_stream = logging.Formatter(fmt_stream, datefmt="%Y-%m-%d %H:%M:%S")
-        formatter_file = logging.Formatter(fmt_file, datefmt="%Y-%m-%d %H:%M:%S")
-        formatter_debug_file = logging.Formatter(fmt_debug_file, datefmt="%Y-%m-%d %H:%M:%S")
-
-        # 创建一个handler,默认输出到控制台,如果设置为False,日志将不输出到控制台
-        if is_stream_handler:
-            stream_handler = logging.StreamHandler()  # 设置渠道当中的日志格式
-            stream_handler.setFormatter(formatter_stream)  # 将渠道与实例日志器绑定
-            self.addHandler(stream_handler)
-
-        # 创建一个handler,输出到文件file
-        if file:
-            file_handle = logging.FileHandler(file, mode='a', encoding='utf-8')
-            file_handle.setFormatter(formatter_file)
-            self.addHandler(file_handle)
-
-        # 创建一个handler,输出到文件file,记录详细的debug信息
-        if debug_file:
-            debug_file_handle = logging.FileHandler(debug_file, mode='a', encoding='utf-8')
-            debug_file_handle.setFormatter(formatter_debug_file)
-            self.addHandler(debug_file_handle)
-
-        # 创建一个handler,按日志文件大小分割
-        if max_bytes:
-            formatter_ = logging.Formatter(fmt='%(asctime)s %(name)s: %(message)s', datefmt="%Y-%m-%d %H:%M:%S")
-            formatter_time = handlers.RotatingFileHandler(filename='日志_分割.txt', encoding='utf-8',
-                                                          maxBytes=max_bytes, backupCount=back_count)
-            formatter_time.setLevel(level)
-            formatter_time.setFormatter(formatter_)
-            self.addHandler(formatter_time)
-
-        # 创建一个handler,按指定周期分割日志
-        if when:
-            pass
-
-
-if __name__ == '__main__':
-    pass
mdbq/mongo/__init__.py
DELETED
mdbq/mysql/year_month_day.py
DELETED
@@ -1,38 +0,0 @@
-# -*- coding:utf-8 -*-
-import warnings
-import pandas as pd
-import calendar
-
-warnings.filterwarnings('ignore')
-
-
-def year_month_day(start_date, end_date):
-    """
-    使用date_range函数和DataFrame来获取从start_date至end_date之间的所有年月日
-    calendar.monthrange: 获取当月第一个工作日的星期值(0,6) 以及当月天数
-    返回值: [{'起始日期': '2025-05-01', '结束日期': '2025-05-31'}, {'起始日期': '2025-06-01', '结束日期': '2025-06-30'}]
-    """
-    # 替换年月日中的日, 以便即使传入当月日期也有返回值
-    try:
-        start_date = f'{pd.to_datetime(start_date).year}-{pd.to_datetime(start_date).month}-01'
-    except Exception as e:
-        print(e)
-        return []
-    # 使用pandas的date_range创建一个日期范围,频率为'MS'代表每月开始
-    date_range = pd.date_range(start=start_date, end=end_date, freq='MS')
-    # 转换格式
-    year_months = date_range.strftime('%Y-%m').drop_duplicates().sort_values()
-
-    results = []
-    for year_month in year_months:
-        year = re.findall(r'(\d{4})', year_month)[0]
-        month = re.findall(r'\d{4}-(\d{2})', year_month)[0]
-        s, d = calendar.monthrange(int(year), int(month))
-        results.append({'起始日期': f'{year_month}-01', '结束日期': f'{year_month}-{d}'})
-
-    return results  # start_date至end_date之间的所有年月日
-
-
-if __name__ == '__main__':
-    results = year_month_day(start_date='2025-05-01', end_date='2025-08-01')
-    print(results)
mdbq/other/porxy.py
DELETED
@@ -1,115 +0,0 @@
-import requests
-import kdl
-import warnings
-import os
-import requests
-import datetime
-import re
-import time
-import socket
-warnings.filterwarnings('ignore')
-"""
-需要传入 订单的 secret_id 和 secret_key
-"""
-
-
-class MyProxy(object):
-
-    def __init__(self, secret_id, secret_key):
-        self.secret_id = secret_id
-        self.secret_key = secret_key
-        self.cookie_path = 'cookies'
-        if not os.path.exists(self.cookie_path):
-            os.mkdir(self.cookie_path)
-
-    def get_proxy(self):
-        """
-        从代理网站获取代理ip, 默认参数是文件位置,不需要修改
-        """
-        secret_id = self.secret_id
-        secret_key = self.secret_key
-        cookie_path = self.cookie_path
-        headers = {
-            "User-Agent": 'Mozilla/5.0'
-        }
-        auth = kdl.Auth(secret_id=secret_id, secret_key=secret_key)
-        client = kdl.Client(auth)
-
-        def ip_address():
-            try:
-                _response = requests.get("https://api.ipify.org/?format=json")
-                _ip = _response.json()["ip"]
-            except:
-                _ip = ''
-            return str(_ip)
-
-        myip_path = f'{cookie_path}/本机ip_{socket.gethostname()}.txt'  # 将本机地址保存本地, 下次直接使用, 避免获取失败
-        if os.path.exists(myip_path):
-            file_timestamp = os.path.getmtime(myip_path)
-            file_date = datetime.datetime.fromtimestamp(file_timestamp).strftime('%Y-%m-%d')
-            today_date = datetime.datetime.today().strftime('%Y-%m-%d')
-            if file_date == today_date:
-                with open(myip_path) as m:
-                    my_ip = m.read().strip()
-            else:
-                my_ip = ip_address()
-                with open(f'{cookie_path}/本机ip_{socket.gethostname()}.txt', 'w') as f:
-                    f.write(my_ip)
-        else:
-            my_ip = ip_address()
-            with open(f'{cookie_path}/本机ip_{socket.gethostname()}.txt', 'w') as f:
-                f.write(my_ip)
-        try:
-            ip_whitelist = client.get_ip_whitelist()  # 检查ip白名单, 如果这句报错,就直接设置白名单
-            if my_ip not in ip_whitelist:
-                ip_whitelist.append(my_ip)
-                client.set_ip_whitelist(ip_whitelist)  # 添加本机到白名单
-        except Exception as e:
-            print(e)
-            client.set_ip_whitelist(my_ip)  # 设置本机到白名单,会清空其他ip
-
-        if not os.path.isfile(f'{cookie_path}/secret_token_{socket.gethostname()}.txt'):  # 如果本地没有密钥令牌则创建
-            secret_token = client.get_secret_token()
-            with open(f'{cookie_path}/secret_token_{socket.gethostname()}.txt', 'w') as f:
-                f.write(secret_token)
-        else:
-            with open(f'{cookie_path}/secret_token_{socket.gethostname()}.txt', 'r') as f:
-                secret_token = f.read()
-            data = f'secret_id={secret_id}&secret_token={secret_token}'  # 检查密钥令牌的有效时长
-            token_expire = requests.post(
-                'https://dev.kdlapi.com/api/check_secret_token',
-                data, headers=headers).json()['data']['expire']
-            if token_expire < 300:  # token_expire 密钥令牌距离过期的剩余时长(单位:秒),不足5分钟则重新创建令牌
-                secret_token = client.get_secret_token()
-                with open(f'{cookie_path}/secret_token_{socket.gethostname()}.txt', 'w') as f:
-                    f.write(secret_token)
-        # api地址
-        proxy_url = (f'https://dev.kdlapi.com/api/getdps/?'
-                     f'secret_id={secret_id}'
-                     f'&signature={secret_token}'
-                     f'&num=1&pt=1&format=text&sep=1&f_loc=1&f_citycode=1&area=440100')
-        # expire_time = client.get_order_expire_time()  # 账户有效期
-        _proxy = requests.get(proxy_url, headers=headers).text  # 通过api地址获取代理ip
-        ip_times = client.get_dps_valid_time(proxy=_proxy).values()  # ip有效时间
-        for t in ip_times:
-            if str(t) != '0':
-                ip_times = t
-        balance = client.get_ip_balance(sign_type='hmacsha1')  # 可用ip余额
-        d_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-        ip_proxy = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', _proxy)[0]
-        city_proxy = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+,([\u4e00-\u9fa5]+),', _proxy)[0]
-        ip_port = ip_proxy.split(':')
-        content = (f'{d_time} 中转IP:{ip_port[0]}, '
-                   f'端口:{ip_port[1]}, '
-                   f'出口地址:{city_proxy}, '
-                   f'ip时长:{ip_times}秒, '
-                   f'可用ip余额:{balance}, '
-                   )
-        # print(content)
-        with open(f'{cookie_path}/代理ip地址.txt', 'a', encoding='utf-8') as f:
-            f.write(content)
-        return ip_proxy
-
-
-if __name__ == '__main__':
-    cookie_path = 'cookies'
{mdbq-3.8.9.dist-info → mdbq-3.8.10.dist-info}/WHEEL
File without changes
{mdbq-3.8.9.dist-info → mdbq-3.8.10.dist-info}/top_level.txt
File without changes