mdbq 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +2 -2
- mdbq/aggregation/query_data.py +1 -1
- mdbq/dataframe/converter.py +45 -0
- mdbq/mysql/data_types.py +2 -2
- mdbq/mysql/mysql.py +109 -3
- {mdbq-0.4.4.dist-info → mdbq-0.4.6.dist-info}/METADATA +1 -1
- {mdbq-0.4.4.dist-info → mdbq-0.4.6.dist-info}/RECORD +9 -9
- {mdbq-0.4.4.dist-info → mdbq-0.4.6.dist-info}/WHEEL +0 -0
- {mdbq-0.4.4.dist-info → mdbq-0.4.6.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
mdbq/aggregation/query_data.py
CHANGED
```diff
@@ -525,5 +525,5 @@ def data_aggregation(service_databases=[{}]):
 
 
 if __name__ == '__main__':
-    data_aggregation(service_databases=[{'
+    data_aggregation(service_databases=[{'company': 'mysql'}])
     # optimize_data.op_data(service_databases=[{'company': 'mysql'}], days=3650)  # 立即启动对聚合数据的清理工作
```
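The entry point now passes a concrete service/database mapping. Below is a hypothetical sketch (not part of the package) of how a `service_databases` list of single-key dicts such as `[{'company': 'mysql'}]` can be unpacked into the `(target_service, database)` pairs that `get_myconf.select_config_values()` takes; only the call shape of that function is taken from this diff (see the `__main__` block of `mdbq/mysql/mysql.py` further down), the loop itself is an assumption.

```python
# Illustration only: unpack [{'company': 'mysql'}]-style entries into
# (target_service, database) pairs for get_myconf.select_config_values().
service_databases = [{'company': 'mysql'}]
for entry in service_databases:
    for target_service, database in entry.items():
        # get_myconf.select_config_values(target_service=target_service, database=database)
        print(target_service, database)
```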
mdbq/dataframe/converter.py
CHANGED
```diff
@@ -9,6 +9,51 @@ class DataFrameConverter(object):
         self.df = df
 
     def convert_df_cols(self, df=pd.DataFrame({})):
+        """
+        清理 dataframe 非法值
+        对数据类型进行转换(尝试将 object 类型转为 int 或 float)
+        """
+        if len(df) == 0:
+            df = self.df
+        if len(df) == 0:
+            return
+        # dtypes = df.dtypes.apply(str).to_dict()  # 将 dataframe 数据类型转为字典形式
+        df.replace([np.inf, -np.inf], 0, inplace=True)  # 清理一些非法值
+        df.replace(to_replace=['\\N', '-', '--', '', 'nan'], value=0, regex=False, inplace=True)  # 替换掉特殊字符
+        df.replace(to_replace=[','], value='', regex=True, inplace=True)
+        df.replace(to_replace=['="'], value='', regex=True, inplace=True)  # ="和"不可以放在一起清洗, 因为有: id=86785565
+        df.replace(to_replace=['"'], value='', regex=True, inplace=True)
+        cols = df.columns.tolist()
+
+        for col in cols:
+            # df[col] = df[col].apply(lambda x: re.sub('[="]', '', str(x)) if '="' in str(x) else x)
+            # 百分比在某些数据库中不兼容, 转换百分比为小数
+            df[col] = df[col].apply(lambda x: float(float((str(x).rstrip("%"))) / 100) if str(x).endswith('%') and '~' not in str(x) else x)
+            # 尝试转换合适的数据类型
+            if df[col].dtype == 'object':
+                try:
+                    # df[col] = df[col].astype(int)  # 尝试转换 int
+                    df[col] = df[col].apply(lambda x: int(x) if '_' not in str(x) else x)
+                except:
+                    # df[col] = df[col].astype('float64', errors='ignore')  # 尝试转换 float, 报错则忽略
+                    try:
+                        df[col] = df[col].apply(lambda x: float(x) if '_' not in str(x) else x)
+                    except:
+                        pass
+            if df[col].dtype == 'float':  # 对于小数类型, 保留 6 位小数
+                df[col] = df[col].apply(lambda x: round(float(x), 6) if x != 0 else x)
+            # 清理列名, 在 mysql 里面列名不能含有某些特殊字符
+            if '日期' in col or '时间' in col:
+                try:
+                    df[col] = df[col].apply(lambda x: pd.to_datetime(x))
+                except:
+                    pass
+            new_col = col.lower()
+            df.rename(columns={col: new_col}, inplace=True)
+        df.fillna(0, inplace=True)
+        return df
+
+    def convert_df_cols_bak(self, df=pd.DataFrame({})):
         """
         清理 dataframe 列名的不合规字符(mysql)
         对数据类型进行转换(尝试将 object 类型转为 int 或 float)
```
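A minimal usage sketch of the new `convert_df_cols()` cleaner, assuming mdbq 0.4.6 is installed; the no-argument construction of `DataFrameConverter` follows the call pattern used by `df_to_mysql()` in `mdbq/mysql/mysql.py` further down this diff, and the sample column names and values are invented.

```python
import pandas as pd
from mdbq.dataframe.converter import DataFrameConverter  # module path per the RECORD listing

df = pd.DataFrame({
    '日期': ['2023-01-01', '2023-01-02'],
    '转化率': ['3.5%', '0.8%'],             # percent strings should become decimals
    'ID': ['="86785565"', '="86785566"'],    # Excel-style ="..." wrappers should be stripped
})
cleaned = DataFrameConverter().convert_df_cols(df=df)
print(cleaned.dtypes)  # expected: 日期 parsed to datetime, 转化率 to float, id to int (column lowercased)
print(cleaned)
```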
mdbq/mysql/data_types.py
CHANGED
```diff
@@ -166,9 +166,9 @@ class DataTypes:
         return {}
 
 
-def mysql_all_dtypes(path
+def mysql_all_dtypes(path=None):
     """
-
+    更新笔记本 mysql 中所有数据库的 dtypes 信息到本地 json
     """
     if not os.path.isdir(path):
         path = set_support.SetSupport(dirname='support').dirname
```
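A hedged call sketch for the reworked `mysql_all_dtypes()`, assuming mdbq 0.4.6 is installed, the function is importable from `mdbq.mysql.data_types`, and a reachable MySQL instance is configured; the path below is a placeholder. Per the hunk above, a path that is not an existing directory makes the function fall back to the package's own `support` directory via `set_support.SetSupport`.

```python
from mdbq.mysql.data_types import mysql_all_dtypes  # assumed import path

# Placeholder path; if it is not an existing directory, the function falls back
# to the package's support directory (see the hunk above).
mysql_all_dtypes(path='/tmp/mdbq_support')
```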
mdbq/mysql/mysql.py
CHANGED
```diff
@@ -9,6 +9,7 @@ import warnings
 import pymysql
 import numpy as np
 import pandas as pd
+from more_itertools.more import iequals
 from sqlalchemy import create_engine
 import os
 import calendar
```
```diff
@@ -63,7 +64,7 @@ class MysqlUpload:
             return False
 
     # @try_except
-    def
+    def df_to_mysql_bak(self, df, tabel_name, db_name='远程数据源'):
         """
         将 df 写入数据库
         db_name: 数据库名称
```
```diff
@@ -182,6 +183,106 @@ class MysqlUpload:
         finally:
             connection.close()
 
+    def df_to_mysql(self, df, tabel_name, db_name='远程数据源', drop_duplicates=False):
+        """
+        将 df 写入数据库
+        db_name: 数据库名称
+        tabel_name: 集合/表名称
+        drop_duplicates:仅限于聚合数据使用,其他情况不要设置
+        """
+        cv = converter.DataFrameConverter()
+        df = cv.convert_df_cols(df=df)  # 清理 dataframe 非法值
+
+        connection = pymysql.connect(**self.config)  # 连接数据库
+        with connection.cursor() as cursor:
+            cursor.execute(f"SHOW DATABASES LIKE '{db_name}'")  # 检查数据库是否存在
+            database_exists = cursor.fetchone()
+            if not database_exists:
+                # 如果数据库不存在,则新建
+                if '8.138.27' in str(self.host) or platform.system() == "Linux":  # 阿里云 mysql 低版本不支持 0900
+                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_unicode_ci"
+                    self.config.update({'charset': 'utf8mb4_unicode_ci'})
+                if '192.168.1.100' in str(self.host):
+                    sql = f"CREATE DATABASE `{db_name}`"
+                else:
+                    sql = f"CREATE DATABASE `{db_name}` COLLATE utf8mb4_0900_ai_ci"
+                cursor.execute(sql)
+                connection.commit()
+                print(f"创建Database: {db_name}")
+
+        self.config.update({'database': db_name})  # 添加更新 config 字段
+        connection = pymysql.connect(**self.config)  # 重新连接数据库
+        with connection.cursor() as cursor:
+            # 1. 查询表, 不存在则创建一个空表
+            sql = f"SHOW TABLES LIKE '{tabel_name}';"  # 有特殊字符不需转义
+            cursor.execute(sql)
+            if not cursor.fetchone():
+                sql = f"CREATE TABLE IF NOT EXISTS `{tabel_name}` (id INT AUTO_INCREMENT PRIMARY KEY)"
+                cursor.execute(sql)
+                print(f'创建 mysql 表: {tabel_name}')
+
+            # 2. 列数据类型转换
+            dtypes = self.convert_dtypes(df=df, db_name=db_name, tabel_name=tabel_name)
+
+            # 有特殊字符不需转义
+            sql = f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{tabel_name}';"
+            cursor.execute(sql)
+            col_exist = [item['COLUMN_NAME'] for item in cursor.fetchall()]
+            cols = df.columns.tolist()
+            col_not_exist = [col for col in cols if col not in col_exist]
+            # 检查列,不存在则新建列
+            if col_not_exist:  # 数据表中不存在的列
+                for col in col_not_exist:
+                    try:
+                        # 创建列,需转义
+                        sql = f"ALTER TABLE `{tabel_name}` ADD COLUMN `{col}` {dtypes[col]} DEFAULT NULL;"
+                        cursor.execute(sql)
+                        print(f"添加列: {col}({dtypes[col]})")  # 添加列并指定数据类型
+
+                        # 创建索引
+                        if col == '日期':
+                            cursor.execute(f"SHOW INDEXES FROM `{tabel_name}` WHERE `Column_name` = '{col}'")
+                            result = cursor.fetchone()  # 检查索引是否存在
+                            if not result:
+                                cursor.execute(f"CREATE INDEX index_name ON `{tabel_name}`(`{col}`)")
+                    except:
+                        pass
+                connection.commit()  # 提交事务
+
+            # 4. 移除指定日期范围内的数据,仅限于聚合数据使用,其他情况不要设置
+            if drop_duplicates and '日期' in df.columns.tolist():
+                dates = df['日期'].values.tolist()
+                start_date = pd.to_datetime(min(dates)).strftime('%Y-%m-%d')
+                end_date = (pd.to_datetime(max(dates)) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
+                sql = f"DELETE FROM `{tabel_name}` WHERE {'日期'} BETWEEN '%s' AND '%s'" % (start_date, end_date)
+                cursor.execute(sql)
+                connection.commit()
+
+            # 5. 更新插入数据
+            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
+            print(f'{now}正在更新 mysql ({self.host}:{self.port}) {db_name}/{tabel_name}')
+            datas = df.to_dict(orient='records')
+            for data in datas:
+                try:
+                    cols = ', '.join(f"`{item}`" for item in data.keys())  # 列名转义
+                    # data.update({item: f"{data[item]}" for item in data.keys()})  # 全部值转字符, 不是必须的
+                    values = ', '.join([f"'{item}'" for item in data.values()])  # 值要加单引号 ''
+                    condition = []
+                    for k, v in data.items():
+                        condition += [f"`{k}` = '{v}'"]
+                    condition = ' AND '.join(condition)  # 构建查询条件
+                    # print(condition)
+
+                    sql = f"SELECT {cols} FROM `{tabel_name}` WHERE {condition}"
+                    cursor.execute(sql)
+                    result = cursor.fetchall()  # 获取查询结果, 如果有结果返回 list,没有则返回空元组 tuple
+                    if not result:  # 数据不存在则插入
+                        sql = f"INSERT INTO `{tabel_name}` ({cols}) VALUES ({values});"
+                        cursor.execute(sql)
+                except:
+                    pass
+            connection.commit()  # 提交事务
+
     def convert_dtypes(self, df, db_name, tabel_name):
         """
         根据本地已经存在的记录着 mysql dtypes 的 json 文件转换 df 的类型为 mysql 专有的数据类型
```
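A hypothetical usage sketch for the new `df_to_mysql()` method, assuming mdbq 0.4.6 is installed and valid MySQL credentials are configured; the `mdbq.config.get_myconf` import path is an assumption, credentials are fetched the same way as in this file's `__main__` block further down, and the DataFrame, database name, and table name are invented.

```python
import pandas as pd
from mdbq.config import get_myconf        # assumed import path
from mdbq.mysql.mysql import MysqlUpload  # module path per the RECORD listing

username, password, host, port = get_myconf.select_config_values(target_service='company', database='mysql')
df = pd.DataFrame({'日期': ['2023-01-01', '2023-01-02'], '销售额': [1234.5, 987.6]})
m = MysqlUpload(username=username, password=password, host=host, port=port)
# drop_duplicates first deletes rows in the DataFrame's date range; the docstring
# restricts it to aggregated data, so the default (False) is kept here.
m.df_to_mysql(df=df, tabel_name='示例数据表', db_name='测试库')
```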
```diff
@@ -240,7 +341,7 @@ class MysqlUpload:
                 return 'mediumtext'
             return 'INT'
         elif dtype == 'float64':
-            return '
+            return 'double'  # mysql 中不要使用 float 类型,会影响计算结果
         elif dtype == 'object':
             return 'mediumtext'
         else:
```
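For reference, a standalone sketch of the pandas-dtype to MySQL-type mapping this branch appears to implement. Only the `float64` and `object` branches and the two `return` lines above them are visible in the hunk; the integer condition and the final fallback are guesses, and per the nearby `convert_dtypes()` docstring the surrounding method also consults a local json of recorded dtypes.

```python
# Illustrative only; the INT-branch condition and the fallback are assumptions.
def map_mysql_dtype(dtype: str) -> str:
    if dtype in ('int64', 'int32'):
        return 'INT'
    elif dtype == 'float64':
        return 'double'       # MySQL FLOAT is avoided because it skews aggregate results
    elif dtype == 'object':
        return 'mediumtext'
    else:
        return 'mediumtext'   # fallback guess

print(map_mysql_dtype('float64'))  # -> double
```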
```diff
@@ -689,8 +790,13 @@ def download_datas(tabel_name, save_path, start_date):
 
 
 if __name__ == '__main__':
-    username, password, host, port = get_myconf.select_config_values(target_service='
+    username, password, host, port = get_myconf.select_config_values(target_service='company', database='mysql')
     print(username, password, host, port)
 
+    df = pd.read_csv('/Users/xigua/Downloads/余额查询.csv', encoding='utf-8_sig', header=0, na_filter=False)
+    # df = df.to_dict(orient='records')
+    m = MysqlUpload(username=username, password=password, host=host, port=port)
+    m.df_to_mysql_new(df=df, db_name='te2- %s t', tabel_name='测 -sdf @%试 表')
+
 
 
```
{mdbq-0.4.4.dist-info → mdbq-0.4.6.dist-info}/RECORD
CHANGED
```diff
@@ -1,9 +1,9 @@
 mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
 mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
-mdbq/aggregation/aggregation.py,sha256=
+mdbq/aggregation/aggregation.py,sha256=UGrhmhlu0oz-96yiaHez6EEacRJ1aALKanHbSWrcGww,58254
 mdbq/aggregation/optimize_data.py,sha256=jLAWtxPUuhpo4XTVrhKtT4xK3grs7r73ePQfLhxlu1I,779
-mdbq/aggregation/query_data.py,sha256=
+mdbq/aggregation/query_data.py,sha256=5lzvEokjHuKtlaSBYjOFH8VA2MTtX8R3MwEUNs03qKg,24491
 mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
 mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
 mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
@@ -16,14 +16,14 @@ mdbq/config/products.py,sha256=tFqSfFSXyZXcof0gAeHq0Ftn4F5i9ucoMyIqZ1H_D2Q,4260
 mdbq/config/set_support.py,sha256=LJLEbUFrv8y-GVskiwOI8A9uRaCEAUa0Yfjugt4yLp0,768
 mdbq/config/update_conf.py,sha256=taL3ZqKgiVWwUrDFuaYhim9a72Hm4BHRhhDscJTziR8,4535
 mdbq/dataframe/__init__.py,sha256=2HtCN8AdRj53teXDqzysC1h8aPL-mMFy561ESmhehGQ,22
-mdbq/dataframe/converter.py,sha256=
+mdbq/dataframe/converter.py,sha256=cD9u9eaDkOcxMaiZH6Wq_0Jp9PLsoPJOmrys7yZpGvI,5535
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
 mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
 mdbq/mongo/mongo.py,sha256=q0B4wXDSTtXg_vMN7MPh6zdxl6tT68tM74LmdVNQQek,31892
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/data_types.py,sha256=
-mdbq/mysql/mysql.py,sha256=
+mdbq/mysql/data_types.py,sha256=N7_SyRviE7H25grmMydLT3W3PLk6s3gIK4i36hut4Ms,9791
+mdbq/mysql/mysql.py,sha256=ItSxepU7XPnzv5SHC2X4jBhqZPF3H6CgdWQe7IgI8bQ,39583
 mdbq/mysql/s_query.py,sha256=6-8O9MHhi3-7n3isJ7t2kTCYL2mSBC_HrxSQmXM5UtI,7901
 mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -34,7 +34,7 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
 mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
 mdbq/pbix/refresh_all.py,sha256=tgy762608HMaXWynbOURIf2UVMuSPybzrDXQnOOcnZU,6102
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq-0.4.
-mdbq-0.4.
-mdbq-0.4.
-mdbq-0.4.
+mdbq-0.4.6.dist-info/METADATA,sha256=Ko9S2D8tJWAkE46mRmNzCiOHeOuJ6RI7TwI002tHEfA,245
+mdbq-0.4.6.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+mdbq-0.4.6.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-0.4.6.dist-info/RECORD,,
```
{mdbq-0.4.4.dist-info → mdbq-0.4.6.dist-info}/WHEEL
File without changes
{mdbq-0.4.4.dist-info → mdbq-0.4.6.dist-info}/top_level.txt
File without changes