mdbq 2.7.1__py3-none-any.whl → 2.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +31 -30
- mdbq/aggregation/mysql_types.py +83 -90
- mdbq/aggregation/optimize_data.py +31 -52
- mdbq/aggregation/query_data.py +256 -295
- mdbq/clean/clean_upload.py +106 -194
- mdbq/config/myconfig.py +30 -0
- mdbq/config/products.py +32 -34
- mdbq/mysql/mysql.py +12 -42
- mdbq/mysql/s_query.py +4 -3
- {mdbq-2.7.1.dist-info → mdbq-2.7.3.dist-info}/METADATA +1 -1
- {mdbq-2.7.1.dist-info → mdbq-2.7.3.dist-info}/RECORD +13 -12
- {mdbq-2.7.1.dist-info → mdbq-2.7.3.dist-info}/WHEEL +1 -1
- {mdbq-2.7.1.dist-info → mdbq-2.7.3.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -1290,33 +1290,35 @@ def file_dir(one_file=True, target_service='company'):
|
|
1290
1290
|
|
1291
1291
|
|
1292
1292
|
def test():
|
1293
|
-
path = os.path.relpath(r'
|
1293
|
+
path = os.path.relpath(r'/Users/xigua/数据中心/原始文件3/达摩盘/dmp人群报表')
|
1294
1294
|
for root, dirs, files in os.walk(path, topdown=False):
|
1295
1295
|
for name in files:
|
1296
1296
|
if name.endswith('.csv') and 'baidu' not in name and '~' not in name:
|
1297
|
-
print(name)
|
1297
|
+
# print(name)
|
1298
1298
|
# df = pd.read_excel(os.path.join(root, name), header=0)
|
1299
1299
|
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
1300
|
-
|
1301
|
-
if
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1305
|
-
|
1306
|
-
|
1307
|
-
|
1308
|
-
|
1309
|
-
|
1310
|
-
|
1311
|
-
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
|
1316
|
-
|
1317
|
-
|
1318
|
-
|
1300
|
+
print(name)
|
1301
|
+
if len(df) == 0:
|
1302
|
+
print(name)
|
1303
|
+
os.remove(os.path.join(root, name))
|
1304
|
+
continue
|
1305
|
+
df = df[df['日期'] != '']
|
1306
|
+
# cols = df.columns.tolist()
|
1307
|
+
# if '千次展现花费' not in cols:
|
1308
|
+
# df.insert(loc=14, column='千次展现花费', value=0.0)
|
1309
|
+
# df.rename(columns={'总费用': '花费', '直接购物车数': '直接加购数', '总购物车数': '总加购数'}, inplace=True)
|
1310
|
+
# if '类目ID' not in cols:
|
1311
|
+
# df['类目ID'] = 'null'
|
1312
|
+
# if '类目名称' not in cols:
|
1313
|
+
# df['类目名称'] = 'null'
|
1314
|
+
# if '店铺名称' not in cols:
|
1315
|
+
# df.insert(loc=1, column='店铺名称', value='京东箱包旗舰店')
|
1316
|
+
# if '全站roi' in cols:
|
1317
|
+
# df.rename(columns={'全站roi': '全站投产比'}, inplace=True)
|
1318
|
+
# new_name = f'{os.path.splitext(name)[0]}.csv'
|
1319
1319
|
df.to_csv(os.path.join(root, name), encoding='utf-8_sig', index=False, header=True)
|
1320
|
+
# breakpoint()
|
1321
|
+
# os.remove(os.path.join(root, name))
|
1320
1322
|
# new_name = f'{os.path.splitext(name)[0]}.xlsx'
|
1321
1323
|
# df.to_excel(os.path.join(root, name),
|
1322
1324
|
# index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))
|
@@ -1331,23 +1333,22 @@ if __name__ == '__main__':
|
|
1331
1333
|
|
1332
1334
|
# # 上传 1 个文件到数据库
|
1333
1335
|
# one_file_to_mysql(
|
1334
|
-
# file='
|
1335
|
-
# db_name='
|
1336
|
-
# table_name='
|
1337
|
-
# target_service='
|
1336
|
+
# file=r'C:\同步空间\BaiduSyncdisk\原始文件2\属性设置\电商定价.csv',
|
1337
|
+
# db_name='属性设置3',
|
1338
|
+
# table_name='电商定价',
|
1339
|
+
# target_service='home_lx',
|
1338
1340
|
# database='mysql'
|
1339
1341
|
# )
|
1340
|
-
|
1342
|
+
#
|
1341
1343
|
# 上传一个目录到指定数据库
|
1342
1344
|
db_name = '京东数据3'
|
1343
|
-
table_name = '京东商智
|
1345
|
+
table_name = '京东商智_店铺来源'
|
1344
1346
|
upload_dir(
|
1345
|
-
path=os.path.relpath(r'
|
1347
|
+
path=os.path.relpath(r'/Users/xigua/数据中心/原始文件3/京东报表/店铺来源_三级来asdasdas源'),
|
1346
1348
|
db_name=db_name,
|
1347
1349
|
collection_name=table_name,
|
1348
1350
|
dbs={'mysql': True, 'mongodb': False},
|
1349
|
-
target_service='
|
1351
|
+
target_service='company',
|
1350
1352
|
)
|
1351
1353
|
|
1352
|
-
|
1353
1354
|
# test()
|
mdbq/aggregation/mysql_types.py
CHANGED
@@ -1,28 +1,20 @@
|
|
1
1
|
# -*- coding:utf-8 -*-
|
2
2
|
import warnings
|
3
3
|
import pandas as pd
|
4
|
-
import numpy as np
|
5
|
-
import chardet
|
6
|
-
import zipfile
|
7
|
-
|
8
|
-
from numpy import dtype
|
9
|
-
from pandas.tseries.holiday import next_monday
|
10
|
-
from pyzipper import PyZipFile
|
11
4
|
import os
|
12
5
|
import platform
|
13
6
|
import json
|
14
7
|
import pymysql
|
8
|
+
import socket
|
15
9
|
from mdbq.mongo import mongo
|
16
10
|
from mdbq.mysql import mysql
|
17
11
|
from mdbq.mysql import s_query
|
18
|
-
from mdbq.config import
|
12
|
+
from mdbq.config import myconfig
|
19
13
|
from mdbq.config import set_support
|
20
14
|
from mdbq.dataframe import converter
|
21
15
|
import datetime
|
22
16
|
import time
|
23
17
|
import re
|
24
|
-
import shutil
|
25
|
-
import getpass
|
26
18
|
|
27
19
|
from sqlalchemy.dialects.postgresql.pg_catalog import pg_get_serial_sequence
|
28
20
|
|
@@ -54,7 +46,7 @@ class DataTypes:
|
|
54
46
|
self.path = set_support.SetSupport(dirname='support').dirname
|
55
47
|
self.service_name = service_name
|
56
48
|
if not self.service_name:
|
57
|
-
self.service_name = '
|
49
|
+
self.service_name = 'xigua_lx'
|
58
50
|
self.json_file = os.path.join(self.path, f'mysql_types_{self.service_name}.json')
|
59
51
|
if not os.path.isdir(self.path):
|
60
52
|
os.makedirs(self.path)
|
@@ -154,88 +146,89 @@ class DataTypes:
|
|
154
146
|
return {}, cl, None, None # 返回这些结果的目的是等添加完列再写 json 文件才能读到 types 信息
|
155
147
|
|
156
148
|
|
157
|
-
def mysql_all_dtypes(db_name=None, table_name=None,
|
149
|
+
def mysql_all_dtypes(db_name=None, table_name=None, path=None):
|
158
150
|
"""
|
159
|
-
|
151
|
+
更新 mysql 中所有数据库的 dtypes 信息到本地 json
|
160
152
|
"""
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
153
|
+
username, password, host, port, service_name = None, None, None, None, None
|
154
|
+
conf = myconfig.main()
|
155
|
+
if socket.gethostname().lower() in ['xigua_lx', 'xigua1', 'macbook pro']:
|
156
|
+
data = conf['Windows']['xigua_lx']['mysql']['local']
|
157
|
+
username, password, host, port = data['username'], data['password'], data['host'], data['port']
|
158
|
+
service_name = 'xigua_lx' # 影响 mysql_types_xigua_lx.json 文件名
|
159
|
+
elif socket.gethostname().lower() in ['company', 'Mac2.local']:
|
160
|
+
data = conf['Windows']['company']['mysql']['local']
|
161
|
+
username, password, host, port = data['username'], data['password'], data['host'], data['port']
|
162
|
+
service_name = 'company' # 影响 mysql_types_company.json 文件名
|
163
|
+
if not username or not service_name:
|
164
|
+
return
|
165
|
+
|
166
|
+
config = {
|
167
|
+
'host': host,
|
168
|
+
'port': int(port),
|
169
|
+
'user': username,
|
170
|
+
'password': password,
|
171
|
+
'charset': 'utf8mb4', # utf8mb4 支持存储四字节的UTF-8字符集
|
172
|
+
'cursorclass': pymysql.cursors.DictCursor,
|
173
|
+
}
|
174
|
+
connection = pymysql.connect(**config) # 连接数据库
|
175
|
+
with connection.cursor() as cursor:
|
176
|
+
sql = "SHOW DATABASES;"
|
177
|
+
cursor.execute(sql)
|
178
|
+
db_name_lists = cursor.fetchall()
|
179
|
+
db_name_lists = [item['Database'] for item in db_name_lists]
|
180
|
+
connection.close()
|
181
|
+
|
182
|
+
sys_lists = ['information_schema', 'mysql', 'performance_schema', 'sakila', 'sys']
|
183
|
+
db_name_lists = [item for item in db_name_lists if item not in sys_lists]
|
184
|
+
|
185
|
+
results = [] # 返回结果示例: [{'云电影': '电影更新'}, {'生意经2': 'e3_零售明细统计'}]
|
186
|
+
for db_ in db_name_lists:
|
187
|
+
config.update({'database': db_}) # 添加更新 config 字段
|
172
188
|
connection = pymysql.connect(**config) # 连接数据库
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
189
|
+
try:
|
190
|
+
with connection.cursor() as cursor:
|
191
|
+
sql = f"SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{db_}';"
|
192
|
+
sql = "SHOW TABLES;"
|
193
|
+
cursor.execute(sql)
|
194
|
+
res_tables = cursor.fetchall()
|
195
|
+
for res_table in res_tables:
|
196
|
+
for k, v in res_table.items():
|
197
|
+
results.append({db_: v})
|
198
|
+
except:
|
199
|
+
pass
|
200
|
+
finally:
|
178
201
|
connection.close()
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
time.sleep(0.5)
|
210
|
-
|
211
|
-
d = DataTypes(path=path, service_name=service_name)
|
212
|
-
for result in results:
|
213
|
-
for db_n, table_n in result.items():
|
214
|
-
# print(db_n, table_n, db_name, table_name)
|
215
|
-
if db_name and table_name: # 下载一个指定的数据表
|
216
|
-
if db_name != db_n or table_name != table_n:
|
217
|
-
continue
|
218
|
-
elif db_name: # 下载一个数据库的所有数据表
|
219
|
-
if db_name != db_n:
|
220
|
-
continue
|
221
|
-
# 如果 db_name 和 table_name 都不指定,则下载所有数据库的所有数据表
|
222
|
-
print(f'获取列信息 数据库: < {db_n} >, 数据表: < {table_n} >')
|
223
|
-
sq = s_query.QueryDatas(username=username, password=password, host=host, port=port)
|
224
|
-
# 获取数据表的指定列, 返回列表
|
225
|
-
# [{'视频bv号': 'BV1Dm4y1S7BU', '下载进度': 1}, {'视频bv号': 'BV1ov411c7US', '下载进度': 1}]
|
226
|
-
name_type = sq.dtypes_to_list(db_name=db_n, table_name=table_n)
|
227
|
-
if name_type:
|
228
|
-
dtypes = {item['COLUMN_NAME']: item['COLUMN_TYPE'] for item in name_type}
|
229
|
-
dtypes = {'mysql': {db_n: {table_n: dtypes}}}
|
230
|
-
d.get_mysql_types(
|
231
|
-
dtypes=dtypes,
|
232
|
-
cl='mysql',
|
233
|
-
db_name=db_n,
|
234
|
-
table_name=table_n,
|
235
|
-
is_file_dtype=True # True表示旧文件有限
|
236
|
-
)
|
237
|
-
else:
|
238
|
-
print(f'数据库回传数据(name_type)为空')
|
202
|
+
time.sleep(0.5)
|
203
|
+
|
204
|
+
d = DataTypes(path=path, service_name=service_name)
|
205
|
+
for result in results:
|
206
|
+
for db_n, table_n in result.items():
|
207
|
+
# print(db_n, table_n, db_name, table_name)
|
208
|
+
if db_name and table_name: # 下载一个指定的数据表
|
209
|
+
if db_name != db_n or table_name != table_n:
|
210
|
+
continue
|
211
|
+
elif db_name: # 下载一个数据库的所有数据表
|
212
|
+
if db_name != db_n:
|
213
|
+
continue
|
214
|
+
# 如果 db_name 和 table_name 都不指定,则下载所有数据库的所有数据表
|
215
|
+
print(f'获取列信息 数据库: < {db_n} >, 数据表: < {table_n} >')
|
216
|
+
sq = s_query.QueryDatas(username=username, password=password, host=host, port=port)
|
217
|
+
# 获取数据表的指定列, 返回列表
|
218
|
+
# [{'视频bv号': 'BV1Dm4y1S7BU', '下载进度': 1}, {'视频bv号': 'BV1ov411c7US', '下载进度': 1}]
|
219
|
+
name_type = sq.dtypes_to_list(db_name=db_n, table_name=table_n)
|
220
|
+
if name_type:
|
221
|
+
dtypes = {item['COLUMN_NAME']: item['COLUMN_TYPE'] for item in name_type}
|
222
|
+
dtypes = {'mysql': {db_n: {table_n: dtypes}}}
|
223
|
+
d.get_mysql_types(
|
224
|
+
dtypes=dtypes,
|
225
|
+
cl='mysql',
|
226
|
+
db_name=db_n,
|
227
|
+
table_name=table_n,
|
228
|
+
is_file_dtype=True # True表示旧文件有限
|
229
|
+
)
|
230
|
+
else:
|
231
|
+
print(f'数据库回传数据(name_type)为空')
|
239
232
|
# print(d.datas)
|
240
233
|
d.as_json_file()
|
241
234
|
|
@@ -243,5 +236,5 @@ def mysql_all_dtypes(db_name=None, table_name=None, service_database={'home_lx':
|
|
243
236
|
if __name__ == '__main__':
|
244
237
|
# 更新 mysql 中所有数据库的 dtypes 信息到本地 json
|
245
238
|
mysql_all_dtypes(
|
246
|
-
path='/Users/xigua/Downloads',
|
239
|
+
path='/Users/xigua/Downloads',
|
247
240
|
)
|
mdbq/aggregation/optimize_data.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# -*- coding: UTF-8 –*-
|
2
2
|
from mdbq.mongo import mongo
|
3
3
|
from mdbq.mysql import mysql
|
4
|
-
from mdbq.config import
|
4
|
+
from mdbq.config import myconfig
|
5
5
|
import socket
|
6
6
|
import subprocess
|
7
7
|
import psutil
|
@@ -10,6 +10,20 @@ import platform
|
|
10
10
|
"""
|
11
11
|
对指定数据库所有冗余数据进行清理
|
12
12
|
"""
|
13
|
+
username, password, host, port, service_database = None, None, None, None, None,
|
14
|
+
if socket.gethostname().lower() in ['xigua_lx', 'xigua1', 'macbook pro']:
|
15
|
+
conf = myconfig.main()
|
16
|
+
data = conf['Windows']['xigua_lx']['mysql']['local']
|
17
|
+
username, password, host, port = data['username'], data['password'], data['host'], data['port']
|
18
|
+
service_database = {'xigua_lx': 'mysql'}
|
19
|
+
elif socket.gethostname().lower() in ['company', 'mac2.local']:
|
20
|
+
conf = myconfig.main()
|
21
|
+
data = conf['Windows']['company']['mysql']['local']
|
22
|
+
username, password, host, port = data['username'], data['password'], data['host'], data['port']
|
23
|
+
service_database = {'company': 'mysql'}
|
24
|
+
if not username:
|
25
|
+
print(f'找不到主机:')
|
26
|
+
|
13
27
|
|
14
28
|
|
15
29
|
def restart_mongodb():
|
@@ -57,60 +71,25 @@ def restart_mongodb():
|
|
57
71
|
subprocess.call(command, shell=True)
|
58
72
|
|
59
73
|
|
60
|
-
def op_data(db_name_lists,
|
74
|
+
def op_data(db_name_lists, days: int = 63, is_mongo=True, is_mysql=True):
|
61
75
|
""" """
|
62
|
-
# for service_database in service_databases:
|
63
|
-
# for service_name, database in service_database.items():
|
64
|
-
# username, password, host, port = get_myconf.select_config_values(target_service=service_name, database=database)
|
65
|
-
# s = mysql.OptimizeDatas(username=username, password=password, host=host, port=port)
|
66
|
-
# s.db_name_lists = [
|
67
|
-
# '聚合数据',
|
68
|
-
# ]
|
69
|
-
# s.days = days
|
70
|
-
# s.optimize_list()
|
71
|
-
for service_database in service_databases:
|
72
|
-
for service_name, database in service_database.items():
|
73
|
-
if socket.gethostname() == 'xigua_lx' or socket.gethostname() == 'xigua1' or socket.gethostname() == 'Mac2.local':
|
74
|
-
# mongodb
|
75
|
-
if is_mongo and database == 'mongodb':
|
76
|
-
username, password, host, port = get_myconf.select_config_values(
|
77
|
-
target_service=service_name,
|
78
|
-
database=database,
|
79
|
-
)
|
80
|
-
m = mongo.OptimizeDatas(username=username, password=password, host=host, port=port)
|
81
|
-
m.db_name_lists = db_name_lists
|
82
|
-
m.days = days
|
83
|
-
m.optimize_list()
|
84
|
-
if m.client:
|
85
|
-
m.client.close()
|
86
|
-
print(f'已关闭 mongodb 连接')
|
87
|
-
|
88
|
-
if socket.gethostname() == 'xigua_lx':
|
89
|
-
restart_mongodb() # mongodb 太占内存了, 重启服务, 释放内存
|
90
76
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
s.db_name_lists = db_name_lists
|
99
|
-
s.days = days
|
100
|
-
s.optimize_list()
|
77
|
+
if socket.gethostname() == 'xigua_lx' or socket.gethostname() == 'xigua1' or socket.gethostname() == 'mac2.local':
|
78
|
+
# Mysql
|
79
|
+
if is_mysql:
|
80
|
+
s = mysql.OptimizeDatas(username=username, password=password, host=host, port=port)
|
81
|
+
s.db_name_lists = db_name_lists
|
82
|
+
s.days = days
|
83
|
+
s.optimize_list()
|
101
84
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
s = mysql.OptimizeDatas(username=username, password=password, host=host, port=port)
|
110
|
-
s.db_name_lists = db_name_lists
|
111
|
-
s.days = days
|
112
|
-
s.optimize_list()
|
85
|
+
elif socket.gethostname() == 'company':
|
86
|
+
# Mysql
|
87
|
+
if is_mysql:
|
88
|
+
s = mysql.OptimizeDatas(username=username, password=password, host=host, port=port)
|
89
|
+
s.db_name_lists = db_name_lists
|
90
|
+
s.days = days
|
91
|
+
s.optimize_list()
|
113
92
|
|
114
93
|
|
115
94
|
if __name__ == '__main__':
|
116
|
-
op_data(db_name_lists=['聚合数据'],
|
95
|
+
op_data(db_name_lists=['聚合数据'], days=10, is_mongo=True, is_mysql=True)
|