mdbq 3.3.4__py3-none-any.whl → 3.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/query_data.py +2 -2
- mdbq/mongo/mongo.py +24 -22
- mdbq/mysql/mysql.py +22 -22
- mdbq/mysql/recheck_mysql.py +1 -1
- mdbq/spider/aikucun.py +2 -24
- {mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/METADATA +1 -1
- {mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/RECORD +9 -20
- mdbq/aggregation/df_types.py +0 -188
- mdbq/aggregation/mysql_types.py +0 -240
- mdbq/clean/__init__.py +0 -4
- mdbq/clean/clean_upload.py +0 -1350
- mdbq/clean/data_clean.py +0 -1551
- mdbq/company/__init__.py +0 -4
- mdbq/company/copysh.py +0 -447
- mdbq/config/get_myconf.py +0 -131
- mdbq/config/update_conf.py +0 -102
- mdbq/req_post/__init__.py +0 -4
- mdbq/req_post/req_tb.py +0 -624
- {mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/WHEEL +0 -0
- {mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/top_level.txt +0 -0
mdbq/aggregation/mysql_types.py
DELETED
@@ -1,240 +0,0 @@
|
|
1
|
-
# -*- coding:utf-8 -*-
|
2
|
-
import warnings
|
3
|
-
import pandas as pd
|
4
|
-
import os
|
5
|
-
import platform
|
6
|
-
import json
|
7
|
-
import pymysql
|
8
|
-
import socket
|
9
|
-
from mdbq.mongo import mongo
|
10
|
-
from mdbq.mysql import mysql
|
11
|
-
from mdbq.mysql import s_query
|
12
|
-
from mdbq.config import myconfig
|
13
|
-
from mdbq.config import set_support
|
14
|
-
from mdbq.dataframe import converter
|
15
|
-
import datetime
|
16
|
-
import time
|
17
|
-
import re
|
18
|
-
|
19
|
-
from sqlalchemy.dialects.postgresql.pg_catalog import pg_get_serial_sequence
|
20
|
-
|
21
|
-
# Installs an "ignore" filter with no category restriction, so every warning
# is suppressed process-wide as soon as this module is imported.
warnings.filterwarnings('ignore')
|
22
|
-
"""
|
23
|
-
1. 记录 dataframe 或者数据库的列信息(dtypes)
|
24
|
-
2. 更新 mysql 中所有数据库的 dtypes 信息到本地 json
|
25
|
-
"""
|
26
|
-
|
27
|
-
|
28
|
-
class DataTypes:
    """
    Registry of column data types (dtypes) for dataframes or database tables.

    The registry is a nested dict that is persisted as a local JSON file:
    category (e.g. ``dataframe`` / ``mysql``) -> database name -> table
    (collection) name -> column name -> data type. The reserved top-level key
    ``_json统计`` holds summary counters and is not a category.
    """

    def __init__(self, path=None, service_name=None):
        """
        Open (creating it first if necessary) the JSON registry file and load it.

        Args:
            path: Directory holding the JSON file. When falsy, the ``dirname``
                of ``set_support.SetSupport(dirname='support')`` is used.
            service_name: Suffix of the file name
                ``mysql_types_<service_name>.json``. Defaults to ``'xigua_lx'``
                when falsy.
        """
        self.datas = {
            '_json统计': {
                '分类': 0,
                '数据库量': 0,
                '集合数量': 0,
                '字段量': 0,
                '数据简介': '记录数据库各表的数据类型信息',
            }
        }
        self.path = path or set_support.SetSupport(dirname='support').dirname
        self.service_name = service_name or 'xigua_lx'
        self.json_file = os.path.join(self.path, f'mysql_types_{self.service_name}.json')
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(self.path, exist_ok=True)
        if not os.path.isfile(self.json_file):
            # Seed a new file with the empty statistics header.
            with open(self.json_file, 'w', encoding='utf-8_sig') as f:
                json.dump(self.datas, f, ensure_ascii=False, sort_keys=True, indent=4)
        self.json_before()

    def json_before(self):
        """ Load the dtypes stored in the local JSON file into ``self.datas``. """
        with open(self.json_file, 'r', encoding='utf-8_sig') as f:
            self.datas.update(json.load(f))

    def get_mysql_types(self, cl, dtypes, db_name, table_name, is_file_dtype=True):
        """
        Merge freshly collected dtypes into ``self.datas`` and refresh the counters.

        Nothing is written to disk here; call ``as_json_file`` to persist.

        Args:
            cl: Category key, e.g. ``'mysql'``.
            dtypes: Nested dict ``{cl: {db_name: {table_name: {column: type}}}}``.
                When ``is_file_dtype`` is true and the database is already
                known, the matching level of this dict is updated in place with
                the stored values.
            db_name: Database the new information belongs to.
            table_name: Table (collection) the new information belongs to.
            is_file_dtype: True lets values already stored (e.g. hand-edited in
                the JSON file) win over ``dtypes``; False lets ``dtypes`` win.

        Raises:
            KeyError: If ``dtypes`` lacks the ``cl`` / ``db_name`` /
                ``table_name`` level that has to be merged.
        """
        if cl not in self.datas:
            # Unknown category: adopt everything the caller supplied.
            self.datas.update(dtypes)
        elif db_name not in self.datas[cl]:
            # Known category, unknown database: nothing stored to protect.
            self.datas[cl].update(dtypes[cl])
        else:
            stored_db = self.datas[cl][db_name]
            new_db = dtypes[cl][db_name]
            # Merge column-wise when the table is known, table-wise otherwise.
            if table_name in stored_db:
                stored, new = stored_db[table_name], new_db[table_name]
            else:
                stored, new = stored_db, new_db
            if is_file_dtype:
                # Stored values take priority: overlay them on the new data so
                # the update below only contributes keys that were missing.
                new.update(stored)
            stored.update(new)

        # Recompute the summary counters, skipping the statistics header itself.
        categories = [v for k, v in self.datas.items() if k != '_json统计']
        databases = [db for category in categories for db in category.values()]
        tables = [table for db in databases for table in db.values()]
        tips = {
            '分类': len(categories),
            '数据库量': len(databases),
            '集合数量': len(tables),
            '字段量': sum(len(table) for table in tables),
        }
        self.datas['_json统计'].update(tips)

    def as_json_file(self):
        """ Write ``self.datas`` to the local JSON file. """
        with open(self.json_file, 'w', encoding='utf-8_sig') as f:
            json.dump(
                self.datas,
                f,
                ensure_ascii=False,  # keep non-ASCII text readable instead of \uXXXX escapes
                sort_keys=True,  # deterministic key order in the file
                indent=4,
            )
        print(f'已更新 json 文件: {self.json_file}')
        time.sleep(1)

    def load_dtypes(self, db_name, table_name, cl='mysql'):
        """
        Look up the stored dtypes of one table.

        This method never refreshes the registry itself; on a miss it prints a
        message and reports which level was missing.

        Args:
            db_name: Database name.
            table_name: Table (collection) name.
            cl: Category key, ``'mysql'`` by default.

        Returns:
            A 4-tuple ``(dtypes, cl, db_name, table_name)``. On a hit it is
            ``(stored_dtypes, None, None, None)``. On a miss ``dtypes`` is
            ``{}`` and the remaining items name the levels that were checked,
            with ``None`` for the levels that were not reached.
        """
        if cl not in self.datas:
            print(f'不存在的数据分类: {cl}, 文件位置: {self.json_file}')
            return {}, cl, None, None
        if db_name not in self.datas[cl]:
            print(f'不存在的数据库信息: {db_name}, 文件位置: {self.json_file}')
            return {}, cl, db_name, None
        if table_name not in self.datas[cl][db_name]:
            print(f'不存在的集合名信息: {table_name}, 文件位置: {self.json_file}')
            return {}, cl, db_name, table_name
        return self.datas[cl][db_name][table_name], None, None, None
|
147
|
-
|
148
|
-
|
149
|
-
def mysql_all_dtypes(db_name=None, table_name=None, path=None):
    """
    Refresh the local dtypes JSON file from the column types found on the MySQL server.

    Connection settings are read from ``myconfig.main()`` and selected by the
    local hostname. On an unrecognised host, or when the selected settings
    have no username, the function returns without doing anything.

    Args:
        db_name: Only refresh this database. When omitted, every non-system
            database is refreshed.
        table_name: Together with ``db_name``, only refresh this one table.
            It is ignored when ``db_name`` is not given.
        path: Directory of the JSON file, passed through to ``DataTypes``.
    """
    conf = myconfig.main()
    hostname = socket.gethostname()
    # service_name selects the config section and the JSON file name
    # (mysql_types_<service_name>.json).
    if hostname in ('xigua_lx', 'xigua1', 'MacBookPro'):
        service_name = 'xigua_lx'
    elif hostname in ('company', 'Mac2.local'):
        service_name = 'company'
    else:
        return
    data = conf['Windows'][service_name]['mysql']['local']
    username, password, host, port = data['username'], data['password'], data['host'], data['port']
    if not username:
        return

    config = {
        'host': host,
        'port': int(port),
        'user': username,
        'password': password,
        'charset': 'utf8mb4',  # utf8mb4 can store four-byte UTF-8 characters
        'cursorclass': pymysql.cursors.DictCursor,
    }
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW DATABASES;")
            db_name_lists = [item['Database'] for item in cursor.fetchall()]
    finally:
        # Release the connection even if the query fails.
        connection.close()

    sys_lists = ('information_schema', 'mysql', 'performance_schema', 'sakila', 'sys')
    db_name_lists = [item for item in db_name_lists if item not in sys_lists]
    if db_name:
        # Only the requested database can contribute, so skip connecting to
        # (and sleeping after) every other one.
        db_name_lists = [item for item in db_name_lists if item == db_name]

    tables = []  # (database, table) pairs, e.g. [('云电影', '电影更新'), ('生意经3', 'e3_零售明细统计')]
    for db_ in db_name_lists:
        connection = pymysql.connect(**dict(config, database=db_))
        try:
            with connection.cursor() as cursor:
                cursor.execute("SHOW TABLES;")
                for row in cursor.fetchall():
                    # Each row is a one-column dict whose value is the table name.
                    tables.extend((db_, name) for name in row.values())
        except Exception as e:
            # Best effort: report the failing database and carry on with the rest.
            print(f'获取数据表列表失败 数据库: < {db_} >, 错误: {e}')
        finally:
            connection.close()
        time.sleep(0.5)

    d = DataTypes(path=path, service_name=service_name)
    for db_n, table_n in tables:
        # The database filter was applied above; only the table filter remains.
        if db_name and table_name and table_name != table_n:
            continue
        print(f'获取列信息 数据库: < {db_n} >, 数据表: < {table_n} >')
        sq = s_query.QueryDatas(username=username, password=password, host=host, port=port)
        # Each returned item is read below via its COLUMN_NAME / COLUMN_TYPE keys.
        name_type = sq.dtypes_to_list(db_name=db_n, table_name=table_n)
        if name_type:
            columns = {item['COLUMN_NAME']: item['COLUMN_TYPE'] for item in name_type}
            d.get_mysql_types(
                dtypes={'mysql': {db_n: {table_n: columns}}},
                cl='mysql',
                db_name=db_n,
                table_name=table_n,
                is_file_dtype=True,  # values already in the JSON file take priority
            )
        else:
            print('数据库回传数据(name_type)为空')
    # Persist once after all tables have been merged.
    d.as_json_file()
|
234
|
-
|
235
|
-
|
236
|
-
if __name__ == '__main__':
    # Script entry point: no database or table filter is passed, so the dtypes
    # of everything on the server are written to a JSON file in this directory.
    mysql_all_dtypes(path='/Users/xigua/Downloads')
|
mdbq/clean/__init__.py
DELETED