mdbq 3.7.6__py3-none-any.whl → 3.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/optimize_data.py +0 -46
- {mdbq-3.7.6.dist-info → mdbq-3.7.8.dist-info}/METADATA +1 -1
- {mdbq-3.7.6.dist-info → mdbq-3.7.8.dist-info}/RECORD +5 -7
- mdbq/aggregation/aggregation_bak.py +0 -1438
- mdbq/mongo/mongo.py +0 -729
- {mdbq-3.7.6.dist-info → mdbq-3.7.8.dist-info}/WHEEL +0 -0
- {mdbq-3.7.6.dist-info → mdbq-3.7.8.dist-info}/top_level.txt +0 -0
mdbq/mongo/mongo.py
DELETED
@@ -1,729 +0,0 @@
# -*- coding:utf-8 -*-
import datetime
import os
import re
import warnings
import time
import pandas as pd
import numpy as np
import pymongo
from functools import wraps
import socket
import platform
from concurrent.futures import ThreadPoolExecutor
from mdbq.config import myconfig
from mdbq.dataframe import converter

warnings.filterwarnings('ignore')
if socket.gethostname() == 'company' or socket.gethostname() == 'Mac2.local':
    conf = myconfig.main()
    conf_data = conf['Windows']['xigua_lx']['mysql']['remoto']
    username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
        'port']
else:
    conf = myconfig.main()
    conf_data = conf['Windows']['company']['mysql']['remoto']
    username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
        'port']


def rename_col(username, password, host, db_name, collection_name, old_name, new_name, port: int = 27017,):
    """ Rename a field of a collection in the MongoDB data source """
    # connect to MongoDB
    _link = f'mongodb://{username}:{password}@{host}:{port}/'
    client = pymongo.MongoClient(_link)
    db = client[db_name]  # database
    collection = db[collection_name]  # collection

    rename_operation = {"$rename": {old_name: new_name}}  # the rename operation

    collection.update_many({}, rename_operation)
    if new_name == '日期':
        collection.create_index([(new_name, -1)], background=True)  # required: create an index


class CreateUser:
    """
    Create MongoDB accounts (including admin accounts) from Python
    """
    def __init__(self, username, password, host, port: int = 27017):
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.link = f'mongodb://{self.username}:{self.password}@{self.host}:{self.port}/'
        self.client = None

        self.db_roles = [{'市场数据2': 'read'}]
        self.user_infos = []  # existing users
        self.root = False
        self.add_permission = True  # when updating roles, add by default; set to False to remove roles instead

    @staticmethod
    def try_except(func):  # exception-handling decorator defined inside the class
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'{func.__name__}, {e}')  # report the exception

        return wrapper

    @try_except
    def create_user(self):
        """
        role: read (read-only), readAnyDatabase (read all), readWriteAnyDatabase (read/write all), userAdminAnyDatabase (user administration)
        """

        self.client = pymongo.MongoClient(self.link)  # connect
        db = self.client['admin']  # switch to the admin database
        users = db.system.users.find()  # fetch all users
        for user in users:
            self.user_infos.append({user['user']: user['roles']})

        add_roles = []
        for db_role in self.db_roles:
            for key, value in db_role.items():
                add_roles.append({
                    'role': value,
                    'db': key
                })
        # root_roles = ['root'],  # root user; normally do not create one
        root_roles = [
            {'role': 'userAdminAnyDatabase', 'db': 'admin'},  # user administration on every database
            {'role': 'readWriteAnyDatabase', 'db': 'admin'}  # read/write on every database
        ]

        user_list = []  # existing user names
        i = 0
        for user_info in self.user_infos:
            for key, value in user_info.items():
                user_list.append(key)
                if self.username == key:
                    print(f'{self.username}: user already exists, roles: {value}')
                    if self.root:
                        print('Upgrading to admin in place is not supported; delete the user first, then recreate it with self.root = True')
                    if self.add_permission:  # add roles
                        roles = value + add_roles
                    else:  # remove roles
                        roles = [item for item in value if item['db'] != add_roles[0]['db']]
                    db.command('updateUser', self.username, roles=roles)  # update the roles
                    i += 1
                    break
        if self.root:  # create a super admin
            db.command(command='createUser', value=self.username, pwd=self.password, roles=root_roles)
            print(f'Admin user created: {self.username}, roles: {root_roles}')
            self.client.close()
            return
        if i > 0:
            self.client.close()
            return
        admin_user = db.command(command='createUser', value=self.username, pwd=self.password, roles=add_roles)
        if admin_user['ok'] > 0:
            print(f'Regular user created: {self.username}, roles: {add_roles}')
        self.client.close()

    def delete_user(self):
        """ Delete the user given by self.username """
        self.client = pymongo.MongoClient(self.link)  # connect
        db = self.client['admin']  # switch to the admin database
        users = db.system.users.find()  # fetch all users
        for user in users:
            self.user_infos.append({user['user']: user['roles']})

        user_list = []  # existing user names
        i = 0
        for user_info in self.user_infos:
            for key, value in user_info.items():
                user_list.append(key)
                if self.username == key:
                    db.command("dropUser", self.username)
                    print(f'User deleted: {self.username}')
                    i += 1
        if i == 0:
            print(f'User does not exist: {self.username}, nothing to delete')
        self.client.close()


class DownMongo:
    """ Download data """
    def __init__(self, save_path, username, password, host, port: int = 27017):
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.link = f'mongodb://{self.username}:{self.password}@{self.host}:{self.port}/'
        self.client = None
        self.db_name = None
        self.collection_name = None
        self.days = 5
        self.start_date = None
        self.end_date = datetime.datetime.now()
        self.save_path = save_path
        self.projection = {'_id': 0}  # fields to read (projection)

    def data_to_df(self, db_name, collection_name, projection: dict):
        self.client = pymongo.MongoClient(self.link)  # connect
        self.db_name = db_name
        self.collection_name = collection_name
        collection = self.client[self.db_name][self.collection_name]  # connect to the collection
        if not self.start_date:
            self.start_date = datetime.datetime.now() - datetime.timedelta(days=self.days)
            self.end_date = datetime.datetime.now()
        else:
            self.start_date = pd.to_datetime(self.start_date)  # normalize the dates
            self.end_date = pd.to_datetime(self.end_date)
        # print(self.start_date, '->', self.end_date)

        self.projection.update(projection)  # restrict the fields
        pipeline = [
            {'$match': {'日期': {'$gte': self.start_date, '$lte': self.end_date}}},
            {'$project': projection},
        ]
        results = collection.aggregate(pipeline)
        # print(results)
        # collect the results
        datas = []
        for doc in results:
            # print(doc)
            datas.append(doc)
        if len(datas) == 0:
            return pd.DataFrame()
        df = pd.DataFrame(datas)
        for col in df.columns.tolist():
            if '日期' in col:
                try:
                    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d', errors='ignore')  # convert date columns
                except ValueError as v:
                    print(f'{col}: {v}')
            else:
                df[col] = pd.to_numeric(df[col], errors='ignore').fillna(0)  # try to convert dtypes
        # self.client.close()
        return df

    def data_to_file(self, file_type, db_name, collection_name):
        """
        Helper used by the GUI.
        Saves MongoDB data to a local file.
        db_name: database name
        collection_name: collection name
        file_type: output format: csv, json, xlsx, xls
        """
        self.client = pymongo.MongoClient(self.link)  # connect
        self.db_name = db_name
        self.collection_name = collection_name
        _collection = self.client[self.db_name][self.collection_name]  # connect to the collection
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
        if not self.start_date:
            print(f'{now}Downloading ({self.host}) {self.db_name}: {self.collection_name}, range: last {self.days} days\n...')
        else:
            print(f'{now}Downloading ({self.host}) {self.db_name}: {self.collection_name}, range: {self.start_date}~{self.end_date}')

        if not self.start_date:
            self.start_date = datetime.datetime.now() - datetime.timedelta(days=self.days)
            self.end_date = datetime.datetime.now()
        else:
            self.start_date = pd.to_datetime(self.start_date)  # normalize the dates
            self.end_date = pd.to_datetime(self.end_date)
        pipeline = [
            {'$match': {'日期': {'$gte': self.start_date, '$lte': self.end_date}}},
            {'$project': {'_id': 0}},  # drop the _id field
        ]
        results = _collection.aggregate(pipeline)

        # collect the results
        datas = []
        for doc in results:
            datas.append(doc)
        _df = pd.DataFrame(datas)
        if len(_df) == 0:
            print(f'Rows returned: {len(_df)}, nothing to export')
            self.client.close()
            return
        if '_id' in _df.columns.tolist():
            _df.drop('_id', axis=1, inplace=True)

        print(f'Rows returned: {len(_df)}')
        cv = converter.DataFrameConverter()
        _df = cv.convert_df_cols(_df)
        s_date = re.findall(r'(\d{4}-\d{2}-\d{2})', str(_df['日期'].values.min()))[0]
        e_date = re.findall(r'(\d{4}-\d{2}-\d{2})', str(_df['日期'].values.max()))[0]
        if not file_type.startswith('.'):
            file_type = '.' + file_type
        _path = os.path.join(self.save_path, f'{self.db_name}_{self.collection_name}_{s_date}_{e_date}{file_type}')
        if file_type.endswith('json'):
            _df.to_json(_path, orient='records', force_ascii=False)
        elif file_type.endswith('csv'):
            _df.to_csv(_path, encoding='utf-8_sig', index=False, header=True)
        elif file_type.endswith('xlsx') or file_type.endswith('xls'):
            _df.to_excel(_path, index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))  # freeze the header row
        else:
            print(f'{file_type}: unsupported file type')
        print(f'<{self.collection_name}> exported: {_path}, done!')
        self.client.close()


class UploadMongo:
    """
    Upload/update the database.
    Two modes: uploading raw files, and uploading pandas data sources.
    If df_to_mongo is called on its own, the connection must be closed manually afterwards.
    self.drop_duplicates: raw files do not need old data removed; pandas sources should remove old data first.
    """

    def __init__(self, username, password, host, port: int = 27017, drop_duplicates=False):
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.link = f'mongodb://{self.username}:{self.password}@{self.host}:{self.port}/'
        self.client = None
        self.db_name = None  # target database name
        self.collection_name = None  # target collection name; this class uses the folder or file name as the collection name
        self.data_days = 5  # only update recent data; keep this small. Mainly for pandas sources, do not set otherwise
        self.start_date = None
        self.encoding = 'utf-8_sig'
        self.drop_duplicates = drop_duplicates

    @staticmethod
    def try_except(func):  # exception-handling decorator defined inside the class
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'{func.__name__}, {e}')  # report the exception

        return wrapper

    @try_except
    def upload_file(self, path):
        if '.DS_Store' in path or '.ini' in path or 'desktop' in path or 'baiduyun' in path or 'xunlei' in path:
            return
        if not path.endswith('csv') or path.endswith('年.csv'):  # skip specific files
            return
        df = pd.read_csv(path, encoding=self.encoding, header=0, na_filter=False)
        if '日期' in df.columns.tolist():
            df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x) if x else pd.to_datetime('2099-01-01'))
            self.start_date = pd.to_datetime(datetime.date.today() - datetime.timedelta(days=self.data_days))
            df = df[df['日期'] >= self.start_date]
            if len(df) == 0:
                # some cross-month reports have no recent rows, so widen the window to the last 35 days
                df = df[df['日期'] >= pd.to_datetime(datetime.date.today() - datetime.timedelta(35))]
            else:
                df = df[df['日期'] >= self.start_date]  # keep rows at or after that point

        if len(df) == 0:  # still empty, nothing to upload
            return
        self.df_to_mongo(df=df)

    @try_except
    def upload_dir(self, path):
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if str(self.collection_name) not in name:  # in theory the folder name is always part of the file name
                    continue
                new_path = os.path.join(root, name)
                self.upload_file(new_path)

    def upload_pandas(self, upload_path, select_files: str = None, skip_files: str = '其他数据'):
        """
        Upload pandas data sources to the database, skipping '其他数据' and '京东数据集'.
        db_name must be checked; collection is not.
        select_files: only update matching files
        skip_files: skip matching files
        """
        if not self.db_name:
            print(f' {self.host}/{self.port} self.db_name is not set ')
            return

        pd_files = os.listdir(upload_path)
        for file in pd_files:
            if select_files:
                if select_files not in file:
                    continue
            if skip_files:
                if skip_files in file:
                    continue
            path = os.path.join(upload_path, file)
            if os.path.isfile(path):  # path is a single file
                self.collection_name = f'{os.path.splitext(file)[0]}_f'
                self.upload_file(path=path)
            elif os.path.isdir(path):  # path is a directory
                if '其他数据' in path or '京东数据集' in path:
                    continue  # skipped directories
                self.collection_name = f'{os.path.splitext(file)[0]}'
                self.upload_dir(path=path)

    @staticmethod
    def split_list(lst, _num=None):
        """
        Split a list into _num roughly equal parts
        """
        length = len(lst)
        if not _num:
            if length > 20000:
                _num = 30
            elif length > 10000:
                _num = 20
            elif length > 1000:
                _num = 15
            elif length > 200:
                _num = 5
            else:
                _num = 2
        if length % _num == 0:
            # print(length, _num)
            sublist_length = length // _num
            return [lst[i:i + sublist_length] for i in range(0, length, sublist_length)]
        else:
            sublist_length = length // _num
            extra = length % _num
            return [lst[i * sublist_length:i * sublist_length + sublist_length] for i in range(_num)] + \
                [lst[-extra:]]  # append the remaining documents at the end

    @staticmethod
    def duplicates_list(_datas):
        """
        <mongodb> Deduplicate _datas; large inputs consume a lot of system resources
        """
        if len(_datas) > 100 * 1000:
            print(f'The input is very large and may consume a lot of resources, run with care!!! {len(_datas)}')
        _my_list = []
        for _data in _datas:
            if _data in _my_list:
                continue
            else:
                _my_list.append(_data)
        return _my_list

    def df_to_mongo(self, df, db_name=None, collection_name=None):
        """
        self.db_name and self.collection_name must be set.
        df: data to insert, as a DataFrame
        db_name and collection_name can be set at init time or passed to this method
        """

        if db_name:
            self.db_name = db_name
        if collection_name:
            self.collection_name = collection_name
        if not self.db_name or not self.collection_name:
            print(f'{self.host}/{self.port} self.db_name/collection not set: {self.db_name}/{self.collection_name}')
            return

        self.db_name = re.sub(r'[\',,()()/=<>+\-*^"’\[\]~#|&% .]', '_', self.db_name)
        self.collection_name = re.sub(r'[\',,()()/=<>+\-*^"’\[\]~#|&% .]', '_', self.collection_name)
        self.client = pymongo.MongoClient(self.link)
        collections = self.client[self.db_name][self.collection_name]  # connect to the collection
        start_date = None
        end_date = None

        cv = converter.DataFrameConverter()
        df = cv.convert_df_cols(df=df)  # strip illegal characters from column names
        if '日期' in df.columns.tolist():
            # df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x))
            collections.create_index([('日期', -1)], background=True)  # required: create an index; background avoids blocking
            start_date = pd.to_datetime(df['日期'].values.min())
            end_date = pd.to_datetime(df['日期'].values.max())

        # for col in df.columns.tolist():  # convert every non-date column to str before uploading
        #     if '日期' not in col:
        #         df[col] = df[col].astype(str)

        datas = df.to_dict('records')  # data to insert, [dict, dict, ....]

        new_list = self.split_list(datas, )
        # new_list: pool.map only accepts an iterable, so the collection handle is packed into each item
        # new_list: [[_collection, [dict, dict, ...]], [_collection, [dict, dict, ...]]]
        new_list = [[collections, item] for item in new_list]

        def delete_data(data_list):
            """ data_list: [_collection, [dict, dict, ...]]
            delete_many takes a dict document, so iterate over the second element of data_list """
            for my_datas in data_list[1]:
                data_list[0].delete_many(my_datas)

        if self.drop_duplicates:
            # if there is a date column, delete old data by date range; otherwise delete the old data directly
            if '日期' in df.columns.tolist():
                query = {
                    '日期': {
                        '$gte': start_date,
                        '$lt': end_date + datetime.timedelta(days=1)
                    }
                }
                collections.delete_many(query)
            else:
                with ThreadPoolExecutor() as pool:  # delete duplicate data
                    pool.map(delete_data, new_list)

        def insert_data(data_list):  # insert_many accepts a list of dicts directly
            data_list[0].insert_many(data_list[1])

        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
        print(f'{now}Updating mongoDB ({self.host}:{self.port}) {self.db_name}/{self.collection_name}')

        with ThreadPoolExecutor() as pool:  # insert new data
            pool.map(insert_data, new_list)

        self.client.close()


class OptimizeDatas:
    """
    Data maintenance: remove redundant MongoDB data.
    Process:
    1. Read all databases and collections.
    2. Iterate over every collection and its columns; if a date column exists, walk the dates day by day, otherwise read the whole collection.
    3. Delete redundant data day by day (when a date column exists).
    Tip: duplicates are found with a temporary accumulator; documents are read one by one and added to it, a duplicate's id is recorded in a list, and everything in that list is deleted.
    """
    def __init__(self, username: str, password: str, host: str, port: int, drop_duplicates=False):
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.link = f'mongodb://{self.username}:{self.password}@{self.host}:{self.port}/'
        self.client = None
        self.db_name = None  # database name
        self.db_name_lists = []  # databases to process when removing duplicates
        self.collection_name = None  # collection name; in practice the folder or file name is used
        self.days: int = 60  # process the last N days of data
        self.end_date = None
        self.start_date = None

    @staticmethod
    def try_except(func):  # exception-handling decorator defined inside the class
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'{func.__name__}, {e}')  # report the exception

        return wrapper

    # @try_except
    def optimize_list(self):
        """
        Remove redundant data from several databases.
        Requires self.db_name_lists to be set.
        """
        if not self.db_name_lists:
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
            print(f'{now}self.db_name_lists is not set')
            return
        for db_name in self.db_name_lists:
            self.db_name = db_name
            self.optimize()

    # @try_except
    def my_collection_names(self, db_name) -> list:
        """ List every collection of the given database """
        database_names = self.client.list_database_names()  # all database names
        if db_name not in database_names:
            print(f'{self.host}/{self.port} databases: {database_names}, no such database: {db_name}')
        results = self.client[db_name].list_collection_names()
        return results

    # @try_except
    def optimize(self):
        """ Walk the collections of self.db_name and remove duplicates """
        if not self.db_name:
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
            print(f'{now}{self.host}/{self.port} self.db_name is not set')
            return
        self.client = pymongo.MongoClient(self.link)  # connect
        database_names = self.client.list_database_names()  # all database names
        if self.db_name not in database_names:
            print(f'{self.host}/{self.port} databases: {database_names}, no such database: {self.db_name}')
            return

        # initialize the date range
        if not self.end_date:
            self.end_date = pd.to_datetime(datetime.datetime.today())
        else:
            self.end_date = pd.to_datetime(self.end_date)
        if self.days:
            self.start_date = pd.to_datetime(self.end_date - datetime.timedelta(days=self.days))
        if not self.start_date:
            self.start_date = self.end_date
        else:
            self.start_date = pd.to_datetime(self.start_date)
        start_date_before = self.start_date
        end_date_before = self.end_date
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
        print(f'{now}mongodb({self.host}: {self.port}) {self.db_name} optimizing (window: {self.days} days)...')

        collections = self.my_collection_names(db_name=self.db_name)  # all collection names
        for collection_name in collections:
            collection = self.client[self.db_name].get_collection(collection_name)
            # check whether the collection has a date field
            has_date_field = collection.find_one({'日期': {'$exists': True}}) is not None
            if not has_date_field:  # no date field: process the whole collection
                self.delete_duplicate2(collection_name=collection_name)
                continue
            pipeline = [
                {"$group": {"_id": None, "min_date": {"$min": "$日期"}, "max_date": {"$max": "$日期"}}}
            ]
            results = collection.aggregate(pipeline)
            for result in results:  # {'_id': None, 'min_date': datetime.datetime(2023, 1, 1, 0, 0)}
                start_date = pd.to_datetime(result['min_date'])  # earliest date in this collection
                end_date = pd.to_datetime(result['max_date'])  # latest date in this collection
                if self.start_date < start_date:  # clamp to a suitable start and end date
                    self.start_date = start_date
                if self.end_date > end_date:
                    self.end_date = end_date
                break
            # print(collection_name, self.start_date, start_date, self.end_date, end_date)
            dates_list = self.day_list(start_date=self.start_date, end_date=self.end_date)
            for date in dates_list:
                self.delete_duplicate(collection_name=collection_name, date=date)
            self.start_date = start_date_before  # reset, otherwise the dates drift between collections
            self.end_date = end_date_before

        # self.client.close()  # disconnect
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
        print(f'{now}mongodb({self.host}: {self.port}) {self.db_name} optimization finished!')

    def delete_duplicate(self, collection_name, date,):
        """ Deduplicate one day of data when the collection has a date field """
        collection = self.client[self.db_name].get_collection(collection_name)
        pipeline = [
            {'$match': {'日期': {'$gte': date, '$lte': date}}},
            # {'$project': {'_id': 0}},  # drop the _id field
        ]
        docs = collection.aggregate(pipeline)
        datas = []
        for doc in docs:
            datas.append(doc)
        duplicate_id = []  # ids of duplicated documents
        all_datas = []  # accumulator
        for data in datas:
            delete_id = data['_id']
            del data['_id']
            data = re.sub(r'\.0+\', ', '\', ', str(data))  # normalize away trailing zeros after the decimal point
            if data in all_datas:  # duplicate found
                duplicate_id.append(delete_id)  # record its id
                continue
            all_datas.append(data)  # not a duplicate
        del all_datas

        if not duplicate_id:  # nothing duplicated, skip
            return
        collection.delete_many({'_id': {'$in': duplicate_id}})
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
        print(f'{now}{collection_name} -> {date.strftime("%Y-%m-%d")} '
              f'before: {len(datas)}, remove: {len(duplicate_id)}')

    def delete_duplicate2(self, collection_name,):
        """ Deduplicate a whole collection when there is no date field """
        collection = self.client[self.db_name].get_collection(collection_name)
        docs = collection.find({})
        datas = []
        for doc in docs:
            datas.append(doc)
        duplicate_id = []  # ids of duplicated documents
        all_datas = []  # accumulator
        for data in datas:
            delete_id = data['_id']
            del data['_id']
            data = re.sub(r'\.0+\', ', '\', ', str(data))  # normalize away trailing zeros after the decimal point
            if data in all_datas:  # duplicate found
                duplicate_id.append(delete_id)  # record its id
                continue
            all_datas.append(data)  # not a duplicate
        del all_datas

        if not duplicate_id:  # nothing duplicated, skip
            return
        collection.delete_many({'_id': {'$in': duplicate_id}})
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
        print(f'{now}{collection_name} -> before: {len(datas)}, remove: {len(duplicate_id)}')

    def get_collection_datas_bak(self, db_name, collection_name):
        database_names = self.client.list_database_names()  # all database names
        if db_name not in database_names:
            print(f'{self.host}/{self.port} databases: {database_names}, no such database: {db_name}')
        collection = self.client[self.db_name].get_collection(collection_name)
        batch_size = 1000  # batch size
        cursor = collection.find().batch_size(batch_size)  # get a cursor
        results = []
        for doc in cursor:
            results.append(doc)
        return results

    def day_list(self, start_date, end_date):
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        date_list = []
        while start_date <= end_date:
            date_list.append(pd.to_datetime(start_date.date()))
            start_date += datetime.timedelta(days=1)
        return date_list

    def rename_column(self):
        """ Batch-rename fields across a database """
        """
        # for db_name in ['京东数据2', '推广数据2', '市场数据2', '生意参谋2', '生意经2', '属性设置2',]:
        #     s = OptimizeDatas(username=username, password=password, host=host, port=port)
        #     s.db_name = db_name
        #     s.rename_column()
        #     s.client.close()
        """
        self.client = pymongo.MongoClient(self.link)  # connect
        database_names = self.client.list_database_names()  # all database names
        collections = self.my_collection_names(db_name=self.db_name)  # all collection names
        for collection_name in collections:
            collection = self.client[self.db_name].get_collection(collection_name)
            has_date_field = collection.find_one({})
            for key, value in has_date_field.items():
                if key.endswith('_'):
                    new_name = re.sub(r'_+$', '', key)
                    query = {key: {'$exists': True}}
                    update = {'$rename': {key: new_name}}
                    collection.update_many(query, update)


def upload_one_dir():
    if socket.gethostname() == 'company' or socket.gethostname() == 'Mac2.local':
        conf = myconfig.main()
        conf_data = conf['Windows']['xigua_lx']['mysql']['remoto']
        username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
            'port']
    else:
        conf = myconfig.main()
        conf_data = conf['Windows']['company']['mysql']['remoto']
        username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
            'port']

    p = UploadMongo(username=username, password=password, host=host, port=port, drop_duplicates=False)
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
    print(f'{now}Processing data...')


def main():
    pass


if __name__ == '__main__':
    # main()
    print(username, password, host, port)

    # for db_name in [
    #     '京东数据2',
    #     '推广数据2',
    #     '市场数据2',
    #     '生意参谋2',
    #     '生意经2',
    #     '属性设置2',
    # ]:
    #     s = OptimizeDatas(username=username, password=password, host=host, port=port)
    #     s.db_name = db_name
    #     s.rename_column()
    #     s.client.close()
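For anyone still depending on the removed module, a minimal sketch of how it was typically driven, based on the commented-out example at the end of the deleted file. The import path mdbq.mongo.mongo and the credential keys come from that file; this only works against mdbq 3.7.6 and earlier, since the module is gone in 3.7.8.

# Sketch only: mirrors the commented-out __main__ example in the deleted mongo.py.
from mdbq.config import myconfig
from mdbq.mongo.mongo import OptimizeDatas  # removed in 3.7.8

conf = myconfig.main()
conf_data = conf['Windows']['company']['mysql']['remoto']
username, password, host, port = (conf_data['username'], conf_data['password'],
                                  conf_data['host'], conf_data['port'])

for db_name in ['京东数据2', '推广数据2', '市场数据2', '生意参谋2', '生意经2', '属性设置2']:
    s = OptimizeDatas(username=username, password=password, host=host, port=port)
    s.db_name = db_name
    s.rename_column()   # strips trailing underscores from field names
    s.client.close()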
|