mdbq 3.7.6__py3-none-any.whl → 3.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/mongo/mongo.py DELETED
@@ -1,729 +0,0 @@
1
- # -*- coding:utf-8 -*-
2
- import datetime
3
- import os
4
- import re
5
- import warnings
6
- import time
7
- import pandas as pd
8
- import numpy as np
9
- import pymongo
10
- from functools import wraps
11
- import socket
12
- import platform
13
- from concurrent.futures import ThreadPoolExecutor
14
- from mdbq.config import myconfig
15
- from mdbq.dataframe import converter
16
-
17
# Suppress every warning category for the whole process.
warnings.filterwarnings('ignore')

# Load the shared configuration once, then select the credential profile by
# machine name: 'company' and 'Mac2.local' use the 'xigua_lx' profile, any
# other host uses the 'company' profile.
# NOTE(review): the credentials are read from the 'mysql' section although
# this module talks to MongoDB - confirm this is intentional.
conf = myconfig.main()
if socket.gethostname() in ('company', 'Mac2.local'):
    conf_data = conf['Windows']['xigua_lx']['mysql']['remoto']
else:
    conf_data = conf['Windows']['company']['mysql']['remoto']
username, password, host, port = (
    conf_data[field] for field in ('username', 'password', 'host', 'port')
)
28
-
29
-
30
def rename_col(username, password, host, db_name, collection_name, old_name, new_name, port: int = 27017,):
    """
    Rename one field in every document of a MongoDB collection.

    Args:
        username, password, host, port: connection credentials; they are
            interpolated into the connection URI as given.
        db_name: database that holds the collection.
        collection_name: collection whose documents are updated.
        old_name: current field name.
        new_name: replacement field name.

    When the new name is '日期' a descending index is created on it in the
    background. The client is always closed, even when the update fails.
    """
    _link = f'mongodb://{username}:{password}@{host}:{port}/'
    client = pymongo.MongoClient(_link)
    try:
        collection = client[db_name][collection_name]
        # An empty filter applies the rename to every document.
        collection.update_many({}, {"$rename": {old_name: new_name}})
        if new_name == '日期':
            collection.create_index([(new_name, -1)], background=True)
    finally:
        client.close()
43
-
44
-
45
class CreateUser:
    """
    Create, update and delete MongoDB accounts through the ``admin`` database.

    Attributes:
        db_roles: list of ``{database: role}`` dicts granted to (or revoked
            from) the user.
        user_infos: ``[{user: roles}]`` snapshot refreshed on every call.
        root: when True a new user is created with the user-admin and
            read-write "any database" roles.
        add_permission: for an existing user, True adds ``db_roles`` and
            False removes every role on the databases named in ``db_roles``.
    """
    def __init__(self, username, password, host, port: int = 27017):
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.link = f'mongodb://{self.username}:{self.password}@{self.host}:{self.port}/'
        self.client = None

        self.db_roles = [{'市场数据2': 'read'}]
        self.user_infos = []  # existing users, refreshed by each operation
        self.root = False
        self.add_permission = True  # True: grant db_roles, False: revoke them

    def try_except(func):
        """Decorator: run ``func`` and print any exception instead of raising it."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'{func.__name__}, {e}')

        return wrapper

    def _load_users(self, db):
        """Replace ``self.user_infos`` with the users currently stored in ``db``."""
        self.user_infos = [{user['user']: user['roles']} for user in db.system.users.find()]

    @try_except
    def create_user(self):
        """
        Create ``self.username`` or update its roles when it already exists.

        Common role names: read, readAnyDatabase, readWriteAnyDatabase,
        userAdminAnyDatabase.

        An existing user is never promoted to administrator; only its
        database roles are granted or revoked. Exceptions are printed by the
        decorator and the client is always closed.
        """
        self.client = pymongo.MongoClient(self.link)
        try:
            db = self.client['admin']
            self._load_users(db)

            add_roles = [
                {'role': role, 'db': db_name}
                for db_role in self.db_roles
                for db_name, role in db_role.items()
            ]
            # A true 'root' role is deliberately not used here.
            root_roles = [
                {'role': 'userAdminAnyDatabase', 'db': 'admin'},  # manage users everywhere
                {'role': 'readWriteAnyDatabase', 'db': 'admin'}  # read/write everywhere
            ]

            current_roles = None
            for user_info in self.user_infos:
                if self.username in user_info:
                    current_roles = user_info[self.username]
                    break

            if current_roles is not None:
                print(f'{self.username}: 用户已存在, 权限为: {current_roles}')
                if self.root:
                    print(f'不支持直接升级管理员权限, 请先删除用户再重新创建root角色, 设置 self.root = True ')
                if self.add_permission:
                    # Append only the roles the user does not hold yet.
                    roles = current_roles + [role for role in add_roles if role not in current_roles]
                else:
                    # Revoke every role on any database listed in db_roles.
                    revoked_dbs = {role['db'] for role in add_roles}
                    roles = [item for item in current_roles if item['db'] not in revoked_dbs]
                db.command('updateUser', self.username, roles=roles)
                return

            if self.root:
                db.command(command='createUser', value=self.username, pwd=self.password, roles=root_roles)
                print(f'管理员创建成功: {self.username}, 权限为: {root_roles}')
                return

            admin_user = db.command(command='createUser', value=self.username, pwd=self.password, roles=add_roles)
            if admin_user['ok'] > 0:
                print(f'普通用户创建成功: {self.username}, 权限为: {add_roles}')
        finally:
            self.client.close()

    def delete_user(self):
        """Drop ``self.username`` if it exists; otherwise only report that it is missing."""
        self.client = pymongo.MongoClient(self.link)
        try:
            db = self.client['admin']
            self._load_users(db)
            if any(self.username in user_info for user_info in self.user_infos):
                db.command("dropUser", self.username)
                print(f'已删除用户: {self.username}')
            else:
                print(f'不存在的用户: {self.username}, 无需执行删除操作')
        finally:
            self.client.close()

    # Exposed as a static method only after it has been used as a plain
    # decorator above, so the class body also works on interpreters where
    # staticmethod objects are not callable.
    try_except = staticmethod(try_except)
148
-
149
-
150
class DownMongo:
    """
    Download documents from MongoDB into a DataFrame or a local file.

    Attributes:
        days: size of the look-back window used when ``start_date`` is unset.
        start_date / end_date: inclusive bounds applied to the '日期' field.
        save_path: directory that receives exported files.
        projection: default ``$project`` stage; excludes ``_id``.
    """
    def __init__(self, save_path, username, password, host, port: int = 27017):
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.link = f'mongodb://{self.username}:{self.password}@{self.host}:{self.port}/'
        self.client = None
        self.db_name = None
        self.collection_name = None
        self.days = 5
        self.start_date = None
        self.end_date = datetime.datetime.now()
        self.save_path = save_path
        self.projection = {'_id': 0}  # default fields for the $project stage

    def _resolve_date_range(self):
        """Fill in or normalise ``self.start_date`` / ``self.end_date`` for a query."""
        if not self.start_date:
            self.start_date = datetime.datetime.now() - datetime.timedelta(days=self.days)
            self.end_date = datetime.datetime.now()
        else:
            self.start_date = pd.to_datetime(self.start_date)
            self.end_date = pd.to_datetime(self.end_date)

    def data_to_df(self, db_name, collection_name, projection: dict):
        """
        Return the documents of the date window as a DataFrame.

        Args:
            db_name: database name.
            collection_name: collection name.
            projection: ``$project`` fields, merged over ``self.projection``
                so that ``_id`` stays excluded unless requested explicitly.

        Returns:
            A DataFrame, empty when nothing matched. Columns whose name
            contains '日期' are parsed as dates when possible; other columns
            are converted to numbers when possible and have NaN replaced by 0.
        """
        self.db_name = db_name
        self.collection_name = collection_name
        self._resolve_date_range()
        # Merge into a fresh dict so projections of earlier calls do not leak
        # into later ones.
        merged_projection = {**self.projection, **(projection or {})}
        pipeline = [
            {'$match': {'日期': {'$gte': self.start_date, '$lte': self.end_date}}},
            {'$project': merged_projection},
        ]

        self.client = pymongo.MongoClient(self.link)
        try:
            collection = self.client[self.db_name][self.collection_name]
            datas = list(collection.aggregate(pipeline))
        finally:
            self.client.close()

        if len(datas) == 0:
            return pd.DataFrame()
        df = pd.DataFrame(datas)
        for col in df.columns.tolist():
            if '日期' in col:
                try:
                    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
                except (ValueError, TypeError) as v:
                    # Leave the column untouched when it is not date-like.
                    print(f'{col}: {v}')
            else:
                try:
                    converted = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    converted = df[col]  # not numeric: keep the original values
                df[col] = converted.fillna(0)
        return df

    def data_to_file(self, file_type, db_name, collection_name):
        """
        Export the documents of the date window to a local file (used by the GUI).

        Args:
            file_type: csv, json, xlsx or xls, with or without a leading dot.
            db_name: database name.
            collection_name: collection name.

        The file is written to ``self.save_path`` and named
        ``{db}_{collection}_{first date}_{last date}{ext}``. Nothing is
        written when the query is empty or the file type is unsupported.
        """
        self.db_name = db_name
        self.collection_name = collection_name
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
        if not self.start_date:
            print(f'{now}正在下载 ({self.host}) {self.db_name}: {self.collection_name}, 区间: 近 {self.days} 天\n...')
        else:
            print(f'{now}正在下载 ({self.host}) {self.db_name}: {self.collection_name}, 区间: {self.start_date}~{self.end_date}')

        self._resolve_date_range()
        pipeline = [
            {'$match': {'日期': {'$gte': self.start_date, '$lte': self.end_date}}},
            {'$project': {'_id': 0}},  # drop the id field
        ]

        self.client = pymongo.MongoClient(self.link)
        try:
            _collection = self.client[self.db_name][self.collection_name]
            _df = pd.DataFrame(list(_collection.aggregate(pipeline)))
        finally:
            self.client.close()

        if len(_df) == 0:
            print(f'查询的数据量: {len(_df)}, 森么都米有花生')
            return
        if '_id' in _df.columns.tolist():
            _df.drop('_id', axis=1, inplace=True)

        print(f'查询的数据量: {len(_df)}')
        cv = converter.DataFrameConverter()
        _df = cv.convert_df_cols(_df)
        s_date = re.findall(r'(\d{4}-\d{2}-\d{2})', str(_df['日期'].values.min()))[0]
        e_date = re.findall(r'(\d{4}-\d{2}-\d{2})', str(_df['日期'].values.max()))[0]
        if not file_type.startswith('.'):
            file_type = '.' + file_type
        _path = os.path.join(self.save_path, f'{self.db_name}_{self.collection_name}_{s_date}_{e_date}{file_type}')
        if file_type.endswith('json'):
            _df.to_json(_path, orient='records', force_ascii=False)
        elif file_type.endswith('csv'):
            _df.to_csv(_path, encoding='utf-8_sig', index=False, header=True)
        elif file_type.endswith('xlsx') or file_type.endswith('xls'):
            # NOTE(review): openpyxl only writes .xlsx/.xlsm, so a '.xls'
            # target is expected to be rejected by pandas - confirm.
            _df.to_excel(_path, index=False, header=True, engine='openpyxl', freeze_panes=(1, 0))  # freeze header row
        else:
            print(f'{file_type}: 未支持的文件类型')
            return  # nothing was written, so do not report a finished export
        print(f'<{self.collection_name}> 导出: {_path}, 数据完成!')
268
-
269
-
270
class UploadMongo:
    """
    Upload data into MongoDB, either from raw csv files or from DataFrames.

    Attributes:
        db_name / collection_name: upload target; for file uploads the
            collection name is derived from the file or directory name.
        data_days: only rows of the last N days are uploaded from csv files.
        encoding: encoding used to read csv files.
        drop_duplicates: when True, old data is deleted before inserting
            (intended for DataFrame sources; raw files normally keep False).
    """

    def __init__(self, username, password, host, port: int = 27017, drop_duplicates=False):
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.link = f'mongodb://{self.username}:{self.password}@{self.host}:{self.port}/'
        self.client = None
        self.db_name = None  # target database
        self.collection_name = None  # target collection
        self.data_days = 5  # look-back window for csv uploads; keep it small
        self.start_date = None
        self.encoding = 'utf-8_sig'
        self.drop_duplicates = drop_duplicates

    def try_except(func):
        """Decorator: run ``func`` and print any exception instead of raising it."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'{func.__name__}, {e}')

        return wrapper

    @try_except
    def upload_file(self, path):
        """
        Upload the recent rows of one csv file.

        Helper/system files, non-csv files and files ending in '年.csv' are
        skipped. When a '日期' column exists, rows of the last
        ``self.data_days`` days are kept; if that leaves nothing, the window
        is widened to 35 days (cross-month reports). Empty results are not
        uploaded.
        """
        if '.DS_Store' in path or '.ini' in path or 'desktop' in path or 'baiduyun' in path or 'xunlei' in path:
            return
        if not path.endswith('csv') or path.endswith('年.csv'):
            return
        df = pd.read_csv(path, encoding=self.encoding, header=0, na_filter=False)
        if '日期' in df.columns.tolist():
            # Empty dates get a far-future placeholder so they are never filtered out.
            df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x) if x else pd.to_datetime('2099-01-01'))
            today = datetime.date.today()
            self.start_date = pd.to_datetime(today - datetime.timedelta(days=self.data_days))
            recent = df[df['日期'] >= self.start_date]
            if len(recent) == 0:
                # Widen the window on the full frame, not on the empty selection.
                recent = df[df['日期'] >= pd.to_datetime(today - datetime.timedelta(35))]
            df = recent

        if len(df) == 0:
            return
        self.df_to_mongo(df=df)

    @try_except
    def upload_dir(self, path):
        """Upload every file below ``path`` whose name contains the collection name."""
        for root, dirs, files in os.walk(path, topdown=False):
            for name in files:
                if str(self.collection_name) not in name:
                    continue
                self.upload_file(os.path.join(root, name))

    def upload_pandas(self, upload_path, select_files: str = None, skip_files: str = '其他数据'):
        """
        Upload every file and directory found directly under ``upload_path``.

        Requires ``self.db_name``; the collection name is derived per entry
        (file stem plus '_f' for files, directory name for directories).

        Args:
            upload_path: directory to scan.
            select_files: when set, only entries containing this text are used.
            skip_files: when set, entries containing this text are skipped.

        Directories whose path contains '其他数据' or '京东数据集' are always skipped.
        """
        if not self.db_name:
            print(f' {self.host}/{self.port} 未设置 self.db_name ')
            return

        for file in os.listdir(upload_path):
            if select_files and select_files not in file:
                continue
            if skip_files and skip_files in file:
                continue
            path = os.path.join(upload_path, file)
            if os.path.isfile(path):
                self.collection_name = f'{os.path.splitext(file)[0]}_f'
                self.upload_file(path=path)
            elif os.path.isdir(path):
                if '其他数据' in path or '京东数据集' in path:
                    continue
                self.collection_name = f'{os.path.splitext(file)[0]}'
                self.upload_dir(path=path)

    @staticmethod
    def split_list(lst, _num=None):
        """
        Split ``lst`` into roughly ``_num`` consecutive chunks.

        When ``_num`` is not given it is chosen from the list length. If the
        length is not divisible, ``_num`` chunks of the floor size are
        followed by one chunk with the remainder. Empty chunks are never
        returned; an empty list yields ``[]``.
        """
        length = len(lst)
        if length == 0:
            return []
        if not _num:
            if length > 20000:
                _num = 30
            elif length > 10000:
                _num = 20
            elif length > 1000:
                _num = 15
            elif length > 200:
                _num = 5
            else:
                _num = 2
        sublist_length = length // _num
        if sublist_length == 0:
            # Fewer items than chunks: everything fits into a single chunk.
            return [lst[:]]
        extra = length % _num
        if extra == 0:
            return [lst[i:i + sublist_length] for i in range(0, length, sublist_length)]
        chunks = [lst[i * sublist_length:(i + 1) * sublist_length] for i in range(_num)]
        chunks.append(lst[-extra:])  # remainder goes last
        return chunks

    @staticmethod
    def duplicates_list(_datas):
        """
        Return ``_datas`` without repeated items, keeping first occurrences.

        Items are compared by equality, so the cost grows quadratically with
        the input size; a warning is printed above 100,000 items.
        """
        if len(_datas) > 100 * 1000:
            print(f'数据量太大,可能大量消耗系统资源,谨慎执行!!! {len(_datas)}')
        _my_list = []
        for _data in _datas:
            if _data not in _my_list:
                _my_list.append(_data)
        return _my_list

    @staticmethod
    def _run_in_threads(worker, batches):
        """Run ``worker`` on each batch in a thread pool and print failures instead of dropping them."""
        with ThreadPoolExecutor() as pool:
            futures = [pool.submit(worker, batch) for batch in batches]
        # Leaving the ``with`` block waits for all futures to finish.
        for future in futures:
            error = future.exception()
            if error is not None:
                print(f'{worker.__name__}, {error}')

    def df_to_mongo(self, df, db_name=None, collection_name=None):
        """
        Insert a DataFrame into MongoDB.

        Args:
            df: rows to insert.
            db_name / collection_name: optional target overriding the
                instance attributes; both must be known one way or the other.

        Characters not allowed in names are replaced by '_'. When the frame
        has a '日期' column a descending index is ensured on it. With
        ``self.drop_duplicates`` the covered date range is deleted first
        (or, without a date column, every document equal to an incoming
        row). Batch failures are printed, and the client is always closed.
        """
        if db_name:
            self.db_name = db_name
        if collection_name:
            self.collection_name = collection_name
        if not self.db_name or not self.collection_name:
            print(f'{self.host}/{self.port} 未指定 self.db_name/collection: {self.db_name}/{self.collection_name}')
            return

        self.db_name = re.sub(r'[\',,()()/=<>+\-*^"’\[\]~#|&% .]', '_', self.db_name)
        self.collection_name = re.sub(r'[\',,()()/=<>+\-*^"’\[\]~#|&% .]', '_', self.collection_name)
        self.client = pymongo.MongoClient(self.link)
        try:
            collections = self.client[self.db_name][self.collection_name]
            start_date = None
            end_date = None

            cv = converter.DataFrameConverter()
            df = cv.convert_df_cols(df=df)  # presumably sanitises column names/types - project helper
            has_date = '日期' in df.columns.tolist()
            if has_date:
                collections.create_index([('日期', -1)], background=True)  # background: non-blocking
                start_date = pd.to_datetime(df['日期'].values.min())
                end_date = pd.to_datetime(df['日期'].values.max())

            datas = df.to_dict('records')  # [dict, dict, ...]
            batches = self.split_list(datas, )

            def delete_data(batch):
                # delete_many takes one filter document, so delete row by row.
                for document in batch:
                    collections.delete_many(document)

            def insert_data(batch):
                collections.insert_many(batch)

            if self.drop_duplicates:
                if has_date:
                    # Remove the whole covered date range, end date inclusive.
                    collections.delete_many({
                        '日期': {
                            '$gte': start_date,
                            '$lt': end_date + datetime.timedelta(days=1)
                        }
                    })
                else:
                    self._run_in_threads(delete_data, batches)

            now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
            print(f'{now}正在更新 mongoDB ({self.host}:{self.port}) {self.db_name}/{self.collection_name}')

            self._run_in_threads(insert_data, batches)
        finally:
            self.client.close()

    # Exposed as a static method only after it has been used as a plain
    # decorator above, so the class body also works on interpreters where
    # staticmethod objects are not callable.
    try_except = staticmethod(try_except)
475
-
476
-
477
class OptimizeDatas:
    """
    Maintenance helper that removes redundant (duplicate) MongoDB documents.

    Workflow of ``optimize``:
        1. list the collections of ``self.db_name``;
        2. collections with a '日期' field are checked one day at a time,
           the others are read in full;
        3. documents whose content (ignoring ``_id``) was already seen are
           deleted by ``_id``.

    Attributes:
        db_name: database to process.
        db_name_lists: databases processed by ``optimize_list``.
        days: size of the date window ending at ``end_date``.
        start_date / end_date: window bounds; derived when unset.
    """
    def __init__(self, username: str, password: str, host: str, port: int, drop_duplicates=False):
        self.username = username
        self.password = password
        self.host = host
        self.port = port
        self.link = f'mongodb://{self.username}:{self.password}@{self.host}:{self.port}/'
        self.client = None
        self.db_name = None
        self.db_name_lists = []
        self.collection_name = None
        self.days: int = 60  # process the last N days
        self.end_date = None
        self.start_date = None

    @staticmethod
    def try_except(func):
        """Decorator: run ``func`` and print any exception instead of raising it."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'{func.__name__}, {e}')

        return wrapper

    def optimize_list(self):
        """Run ``optimize`` for every database in ``self.db_name_lists``."""
        if not self.db_name_lists:
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
            print(f'{now}尚未设置参数: self.db_name_lists')
            return
        for db_name in self.db_name_lists:
            self.db_name = db_name
            self.optimize()

    def my_collection_names(self, db_name) -> list:
        """Return the collection names of ``db_name``; a missing database is only reported."""
        database_names = self.client.list_database_names()
        if db_name not in database_names:
            print(f'{self.host}/{self.port} 数据库: {database_names}, 不存在的数据库: {db_name}')
        return self.client[db_name].list_collection_names()

    def optimize(self):
        """
        Remove duplicate documents from every collection of ``self.db_name``.

        A new client is opened on each call (closing one left over from an
        earlier call) and is left open afterwards so the caller can keep
        using ``self.client``; the caller is responsible for closing it.
        """
        if not self.db_name:
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
            print(f'{now}{self.host}/{self.port} 尚未设置参数: self.db_name')
            return
        if self.client is not None:
            self.client.close()  # do not leak the client of a previous run
        self.client = pymongo.MongoClient(self.link)
        database_names = self.client.list_database_names()
        if self.db_name not in database_names:
            print(f'{self.host}/{self.port} 当前数据库: {database_names}, 不存在的数据库: {self.db_name}')
            return

        # Resolve the date window.
        if not self.end_date:
            self.end_date = pd.to_datetime(datetime.datetime.today())
        else:
            self.end_date = pd.to_datetime(self.end_date)
        if self.days:
            self.start_date = pd.to_datetime(self.end_date - datetime.timedelta(days=self.days))
        if not self.start_date:
            self.start_date = self.end_date
        else:
            self.start_date = pd.to_datetime(self.start_date)
        start_date_before = self.start_date
        end_date_before = self.end_date
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
        print(f'{now}mongodb({self.host}: {self.port}) {self.db_name} 数据库优化中(日期长度: {self.days} 天)...')

        for collection_name in self.my_collection_names(db_name=self.db_name):
            collection = self.client[self.db_name].get_collection(collection_name)
            has_date_field = collection.find_one({'日期': {'$exists': True}}) is not None
            if not has_date_field:
                # No date field: deduplicate the whole collection at once.
                self.delete_duplicate2(collection_name=collection_name)
                continue
            pipeline = [
                {"$group": {"_id": None, "min_date": {"$min": "$日期"}, "max_date": {"$max": "$日期"}}}
            ]
            for result in collection.aggregate(pipeline):
                # Clamp the window to the dates actually present.
                start_date = pd.to_datetime(result['min_date'])
                end_date = pd.to_datetime(result['max_date'])
                if self.start_date < start_date:
                    self.start_date = start_date
                if self.end_date > end_date:
                    self.end_date = end_date
                break
            for date in self.day_list(start_date=self.start_date, end_date=self.end_date):
                self.delete_duplicate(collection_name=collection_name, date=date)
            # Restore the window so the next collection starts from the same bounds.
            self.start_date = start_date_before
            self.end_date = end_date_before

        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
        print(f'{now}mongodb({self.host}: {self.port}) {self.db_name} 数据库优化完成!')

    @staticmethod
    def _drop_repeated(collection, datas):
        """
        Delete documents of ``datas`` whose content repeats an earlier one.

        ``_id`` is removed from each document before comparing. Returns the
        list of deleted ids (empty when nothing was deleted).
        """
        duplicate_id = []
        seen = set()  # string fingerprints of documents already met
        for data in datas:
            delete_id = data['_id']
            del data['_id']
            # Strip trailing '.0...' inside quoted values so '1.0' equals '1'.
            fingerprint = re.sub(r'\.0+\', ', '\', ', str(data))
            if fingerprint in seen:
                duplicate_id.append(delete_id)
                continue
            seen.add(fingerprint)
        if duplicate_id:
            collection.delete_many({'_id': {'$in': duplicate_id}})
        return duplicate_id

    def delete_duplicate(self, collection_name, date,):
        """Remove duplicates among the documents whose '日期' equals ``date``."""
        collection = self.client[self.db_name].get_collection(collection_name)
        pipeline = [
            {'$match': {'日期': {'$gte': date, '$lte': date}}},
        ]
        datas = list(collection.aggregate(pipeline))
        duplicate_id = self._drop_repeated(collection, datas)
        if not duplicate_id:
            return
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
        print(f'{now}{collection_name} -> {date.strftime("%Y-%m-%d")} '
              f'before: {len(datas)}, remove: {len(duplicate_id)}')

    def delete_duplicate2(self, collection_name,):
        """Remove duplicates across a whole collection (used when it has no date field)."""
        collection = self.client[self.db_name].get_collection(collection_name)
        datas = list(collection.find({}))
        duplicate_id = self._drop_repeated(collection, datas)
        if not duplicate_id:
            return
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S ')
        print(f'{now}{collection_name} -> before: {len(datas)}, remove: {len(duplicate_id)}')

    def get_collection_datas_bak(self, db_name, collection_name):
        """Return every document of ``db_name``/``collection_name``, fetched in batches of 1000."""
        database_names = self.client.list_database_names()
        if db_name not in database_names:
            print(f'{self.host}/{self.port} 当前数据库: {database_names}, 不存在的数据库: {db_name}')
        # Read from the database that was asked for, not from self.db_name.
        collection = self.client[db_name].get_collection(collection_name)
        batch_size = 1000
        return list(collection.find().batch_size(batch_size))

    def day_list(self, start_date, end_date):
        """Return midnight timestamps, one per day, stepping from ``start_date`` while it is <= ``end_date``."""
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        date_list = []
        while start_date <= end_date:
            date_list.append(pd.to_datetime(start_date.date()))
            start_date += datetime.timedelta(days=1)
        return date_list

    def rename_column(self):
        """
        Strip trailing underscores from field names in every collection of ``self.db_name``.

        The field names are taken from one sample document per collection.
        The client is left open; close ``self.client`` afterwards, e.g.::

            s = OptimizeDatas(username=username, password=password, host=host, port=port)
            s.db_name = db_name
            s.rename_column()
            s.client.close()
        """
        self.client = pymongo.MongoClient(self.link)
        for collection_name in self.my_collection_names(db_name=self.db_name):
            collection = self.client[self.db_name].get_collection(collection_name)
            sample = collection.find_one({})
            if sample is None:
                continue  # empty collection: nothing to rename
            for key in sample:
                if not key.endswith('_'):
                    continue
                new_name = re.sub(r'_+$', '', key)
                if not new_name:
                    continue  # a name made only of underscores cannot be shortened
                collection.update_many({key: {'$exists': True}}, {'$rename': {key: new_name}})
691
-
692
-
693
def upload_one_dir():
    """
    Build an uploader from the host-specific credentials and announce the run.

    'company' and 'Mac2.local' read the 'xigua_lx' profile; every other
    host reads the 'company' profile. The uploader is only constructed here;
    no upload is started.
    """
    known_host = socket.gethostname() in ('company', 'Mac2.local')
    conf = myconfig.main()
    profile = 'xigua_lx' if known_host else 'company'
    conf_data = conf['Windows'][profile]['mysql']['remoto']
    username = conf_data['username']
    password = conf_data['password']
    host = conf_data['host']
    port = conf_data['port']

    p = UploadMongo(username=username, password=password, host=host, port=port, drop_duplicates=False)
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
    print(f'{stamp}数据处理中...')
708
-
709
-
710
def main():
    """Placeholder entry point; it performs no work."""
    return None
712
-
713
-
714
if __name__ == '__main__':
    # main()
    # Show which server the loaded configuration points at. The password is
    # masked so that running the module never writes the secret to stdout.
    print(username, '******', host, port)

    # Example: strip trailing underscores from field names in several databases.
    # for db_name in [
    #     '京东数据2',
    #     '推广数据2',
    #     '市场数据2',
    #     '生意参谋2',
    #     '生意经2',
    #     '属性设置2',
    # ]:
    #     s = OptimizeDatas(username=username, password=password, host=host, port=port)
    #     s.db_name = db_name
    #     s.rename_column()
    #     s.client.close()