mdbq 3.3.4__py3-none-any.whl → 3.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/query_data.py +2 -2
- mdbq/mongo/mongo.py +24 -22
- mdbq/mysql/mysql.py +22 -22
- mdbq/mysql/recheck_mysql.py +1 -1
- mdbq/spider/aikucun.py +2 -24
- {mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/METADATA +1 -1
- {mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/RECORD +9 -20
- mdbq/aggregation/df_types.py +0 -188
- mdbq/aggregation/mysql_types.py +0 -240
- mdbq/clean/__init__.py +0 -4
- mdbq/clean/clean_upload.py +0 -1350
- mdbq/clean/data_clean.py +0 -1551
- mdbq/company/__init__.py +0 -4
- mdbq/company/copysh.py +0 -447
- mdbq/config/get_myconf.py +0 -131
- mdbq/config/update_conf.py +0 -102
- mdbq/req_post/__init__.py +0 -4
- mdbq/req_post/req_tb.py +0 -624
- {mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/WHEEL +0 -0
- {mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/top_level.txt +0 -0
mdbq/aggregation/query_data.py
CHANGED
@@ -1253,8 +1253,8 @@ class MysqlDatasQuery:
             '三级来源索引': 'smallint',
         }
         # df.to_csv('/Users/xigua/Downloads/ll.csv', index=False, header=True, encoding='utf-8_sig')
-        min_date = df['日期'].min()
-        max_date = df['日期'].max()
+        min_date = df['日期'].min().strftime("%Y-%m-%d")
+        max_date = df['日期'].max().strftime("%Y-%m-%d")
         now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         print(f'{now} 正在更新: mysql ({host}:{port}) {db_name}/{table_name} -> {min_date}~{max_date}')
         m_engine.df_to_mysql(
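Note: the only change in this hunk is formatting min_date/max_date with .strftime("%Y-%m-%d"), so the progress message prints plain date strings instead of full pandas.Timestamp reprs. A minimal sketch of the difference (hypothetical one-column frame; assumes the 日期 column is datetime64, otherwise .min() would not return a Timestamp):

    import pandas as pd

    df = pd.DataFrame({'日期': pd.to_datetime(['2024-01-02', '2024-01-05'])})
    print(df['日期'].min())                       # 2024-01-02 00:00:00 (Timestamp repr)
    print(df['日期'].min().strftime("%Y-%m-%d"))  # 2024-01-02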
mdbq/mongo/mongo.py
CHANGED
@@ -8,11 +8,23 @@ import pandas as pd
 import numpy as np
 import pymongo
 from functools import wraps
+import socket
+import platform
 from concurrent.futures import ThreadPoolExecutor
-from mdbq.config import
+from mdbq.config import myconfig
 from mdbq.dataframe import converter
 
 warnings.filterwarnings('ignore')
+if socket.gethostname() == 'company' or socket.gethostname() == 'Mac2.local':
+    conf = myconfig.main()
+    conf_data = conf['Windows']['xigua_lx']['mysql']['remoto']
+    username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
+        'port']
+else:
+    conf = myconfig.main()
+    conf_data = conf['Windows']['company']['mysql']['remoto']
+    username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
+        'port']
 
 
 def rename_col(username, password, host, db_name, collection_name, old_name, new_name, port: int = 27017,):
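Note: the new module-level block selects a MySQL profile by hostname; both branches are identical except for the profile key ('xigua_lx' vs 'company'), so the lookup can be collapsed. A hedged sketch of the same logic (key names copied from the diff, including the 'remoto' spelling; the nested-dict shape of myconfig.main()'s return value is an assumption):

    import socket
    from mdbq.config import myconfig

    def load_remote_mysql_conf():
        # 'company' and 'Mac2.local' map to the 'xigua_lx' profile; anything else to 'company'.
        profile = 'xigua_lx' if socket.gethostname() in ('company', 'Mac2.local') else 'company'
        conf_data = myconfig.main()['Windows'][profile]['mysql']['remoto']
        return conf_data['username'], conf_data['password'], conf_data['host'], conf_data['port']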
@@ -679,30 +691,21 @@ class OptimizeDatas:
 
 
 def upload_one_dir():
-
+    if socket.gethostname() == 'company' or socket.gethostname() == 'Mac2.local':
+        conf = myconfig.main()
+        conf_data = conf['Windows']['xigua_lx']['mysql']['remoto']
+        username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
+            'port']
+    else:
+        conf = myconfig.main()
+        conf_data = conf['Windows']['company']['mysql']['remoto']
+        username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
+            'port']
+
     p = UploadMongo(username=username, password=password, host=host, port=port, drop_duplicates=False)
     now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
     print(f'{now}数据处理中...')
 
-    p.db_name = ''
-    p.collection_name = f''
-    path = os.path.join('C:\\同步空间', 'BaiduSyncdisk', '原始文件2', r'京东报表', 'JD流量来源')
-
-    for root, dirs, files in os.walk(path, topdown=False):
-        for name in files:
-            if '按天_' not in name:
-                continue
-            if name.endswith('.csv') and 'baidu' not in name:
-                df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-                for col in df.columns.tolist():
-                    if '日期' in col:
-                        df[col] = df[col].apply(lambda x: pd.to_datetime(x) if x else pd.to_datetime('2099-01-01'))
-                p.df_to_mongo(df=df)
-    if p.client:
-        p.client.close()
-    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
-    print(f'{now}数据完成!')
-
 
 def main():
     pass
@@ -710,7 +713,6 @@ def main():
 
 if __name__ == '__main__':
     # main()
-    username, password, host, port = get_myconf.select_config_values(target_service='home_lx', database='mongodb')
     print(username, password, host, port)
 
     # for db_name in [
mdbq/mysql/mysql.py
CHANGED
@@ -128,7 +128,7 @@ class MysqlUpload:
 
         return wrapper
 
-    def keep_connect(self, _config, max_try: int=5):
+    def keep_connect(self, _db_name, _config, max_try: int=5):
         attempts = 1
         while attempts <= max_try:
             try:
@@ -137,8 +137,8 @@ class MysqlUpload:
             except Exception as e:
                 print(f'连接失败,正在重试: {attempts}/{max_try} {e}')
                 attempts += 1
-                time.sleep(
-        print(f'
+                time.sleep(20)
+        print(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
         return None
 
     def cover_doc_dtypes(self, dict_data):
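Note: the two hunks above only show the signature and tail of keep_connect; piecing them together with the commented-out pymysql.connect(**self.config) calls elsewhere in this diff, the full method plausibly looks like the sketch below (the body of the try block is an assumption, not shown in the diff):

    import time
    import pymysql

    def keep_connect(self, _db_name, _config, max_try: int = 5):
        attempts = 1
        while attempts <= max_try:
            try:
                return pymysql.connect(**_config)  # assumed from the commented-out calls
            except Exception as e:
                print(f'连接失败,正在重试: {attempts}/{max_try} {e}')
                attempts += 1
                time.sleep(20)  # fixed 20 s back-off per the new line in this hunk
        print(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
        return None

Every keep_connect(...) call site below gains the same _db_name=... argument so the failure message can say which database the retries were for.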
@@ -215,7 +215,7 @@ class MysqlUpload:
             return
 
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -236,7 +236,7 @@ class MysqlUpload:
 
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -382,7 +382,7 @@ class MysqlUpload:
             print(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
 
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -403,7 +403,7 @@ class MysqlUpload:
 
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -749,7 +749,7 @@ class MysqlUpload:
             [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
 
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -770,7 +770,7 @@ class MysqlUpload:
 
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -1049,7 +1049,7 @@ class MysqlUpload:
             print(f'未指定文件名: filename')
             return
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         # try:
@@ -1061,7 +1061,7 @@ class MysqlUpload:
             return
         self.config.update({'database': db_name})
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -1094,7 +1094,7 @@ class MysqlUpload:
             df = pd.DataFrame()
 
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         try:
@@ -1116,7 +1116,7 @@ class MysqlUpload:
         # 读取数据
         self.config.update({'database': db_name})
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         try:
@@ -1240,7 +1240,7 @@ class OptimizeDatas:
 
         return wrapper
 
-    def keep_connect(self, _config, max_try: int=5):
+    def keep_connect(self, _db_name, _config, max_try: int=5):
         attempts = 1
         while attempts <= max_try:
             try:
@@ -1249,8 +1249,8 @@ class OptimizeDatas:
             except Exception as e:
                 print(f'连接失败,正在重试: {attempts}/{max_try} {e}')
                 attempts += 1
-                time.sleep(
-        print(f'
+                time.sleep(20)
+        print(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
         return None
 
     def optimize_list(self):
@@ -1300,7 +1300,7 @@ class OptimizeDatas:
                 # continue
         self.config.update({'database': self.db_name})  # 添加更新 config 字段
         # self.connection = pymysql.connect(**self.config)
-        self.connection = self.keep_connect(_config=self.config, max_try=5)
+        self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
         if not self.connection:
             return
         with self.connection.cursor() as cursor:
@@ -1455,7 +1455,7 @@ class OptimizeDatas:
     def database_list(self):
         """ 获取所有数据库 """
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -1467,7 +1467,7 @@ class OptimizeDatas:
     def table_list(self, db_name):
         """ 获取指定数据库的所有数据表 """
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
         if not connection:
             return
         try:
@@ -1486,7 +1486,7 @@ class OptimizeDatas:
 
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -1501,7 +1501,7 @@ class OptimizeDatas:
         """
        self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         try:
@@ -1537,7 +1537,7 @@ class OptimizeDatas:
         for key, table_name in table_dict.items():
             self.config.update({'database': self.db_name})  # 添加更新 config 字段
             # self.connection = pymysql.connect(**self.config)
-            self.connection = self.keep_connect(_config=self.config, max_try=5)
+            self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
             if not self.connection:
                 return
             with self.connection.cursor() as cursor:
mdbq/mysql/recheck_mysql.py
CHANGED
@@ -156,7 +156,7 @@ class ReCheckMysql:
 
 
 def recheck_csv():
-    path = '
+    path = ''
     for root, dirs, files in os.walk(path, topdown=False):
         for name in files:
             if '~' in name or 'baidu' in name or 'Ds_' in name or 'xunlei' in name:
mdbq/spider/aikucun.py
CHANGED
@@ -17,8 +17,6 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.service import Service
 from mdbq.config import set_support
 from selenium.webdriver.common.keys import Keys
-from mdbq.aggregation import aggregation
-from mdbq.clean import data_clean
 from mdbq.other import ua_sj
 from mdbq.mysql import mysql
 from mdbq.config import myconfig
@@ -46,7 +44,7 @@ else:
     D_PATH = str(pathlib.Path(f'/Users/{getpass.getuser()}/Downloads'))
     Share_Path = str(pathlib.Path('/Volumes/时尚事业部/01.运营部/天猫报表'))  # 共享文件根目录
     Source_Path = str(pathlib.Path(Data_Path, '原始文件2'))
-upload_path = os.path.join(D_PATH, '数据上传中心')  # 此目录位于下载文件夹
+upload_path = os.path.join(D_PATH, '数据上传中心', '爱库存')  # 此目录位于下载文件夹
 
 m_engine = mysql.MysqlUpload(username='', password='', host='', port=0, charset='utf8mb4')
 company_engine = mysql.MysqlUpload(username='', password='', host='', port=0, charset='utf8mb4')
@@ -458,26 +456,6 @@ class AikuCun:
 def akucun(headless=True, date_num=10):
     akc = AikuCun()
     akc.get_data(shop_name='aikucun', date_num=date_num, headless=headless)  # 获取最近 N 天数据,0表示今天
-    # akc.clean_data()
-
-    # # 新版 数据分类
-    # dp = aggregation.DatabaseUpdate(path=upload_path)
-    # dp.new_unzip(is_move=True)
-    # dp.cleaning(is_move=False, is_except=['临时文件'])  # 清洗数据, 存入 self.datas, 不需要立即移除文件,仍保留文件到原始文件中
-    # # 将 self.datas 更新至数据库
-    # dp.upload_df(service_databases=[
-    #     # {'home_lx': 'mongodb'},
-    #     # {'home_lx': 'mysql'},
-    #     {'company': 'mysql'},
-    #     # {'nas': 'mysql'},
-    # ])
-    # # 数据分类
-    # c = data_clean.DataClean(path=upload_path, source_path=Source_Path)
-    # c.set_up_to_mogo = False  # 不再使用 data_clean 更新数据库,改为 aggregation.py
-    # c.set_up_to_mysql = False  # 不再使用 data_clean 更新数据库,改为 aggregation.py
-    # c.new_unzip(is_move=True, )  # 解压文件
-    # c.change_and_sort(is_except=['临时文件'])
-    # c.move_all(is_except=['临时文件'])  # 移到文件到原始文件夹
 
 
 class AikuCunNew:
@@ -529,7 +507,7 @@ class AikuCunNew:
 
 if __name__ == '__main__':
     get_cookie_aikucun()  # 登录并获取 cookies
-    akucun(date_num=
+    akucun(date_num=5, headless=True)  # 下载数据
 
     # a = AikuCunNew(shop_name='aikucun')
     # a.akc()
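Note: with these changes the module's entry point downloads the last five days of reports headlessly into the new 爱库存 subfolder. A usage sketch, assuming the module is importable as shown (both functions appear in this diff):

    # Hypothetical driver: log in once, then fetch the last 5 days headlessly.
    from mdbq.spider import aikucun

    aikucun.get_cookie_aikucun()               # interactive login, saves cookies
    aikucun.akucun(date_num=5, headless=True)  # downloads into .../数据上传中心/爱库存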
{mdbq-3.3.4.dist-info → mdbq-3.3.7.dist-info}/RECORD
CHANGED
@@ -2,32 +2,23 @@ mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
 mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/aggregation.py,sha256=-yzApnlqSN2L0E1YMu5ml-W827qpKQvWPCOI7jj2kzY,80264
-mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
-mdbq/aggregation/mysql_types.py,sha256=YTGyrF9vcRgfkQbpT-e-JdJ7c7VF1dDHgyx9YZRES8w,10934
 mdbq/aggregation/optimize_data.py,sha256=RXIv7cACCgYyehAxMjUYi_S7rVyjIwXKWMaM3nduGtA,3068
-mdbq/aggregation/query_data.py,sha256=
+mdbq/aggregation/query_data.py,sha256=_5mnSFHV6xAFs_1YF_H2zMOdJeMavgga4lZQ_qpqxPQ,167637
 mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
 mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
-mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
-mdbq/clean/clean_upload.py,sha256=yMAb6tV9XHhFJbRrCOeaPfszApJ9y5M4-hQGuBSXNqE,67799
-mdbq/clean/data_clean.py,sha256=ucfslhqXVZoH2QaXHSAWDky0GhIvH9f4GeNaHg4SrFE,104790
-mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
-mdbq/company/copysh.py,sha256=eFu6focRqm2Njn_XN1KW2ZYJiTv6EYgsdBCLokobyxQ,21572
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
-mdbq/config/get_myconf.py,sha256=cmNvsyoNa0RbZ9FOTjSd3jyyGwkxjUo0phvdHbGlrms,6010
 mdbq/config/myconfig.py,sha256=EGymTlAimtHIDJ9egCtOehBEPOj6rea504kvsEZu64o,854
 mdbq/config/products.py,sha256=Ab6eaAUMUtjRL8z9NvYukyCjp3nAi4OYISY_IdPhAJ0,6279
 mdbq/config/set_support.py,sha256=xkZCX6y9Bq1ppBpJAofld4B2YtchA7fl0eT3dx3CrSI,777
-mdbq/config/update_conf.py,sha256=taL3ZqKgiVWwUrDFuaYhim9a72Hm4BHRhhDscJTziR8,4535
 mdbq/dataframe/__init__.py,sha256=2HtCN8AdRj53teXDqzysC1h8aPL-mMFy561ESmhehGQ,22
 mdbq/dataframe/converter.py,sha256=lETYhT7KXlWzWwqguqhk6vI6kj4rnOBEW1lhqKy2Abc,5035
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
 mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
-mdbq/mongo/mongo.py,sha256=
+mdbq/mongo/mongo.py,sha256=M9DUeUCMPDngkwn9-ui0uTiFrvfNU1kLs22s5SmoNm0,31899
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/mysql.py,sha256=
-mdbq/mysql/recheck_mysql.py,sha256=
+mdbq/mysql/mysql.py,sha256=ZK6E-idQWrURtoimc0uG8B1tnhtSFQXDJHfu8sWeJg4,85675
+mdbq/mysql/recheck_mysql.py,sha256=ppBTfBLgkRWirMVZ31e_ZPULiGPJU7K3PP9G6QBZ3QI,8605
 mdbq/mysql/s_query.py,sha256=MbIprZ4yJDAZ9AahZPzl7hqS695Vs0P-AJNwAtA_EEc,9287
 mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -40,11 +31,9 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
 mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
 mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
 mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,7192
-mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
-mdbq/req_post/req_tb.py,sha256=qg7pet73IgKGmCwxaeUyImJIoeK_pBQT9BBKD7fkBNg,36160
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq/spider/aikucun.py,sha256=
-mdbq-3.3.
-mdbq-3.3.
-mdbq-3.3.
-mdbq-3.3.
+mdbq/spider/aikucun.py,sha256=UFY-TwlvquEYK58rTdRuv5Wx3KA21m-bIrwvvfPRyOk,23749
+mdbq-3.3.7.dist-info/METADATA,sha256=aMT3CEx_q-0vG6CncxkAXv1PPk2RsieIZxrkYyk1jPE,243
+mdbq-3.3.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+mdbq-3.3.7.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.3.7.dist-info/RECORD,,
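Note: each RECORD row is CSV of the form path,sha256=&lt;urlsafe-base64 digest, unpadded&gt;,&lt;size-in-bytes&gt;, with RECORD itself listed with empty hash and size (PEP 376 / the wheel spec). A small sketch that verifies one entry against an unpacked wheel (paths are illustrative):

    import base64
    import csv
    import hashlib
    import os

    def verify_record_entry(wheel_root: str, line: str) -> bool:
        path, hash_field, size = next(csv.reader([line]))
        if not hash_field:  # the RECORD file lists itself with empty fields
            return True
        algo, _, expected = hash_field.partition('=')
        with open(os.path.join(wheel_root, path), 'rb') as f:
            data = f.read()
        digest = base64.urlsafe_b64encode(hashlib.new(algo, data).digest()).rstrip(b'=').decode()
        return digest == expected and len(data) == int(size)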
mdbq/aggregation/df_types.py
DELETED
@@ -1,188 +0,0 @@
-# -*- coding:utf-8 -*-
-import warnings
-import pandas as pd
-import numpy as np
-import chardet
-import zipfile
-
-from numpy import dtype
-from pandas.tseries.holiday import next_monday
-from pyzipper import PyZipFile
-import os
-import platform
-import json
-import pymysql
-from mdbq.mongo import mongo
-from mdbq.mysql import mysql
-from mdbq.mysql import s_query
-from mdbq.config import get_myconf
-from mdbq.config import set_support
-from mdbq.dataframe import converter
-import datetime
-import time
-import re
-import shutil
-import getpass
-
-from sqlalchemy.dialects.postgresql.pg_catalog import pg_get_serial_sequence
-
-warnings.filterwarnings('ignore')
-"""
-1. 记录 dataframe 或者数据库的列信息(dtypes)
-2. 更新 mysql 中所有数据库的 dtypes 信息到本地 json
-"""
-
-
-class DataTypes:
-    """
-    数据简介: 记录 dataframe 或者数据库的列信息(dtypes),可以记录其信息或者加载相关信息用于入库使用,
-    第一字段为分类(如 dataframe/mysql),第二字段为数据库名,第三字段为集合名,第四段列名及其数据类型
-    """
-    def __init__(self, path=None):
-        self.datas = {
-            "json统计":
-                {
-                    "字段量": 0,
-                    "数据库量": 0,
-                    "集合数量": 0
-                }
-        }
-        self.path = path
-        if not self.path:
-            self.path = set_support.SetSupport(dirname='support').dirname
-        self.json_file = os.path.join(self.path, 'df_types.json')
-        if not os.path.isdir(self.path):
-            os.makedirs(self.path)
-        if not os.path.isfile(self.json_file):
-            with open(self.json_file, 'w', encoding='utf-8_sig') as f:
-                json.dump(self.datas, f, ensure_ascii=False, sort_keys=True, indent=4)
-        self.json_before()
-
-    def json_before(self):
-        """ 本地 json 文件的 dtypes 信息, 初始化更新给 self.datas """
-        with open(self.json_file, 'r', encoding='utf-8_sig') as f:
-            json_ = json.load(f)
-        self.datas.update(json_)
-
-    def get_df_types(self, db_name, collection_name, df=pd.DataFrame(), is_file_dtype=True):
-        """
-        读取 df 的 dtypes, 并更新本地 json 文件
-        期间会 清理不合规的列名, 并对数据类型进行转换(尝试将 object 类型转为 int 或 float)
-        返回: df 的 dtypes, 后续使用示例: df = df.astype(dtypes, errors='ignore')
-        is_file_dtype=True: 默认情况下以旧 json 优先, 即允许手动指定 json 文件里面的数据类型
-        """
-        if len(df) == 0:
-            return
-        cv = converter.DataFrameConverter()
-        df = cv.convert_df_cols(df=df)  # 清理 dataframe 非法值
-        dtypes = df.dtypes.apply(str).to_dict()
-        dtypes = {db_name: {collection_name: dtypes}}
-
-        if not self.datas:  # 如果不存在本地 json 文件, 直接返回即可
-            self.datas.update(dtypes)
-            return self.datas[db_name][collection_name]
-        else:  # 存在则读取,并更新 df 的 dtypes
-            if db_name in list(self.datas.keys()):  # ['京东数据2', '推广数据2', '生意参谋2', '生意经2']
-                if collection_name in list(self.datas[db_name].keys()):
-                    if is_file_dtype:  # 旧数据优先
-                        # # 用 dtypes 更新, 允许手动指定 json 文件里面的数据类型
-                        dtypes[db_name][collection_name].update(self.datas[db_name][collection_name])
-                        # 将 dtypes 更新进去,使 self.datas 包含新旧信息
-                        self.datas[db_name][collection_name].update(dtypes[db_name][collection_name])
-                    else:  # 新数据优先
-                        self.datas[db_name][collection_name].update(dtypes[db_name][collection_name])
-                else:
-                    if is_file_dtype:  # 旧数据优先
-                        dtypes[db_name].update(self.datas[db_name])
-                        self.datas[db_name].update(dtypes[db_name])
-                    else:
-                        self.datas[db_name].update(dtypes[db_name])
-            else:
-                # dtypes.update(self.datas)  # 可以注释掉, 因为旧数据 self.datas 是空的
-                self.datas.update(dtypes)
-        dbs = 0
-        collections = 0
-        cols = 0
-        # self.datas.pop('json统计')
-        for k, v in self.datas.items():
-            if k == 'json统计':
-                continue
-            dbs += 1
-            for d, j in v.items():
-                collections += 1
-                for t, p in j.items():
-                    cols += 1
-        tips = {'json统计': {'数据库量': dbs, '集合数量': collections, '字段量': cols}}
-        self.datas.update(tips)
-        return self.datas[db_name][collection_name]  # 返回 df 的 dtypes
-
-    def as_json_file(self):
-        """ 保存为本地 json 文件 """
-        self.datas = {k: 'null' if v is None else v for k, v in self.datas.items()}  # 替换字典中,值存在空值的值
-        self.datas = {k if k != None else 'null': v for k, v in self.datas.items()}  # 替换字典中,键存在空值的键
-        if 'null' in str(self.datas):
-            print(f'self.datas 数据中存在空值,可能有未匹配的数据库名或数据表名,请检查 《标题对照表.csv》,已取消写入 df_types.json ')
-            print('self.datas: ', self.datas)
-            return
-        with open(self.json_file, 'w', encoding='utf-8_sig') as f:
-            json.dump(
-                self.datas,
-                f,
-                ensure_ascii=False,  # 默认True,非ASCII字符将被转义。如为False,则非ASCII字符会以\uXXXX输出
-                sort_keys=True,  # 默认为False。如果为True,则字典的输出将按键排序。
-                indent=4,
-            )
-        time.sleep(1)
-
-    def df_dtypes_to_json(self, db_name, collection_name, path, df=pd.DataFrame(), is_file_dtype=True):
-        if len(df) == 0:
-            return
-        cv = converter.DataFrameConverter()
-        df = cv.convert_df_cols(df=df)  # 清理 dataframe 列名的不合规字符
-        dtypes = df.dtypes.apply(str).to_dict()
-        dtypes = {'dataframe': {db_name: {collection_name: dtypes}}}
-        self.dtypes_to_json(dtypes=dtypes, cl='dataframe', db_name=db_name, collection_name=collection_name, path=path, is_file_dtype=is_file_dtype)
-
-    def load_dtypes(self, db_name, collection_name):
-        if db_name in list(self.datas.keys()):
-            if collection_name in list(self.datas[db_name].keys()):
-                return self.datas[db_name][collection_name]
-            else:
-                print(f'不存在的集合名信息: {collection_name}, 文件位置: {self.json_file}')
-                return {}
-        else:
-            print(f'不存在的数据库信息: {db_name}, 文件位置: {self.json_file}')
-            return {}
-
-
-def update_df_types_to_json(file, db_name, collection_name, is_file_dtype=True):
-    """ 更新一个文件的 dtype 信息到 json 文件 """
-    df = pd.read_csv(file, encoding='utf-8_sig', header=0, na_filter=False)
-    df_to_json = DataTypes()
-    df_to_json.get_df_types(
-        df=df,
-        db_name=db_name,
-        collection_name=collection_name,
-        is_file_dtype=is_file_dtype,  # 日常需开启文件优先, 正常不要让新文件修改 json 已有的类型
-    )
-    df_to_json.as_json_file()
-    print(f'json文件已存储: {df_to_json.json_file}')
-
-
-def test_load_dtypes(db_name, collection_name):
-    d = DataTypes()
-    res = d.load_dtypes(db_name=db_name, collection_name=collection_name)
-    print(res)
-
-
-if __name__ == '__main__':
-    file = '/Users/xigua/数据中心/pandas数据源/店铺日报.csv'
-    update_df_types_to_json(
-        file=file,
-        db_name='pandas数据源',
-        collection_name='店铺日报',
-        is_file_dtype=True,
-    )
-    # test_load_dtypes(db_name='pandas数据源', collection_name='店铺日报')
-
-
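Note: the deleted DataTypes class essentially snapshotted a DataFrame's column dtypes into a nested {db: {collection: {column: dtype}}} JSON file. A minimal sketch of that core bookkeeping (a standalone reconstruction, not the removed API):

    import json
    import pandas as pd

    def dump_df_dtypes(df: pd.DataFrame, db_name: str, collection: str, json_file: str) -> None:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                datas = json.load(f)
        except FileNotFoundError:
            datas = {}
        # Merge this frame's dtypes (as strings) into the nested structure.
        datas.setdefault(db_name, {}).setdefault(collection, {}).update(df.dtypes.apply(str).to_dict())
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(datas, f, ensure_ascii=False, sort_keys=True, indent=4)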