mdbq 3.3.4__py3-none-any.whl → 3.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdbq/aggregation/query_data.py CHANGED
@@ -1253,8 +1253,8 @@ class MysqlDatasQuery:
             '三级来源索引': 'smallint',
         }
         # df.to_csv('/Users/xigua/Downloads/ll.csv', index=False, header=True, encoding='utf-8_sig')
-        min_date = df['日期'].min()
-        max_date = df['日期'].max()
+        min_date = df['日期'].min().strftime("%Y-%m-%d")
+        max_date = df['日期'].max().strftime("%Y-%m-%d")
         now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         print(f'{now} 正在更新: mysql ({host}:{port}) {db_name}/{table_name} -> {min_date}~{max_date}')
         m_engine.df_to_mysql(
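Note on the change above: `df['日期'].min()` on a datetime64 column returns a pandas Timestamp, whose f-string rendering includes a time-of-day component; `.strftime("%Y-%m-%d")` pins the log line to a bare date. A minimal sketch with hypothetical data (it assumes the 日期 column is already datetime64 — on a plain string column, `.min()` would return a str with no `.strftime`):

    import pandas as pd

    df = pd.DataFrame({'日期': pd.to_datetime(['2025-01-03', '2025-01-01'])})
    print(f"{df['日期'].min()}")                      # 2025-01-01 00:00:00
    print(df['日期'].min().strftime("%Y-%m-%d"))      # 2025-01-01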
mdbq/mongo/mongo.py CHANGED
@@ -8,11 +8,23 @@ import pandas as pd
 import numpy as np
 import pymongo
 from functools import wraps
+import socket
+import platform
 from concurrent.futures import ThreadPoolExecutor
-from mdbq.config import get_myconf
+from mdbq.config import myconfig
 from mdbq.dataframe import converter
 
 warnings.filterwarnings('ignore')
+if socket.gethostname() == 'company' or socket.gethostname() == 'Mac2.local':
+    conf = myconfig.main()
+    conf_data = conf['Windows']['xigua_lx']['mysql']['remoto']
+    username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
+        'port']
+else:
+    conf = myconfig.main()
+    conf_data = conf['Windows']['company']['mysql']['remoto']
+    username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
+        'port']
 
 
 def rename_col(username, password, host, db_name, collection_name, old_name, new_name, port: int = 27017,):
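The hostname-keyed credential lookup above is pasted again inside `upload_one_dir()` in the next hunk. A hedged sketch of the same selection logic factored into one helper — `_remote_mysql_conf` is a hypothetical name, while the `conf['Windows'][...]['mysql']['remoto']` layout (including the 'remoto' spelling) is taken verbatim from the diff:

    import socket
    from mdbq.config import myconfig

    def _remote_mysql_conf():
        # 'company' / 'Mac2.local' hosts read the xigua_lx profile; all others the company profile
        key = 'xigua_lx' if socket.gethostname() in ('company', 'Mac2.local') else 'company'
        conf_data = myconfig.main()['Windows'][key]['mysql']['remoto']
        return (conf_data['username'], conf_data['password'],
                conf_data['host'], conf_data['port'])

    username, password, host, port = _remote_mysql_conf()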
@@ -679,30 +691,21 @@ class OptimizeDatas:
 
 
 def upload_one_dir():
-    username, password, host, port = get_myconf.select_config_values(target_service='home_lx', database='mongodb')
+    if socket.gethostname() == 'company' or socket.gethostname() == 'Mac2.local':
+        conf = myconfig.main()
+        conf_data = conf['Windows']['xigua_lx']['mysql']['remoto']
+        username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
+            'port']
+    else:
+        conf = myconfig.main()
+        conf_data = conf['Windows']['company']['mysql']['remoto']
+        username, password, host, port = conf_data['username'], conf_data['password'], conf_data['host'], conf_data[
+            'port']
+
     p = UploadMongo(username=username, password=password, host=host, port=port, drop_duplicates=False)
     now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
    print(f'{now}数据处理中...')
 
-    p.db_name = ''
-    p.collection_name = f''
-    path = os.path.join('C:\\同步空间', 'BaiduSyncdisk', '原始文件2', r'京东报表', 'JD流量来源')
-
-    for root, dirs, files in os.walk(path, topdown=False):
-        for name in files:
-            if '按天_' not in name:
-                continue
-            if name.endswith('.csv') and 'baidu' not in name:
-                df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
-                for col in df.columns.tolist():
-                    if '日期' in col:
-                        df[col] = df[col].apply(lambda x: pd.to_datetime(x) if x else pd.to_datetime('2099-01-01'))
-                p.df_to_mongo(df=df)
-    if p.client:
-        p.client.close()
-    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
-    print(f'{now}数据完成!')
-
 
 
 def main():
     pass
@@ -710,7 +713,6 @@ def main():
 
 if __name__ == '__main__':
     # main()
-    username, password, host, port = get_myconf.select_config_values(target_service='home_lx', database='mongodb')
     print(username, password, host, port)
 
     # for db_name in [
mdbq/mysql/mysql.py CHANGED
@@ -128,7 +128,7 @@ class MysqlUpload:
 
         return wrapper
 
-    def keep_connect(self, _config, max_try: int=5):
+    def keep_connect(self, _db_name, _config, max_try: int=5):
         attempts = 1
         while attempts <= max_try:
             try:
@@ -137,8 +137,8 @@ class MysqlUpload:
             except Exception as e:
                 print(f'连接失败,正在重试: {attempts}/{max_try} {e}')
                 attempts += 1
-                time.sleep(10)
-        print(f'连接失败,重试次数超限')
+                time.sleep(20)
+        print(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
         return None
 
     def cover_doc_dtypes(self, dict_data):
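Both `keep_connect` implementations (here and the identical one in `OptimizeDatas` below) now take the database name purely for logging, and back off for 20s instead of 10s between attempts. The body of the `try:` is elided by the hunk; reading it as a plain `pymysql.connect(**_config)` is an assumption, suggested by the commented-out `pymysql.connect` calls at every call site. A self-contained sketch under that assumption:

    import time
    import pymysql

    def keep_connect(_db_name, _config, max_try: int = 5):
        attempts = 1
        while attempts <= max_try:
            try:
                return pymysql.connect(**_config)  # assumed connect call (elided in the hunk)
            except Exception as e:
                print(f'连接失败,正在重试: {attempts}/{max_try} {e}')
                attempts += 1
                time.sleep(20)                     # fixed pause, raised from 10s to 20s
        print(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
        return None

Every call-site hunk that follows is the same mechanical change: threading the relevant `db_name` (or `self.db_name`) into the new `_db_name` parameter.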
@@ -215,7 +215,7 @@ class MysqlUpload:
             return
 
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -236,7 +236,7 @@ class MysqlUpload:
 
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -382,7 +382,7 @@ class MysqlUpload:
                print(f'{table_name} 将数据按年/月保存(cut_data),但在转换日期时报错 -> {e}')
 
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -403,7 +403,7 @@ class MysqlUpload:
 
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -749,7 +749,7 @@ class MysqlUpload:
            [dtypes.update({k: inside_v}) for inside_k, inside_v in set_typ.items() if k == inside_k]
 
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -770,7 +770,7 @@ class MysqlUpload:
 
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -1049,7 +1049,7 @@ class MysqlUpload:
             print(f'未指定文件名: filename')
             return
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         # try:
@@ -1061,7 +1061,7 @@ class MysqlUpload:
             return
         self.config.update({'database': db_name})
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -1094,7 +1094,7 @@ class MysqlUpload:
         df = pd.DataFrame()
 
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         try:
@@ -1116,7 +1116,7 @@ class MysqlUpload:
         # 读取数据
         self.config.update({'database': db_name})
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         try:
@@ -1240,7 +1240,7 @@ class OptimizeDatas:
 
         return wrapper
 
-    def keep_connect(self, _config, max_try: int=5):
+    def keep_connect(self, _db_name, _config, max_try: int=5):
         attempts = 1
         while attempts <= max_try:
             try:
@@ -1249,8 +1249,8 @@ class OptimizeDatas:
             except Exception as e:
                 print(f'连接失败,正在重试: {attempts}/{max_try} {e}')
                 attempts += 1
-                time.sleep(10)
-        print(f'连接失败,重试次数超限')
+                time.sleep(20)
+        print(f'{_db_name}: 连接失败,重试次数超限,当前设定次数: {max_try}')
         return None
 
     def optimize_list(self):
@@ -1300,7 +1300,7 @@ class OptimizeDatas:
             # continue
             self.config.update({'database': self.db_name})  # 添加更新 config 字段
             # self.connection = pymysql.connect(**self.config)
-            self.connection = self.keep_connect(_config=self.config, max_try=5)
+            self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
             if not self.connection:
                 return
             with self.connection.cursor() as cursor:
@@ -1455,7 +1455,7 @@ class OptimizeDatas:
     def database_list(self):
         """ 获取所有数据库 """
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -1467,7 +1467,7 @@ class OptimizeDatas:
     def table_list(self, db_name):
         """ 获取指定数据库的所有数据表 """
         # connection = pymysql.connect(**self.config)  # 连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
         if not connection:
             return
         try:
@@ -1486,7 +1486,7 @@ class OptimizeDatas:
 
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)  # 重新连接数据库
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         with connection.cursor() as cursor:
@@ -1501,7 +1501,7 @@ class OptimizeDatas:
         """
         self.config.update({'database': db_name})  # 添加更新 config 字段
         # connection = pymysql.connect(**self.config)
-        connection = self.keep_connect(_config=self.config, max_try=5)
+        connection = self.keep_connect(_db_name=db_name, _config=self.config, max_try=5)
         if not connection:
             return
         try:
@@ -1537,7 +1537,7 @@ class OptimizeDatas:
         for key, table_name in table_dict.items():
             self.config.update({'database': self.db_name})  # 添加更新 config 字段
             # self.connection = pymysql.connect(**self.config)
-            self.connection = self.keep_connect(_config=self.config, max_try=5)
+            self.connection = self.keep_connect(_db_name=self.db_name, _config=self.config, max_try=5)
             if not self.connection:
                 return
             with self.connection.cursor() as cursor:
mdbq/mysql/recheck_mysql.py CHANGED
@@ -156,7 +156,7 @@ class ReCheckMysql:
 
 
 def recheck_csv():
-    path = '/Users/xigua/数据中心/原始文件2/推广报表34324234'
+    path = ''
     for root, dirs, files in os.walk(path, topdown=False):
         for name in files:
             if '~' in name or 'baidu' in name or 'Ds_' in name or 'xunlei' in name:
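With the hard-coded path blanked out, `recheck_csv()` becomes a no-op until a real directory is filled in: `os.walk` over a path that does not exist (the empty string included) simply yields nothing, and raises no error. Quick check:

    import os

    print(list(os.walk('')))           # [] -- zero iterations, no exception
    print(list(os.walk('/no/such')))   # [] as well; os.walk swallows the lookup error by default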
mdbq/spider/aikucun.py CHANGED
@@ -17,8 +17,6 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.service import Service
 from mdbq.config import set_support
 from selenium.webdriver.common.keys import Keys
-from mdbq.aggregation import aggregation
-from mdbq.clean import data_clean
 from mdbq.other import ua_sj
 from mdbq.mysql import mysql
 from mdbq.config import myconfig
@@ -46,7 +44,7 @@ else:
     D_PATH = str(pathlib.Path(f'/Users/{getpass.getuser()}/Downloads'))
     Share_Path = str(pathlib.Path('/Volumes/时尚事业部/01.运营部/天猫报表'))  # 共享文件根目录
     Source_Path = str(pathlib.Path(Data_Path, '原始文件2'))
-    upload_path = os.path.join(D_PATH, '数据上传中心')  # 此目录位于下载文件夹
+    upload_path = os.path.join(D_PATH, '数据上传中心', '爱库存')  # 此目录位于下载文件夹
 
 m_engine = mysql.MysqlUpload(username='', password='', host='', port=0, charset='utf8mb4')
 company_engine = mysql.MysqlUpload(username='', password='', host='', port=0, charset='utf8mb4')
@@ -458,26 +456,6 @@ class AikuCun:
 def akucun(headless=True, date_num=10):
     akc = AikuCun()
     akc.get_data(shop_name='aikucun', date_num=date_num, headless=headless)  # 获取最近 N 天数据,0表示今天
-    # akc.clean_data()
-
-    # # 新版 数据分类
-    # dp = aggregation.DatabaseUpdate(path=upload_path)
-    # dp.new_unzip(is_move=True)
-    # dp.cleaning(is_move=False, is_except=['临时文件'])  # 清洗数据, 存入 self.datas, 不需要立即移除文件,仍保留文件到原始文件中
-    # # 将 self.datas 更新至数据库
-    # dp.upload_df(service_databases=[
-    #     # {'home_lx': 'mongodb'},
-    #     # {'home_lx': 'mysql'},
-    #     {'company': 'mysql'},
-    #     # {'nas': 'mysql'},
-    # ])
-    # # 数据分类
-    # c = data_clean.DataClean(path=upload_path, source_path=Source_Path)
-    # c.set_up_to_mogo = False  # 不再使用 data_clean 更新数据库,改为 aggregation.py
-    # c.set_up_to_mysql = False  # 不再使用 data_clean 更新数据库,改为 aggregation.py
-    # c.new_unzip(is_move=True, )  # 解压文件
-    # c.change_and_sort(is_except=['临时文件'])
-    # c.move_all(is_except=['临时文件'])  # 移到文件到原始文件夹
 
 
 class AikuCunNew:
@@ -529,7 +507,7 @@ class AikuCunNew:
 
 if __name__ == '__main__':
     get_cookie_aikucun()  # 登录并获取 cookies
-    akucun(date_num=10, headless=True)  # 下载数据
+    akucun(date_num=5, headless=True)  # 下载数据
 
     # a = AikuCunNew(shop_name='aikucun')
     # a.akc()
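Per the inline comment on `get_data` ("获取最近 N 天数据,0表示今天" — fetch the last N days, 0 meaning today), dropping `date_num` from 10 to 5 halves the crawl window. A sketch of the implied date window; the exact iteration inside the spider is not shown in this diff, so the inclusive range is an assumption:

    import datetime

    date_num = 5                      # was 10
    today = datetime.date.today()
    # assuming date_num counts days back from today, with 0 meaning today only:
    window = [today - datetime.timedelta(days=i) for i in range(date_num + 1)]
    print(window[0], '...', window[-1])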
mdbq-3.3.4.dist-info/METADATA → mdbq-3.3.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mdbq
-Version: 3.3.4
+Version: 3.3.7
 Home-page: https://pypi.org/project/mdbq
 Author: xigua,
 Author-email: 2587125111@qq.com
mdbq-3.3.4.dist-info/RECORD → mdbq-3.3.7.dist-info/RECORD CHANGED
@@ -2,32 +2,23 @@ mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
 mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
 mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
 mdbq/aggregation/aggregation.py,sha256=-yzApnlqSN2L0E1YMu5ml-W827qpKQvWPCOI7jj2kzY,80264
-mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
-mdbq/aggregation/mysql_types.py,sha256=YTGyrF9vcRgfkQbpT-e-JdJ7c7VF1dDHgyx9YZRES8w,10934
 mdbq/aggregation/optimize_data.py,sha256=RXIv7cACCgYyehAxMjUYi_S7rVyjIwXKWMaM3nduGtA,3068
-mdbq/aggregation/query_data.py,sha256=4C9BmMUV4x1YvezIySO5Nh8tqGqg1z4GVnuYDGU5OJs,167595
+mdbq/aggregation/query_data.py,sha256=_5mnSFHV6xAFs_1YF_H2zMOdJeMavgga4lZQ_qpqxPQ,167637
 mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
 mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
-mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
-mdbq/clean/clean_upload.py,sha256=yMAb6tV9XHhFJbRrCOeaPfszApJ9y5M4-hQGuBSXNqE,67799
-mdbq/clean/data_clean.py,sha256=ucfslhqXVZoH2QaXHSAWDky0GhIvH9f4GeNaHg4SrFE,104790
-mdbq/company/__init__.py,sha256=qz8F_GsP_pMB5PblgJAUAMjasuZbOEp3qQOCB39E8f0,21
-mdbq/company/copysh.py,sha256=eFu6focRqm2Njn_XN1KW2ZYJiTv6EYgsdBCLokobyxQ,21572
 mdbq/config/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
-mdbq/config/get_myconf.py,sha256=cmNvsyoNa0RbZ9FOTjSd3jyyGwkxjUo0phvdHbGlrms,6010
 mdbq/config/myconfig.py,sha256=EGymTlAimtHIDJ9egCtOehBEPOj6rea504kvsEZu64o,854
 mdbq/config/products.py,sha256=Ab6eaAUMUtjRL8z9NvYukyCjp3nAi4OYISY_IdPhAJ0,6279
 mdbq/config/set_support.py,sha256=xkZCX6y9Bq1ppBpJAofld4B2YtchA7fl0eT3dx3CrSI,777
-mdbq/config/update_conf.py,sha256=taL3ZqKgiVWwUrDFuaYhim9a72Hm4BHRhhDscJTziR8,4535
 mdbq/dataframe/__init__.py,sha256=2HtCN8AdRj53teXDqzysC1h8aPL-mMFy561ESmhehGQ,22
 mdbq/dataframe/converter.py,sha256=lETYhT7KXlWzWwqguqhk6vI6kj4rnOBEW1lhqKy2Abc,5035
 mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
 mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
 mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
-mdbq/mongo/mongo.py,sha256=v9qvrp6p1ZRWuPpbSilqveiE0FEcZF7U5xUPI0RN4xs,31880
+mdbq/mongo/mongo.py,sha256=M9DUeUCMPDngkwn9-ui0uTiFrvfNU1kLs22s5SmoNm0,31899
 mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
-mdbq/mysql/mysql.py,sha256=pTeZD0KgVNJnRLaMXP11wz8lBBq2_xkAqIuoRmnEMWA,85259
-mdbq/mysql/recheck_mysql.py,sha256=rgTpvDMWYTyEn7UQdlig-pdXDluTgiU8JG6lkMh8DV0,8665
+mdbq/mysql/mysql.py,sha256=ZK6E-idQWrURtoimc0uG8B1tnhtSFQXDJHfu8sWeJg4,85675
+mdbq/mysql/recheck_mysql.py,sha256=ppBTfBLgkRWirMVZ31e_ZPULiGPJU7K3PP9G6QBZ3QI,8605
 mdbq/mysql/s_query.py,sha256=MbIprZ4yJDAZ9AahZPzl7hqS695Vs0P-AJNwAtA_EEc,9287
 mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
 mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
@@ -40,11 +31,9 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
 mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
 mdbq/pbix/refresh_all.py,sha256=OBT9EewSZ0aRS9vL_FflVn74d4l2G00wzHiikCC4TC0,5926
 mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,7192
-mdbq/req_post/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
-mdbq/req_post/req_tb.py,sha256=qg7pet73IgKGmCwxaeUyImJIoeK_pBQT9BBKD7fkBNg,36160
 mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
-mdbq/spider/aikucun.py,sha256=BKVa0xbTkyhIH5kQgOdyPDtwFPScbMNAFYup_-fFF9Y,24809
-mdbq-3.3.4.dist-info/METADATA,sha256=5HCgLOtVbO2JXkq0HhNkTOo2wqWf4MFVh78NWT_ruPk,243
-mdbq-3.3.4.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-mdbq-3.3.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
-mdbq-3.3.4.dist-info/RECORD,,
+mdbq/spider/aikucun.py,sha256=UFY-TwlvquEYK58rTdRuv5Wx3KA21m-bIrwvvfPRyOk,23749
+mdbq-3.3.7.dist-info/METADATA,sha256=aMT3CEx_q-0vG6CncxkAXv1PPk2RsieIZxrkYyk1jPE,243
+mdbq-3.3.7.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+mdbq-3.3.7.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
+mdbq-3.3.7.dist-info/RECORD,,
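For reference, the `sha256=...` values in RECORD follow the wheel spec (PEP 427 / PEP 376): the raw SHA-256 digest, urlsafe-base64-encoded with the `=` padding stripped, followed by the file size in bytes. A small checker sketch (the file path is illustrative):

    import base64
    import hashlib
    import os

    def record_entry(path):
        digest = hashlib.sha256(open(path, 'rb').read()).digest()
        h = base64.urlsafe_b64encode(digest).rstrip(b'=').decode('ascii')
        return f'{path},sha256={h},{os.path.getsize(path)}'

    # Run against an unpacked 3.3.7 wheel, this should reproduce e.g. the
    # 'mdbq/mysql/mysql.py,sha256=ZK6E-idQ...,85675' line above.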
mdbq/aggregation/df_types.py DELETED
@@ -1,188 +0,0 @@
-# -*- coding:utf-8 -*-
-import warnings
-import pandas as pd
-import numpy as np
-import chardet
-import zipfile
-
-from numpy import dtype
-from pandas.tseries.holiday import next_monday
-from pyzipper import PyZipFile
-import os
-import platform
-import json
-import pymysql
-from mdbq.mongo import mongo
-from mdbq.mysql import mysql
-from mdbq.mysql import s_query
-from mdbq.config import get_myconf
-from mdbq.config import set_support
-from mdbq.dataframe import converter
-import datetime
-import time
-import re
-import shutil
-import getpass
-
-from sqlalchemy.dialects.postgresql.pg_catalog import pg_get_serial_sequence
-
-warnings.filterwarnings('ignore')
-"""
-1. 记录 dataframe 或者数据库的列信息(dtypes)
-2. 更新 mysql 中所有数据库的 dtypes 信息到本地 json
-"""
-
-
-class DataTypes:
-    """
-    数据简介: 记录 dataframe 或者数据库的列信息(dtypes),可以记录其信息或者加载相关信息用于入库使用,
-    第一字段为分类(如 dataframe/mysql),第二字段为数据库名,第三字段为集合名,第四段列名及其数据类型
-    """
-    def __init__(self, path=None):
-        self.datas = {
-            "json统计":
-                {
-                    "字段量": 0,
-                    "数据库量": 0,
-                    "集合数量": 0
-                }
-        }
-        self.path = path
-        if not self.path:
-            self.path = set_support.SetSupport(dirname='support').dirname
-        self.json_file = os.path.join(self.path, 'df_types.json')
-        if not os.path.isdir(self.path):
-            os.makedirs(self.path)
-        if not os.path.isfile(self.json_file):
-            with open(self.json_file, 'w', encoding='utf-8_sig') as f:
-                json.dump(self.datas, f, ensure_ascii=False, sort_keys=True, indent=4)
-        self.json_before()
-
-    def json_before(self):
-        """ 本地 json 文件的 dtypes 信息, 初始化更新给 self.datas """
-        with open(self.json_file, 'r', encoding='utf-8_sig') as f:
-            json_ = json.load(f)
-        self.datas.update(json_)
-
-    def get_df_types(self, db_name, collection_name, df=pd.DataFrame(), is_file_dtype=True):
-        """
-        读取 df 的 dtypes, 并更新本地 json 文件
-        期间会 清理不合规的列名, 并对数据类型进行转换(尝试将 object 类型转为 int 或 float)
-        返回: df 的 dtypes, 后续使用示例: df = df.astype(dtypes, errors='ignore')
-        is_file_dtype=True: 默认情况下以旧 json 优先, 即允许手动指定 json 文件里面的数据类型
-        """
-        if len(df) == 0:
-            return
-        cv = converter.DataFrameConverter()
-        df = cv.convert_df_cols(df=df)  # 清理 dataframe 非法值
-        dtypes = df.dtypes.apply(str).to_dict()
-        dtypes = {db_name: {collection_name: dtypes}}
-
-        if not self.datas:  # 如果不存在本地 json 文件, 直接返回即可
-            self.datas.update(dtypes)
-            return self.datas[db_name][collection_name]
-        else:  # 存在则读取,并更新 df 的 dtypes
-            if db_name in list(self.datas.keys()):  # ['京东数据2', '推广数据2', '生意参谋2', '生意经2']
-                if collection_name in list(self.datas[db_name].keys()):
-                    if is_file_dtype:  # 旧数据优先
-                        # # 用 dtypes 更新, 允许手动指定 json 文件里面的数据类型
-                        dtypes[db_name][collection_name].update(self.datas[db_name][collection_name])
-                        # 将 dtypes 更新进去,使 self.datas 包含新旧信息
-                        self.datas[db_name][collection_name].update(dtypes[db_name][collection_name])
-                    else:  # 新数据优先
-                        self.datas[db_name][collection_name].update(dtypes[db_name][collection_name])
-                else:
-                    if is_file_dtype:  # 旧数据优先
-                        dtypes[db_name].update(self.datas[db_name])
-                        self.datas[db_name].update(dtypes[db_name])
-                    else:
-                        self.datas[db_name].update(dtypes[db_name])
-            else:
-                # dtypes.update(self.datas)  # 可以注释掉, 因为旧数据 self.datas 是空的
-                self.datas.update(dtypes)
-        dbs = 0
-        collections = 0
-        cols = 0
-        # self.datas.pop('json统计')
-        for k, v in self.datas.items():
-            if k == 'json统计':
-                continue
-            dbs += 1
-            for d, j in v.items():
-                collections += 1
-                for t, p in j.items():
-                    cols += 1
-        tips = {'json统计': {'数据库量': dbs, '集合数量': collections, '字段量': cols}}
-        self.datas.update(tips)
-        return self.datas[db_name][collection_name]  # 返回 df 的 dtypes
-
-    def as_json_file(self):
-        """ 保存为本地 json 文件 """
-        self.datas = {k: 'null' if v is None else v for k, v in self.datas.items()}  # 替换字典中,值存在空值的值
-        self.datas = {k if k != None else 'null': v for k, v in self.datas.items()}  # 替换字典中,键存在空值的键
-        if 'null' in str(self.datas):
-            print(f'self.datas 数据中存在空值,可能有未匹配的数据库名或数据表名,请检查 《标题对照表.csv》,已取消写入 df_types.json ')
-            print('self.datas: ', self.datas)
-            return
-        with open(self.json_file, 'w', encoding='utf-8_sig') as f:
-            json.dump(
-                self.datas,
-                f,
-                ensure_ascii=False,  # 默认True,非ASCII字符将被转义。如为False,则非ASCII字符会以\uXXXX输出
-                sort_keys=True,  # 默认为False。如果为True,则字典的输出将按键排序。
-                indent=4,
-            )
-        time.sleep(1)
-
-    def df_dtypes_to_json(self, db_name, collection_name, path, df=pd.DataFrame(), is_file_dtype=True):
-        if len(df) == 0:
-            return
-        cv = converter.DataFrameConverter()
-        df = cv.convert_df_cols(df=df)  # 清理 dataframe 列名的不合规字符
-        dtypes = df.dtypes.apply(str).to_dict()
-        dtypes = {'dataframe': {db_name: {collection_name: dtypes}}}
-        self.dtypes_to_json(dtypes=dtypes, cl='dataframe', db_name=db_name, collection_name=collection_name, path=path, is_file_dtype=is_file_dtype)
-
-    def load_dtypes(self, db_name, collection_name):
-        if db_name in list(self.datas.keys()):
-            if collection_name in list(self.datas[db_name].keys()):
-                return self.datas[db_name][collection_name]
-            else:
-                print(f'不存在的集合名信息: {collection_name}, 文件位置: {self.json_file}')
-                return {}
-        else:
-            print(f'不存在的数据库信息: {db_name}, 文件位置: {self.json_file}')
-            return {}
-
-
-def update_df_types_to_json(file, db_name, collection_name, is_file_dtype=True):
-    """ 更新一个文件的 dtype 信息到 json 文件 """
-    df = pd.read_csv(file, encoding='utf-8_sig', header=0, na_filter=False)
-    df_to_json = DataTypes()
-    df_to_json.get_df_types(
-        df=df,
-        db_name=db_name,
-        collection_name=collection_name,
-        is_file_dtype=is_file_dtype,  # 日常需开启文件优先, 正常不要让新文件修改 json 已有的类型
-    )
-    df_to_json.as_json_file()
-    print(f'json文件已存储: {df_to_json.json_file}')
-
-
-def test_load_dtypes(db_name, collection_name):
-    d = DataTypes()
-    res = d.load_dtypes(db_name=db_name, collection_name=collection_name)
-    print(res)
-
-
-if __name__ == '__main__':
-    file = '/Users/xigua/数据中心/pandas数据源/店铺日报.csv'
-    update_df_types_to_json(
-        file=file,
-        db_name='pandas数据源',
-        collection_name='店铺日报',
-        is_file_dtype=True,
-    )
-    # test_load_dtypes(db_name='pandas数据源', collection_name='店铺日报')
-
-
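The heart of the deleted `get_df_types()` was a precedence rule between dtypes already recorded in df_types.json and dtypes read off a fresh dataframe. Distilled to plain dict merges (hypothetical values):

    old = {'日期': 'datetime64[ns]'}                # what df_types.json already held
    new = {'日期': 'object', '销售额': 'float64'}    # dtypes read from the incoming dataframe

    # is_file_dtype=True (the default): existing json types win on overlap, new columns are added
    file_first = {**new, **old}    # {'日期': 'datetime64[ns]', '销售额': 'float64'}
    # is_file_dtype=False: the fresh dataframe overrides
    df_first = {**old, **new}      # {'日期': 'object', '销售额': 'float64'}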