mdbq-1.2.2-py3-none-any.whl → mdbq-1.2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/aggregation/aggregation.py +24 -2
- mdbq/aggregation/mysql_types.py +8 -2
- mdbq/dataframe/converter.py +5 -2
- mdbq/mysql/mysql.py +49 -12
- {mdbq-1.2.2.dist-info → mdbq-1.2.4.dist-info}/METADATA +1 -1
- {mdbq-1.2.2.dist-info → mdbq-1.2.4.dist-info}/RECORD +8 -8
- {mdbq-1.2.2.dist-info → mdbq-1.2.4.dist-info}/WHEEL +1 -1
- {mdbq-1.2.2.dist-info → mdbq-1.2.4.dist-info}/top_level.txt +0 -0
mdbq/aggregation/aggregation.py
CHANGED
@@ -438,6 +438,7 @@ class DatabaseUpdate:
|
|
438
438
|
new_name = f'未分类_{date1}_全部渠道_商品明细.csv'
|
439
439
|
df.rename(columns={'商品ID': '商品id'}, inplace=True)
|
440
440
|
df.insert(loc=0, column='日期', value=date1)
|
441
|
+
df['最近上架时间'].loc[0] = df['最近上架时间'].loc[1] # 填充这一列, 避免上传 mysql 日期类型报错
|
441
442
|
if 'sku' in new_name:
|
442
443
|
db_name = '京东数据2'
|
443
444
|
collection_name = 'sku_商品明细'
|
@@ -929,7 +930,7 @@ def one_file_to_mysql(file, db_name, table_name, target_service, database):
|
|
929
930
|
filename = os.path.basename(file)
|
930
931
|
df = pd.read_csv(file, encoding='utf-8_sig', header=0, na_filter=False, float_precision='high')
|
931
932
|
m = mysql.MysqlUpload(username=username, password=password, host=host, port=port)
|
932
|
-
m.df_to_mysql(df=df, db_name=db_name, table_name=table_name, filename=filename)
|
933
|
+
m.df_to_mysql(df=df, db_name=db_name, table_name=table_name, filename=filename, df_sql=True)
|
933
934
|
|
934
935
|
|
935
936
|
def file_dir(one_file=True):
|
@@ -974,6 +975,7 @@ def file_dir(one_file=True):
|
|
974
975
|
path=os.path.join(path, sub_path),
|
975
976
|
db_name = db_name,
|
976
977
|
collection_name = table_name,
|
978
|
+
dbs={'mysql': True, 'mongodb': True},
|
977
979
|
)
|
978
980
|
data.update({'入库进度': 1}) # 更新进度为已上传
|
979
981
|
# 将进度信息写回文件
|
@@ -981,7 +983,27 @@ def file_dir(one_file=True):
|
|
981
983
|
df.to_csv(os.path.join(support_file, filename), encoding='utf-8_sig', index=False, header=True)
|
982
984
|
|
983
985
|
|
986
|
+
def test():
|
987
|
+
path = '/Users/xigua/数据中心/原始文件2/京东报表/JD商品明细spu'
|
988
|
+
for root, dirs, files in os.walk(path, topdown=False):
|
989
|
+
for name in files:
|
990
|
+
if name.endswith('.csv') and 'baidu' not in name and '~' not in name:
|
991
|
+
df = pd.read_csv(os.path.join(root, name), encoding='utf-8_sig', header=0, na_filter=False)
|
992
|
+
df['最近上架时间'].loc[0] = df['最近上架时间'].loc[1]
|
993
|
+
# print(df[['日期', '最近上架时间']])
|
994
|
+
df.to_csv(os.path.join(root, name), encoding='utf-8_sig', index=False, header=True)
|
995
|
+
# break
|
996
|
+
# break
|
997
|
+
|
998
|
+
|
984
999
|
if __name__ == '__main__':
|
985
1000
|
# username, password, host, port = get_myconf.select_config_values(target_service='nas', database='mysql')
|
986
1001
|
# print(username, password, host, port)
|
987
|
-
file_dir(one_file=
|
1002
|
+
file_dir(one_file=False)
|
1003
|
+
# one_file_to_mysql(
|
1004
|
+
# file='/Users/xigua/数据中心/原始文件2/京东报表/商品信息导出/spu/京东商品信息_批量SPU导出-批量任务_2024-08-10.csv',
|
1005
|
+
# db_name='京东数据2',
|
1006
|
+
# table_name='京东spu商品信息',
|
1007
|
+
# target_service='home_lx',
|
1008
|
+
# database='mysql'
|
1009
|
+
# )
|
mdbq/aggregation/mysql_types.py
CHANGED
@@ -123,6 +123,7 @@ class DataTypes:
|
|
123
123
|
sort_keys=True, # 默认为False。如果为True,则字典的输出将按键排序。
|
124
124
|
indent=4,
|
125
125
|
)
|
126
|
+
print(f'已更新 json 文件: {self.json_file}')
|
126
127
|
time.sleep(1)
|
127
128
|
|
128
129
|
def load_dtypes(self, db_name, table_name, cl='mysql', ):
|
@@ -205,6 +206,7 @@ def mysql_all_dtypes(db_name=None, table_name=None, path=None):
|
|
205
206
|
time.sleep(0.5)
|
206
207
|
|
207
208
|
d = DataTypes()
|
209
|
+
d.json_file = os.path.join(path, 'mysql_types.json') # # json 保存位置
|
208
210
|
for result in results:
|
209
211
|
for db_n, table_n in result.items():
|
210
212
|
# print(db_n, table_n, db_name, table_name)
|
@@ -229,12 +231,16 @@ def mysql_all_dtypes(db_name=None, table_name=None, path=None):
|
|
229
231
|
cl='mysql',
|
230
232
|
db_name=db_n,
|
231
233
|
table_name=table_n,
|
232
|
-
is_file_dtype=True
|
234
|
+
is_file_dtype=True # True表示旧文件优先
|
233
235
|
)
|
234
236
|
else:
|
235
237
|
print(f'数据库回传数据(name_type)为空')
|
236
238
|
# print(d.datas)
|
237
239
|
d.as_json_file()
|
238
240
|
|
241
|
+
|
239
242
|
if __name__ == '__main__':
|
240
|
-
|
243
|
+
# 更新 mysql 中所有数据库的 dtypes 信息到本地 json
|
244
|
+
mysql_all_dtypes(
|
245
|
+
path='/Users/xigua/Downloads',
|
246
|
+
)
|
mdbq/dataframe/converter.py
CHANGED
@@ -43,11 +43,14 @@ class DataFrameConverter(object):
|
|
43
43
|
df[col] = df[col].apply(lambda x: float(float((str(x).rstrip("%"))) / 100) if str(x).endswith('%') and '~' not in str(x) else x)
|
44
44
|
# 尝试转换合适的数据类型
|
45
45
|
if df[col].dtype == 'object':
|
46
|
+
# "_"符号会被错误识别
|
46
47
|
try:
|
47
|
-
df[col] = df[col].apply(lambda x: int(x) if '_' not in str(x) else x)
|
48
|
+
df[col] = df[col].apply(lambda x: int(x) if '_' not in str(x) and '.' not in str(x) else x) # 不含小数点尝试转整数
|
48
49
|
except:
|
50
|
+
pass
|
51
|
+
if df[col].dtype == 'object':
|
49
52
|
try:
|
50
|
-
df[col] = df[col].apply(lambda x: float(x) if '_' not in str(x) else x)
|
53
|
+
df[col] = df[col].apply(lambda x: float(x) if '.' in str(x) and '_' not in str(x) else x)
|
51
54
|
except:
|
52
55
|
pass
|
53
56
|
if df[col].dtype == 'float' or df[col].dtype == 'float64': # 对于小数类型, 保留 6 位小数
|
mdbq/mysql/mysql.py
CHANGED
@@ -9,7 +9,9 @@ import warnings
|
|
9
9
|
import pymysql
|
10
10
|
import numpy as np
|
11
11
|
import pandas as pd
|
12
|
+
import sqlalchemy.types
|
12
13
|
from more_itertools.more import iequals
|
14
|
+
from pandas.core.dtypes.common import INT64_DTYPE
|
13
15
|
from sqlalchemy import create_engine
|
14
16
|
import os
|
15
17
|
import calendar
|
@@ -20,18 +22,20 @@ from mdbq.aggregation import mysql_types
|
|
20
22
|
|
21
23
|
warnings.filterwarnings('ignore')
|
22
24
|
"""
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
25
|
+
建表流程:
|
26
|
+
尽可能手动建表,再上传数据
|
27
|
+
1. 每个表手动上传一个文件建表
|
28
|
+
2. 全部建表完成,建议检查所有表的数据类型,有问题的在数据库修改
|
29
|
+
3. 清空所有数据表,仅保留列信息
|
30
|
+
4. 下载所有数据表的 dtypes 信息到 json 文件
|
31
|
+
5. 之后可以正常上传数据
|
29
32
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
33
|
+
建表规范:
|
34
|
+
1. 数据库和数据表名如果有字母,必须使用小写,大写在建库后会自动变小写,再次上传数据会找不到数据库(macos和linux都有这种情况)
|
35
|
+
2. 无论是数据库/表/列名还是值,尽量避免特殊字符或者表情符号,数据库/表/列名尽量都使用 `列名` 转义,避免错误
|
36
|
+
3. 小数必须使用 decimal, 禁止 float 和 double, 因为计算精度差异,后续需要聚合数据时会引发很多问题
|
37
|
+
4. 日期类型暂时全部用 DATETIME,使用 DATE 在后续可能会重复插入不能排重,因为 df 进来的数据, 日期是带时间的,而数据库中日期不含时间
|
38
|
+
5. 目前小数自动适配类型转换,对于文本或者大数全部用 mediumtext, 因为部分表涉及爬虫数据,进来的字符长度未知,暂时统一 mediumtext 避免入库失败
|
35
39
|
|
36
40
|
|
37
41
|
|
@@ -56,7 +60,7 @@ class MysqlUpload:
|
|
56
60
|
}
|
57
61
|
self.filename = None
|
58
62
|
|
59
|
-
def df_to_mysql(self, df, table_name, db_name='远程数据源', drop_dup=True, drop_duplicates=False, filename=None, count=None):
|
63
|
+
def df_to_mysql(self, df, table_name, db_name='远程数据源', df_sql=False, drop_dup=True, drop_duplicates=False, filename=None, count=None):
|
60
64
|
"""
|
61
65
|
将 df 写入数据库
|
62
66
|
db_name: 数据库名称
|
@@ -73,9 +77,23 @@ class MysqlUpload:
|
|
73
77
|
else:
|
74
78
|
print(f'{db_name}: {table_name} 传入的 df 不是有效的 dataframe 结构, {self.filename}')
|
75
79
|
return
|
80
|
+
|
76
81
|
cv = converter.DataFrameConverter()
|
77
82
|
df = cv.convert_df_cols(df=df) # 清理 dataframe 非法值
|
78
83
|
|
84
|
+
# if df_sql:
|
85
|
+
# now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
|
86
|
+
# print(f'{now}正在更新 mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count},{self.filename}')
|
87
|
+
# engine = create_engine(f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
|
88
|
+
# df.to_sql(
|
89
|
+
# name=table_name,
|
90
|
+
# con=engine,
|
91
|
+
# if_exists='append',
|
92
|
+
# index=False,
|
93
|
+
# chunksize=1000,
|
94
|
+
# dtype={'京东价': 'INT'},
|
95
|
+
# )
|
96
|
+
# return
|
79
97
|
connection = pymysql.connect(**self.config) # 连接数据库
|
80
98
|
with connection.cursor() as cursor:
|
81
99
|
cursor.execute(f"SHOW DATABASES LIKE '{db_name}'") # 检查数据库是否存在
|
@@ -132,6 +150,22 @@ class MysqlUpload:
|
|
132
150
|
print(f'{self.filename}: {e}')
|
133
151
|
connection.commit() # 提交事务
|
134
152
|
|
153
|
+
if df_sql:
|
154
|
+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
|
155
|
+
print(f'{now}正在更新 mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count},{self.filename}')
|
156
|
+
engine = create_engine(
|
157
|
+
f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{db_name}") # 创建数据库引擎
|
158
|
+
df.to_sql(
|
159
|
+
name=table_name,
|
160
|
+
con=engine,
|
161
|
+
if_exists='append',
|
162
|
+
index=False,
|
163
|
+
chunksize=1000
|
164
|
+
)
|
165
|
+
connection.close()
|
166
|
+
return
|
167
|
+
|
168
|
+
# print(cl, db_n, tb_n)
|
135
169
|
# 返回这些结果的目的是等添加完列再写 json 文件才能读到 types 信息
|
136
170
|
if cl and db_n and tb_n:
|
137
171
|
mysql_types.mysql_all_dtypes(db_name=db_name, table_name=table_name) # 更新一个表的 dtypes
|
@@ -152,6 +186,7 @@ class MysqlUpload:
|
|
152
186
|
# 5. 更新插入数据
|
153
187
|
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ")
|
154
188
|
print(f'{now}正在更新 mysql ({self.host}:{self.port}) {db_name}/{table_name}, {count},{self.filename}')
|
189
|
+
|
155
190
|
datas = df.to_dict(orient='records')
|
156
191
|
for data in datas:
|
157
192
|
try:
|
@@ -183,6 +218,7 @@ class MysqlUpload:
|
|
183
218
|
print(f'mysql -> df_to_mysql 报错: {e}, {self.filename}')
|
184
219
|
# breakpoint()
|
185
220
|
connection.commit() # 提交事务
|
221
|
+
connection.close()
|
186
222
|
|
187
223
|
def convert_dtypes(self, df, db_name, table_name):
|
188
224
|
"""
|
@@ -208,6 +244,7 @@ class MysqlUpload:
|
|
208
244
|
col_not_exist = cols
|
209
245
|
# 对文件不存在的列信息进行数据类型转换(按指定规则)
|
210
246
|
dtypes.update({col: self.convert_dtype_to_sql(df=df, col=col, dtype=df[col].dtype) for col in col_not_exist})
|
247
|
+
# print(dtypes)
|
211
248
|
# 至此 df 中全部列类型已经转换完成
|
212
249
|
# 返回结果, 示例: {'上市年份': 'mediumtext', '商品id': 'mediumtext', '平台': 'mediumtext'}
|
213
250
|
return dtypes, cl, db_n, tb_n # 返回这些结果的目的是等添加完列再写 json 文件才能读到 types 信息
|
@@ -1,9 +1,9 @@
|
|
1
1
|
mdbq/__init__.py,sha256=Il5Q9ATdX8yXqVxtP_nYqUhExzxPC_qk_WXQ_4h0exg,16
|
2
2
|
mdbq/__version__.py,sha256=y9Mp_8x0BCZSHsdLT_q5tX9wZwd5QgqrSIENLrb6vXA,62
|
3
3
|
mdbq/aggregation/__init__.py,sha256=EeDqX2Aml6SPx8363J-v1lz0EcZtgwIBYyCJV6CcEDU,40
|
4
|
-
mdbq/aggregation/aggregation.py,sha256
|
4
|
+
mdbq/aggregation/aggregation.py,sha256=a-lK0R6fEqj5InZZXly-xqfyZ4mCDMqAuEHuTzKMvv0,57467
|
5
5
|
mdbq/aggregation/df_types.py,sha256=rHLIgv82PJSFmDvXkZyOJAffXkFyyMyFO23w9tUt8EQ,7525
|
6
|
-
mdbq/aggregation/mysql_types.py,sha256=
|
6
|
+
mdbq/aggregation/mysql_types.py,sha256=umVixmbFZM63k-QhVWLvOuhcAde4P_oDKbdo8ry2O9w,10633
|
7
7
|
mdbq/aggregation/optimize_data.py,sha256=jLAWtxPUuhpo4XTVrhKtT4xK3grs7r73ePQfLhxlu1I,779
|
8
8
|
mdbq/aggregation/query_data.py,sha256=sdMWj0st_VFDfBHmqTrY05k-0yoagdnaiNMoB0otEuk,25255
|
9
9
|
mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
|
@@ -18,14 +18,14 @@ mdbq/config/products.py,sha256=9gqXJMsw8KKuD4Xs6krNgcF7AuWDvV7clI6wVi3QjcA,4260
|
|
18
18
|
mdbq/config/set_support.py,sha256=xkZCX6y9Bq1ppBpJAofld4B2YtchA7fl0eT3dx3CrSI,777
|
19
19
|
mdbq/config/update_conf.py,sha256=taL3ZqKgiVWwUrDFuaYhim9a72Hm4BHRhhDscJTziR8,4535
|
20
20
|
mdbq/dataframe/__init__.py,sha256=2HtCN8AdRj53teXDqzysC1h8aPL-mMFy561ESmhehGQ,22
|
21
|
-
mdbq/dataframe/converter.py,sha256=
|
21
|
+
mdbq/dataframe/converter.py,sha256=BAst61HvtXqN3yWguia47zNY19c-wpby8CsdS48PC6g,3592
|
22
22
|
mdbq/log/__init__.py,sha256=Mpbrav0s0ifLL7lVDAuePEi1hJKiSHhxcv1byBKDl5E,15
|
23
23
|
mdbq/log/mylogger.py,sha256=oaT7Bp-Hb9jZt52seP3ISUuxVcI19s4UiqTeouScBO0,3258
|
24
24
|
mdbq/mongo/__init__.py,sha256=SILt7xMtQIQl_m-ik9WLtJSXIVf424iYgCfE_tnQFbw,13
|
25
25
|
mdbq/mongo/mongo.py,sha256=v9qvrp6p1ZRWuPpbSilqveiE0FEcZF7U5xUPI0RN4xs,31880
|
26
26
|
mdbq/mysql/__init__.py,sha256=A_DPJyAoEvTSFojiI2e94zP0FKtCkkwKP1kYUCSyQzo,11
|
27
27
|
mdbq/mysql/data_types_即将删除.py,sha256=sjBBDKr9674LdjM5N_dwyJACdZPbdB8Beli59jGdgnQ,10378
|
28
|
-
mdbq/mysql/mysql.py,sha256=
|
28
|
+
mdbq/mysql/mysql.py,sha256=hZWk7rgoeCkNYbGMgJokiPJSoMJ3y-wSVH_ojzh49l8,37464
|
29
29
|
mdbq/mysql/s_query.py,sha256=4c24SwbqtnO33o8CgWlTQ_j8sZYl5BRIQkaD9CI-vTY,7901
|
30
30
|
mdbq/mysql/year_month_day.py,sha256=VgewoE2pJxK7ErjfviL_SMTN77ki8GVbTUcao3vFUCE,1523
|
31
31
|
mdbq/other/__init__.py,sha256=jso1oHcy6cJEfa7udS_9uO5X6kZLoPBF8l3wCYmr5dM,18
|
@@ -36,7 +36,7 @@ mdbq/pbix/__init__.py,sha256=Trtfaynu9RjoTyLLYBN2xdRxTvm_zhCniUkVTAYwcjo,24
|
|
36
36
|
mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,2396
|
37
37
|
mdbq/pbix/refresh_all.py,sha256=tgy762608HMaXWynbOURIf2UVMuSPybzrDXQnOOcnZU,6102
|
38
38
|
mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
|
39
|
-
mdbq-1.2.
|
40
|
-
mdbq-1.2.
|
41
|
-
mdbq-1.2.
|
42
|
-
mdbq-1.2.
|
39
|
+
mdbq-1.2.4.dist-info/METADATA,sha256=ekCyUS5vKgOSEK7iYzdo-70H1ThaVi89ATDo6jnJBSI,245
|
40
|
+
mdbq-1.2.4.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
41
|
+
mdbq-1.2.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
|
42
|
+
mdbq-1.2.4.dist-info/RECORD,,
|
File without changes
|