mdbq 3.10.7__py3-none-any.whl → 3.10.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdbq/__version__.py +1 -1
- mdbq/aggregation/optimize.py +1 -0
- mdbq/aggregation/query_data.py +2 -0
- mdbq/log/mylogger.py +8 -19
- mdbq/mysql/deduplicator.py +30 -22
- mdbq/mysql/mysql.py +336 -280
- mdbq/mysql/s_query.py +159 -143
- mdbq/mysql/uploader.py +125 -52
- mdbq/redis/getredis.py +0 -2
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/METADATA +1 -1
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/RECORD +13 -13
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/WHEEL +0 -0
- {mdbq-3.10.7.dist-info → mdbq-3.10.9.dist-info}/top_level.txt +0 -0
mdbq/mysql/s_query.py
CHANGED
@@ -1,14 +1,11 @@
|
|
1
1
|
# -*- coding:utf-8 -*-
|
2
2
|
import datetime
|
3
|
-
import re
|
4
|
-
import time
|
5
3
|
import warnings
|
6
4
|
import pymysql
|
7
|
-
import numpy as np
|
8
5
|
import pandas as pd
|
9
|
-
import os
|
10
6
|
from decimal import Decimal
|
11
7
|
import logging
|
8
|
+
from contextlib import closing
|
12
9
|
|
13
10
|
warnings.filterwarnings('ignore')
|
14
11
|
"""
|
@@ -18,7 +15,20 @@ logger = logging.getLogger(__name__)
|
|
18
15
|
|
19
16
|
|
20
17
|
class QueryDatas:
|
18
|
+
"""
|
19
|
+
数据库查询工具类。
|
20
|
+
用于连接MySQL数据库,支持表结构检查、条件查询、数据导出为DataFrame、列名和类型获取等功能。
|
21
|
+
"""
|
22
|
+
|
21
23
|
def __init__(self, username: str, password: str, host: str, port: int, charset: str = 'utf8mb4'):
|
24
|
+
"""
|
25
|
+
初始化数据库连接配置。
|
26
|
+
:param username: 数据库用户名
|
27
|
+
:param password: 数据库密码
|
28
|
+
:param host: 数据库主机
|
29
|
+
:param port: 数据库端口
|
30
|
+
:param charset: 字符集,默认utf8mb4
|
31
|
+
"""
|
22
32
|
self.username = username
|
23
33
|
self.password = password
|
24
34
|
self.host = host
|
@@ -32,175 +42,181 @@ class QueryDatas:
|
|
32
42
|
'cursorclass': pymysql.cursors.DictCursor,
|
33
43
|
}
|
34
44
|
|
35
|
-
def check_condition(self, db_name, table_name, condition):
|
36
|
-
"""
|
37
|
-
|
38
|
-
|
39
|
-
|
45
|
+
def check_condition(self, db_name, table_name, condition, columns='更新时间'):
|
46
|
+
"""
|
47
|
+
按指定条件查询数据库表,返回满足条件的指定字段数据。
|
48
|
+
:param db_name: 数据库名
|
49
|
+
:param table_name: 表名
|
50
|
+
:param condition: SQL条件字符串(不含WHERE)
|
51
|
+
:param columns: 查询字段字符串或以逗号分隔的字段名,默认'更新时间'
|
52
|
+
:return: 查询结果列表或None
|
53
|
+
"""
|
54
|
+
if not self.check_infos(db_name, table_name):
|
55
|
+
return None
|
40
56
|
self.config.update({'database': db_name})
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
57
|
+
try:
|
58
|
+
with closing(pymysql.connect(**self.config)) as connection:
|
59
|
+
with closing(connection.cursor()) as cursor:
|
60
|
+
sql = f"SELECT {columns} FROM `{table_name}` WHERE {condition}"
|
61
|
+
logger.debug(f"check_condition SQL: {sql}")
|
62
|
+
cursor.execute(sql)
|
63
|
+
result = cursor.fetchall()
|
64
|
+
return result
|
65
|
+
except Exception as e:
|
66
|
+
logger.error(f"check_condition error: {e}")
|
67
|
+
return None
|
48
68
|
|
49
|
-
def data_to_df(self, db_name, table_name, start_date, end_date, projection: dict = None):
|
69
|
+
def data_to_df(self, db_name, table_name, start_date, end_date, projection: dict = None, limit: int = None):
|
50
70
|
"""
|
51
|
-
从数据库表获取数据到DataFrame
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
71
|
+
从数据库表获取数据到DataFrame,支持列筛选、日期范围过滤和行数限制。
|
72
|
+
:param db_name: 数据库名
|
73
|
+
:param table_name: 表名
|
74
|
+
:param start_date: 起始日期(包含)
|
75
|
+
:param end_date: 结束日期(包含)
|
76
|
+
:param projection: 列筛选字典,e.g. {'日期': 1, '场景名字': 1}
|
77
|
+
:param limit: 限制返回的最大行数
|
78
|
+
:return: 查询结果的DataFrame
|
58
79
|
"""
|
59
|
-
# 初始化默认参数
|
60
80
|
projection = projection or {}
|
61
81
|
df = pd.DataFrame()
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
82
|
+
try:
|
83
|
+
start_date = pd.to_datetime(start_date or '1970-01-01').strftime('%Y-%m-%d')
|
84
|
+
end_date = pd.to_datetime(end_date or datetime.datetime.today()).strftime('%Y-%m-%d')
|
85
|
+
except Exception as e:
|
86
|
+
logger.error(f"日期格式错误: {e}")
|
87
|
+
return df
|
67
88
|
if not self.check_infos(db_name, table_name):
|
68
89
|
return df
|
69
|
-
|
70
|
-
# 配置数据库连接
|
71
90
|
self.config['database'] = db_name
|
72
|
-
connection = None
|
73
|
-
|
74
91
|
try:
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
selected_columns = [k for k, v in projection.items() if v and k in cols_exist]
|
92
|
+
with closing(pymysql.connect(**self.config)) as connection:
|
93
|
+
with closing(connection.cursor()) as cursor:
|
94
|
+
cursor.execute(
|
95
|
+
"""SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s""",
|
96
|
+
(db_name, table_name)
|
97
|
+
)
|
98
|
+
cols_exist = {col['COLUMN_NAME'] for col in cursor.fetchall()} - {'id'}
|
99
|
+
if projection:
|
100
|
+
selected_columns = [k for k, v in projection.items() if v and k in cols_exist]
|
101
|
+
if not selected_columns:
|
102
|
+
logger.info("Warning: Projection 参数不匹配任何数据库字段")
|
103
|
+
return df
|
104
|
+
else:
|
105
|
+
selected_columns = list(cols_exist)
|
90
106
|
if not selected_columns:
|
91
|
-
logger.info("
|
107
|
+
logger.info("未找到可用字段")
|
92
108
|
return df
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
# 类型转换优化
|
111
|
-
decimal_cols = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, Decimal)).any()]
|
112
|
-
df[decimal_cols] = df[decimal_cols].astype(float)
|
113
|
-
|
109
|
+
quoted_columns = [f'`{col}`' for col in selected_columns]
|
110
|
+
base_sql = f"SELECT {', '.join(quoted_columns)} FROM `{db_name}`.`{table_name}`"
|
111
|
+
params = []
|
112
|
+
if '日期' in cols_exist:
|
113
|
+
base_sql += f" WHERE 日期 BETWEEN %s AND %s"
|
114
|
+
params.extend([start_date, end_date])
|
115
|
+
if limit is not None and isinstance(limit, int) and limit > 0:
|
116
|
+
base_sql += f" LIMIT %s"
|
117
|
+
params.append(limit)
|
118
|
+
logger.debug(f"data_to_df SQL: {base_sql}, params: {params}")
|
119
|
+
cursor.execute(base_sql, tuple(params))
|
120
|
+
result = cursor.fetchall()
|
121
|
+
if result:
|
122
|
+
df = pd.DataFrame(result)
|
123
|
+
for col in df.columns:
|
124
|
+
if df[col].apply(lambda x: isinstance(x, Decimal)).any():
|
125
|
+
df[col] = df[col].astype(float)
|
114
126
|
except Exception as e:
|
115
|
-
logger.error(f"
|
116
|
-
finally:
|
117
|
-
if connection:
|
118
|
-
connection.close()
|
119
|
-
|
127
|
+
logger.error(f"data_to_df error: {e}")
|
120
128
|
return df
|
121
129
|
|
122
|
-
def columns_to_list(self, db_name, table_name,
|
130
|
+
def columns_to_list(self, db_name, table_name, columns_name, where: str = None) -> list:
|
123
131
|
"""
|
124
|
-
获取数据表的指定列,
|
125
|
-
|
132
|
+
获取数据表的指定列, 支持where条件筛选, 返回列表字典。
|
133
|
+
:param db_name: 数据库名
|
134
|
+
:param table_name: 表名
|
135
|
+
:param columns_name: 需要获取的列名列表
|
136
|
+
:param where: 可选,SQL条件字符串(不含WHERE)
|
137
|
+
:return: [{列1:值, 列2:值, ...}, ...]
|
126
138
|
"""
|
127
|
-
if self.check_infos(db_name, table_name)
|
139
|
+
if not self.check_infos(db_name, table_name):
|
128
140
|
return []
|
129
|
-
|
130
141
|
self.config.update({'database': db_name})
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
142
|
+
try:
|
143
|
+
with closing(pymysql.connect(**self.config)) as connection:
|
144
|
+
with closing(connection.cursor()) as cursor:
|
145
|
+
sql = 'SELECT COLUMN_NAME FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
146
|
+
cursor.execute(sql, (db_name, table_name))
|
147
|
+
cols_exist = [col['COLUMN_NAME'] for col in cursor.fetchall()]
|
148
|
+
columns_name = [item for item in columns_name if item in cols_exist]
|
149
|
+
if not columns_name:
|
150
|
+
logger.info("columns_to_list: 未找到匹配的列名")
|
151
|
+
return []
|
152
|
+
columns_in = ', '.join([f'`{col}`' for col in columns_name])
|
153
|
+
sql = f"SELECT {columns_in} FROM `{db_name}`.`{table_name}`"
|
154
|
+
if where:
|
155
|
+
sql += f" WHERE {where}"
|
156
|
+
logger.debug(f"columns_to_list SQL: {sql}")
|
157
|
+
cursor.execute(sql)
|
158
|
+
column_values = cursor.fetchall()
|
159
|
+
return column_values
|
160
|
+
except Exception as e:
|
161
|
+
logger.error(f"columns_to_list error: {e}")
|
162
|
+
return []
|
148
163
|
|
149
|
-
def dtypes_to_list(self, db_name, table_name) -> list:
|
164
|
+
def dtypes_to_list(self, db_name, table_name, columns_name=None) -> list:
|
150
165
|
"""
|
151
|
-
|
152
|
-
|
166
|
+
获取数据表的列名和类型, 支持只返回部分字段类型。
|
167
|
+
:param db_name: 数据库名
|
168
|
+
:param table_name: 表名
|
169
|
+
:param columns_name: 可选,字段名列表,仅返回这些字段的类型
|
170
|
+
:return: [{'COLUMN_NAME': ..., 'COLUMN_TYPE': ...}, ...]
|
153
171
|
"""
|
154
|
-
if self.check_infos(db_name, table_name)
|
172
|
+
if not self.check_infos(db_name, table_name):
|
155
173
|
return []
|
156
|
-
|
157
174
|
self.config.update({'database': db_name})
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
175
|
+
try:
|
176
|
+
with closing(pymysql.connect(**self.config)) as connection:
|
177
|
+
with closing(connection.cursor()) as cursor:
|
178
|
+
sql = 'SELECT COLUMN_NAME, COLUMN_TYPE FROM information_schema.columns WHERE table_schema = %s AND table_name = %s'
|
179
|
+
cursor.execute(sql, (db_name, table_name))
|
180
|
+
column_name_and_type = cursor.fetchall()
|
181
|
+
if columns_name:
|
182
|
+
columns_name = set(columns_name)
|
183
|
+
column_name_and_type = [row for row in column_name_and_type if row['COLUMN_NAME'] in columns_name]
|
184
|
+
return column_name_and_type
|
185
|
+
except Exception as e:
|
186
|
+
logger.error(f"dtypes_to_list error: {e}")
|
187
|
+
return []
|
166
188
|
|
167
189
|
def check_infos(self, db_name, table_name) -> bool:
|
168
|
-
"""
|
169
|
-
|
190
|
+
"""
|
191
|
+
检查数据库和数据表是否存在。
|
192
|
+
:param db_name: 数据库名
|
193
|
+
:param table_name: 表名
|
194
|
+
:return: 存在返回True,否则False
|
195
|
+
"""
|
170
196
|
try:
|
171
|
-
with
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
self.config.update({'database': db_name})
|
182
|
-
connection = pymysql.connect(**self.config) # 重新连接数据库
|
197
|
+
with closing(pymysql.connect(**self.config)) as connection:
|
198
|
+
with closing(connection.cursor()) as cursor:
|
199
|
+
cursor.execute(f"SHOW DATABASES LIKE %s", (db_name,))
|
200
|
+
database_exists = cursor.fetchone()
|
201
|
+
if not database_exists:
|
202
|
+
logger.info(f"Database <{db_name}>: 数据库不存在")
|
203
|
+
return False
|
204
|
+
except Exception as e:
|
205
|
+
logger.error(f"check_infos-db error: {e}")
|
206
|
+
return False
|
207
|
+
self.config.update({'database': db_name})
|
183
208
|
try:
|
184
|
-
with
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
return
|
191
|
-
return True
|
209
|
+
with closing(pymysql.connect(**self.config)) as connection:
|
210
|
+
with closing(connection.cursor()) as cursor:
|
211
|
+
cursor.execute(f"SHOW TABLES LIKE %s", (table_name,))
|
212
|
+
if not cursor.fetchone():
|
213
|
+
logger.info(f'{db_name} -> <{table_name}>: 表不存在')
|
214
|
+
return False
|
215
|
+
return True
|
192
216
|
except Exception as e:
|
193
|
-
logger.error(e)
|
217
|
+
logger.error(f"check_infos-table error: {e}")
|
194
218
|
return False
|
195
|
-
finally:
|
196
|
-
connection.close() # 断开连接
|
197
219
|
|
198
220
|
|
199
221
|
if __name__ == '__main__':
|
200
|
-
|
201
|
-
data = conf.config_datas['Windows']['xigua_lx']['mysql']['remoto']
|
202
|
-
username, password, host, port = data['username'], data['password'], data['host'], data['port']
|
203
|
-
|
204
|
-
q = QueryDatas(username, password, host, port)
|
205
|
-
res = q.columns_to_list(db_name='视频数据', table_name='bilibili视频', columns_name=['视频bv号', '下载进度'])
|
206
|
-
logger.info(res)
|
222
|
+
pass
|