cppackage 0.2.8__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cppackage-0.3.0/CPpackage/db/sql_model.py +286 -0
- cppackage-0.3.0/CPpackage/db/test.py +25 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage.egg-info/PKG-INFO +2 -15
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage.egg-info/SOURCES.txt +1 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/PKG-INFO +2 -15
- {cppackage-0.2.8 → cppackage-0.3.0}/setup.py +1 -1
- cppackage-0.2.8/CPpackage/db/sql_model.py +0 -171
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage/__init__.py +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage/core/__init__.py +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage/db/__init__.py +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage/db/config.py +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage/utils/__init__.py +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage.egg-info/dependency_links.txt +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage.egg-info/entry_points.txt +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage.egg-info/not-zip-safe +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage.egg-info/requires.txt +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/CPpackage.egg-info/top_level.txt +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/LICENSE +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/MANIFEST.in +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/readme.md +0 -0
- {cppackage-0.2.8 → cppackage-0.3.0}/setup.cfg +0 -0
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
import pymysql
|
|
2
|
+
import time
|
|
3
|
+
from pymysql import Error
|
|
4
|
+
try:
|
|
5
|
+
from .config import get_db_config
|
|
6
|
+
except (ImportError, ValueError):
|
|
7
|
+
# 直接运行时失败,尝试绝对路径
|
|
8
|
+
import sys
|
|
9
|
+
import os
|
|
10
|
+
sys.path.insert(0, os.path.dirname(__file__))
|
|
11
|
+
from config import get_db_config
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# ===================== 数据库连接 =====================
|
|
15
|
+
|
|
16
|
+
def _get_connection(database=None, port=None):
    """Open a new PyMySQL connection using the packaged DB configuration.

    Host, user and password are always read from ``get_db_config()``. The
    database name and the port are taken from the arguments when they are
    truthy, otherwise from the configuration (the port finally defaults
    to 3306).

    Args:
        database: Database to select; a falsy value uses the configured one.
        port: TCP port; a falsy value uses the configured one, else 3306.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    settings = get_db_config()

    target_db = database or settings.get('database')
    target_port = port or settings.get('port', 3306)

    connect_kwargs = {
        "host": settings.get('host'),
        "user": settings.get('user'),
        "password": settings.get('password'),
        "database": target_db,
        "port": target_port,
        "charset": "utf8mb4",
        # Autocommit is off: the helpers in this module call commit() themselves.
        "autocommit": False,
    }
    return pymysql.connect(**connect_kwargs)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ===================== 查询操作 =====================
|
|
34
|
+
|
|
35
|
+
def sel_data(sql, params=None, port=None, database=None):
    """Execute a query and return all rows of its result set.

    Args:
        sql: SQL statement to execute.
        params: Optional parameters passed to ``cursor.execute``.
        port: Optional port override forwarded to ``_get_connection``.
        database: Optional database override forwarded to ``_get_connection``.

    Returns:
        The result of ``cursor.fetchall()``, or None when a
        ``pymysql.MySQLError`` occurs (the error is printed, not re-raised).
    """
    try:
        with _get_connection(database, port) as db_conn, db_conn.cursor() as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()
        return rows
    except pymysql.MySQLError as err:
        print("查询失败:", err)
        return None
|
|
46
|
+
|
|
47
|
+
# ===================== 增量更新 =====================
|
|
48
|
+
def incremental_update(sql, values, port=None, database=None):
    """Run one statement for every parameter set in *values* and commit.

    Args:
        sql: SQL statement passed to ``cursor.executemany``.
        values: Sequence of parameter sets, one per execution.
        port: Optional port override forwarded to ``_get_connection``.
        database: Optional database override forwarded to ``_get_connection``.

    Errors are not caught here; they propagate to the caller.
    """
    with _get_connection(database=database, port=port) as db_conn, db_conn.cursor() as cur:
        cur.executemany(sql, values)
        # Commit while the cursor is still open, as a single transaction.
        db_conn.commit()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ===================== 删除操作 =====================
|
|
56
|
+
def del_data(sql, params=None, port=None, database=None):
    """Execute a single (delete) statement and commit it.

    Args:
        sql: SQL statement to execute.
        params: Optional parameters passed to ``cursor.execute``.
        port: Optional port override forwarded to ``_get_connection``.
        database: Optional database override forwarded to ``_get_connection``.

    Returns:
        None. A ``pymysql.MySQLError`` is printed and swallowed.
    """
    try:
        with _get_connection(database, port) as db_conn, db_conn.cursor() as cur:
            cur.execute(sql, params)
            db_conn.commit()
    except pymysql.MySQLError as err:
        print("删除失败:", err)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ===================== 表结构管理 =====================
|
|
69
|
+
|
|
70
|
+
def get_table_columns(table_name, database):
    """List the column names of a table, ordered by ordinal position.

    The names are read from ``INFORMATION_SCHEMA.COLUMNS`` filtered by
    schema (*database*) and table name.

    Args:
        table_name: Name of the table to inspect.
        database: Schema that contains the table; also used to connect.

    Returns:
        A list with the first field of every returned row (the column name).
    """
    column_query = """
        SELECT COLUMN_NAME
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA=%s
          AND TABLE_NAME=%s
        ORDER BY ORDINAL_POSITION
    """

    names = []
    with _get_connection(database) as db_conn, db_conn.cursor() as cur:
        cur.execute(column_query, (database, table_name))
        for record in cur.fetchall():
            names.append(record[0])
    return names
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def add_columns(table_name, database, columns):
    """Add each name in *columns* to *table_name* as a ``TEXT`` column.

    One ``ALTER TABLE ... ADD COLUMN`` statement is issued per column and
    the name of each added column is printed.

    Args:
        table_name: Table to alter.
        database: Database passed to ``_get_connection``.
        columns: Iterable of column names to add.

    Errors raised by the driver are not caught here.
    """

    def _quote(identifier):
        # Identifiers cannot be bound as query parameters, so they are
        # interpolated. Doubling embedded backticks keeps a hostile or odd
        # name (e.g. a scraped header) from breaking out of the quoting.
        return "`" + str(identifier).replace("`", "``") + "`"

    quoted_table = _quote(table_name)

    with _get_connection(database) as conn:
        with conn.cursor() as cursor:
            for col in columns:
                sql = f"""
                    ALTER TABLE {quoted_table}
                    ADD COLUMN {_quote(col)} TEXT
                """
                cursor.execute(sql)
                print(f"新增字段: {col}")

            conn.commit()
|
|
100
|
+
|
|
101
|
+
# 表结构校验
|
|
102
|
+
def check_and_sync_columns(df, table_name, database, max_new=5):
    """Add DataFrame columns that the database table is missing.

    Compares ``df.columns`` with the columns reported by
    ``get_table_columns`` and adds the missing ones through ``add_columns``.
    Columns that exist only in the table are reported and ignored.

    Args:
        df: DataFrame whose columns are the desired schema.
        table_name: Table to compare against and alter.
        database: Database that contains the table.
        max_new: Upper bound on columns added in one call. It is raised to
            20% of the current table width when that is larger.

    Returns:
        True when at least one column was added.

    Raises:
        Exception: When there is nothing to add, or when the number of new
            columns exceeds the limit.
    """
    df_cols = set(df.columns)
    db_cols = set(get_table_columns(table_name, database))

    new_cols = df_cols - db_cols
    # Columns present only in the table are reported but never dropped.
    missing_cols = db_cols - df_cols
    print("新增字段:", new_cols if new_cols else "无")
    print("缺失字段(忽略):", missing_cols if missing_cols else "无")

    if not new_cols:
        # Still an error (update_datas calls this only after an unknown-column
        # failure), but report the real cause instead of "too many new columns".
        raise Exception("未发现需要新增的字段,无法通过同步表结构修复")

    # Wide tables may grow by up to 20% of their current column count.
    scaled_limit = len(db_cols) * 0.2
    if scaled_limit > max_new:
        max_new = int(scaled_limit)

    if len(new_cols) > max_new:
        raise Exception(f"新增字段超过{max_new}个: {new_cols}")

    # Add columns in DataFrame order (de-duplicated) rather than in arbitrary
    # set order, so the resulting table layout is deterministic.
    ordered_new_cols = list(dict.fromkeys(c for c in df.columns if c in new_cols))

    print("开始同步字段...")
    add_columns(table_name, database, ordered_new_cols)
    print("字段同步成功")
    return True
|
|
126
|
+
# ===================== 通用查询重复记录函数(适配自定义索引) =====================
|
|
127
|
+
def find_duplicate_records(table_name, database, unique_index_fields, port=None):
    """Return every row whose *unique_index_fields* combination occurs more than once.

    Args:
        table_name: Table to scan.
        database: Database that contains the table.
        unique_index_fields: Non-empty list of column names forming the
            logical unique key, e.g. ``['begindate', 'enddate']``.
        port: Optional port override forwarded to ``_get_connection``.

    Returns:
        A DataFrame with the duplicated rows (empty when there are none),
        or None when the arguments are invalid, pandas cannot be imported,
        or the query fails. Failures are printed, not raised.
    """
    # Must be a real, non-empty list: a bare string would otherwise be
    # iterated character by character when building the SQL.
    if not isinstance(unique_index_fields, list) or not unique_index_fields:
        print("错误:unique_index_fields 必须是非空列表!")
        return None

    # Imported lazily so that importing this module does not pull in pandas.
    try:
        import pandas as pd
    except Exception as e:
        print("无法导入 pandas:", e)
        return None

    group_by_str = ",".join(f"`{field}`" for field in unique_index_fields)
    # NOTE: the join uses plain `=`, so rows holding NULL in any key column
    # never match and are therefore not reported.
    join_condition = " AND ".join(
        f"t.`{field}` = dup.`{field}`" for field in unique_index_fields
    )

    sql = f"""
        SELECT t.*
        FROM `{table_name}` t
        INNER JOIN (
            SELECT {group_by_str}, COUNT(*) AS duplicate_count
            FROM `{table_name}`
            GROUP BY {group_by_str}
            HAVING COUNT(*) > 1
        ) dup ON {join_condition}
        ORDER BY {group_by_str}
    """

    try:
        with _get_connection(database, port) as conn:
            df_duplicates = pd.read_sql(sql, conn)

        # Reporting happens after the connection has been released.
        if df_duplicates.empty:
            print(f"表 {table_name} 中无重复记录")
        else:
            group_count = len(df_duplicates.drop_duplicates(subset=unique_index_fields))
            print(f"发现 {len(df_duplicates)} 条重复记录({group_count} 组重复组合)")
        return df_duplicates
    except pymysql.MySQLError as e:
        print("数据库查询失败:", e)
        return None
    except Exception as e:
        print("未知错误:", e)
        return None
|
|
185
|
+
# ===================== 删除重复记录函数(修复参数+调用逻辑) =====================
|
|
186
|
+
def delete_duplicate_records(table_name, database, unique_index_fields, port=None, keep_strategy="max_id"):
    """Delete duplicate rows, keeping one row per *unique_index_fields* combination.

    The row kept in each duplicate group is the one with the smallest or
    largest ``id``; the table is assumed to have an ``id`` column. The
    function waits for Enter on stdin before deleting.

    Args:
        table_name: Table to clean up.
        database: Database that contains the table.
        unique_index_fields: Non-empty list of column names forming the
            logical unique key.
        port: Optional port override forwarded to ``_get_connection``.
        keep_strategy: ``"min_id"`` or ``"max_id"``. Any other value is
            reported and treated as ``"min_id"``.

    Returns:
        The number of deleted rows (0 when nothing was duplicated), or None
        when the arguments are invalid or the deletion fails. Failures are
        printed, not raised.
    """
    # Same validation as find_duplicate_records: a string or an empty list
    # would otherwise produce a malformed DELETE statement.
    if not isinstance(unique_index_fields, list) or not unique_index_fields:
        print("错误:unique_index_fields 必须是非空列表!")
        return None

    group_by_str = ",".join(f"`{field}`" for field in unique_index_fields)
    join_condition = " AND ".join(
        f"t.`{field}` = dup.`{field}`" for field in unique_index_fields
    )

    if keep_strategy == "min_id":
        keep_condition = "MIN(`id`)"
    elif keep_strategy == "max_id":
        keep_condition = "MAX(`id`)"
    else:
        print(f"不支持的策略:{keep_strategy},默认使用min_id")
        keep_condition = "MIN(`id`)"

    delete_sql = f"""
        DELETE t FROM `{table_name}` t
        INNER JOIN (
            SELECT {group_by_str}, {keep_condition} AS keep_id
            FROM `{table_name}`
            GROUP BY {group_by_str}
            HAVING COUNT(*) > 1
        ) dup ON {join_condition}
        WHERE t.`id` != dup.keep_id
    """

    try:
        # Ask for confirmation before connecting, so no connection sits idle
        # while waiting for the operator.
        print(f"\n⚠️ 即将删除表 {table_name} 的重复记录,保留策略:{keep_strategy}")
        print("确认删除请按任意键,取消请关闭程序...")
        input()

        with _get_connection(database, port) as conn:
            try:
                with conn.cursor() as cursor:
                    affected_rows = cursor.execute(delete_sql)
                conn.commit()
            except Exception:
                # Roll back while the connection is still open. Once the
                # `with` block exits, the connection is closed and a
                # rollback would itself fail.
                conn.rollback()
                raise

        print(f"✅ 成功删除 {affected_rows} 条重复记录")
        return affected_rows
    except pymysql.MySQLError as e:
        print("❌ 删除失败(数据库错误):", e)
        return None
    except Exception as e:
        print("❌ 删除失败(未知错误):", e)
        return None
|
|
245
|
+
# ===================== DataFrame 入库 =====================
|
|
246
|
+
|
|
247
|
+
def update_datas(df, table_name, database):
    """Upsert every row of *df* into *table_name* with one multi-row statement.

    Builds ``INSERT ... ON DUPLICATE KEY UPDATE`` covering all DataFrame
    columns (``id`` is excluded from the update list). If the insert fails
    with MySQL error 1054 (unknown column), the table schema is synchronised
    through ``check_and_sync_columns`` and the insert is retried once.

    Args:
        df: DataFrame to store; column names must match table columns.
        table_name: Destination table.
        database: Database that contains the table.

    Returns:
        None.

    Raises:
        Exception: The original driver or sync error when the insert cannot
            be completed.
    """
    if df.empty:
        print("数据为空,无需入库")
        return

    # The statement and its parameters do not change between attempts.
    cols = list(df.columns)
    col_str = ",".join(f"`{c}`" for c in cols)
    value_tpl = "(" + ",".join(["%s"] * len(cols)) + ")"
    values_str = ",".join([value_tpl] * len(df))
    update_clause = ",".join(f"`{c}`=VALUES(`{c}`)" for c in cols if c != "id")
    sql = f"""
        INSERT INTO `{table_name}` ({col_str})
        VALUES {values_str}
        ON DUPLICATE KEY UPDATE {update_clause}
    """
    # One flat parameter list matching the row-major placeholder layout.
    flat_data = [v for row in df.values for v in row]

    max_attempts = 2
    for attempt in range(max_attempts):
        # Reset per attempt so a closed connection from the previous attempt
        # is never rolled back or closed again.
        conn = None
        try:
            conn = _get_connection(database)
            with conn.cursor() as cursor:
                cursor.execute(sql, flat_data)
                conn.commit()
                print(f"成功插入/更新 {cursor.rowcount} 条记录")
            return
        except Exception as e:
            if conn:
                conn.rollback()
            print("入库失败:", e)
            is_unknown_column = '1054, "Unknown column' in str(e)
            if not is_unknown_column or attempt == max_attempts - 1:
                # Re-raise the original error (type and traceback intact)
                # rather than finishing silently without inserting.
                raise
            # The table lacks columns present in df: add them, then retry.
            check_and_sync_columns(df, table_name, database)
        finally:
            if conn:
                conn.close()
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from CPpackage.db.sql_model import find_duplicate_records,delete_duplicate_records
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
if __name__ == "__main__":
    # Manual maintenance run: removes duplicate rows from one table.
    TABLE_NAME = "liuliang_plly"
    DATABASE_NAME = "shengyicanmou"
    # Columns that together form the unique key; this must be a list.
    UNIQUE_FIELDS = [
        'begindate',
        'value_l',
        'pageName',
        'itemId',
        'name_l1',
        'name_l2',
        'date_effect',
        'store_id',
    ]
    PORT = 3306  # optional

    # An optional dry run with find_duplicate_records(TABLE_NAME, DATABASE_NAME,
    # UNIQUE_FIELDS, PORT) can be used to inspect duplicates first; it is
    # currently disabled and the deletion runs unconditionally.
    delete_count = delete_duplicate_records(
        TABLE_NAME,
        DATABASE_NAME,
        UNIQUE_FIELDS,
        port=PORT,
        keep_strategy="max_id",
    )
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: cppackage
|
|
3
|
-
Version: 0.2.8
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: 超品集团自用的Python包
|
|
5
5
|
Home-page: https://github.com/example/CPpackage
|
|
6
6
|
Author: team-数智组
|
|
@@ -17,19 +17,6 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Requires-Python: >=3.8
|
|
18
18
|
Description-Content-Type: text/markdown
|
|
19
19
|
License-File: LICENSE
|
|
20
|
-
Requires-Dist: pymysql
|
|
21
|
-
Requires-Dist: pandas
|
|
22
|
-
Requires-Dist: numpy
|
|
23
|
-
Dynamic: author
|
|
24
|
-
Dynamic: author-email
|
|
25
|
-
Dynamic: classifier
|
|
26
|
-
Dynamic: description
|
|
27
|
-
Dynamic: description-content-type
|
|
28
|
-
Dynamic: home-page
|
|
29
|
-
Dynamic: license-file
|
|
30
|
-
Dynamic: requires-dist
|
|
31
|
-
Dynamic: requires-python
|
|
32
|
-
Dynamic: summary
|
|
33
20
|
|
|
34
21
|
# CPpackage
|
|
35
22
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: cppackage
|
|
3
|
-
Version: 0.2.8
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: 超品集团自用的Python包
|
|
5
5
|
Home-page: https://github.com/example/CPpackage
|
|
6
6
|
Author: team-数智组
|
|
@@ -17,19 +17,6 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Requires-Python: >=3.8
|
|
18
18
|
Description-Content-Type: text/markdown
|
|
19
19
|
License-File: LICENSE
|
|
20
|
-
Requires-Dist: pymysql
|
|
21
|
-
Requires-Dist: pandas
|
|
22
|
-
Requires-Dist: numpy
|
|
23
|
-
Dynamic: author
|
|
24
|
-
Dynamic: author-email
|
|
25
|
-
Dynamic: classifier
|
|
26
|
-
Dynamic: description
|
|
27
|
-
Dynamic: description-content-type
|
|
28
|
-
Dynamic: home-page
|
|
29
|
-
Dynamic: license-file
|
|
30
|
-
Dynamic: requires-dist
|
|
31
|
-
Dynamic: requires-python
|
|
32
|
-
Dynamic: summary
|
|
33
20
|
|
|
34
21
|
# CPpackage
|
|
35
22
|
|
|
@@ -1,171 +0,0 @@
|
|
|
1
|
-
import pymysql
|
|
2
|
-
import time
|
|
3
|
-
from pymysql import Error
|
|
4
|
-
|
|
5
|
-
# 条件导入:支持直接运行和包模式
|
|
6
|
-
try:
|
|
7
|
-
from .config import get_db_config
|
|
8
|
-
except (ImportError, ValueError):
|
|
9
|
-
# 直接运行时失败,尝试绝对路径
|
|
10
|
-
import sys
|
|
11
|
-
import os
|
|
12
|
-
sys.path.insert(0, os.path.dirname(__file__))
|
|
13
|
-
from config import get_db_config
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# ===================== 数据库连接 =====================
|
|
17
|
-
|
|
18
|
-
def _get_connection(database=None, port=None):
|
|
19
|
-
cfg = get_db_config()
|
|
20
|
-
|
|
21
|
-
db = database if database else cfg.get('database')
|
|
22
|
-
prt = port if port else cfg.get('port', 3306)
|
|
23
|
-
|
|
24
|
-
return pymysql.connect(
|
|
25
|
-
host=cfg.get('host'),
|
|
26
|
-
user=cfg.get('user'),
|
|
27
|
-
password=cfg.get('password'),
|
|
28
|
-
database=db,
|
|
29
|
-
port=prt,
|
|
30
|
-
charset="utf8mb4",
|
|
31
|
-
autocommit=False
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# ===================== 查询操作 =====================
|
|
36
|
-
|
|
37
|
-
def sel_data(sql, params=None, port=None, database=None):
|
|
38
|
-
|
|
39
|
-
try:
|
|
40
|
-
with _get_connection(database, port) as conn:
|
|
41
|
-
with conn.cursor() as cursor:
|
|
42
|
-
cursor.execute(sql, params)
|
|
43
|
-
return cursor.fetchall()
|
|
44
|
-
|
|
45
|
-
except pymysql.MySQLError as e:
|
|
46
|
-
print("查询失败:", e)
|
|
47
|
-
return None
|
|
48
|
-
|
|
49
|
-
# ===================== 增量更新 =====================
|
|
50
|
-
def incremental_update(sql, values, port=None, database=None):
|
|
51
|
-
with _get_connection(database=database, port=port) as conn:
|
|
52
|
-
with conn.cursor() as cursor:
|
|
53
|
-
cursor.executemany(sql, values)
|
|
54
|
-
conn.commit()
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
# ===================== 删除操作 =====================
|
|
58
|
-
def del_data(sql, params=None, port=None, database=None):
|
|
59
|
-
|
|
60
|
-
try:
|
|
61
|
-
with _get_connection(database, port) as conn:
|
|
62
|
-
with conn.cursor() as cursor:
|
|
63
|
-
cursor.execute(sql, params)
|
|
64
|
-
conn.commit()
|
|
65
|
-
|
|
66
|
-
except pymysql.MySQLError as e:
|
|
67
|
-
print("删除失败:", e)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
# ===================== 表结构管理 =====================
|
|
71
|
-
|
|
72
|
-
def get_table_columns(table_name, database):
|
|
73
|
-
|
|
74
|
-
sql = """
|
|
75
|
-
SELECT COLUMN_NAME
|
|
76
|
-
FROM INFORMATION_SCHEMA.COLUMNS
|
|
77
|
-
WHERE TABLE_SCHEMA=%s
|
|
78
|
-
AND TABLE_NAME=%s
|
|
79
|
-
ORDER BY ORDINAL_POSITION
|
|
80
|
-
"""
|
|
81
|
-
|
|
82
|
-
with _get_connection(database) as conn:
|
|
83
|
-
with conn.cursor() as cursor:
|
|
84
|
-
cursor.execute(sql, (database, table_name))
|
|
85
|
-
return [row[0] for row in cursor.fetchall()]
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
def add_columns(table_name, database, columns):
|
|
89
|
-
|
|
90
|
-
with _get_connection(database) as conn:
|
|
91
|
-
with conn.cursor() as cursor:
|
|
92
|
-
|
|
93
|
-
for col in columns:
|
|
94
|
-
sql = f"""
|
|
95
|
-
ALTER TABLE `{table_name}`
|
|
96
|
-
ADD COLUMN `{col}` TEXT
|
|
97
|
-
"""
|
|
98
|
-
cursor.execute(sql)
|
|
99
|
-
print(f"新增字段: {col}")
|
|
100
|
-
|
|
101
|
-
conn.commit()
|
|
102
|
-
|
|
103
|
-
# 表结构校验
|
|
104
|
-
def check_and_sync_columns(df ,table_name, database, max_new=5):
|
|
105
|
-
|
|
106
|
-
# df字段
|
|
107
|
-
df_cols = set(df.columns)
|
|
108
|
-
# 数据库字段
|
|
109
|
-
db_cols = set(get_table_columns(table_name, database))
|
|
110
|
-
# 新增字段
|
|
111
|
-
new_cols = df_cols - db_cols
|
|
112
|
-
|
|
113
|
-
# 缺失字段(减少的字段,不处理)
|
|
114
|
-
missing_cols = db_cols - df_cols
|
|
115
|
-
print("新增字段:", new_cols if new_cols else "无")
|
|
116
|
-
print("缺失字段(忽略):", missing_cols if missing_cols else "无")
|
|
117
|
-
|
|
118
|
-
# 最大新增字段限制
|
|
119
|
-
if len(db_cols) * 0.2 > max_new:
|
|
120
|
-
max_new = int(len(db_cols) *0.2)
|
|
121
|
-
# 自动补
|
|
122
|
-
if 0 < len(new_cols) <= max_new:
|
|
123
|
-
print("开始同步字段...")
|
|
124
|
-
add_columns(table_name, database, new_cols)
|
|
125
|
-
print("字段同步成功")
|
|
126
|
-
else:
|
|
127
|
-
raise Exception(f"新增字段超过{max_new}个: {new_cols}")
|
|
128
|
-
return True
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
# ===================== DataFrame 入库 =====================
|
|
132
|
-
|
|
133
|
-
def update_datas(df, table_name, database):
|
|
134
|
-
|
|
135
|
-
conn = None
|
|
136
|
-
for i in range(2):
|
|
137
|
-
try:
|
|
138
|
-
# 表结构校验,df为抓取数据表,db_cols为数据库现有字段
|
|
139
|
-
conn = _get_connection(database)
|
|
140
|
-
cursor = conn.cursor()
|
|
141
|
-
cols = list(df.columns)
|
|
142
|
-
col_str = ",".join([f"`{c}`" for c in cols])
|
|
143
|
-
value_tpl = "(" + ",".join(["%s"] * len(cols)) + ")"
|
|
144
|
-
values_str = ",".join([value_tpl] * len(df))
|
|
145
|
-
update_clause = ",".join(
|
|
146
|
-
[f"`{c}`=VALUES(`{c}`)" for c in cols if c != "id"]
|
|
147
|
-
)
|
|
148
|
-
sql = f"""
|
|
149
|
-
INSERT INTO `{table_name}` ({col_str})
|
|
150
|
-
VALUES {values_str}
|
|
151
|
-
ON DUPLICATE KEY UPDATE {update_clause}
|
|
152
|
-
"""
|
|
153
|
-
data = [tuple(row) for row in df.values]
|
|
154
|
-
flat_data = [v for row in data for v in row]
|
|
155
|
-
cursor.execute(sql, flat_data)
|
|
156
|
-
conn.commit()
|
|
157
|
-
print(f"成功插入/更新 {cursor.rowcount} 条记录")
|
|
158
|
-
break
|
|
159
|
-
except Exception as e:
|
|
160
|
-
if conn:
|
|
161
|
-
conn.rollback()
|
|
162
|
-
print("入库失败:", e)
|
|
163
|
-
if '1054, "Unknown column' in str(e):
|
|
164
|
-
check_and_sync_columns(df,table_name,database)
|
|
165
|
-
else:
|
|
166
|
-
raise Exception(e)
|
|
167
|
-
finally:
|
|
168
|
-
if conn:
|
|
169
|
-
conn.close()
|
|
170
|
-
|
|
171
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|