cppackage 0.2.9__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage/db/sql_model.py +13 -12
- cppackage-0.3.1/CPpackage/db/test.py +25 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage.egg-info/PKG-INFO +2 -15
- {cppackage-0.2.9 → cppackage-0.3.1}/PKG-INFO +2 -15
- {cppackage-0.2.9 → cppackage-0.3.1}/setup.py +1 -1
- cppackage-0.2.9/CPpackage/db/test.py +0 -24
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage/__init__.py +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage/core/__init__.py +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage/db/__init__.py +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage/db/config.py +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage/utils/__init__.py +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage.egg-info/SOURCES.txt +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage.egg-info/dependency_links.txt +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage.egg-info/entry_points.txt +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage.egg-info/not-zip-safe +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage.egg-info/requires.txt +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/CPpackage.egg-info/top_level.txt +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/LICENSE +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/MANIFEST.in +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/readme.md +0 -0
- {cppackage-0.2.9 → cppackage-0.3.1}/setup.cfg +0 -0
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
import pymysql
|
|
2
2
|
import time
|
|
3
3
|
from pymysql import Error
|
|
4
|
-
import pandas as pd
|
|
5
|
-
# 条件导入:支持直接运行和包模式
|
|
6
4
|
try:
|
|
7
5
|
from .config import get_db_config
|
|
8
6
|
except (ImportError, ValueError):
|
|
@@ -161,6 +159,13 @@ def find_duplicate_records(table_name, database, unique_index_fields, port=None)
|
|
|
161
159
|
"""
|
|
162
160
|
|
|
163
161
|
try:
|
|
162
|
+
# Lazy import pandas here to avoid heavy imports during module import
|
|
163
|
+
try:
|
|
164
|
+
import pandas as pd
|
|
165
|
+
except Exception as e:
|
|
166
|
+
print("无法导入 pandas:", e)
|
|
167
|
+
return None
|
|
168
|
+
|
|
164
169
|
with _get_connection(database, port) as conn:
|
|
165
170
|
df_duplicates = pd.read_sql(sql, conn)
|
|
166
171
|
if df_duplicates.empty:
|
|
@@ -168,8 +173,8 @@ def find_duplicate_records(table_name, database, unique_index_fields, port=None)
|
|
|
168
173
|
else:
|
|
169
174
|
print(f"发现 {len(df_duplicates)} 条重复记录({len(df_duplicates.drop_duplicates(subset=unique_index_fields))} 组重复组合)")
|
|
170
175
|
# 可选:保存重复记录到CSV
|
|
171
|
-
df_duplicates.to_csv(f"duplicate_records_{table_name}.csv", index=False, encoding="utf-8-sig")
|
|
172
|
-
print("重复记录已保存到 duplicate_records_{table_name}.csv")
|
|
176
|
+
# df_duplicates.to_csv(f"duplicate_records_{table_name}.csv", index=False, encoding="utf-8-sig")
|
|
177
|
+
# print("重复记录已保存到 duplicate_records_{table_name}.csv")
|
|
173
178
|
return df_duplicates
|
|
174
179
|
except pymysql.MySQLError as e:
|
|
175
180
|
print("数据库查询失败:", e)
|
|
@@ -178,7 +183,7 @@ def find_duplicate_records(table_name, database, unique_index_fields, port=None)
|
|
|
178
183
|
print("未知错误:", e)
|
|
179
184
|
return None
|
|
180
185
|
# ===================== 删除重复记录函数(修复参数+调用逻辑) =====================
|
|
181
|
-
def delete_duplicate_records(table_name, database, unique_index_fields, port=None, keep_strategy="
|
|
186
|
+
def delete_duplicate_records(table_name, database, unique_index_fields, port=None, keep_strategy="max_id"):
|
|
182
187
|
"""
|
|
183
188
|
删除重复记录,仅保留每组唯一值的一条记录
|
|
184
189
|
参数:
|
|
@@ -190,10 +195,6 @@ def delete_duplicate_records(table_name, database, unique_index_fields, port=Non
|
|
|
190
195
|
返回:
|
|
191
196
|
int: 删除的记录数;None: 出错;0: 无重复
|
|
192
197
|
"""
|
|
193
|
-
# 1. 先调用修复后的find_duplicate_records查询重复数据
|
|
194
|
-
duplicate_df = find_duplicate_records(table_name, database, unique_index_fields, port)
|
|
195
|
-
if duplicate_df is None or duplicate_df.empty:
|
|
196
|
-
return 0
|
|
197
198
|
|
|
198
199
|
# 2. 构建删除SQL(核心逻辑)
|
|
199
200
|
group_by_str = ",".join([f"`{field}`" for field in unique_index_fields])
|
|
@@ -248,6 +249,9 @@ def update_datas(df, table_name, database):
|
|
|
248
249
|
conn = None
|
|
249
250
|
for i in range(2):
|
|
250
251
|
try:
|
|
252
|
+
if df.empty:
|
|
253
|
+
print("数据为空,无需入库")
|
|
254
|
+
return
|
|
251
255
|
conn = _get_connection(database)
|
|
252
256
|
cursor = conn.cursor()
|
|
253
257
|
cols = list(df.columns)
|
|
@@ -262,7 +266,6 @@ def update_datas(df, table_name, database):
|
|
|
262
266
|
VALUES {values_str}
|
|
263
267
|
ON DUPLICATE KEY UPDATE {update_clause}
|
|
264
268
|
"""
|
|
265
|
-
print(sql)
|
|
266
269
|
data = [tuple(row) for row in df.values]
|
|
267
270
|
flat_data = [v for row in data for v in row]
|
|
268
271
|
cursor.execute(sql, flat_data)
|
|
@@ -281,5 +284,3 @@ def update_datas(df, table_name, database):
|
|
|
281
284
|
finally:
|
|
282
285
|
if conn:
|
|
283
286
|
conn.close()
|
|
284
|
-
|
|
285
|
-
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from CPpackage.db.sql_model import find_duplicate_records,delete_duplicate_records
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
if __name__ == "__main__":
|
|
6
|
+
# 配置你的参数
|
|
7
|
+
TABLE_NAME = "liuliang_plly"
|
|
8
|
+
DATABASE_NAME = "shengyicanmou"
|
|
9
|
+
# 你的唯一索引字段列表(必须是列表类型!)
|
|
10
|
+
UNIQUE_FIELDS = ['begindate', 'value_l', 'pageName', 'itemId', 'name_l1', 'name_l2', 'date_effect', 'store_id']
|
|
11
|
+
PORT = 3306 # 可选,默认从配置获取
|
|
12
|
+
|
|
13
|
+
# 第一步:查询重复记录
|
|
14
|
+
# dup_df = find_duplicate_records(TABLE_NAME, DATABASE_NAME, UNIQUE_FIELDS, PORT)
|
|
15
|
+
|
|
16
|
+
# # 第二步:确认有重复后删除
|
|
17
|
+
# if dup_df is not None and not dup_df.empty:
|
|
18
|
+
|
|
19
|
+
delete_count = delete_duplicate_records(
|
|
20
|
+
table_name=TABLE_NAME,
|
|
21
|
+
database=DATABASE_NAME,
|
|
22
|
+
unique_index_fields=UNIQUE_FIELDS, # 必须传列表!
|
|
23
|
+
port=PORT,
|
|
24
|
+
keep_strategy="max_id"
|
|
25
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: cppackage
|
|
3
|
-
Version: 0.2.9
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: 超品集团自用的Python包
|
|
5
5
|
Home-page: https://github.com/example/CPpackage
|
|
6
6
|
Author: team-数智组
|
|
@@ -17,19 +17,6 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Requires-Python: >=3.8
|
|
18
18
|
Description-Content-Type: text/markdown
|
|
19
19
|
License-File: LICENSE
|
|
20
|
-
Requires-Dist: pymysql
|
|
21
|
-
Requires-Dist: pandas
|
|
22
|
-
Requires-Dist: numpy
|
|
23
|
-
Dynamic: author
|
|
24
|
-
Dynamic: author-email
|
|
25
|
-
Dynamic: classifier
|
|
26
|
-
Dynamic: description
|
|
27
|
-
Dynamic: description-content-type
|
|
28
|
-
Dynamic: home-page
|
|
29
|
-
Dynamic: license-file
|
|
30
|
-
Dynamic: requires-dist
|
|
31
|
-
Dynamic: requires-python
|
|
32
|
-
Dynamic: summary
|
|
33
20
|
|
|
34
21
|
# CPpackage
|
|
35
22
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: cppackage
|
|
3
|
-
Version: 0.2.9
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: 超品集团自用的Python包
|
|
5
5
|
Home-page: https://github.com/example/CPpackage
|
|
6
6
|
Author: team-数智组
|
|
@@ -17,19 +17,6 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Requires-Python: >=3.8
|
|
18
18
|
Description-Content-Type: text/markdown
|
|
19
19
|
License-File: LICENSE
|
|
20
|
-
Requires-Dist: pymysql
|
|
21
|
-
Requires-Dist: pandas
|
|
22
|
-
Requires-Dist: numpy
|
|
23
|
-
Dynamic: author
|
|
24
|
-
Dynamic: author-email
|
|
25
|
-
Dynamic: classifier
|
|
26
|
-
Dynamic: description
|
|
27
|
-
Dynamic: description-content-type
|
|
28
|
-
Dynamic: home-page
|
|
29
|
-
Dynamic: license-file
|
|
30
|
-
Dynamic: requires-dist
|
|
31
|
-
Dynamic: requires-python
|
|
32
|
-
Dynamic: summary
|
|
33
20
|
|
|
34
21
|
# CPpackage
|
|
35
22
|
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from CPpackage.db.sql_model import find_duplicate_records,delete_duplicate_records
|
|
2
|
-
import os
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
if __name__ == "__main__":
|
|
6
|
-
# 配置你的参数
|
|
7
|
-
TABLE_NAME = "pinlei_hgjk" # 替换为实际表名
|
|
8
|
-
DATABASE_NAME = "shengyicanmou_copy" # 替换为实际数据库名
|
|
9
|
-
# 你的唯一索引字段列表(必须是列表类型!)
|
|
10
|
-
UNIQUE_FIELDS = ['begindate', 'enddate', 'value_l', 'sellerId', 'statDate', 'date_effect', 'store_id']
|
|
11
|
-
PORT = 3306 # 可选,默认从配置获取
|
|
12
|
-
|
|
13
|
-
# 第一步:查询重复记录
|
|
14
|
-
dup_df = find_duplicate_records(TABLE_NAME, DATABASE_NAME, UNIQUE_FIELDS, PORT)
|
|
15
|
-
|
|
16
|
-
# 第二步:确认有重复后删除
|
|
17
|
-
if dup_df is not None and not dup_df.empty:
|
|
18
|
-
delete_count = delete_duplicate_records(
|
|
19
|
-
table_name=TABLE_NAME,
|
|
20
|
-
database=DATABASE_NAME,
|
|
21
|
-
unique_index_fields=UNIQUE_FIELDS, # 必须传列表!
|
|
22
|
-
port=PORT,
|
|
23
|
-
keep_strategy="min_id"
|
|
24
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|