pgsqldatatool 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.3
2
+ Name: pgsqldatatool
3
+ Version: 1.0.0
4
+ Summary: Add your description here
5
+ Author: manji
6
+ Author-email: manji <pnsm@qq.com>
7
+ Requires-Dist: asyncpg>=0.31.0
8
+ Requires-Dist: polars>=1.41.2
9
+ Requires-Dist: python-dotenv>=1.2.2
10
+ Requires-Dist: sqlalchemy>=2.0.51
11
+ Requires-Dist: tzdata>=2026.2
12
+ Requires-Python: >=3.14
13
+ Description-Content-Type: text/markdown
14
+
15
+ ## 使用教程
16
+
17
+ ### 安装
18
+ ```bash
19
+ pip install pgsqldatatool
20
+ ```
21
+
22
+ ### data_clean(数据清洗)
23
+
24
+ ``` python
25
+ # 基础清洗 (去掉列名两端空白字符、去掉数据两端空白字符、删除空行)
26
+ lf_basic_clean(df)
27
+
28
+ # 根据字典重命名列
29
+ lf_rename_cols
30
+
31
+ # 移除重复行
32
+ lf_remove_dup_rows
33
+
34
+ # 删除指定列
35
+ lf_remove_cols
36
+
37
+ # 移除数字千分号
38
+ lf_remove_per_mille
39
+
40
+ # 移除数字百分号
41
+ lf_remove_percent
42
+
43
+ # 添加时间列
44
+ lf_add_time
45
+
46
+ # 删除完全为空的列 -- 这一步不是惰性计算,可能会降低性能
47
+ df_drop_empty_cols
48
+ ```
49
+ 示例
50
+
51
+ ```python
52
+ import polars as pl
+ from pgsqldatatool import data_clean as dc
53
+ df = pl.read_excel(r"D:\manji\Downloads\判断中国域名.xlsx")
54
+ df = dc.lf_basic_clean(df)
55
+ df = dc.lf_remove_cols(df,["域名22"])
56
+ df = dc.lf_remove_dup_rows(df)
57
+ df = dc.df_drop_empty_cols(df)
58
+ df = dc.lf_remove_dup_rows(df)
59
+ df = df.collect()
60
+ print(df)
61
+ ```
62
+
63
+ ### PoolSingleton(连接池单例模式)
64
+
65
+ 归还连接(Release):把连接还给池子,让别的任务继续用。(async with 已经帮你自动做了)
66
+ 关闭连接池(Close):彻底断开与数据库的所有连接,销毁这个池子。这通常只在整个程序/服务准备退出时才需要做。
67
+ ```python
68
+ import asyncio
+ from pgsqldatatool import PoolSingleton
69
+
70
+ # 示例1:使用异步连接池
71
+ async def main_test_1():
72
+
73
+ # 调用方式1,
74
+ async with PoolSingleton.acquire() as conn:
75
+ records = await conn.fetch(""" SELECT * FROM public."test20260211" """)
76
+ print(records)
77
+
78
+ # 销毁整个连接池
79
+ await PoolSingleton.close()
80
+
81
+ ```
82
+
83
+
84
+ ```python
85
+
86
+ # 示例2:使用静态方法
87
+ async def main_test_2():
88
+ # 调用方式2:使用静态方法
89
+ records2 = await PoolSingleton.fetch(""" SELECT * FROM public."test20260211" """)
90
+ print(records2)
91
+
92
+ # 调用方式3:使用静态方法
93
+ records3 = await PoolSingleton.fetchrow(""" SELECT * FROM public."test20260211" """)
94
+ print(records3)
95
+
96
+ # 调用方式4:使用静态方法
97
+ records4 = await PoolSingleton.execute(""" SELECT * FROM public."test20260211" """)
98
+ print(records4)
99
+
100
+
101
+ if __name__ == "__main__":
102
+ asyncio.run(main_test_1())
103
+ ```
104
+
105
+
106
+ ## 创建异步数据库链接
107
+
108
+ ### 推荐方案:asyncpg(性能最好,原生 asyncio)
109
+ 适合:FastAPI / aiohttp / asyncio 项目
110
+ ```bash
111
+ pip install asyncpg
112
+ ```
113
+
114
+ ```python
115
+ import asyncpg
116
+ import asyncio
117
+
118
+ async def main():
119
+ # 创建连接
120
+ conn = await asyncpg.connect(
121
+ host="localhost",
122
+ port=5432,
123
+ user="postgres",
124
+ password="password",
125
+ database="testdb"
126
+ )
127
+
128
+ # 查询
129
+ row = await conn.fetchrow("SELECT NOW()")
130
+ print(row)
131
+
132
+ # 参数化查询
133
+ rows = await conn.fetch(
134
+ "SELECT * FROM users WHERE age > $1",
135
+ 18
136
+ )
137
+
138
+ await conn.close()
139
+
140
+ asyncio.run(main())
141
+ ```
142
+
143
+ ### 使用连接池(生产必选 ✅)
144
+
145
+ ```python
146
+ import asyncpg
147
+ import asyncio
148
+
149
+ async def get_pool():
150
+ return await asyncpg.create_pool(
151
+ dsn="postgresql://postgres:password@localhost:5432/testdb",
152
+ min_size=5,
153
+ max_size=20
154
+ )
155
+
156
+ async def main():
157
+ pool = await get_pool()
158
+
159
+ async with pool.acquire() as conn:
160
+ result = await conn.fetch("SELECT * FROM users")
161
+
162
+ await pool.close()
163
+
164
+ asyncio.run(main())
165
+ ```
166
+
167
+ 事务
168
+ ```python
169
+ async with conn.transaction():
170
+ await conn.execute(
171
+ "INSERT INTO users(name) VALUES($1)",
172
+ "Alice"
173
+ )
174
+ ```
175
+
176
+
177
+ ### SQLAlchemy 2.0 + asyncpg(ORM 场景)
178
+ 适合:需要 ORM、多数据库兼容
179
+
180
+ ```bash
181
+ pip install sqlalchemy[asyncio] asyncpg
182
+ ```
183
+
184
+ ```python
185
+ from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
186
+ from sqlalchemy.orm import sessionmaker
187
+ from sqlalchemy import select
188
+
189
+ engine = create_async_engine(
190
+ "postgresql+asyncpg://postgres:password@localhost/testdb",
191
+ pool_size=10,
192
+ max_overflow=20
193
+ )
194
+
195
+ AsyncSessionLocal = sessionmaker(
196
+ engine, class_=AsyncSession, expire_on_commit=False
197
+ )
198
+
199
+ async def query_users():
200
+ async with AsyncSessionLocal() as session:
201
+ result = await session.execute(select(User))
202
+ return result.scalars().all()
203
+ ```
204
+
@@ -0,0 +1,190 @@
1
+ ## 使用教程
2
+
3
+ ### 安装
4
+ ```bash
5
+ pip install pgsqldatatool
6
+ ```
7
+
8
+ ### data_clean(数据清洗)
9
+
10
+ ``` python
11
+ # 基础清洗 (去掉列名两端空白字符、去掉数据两端空白字符、删除空行)
12
+ lf_basic_clean(df)
13
+
14
+ # 根据字典重命名列
15
+ lf_rename_cols
16
+
17
+ # 移除重复行
18
+ lf_remove_dup_rows
19
+
20
+ # 删除指定列
21
+ lf_remove_cols
22
+
23
+ # 移除数字千分号
24
+ lf_remove_per_mille
25
+
26
+ # 移除数字百分号
27
+ lf_remove_percent
28
+
29
+ # 添加时间列
30
+ lf_add_time
31
+
32
+ # 删除完全为空的列 -- 这一步不是惰性计算,可能会降低性能
33
+ df_drop_empty_cols
34
+ ```
35
+ 示例
36
+
37
+ ```python
38
+ import polars as pl
+ from pgsqldatatool import data_clean as dc
39
+ df = pl.read_excel(r"D:\manji\Downloads\判断中国域名.xlsx")
40
+ df = dc.lf_basic_clean(df)
41
+ df = dc.lf_remove_cols(df,["域名22"])
42
+ df = dc.lf_remove_dup_rows(df)
43
+ df = dc.df_drop_empty_cols(df)
44
+ df = dc.lf_remove_dup_rows(df)
45
+ df = df.collect()
46
+ print(df)
47
+ ```
48
+
49
+ ### PoolSingleton(连接池单例模式)
50
+
51
+ 归还连接(Release):把连接还给池子,让别的任务继续用。(async with 已经帮你自动做了)
52
+ 关闭连接池(Close):彻底断开与数据库的所有连接,销毁这个池子。这通常只在整个程序/服务准备退出时才需要做。
53
+ ```python
54
+ import asyncio
+ from pgsqldatatool import PoolSingleton
55
+
56
+ # 示例1:使用异步连接池
57
+ async def main_test_1():
58
+
59
+ # 调用方式1,
60
+ async with PoolSingleton.acquire() as conn:
61
+ records = await conn.fetch(""" SELECT * FROM public."test20260211" """)
62
+ print(records)
63
+
64
+ # 销毁整个连接池
65
+ await PoolSingleton.close()
66
+
67
+ ```
68
+
69
+
70
+ ```python
71
+
72
+ # 示例2:使用静态方法
73
+ async def main_test_2():
74
+ # 调用方式2:使用静态方法
75
+ records2 = await PoolSingleton.fetch(""" SELECT * FROM public."test20260211" """)
76
+ print(records2)
77
+
78
+ # 调用方式3:使用静态方法
79
+ records3 = await PoolSingleton.fetchrow(""" SELECT * FROM public."test20260211" """)
80
+ print(records3)
81
+
82
+ # 调用方式4:使用静态方法
83
+ records4 = await PoolSingleton.execute(""" SELECT * FROM public."test20260211" """)
84
+ print(records4)
85
+
86
+
87
+ if __name__ == "__main__":
88
+ asyncio.run(main_test_1())
89
+ ```
90
+
91
+
92
+ ## 创建异步数据库链接
93
+
94
+ ### 推荐方案:asyncpg(性能最好,原生 asyncio)
95
+ 适合:FastAPI / aiohttp / asyncio 项目
96
+ ```bash
97
+ pip install asyncpg
98
+ ```
99
+
100
+ ```python
101
+ import asyncpg
102
+ import asyncio
103
+
104
+ async def main():
105
+ # 创建连接
106
+ conn = await asyncpg.connect(
107
+ host="localhost",
108
+ port=5432,
109
+ user="postgres",
110
+ password="password",
111
+ database="testdb"
112
+ )
113
+
114
+ # 查询
115
+ row = await conn.fetchrow("SELECT NOW()")
116
+ print(row)
117
+
118
+ # 参数化查询
119
+ rows = await conn.fetch(
120
+ "SELECT * FROM users WHERE age > $1",
121
+ 18
122
+ )
123
+
124
+ await conn.close()
125
+
126
+ asyncio.run(main())
127
+ ```
128
+
129
+ ### 使用连接池(生产必选 ✅)
130
+
131
+ ```python
132
+ import asyncpg
133
+ import asyncio
134
+
135
+ async def get_pool():
136
+ return await asyncpg.create_pool(
137
+ dsn="postgresql://postgres:password@localhost:5432/testdb",
138
+ min_size=5,
139
+ max_size=20
140
+ )
141
+
142
+ async def main():
143
+ pool = await get_pool()
144
+
145
+ async with pool.acquire() as conn:
146
+ result = await conn.fetch("SELECT * FROM users")
147
+
148
+ await pool.close()
149
+
150
+ asyncio.run(main())
151
+ ```
152
+
153
+ 事务
154
+ ```python
155
+ async with conn.transaction():
156
+ await conn.execute(
157
+ "INSERT INTO users(name) VALUES($1)",
158
+ "Alice"
159
+ )
160
+ ```
161
+
162
+
163
+ ### SQLAlchemy 2.0 + asyncpg(ORM 场景)
164
+ 适合:需要 ORM、多数据库兼容
165
+
166
+ ```bash
167
+ pip install sqlalchemy[asyncio] asyncpg
168
+ ```
169
+
170
+ ```python
171
+ from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
172
+ from sqlalchemy.orm import sessionmaker
173
+ from sqlalchemy import select
174
+
175
+ engine = create_async_engine(
176
+ "postgresql+asyncpg://postgres:password@localhost/testdb",
177
+ pool_size=10,
178
+ max_overflow=20
179
+ )
180
+
181
+ AsyncSessionLocal = sessionmaker(
182
+ engine, class_=AsyncSession, expire_on_commit=False
183
+ )
184
+
185
+ async def query_users():
186
+ async with AsyncSessionLocal() as session:
187
+ result = await session.execute(select(User))
188
+ return result.scalars().all()
189
+ ```
190
+
@@ -0,0 +1,30 @@
1
+ [project]
2
+ name = "pgsqldatatool"
3
+ version = "1.0.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "manji", email = "pnsm@qq.com" }
8
+ ]
9
+ requires-python = ">=3.14"
10
+ dependencies = [
11
+ "asyncpg>=0.31.0",
12
+ "polars>=1.41.2",
13
+ "python-dotenv>=1.2.2",
14
+ "sqlalchemy>=2.0.51",
15
+ "tzdata>=2026.2",
16
+ ]
17
+
18
+ [project.scripts]
19
+ pgsqldatatool = "pgsqldatatool:main"
20
+
21
+ [build-system]
22
+ requires = ["uv_build>=0.10.11,<0.11.0"]
23
+ build-backend = "uv_build"
24
+
25
+
26
+ [[tool.uv.index]]
27
+ # 使用清华大学PyPI镜像源以提高下载速度
28
+ url = "https://pypi.tuna.tsinghua.edu.cn/simple/"
29
+ # 设置为默认索引
30
+ default = true
@@ -0,0 +1,8 @@
1
def main() -> None:
    """Console-script entry point: print a one-line greeting to stdout."""
    greeting = "Hello from pgsqldatatool!"
    print(greeting)
3
+
4
+
5
+ # from .until_async import write_async
6
+ from .until_async import write_pg
7
+ from .tools import records_to_df
8
+ from .pgsql_connection_async import PoolSingleton
@@ -0,0 +1,230 @@
1
+
2
+ import re
3
+ import logging
4
+ from datetime import datetime
5
+ from zoneinfo import ZoneInfo
6
+ import polars as pl
7
+
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ # 基础清洗 (去掉列名两端空白字符、去掉数据两端空白字符、删除空行)
13
def lf_basic_clean(df: pl.LazyFrame | pl.DataFrame) -> pl.LazyFrame:
    """Apply the baseline cleaning steps to a frame.

    Steps, in order:
      1. Trim leading/trailing whitespace (including newlines) from every
         column name.
      2. For every String column, strip surrounding whitespace from each
         value and then apply ``str.replace("-", "")``.
      3. Keep only rows in which at least one column is not null.

    Args:
        df: Frame to clean. An eager ``DataFrame`` is converted to lazy first.

    Returns:
        The cleaned frame as a ``LazyFrame``.
    """
    lazy = df if isinstance(df, pl.LazyFrame) else df.lazy()

    # --- 1. Column names: drop whitespace at either end ---------------------
    edge_whitespace = r'^[\s\n\r]+|[\s\n\r]+$'
    old_names = lazy.collect_schema().names()
    trimmed_names = (
        pl.Series(old_names).str.replace_all(edge_whitespace, "").to_list()
    )
    lazy = lazy.rename(dict(zip(old_names, trimmed_names)))

    # --- 2. String values: strip whitespace, then remove a hyphen -----------
    # NOTE(review): `str.replace` substitutes only the FIRST "-" found anywhere
    # in the value (so "2024-01-02" becomes "202401-02" and "-5" becomes "5").
    # If the intent was only to blank out "-" placeholders or to strip hyphens
    # at the ends, this needs revisiting — confirm with the author.
    cleaned_text = pl.col(pl.String).str.strip_chars().str.replace("-", "")

    # --- 3. Rows: keep those with at least one non-null cell ----------------
    # Only nulls count as "empty" here; empty strings are not treated as null.
    has_any_value = pl.any_horizontal(pl.all().is_not_null())

    return lazy.with_columns(cleaned_text).filter(has_any_value)
49
+
50
+ # 根据字典重命名列
51
def lf_rename_cols(
    df: pl.LazyFrame | pl.DataFrame,
    rename_cols: dict[str, str] | None,
) -> pl.LazyFrame:
    """Rename columns according to a mapping, ignoring keys that are absent.

    Args:
        df: Frame to process. An eager ``DataFrame`` is converted to lazy first.
        rename_cols: Mapping of ``old_name -> new_name``. Keys that are not
            columns of ``df`` are skipped instead of raising. ``None`` or an
            empty mapping leaves the frame unchanged.

    Returns:
        The frame as a ``LazyFrame`` with the matching columns renamed.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    if not rename_cols:
        return df

    # Resolve the schema once. Reading `LazyFrame.columns` inside the
    # comprehension would re-resolve it for every key in the mapping.
    existing = set(df.collect_schema().names())
    applicable = {
        old: new for old, new in rename_cols.items() if old in existing
    }

    return df.rename(applicable) if applicable else df
70
+
71
+ # 移除重复行
72
def lf_remove_dup_rows(df: pl.LazyFrame | pl.DataFrame) -> pl.LazyFrame:
    """Drop rows that are exact duplicates of an earlier row.

    Rows are compared across all columns; the first occurrence is kept.

    Args:
        df: Frame to de-duplicate. An eager ``DataFrame`` is converted to
            lazy first.

    Returns:
        The de-duplicated frame as a ``LazyFrame``.
    """
    lazy = df.lazy() if isinstance(df, pl.DataFrame) else df

    # subset=None means every column takes part in the comparison.
    return lazy.unique(subset=None, keep="first")
89
+
90
+ # 删除指定列
91
def lf_remove_cols(df, cols: list[str] | None = None) -> pl.LazyFrame:
    """Drop the named columns, ignoring names that are not present.

    Args:
        df: ``LazyFrame`` or ``DataFrame`` to process. An eager ``DataFrame``
            is converted to lazy first.
        cols: Names of the columns to drop. Names that are not columns of
            ``df`` are skipped instead of raising. ``None`` or an empty list
            leaves the frame unchanged.

    Returns:
        The frame as a ``LazyFrame`` without the requested columns.
    """
    if isinstance(df, pl.DataFrame):
        df = df.lazy()

    if not cols:
        return df

    # Resolve the schema once. Reading `LazyFrame.columns` inside the
    # comprehension would re-resolve it for every requested name.
    existing = set(df.collect_schema().names())
    present = [name for name in cols if name in existing]

    return df.drop(present) if present else df
108
+
109
+ # 移除数字千分号
110
def lf_remove_per_mille(df, cols: str | list[str] | None = None) -> pl.LazyFrame:
    """Remove thousands separators (commas) from numeric strings.

    A value is rewritten only when the whole string looks like a
    comma-grouped number (e.g. ``"1,234,567.89"``); every other value is left
    untouched. The result stays a string column — no numeric cast is done.

    Args:
        df: ``LazyFrame`` or ``DataFrame`` to process. An eager ``DataFrame``
            is converted to lazy first.
        cols: Which columns to process:
            * ``None`` (default) or an empty list: nothing is changed;
            * the string ``"all"``: every String column;
            * a list of column names: exactly those columns.

    Returns:
        The processed frame as a ``LazyFrame``.

    Raises:
        ValueError: If ``cols`` is neither ``"all"``, a list, nor empty/None.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    # Nothing requested: hand the (lazy) frame back unchanged.
    if not cols:
        return df

    if isinstance(cols, str) and cols == "all":
        target = pl.col(pl.String)
    elif isinstance(cols, list):
        target = pl.col(cols)
    else:
        raise ValueError("per_mille_cols参数错误")

    # Whole value must be digits grouped in threes by commas, with an
    # optional decimal part.
    grouped_number = r'^\d{1,3}(,\d{3})*(\.\d+)?$'

    # The original implementation never returned the frame, so callers always
    # received None; return the transformed frame explicitly.
    return df.with_columns(
        pl.when(target.str.contains(grouped_number))
        .then(target.str.replace_all(",", ""))
        .otherwise(target)
    )
144
+
145
+ # 移除数字百分号
146
def lf_remove_percent(df, cols: str | list[str] | None = None) -> pl.LazyFrame:
    """Convert percentage strings such as ``"12.5%"`` to their decimal value.

    For each targeted value the surrounding whitespace is stripped, commas
    are removed, and a string of the form ``<digits>[.<digits>]%`` is parsed
    as a float and divided by 100. Values that do not match that form fall
    back to the original value.

    Args:
        df: ``LazyFrame`` or ``DataFrame`` to process. An eager ``DataFrame``
            is converted to lazy first.
        cols: Which columns to process:
            * ``None`` (default) or an empty list: nothing is changed;
            * the string ``"all"``: every String column;
            * a list of column names: exactly those columns.

    Returns:
        The processed frame as a ``LazyFrame``.

    Raises:
        ValueError: If ``cols`` is neither ``"all"``, a list, nor empty/None.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    # The documented default (None) used to fall through to the ValueError
    # below; treat "nothing requested" as a no-op, like lf_remove_per_mille.
    if not cols:
        return df

    if isinstance(cols, str) and cols == "all":
        target = pl.col(pl.String)
    elif isinstance(cols, list):
        target = pl.col(cols)
    else:
        raise ValueError("percent_cols参数错误")

    # NOTE(review): the parsed branch is Float64 while the fallback (`target`)
    # is the original String column; the final dtype depends on how polars
    # coerces the two in fill_null — confirm it is the intended String output.
    return df.with_columns(
        target
        .str.strip_chars()
        .str.replace_all(",", "")
        .str.extract(r"^(\d+\.?\d*)%$")
        .cast(pl.Float64)
        .truediv(100)
        .fill_null(target)
    )
177
+
178
+ # 添加时间列
179
def lf_add_time(df, time_zone: str = 'Asia/Shanghai') -> pl.LazyFrame:
    """Append a constant timestamp column named ``数据写入时间``.

    Args:
        df: ``LazyFrame`` or ``DataFrame`` to process. An eager ``DataFrame``
            is converted to lazy first.
        time_zone: IANA time zone name used for the timestamp. Defaults to
            ``'Asia/Shanghai'``.

    Returns:
        The frame as a ``LazyFrame`` with the extra timestamp column.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    # datetime.now() runs right here, when this function is called, and the
    # resulting value is embedded in the plan as a literal. It is NOT
    # re-evaluated at .collect() time, so every row gets the moment of this
    # call, however much later the frame is collected.
    written_at = datetime.now(ZoneInfo(time_zone))

    return df.with_columns(pl.lit(written_at).alias('数据写入时间'))
197
+
198
+ # 删除完全为空的列
199
def df_drop_empty_cols(df) -> pl.LazyFrame:
    """Drop every column whose values are all null.

    This function is not fully lazy: it runs one ``collect()`` to compute a
    per-column "entirely null" flag, which requires evaluating the frame.
    The final ``drop`` is added to the lazy plan as usual.

    Only nulls count as empty; empty strings are not treated as null. A frame
    with zero rows is returned with all of its columns intact.

    Args:
        df: ``LazyFrame`` or ``DataFrame`` to process. An eager ``DataFrame``
            is converted to lazy first.

    Returns:
        The frame as a ``LazyFrame`` without the all-null columns.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    names = df.collect_schema().names()
    if not names:
        # No columns at all: nothing to inspect or drop.
        return df

    # One aggregated boolean PER COLUMN: True when every value is null.
    # (The previous pl.all_horizontal(...) produced one flag per ROW, so only
    # the first column could ever be dropped, based on the first row.)
    # The pl.len() guard keeps the columns of a zero-row frame, where
    # "all values are null" would otherwise be vacuously true.
    entirely_null = pl.all().is_null().all() & (pl.len() > 0)

    # Single collect: the result has one row with one flag per column,
    # in schema order.
    flags = df.select(entirely_null).collect().row(0)

    cols_to_drop = [name for name, is_empty in zip(names, flags) if is_empty]

    return df.drop(cols_to_drop) if cols_to_drop else df