pgsqldatatool 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pgsqldatatool-1.0.0/PKG-INFO +204 -0
- pgsqldatatool-1.0.0/README.md +190 -0
- pgsqldatatool-1.0.0/pyproject.toml +30 -0
- pgsqldatatool-1.0.0/src/pgsqldatatool/__init__.py +8 -0
- pgsqldatatool-1.0.0/src/pgsqldatatool/data_clean.py +230 -0
- pgsqldatatool-1.0.0/src/pgsqldatatool/pgsql_connection_async.py +76 -0
- pgsqldatatool-1.0.0/src/pgsqldatatool/tools.py +24 -0
- pgsqldatatool-1.0.0/src/pgsqldatatool/until_async.py +678 -0
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: pgsqldatatool
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Add your description here
|
|
5
|
+
Author: manji
|
|
6
|
+
Author-email: manji <pnsm@qq.com>
|
|
7
|
+
Requires-Dist: asyncpg>=0.31.0
|
|
8
|
+
Requires-Dist: polars>=1.41.2
|
|
9
|
+
Requires-Dist: python-dotenv>=1.2.2
|
|
10
|
+
Requires-Dist: sqlalchemy>=2.0.51
|
|
11
|
+
Requires-Dist: tzdata>=2026.2
|
|
12
|
+
Requires-Python: >=3.14
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
## 使用教程
|
|
16
|
+
|
|
17
|
+
### 安装
|
|
18
|
+
```bash
|
|
19
|
+
pip install pgsqldatatool
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### data_clean(数据清洗)
|
|
23
|
+
|
|
24
|
+
``` python
|
|
25
|
+
# 基础清洗 (去掉列名两端空白字符、去掉数据两端空白字符、删除空行)
|
|
26
|
+
lf_basic_clean(df)
|
|
27
|
+
|
|
28
|
+
# 根据字典重命名列
|
|
29
|
+
lf_rename_cols
|
|
30
|
+
|
|
31
|
+
# 移除重复行
|
|
32
|
+
lf_remove_dup_rows
|
|
33
|
+
|
|
34
|
+
# 删除指定列
|
|
35
|
+
lf_remove_cols
|
|
36
|
+
|
|
37
|
+
# 移除数字千分号
|
|
38
|
+
lf_remove_per_mille
|
|
39
|
+
|
|
40
|
+
# 移除数字百分号
|
|
41
|
+
lf_remove_percent
|
|
42
|
+
|
|
43
|
+
# 添加时间列
|
|
44
|
+
lf_add_time
|
|
45
|
+
|
|
46
|
+
# 删除完全为空的列 -- 这一步不是惰性计算,可能会降低性能
|
|
47
|
+
df_drop_empty_cols
|
|
48
|
+
```
|
|
49
|
+
示例
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from pgsqldatatool import data_clean as dc
|
|
53
|
+
df = pl.read_excel(r"D:\manji\Downloads\判断中国域名.xlsx")
|
|
54
|
+
df = dc.lf_basic_clean(df)
|
|
55
|
+
df = dc.lf_remove_cols(df,["域名22"])
|
|
56
|
+
df = dc.lf_remove_dup_rows(df)
|
|
57
|
+
df = dc.df_drop_empty_cols(df)
|
|
58
|
+
df = dc.lf_remove_dup_rows(df)
|
|
59
|
+
df = df.collect()
|
|
60
|
+
print(df)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### PoolSingleton(连接池单例模式)
|
|
64
|
+
|
|
65
|
+
归还连接(Release):把连接还给池子,让别的任务继续用。(async with 已经帮你自动做了)
|
|
66
|
+
关闭连接池(Close):彻底断开与数据库的所有连接,销毁这个池子。这通常只在整个程序/服务准备退出时才需要做。
|
|
67
|
+
```python
|
|
68
|
+
from pgsqldatatool import PoolSingleton
|
|
69
|
+
|
|
70
|
+
# 示例1:使用异步连接池
|
|
71
|
+
async def main_test_1():
|
|
72
|
+
|
|
73
|
+
# 调用方式1,
|
|
74
|
+
async with PoolSingleton.acquire() as conn:
|
|
75
|
+
records = await conn.fetch(""" SELECT * FROM public."test20260211" """)
|
|
76
|
+
print(records)
|
|
77
|
+
|
|
78
|
+
# 销毁整个连接池
|
|
79
|
+
await PoolSingleton.close()
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
|
|
86
|
+
# 示例2:使用静态方法
|
|
87
|
+
async def main_test_2():
|
|
88
|
+
# 调用方式2:使用静态方法
|
|
89
|
+
records2 = await PoolSingleton.fetch(""" SELECT * FROM public."test20260211" """)
|
|
90
|
+
print(records2)
|
|
91
|
+
|
|
92
|
+
# 调用方式3:使用静态方法
|
|
93
|
+
records3 = await PoolSingleton.fetchrow(""" SELECT * FROM public."test20260211" """)
|
|
94
|
+
print(records3)
|
|
95
|
+
|
|
96
|
+
# 调用方式4:使用静态方法
|
|
97
|
+
records4 = await PoolSingleton.execute(""" SELECT * FROM public."test20260211" """)
|
|
98
|
+
print(records4)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
if __name__ == "__main__":
|
|
102
|
+
asyncio.run(main_test_1())
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
## 创建异步数据库链接
|
|
107
|
+
|
|
108
|
+
### 推荐方案:asyncpg(性能最好,原生 asyncio)
|
|
109
|
+
适合:FastAPI / aiohttp / asyncio 项目
|
|
110
|
+
```bash
|
|
111
|
+
pip install asyncpg
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import asyncpg
|
|
116
|
+
import asyncio
|
|
117
|
+
|
|
118
|
+
async def main():
|
|
119
|
+
# 创建连接
|
|
120
|
+
conn = await asyncpg.connect(
|
|
121
|
+
host="localhost",
|
|
122
|
+
port=5432,
|
|
123
|
+
user="postgres",
|
|
124
|
+
password="password",
|
|
125
|
+
database="testdb"
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# 查询
|
|
129
|
+
row = await conn.fetchrow("SELECT NOW()")
|
|
130
|
+
print(row)
|
|
131
|
+
|
|
132
|
+
# 参数化查询
|
|
133
|
+
rows = await conn.fetch(
|
|
134
|
+
"SELECT * FROM users WHERE age > $1",
|
|
135
|
+
18
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
await conn.close()
|
|
139
|
+
|
|
140
|
+
asyncio.run(main())
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### 使用连接池(生产必选 ✅)
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
import asyncpg
|
|
147
|
+
import asyncio
|
|
148
|
+
|
|
149
|
+
async def get_pool():
|
|
150
|
+
return await asyncpg.create_pool(
|
|
151
|
+
dsn="postgresql://postgres:password@localhost:5432/testdb",
|
|
152
|
+
min_size=5,
|
|
153
|
+
max_size=20
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
async def main():
|
|
157
|
+
pool = await get_pool()
|
|
158
|
+
|
|
159
|
+
async with pool.acquire() as conn:
|
|
160
|
+
result = await conn.fetch("SELECT * FROM users")
|
|
161
|
+
|
|
162
|
+
await pool.close()
|
|
163
|
+
|
|
164
|
+
asyncio.run(main())
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
事务
|
|
168
|
+
```python
|
|
169
|
+
async with conn.transaction():
|
|
170
|
+
await conn.execute(
|
|
171
|
+
"INSERT INTO users(name) VALUES($1)",
|
|
172
|
+
"Alice"
|
|
173
|
+
)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
### SQLAlchemy 2.0 + asyncpg(ORM 场景)
|
|
178
|
+
适合:需要 ORM、多数据库兼容
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
pip install sqlalchemy[asyncio] asyncpg
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
|
|
186
|
+
from sqlalchemy.orm import sessionmaker
|
|
187
|
+
from sqlalchemy import select
|
|
188
|
+
|
|
189
|
+
engine = create_async_engine(
|
|
190
|
+
"postgresql+asyncpg://postgres:password@localhost/testdb",
|
|
191
|
+
pool_size=10,
|
|
192
|
+
max_overflow=20
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
AsyncSessionLocal = sessionmaker(
|
|
196
|
+
engine, class_=AsyncSession, expire_on_commit=False
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
async def query_users():
|
|
200
|
+
async with AsyncSessionLocal() as session:
|
|
201
|
+
result = await session.execute(select(User))
|
|
202
|
+
return result.scalars().all()
|
|
203
|
+
```
|
|
204
|
+
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
## 使用教程
|
|
2
|
+
|
|
3
|
+
### 安装
|
|
4
|
+
```bash
|
|
5
|
+
pip install pgsqldatatool
|
|
6
|
+
```
|
|
7
|
+
|
|
8
|
+
### data_clean(数据清洗)
|
|
9
|
+
|
|
10
|
+
``` python
|
|
11
|
+
# 基础清洗 (去掉列名两端空白字符、去掉数据两端空白字符、删除空行)
|
|
12
|
+
lf_basic_clean(df)
|
|
13
|
+
|
|
14
|
+
# 根据字典重命名列
|
|
15
|
+
lf_rename_cols
|
|
16
|
+
|
|
17
|
+
# 移除重复行
|
|
18
|
+
lf_remove_dup_rows
|
|
19
|
+
|
|
20
|
+
# 删除指定列
|
|
21
|
+
lf_remove_cols
|
|
22
|
+
|
|
23
|
+
# 移除数字千分号
|
|
24
|
+
lf_remove_per_mille
|
|
25
|
+
|
|
26
|
+
# 移除数字百分号
|
|
27
|
+
lf_remove_percent
|
|
28
|
+
|
|
29
|
+
# 添加时间列
|
|
30
|
+
lf_add_time
|
|
31
|
+
|
|
32
|
+
# 删除完全为空的列 -- 这一步不是惰性计算,可能会降低性能
|
|
33
|
+
df_drop_empty_cols
|
|
34
|
+
```
|
|
35
|
+
示例
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from pgsqldatatool import data_clean as dc
|
|
39
|
+
df = pl.read_excel(r"D:\manji\Downloads\判断中国域名.xlsx")
|
|
40
|
+
df = dc.lf_basic_clean(df)
|
|
41
|
+
df = dc.lf_remove_cols(df,["域名22"])
|
|
42
|
+
df = dc.lf_remove_dup_rows(df)
|
|
43
|
+
df = dc.df_drop_empty_cols(df)
|
|
44
|
+
df = dc.lf_remove_dup_rows(df)
|
|
45
|
+
df = df.collect()
|
|
46
|
+
print(df)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### PoolSingleton(连接池单例模式)
|
|
50
|
+
|
|
51
|
+
归还连接(Release):把连接还给池子,让别的任务继续用。(async with 已经帮你自动做了)
|
|
52
|
+
关闭连接池(Close):彻底断开与数据库的所有连接,销毁这个池子。这通常只在整个程序/服务准备退出时才需要做。
|
|
53
|
+
```python
|
|
54
|
+
from pgsqldatatool import PoolSingleton
|
|
55
|
+
|
|
56
|
+
# 示例1:使用异步连接池
|
|
57
|
+
async def main_test_1():
|
|
58
|
+
|
|
59
|
+
# 调用方式1,
|
|
60
|
+
async with PoolSingleton.acquire() as conn:
|
|
61
|
+
records = await conn.fetch(""" SELECT * FROM public."test20260211" """)
|
|
62
|
+
print(records)
|
|
63
|
+
|
|
64
|
+
# 销毁整个连接池
|
|
65
|
+
await PoolSingleton.close()
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
|
|
72
|
+
# 示例2:使用静态方法
|
|
73
|
+
async def main_test_2():
|
|
74
|
+
# 调用方式2:使用静态方法
|
|
75
|
+
records2 = await PoolSingleton.fetch(""" SELECT * FROM public."test20260211" """)
|
|
76
|
+
print(records2)
|
|
77
|
+
|
|
78
|
+
# 调用方式3:使用静态方法
|
|
79
|
+
records3 = await PoolSingleton.fetchrow(""" SELECT * FROM public."test20260211" """)
|
|
80
|
+
print(records3)
|
|
81
|
+
|
|
82
|
+
# 调用方式4:使用静态方法
|
|
83
|
+
records4 = await PoolSingleton.execute(""" SELECT * FROM public."test20260211" """)
|
|
84
|
+
print(records4)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
if __name__ == "__main__":
|
|
88
|
+
asyncio.run(main_test_1())
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
## 创建异步数据库链接
|
|
93
|
+
|
|
94
|
+
### 推荐方案:asyncpg(性能最好,原生 asyncio)
|
|
95
|
+
适合:FastAPI / aiohttp / asyncio 项目
|
|
96
|
+
```bash
|
|
97
|
+
pip install asyncpg
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
import asyncpg
|
|
102
|
+
import asyncio
|
|
103
|
+
|
|
104
|
+
async def main():
|
|
105
|
+
# 创建连接
|
|
106
|
+
conn = await asyncpg.connect(
|
|
107
|
+
host="localhost",
|
|
108
|
+
port=5432,
|
|
109
|
+
user="postgres",
|
|
110
|
+
password="password",
|
|
111
|
+
database="testdb"
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# 查询
|
|
115
|
+
row = await conn.fetchrow("SELECT NOW()")
|
|
116
|
+
print(row)
|
|
117
|
+
|
|
118
|
+
# 参数化查询
|
|
119
|
+
rows = await conn.fetch(
|
|
120
|
+
"SELECT * FROM users WHERE age > $1",
|
|
121
|
+
18
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
await conn.close()
|
|
125
|
+
|
|
126
|
+
asyncio.run(main())
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### 使用连接池(生产必选 ✅)
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
import asyncpg
|
|
133
|
+
import asyncio
|
|
134
|
+
|
|
135
|
+
async def get_pool():
|
|
136
|
+
return await asyncpg.create_pool(
|
|
137
|
+
dsn="postgresql://postgres:password@localhost:5432/testdb",
|
|
138
|
+
min_size=5,
|
|
139
|
+
max_size=20
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
async def main():
|
|
143
|
+
pool = await get_pool()
|
|
144
|
+
|
|
145
|
+
async with pool.acquire() as conn:
|
|
146
|
+
result = await conn.fetch("SELECT * FROM users")
|
|
147
|
+
|
|
148
|
+
await pool.close()
|
|
149
|
+
|
|
150
|
+
asyncio.run(main())
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
事务
|
|
154
|
+
```python
|
|
155
|
+
async with conn.transaction():
|
|
156
|
+
await conn.execute(
|
|
157
|
+
"INSERT INTO users(name) VALUES($1)",
|
|
158
|
+
"Alice"
|
|
159
|
+
)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
### SQLAlchemy 2.0 + asyncpg(ORM 场景)
|
|
164
|
+
适合:需要 ORM、多数据库兼容
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
pip install sqlalchemy[asyncio] asyncpg
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
|
|
172
|
+
from sqlalchemy.orm import sessionmaker
|
|
173
|
+
from sqlalchemy import select
|
|
174
|
+
|
|
175
|
+
engine = create_async_engine(
|
|
176
|
+
"postgresql+asyncpg://postgres:password@localhost/testdb",
|
|
177
|
+
pool_size=10,
|
|
178
|
+
max_overflow=20
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
AsyncSessionLocal = sessionmaker(
|
|
182
|
+
engine, class_=AsyncSession, expire_on_commit=False
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
async def query_users():
|
|
186
|
+
async with AsyncSessionLocal() as session:
|
|
187
|
+
result = await session.execute(select(User))
|
|
188
|
+
return result.scalars().all()
|
|
189
|
+
```
|
|
190
|
+
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pgsqldatatool"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "Add your description here"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "manji", email = "pnsm@qq.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.14"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"asyncpg>=0.31.0",
|
|
12
|
+
"polars>=1.41.2",
|
|
13
|
+
"python-dotenv>=1.2.2",
|
|
14
|
+
"sqlalchemy>=2.0.51",
|
|
15
|
+
"tzdata>=2026.2",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
pgsqldatatool = "pgsqldatatool:main"
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["uv_build>=0.10.11,<0.11.0"]
|
|
23
|
+
build-backend = "uv_build"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
[[tool.uv.index]]
|
|
27
|
+
# 使用清华大学PyPI镜像源以提高下载速度
|
|
28
|
+
url = "https://pypi.tuna.tsinghua.edu.cn/simple/"
|
|
29
|
+
# 设置为默认索引
|
|
30
|
+
default = true
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
|
|
2
|
+
import re
|
|
3
|
+
import logging
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from zoneinfo import ZoneInfo
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Basic cleaning (strip both ends of column names and of string values, drop empty rows)
|
|
13
|
+
def lf_basic_clean(df: pl.LazyFrame | pl.DataFrame) -> pl.LazyFrame:
    """Apply the basic cleaning steps to a frame.

    1. Strip leading/trailing whitespace (spaces, newlines, carriage
       returns, ...) from every column name.
    2. Strip leading/trailing whitespace and "-" characters from every
       value of the String columns. Characters in the middle of a value
       are left alone, so "2024-01-01" is preserved.
    3. Drop the rows in which every column is null.

    Note that a leading "-" is stripped as well, so a string such as
    "-5" becomes "5"; cast numeric columns before calling this function
    if the sign matters.

    Args:
        df (pl.LazyFrame | pl.DataFrame): Frame to clean. An eager
            DataFrame is converted with ``.lazy()``.

    Returns:
        pl.LazyFrame: The frame with the cleaning steps queued.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    # Resolve the schema only once; it is needed both as the source of the
    # cleaned names and as the keys of the rename mapping.
    old_names = df.collect_schema().names()
    if not old_names:
        # A frame without columns has nothing to clean.
        return df

    # Strip whitespace from both ends of the column names.
    pattern = r'^[\s\n\r]+|[\s\n\r]+$'
    new_names = (
        pl.Series(old_names)
        .str.replace_all(pattern, "")
        .to_list()
    )
    df = df.rename(dict(zip(old_names, new_names)))

    # Strip whitespace and "-" from both ends of the string values only.
    # A plain ``str.replace("-", "")`` would delete the first hyphen found
    # anywhere in the value and corrupt dates, ranges and identifiers.
    df = df.with_columns(
        pl.col(pl.String).str.replace_all(r"^[\s-]+|[\s-]+$", "")
    )

    # Keep only the rows that hold at least one non-null value.
    df = df.filter(
        pl.any_horizontal(pl.all().is_not_null())
    )

    return df
|
|
49
|
+
|
|
50
|
+
# 根据字典重命名列
|
|
51
|
+
def lf_rename_cols(df: pl.LazyFrame | pl.DataFrame, rename_cols: dict | None = None) -> pl.LazyFrame:
    """Rename columns according to a mapping.

    Keys of ``rename_cols`` that are not columns of ``df`` are ignored.

    Args:
        df (pl.LazyFrame | pl.DataFrame): Frame whose columns are renamed.
            An eager DataFrame is converted with ``.lazy()``.
        rename_cols (dict | None): Mapping of current column name to new
            column name. ``None`` or an empty mapping leaves the frame
            unchanged.

    Returns:
        pl.LazyFrame: The frame with the rename queued.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    if rename_cols:
        # collect_schema() is the supported way to read LazyFrame column
        # names; LazyFrame.columns emits a PerformanceWarning.
        existing = set(df.collect_schema().names())
        mapping = {old: new for old, new in rename_cols.items() if old in existing}
        if mapping:
            df = df.rename(mapping)

    return df
|
|
70
|
+
|
|
71
|
+
# 移除重复行
|
|
72
|
+
def lf_remove_dup_rows(df: pl.LazyFrame | pl.DataFrame) -> pl.LazyFrame:
    """Remove rows that are exact duplicates of an earlier row.

    Rows are compared on all columns and the first occurrence is kept.

    Args:
        df (pl.LazyFrame | pl.DataFrame): Frame to de-duplicate. An eager
            DataFrame is converted with ``.lazy()``.

    Returns:
        pl.LazyFrame: The frame with the de-duplication queued.
    """
    # Eager frames are switched to lazy mode; anything else is used as is.
    lazy_frame = df.lazy() if isinstance(df, pl.DataFrame) else df

    # subset=None compares whole rows; keep="first" retains the first hit.
    return lazy_frame.unique(subset=None, keep="first")
|
|
89
|
+
|
|
90
|
+
# 删除指定列
|
|
91
|
+
def lf_remove_cols(df: pl.LazyFrame | pl.DataFrame, cols: list[str] | None = None) -> pl.LazyFrame:
    """Drop the given columns from a frame.

    Names in ``cols`` that are not columns of ``df`` are ignored.

    Args:
        df (pl.LazyFrame | pl.DataFrame): Frame to drop columns from. An
            eager DataFrame is converted with ``.lazy()``.
        cols (list[str] | None): Names of the columns to drop. ``None`` or
            an empty list leaves the frame unchanged.

    Returns:
        pl.LazyFrame: The frame with the drop queued.
    """
    if isinstance(df, pl.DataFrame):
        df = df.lazy()

    if cols:
        # collect_schema() is the supported way to read LazyFrame column
        # names; LazyFrame.columns emits a PerformanceWarning. The set makes
        # each membership test O(1).
        existing = set(df.collect_schema().names())
        present = [c for c in cols if c in existing]
        if present:
            df = df.drop(present)

    return df
|
|
108
|
+
|
|
109
|
+
# 移除数字千分号
|
|
110
|
+
def lf_remove_per_mille(df, cols: str | list[str] | None = None):
    """Remove thousands separators from numeric strings.

    A value is rewritten only when the whole string looks like a grouped
    number (for example "1,234,567.89"); every other value is kept as is.
    The result is still a string; no numeric cast is performed.

    Args:
        df (pl.LazyFrame | pl.DataFrame): Frame to process. Anything that is
            not a LazyFrame is converted with ``.lazy()``.
        cols (str | list[str] | None): ``"all"`` to process every String
            column, or a list of column names to process. ``None`` (the
            default) or any other falsy value leaves the frame unchanged.

    Returns:
        pl.LazyFrame: The frame with the replacement queued.

    Raises:
        ValueError: If ``cols`` is truthy but is neither ``"all"`` nor a list.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    # Nothing requested: hand the (lazy) frame back untouched.
    if not cols:
        return df

    if isinstance(cols, str) and cols == "all":
        target = pl.col(pl.String)
    elif isinstance(cols, list):
        target = pl.col(cols)
    else:
        raise ValueError("per_mille_cols参数错误")

    # The frame must be returned; without it callers always receive None.
    return df.with_columns(
        pl.when(
            target.str.contains(r'^\d{1,3}(,\d{3})*(\.\d+)?$')
        )
        .then(target.str.replace_all(",", ""))
        .otherwise(target)
    )
|
|
144
|
+
|
|
145
|
+
# 移除数字百分号
|
|
146
|
+
def lf_remove_percent(df, cols: str | list[str] | None = None):
    """Convert percentage strings to their decimal value, kept as strings.

    A value such as "12.5%" (surrounding whitespace and commas ignored)
    becomes "0.125". Values that do not match the percentage pattern keep
    their original content.

    Args:
        df (pl.LazyFrame | pl.DataFrame): Frame to process. Anything that is
            not a LazyFrame is converted with ``.lazy()``.
        cols (str | list[str] | None): ``"all"`` to process every String
            column, or a list of column names to process. ``None`` (the
            default) leaves the frame unchanged.

    Returns:
        pl.LazyFrame: The frame with the conversion queued.

    Raises:
        ValueError: If ``cols`` is neither ``None``, ``"all"`` nor a list.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    # The default argument must not raise: treat None as "nothing to do",
    # as lf_remove_per_mille does.
    if cols is None:
        return df

    if isinstance(cols, str) and cols == "all":
        target = pl.col(pl.String)
    elif isinstance(cols, list):
        target = pl.col(cols)
    else:
        raise ValueError("percent_cols参数错误")

    df = df.with_columns(
        target
        .str.strip_chars()
        .str.replace_all(",", "")
        .str.extract(r"^(\d+\.?\d*)%$")
        .cast(pl.Float64)
        .truediv(100)
        # Back to String so the fallback below has the same dtype and the
        # column stays a string column, as documented.
        .cast(pl.String)
        # Non-matching values yield null above; restore their original text.
        .fill_null(target)
    )
    return df
|
|
177
|
+
|
|
178
|
+
# 添加时间列
|
|
179
|
+
def lf_add_time(df, time_zone='Asia/Shanghai') -> pl.LazyFrame:
    """Add a constant timestamp column named '数据写入时间'.

    Args:
        df (pl.LazyFrame | pl.DataFrame): Frame to extend. Anything that is
            not a LazyFrame is converted with ``.lazy()``.
        time_zone (str, optional): IANA time zone name used for the
            timestamp. Defaults to 'Asia/Shanghai'.

    Returns:
        pl.LazyFrame: The frame with the extra column queued.
    """
    lazy_frame = df if isinstance(df, pl.LazyFrame) else df.lazy()

    # NOTE: datetime.now() runs here, while the expression is being built,
    # not when the frame is collected. The literal therefore records the
    # moment this function was called; only adding the column is deferred.
    written_at = datetime.now(ZoneInfo(time_zone))

    return lazy_frame.with_columns(pl.lit(written_at).alias('数据写入时间'))
|
|
197
|
+
|
|
198
|
+
# 删除完全为空的列
|
|
199
|
+
def df_drop_empty_cols(df: pl.LazyFrame | pl.DataFrame) -> pl.LazyFrame:
    """Drop every column whose values are all null.

    This function is not fully lazy: it calls ``collect()`` once to learn
    which columns are entirely null, which executes the query plan built so
    far. Only the final ``drop`` is queued lazily.

    NOTE: on a frame with zero rows every column counts as entirely null
    and is therefore dropped.

    Args:
        df (pl.LazyFrame | pl.DataFrame): Frame to process. Anything that is
            not a LazyFrame is converted with ``.lazy()``.

    Returns:
        pl.LazyFrame: The frame without its all-null columns.
    """
    if not isinstance(df, pl.LazyFrame):
        df = df.lazy()

    names = df.collect_schema().names()
    if not names:
        # No columns, so there is nothing to inspect or drop.
        return df

    # Reduce each column to a single boolean ("are all values null?").
    # This has to be a per-column aggregation: a horizontal reduction would
    # give one flag per row instead of one flag per column.
    all_null_flags = df.select(pl.all().is_null().all()).collect().row(0)

    # The flags come back in schema order, so they line up with the names.
    cols_to_drop = [
        col_name for col_name, is_empty in zip(names, all_null_flags)
        if is_empty
    ]

    if cols_to_drop:
        df = df.drop(cols_to_drop)

    return df
|