lidb 2.0.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lidb might be problematic. Click here for more details.

lidb/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright (c) ZhangYundi.
2
+ # Licensed under the MIT License.
3
+
4
+ from .init import (
5
+ NAME,
6
+ DB_PATH,
7
+ CONFIG_PATH,
8
+ get_settings,
9
+ )
10
+
11
+ from .database import (
12
+ sql,
13
+ put,
14
+ has,
15
+ tb_path,
16
+ read_mysql,
17
+ write_mysql,
18
+ execute_mysql,
19
+ read_ck,
20
+ scan,
21
+ )
22
+
23
+ from .table import Table, TableMode
24
+ from .dataset import Dataset, DataLoader
25
+ from .decorator import dataset
26
+ from .qdf import from_polars, Expr
27
+ from .svc import DataService, D
28
+
29
+ from .parse import parse_hive_partition_structure
30
+
31
+ __version__ = "2.0.20"
lidb/database.py ADDED
@@ -0,0 +1,234 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ---------------------------------------------
4
+ Copyright (c) 2025 ZhangYundi
5
+ Licensed under the MIT License.
6
+ Created on 2024/7/1 09:44
7
+ Email: yundi.xxii@outlook.com
8
+ ---------------------------------------------
9
+ """
10
+ import re
11
+ from pathlib import Path
12
+ from typing import Literal
13
+
14
+ import pymysql
15
+
16
+ from .parse import extract_table_names_from_sql
17
+ from .init import DB_PATH, logger, get_settings
18
+ import urllib
19
+ import polars as pl
20
+
21
+ # ======================== 本地数据库 catdb ========================
22
+ def tb_path(tb_name: str) -> Path:
23
+ """
24
+ 返回指定表名 完整的本地路径
25
+ Parameters
26
+ ----------
27
+ tb_name: str
28
+ 表名,路径写法: a/b/c
29
+ Returns
30
+ -------
31
+ pathlib.Path
32
+ full_abs_path: pathlib.Path
33
+ 完整的本地绝对路径 $DB_PATH/a/b/c
34
+ """
35
+ return Path(DB_PATH, tb_name)
36
+
37
+
38
+ def put(df, tb_name: str, partitions: list[str] | None = None):
39
+ """
40
+ 将一个DataFrame写入到指定名称的表格目录中,支持分区存储。
41
+
42
+ 该函数负责将给定的DataFrame(df)根据提供的表名(tb_name)写入到本地文件系统中。
43
+ 如果指定了分区(partitions),则会按照这些分区列将数据分割存储。如果目录不存在,会自动创建目录。
44
+
45
+ Parameters
46
+ ----------
47
+ df: polars.DataFrame
48
+ tb_name: str
49
+ 表的名称,用于确定存储数据的目录
50
+ partitions: list[str] | None
51
+ 指定用于分区的列名列表。如果未提供,则不进行分区。
52
+
53
+ Returns
54
+ -------
55
+
56
+ """
57
+ if df is None:
58
+ logger.warning(f"put failed: input data is None.")
59
+ return
60
+ if df.is_empty():
61
+ logger.warning(f"put failed: input data is empty.")
62
+ return
63
+ tbpath = tb_path(tb_name)
64
+ if not tbpath.exists():
65
+ tbpath.mkdir(parents=True, exist_ok=True)
66
+ if partitions is not None:
67
+ df.write_parquet(tbpath, partition_by=partitions)
68
+ else:
69
+ df.write_parquet(tbpath / "data.parquet")
70
+
71
+ def has(tb_name: str) -> bool:
72
+ """
73
+ 判定给定的表名是否存在
74
+ Parameters
75
+ ----------
76
+ tb_name: str
77
+
78
+ Returns
79
+ -------
80
+
81
+ """
82
+ return tb_path(tb_name).exists()
83
+
84
+ def sql(query: str, ):
85
+ """
86
+ sql 查询,从本地paquet文件中查询数据
87
+
88
+ Parameters
89
+ ----------
90
+ query: str
91
+ sql查询语句
92
+ Returns
93
+ -------
94
+
95
+ """
96
+ import polars as pl
97
+
98
+ tbs = extract_table_names_from_sql(query)
99
+ convertor = dict()
100
+ for tb in tbs:
101
+ db_path = tb_path(tb)
102
+ format_tb = f"read_parquet('{db_path}/**/*.parquet')"
103
+ convertor[tb] = format_tb
104
+ pattern = re.compile("|".join(re.escape(k) for k in convertor.keys()))
105
+ new_query = pattern.sub(lambda m: convertor[m.group(0)], query)
106
+ return pl.sql(new_query)
107
+
108
+ def scan(tb: str,) -> pl.LazyFrame:
109
+ """polars.scan_parquet"""
110
+ tb = tb_path(tb)
111
+ return pl.scan_parquet(tb)
112
+
113
+ def read_mysql(query: str, db_conf: str = "DATABASES.mysql"):
114
+ """
115
+ 从MySQL数据库中读取数据。
116
+ Parameters
117
+ ----------
118
+ query: str
119
+ 查询语句
120
+ db_conf: str
121
+ 对应的配置 $DB_PATH/conf/settings.toml
122
+ Returns
123
+ -------
124
+ polars.DataFrame
125
+ """
126
+ import polars as pl
127
+ try:
128
+ db_setting = get_settings().get(db_conf, {})
129
+ required_keys = ['user', 'password', 'url', 'db']
130
+ missing_keys = [key for key in required_keys if key not in db_setting]
131
+ if missing_keys:
132
+ raise KeyError(f"Missing required keys in database config: {missing_keys}")
133
+
134
+ user = urllib.parse.quote_plus(db_setting['user'])
135
+ password = urllib.parse.quote_plus(db_setting['password'])
136
+ uri = f"mysql://{user}:{password}@{db_setting['url']}/{db_setting['db']}"
137
+ return pl.read_database_uri(query, uri)
138
+
139
+ except KeyError as e:
140
+ raise RuntimeError("Database configuration error: missing required fields.") from e
141
+ except Exception as e:
142
+ raise RuntimeError(f"Failed to execute MySQL query: {e}") from e
143
+
144
+ def write_mysql(df: pl.DataFrame,
145
+ remote_tb: str,
146
+ db_conf: str,
147
+ if_table_exists: Literal["append", "replace", "fail"]="append"):
148
+ """将 polars.DataFrame 写入mysql"""
149
+ try:
150
+ db_setting = get_settings().get(db_conf, {})
151
+ required_keys = ['user', 'password', 'url', 'db']
152
+ missing_keys = [key for key in required_keys if key not in db_setting]
153
+ if missing_keys:
154
+ raise KeyError(f"Missing required keys in database config: {missing_keys}")
155
+
156
+ user = urllib.parse.quote_plus(db_setting['user'])
157
+ password = urllib.parse.quote_plus(db_setting['password'])
158
+ uri = f"mysql+pymysql://{user}:{password}@{db_setting['url']}/{db_setting['db']}"
159
+ return df.write_database(remote_tb,
160
+ connection=uri,
161
+ if_table_exists=if_table_exists)
162
+
163
+ except KeyError as e:
164
+ raise RuntimeError("Database configuration error: missing required fields.") from e
165
+ except Exception as e:
166
+ raise RuntimeError(f"Failed to write MySQL: {e}") from e
167
+
168
+ def execute_mysql(sql: str, db_conf: str):
169
+ """执行mysql语句"""
170
+ try:
171
+ db_setting = get_settings().get(db_conf, {})
172
+ required_keys = ['user', 'password', 'url', 'db']
173
+ missing_keys = [key for key in required_keys if key not in db_setting]
174
+ if missing_keys:
175
+ raise KeyError(f"Missing required keys in database config: {missing_keys}")
176
+
177
+ user = urllib.parse.quote_plus(db_setting['user'])
178
+ password = urllib.parse.quote_plus(db_setting['password'])
179
+ url = urllib.parse.quote_plus(db_setting["url"])
180
+ host, port = url.split(":")
181
+
182
+ except KeyError as e:
183
+ raise RuntimeError("Database configuration error: missing required fields.") from e
184
+ except Exception as e:
185
+ raise RuntimeError(f"Failed to parse config: {e}") from e
186
+
187
+ connection = pymysql.connect(
188
+ host=host,
189
+ port=port,
190
+ user=user,
191
+ password=password,
192
+ database=db_setting['db'] # or extract from connection string
193
+ )
194
+ try:
195
+ with connection.cursor() as cursor:
196
+ cursor.execute(sql)
197
+ connection.commit()
198
+ except Exception as e:
199
+ raise RuntimeError(f"Failed to execute MySQL: {e}") from e
200
+ finally:
201
+ connection.close()
202
+
203
+
204
+ def read_ck(query: str, db_conf: str = "DATABASES.ck"):
205
+ """
206
+ 从Clickhouse集群读取数据。
207
+ Parameters
208
+ ----------
209
+ query: str
210
+ 查询语句
211
+ db_conf: str
212
+ 对应的配置 $DB_PATH/conf/settings.toml
213
+ Returns
214
+ -------
215
+ polars.DataFrame
216
+ """
217
+ import clickhouse_df
218
+ try:
219
+ db_setting = get_settings().get(db_conf, {})
220
+ required_keys = ['user', 'password', 'urls']
221
+ missing_keys = [key for key in required_keys if key not in db_setting]
222
+ if missing_keys:
223
+ raise KeyError(f"Missing required keys in database config: {missing_keys}")
224
+
225
+ user = urllib.parse.quote_plus(db_setting['user'])
226
+ password = urllib.parse.quote_plus(db_setting['password'])
227
+
228
+ with clickhouse_df.connect(db_setting['urls'], user=user, password=password):
229
+ return clickhouse_df.to_polars(query)
230
+
231
+ except KeyError as e:
232
+ raise RuntimeError("Database configuration error: missing required fields.") from e
233
+ except Exception as e:
234
+ raise RuntimeError(f"Failed to execute ClickHouse query: {e}") from e