lidb 2.0.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lidb might be problematic. Click here for more details.

lidb/decorator.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright (c) ZhangYundi.
2
+ # Licensed under the MIT License.
3
+ # Created on 2025/12/31 10:58
4
+ # Description:
5
+
6
+ from .dataset import Dataset
7
+ from typing import Callable, TypeVar, cast
8
+
9
+ F = TypeVar('F', bound=Callable)
10
+
11
def dataset(*depends: Dataset,
            tb: str = "",
            update_time: str = "",
            window: str = "1d",
            partitions: list[str] = None,
            is_hft: bool = False) -> Callable[[F], Dataset]:
    """
    Decorator factory: turn the decorated function into a ``Dataset`` object.

    Parameters
    ----------
    depends: Dataset
        Underlying datasets this dataset depends on.
    tb: str
        Table the dataset is saved to; if not specified, defaults to {DEFAULT_DS_PATH}/
    update_time: str
        Update time. Empty by default, meaning real-time update, i.e. the
        current day's value can be fetched.
    window: str
        Used together with ``depends``: when fetching the dependencies, this
        look-back window is applied. The smallest unit is `d`; anything shorter
        than one day is rounded up to `1d`.
    partitions: list[str]
        Partition columns. If None, they are inferred from the parameters of
        the decorated function; pass an empty list ``[]`` for no partitioning.
    is_hft: bool
        Whether this is high-frequency data (time step < 1min). If so, storage
        is partitioned by asset. Defaults to False.

    Returns
    -------
    A decorator that takes the function and returns the ``Dataset`` built from it.
    """
    # Options captured once by the factory and forwarded for every decorated function
    options = dict(
        tb=tb,
        update_time=update_time,
        window=window,
        partitions=partitions,
        is_hft=is_hft,
    )

    def decorator(fn: F):
        # The decorated name is bound to the Dataset instance, not to the function.
        # frame=1 is forwarded unchanged; its meaning is defined by Dataset (not visible here).
        return Dataset(*depends, fn=fn, data_name=fn.__name__, frame=1, **options)

    return decorator
lidb/init.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) ZhangYundi.
2
+ # Licensed under the MIT License.
3
+ # Created on 2025/7/17 14:40
4
+ # Description:
5
+
6
+ from pathlib import Path
7
+ from dynaconf import Dynaconf
8
+ import logair
9
+ import os
10
+
11
+
12
USERHOME = Path("~").expanduser()  # user's home directory
NAME = "lidb"
DB_PATH = USERHOME / NAME
CONFIG_PATH = USERHOME / ".config" / NAME / "settings.toml"

logger = logair.get_logger(NAME)


# First run: write a default settings file. This is best-effort; a failure is
# logged and must not make importing the package raise.
if not CONFIG_PATH.exists():
    try:
        CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
        # as_posix(): the path sits inside a double-quoted TOML string, where the
        # backslashes of a Windows path would be read as escape sequences.
        template_content = f'[GLOBAL]\npath="{DB_PATH.as_posix()}"\n\n[POLARS]\nmax_threads=32\n'
        with open(CONFIG_PATH, "w", encoding="utf-8") as f:
            f.write(template_content)
    except OSError as e:
        logger.error(f"Failed to create settings file: {e}")
    else:
        logger.info(f"Settings file created: {CONFIG_PATH}")
29
+
30
def get_settings():
    """
    Build a Dynaconf settings object from the file at ``CONFIG_PATH``.

    Returns the Dynaconf object; if constructing it raises, the error is
    logged and an empty dict is returned instead.
    """
    try:
        settings = Dynaconf(settings_files=[CONFIG_PATH])
    except Exception as e:
        logger.error(f"Read settings file failed: {e}")
        return {}
    return settings
36
+
37
# Override the defaults above with the values read from the settings file
_settiings = get_settings()
if _settiings is not None:
    setting_db_path = _settiings.get("global.path", "")
    # Configure polars: the thread count is exported through the environment
    setting_polars_threads = _settiings.get("polars.max_threads", 32)
    os.environ["POLARS_MAX_THREADS"] = str(setting_polars_threads)
    # An empty / missing path keeps the default DB_PATH
    if setting_db_path:
        DB_PATH = Path(setting_db_path)
lidb/parse.py ADDED
@@ -0,0 +1,111 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ---------------------------------------------
4
+ Copyright (c) 2025 ZhangYundi
5
+ Licensed under the MIT License.
6
+ Created on 2024/11/6 下午7:25
7
+ Email: yundi.xxii@outlook.com
8
+ ---------------------------------------------
9
+ """
10
+ import re
11
+ from pathlib import Path
12
+ from urllib.parse import unquote
13
+
14
+ import polars as pl
15
+ import sqlparse
16
+
17
+
18
def format_sql(sql_content):
    """Normalize a SQL string (re-indented) and strip its comments; takes and returns a string."""
    return sqlparse.format(sql_content, strip_comments=True, reindent=True)
22
+
23
+
24
def extract_temp_tables(with_clause):
    """
    Extract the temporary (CTE) table names from a WITH clause.

    Every ``<name> AS (`` occurrence is reported, case-insensitively, in order
    of appearance.

    Args:
        with_clause: SQL text containing the WITH clause.

    Returns:
        list[str]: the names found; empty when there are none.
    """
    # Whitespace is required before AS: with `\s*` the pattern also matched
    # inside identifiers ending in "as", e.g. `alias(` produced the name "ali".
    return re.findall(r'\b(\w+)\s+as\s*\(', with_clause, re.IGNORECASE)
28
+
29
+
30
def extract_table_names_from_sql(sql_query):
    """
    Extract the names of the tables referenced by a SQL string.

    Table names are taken from whatever follows ``FROM`` / ``JOIN``. Only the
    last dot-separated component is kept and quoting characters are removed.
    Names introduced by a WITH clause (temporary tables) are excluded for the
    statement that defines them.

    Args:
        sql_query: SQL text; may contain several statements.

    Returns:
        list[str]: the distinct table names, sorted. A list is returned in
        every case (previously a set came back when no WITH clause was present).
    """
    # Whatever follows FROM / JOIN up to whitespace, a parenthesis or a comma
    table_name_pattern = r'\bFROM\s+([^\s\(\)\,]+)|\bJOIN\s+([^\s\(\)\,]+)'

    table_names = set()

    for statement in sqlparse.parse(sql_query):
        statement_str = str(statement)

        # Blank out SUBSTRING(... FROM ...) / EXTRACT(... FROM ...): their FROM is
        # not a table reference. Case-insensitive, like the table pattern itself.
        statement_str = re.sub(
            r'(?:substring|extract)\s*\([\s\S]*?\)', '', statement_str, flags=re.IGNORECASE
        )

        statement_tables = set()
        for match in re.findall(table_name_pattern, statement_str, re.IGNORECASE):
            # One group per alternative; only the one that matched is non-empty
            for name in match:
                if not name:
                    continue
                # For a namespaced name keep only the last component
                table_name = name.split('.')[-1]
                # Drop quoting characters and a trailing semicolon
                table_name = re.sub(r'("|`|\'|;)', '', table_name)
                statement_tables.add(table_name)

        # Names defined by this statement's WITH clause are not real tables.
        # The accumulator stays a set so later statements can still add to it.
        if re.search(r'\bwith\b', statement_str, re.IGNORECASE):
            statement_tables -= set(extract_temp_tables(statement_str))

        table_names |= statement_tables

    return sorted(table_names)
70
+
71
+
72
def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*.parquet") -> pl.DataFrame:
    """
    Generic parser for a Hive-style partitioned directory layout.

    Recursively finds the files matching ``file_pattern`` under ``root_path``
    and reads the ``key=value`` directory components of each file's relative
    path (values are URL-unquoted). Zero-byte files are deleted and skipped.

    Args:
        root_path: root directory (e.g. /data)
        file_pattern: glob pattern of the data files (default "*.parquet")

    Returns:
        polars.DataFrame built from the distinct partition combinations,
        one dict per combination.
    """
    base = Path(root_path) if isinstance(root_path, str) else root_path

    seen = set()
    for found in base.rglob(file_pattern):
        if found.stat().st_size == 0:
            # Empty file: remove it from disk and ignore it
            found.unlink()
            continue

        # Directory components only; the file name itself is excluded
        directories = found.relative_to(base).parts[:-1]

        pairs = {}
        for segment in directories:
            if '=' not in segment:
                continue
            key, _, raw_value = segment.partition('=')
            pairs[key] = unquote(raw_value)

        # Tuples are hashable, so the set de-duplicates the combinations
        seen.add(tuple(pairs.items()))

    # Back to plain dicts for the DataFrame constructor
    return pl.DataFrame([dict(combination) for combination in seen])
lidb/qdf/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ---------------------------------------------
4
+ Created on 2025/3/5 21:40
5
+ @author: ZhangYundi
6
+ @email: yundi.xxii@outlook.com
7
+ ---------------------------------------------
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from .qdf import QDF
12
+ from .lazy import LQDF
13
+ from .expr import Expr
14
+ from typing import TYPE_CHECKING
15
+ from pathlib import Path
16
+ from ..dataset import scan
17
+
18
+ if TYPE_CHECKING:
19
+ import polars as pl
20
+
21
+
22
+
23
def from_polars(df: pl.DataFrame | pl.LazyFrame | Path | str, index: tuple[str, ...] = ("date", "time", "asset"), align: bool = False, ) -> QDF:
    """
    Wrap a polars frame in a ``QDF`` (expression database).

    Parameters
    ----------
    df:
        A polars DataFrame / LazyFrame, or a ``Path`` / ``str`` which is first
        passed to ``scan`` to obtain the frame.
    index:
        Index column names, forwarded to ``QDF``.
    align:
        Forwarded to ``QDF`` unchanged.

    Returns
    -------
    QDF
    """
    if isinstance(df, (Path, str)):
        df = scan(df)
    return QDF(df, index, align)
28
+
29
+ # def to_lazy(df: pl.DataFrame | pl.LazyFrame | Path | str, index: tuple[str] = ("date", "time", "asset"), align: bool = False, ) -> LQDF:
30
+ # """polars dataframe/lazy frame/table path 转为 表达式数据库"""
31
+ # if isinstance(df, (Path, str)):
32
+ # df = scan(df)
33
+ # return LQDF(df, index, align,)
34
+
lidb/qdf/errors.py ADDED
@@ -0,0 +1,65 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ---------------------------------------------
4
+ Created on 2025/5/16 10:47
5
+ @author: ZhangYundi
6
+ @email: yundi.xxii@outlook.com
7
+ ---------------------------------------------
8
+ """
9
+
10
+ from dataclasses import dataclass
11
+
12
@dataclass
class ParseError(Exception):
    """Exception carrying one ``message`` string, which is both its str() and its repr()."""

    message: str

    def __str__(self):
        return self.message

    def __repr__(self):
        return str(self)
21
+
22
@dataclass
class CalculateError(Exception):
    """Exception carrying one ``message`` string, which is both its str() and its repr()."""

    message: str

    def __str__(self):
        return self.message

    def __repr__(self):
        return str(self)
31
+
32
@dataclass
class CompileError(Exception):
    """Exception carrying one ``message`` string, which is both its str() and its repr()."""

    message: str

    def __str__(self):
        return self.message

    def __repr__(self):
        return str(self)
41
+
42
@dataclass
class PolarsError(Exception):
    """Exception carrying one ``message`` string, which is both its str() and its repr()."""

    message: str

    def __str__(self):
        return self.message

    def __repr__(self):
        return str(self)
51
+
52
@dataclass
class FailError:
    """Record pairing a failed expression with the exception it produced (not itself an exception)."""

    expr: str  # text of the expression that failed
    error: Exception  # the exception that was produced

    def __str__(self):
        # Multi-line report: expression, exception class name, exception text
        error_type = self.error.__class__.__name__
        return f"""
[失败表达式]: {self.expr}
[错误类型]: {error_type}
[错误信息]: \n{self.error}
"""

    def __repr__(self):
        return str(self)
lidb/qdf/expr.py ADDED
@@ -0,0 +1,370 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ ---------------------------------------------
4
+ Created on 2025/3/3 19:52
5
+ @author: ZhangYundi
6
+ @email: yundi.xxii@outlook.com
7
+ ---------------------------------------------
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ import warnings
14
+ from dataclasses import dataclass
15
+
16
+ from lark import Lark, Transformer, v_args
17
+
18
+ from .errors import ParseError
19
+ from collections import defaultdict, deque
20
+
21
+
22
+ # 基类
23
+ class Token:
24
+ pass
25
+
26
+
27
+ @dataclass
28
+ class OperatorToken(Token):
29
+ """算子类型token"""
30
+ value: str
31
+
32
+
33
+ @dataclass
34
+ class OperandToken(Token):
35
+ """运算对象token"""
36
+ value: str | float | int
37
+
38
+
39
# Lark grammar of the expression language; the assignment runs with warnings silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Operator precedence follows the rule nesting, loosest to tightest:
    #   ?:  <  |  <  &  <  (< > <= >=)  <  (== !=)  <  (+ -)  <  (* / // %)  <  **  <  unary (- ! ~)  <  atom
    # `**` is right-recursive (factor "**" pow_expr); the other binary rules are left-recursive.
    # Function arguments (expr_list) may mix positional expressions and NAME=expr keyword arguments.
    # NOTE(review): the first-character class of NAME contains ','; this looks
    # unintended, confirm before relying on comma handling in the lexer.
    grammar = """
start: expr
?expr: ternary_expr
?ternary_expr: or_expr
| or_expr "?" or_expr ":" ternary_expr -> ternary
?or_expr: and_expr
| or_expr "|" and_expr -> or_
?and_expr: comp_expr
| and_expr "&" comp_expr -> and_
?comp_expr: eq_expr
| comp_expr "<" eq_expr -> lt
| comp_expr ">" eq_expr -> gt
| comp_expr "<=" eq_expr -> le
| comp_expr ">=" eq_expr -> ge
?eq_expr: arith_expr
| eq_expr "==" arith_expr -> eq
| eq_expr "!=" arith_expr -> neq
?arith_expr: term
| arith_expr "+" term -> add
| arith_expr "-" term -> sub
?term: pow_expr
| term "*" pow_expr -> mul
| term "/" pow_expr -> div
| term "//" pow_expr -> floordiv // 取整
| term "%" pow_expr -> mod // 求余
?pow_expr: factor
| factor "**" pow_expr -> pow
?factor: atom
| "-" factor -> neg
| "!" factor -> not_
| "~" factor -> not_
?atom: function
| NAME
| NUMBER
| FLOAT
| "(" expr ")"
| implicit_mul // 隐式乘法
| attribute_access // 新增:属性访问
implicit_mul: (NUMBER | FLOAT) NAME -> implicit_mul // 隐式乘法
attribute_access: atom "." NAME -> attribute_access // 新增:属性访问
function: NAME "(" expr_list ")" -> function
// expr_list: expr ("," expr)*
keyword_arg: NAME "=" expr -> keyword_arg // 关键字参数
expr_list: (expr | keyword_arg) ("," (expr | keyword_arg))* // 支持关键字参数
NAME: /[a-zA-Z_$,][a-zA-Z0-9_$]*/
NUMBER: /\\d+/ // regex for numbers
FLOAT: /\\d+\\.\\d+([eE][+-]?\\d+)?/ | /\\d+[eE][+-]?\\d+/ // 支持科学计数法
%import common.WS
%ignore WS
"""
91
+
92
+
93
class ExprParser(Transformer):
    """
    Lark transformer turning grammar rules into ``Expr`` nodes.

    Each method is named after a rule alias (or terminal) of the grammar and
    receives that rule's already-transformed children.
    """

    @v_args(inline=True)
    def ternary(self, cond, body, orelse):
        # cond ? body : orelse  ->  if_(cond, body, orelse)
        return Expr.new("if_", [cond, body, orelse])

    def attribute_access(self, children):
        # Kept as a single dotted string rather than an Expr node
        return ".".join(children)

    def keyword_arg(self, children):
        key, value = children
        return {key: value}

    # --- terminals -------------------------------------------------------

    def NAME(self, token):
        return str(token)

    def NUMBER(self, token):
        return int(token)

    def FLOAT(self, token):
        return float(token)

    # --- arithmetic ------------------------------------------------------

    def add(self, children):
        return Expr.new("add", children)

    def sub(self, children):
        return Expr.new("sub", children)

    def mul(self, children):
        return Expr.new("mul", children)

    def div(self, children):
        return Expr.new("div", children)

    def floordiv(self, children):
        return Expr.new("floordiv", children)

    def mod(self, children):
        return Expr.new("mod", children)

    def pow(self, children):
        return Expr.new("pow", children)

    def neg(self, children):
        operand = children[0]
        # A negated numeric literal is folded into a plain negative number
        if isinstance(operand, (int, float)):
            return -operand
        return Expr.new("neg", children)

    # --- logic -----------------------------------------------------------

    def not_(self, children):
        return Expr.new("not_", children)

    def and_(self, children):
        return Expr.new("and_", children)

    def or_(self, children):
        return Expr.new("or_", children)

    # --- comparison ------------------------------------------------------

    def eq(self, children):
        return Expr.new("eq", children)

    def neq(self, children):
        return Expr.new("neq", children)

    def lt(self, children):
        return Expr.new("lt", children)

    def gt(self, children):
        return Expr.new("gt", children)

    def le(self, children):
        return Expr.new("le", children)

    def ge(self, children):
        return Expr.new("ge", children)

    # --- calls -----------------------------------------------------------

    def function(self, children):
        # children: [function name, transformed expr_list]
        fn_name, fn_args = children[0], children[1]
        return Expr.new(fn_name, fn_args)

    def implicit_mul(self, children):
        # "2x" style: number followed by a name is a multiplication
        return Expr.new("mul", children)

    def expr_list(self, children):
        return children
177
+
178
+
179
# Module-level LALR parser built once from the grammar, with the transformer attached
parser = Lark(grammar, parser='lalr', transformer=ExprParser())


def parse_expr(expression: str) -> Expr:
    """
    Parse an expression string and return the first child of the parse result.

    For a compound expression this is an ``Expr``; callers in this module also
    handle the case where it is not an ``Expr`` (e.g. a bare name or number).
    """
    tree = parser.parse(expression)
    return tree.children[0]
184
+
185
+
186
+ class Expr:
187
+
188
+ def __init__(self, expr: str | None = None):
189
+
190
+ self.fn_name: str | None = ""
191
+ self.args: list | None = None
192
+ self.alias: str | None = None
193
+ if expr:
194
+ try:
195
+ self._parse(expr)
196
+ except Exception as e:
197
+ raise ParseError(f"{expr}\n{e}")
198
+
199
+ @classmethod
200
+ def new(cls, fn_name: str | None, args: list | None, alias: str | None = None):
201
+ expr = cls()
202
+ expr.fn_name = fn_name
203
+ expr.args = args
204
+ expr.alias = alias if alias is not None else str(expr)
205
+ return expr
206
+
207
+ def __hash__(self):
208
+ return hash(str(self).strip())
209
+
210
+ def __eq__(self, other):
211
+ return isinstance(other, Expr) and str(self).strip() == str(other).strip()
212
+
213
+ def to_rpn(self) -> list[Token]:
214
+ """生成逆波兰表达式: (后缀表达式: 运算符在后)"""
215
+ rpn = list()
216
+
217
+ # 递归遍历子表达式
218
+ def _traverse(node: Expr):
219
+
220
+ if node.args is not None:
221
+ for child in node.args:
222
+ if isinstance(child, Expr):
223
+ _traverse(child)
224
+ else:
225
+ rpn.append(OperandToken(child))
226
+ rpn.append(OperatorToken(node.fn_name))
227
+
228
+ _traverse(self)
229
+
230
+ return rpn
231
+
232
+ def __str__(self):
233
+ unary_map = {"neg": "-", "not_": "!"}
234
+ binary_map = {"add": "+",
235
+ "mul": "*",
236
+ "div": "/",
237
+ "sub": "-",
238
+ "floordiv": "//",
239
+ "mod": "%",
240
+ "pow": "**",
241
+ "and_": "&",
242
+ "or_": "|",
243
+ "gt": ">",
244
+ "gte": ">=",
245
+ "lt": "<",
246
+ "lte": "<=",
247
+ "eq": "==",
248
+ "neq": "!=",
249
+ }
250
+ if self.fn_name is None:
251
+ return str(self.args[0])
252
+ if self.fn_name == "if_":
253
+ cond, body, orelse = self.args
254
+ return f"{cond}?{body}:{orelse}"
255
+ elif self.fn_name in ("neg", "not_"):
256
+ return f"{unary_map.get(self.fn_name)}{self.args[0]}"
257
+ elif self.fn_name in binary_map:
258
+ return f"({binary_map.get(self.fn_name).join([str(arg) for arg in self.args])})"
259
+ else:
260
+ return f"{self.fn_name}({', '.join([str(arg) for arg in self.args])})"
261
+
262
+ def __repr__(self):
263
+ return self.__str__()
264
+
265
+ def _parse(self, expr):
266
+ """
267
+ 解析表达式
268
+ """
269
+ convertor = {
270
+ 'if(': 'if_(',
271
+ 'not(': 'not_(',
272
+ 'and(': 'and_(',
273
+ 'or(': 'or_(',
274
+ '$': '',
275
+ "\n": '',
276
+ "!": "~",
277
+ ",": ", ",
278
+ }
279
+ for old, new in convertor.items():
280
+ expr = expr.replace(old, new)
281
+ new_expr = expr
282
+ match = re.search(r'(?i)(.+?)\s+AS\s+(\w+)', new_expr)
283
+ alias = None
284
+ if match:
285
+ new_expr = match.group(1).strip()
286
+ alias = match.group(2).strip()
287
+
288
+ expr_ = parse_expr(new_expr)
289
+ self.alias = alias if alias is not None else str(expr_)
290
+ if not isinstance(expr_, Expr):
291
+ self.args = [expr_]
292
+ else:
293
+ self.fn_name, self.args = expr_.fn_name, expr_.args
294
+
295
+ @property
296
+ def n_args(self) -> int:
297
+ """返回表达式的参数个数"""
298
+ return len(self.args)
299
+
300
+ @property
301
+ def depth(self) -> int:
302
+ """返回表达式的嵌套深度"""
303
+ _depth = 1
304
+ _depths = [0]
305
+ for arg in self.args:
306
+ if isinstance(arg, Expr):
307
+ _depths.append(arg.depth)
308
+ return _depth + max(_depths)
309
+
310
def build_dependency_graph(exprs: list[Expr], avail_cols: set[str]) -> tuple[dict, dict, dict]:
    """
    Build the dependency graph of a list of expressions.

    Parameters
    ----------
    exprs: list[Expr]
        Expressions to analyse; nested sub-expressions are visited recursively.
    avail_cols: set[str]
        Column names that already exist. A string argument found in this set
        is not counted as a dependency.

    Returns
    -------
    tuple[dict, dict, dict]
        - graph: {alias: [aliases of the expressions that use it as an argument]}
        - indegree: {alias: number of dependencies counted for that expression}
        - expr_map: {alias: Expr}
    """
    graph = defaultdict(list)
    indegree = defaultdict(int)
    expr_map = {}

    def _link(dependency: str, dependent: str):
        # One edge dependency -> dependent, counted on the dependent side
        graph[dependency].append(dependent)
        indegree[dependent] += 1

    def collect_deps(node: Expr):
        owner = node.alias
        expr_map[owner] = node
        for arg in node.args:
            if isinstance(arg, Expr):
                _link(arg.alias, owner)
                collect_deps(arg)
                continue
            # Only strings other than "null" (any case) can be dependencies
            if not isinstance(arg, str) or arg.lower() == "null":
                continue
            if arg not in avail_cols:
                _link(arg, owner)

    for expr in exprs:
        collect_deps(expr)

    # Expressions without any dependency still get an explicit entry of 0
    for alias in expr_map:
        indegree.setdefault(alias, 0)

    return graph, indegree, expr_map
352
+
353
def topological_sort(graph: dict, indegree: dict, expr_map: dict) -> list[list]:
    """
    Return the expressions grouped into execution levels.

    Expressions inside one level do not depend on each other; every level only
    depends on the levels before it.

    Parameters
    ----------
    graph: dict
        {alias: [aliases that depend on it]}. A plain dict works; aliases
        without dependents may be absent.
    indegree: dict
        {alias: number of unresolved dependencies}. Decremented in place while
        sorting.
    expr_map: dict
        {alias: expression object}.

    Returns
    -------
    list[list]
        One list per level holding the ``expr_map`` values (not the aliases).
        Expressions whose dependencies never resolve are left out.
    """
    queue = deque(alias for alias in expr_map if indegree.get(alias, 0) == 0)
    levels = []
    while queue:
        current_level = []
        # Everything queued right now forms one level
        for _ in range(len(queue)):
            node = queue.popleft()
            current_level.append(expr_map[node])
            # .get(): a node without dependents need not have a graph entry
            for neighbor in graph.get(node, ()):
                indegree[neighbor] -= 1
                if indegree[neighbor] == 0:
                    queue.append(neighbor)
        levels.append(current_level)
    return levels