lidb 2.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lidb might be problematic. Click here for more details.
- lidb/__init__.py +31 -0
- lidb/database.py +234 -0
- lidb/dataset.py +696 -0
- lidb/decorator.py +50 -0
- lidb/init.py +45 -0
- lidb/parse.py +111 -0
- lidb/qdf/__init__.py +34 -0
- lidb/qdf/errors.py +65 -0
- lidb/qdf/expr.py +370 -0
- lidb/qdf/lazy.py +174 -0
- lidb/qdf/lazy2.py +161 -0
- lidb/qdf/qdf.py +163 -0
- lidb/qdf/udf/__init__.py +14 -0
- lidb/qdf/udf/base_udf.py +146 -0
- lidb/qdf/udf/cs_udf.py +115 -0
- lidb/qdf/udf/d_udf.py +183 -0
- lidb/qdf/udf/itd_udf.py +209 -0
- lidb/qdf/udf/ts_udf.py +182 -0
- lidb/svc/__init__.py +6 -0
- lidb/svc/data.py +138 -0
- lidb/table.py +138 -0
- lidb-2.0.20.dist-info/METADATA +282 -0
- lidb-2.0.20.dist-info/RECORD +25 -0
- lidb-2.0.20.dist-info/WHEEL +5 -0
- lidb-2.0.20.dist-info/top_level.txt +1 -0
lidb/decorator.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Copyright (c) ZhangYundi.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
# Created on 2025/12/31 10:58
|
|
4
|
+
# Description:
|
|
5
|
+
|
|
6
|
+
from .dataset import Dataset
|
|
7
|
+
from typing import Callable, TypeVar, cast
|
|
8
|
+
|
|
9
|
+
F = TypeVar('F', bound=Callable)
|
|
10
|
+
|
|
11
|
+
def dataset(*depends: Dataset,
            tb: str = "",
            update_time: str = "",
            window: str = "1d",
            partitions: list[str] | None = None,
            is_hft: bool = False) -> Callable[[F], Dataset]:
    """
    Decorator factory: turn the decorated function into a ``Dataset`` object.

    Parameters
    ----------
    depends: Dataset
        Underlying datasets this dataset depends on.
    tb: str
        Table the dataset is saved to. If not given, defaults to
        {DEFAULT_DS_PATH}/
    update_time: str
        Update time. Empty by default, meaning real-time update, i.e. the
        current day's value can be fetched.
    window: str
        Used together with ``depends``: when fetching the dependencies, look
        back this many periods. The smallest unit is `d`; anything shorter
        than `d` is rounded up to `1d`.
    partitions: list[str] | None
        Partition columns. ``None`` means "infer from the parameters of the
        decorated function"; pass an empty list ``[]`` to disable
        partitioning.
    is_hft: bool
        Whether this is high-frequency data (time step < 1min). If so, data
        is stored partitioned by asset. Default False.

    Returns
    -------
    Callable[[F], Dataset]
        A decorator that replaces the function with the ``Dataset`` built
        from it.
    """

    def decorator(fn: F) -> Dataset:
        # The function name doubles as the dataset name. ``frame=1`` is
        # forwarded unchanged; its meaning is defined by ``Dataset``
        # (not visible here) -- TODO confirm.
        return Dataset(
            *depends,
            fn=fn,
            tb=tb,
            update_time=update_time,
            window=window,
            partitions=partitions,
            is_hft=is_hft,
            data_name=fn.__name__,
            frame=1,
        )

    return decorator
|
lidb/init.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Copyright (c) ZhangYundi.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
# Created on 2025/7/17 14:40
|
|
4
|
+
# Description:
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from dynaconf import Dynaconf
|
|
8
|
+
import logair
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
USERHOME = Path("~").expanduser()  # the user's home directory
NAME = "lidb"
DB_PATH = USERHOME / NAME
CONFIG_PATH = USERHOME / ".config" / NAME / "settings.toml"

logger = logair.get_logger(NAME)


if not CONFIG_PATH.exists():
    # Best-effort bootstrap of a default settings file. Every filesystem step
    # sits inside the same ``try`` so that a read-only or missing home
    # directory is logged instead of aborting the import of the package.
    try:
        CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
        # Forward slashes keep the value a valid TOML basic string on
        # Windows, where backslashes would be read as escape sequences.
        template_content = f'[GLOBAL]\npath="{DB_PATH.as_posix()}"\n\n[POLARS]\nmax_threads=32\n'
        with open(CONFIG_PATH, "w", encoding="utf-8") as f:
            f.write(template_content)
    except OSError as e:
        logger.error(f"Failed to create settings file: {e}")
    else:
        logger.info(f"Settings file created: {CONFIG_PATH}")
|
|
29
|
+
|
|
30
|
+
def get_settings():
    """Load the settings file at ``CONFIG_PATH`` with Dynaconf.

    Returns the Dynaconf object on success. If constructing it raises, the
    error is logged and an empty dict is returned instead.
    """
    try:
        loaded = Dynaconf(settings_files=[CONFIG_PATH])
    except Exception as err:
        logger.error(f"Read settings file failed: {err}")
        return {}
    return loaded
|
|
36
|
+
|
|
37
|
+
# Override the built-in defaults with values read from the settings file.
_settiings = get_settings()
if _settiings is not None:
    # Configure polars: export the thread limit through the environment.
    setting_polars_threads = _settiings.get("polars.max_threads", 32)
    os.environ["POLARS_MAX_THREADS"] = str(setting_polars_threads)
    # A non-empty ``global.path`` replaces the default database location.
    setting_db_path = _settiings.get("global.path", "")
    if setting_db_path:
        DB_PATH = Path(setting_db_path)
|
lidb/parse.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
---------------------------------------------
|
|
4
|
+
Copyright (c) 2025 ZhangYundi
|
|
5
|
+
Licensed under the MIT License.
|
|
6
|
+
Created on 2024/11/6 下午7:25
|
|
7
|
+
Email: yundi.xxii@outlook.com
|
|
8
|
+
---------------------------------------------
|
|
9
|
+
"""
|
|
10
|
+
import re
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from urllib.parse import unquote
|
|
13
|
+
|
|
14
|
+
import polars as pl
|
|
15
|
+
import sqlparse
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def format_sql(sql_content):
    """Normalize a SQL statement and strip its comments (str in, str out)."""
    return sqlparse.format(sql_content, strip_comments=True, reindent=True)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def extract_temp_tables(with_clause):
    """Extract the temporary (CTE) table names from a WITH clause.

    A name is recognised as ``<name> AS (`` (case-insensitive). At least one
    whitespace character is required between the name and ``AS``; without
    that requirement an identifier that merely ends in "as", such as
    ``alias(``, would be split into a bogus name (``ali``).

    Returns a list of names in order of appearance (empty if none).
    """
    return re.findall(r'\b(\w+)\s+as\s*\(', with_clause, re.IGNORECASE)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def extract_table_names_from_sql(sql_query):
    """Extract the names of the tables referenced by a SQL string.

    Every statement is scanned for ``FROM <name>`` / ``JOIN <name>``
    (case-insensitive). Namespaces are dropped (``db.schema.tbl`` -> ``tbl``)
    and quote characters / semicolons are removed. Names introduced by a WITH
    clause (CTEs) in any statement are excluded from the result.

    Returns
    -------
    list[str]
        Sorted, de-duplicated table names. Always a list (previously a set
        was returned whenever no CTE name was found).
    """
    # Pattern used to match table names after FROM / JOIN.
    table_name_pattern = re.compile(
        r'\bFROM\s+([^\s\(\)\,]+)|\bJOIN\s+([^\s\(\)\,]+)', re.IGNORECASE)
    # substring(... FROM ...) / extract(... FROM ...) contain a FROM keyword
    # that is not a table reference, so those calls are blanked out first.
    # Matched case-insensitively so EXTRACT(...) is scrubbed as well.
    special_call_pattern = re.compile(
        r'(substring|extract)\s*\([^)]*\)', re.IGNORECASE)

    table_names = set()
    # Temporary table names declared by WITH clauses, across all statements.
    cte_names = set()

    for statement in sqlparse.parse(sql_query):
        statement_str = special_call_pattern.sub('', str(statement))

        for match in table_name_pattern.findall(statement_str):
            # Each match is a (from_name, join_name) pair; one side is empty.
            for name in match:
                if not name:
                    continue
                # Keep only the last dotted component as the table name.
                table_name = name.split('.')[-1]
                # Remove quoting characters and a trailing semicolon.
                table_name = re.sub(r'("|`|\'|;)', '', table_name)
                if table_name:
                    table_names.add(table_name)

        # The keyword check is case-insensitive: "WITH" must count too.
        if re.search(r'\bwith\b', statement_str, re.IGNORECASE):
            cte_names.update(extract_temp_tables(statement_str))

    return sorted(table_names - cte_names)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*.parquet") -> pl.DataFrame:
    """
    Generic parser for a Hive-style partition directory layout.

    Walks ``root_path`` recursively for files matching ``file_pattern`` and
    collects the distinct ``key=value`` combinations found in their parent
    directories. Values are URL-unquoted.

    Note: zero-byte matching files are deleted as a side effect and skipped.

    Args:
        root_path: root directory (e.g. /data)
        file_pattern: glob pattern for data files (default "*.parquet")

    Returns:
        polars.DataFrame with one row per distinct partition combination
    """
    root = Path(root_path) if isinstance(root_path, str) else root_path

    seen_combinations = set()
    for candidate in root.rglob(file_pattern):
        if candidate.stat().st_size == 0:
            # Empty file: remove it and move on.
            candidate.unlink()
            continue

        # Directory components only; the file name itself is excluded.
        directory_parts = candidate.relative_to(root).parts[:-1]

        pairs = {}
        for segment in directory_parts:
            if '=' not in segment:
                continue
            key, raw_value = segment.split('=', 1)
            pairs[key] = unquote(raw_value)

        # Tuples are hashable, so the set de-duplicates combinations.
        seen_combinations.add(tuple(pairs.items()))

    # Back to plain dicts for the DataFrame constructor.
    return pl.DataFrame([dict(combo) for combo in seen_combinations])
|
lidb/qdf/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
---------------------------------------------
|
|
4
|
+
Created on 2025/3/5 21:40
|
|
5
|
+
@author: ZhangYundi
|
|
6
|
+
@email: yundi.xxii@outlook.com
|
|
7
|
+
---------------------------------------------
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .qdf import QDF
|
|
12
|
+
from .lazy import LQDF
|
|
13
|
+
from .expr import Expr
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from ..dataset import scan
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
import polars as pl
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def from_polars(df: pl.DataFrame | pl.LazyFrame | Path | str, index: tuple[str, ...] = ("date", "time", "asset"), align: bool = False, ) -> QDF:
    """Convert a polars dataframe into an expression database (``QDF``).

    A ``Path`` or ``str`` is first passed through ``scan``; frames are
    forwarded to ``QDF`` unchanged, together with ``index`` and ``align``.
    """
    if isinstance(df, (Path, str)):
        df = scan(df)
    return QDF(df, index, align)
|
|
28
|
+
|
|
29
|
+
# def to_lazy(df: pl.DataFrame | pl.LazyFrame | Path | str, index: tuple[str] = ("date", "time", "asset"), align: bool = False, ) -> LQDF:
|
|
30
|
+
# """polars dataframe/lazy frame/table path 转为 表达式数据库"""
|
|
31
|
+
# if isinstance(df, (Path, str)):
|
|
32
|
+
# df = scan(df)
|
|
33
|
+
# return LQDF(df, index, align,)
|
|
34
|
+
|
lidb/qdf/errors.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
---------------------------------------------
|
|
4
|
+
Created on 2025/5/16 10:47
|
|
5
|
+
@author: ZhangYundi
|
|
6
|
+
@email: yundi.xxii@outlook.com
|
|
7
|
+
---------------------------------------------
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
@dataclass
class ParseError(Exception):
    """Exception carrying a single message; raised when an expression fails to parse."""

    message: str  # text returned by str() / repr()

    def __post_init__(self):
        # The dataclass-generated __init__ never calls Exception.__init__, so
        # ParseError(message=...) would leave ``args`` empty and break
        # copy/pickle reconstruction. Populate it explicitly.
        super().__init__(self.message)

    def __str__(self):
        return self.message

    def __repr__(self):
        return self.__str__()
|
|
21
|
+
|
|
22
|
+
@dataclass
class CalculateError(Exception):
    """Exception carrying a single message (presumably for calculation failures -- usage not visible here)."""

    message: str  # text returned by str() / repr()

    def __post_init__(self):
        # The dataclass-generated __init__ never calls Exception.__init__, so
        # CalculateError(message=...) would leave ``args`` empty and break
        # copy/pickle reconstruction. Populate it explicitly.
        super().__init__(self.message)

    def __str__(self):
        return self.message

    def __repr__(self):
        return self.__str__()
|
|
31
|
+
|
|
32
|
+
@dataclass
class CompileError(Exception):
    """Exception carrying a single message (presumably for compilation failures -- usage not visible here)."""

    message: str  # text returned by str() / repr()

    def __post_init__(self):
        # The dataclass-generated __init__ never calls Exception.__init__, so
        # CompileError(message=...) would leave ``args`` empty and break
        # copy/pickle reconstruction. Populate it explicitly.
        super().__init__(self.message)

    def __str__(self):
        return self.message

    def __repr__(self):
        return self.__str__()
|
|
41
|
+
|
|
42
|
+
@dataclass
class PolarsError(Exception):
    """Exception carrying a single message (presumably wrapping polars failures -- usage not visible here)."""

    message: str  # text returned by str() / repr()

    def __post_init__(self):
        # The dataclass-generated __init__ never calls Exception.__init__, so
        # PolarsError(message=...) would leave ``args`` empty and break
        # copy/pickle reconstruction. Populate it explicitly.
        super().__init__(self.message)

    def __str__(self):
        return self.message

    def __repr__(self):
        return self.__str__()
|
|
51
|
+
|
|
52
|
+
@dataclass
class FailError:
    """Record pairing a failed expression with the exception it produced.

    This is a plain dataclass, not an ``Exception`` subclass.
    """
    expr: str  # text of the expression that failed
    error: Exception  # the exception that was captured for it

    def __str__(self):
        # Multi-line report with three labelled fields: the failed
        # expression, the exception class name, and the exception text.
        # NOTE(review): the leading whitespace of the lines inside this
        # f-string could not be recovered from the diff rendering; verify
        # against the packaged file before relying on the exact layout.
        return f"""
[失败表达式]: {self.expr}
[错误类型]: {self.error.__class__.__name__}
[错误信息]: \n{self.error}
"""

    def __repr__(self):
        # repr mirrors str so the report also shows up inside containers.
        return self.__str__()
|
lidb/qdf/expr.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
---------------------------------------------
|
|
4
|
+
Created on 2025/3/3 19:52
|
|
5
|
+
@author: ZhangYundi
|
|
6
|
+
@email: yundi.xxii@outlook.com
|
|
7
|
+
---------------------------------------------
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
import warnings
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
|
|
16
|
+
from lark import Lark, Transformer, v_args
|
|
17
|
+
|
|
18
|
+
from .errors import ParseError
|
|
19
|
+
from collections import defaultdict, deque
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Base class
class Token:
    """Common base type of the tokens emitted by ``Expr.to_rpn``."""


@dataclass
class OperatorToken(Token):
    """Token holding an operator (function) name."""

    value: str


@dataclass
class OperandToken(Token):
    """Token holding an operand: a name or a numeric literal."""

    value: str | float | int
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Lark grammar of the expression language. Operator precedence, loosest to
# tightest: ternary (?:), |, &, comparisons (< > <= >=), equality (== !=),
# + -, * / // %, ** (right-recursive), then unary - ! ~.
# Both "!" and "~" map to the same ``not_`` rule.
# NOTE(review): only this assignment runs with warnings suppressed; the
# Lark(...) construction appears to happen later, outside this context.
# NOTE(review): the first character class of NAME contains "," -- this looks
# unintended; confirm before changing, the lexer behaviour depends on it.
# NOTE(review): spacing inside the grammar text is reproduced as rendered in
# the diff; Lark ignores leading whitespace on rule lines.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grammar = """
start: expr
?expr: ternary_expr
?ternary_expr: or_expr
| or_expr "?" or_expr ":" ternary_expr -> ternary
?or_expr: and_expr
| or_expr "|" and_expr -> or_
?and_expr: comp_expr
| and_expr "&" comp_expr -> and_
?comp_expr: eq_expr
| comp_expr "<" eq_expr -> lt
| comp_expr ">" eq_expr -> gt
| comp_expr "<=" eq_expr -> le
| comp_expr ">=" eq_expr -> ge
?eq_expr: arith_expr
| eq_expr "==" arith_expr -> eq
| eq_expr "!=" arith_expr -> neq
?arith_expr: term
| arith_expr "+" term -> add
| arith_expr "-" term -> sub
?term: pow_expr
| term "*" pow_expr -> mul
| term "/" pow_expr -> div
| term "//" pow_expr -> floordiv // 取整
| term "%" pow_expr -> mod // 求余
?pow_expr: factor
| factor "**" pow_expr -> pow
?factor: atom
| "-" factor -> neg
| "!" factor -> not_
| "~" factor -> not_
?atom: function
| NAME
| NUMBER
| FLOAT
| "(" expr ")"
| implicit_mul // 隐式乘法
| attribute_access // 新增:属性访问
implicit_mul: (NUMBER | FLOAT) NAME -> implicit_mul // 隐式乘法
attribute_access: atom "." NAME -> attribute_access // 新增:属性访问
function: NAME "(" expr_list ")" -> function
// expr_list: expr ("," expr)*
keyword_arg: NAME "=" expr -> keyword_arg // 关键字参数
expr_list: (expr | keyword_arg) ("," (expr | keyword_arg))* // 支持关键字参数
NAME: /[a-zA-Z_$,][a-zA-Z0-9_$]*/
NUMBER: /\\d+/ // regex for numbers
FLOAT: /\\d+\\.\\d+([eE][+-]?\\d+)?/ | /\\d+[eE][+-]?\\d+/ // 支持科学计数法
%import common.WS
%ignore WS
"""
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ExprParser(Transformer):
    """Lark transformer that folds the parse tree into ``Expr`` nodes.

    Terminal callbacks (NAME / NUMBER / FLOAT) convert tokens to ``str`` /
    ``int`` / ``float``. Rule callbacks receive the list of already
    transformed children and wrap it in ``Expr.new(<rule name>, children)``.
    """

    @v_args(inline=True)
    def ternary(self, cond, then_value, else_value):
        # ``cond ? then : else`` becomes the function ``if_``.
        return Expr.new("if_", [cond, then_value, else_value])

    def attribute_access(self, children):
        # Re-join ``a.b`` into one dotted name.
        return ".".join(children)

    def keyword_arg(self, children):
        # ``name=value`` is represented as a one-entry dict.
        key, value = children
        return {key: value}

    # ----- terminals -----

    def NAME(self, token):
        return str(token)

    def NUMBER(self, token):  # integer literals
        return int(token)

    def FLOAT(self, token):
        return float(token)

    # ----- arithmetic -----

    def add(self, children):
        return Expr.new("add", children)

    def sub(self, children):
        return Expr.new("sub", children)

    def mul(self, children):
        return Expr.new("mul", children)

    def div(self, children):
        return Expr.new("div", children)

    def floordiv(self, children):
        return Expr.new("floordiv", children)

    def mod(self, children):
        return Expr.new("mod", children)

    def pow(self, children):
        return Expr.new("pow", children)

    def neg(self, children):
        # A negated numeric literal is folded into a plain negative number.
        operand = children[0]
        if not isinstance(operand, (int, float)):
            return Expr.new("neg", children)
        return -operand

    # ----- logic -----

    def not_(self, children):
        return Expr.new("not_", children)

    def and_(self, children):
        return Expr.new("and_", children)

    def or_(self, children):
        return Expr.new("or_", children)

    # ----- comparison -----

    def eq(self, children):
        return Expr.new("eq", children)

    def neq(self, children):
        return Expr.new("neq", children)

    def lt(self, children):
        return Expr.new("lt", children)

    def gt(self, children):
        return Expr.new("gt", children)

    def le(self, children):
        return Expr.new("le", children)

    def ge(self, children):
        return Expr.new("ge", children)

    # ----- calls -----

    def function(self, children):
        # children == [function name, transformed expr_list]
        fn_name = children.pop(0)
        return Expr.new(fn_name, children[0])

    def implicit_mul(self, children):
        # ``2x`` is treated as ``2 * x``.
        return Expr.new("mul", children)

    def expr_list(self, children):
        return children
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# Module-level LALR parser; the transformer is applied while parsing.
parser = Lark(grammar, parser='lalr', transformer=ExprParser())


def parse_expr(expression: str) -> Expr:
    """Parse ``expression`` and return the single child of the ``start`` node."""
    tree = parser.parse(expression)
    return tree.children[0]
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class Expr:
    """A parsed expression: a function name, its arguments and an alias.

    ``args`` may contain nested ``Expr`` objects, names (``str``), numbers
    and keyword-argument dicts. Equality and hashing are based on the
    stripped string form of the expression.
    """

    # Operators rendered in prefix form by ``__str__``.
    _UNARY_OPS = {"neg": "-", "not_": "!"}
    # Operators rendered in infix form by ``__str__``.
    # NOTE(review): the parser emits "ge"/"le" while this table lists
    # "gte"/"lte", so ``>=`` / ``<=`` render in function form (``ge(a, b)``).
    # Left unchanged because aliases and hashes derive from this string.
    _BINARY_OPS = {"add": "+",
                   "mul": "*",
                   "div": "/",
                   "sub": "-",
                   "floordiv": "//",
                   "mod": "%",
                   "pow": "**",
                   "and_": "&",
                   "or_": "|",
                   "gt": ">",
                   "gte": ">=",
                   "lt": "<",
                   "lte": "<=",
                   "eq": "==",
                   "neq": "!=",
                   }

    def __init__(self, expr: str | None = None):
        """Create an expression, parsing ``expr`` when it is non-empty.

        Raises:
            ParseError: if parsing fails; the original error is chained.
        """
        self.fn_name: str | None = ""
        self.args: list | None = None
        self.alias: str | None = None
        if expr:
            try:
                self._parse(expr)
            except Exception as e:
                raise ParseError(f"{expr}\n{e}") from e

    @classmethod
    def new(cls, fn_name: str | None, args: list | None, alias: str | None = None):
        """Build an expression directly from its parts, without parsing.

        When ``alias`` is None the string form of the expression is used.
        """
        expr = cls()
        expr.fn_name = fn_name
        expr.args = args
        expr.alias = alias if alias is not None else str(expr)
        return expr

    def __hash__(self):
        return hash(str(self).strip())

    def __eq__(self, other):
        return isinstance(other, Expr) and str(self).strip() == str(other).strip()

    def to_rpn(self) -> list[Token]:
        """Return the expression in reverse Polish (postfix) notation."""
        rpn = list()

        # Post-order walk: operands first, then the operator of the node.
        def _traverse(node: Expr):
            if node.args is not None:
                for child in node.args:
                    if isinstance(child, Expr):
                        _traverse(child)
                    else:
                        rpn.append(OperandToken(child))
            rpn.append(OperatorToken(node.fn_name))

        _traverse(self)

        return rpn

    def __str__(self):
        # An unparsed ``Expr()`` has ``args is None``; treat it as empty so
        # str()/repr() do not raise TypeError.
        args = self.args if self.args is not None else []
        if self.fn_name is None:
            return str(args[0])
        if self.fn_name == "if_":
            cond, body, orelse = args
            return f"{cond}?{body}:{orelse}"
        if self.fn_name in self._UNARY_OPS:
            return f"{self._UNARY_OPS[self.fn_name]}{args[0]}"
        if self.fn_name in self._BINARY_OPS:
            return f"({self._BINARY_OPS[self.fn_name].join(str(arg) for arg in args)})"
        return f"{self.fn_name}({', '.join(str(arg) for arg in args)})"

    def __repr__(self):
        return self.__str__()

    @staticmethod
    def _normalize(expr: str) -> str:
        """Rewrite raw expression text into the form the grammar expects."""
        # if( / not( / and( / or(  ->  if_( / not_( / and_( / or_(
        # Only when the keyword is the whole function name: a plain substring
        # replace would also turn ``floor(`` into ``floor_(``.
        expr = re.sub(r'(?<![A-Za-z0-9_])(if|not|and|or)\(', r'\1_(', expr)
        expr = expr.replace('$', '').replace("\n", '')
        # "!" means logical not, but "!=" must survive as the inequality
        # operator; replacing every "!" produced the unparsable "~=".
        expr = re.sub(r'!(?!=)', '~', expr)
        return expr.replace(",", ", ")

    def _parse(self, expr):
        """
        Parse the expression text and fill ``fn_name``, ``args`` and ``alias``.

        An optional ``<expr> AS <alias>`` suffix (case-insensitive) sets the
        alias; otherwise the string form of the parsed expression is used.
        """
        new_expr = self._normalize(expr)
        match = re.search(r'(?i)(.+?)\s+AS\s+(\w+)', new_expr)
        alias = None
        if match:
            new_expr = match.group(1).strip()
            alias = match.group(2).strip()

        expr_ = parse_expr(new_expr)
        self.alias = alias if alias is not None else str(expr_)
        if not isinstance(expr_, Expr):
            # A bare name or literal becomes the single argument.
            # NOTE(review): fn_name stays "" (not None) here, so such an
            # expression renders as "(name)"; confirm whether None was meant.
            self.args = [expr_]
        else:
            self.fn_name, self.args = expr_.fn_name, expr_.args

    @property
    def n_args(self) -> int:
        """Number of arguments of the expression (0 when there are none)."""
        return len(self.args) if self.args is not None else 0

    @property
    def depth(self) -> int:
        """Nesting depth of the expression (1 when no argument is an Expr)."""
        _depths = [0]
        for arg in self.args or ():
            if isinstance(arg, Expr):
                _depths.append(arg.depth)
        return 1 + max(_depths)
|
|
309
|
+
|
|
310
|
+
def build_dependency_graph(exprs: list[Expr], avail_cols: set[str]) -> tuple[dict, dict, dict]:
|
|
311
|
+
"""
|
|
312
|
+
构建表达式的依赖图
|
|
313
|
+
Parameters
|
|
314
|
+
----------
|
|
315
|
+
exprs: list[Expr]
|
|
316
|
+
avail_cols: set[str]
|
|
317
|
+
|
|
318
|
+
Returns
|
|
319
|
+
-------
|
|
320
|
+
tuple[dict, dict]
|
|
321
|
+
- graph: {expr_alias: [依赖的 expr_alias]}
|
|
322
|
+
- indegree: {expr_alias: indegree}
|
|
323
|
+
- expr_map: {expr_alias: Expr}
|
|
324
|
+
|
|
325
|
+
"""
|
|
326
|
+
|
|
327
|
+
graph = defaultdict(list)
|
|
328
|
+
indegree = defaultdict(int)
|
|
329
|
+
expr_map = {}
|
|
330
|
+
|
|
331
|
+
def collect_deps(e: Expr):
|
|
332
|
+
alias = e.alias
|
|
333
|
+
expr_map[alias] = e
|
|
334
|
+
for arg in e.args:
|
|
335
|
+
if isinstance(arg, Expr):
|
|
336
|
+
graph[arg.alias].append(alias)
|
|
337
|
+
indegree[alias] += 1
|
|
338
|
+
collect_deps(arg)
|
|
339
|
+
elif isinstance(arg, str) and arg.lower() != "null":
|
|
340
|
+
if arg not in avail_cols:
|
|
341
|
+
graph[arg].append(alias)
|
|
342
|
+
indegree[alias] += 1
|
|
343
|
+
for expr in exprs:
|
|
344
|
+
# parsed = parse_expr(expr) if isinstance(expr, str) else expr
|
|
345
|
+
collect_deps(expr)
|
|
346
|
+
|
|
347
|
+
for alias in expr_map:
|
|
348
|
+
if alias not in indegree:
|
|
349
|
+
indegree[alias] = 0
|
|
350
|
+
|
|
351
|
+
return graph, indegree, expr_map
|
|
352
|
+
|
|
353
|
+
def topological_sort(graph: dict, indegree: dict, expr_map: dict) -> list[list[Expr]]:
    """
    Return the execution order of the expressions grouped into levels;
    expressions inside one level do not depend on each other.

    Parameters
    ----------
    graph: dict
        {dependency: [aliases depending on it]}
    indegree: dict
        {alias: number of unresolved dependencies}. Decremented in place
        while sorting.
    expr_map: dict
        {alias: Expr}

    Returns
    -------
    list[list[Expr]]
        One list of ``expr_map`` values per level. Aliases whose indegree
        never reaches 0 (e.g. a dependency that is not in ``expr_map``) are
        not included.
    """
    # ``.get`` lets plain dicts work as well as the defaultdicts produced by
    # ``build_dependency_graph``.
    queue = deque(k for k in expr_map if indegree.get(k, 0) == 0)
    levels = []
    while queue:
        current_level = list()
        # Everything queued at this point belongs to the same level.
        for _ in range(len(queue)):
            node = queue.popleft()
            current_level.append(expr_map[node])
            for neighbor in graph.get(node, ()):
                indegree[neighbor] -= 1
                if indegree[neighbor] == 0:
                    queue.append(neighbor)
        levels.append(current_level)
    return levels
|