additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/expressions/parser.py
CHANGED
|
@@ -1,176 +1,372 @@
|
|
|
1
|
-
|
|
1
|
+
"""
|
|
2
|
+
Expression parser for Additory.
|
|
2
3
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
import yaml
|
|
7
|
-
|
|
8
|
-
from .logging import log_info, log_warning
|
|
9
|
-
from .ast_builder import build_ast_from_expression # <-- NEW: your AST builder
|
|
4
|
+
Parses expression strings into Abstract Syntax Trees (AST) for compilation.
|
|
5
|
+
"""
|
|
10
6
|
|
|
7
|
+
import re
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import List, Optional, Union
|
|
11
10
|
|
|
12
|
-
# ------------------------------------------------------------
|
|
13
|
-
# Parsed Expression Structure
|
|
14
|
-
# ------------------------------------------------------------
|
|
15
11
|
|
|
16
12
|
@dataclass
|
|
17
|
-
class
|
|
18
|
-
name: str
|
|
19
|
-
metadata: Dict[str, Any]
|
|
20
|
-
expression: str
|
|
21
|
-
raw_text: str
|
|
22
|
-
ast: Optional[Dict[str, Any]] = None # <-- NEW
|
|
23
|
-
sample_clean: Optional[Dict[str, List[Any]]] = None
|
|
24
|
-
sample_unclean: Optional[Dict[str, List[Any]]] = None
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
# ------------------------------------------------------------
|
|
28
|
-
# Public API
|
|
29
|
-
# ------------------------------------------------------------
|
|
30
|
-
|
|
31
|
-
def parse_expression(text: str) -> ParsedExpression:
|
|
13
|
+
class Token:
    """
    A single lexical unit produced while scanning an expression string.

    Attributes:
        type: Category of the token. The tokenizer in this module emits
            'NUMBER', 'STRING', 'IDENTIFIER', 'FUNCTION', 'OPERATOR',
            'LPAREN', 'RPAREN' and 'COMMA'.
        value: Payload of the token (the parsed number, the unescaped
            string, the name, or the operator/punctuation text).
        position: Index in the original expression string at which the
            token starts.
    """
    # NOTE: the @dataclass decorator for this class is applied on the line
    # that precedes the class statement in the file.
    type: str
    value: Union[str, int, float]
    position: int

    def __repr__(self) -> str:
        # The value is rendered with repr() so that string payloads keep
        # their quotes and are distinguishable from numeric payloads.
        return "Token({}, {!r}, pos={})".format(
            self.type, self.value, self.position
        )
|
|
38
28
|
|
|
39
|
-
if not text.strip():
|
|
40
|
-
log_warning("[parser] Empty expression file")
|
|
41
|
-
return ParsedExpression(
|
|
42
|
-
name="unknown",
|
|
43
|
-
metadata={},
|
|
44
|
-
expression="",
|
|
45
|
-
raw_text=text,
|
|
46
|
-
ast=None,
|
|
47
|
-
)
|
|
48
|
-
|
|
49
|
-
# Detect YAML-style format
|
|
50
|
-
if _looks_like_yaml(text):
|
|
51
|
-
parsed = _parse_yaml_style(text)
|
|
52
|
-
else:
|
|
53
|
-
parsed = _parse_legacy_style(text)
|
|
54
|
-
|
|
55
|
-
# --------------------------------------------------------
|
|
56
|
-
# NEW: Build AST from parsed.expression
|
|
57
|
-
# --------------------------------------------------------
|
|
58
|
-
try:
|
|
59
|
-
parsed.ast = build_ast_from_expression(parsed.expression)
|
|
60
|
-
except Exception as e:
|
|
61
|
-
log_warning(f"[parser] Failed to build AST: {e}")
|
|
62
|
-
parsed.ast = None
|
|
63
|
-
|
|
64
|
-
return parsed
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
# ------------------------------------------------------------
|
|
68
|
-
# YAML-STYLE PARSER
|
|
69
|
-
# ------------------------------------------------------------
|
|
70
|
-
|
|
71
|
-
def _looks_like_yaml(text: str) -> bool:
|
|
72
|
-
lowered = text.lower()
|
|
73
|
-
return ("formula:" in lowered) or ("sample:" in lowered)
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
def _parse_yaml_style(text: str) -> ParsedExpression:
|
|
77
|
-
try:
|
|
78
|
-
parsed = yaml.safe_load(text)
|
|
79
|
-
except Exception as e:
|
|
80
|
-
log_warning(f"[parser] YAML parse failed, falling back to legacy: {e}")
|
|
81
|
-
return _parse_legacy_style(text)
|
|
82
|
-
|
|
83
|
-
formula = parsed.get("formula", {})
|
|
84
|
-
sample = parsed.get("sample", {})
|
|
85
|
-
expression_block = formula.get("expression")
|
|
86
|
-
|
|
87
|
-
if not expression_block:
|
|
88
|
-
log_warning("[parser] YAML file missing 'formula.expression' block")
|
|
89
|
-
expression_block = ""
|
|
90
|
-
|
|
91
|
-
name = formula.get("name", "unknown")
|
|
92
|
-
|
|
93
|
-
return ParsedExpression(
|
|
94
|
-
name=name,
|
|
95
|
-
metadata=formula,
|
|
96
|
-
expression=_normalize_expression(expression_block),
|
|
97
|
-
raw_text=text,
|
|
98
|
-
sample_clean=sample.get("clean"),
|
|
99
|
-
sample_unclean=sample.get("unclean"),
|
|
100
|
-
)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def _normalize_expression(expr):
|
|
104
|
-
if isinstance(expr, list):
|
|
105
|
-
return "\n".join(expr)
|
|
106
|
-
return str(expr).strip()
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
# ------------------------------------------------------------
|
|
110
|
-
# LEGACY PARSER
|
|
111
|
-
# ------------------------------------------------------------
|
|
112
29
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
30
|
+
@dataclass
class ASTNode:
    """
    One node of the Abstract Syntax Tree built from an expression.

    Attributes:
        type: Kind of node: 'binary_op', 'unary_op', 'function',
            'identifier' or 'literal'.
        value: Operator text, function name, column name, or literal value,
            depending on ``type``.
        left: Left operand (binary operators only).
        right: Right operand of a binary operator, or the sole operand of a
            unary operator.
        children: Argument nodes (functions only).
    """
    type: str
    value: Union[str, int, float, None]
    left: Optional['ASTNode'] = None
    right: Optional['ASTNode'] = None
    children: Optional[List['ASTNode']] = None

    def __repr__(self) -> str:
        """Return a compact, type-specific rendering of the node."""
        kind = self.type
        payload = self.value

        if kind == 'binary_op':
            return "BinaryOp({}, {}, {})".format(payload, self.left, self.right)
        if kind == 'unary_op':
            return "UnaryOp({}, {})".format(payload, self.right)
        if kind == 'function':
            return "Function({}, {})".format(payload, self.children)
        if kind == 'identifier':
            return "Identifier({})".format(payload)
        if kind == 'literal':
            return "Literal({})".format(payload)

        # Unknown node kinds fall back to a generic form.
        return "ASTNode({}, {})".format(kind, payload)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class ExpressionParser:
    """
    Parse expression strings into an Abstract Syntax Tree (AST).

    Supports:
    - Arithmetic operators: +, -, *, /, **, %
    - Comparison operators: ==, !=, >, <, >=, <=
    - Logical operators: AND, OR, NOT (case-insensitive keywords)
    - Functions: sqrt, abs, log, if_else, etc. (see ``FUNCTIONS``)
    - Parentheses for grouping

    Notes:
    - A name listed in ``FUNCTIONS`` is treated as a function call only when
      it is followed by '('; otherwise it is an ordinary identifier, so
      columns named e.g. ``year`` or ``count`` can be referenced.
    - Unary '-' and 'NOT' bind tighter than every binary operator:
      '-x ** 2' parses as '(-x) ** 2' and 'NOT a == b' as '(NOT a) == b'.
      Use parentheses to obtain the other grouping.
    - '**' is right-associative; all other binary operators are
      left-associative.
    """

    # Supported functions (matched case-insensitively, stored lower-case)
    FUNCTIONS = {
        # Mathematical
        'sqrt', 'abs', 'log', 'log10', 'exp', 'pow', 'round', 'floor', 'ceil',
        # String
        'lower', 'upper', 'trim', 'length', 'substring', 'replace', 'contains', 'matches',
        # Date/Time
        'year', 'month', 'day', 'hour', 'minute', 'second', 'day_of_week', 'time_of_day',
        # Aggregation
        'sum', 'mean', 'median', 'min', 'max', 'count', 'std',
        # Conditional
        'if_else', 'coalesce', 'is_null', 'is_not_null'
    }

    # Operator precedence (higher = higher precedence)
    PRECEDENCE = {
        'OR': 1,
        'AND': 2,
        '==': 3, '!=': 3, '>': 3, '<': 3, '>=': 3, '<=': 3,
        '+': 4, '-': 4,
        '*': 5, '/': 5, '%': 5,
        '**': 6,
        'UNARY': 7  # Unary minus and NOT
    }

    # Keywords that are tokenized as operators (compared upper-cased)
    _KEYWORD_OPERATORS = ('AND', 'OR', 'NOT')

    # Operators made of two characters; tried before single-character ones
    _TWO_CHAR_OPERATORS = ('**', '==', '!=', '>=', '<=')

    # Single-character operators. A lone '!' is deliberately absent: it is
    # only meaningful as part of '!='.
    _SINGLE_CHAR_OPERATORS = '+-*/%><'

    # Escape sequences recognised inside string literals
    _ESCAPES = {'n': '\n', 't': '\t', '"': '"', "'": "'", '\\': '\\'}

    def __init__(self):
        self.tokens = []
        self.position = 0

    def parse(self, expression: str) -> ASTNode:
        """
        Parse expression string into AST.

        Args:
            expression: Expression string to parse

        Returns:
            Root AST node

        Raises:
            ValueError: If the expression is empty, contains an invalid
                character or unterminated string, is syntactically
                malformed, or has trailing tokens after a complete
                expression.

        Example:
            parser = ExpressionParser()
            ast = parser.parse('weight / (height ** 2)')
        """
        # Tokenize
        self.tokens = self.tokenize(expression)
        self.position = 0

        if not self.tokens:
            raise ValueError("Empty expression")

        # Build AST
        ast = self.build_ast(self.tokens)

        # Anything left over means the input was not one single expression
        if self.position < len(self.tokens):
            token = self.tokens[self.position]
            raise ValueError(
                f"Unexpected token '{token.value}' at position {token.position}"
            )

        return ast

    def tokenize(self, expression: str) -> List[Token]:
        """
        Tokenize expression string.

        Args:
            expression: Expression string

        Returns:
            List of tokens, in source order. Whitespace produces no tokens.

        Raises:
            ValueError: On an unterminated string literal or a character
                that does not start any token (including a lone '!').
        """
        tokens = []
        length = len(expression)
        i = 0

        while i < length:
            char = expression[i]

            # Skip whitespace
            if char.isspace():
                i += 1
                continue

            # Numbers (integers and floats); a leading '.' is accepted only
            # when a digit follows it
            if char.isdigit() or (char == '.' and i + 1 < length and expression[i + 1].isdigit()):
                start = i
                has_dot = False
                while i < length and (expression[i].isdigit() or (expression[i] == '.' and not has_dot)):
                    if expression[i] == '.':
                        has_dot = True
                    i += 1
                value_str = expression[start:i]
                value = float(value_str) if has_dot else int(value_str)
                tokens.append(Token('NUMBER', value, start))
                continue

            # String literals (single or double quotes)
            if char in ('"', "'"):
                start = i
                i += 1
                parts = []
                while i < length and expression[i] != char:
                    if expression[i] == '\\' and i + 1 < length:
                        i += 1
                        escaped = expression[i]
                        if escaped in self._ESCAPES:
                            parts.append(self._ESCAPES[escaped])
                        else:
                            # Unknown escape: keep the backslash so that
                            # regex-style text such as '\d+' is not
                            # silently turned into 'd+'.
                            parts.append('\\' + escaped)
                    else:
                        parts.append(expression[i])
                    i += 1

                if i >= length:
                    raise ValueError(f"Unterminated string at position {start}")

                i += 1  # Skip closing quote
                tokens.append(Token('STRING', ''.join(parts), start))
                continue

            # Identifiers and keywords
            if char.isalpha() or char == '_':
                start = i
                while i < length and (expression[i].isalnum() or expression[i] == '_'):
                    i += 1
                value = expression[start:i]

                if value.upper() in self._KEYWORD_OPERATORS:
                    tokens.append(Token('OPERATOR', value.upper(), start))
                elif value.lower() in self.FUNCTIONS and self._followed_by_lparen(expression, i):
                    tokens.append(Token('FUNCTION', value.lower(), start))
                else:
                    # Includes function-like names used without a call,
                    # which are treated as plain column references.
                    tokens.append(Token('IDENTIFIER', value, start))
                continue

            # Two-character operators
            two_char = expression[i:i + 2]
            if two_char in self._TWO_CHAR_OPERATORS:
                tokens.append(Token('OPERATOR', two_char, i))
                i += 2
                continue

            # Single-character operators and punctuation
            if char in self._SINGLE_CHAR_OPERATORS:
                tokens.append(Token('OPERATOR', char, i))
                i += 1
                continue

            if char == '(':
                tokens.append(Token('LPAREN', '(', i))
                i += 1
                continue

            if char == ')':
                tokens.append(Token('RPAREN', ')', i))
                i += 1
                continue

            if char == ',':
                tokens.append(Token('COMMA', ',', i))
                i += 1
                continue

            # Unknown character
            raise ValueError(f"Invalid character '{char}' at position {i}")

        return tokens

    @staticmethod
    def _followed_by_lparen(expression: str, index: int) -> bool:
        """Return True if the next non-whitespace character at or after index is '('."""
        length = len(expression)
        while index < length and expression[index].isspace():
            index += 1
        return index < length and expression[index] == '('

    def build_ast(self, tokens: List[Token]) -> ASTNode:
        """
        Build AST from tokens using recursive descent parsing.

        The given tokens replace the parser's current token list and the
        read cursor is reset, so this method can be called directly with
        the output of ``tokenize``. Trailing tokens are not rejected here;
        ``parse`` performs that check via ``self.position``.

        Args:
            tokens: List of tokens

        Returns:
            Root AST node

        Raises:
            ValueError: If the token sequence is empty or malformed.
        """
        self.tokens = tokens
        self.position = 0
        return self._parse_expression()

    def _parse_expression(self, min_precedence: int = 0) -> ASTNode:
        """Parse a binary-operator expression using precedence climbing."""
        # Parse left side (primary expression)
        left = self._parse_primary()

        # Fold in binary operators whose precedence is high enough
        while self.position < len(self.tokens):
            token = self.tokens[self.position]

            # Only binary operators continue the expression; NOT is unary
            if token.type != 'OPERATOR' or token.value == 'NOT':
                break

            precedence = self.PRECEDENCE.get(token.value, 0)
            if precedence < min_precedence:
                break

            # Consume operator
            operator = token.value
            self.position += 1

            # Left-associative operators require strictly higher precedence
            # on the right; right-associative '**' allows the same level.
            next_min_precedence = precedence + (1 if operator != '**' else 0)
            right = self._parse_expression(next_min_precedence)

            left = ASTNode('binary_op', operator, left=left, right=right)

        return left

    def _parse_primary(self) -> ASTNode:
        """Parse primary expression (literals, identifiers, functions, parentheses, unary ops)."""
        if self.position >= len(self.tokens):
            raise ValueError("Unexpected end of expression")

        token = self.tokens[self.position]

        # Unary operators (-, NOT) apply to the immediately following primary
        if token.type == 'OPERATOR' and token.value in ('-', 'NOT'):
            self.position += 1
            operand = self._parse_primary()
            return ASTNode('unary_op', token.value, right=operand)

        # Parentheses
        if token.type == 'LPAREN':
            self.position += 1
            expr = self._parse_expression()

            if self.position >= len(self.tokens) or self.tokens[self.position].type != 'RPAREN':
                raise ValueError(f"Unmatched parenthesis at position {token.position}")

            self.position += 1
            return expr

        # Numbers and strings both become literal nodes
        if token.type in ('NUMBER', 'STRING'):
            self.position += 1
            return ASTNode('literal', token.value)

        # Functions
        if token.type == 'FUNCTION':
            return self._parse_function()

        # Identifiers (column names)
        if token.type == 'IDENTIFIER':
            self.position += 1
            return ASTNode('identifier', token.value)

        raise ValueError(f"Unexpected token '{token.value}' at position {token.position}")

    def _parse_function(self) -> ASTNode:
        """Parse a function call whose FUNCTION token is at the current position."""
        func_token = self.tokens[self.position]
        func_name = func_token.value
        self.position += 1

        # Expect opening parenthesis
        if self.position >= len(self.tokens) or self.tokens[self.position].type != 'LPAREN':
            raise ValueError(f"Expected '(' after function '{func_name}' at position {func_token.position}")

        self.position += 1

        args = []

        # Empty argument list
        if self.position < len(self.tokens) and self.tokens[self.position].type == 'RPAREN':
            self.position += 1
            return ASTNode('function', func_name, children=args)

        # First argument, then any comma-separated remainder
        args.append(self._parse_expression())
        while self.position < len(self.tokens) and self.tokens[self.position].type == 'COMMA':
            self.position += 1
            args.append(self._parse_expression())

        # Expect closing parenthesis
        if self.position >= len(self.tokens) or self.tokens[self.position].type != 'RPAREN':
            raise ValueError(f"Expected ')' after function arguments at position {func_token.position}")

        self.position += 1

        return ASTNode('function', func_name, children=args)
|