cnhkmcp-2.3.4-py3-none-any.whl → cnhkmcp-2.3.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. cnhkmcp/__init__.py +1 -1
  2. cnhkmcp/untracked/APP/Tranformer/validator.py +149 -32
  3. cnhkmcp/untracked/APP/blueprints/parsetab.py +60 -0
  4. cnhkmcp/untracked/APP/blueprints/validator.py +1080 -0
  5. cnhkmcp/untracked/APP/requirements.txt +0 -0
  6. cnhkmcp/untracked/APP/static/decoder.js +164 -38
  7. cnhkmcp/untracked/APP/static/simulator.js +15 -15
  8. cnhkmcp/untracked/APP/templates/feature_engineering.html +78 -78
  9. cnhkmcp/untracked/APP/templates/idea_house.html +49 -49
  10. cnhkmcp/untracked/APP/templates/index.html +58 -58
  11. cnhkmcp/untracked/APP/templates/inspiration_house.html +64 -64
  12. cnhkmcp/untracked/APP/templates/paper_analysis.html +21 -21
  13. cnhkmcp/untracked/APP/templates/simulator.html +38 -38
  14. cnhkmcp/untracked/APP/templates/transformer_web.html +24 -24
  15. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/SKILL.md +15 -15
  16. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental28_GLB_delay1/fundamental28_GLB_1_idea_1769927658009727000.json +10 -0
  17. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental28_GLB_delay1/fundamental28_GLB_1_idea_1769927658519220600.json +10 -0
  18. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental28_GLB_delay1/fundamental28_GLB_1_idea_1769927659002708800.json +10 -0
  19. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental28_GLB_delay1/fundamental28_GLB_1_idea_1769927659510920900.json +10 -0
  20. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental28_GLB_delay1/fundamental28_GLB_1_idea_1769927659982673800.json +10 -0
  21. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/validator.py +153 -32
  22. cnhkmcp/untracked/APP//321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +46 -0
  23. cnhkmcp/untracked/skills/alpha-expression-verifier/scripts/validator.py +149 -32
  24. cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/SKILL.md +8 -2
  25. cnhkmcp/untracked/skills/brain-inspectRawTemplate-create-Setting/ace.log +0 -0
  26. cnhkmcp/untracked/skills/brain-inspectRawTemplate-create-Setting/idea_context.json +15 -0
  27. cnhkmcp/untracked/skills/brain-inspectRawTemplate-create-Setting/scripts/__init__.py +0 -0
  28. cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/scripts/parse_idea_file.py +33 -1
  29. cnhkmcp/untracked/skills/brain-inspectRawTemplate-create-Setting/scripts/parsetab.py +60 -0
  30. cnhkmcp/untracked/skills/brain-inspectRawTemplate-create-Setting/scripts/validator.py +1086 -0
  31. {cnhkmcp-2.3.4.dist-info → cnhkmcp-2.3.5.dist-info}/METADATA +1 -1
  32. {cnhkmcp-2.3.4.dist-info → cnhkmcp-2.3.5.dist-info}/RECORD +49 -37
  33. /cnhkmcp/untracked/{skills/brain-inspectTemplate-create-Setting → APP/hkSimulator}/ace.log +0 -0
  34. /cnhkmcp/untracked/{skills/brain-inspectTemplate-create-Setting/scripts/__init__.py → APP/hkSimulator/autosim_20260201_172428.log} +0 -0
  35. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/.gitignore +0 -0
  36. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/ace_lib.py +0 -0
  37. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/config.json +0 -0
  38. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/fundamental28_GLB_1_idea_1769874845978315000.json +0 -0
  39. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/helpful_functions.py +0 -0
  40. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/scripts/build_alpha_list.py +0 -0
  41. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/scripts/fetch_sim_options.py +0 -0
  42. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/scripts/load_credentials.py +0 -0
  43. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/scripts/process_template.py +0 -0
  44. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/scripts/resolve_settings.py +0 -0
  45. /cnhkmcp/untracked/skills/{brain-inspectTemplate-create-Setting → brain-inspectRawTemplate-create-Setting}/sim_options_snapshot.json +0 -0
  46. {cnhkmcp-2.3.4.dist-info → cnhkmcp-2.3.5.dist-info}/WHEEL +0 -0
  47. {cnhkmcp-2.3.4.dist-info → cnhkmcp-2.3.5.dist-info}/entry_points.txt +0 -0
  48. {cnhkmcp-2.3.4.dist-info → cnhkmcp-2.3.5.dist-info}/licenses/LICENSE +0 -0
  49. {cnhkmcp-2.3.4.dist-info → cnhkmcp-2.3.5.dist-info}/top_level.txt +0 -0
cnhkmcp/untracked/APP/blueprints/validator.py
@@ -0,0 +1,1080 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Expression validator - uses an abstract syntax tree to verify that a string expression is well-formed
5
+
6
+ This module implements a system that checks whether a string expression is correctly formatted. Built on PLY (Python Lex-Yacc),
7
+ it constructs a lexer and a parser that recognize the operators, functions, and fields in an expression and validate its format.
8
+ """
9
+
10
+ import re
11
+ import sys
12
+ import json
13
+ import os
14
+ from typing import List, Dict, Any, Optional, Tuple
15
+
16
+ # Try to import the PLY library; if it is not installed, show an installation hint
17
+ try:
18
+ import ply.lex as lex
19
+ import ply.yacc as yacc
20
+ except ImportError:
21
+ print("错误: 需要安装PLY库。请运行 'pip install ply' 来安装。")
22
+ sys.exit(1)
23
+
24
+ # 1. Define the supported operators and functions
25
+ supported_functions = {
26
+ # Group 类别函数
27
+ 'group_min': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
28
+ 'group_mean': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression']},
29
+ 'group_median': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
30
+ 'group_max': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
31
+ 'group_rank': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
32
+ 'group_vector_proj': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'category']},
33
+ 'group_normalize': {'min_args': 2, 'max_args': 5, 'arg_types': ['expression', 'category', 'expression', 'expression', 'expression']},
34
+ 'group_extra': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'category']},
35
+ 'group_backfill': {'min_args': 3, 'max_args': 4, 'arg_types': ['expression', 'expression', 'expression', 'expression'], 'param_names': ['x', 'cat', 'days', 'std']},
36
+ 'group_scale': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
37
+ 'group_count': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
38
+ 'group_zscore': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
39
+ 'group_std_dev': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
40
+ 'group_sum': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
41
+ 'group_neutralize': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'category']},
42
+ 'group_multi_regression': {'min_args': 4, 'max_args': 9, 'arg_types': ['expression'] * 9},
43
+ 'group_cartesian_product': {'min_args': 2, 'max_args': 2, 'arg_types': ['category', 'category']},
44
+ 'combo_a': {'min_args': 1, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression']},
45
+
46
+ # Transformational 类别函数
47
+ 'right_tail': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression']},
48
+ 'bucket': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression']}, # 第二个参数可以是string类型的range参数
49
+ 'tail': {'min_args': 1, 'max_args': 4, 'arg_types': ['expression', 'expression', 'expression', 'expression']},
50
+ 'left_tail': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression']},
51
+ 'trade_when': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression']},
52
+ 'generate_stats': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
53
+
54
+ # Cross Sectional 类别函数
55
+ 'winsorize': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['x', 'std']},
56
+ 'rank': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression']},
57
+ 'regression_proj': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
58
+ 'vector_neut': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
59
+ 'regression_neut': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
60
+ 'multi_regression': {'min_args': 2, 'max_args': 100, 'arg_types': ['expression'] * 100}, # 支持多个自变量
61
+
62
+ # Time Series 类别函数
63
+ 'ts_std_dev': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
64
+ 'ts_mean': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
65
+ 'ts_delay': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
66
+ 'ts_corr': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number']},
67
+ 'ts_zscore': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
68
+ 'ts_returns': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'd', 'mode'], 'keyword_only': True},
69
+ 'ts_product': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
70
+ # Platform: ts_backfill(x, d)
71
+ 'ts_backfill': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'd']},
72
+ 'days_from_last_change': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
73
+ 'last_diff_value': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
74
+ # Platform: ts_scale(x, d, constant=0)
75
+ 'ts_scale': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'd', 'constant'], 'keyword_only': True},
76
+ # Platform: ts_entropy(x, d)
77
+ 'ts_entropy': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'd']},
78
+ 'ts_step': {'min_args': 1, 'max_args': 1, 'arg_types': ['number']},
79
+ 'ts_sum': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
80
+ 'ts_co_kurtosis': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number']},
81
+ 'inst_tvr': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
82
+ 'ts_decay_exp_window': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'd', 'factor'], 'keyword_only': True},
83
+ 'ts_av_diff': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
84
+ 'ts_kurtosis': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
85
+ # Platform: ts_min_max_diff(x, d, f=0.5)
86
+ 'ts_min_max_diff': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'd', 'f'], 'keyword_only': True},
87
+ 'ts_arg_max': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
88
+ 'ts_max': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
89
+ # Platform: ts_min_max_cps(x, d, f=2)
90
+ 'ts_min_max_cps': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'd', 'f'], 'keyword_only': True},
91
+ # Platform: ts_rank(x, d, constant=0)
92
+ 'ts_rank': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'd', 'constant'], 'keyword_only': True},
93
+ 'ts_ir': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
94
+ 'ts_theilsen': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number']},
95
+ # Platform: hump_decay(x, p=0)
96
+ 'hump_decay': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'p'], 'keyword_only': True},
97
+ # Platform: ts_weighted_decay(x, k=0.5)
98
+ 'ts_weighted_decay': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'k'], 'keyword_only': True},
99
+ # Platform: ts_quantile(x, d, driver="gaussian")
100
+ 'ts_quantile': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'string'], 'param_names': ['x', 'd', 'driver'], 'keyword_only': True},
101
+ 'ts_min': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
102
+ 'ts_count_nans': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
103
+ 'ts_covariance': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number']},
104
+ 'ts_co_skewness': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number']},
105
+ 'ts_min_diff': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
106
+ # Platform: ts_decay_linear(x, d, dense=false)
107
+ 'ts_decay_linear': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'boolean'], 'param_names': ['x', 'd', 'dense'], 'keyword_only': True},
108
+ # Platform: jump_decay(x, d, sensitivity=0.5, force=0.1)
109
+ 'jump_decay': {'min_args': 2, 'max_args': 4, 'arg_types': ['expression', 'number', 'number', 'number'], 'param_names': ['x', 'd', 'sensitivity', 'force'], 'keyword_only': True},
110
+ # Platform: ts_moment(x, d, k=0)
111
+ 'ts_moment': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'd', 'k'], 'keyword_only': True},
112
+ 'ts_arg_min': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
113
+ 'ts_regression': {'min_args': 3, 'max_args': 5, 'arg_types': ['expression', 'expression', 'number', 'number', 'number'], 'param_names': ['y', 'x', 'd', 'lag', 'rettype'], 'keyword_only': True},
114
+ 'ts_skewness': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
115
+ 'ts_max_diff': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
116
+ 'kth_element': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'number', 'number']},
117
+ 'hump': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'hump']},
118
+ 'ts_median': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
119
+ 'ts_delta': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
120
+ # Platform: ts_poly_regression(y, x, d, k=1) and k must be keyword if provided
121
+ 'ts_poly_regression': {'min_args': 3, 'max_args': 4, 'arg_types': ['expression', 'expression', 'number', 'number'], 'param_names': ['y', 'x', 'd', 'k'], 'keyword_only': True, 'keyword_only_from': 3},
122
+ 'ts_target_tvr_decay': {'min_args': 1, 'max_args': 4, 'arg_types': ['expression', 'number', 'number', 'number'], 'param_names': ['x', 'lambda_min', 'lambda_max', 'target_tvr'], 'keyword_only': True},
123
+ 'ts_target_tvr_delta_limit': {'min_args': 2, 'max_args': 5, 'arg_types': ['expression', 'expression', 'number', 'number', 'number'], 'param_names': ['x', 'y', 'lambda_min', 'lambda_max', 'target_tvr'], 'keyword_only': True},
124
+ 'ts_target_tvr_hump': {'min_args': 1, 'max_args': 4, 'arg_types': ['expression', 'number', 'number', 'number'], 'param_names': ['x', 'lambda_min', 'lambda_max', 'target_tvr'], 'keyword_only': True},
125
+ # Platform: ts_delta_limit(x, y, limit_volume=0.1)
126
+ 'ts_delta_limit': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number'], 'param_names': ['x', 'y', 'limit_volume'], 'keyword_only': True},
127
+
128
+ # Special 类别函数
129
+ 'inst_pnl': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
130
+ 'self_corr': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
131
+ 'in': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']}, # 注意:这是关键字
132
+ 'universe_size': {'min_args': 0, 'max_args': 0, 'arg_types': []},
133
+
134
+ # Missing functions from operators.py
135
+ 'quantile': {'min_args': 1, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression'], 'param_names': ['x', 'driver', 'sigma']}, # quantile(x, driver = gaussian, sigma = 1.0)
136
+ 'normalize': {'min_args': 1, 'max_args': 3, 'arg_types': ['expression', 'boolean', 'number']}, # normalize(x, useStd = false, limit = 0.0)
137
+ 'zscore': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']}, # zscore(x)
138
+
139
+ # Logical 类别函数
140
+ 'or': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']}, # 注意:这是关键字
141
+ 'and': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']}, # 注意:这是关键字
142
+ 'not': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']}, # 注意:这是关键字
143
+ 'is_nan': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
144
+ 'is_not_nan': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
145
+ 'less': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
146
+ 'equal': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
147
+ 'greater': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
148
+ 'is_finite': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
149
+ 'if_else': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression']},
150
+ 'not_equal': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
151
+ 'less_equal': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
152
+ 'greater_equal': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
153
+
154
+ # Vector 类别函数
155
+ 'vec_kurtosis': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
156
+ 'vec_min': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
157
+ 'vec_count': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
158
+ 'vec_sum': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
159
+ 'vec_skewness': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
160
+ 'vec_max': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
161
+ 'vec_avg': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
162
+ 'vec_range': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
163
+ 'vec_choose': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'nth']},
164
+ 'vec_powersum': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'constant']},
165
+ 'vec_stddev': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
166
+ 'vec_percentage': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'percentage']},
167
+ 'vec_ir': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
168
+ 'vec_norm': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
169
+ 'ts_percentage': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'd', 'percentage']},
170
+ 'signed_power': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},
171
+ 'ts_product': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number']},  # NOTE: duplicate key; identical to the 'ts_product' entry in the Time Series block above, so the repeated definition is harmless
172
+
173
+ # Additional functions from test cases
174
+ 'rank_by_side': {'min_args': 1, 'max_args': 3, 'arg_types': ['expression', 'number', 'number'], 'param_names': ['x', 'rate', 'scale']},
175
+ 'log_diff': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
176
+ 'nan_mask': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']},
177
+ 'ts_partial_corr': {'min_args': 4, 'max_args': 4, 'arg_types': ['expression', 'expression', 'expression', 'number']},
178
+ 'ts_triple_corr': {'min_args': 4, 'max_args': 4, 'arg_types': ['expression', 'expression', 'expression', 'number']},
179
+ 'clamp': {'min_args': 1, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression'], 'param_names': ['x', 'lower', 'upper']},
180
+ 'keep': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number'], 'param_names': ['x', 'condition', 'period']},
181
+ 'replace': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression'], 'param_names': ['x', 'target', 'dest']},
182
+ 'filter': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression'], 'param_names': ['x', 'h', 't']},
183
+ 'one_side': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'string'], 'param_names': ['x', 'side']},
184
+ 'scale_down': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'number'], 'param_names': ['x', 'constant']},
185
+
186
+ # Arithmetic 类别函数
187
+ # add(x, y, ..., filter=false)
188
+ # NOTE: add() is variadic (>=2 terms) with an optional boolean filter flag.
189
+ # We validate it with custom logic in validate_function().
190
+ 'add': {'min_args': 2, 'max_args': 101, 'arg_types': ['expression'] * 101},
191
+ 'multiply': {'min_args': 2, 'max_args': 100, 'arg_types': ['expression'] * 99 + ['boolean'], 'param_names': ['x', 'y', 'filter']}, # multiply(x, y, ..., filter=false)
192
+ 'sign': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
193
+ 'subtract': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'expression', 'boolean']}, # subtract(x, y, filter=false)
194
+ 'pasteurize': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
195
+ 'log': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
196
+ 'purify': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
197
+ 'arc_tan': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
198
+ 'max': {'min_args': 2, 'max_args': 100, 'arg_types': ['expression'] * 100}, # max(x, y, ...)
199
+ 'to_nan': {'min_args': 1, 'max_args': 3, 'arg_types': ['expression', 'expression', 'boolean']}, # to_nan(x, value=0, reverse=false)
200
+ 'abs': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
201
+ 'sigmoid': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
202
+ 'divide': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']}, # divide(x, y)
203
+ 'min': {'min_args': 2, 'max_args': 100, 'arg_types': ['expression'] * 100}, # min(x, y, ...)
204
+ 'tanh': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
205
+ 'nan_out': {'min_args': 1, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression'], 'param_names': ['x', 'lower', 'upper']}, # nan_out(x, lower=0, upper=0)
206
+ 'signed_power': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']}, # signed_power(x, y)  NOTE: duplicate key; silently overrides the earlier 'signed_power' entry, which declared the second argument as 'number'
207
+ 'inverse': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
208
+ 'round': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
209
+ 'sqrt': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
210
+ 's_log_1p': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
211
+ 'reverse': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']}, # -x
212
+ 'power': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression']}, # power(x, y)
213
+ 'densify': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
214
+ 'floor': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression']},
215
+ # Appended missing operators
216
+ 'arc_cos': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['x']},
217
+ 'arc_sin': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['x']},
218
+ 'ceiling': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['x']},
219
+ 'exp': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['x']},
220
+ 'fraction': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['x']},
221
+ 'round_down': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['x', 'f']},
222
+ 'is_not_finite': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
223
+ 'negate': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
224
+ 'ts_rank_gmean_amean_diff': {'min_args': 5, 'max_args': 5, 'arg_types': ['expression', 'expression', 'expression', 'expression', 'number'], 'param_names': ['input1', 'input2', 'input3', '...', 'd']},
225
+ 'ts_vector_neut': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number'], 'param_names': ['x', 'y', 'd']},
226
+ 'ts_vector_proj': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'number'], 'param_names': ['x', 'y', 'd']},
227
+ 'scale': {'min_args': 1, 'max_args': 4, 'arg_types': ['expression', 'expression', 'expression', 'expression'], 'param_names': ['x', 'scale', 'longscale', 'shortscale']},
228
+ 'generalized_rank': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['open', 'm']},
229
+ 'rank_gmean_amean_diff': {'min_args': 4, 'max_args': 4, 'arg_types': ['expression', 'expression', 'expression', 'expression'], 'param_names': ['input1', 'input2', 'input3', '...']},
230
+ 'truncate': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['x', 'maxPercent']},
231
+ 'vector_proj': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['x', 'y']},
232
+ 'vec_filter': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['vec', 'value']},
233
+ 'group_coalesce': {'min_args': 4, 'max_args': 4, 'arg_types': ['expression', 'expression', 'expression', 'expression'], 'param_names': ['original_group', 'group2', 'group3', '…']},
234
+ 'group_percentage': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'category', 'expression'], 'param_names': ['x', 'group', 'percentage']},
235
+ 'group_vector_neut': {'min_args': 3, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression'], 'param_names': ['x', 'y', 'g']},
236
+ 'convert': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['x', 'mode']},
237
+ 'reduce_avg': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['input', 'threshold']},
238
+ 'reduce_choose': {'min_args': 2, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression'], 'param_names': ['input', 'nth', 'ignoreNan']},
239
+ 'reduce_count': {'min_args': 2, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['input', 'threshold']},
240
+ 'reduce_ir': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
241
+ 'reduce_kurtosis': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
242
+ 'reduce_max': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
243
+ 'reduce_min': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
244
+ 'reduce_norm': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
245
+ 'reduce_percentage': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['input', 'percentage']},
246
+ 'reduce_powersum': {'min_args': 1, 'max_args': 3, 'arg_types': ['expression', 'expression', 'expression'], 'param_names': ['input', 'constant', 'precise']},
247
+ 'reduce_range': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
248
+ 'reduce_skewness': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
249
+ 'reduce_stddev': {'min_args': 1, 'max_args': 2, 'arg_types': ['expression', 'expression'], 'param_names': ['input', 'threshold']},
250
+ 'reduce_sum': {'min_args': 1, 'max_args': 1, 'arg_types': ['expression'], 'param_names': ['input']},
251
+ }
252
+
253
+ # 2. Define group-type fields
254
+ group_fields = {
255
+ 'sector', 'subindustry', 'industry', 'exchange', 'country', 'market'
256
+ }
257
+
258
+ # 3. Set of valid categories
259
+ valid_categories = group_fields
260
+
261
+ # 4. Field naming pattern - only checks that a field name consists of letters, digits, and underscores
262
+ field_patterns = [
263
+ re.compile(r'^[a-zA-Z0-9_]+$'), # 只允许数字、字母和下划线组成的字段名
264
+ ]
265
+
266
+ # 5. Abstract syntax tree node types
267
+ class ASTNode:
268
+ """抽象语法树节点基类"""
269
+ def __init__(self, node_type: str, children: Optional[List['ASTNode']] = None,
270
+ value: Optional[Any] = None, line: Optional[int] = None):
271
+ self.node_type = node_type # 'function', 'operator', 'field', 'number', 'expression'
272
+ self.children = children or []
273
+ self.value = value
274
+ self.line = line
275
+
276
+ def __str__(self) -> str:
277
+ return f"ASTNode({self.node_type}, {self.value}, line={self.line})"
278
+
279
+ def __repr__(self) -> str:
280
+ return self.__str__()
281
+
282
+ class ExpressionValidator:
283
+ """表达式验证器类"""
284
+
285
+ def __init__(self):
286
+ """初始化词法分析器和语法分析器"""
287
+ # 构建词法分析器
288
+ self.lexer = lex.lex(module=self, debug=False)
289
+ # 构建语法分析器
290
+ self.parser = yacc.yacc(module=self, debug=False)
291
+ # 错误信息存储
292
+ self.errors = []
293
+ # Cache for unit inference (unit/scalar/category)
294
+ self._unit_cache: Dict[int, str] = {}
295
+ # Cache for derived category detection (bucket/group_cartesian_product outputs)
296
+ self._derived_category_cache: Dict[int, bool] = {}
297
+
298
+ # 词法分析器规则
299
+ tokens = ('FUNCTION', 'FIELD', 'NUMBER', 'LPAREN', 'RPAREN',
300
+ 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'COMMA', 'CATEGORY',
301
+ 'EQUAL', 'ASSIGN', 'IDENTIFIER', 'STRING', 'GREATER', 'LESS', 'GREATEREQUAL', 'LESSEQUAL', 'NOTEQUAL', 'BOOLEAN')
302
+
303
+ # 忽略空白字符
304
+ t_ignore = ' \t\n'  # NOTE: because '\n' is listed in t_ignore, the t_newline() rule below never fires, so lexer.lineno is not advanced
305
+
306
+ # 操作符 - 注意顺序很重要,长的操作符要放在前面
307
+ t_PLUS = r'\+'
308
+ t_MINUS = r'-'
309
+ t_TIMES = r'\*'
310
+ t_DIVIDE = r'/'
311
+ t_LPAREN = r'\('
312
+ t_RPAREN = r'\)'
313
+ t_COMMA = r','
314
+ t_EQUAL = r'=='
315
+ t_NOTEQUAL = r'!='
316
+ t_GREATEREQUAL = r'>='
317
+ t_LESSEQUAL = r'<='
318
+ t_GREATER = r'>'
319
+ t_LESS = r'<'
320
+ t_ASSIGN = r'='
321
+
322
+ # 数字(整数和浮点数)
323
+ def t_NUMBER(self, t):
324
+ r'\d+\.?\d*'
325
+ if '.' in t.value:
326
+ t.value = float(t.value)
327
+ else:
328
+ t.value = int(t.value)
329
+ return t
330
+
331
+ # 字符串 - 需要放在所有其他标识符规则之前
332
+ def t_STRING(self, t):
333
+ r"'[^']*'|\"[^\"]*\""
334
+ # 去除引号
335
+ t.value = t.value[1:-1]
336
+ return t
337
+
338
+ # 函数和字段名
339
+ def t_IDENTIFIER(self, t):
340
+ r'[a-zA-Z_][a-zA-Z0-9_]*'
341
+ # 检查是否为布尔值
342
+ if t.value.lower() in {'true', 'false'}:
343
+ t.type = 'BOOLEAN'
344
+ t.value = t.value.lower() # 转换为小写以保持一致性
345
+ else:
346
+ # 查看当前token后面的字符,判断是否为参数名(后面跟着'=')
347
+ lexpos = t.lexpos
348
+ next_chars = ''
349
+ if lexpos + len(t.value) < len(t.lexer.lexdata):
350
+ # 查看当前token后面的字符,跳过空格
351
+ next_pos = lexpos + len(t.value)
352
+ while next_pos < len(t.lexer.lexdata) and t.lexer.lexdata[next_pos].isspace():
353
+ next_pos += 1
354
+ if next_pos < len(t.lexer.lexdata):
355
+ next_chars = t.lexer.lexdata[next_pos:next_pos+1]
356
+
357
+ # 如果后面跟着'=',则为参数名
358
+ if next_chars == '=':
359
+ t.type = 'IDENTIFIER'
360
+ # 如果后面跟着'(',则为函数名
361
+ elif next_chars == '(':
362
+ t.type = 'FUNCTION'
363
+ t.value = t.value.lower() # 转换为小写以保持一致性
364
+ # 检查是否为参数名(支持更多参数名)
365
+ elif t.value in {'std', 'k', 'lambda_min', 'lambda_max', 'target_tvr', 'range', 'buckets', 'lag', 'rettype', 'mode', 'nth', 'constant', 'percentage', 'driver', 'sigma', 'rate', 'scale', 'filter', 'lower', 'upper', 'target', 'dest', 'event', 'sensitivity', 'force', 'h', 't', 'period', 'stddev', 'factor', 'k', 'useStd', 'limit', 'gaussian', 'uniform', 'cauchy'}:
366
+ t.type = 'IDENTIFIER'
367
+ # 检查是否为函数名(不区分大小写)
368
+ elif t.value.lower() in supported_functions:
369
+ t.type = 'FUNCTION'
370
+ t.value = t.value.lower() # 转换为小写以保持一致性
371
+ # 检查是否为有效类别
372
+ elif t.value in valid_categories:
373
+ t.type = 'CATEGORY'
374
+ # 检查是否为字段名
375
+ elif self._is_valid_field(t.value):
376
+ t.type = 'FIELD'
377
+ else:
378
+ # 其他标识符,保留为IDENTIFIER类型
379
+ t.type = 'IDENTIFIER'
380
+ return t
381
+
382
+ # 行号跟踪
383
+ def t_newline(self, t):
384
+ r'\n+'
385
+ t.lexer.lineno += len(t.value)
386
+
387
+ # 错误处理
388
+ def t_error(self, t):
389
+ if t:
390
+ # 检查是否为非法字符
391
+ if not re.match(r'[a-zA-Z0-9_\+\-\*/\(\)\,\s=<>!]', t.value[0]):
392
+ # 这是一个非法字符
393
+ self.errors.append(f"非法字符 '{t.value[0]}' (行 {t.lexer.lineno})")
394
+ else:
395
+ # 这是一个非法标记
396
+ self.errors.append(f"非法标记 '{t.value}' (行 {t.lexer.lineno})")
397
+ # 跳过这个字符,继续处理
398
+ t.lexer.skip(1)
399
+ else:
400
+ self.errors.append("词法分析器到达文件末尾")
401
+
402
+ # 语法分析器规则
403
+ def p_expression(self, p):
404
+ """expression : comparison
405
+ | expression EQUAL comparison
406
+ | expression NOTEQUAL comparison
407
+ | expression GREATER comparison
408
+ | expression LESS comparison
409
+ | expression GREATEREQUAL comparison
410
+ | expression LESSEQUAL comparison"""
411
+ if len(p) == 2:
412
+ p[0] = p[1]
413
+ else:
414
+ p[0] = ASTNode('binop', [p[1], p[3]], {'op': p[2]})
415
+
416
+ def p_comparison(self, p):
417
+ """comparison : term
418
+ | comparison PLUS term
419
+ | comparison MINUS term"""
420
+ if len(p) == 2:
421
+ p[0] = p[1]
422
+ else:
423
+ p[0] = ASTNode('binop', [p[1], p[3]], {'op': p[2]})
424
+
425
+ def p_term(self, p):
426
+ """term : factor
427
+ | term TIMES factor
428
+ | term DIVIDE factor"""
429
+ if len(p) == 2:
430
+ p[0] = p[1]
431
+ else:
432
+ p[0] = ASTNode('binop', [p[1], p[3]], {'op': p[2]})
433
+
434
+ def p_factor(self, p):
435
+ """factor : NUMBER
436
+ | STRING
437
+ | FIELD
438
+ | CATEGORY
439
+ | IDENTIFIER
440
+ | BOOLEAN
441
+ | MINUS factor
442
+ | LPAREN expression RPAREN
443
+ | function_call"""
444
+ if len(p) == 2:
445
+ # 数字、字符串、字段、类别或标识符
446
+ if p.slice[1].type == 'NUMBER':
447
+ p[0] = ASTNode('number', value=p[1])
448
+ elif p.slice[1].type == 'STRING':
449
+ p[0] = ASTNode('string', value=p[1])
450
+ elif p.slice[1].type == 'FIELD':
451
+ p[0] = ASTNode('field', value=p[1])
452
+ elif p.slice[1].type == 'CATEGORY':
453
+ p[0] = ASTNode('category', value=p[1])
454
+ elif p.slice[1].type == 'BOOLEAN':
455
+ p[0] = ASTNode('boolean', value=p[1])
456
+ elif p.slice[1].type == 'IDENTIFIER':
457
+ p[0] = ASTNode('identifier', value=p[1])
458
+ else:
459
+ p[0] = p[1]
460
+ elif len(p) == 3:
461
+ # 一元负号
462
+ p[0] = ASTNode('unop', [p[2]], {'op': p[1]})
463
+ elif len(p) == 4:
464
+ # 括号表达式
465
+ p[0] = p[2]
466
+ else:
467
+ # 函数调用
468
+ p[0] = p[1]
469
+
470
+ def p_function_call(self, p):
471
+ '''function_call : FUNCTION LPAREN args RPAREN'''
472
+ p[0] = ASTNode('function', p[3], p[1])
473
+
474
+ def p_args(self, p):
475
+ '''args : arg_list
476
+ | empty'''
477
+ if len(p) == 2 and p[1] is not None:
478
+ p[0] = p[1]
479
+ else:
480
+ p[0] = []
481
+
482
+ def p_arg_list(self, p):
483
+ '''arg_list : arg
484
+ | arg_list COMMA arg'''
485
+ if len(p) == 2:
486
+ p[0] = [p[1]]
487
+ else:
488
+ p[0] = p[1] + [p[3]]
489
+
490
+ def p_arg(self, p):
491
+ '''arg : expression
492
+ | IDENTIFIER ASSIGN expression'''
493
+ if len(p) == 2:
494
+ p[0] = {'type': 'positional', 'value': p[1]}
495
+ else:
496
+ p[0] = {'type': 'named', 'name': p[1], 'value': p[3]}
497
+
498
+ def p_empty(self, p):
499
+ '''empty :'''
500
+ p[0] = None
501
+
502
+ # 语法错误处理
503
+ def p_error(self, p):
504
+ if p:
505
+ self.errors.append(f"语法错误在位置 {p.lexpos}: 非法标记 '{p.value}'")
506
+ else:
507
+ self.errors.append("语法错误: 表达式不完整")
508
+
509
+ def _is_valid_field(self, field_name: str) -> bool:
510
+ """检查字段名是否符合模式"""
511
+ for pattern in field_patterns:
512
+ if pattern.match(field_name):
513
+ return True
514
+ return False
515
+
516
+ def validate_function(self, node: ASTNode, is_in_group_arg: bool = False) -> List[str]:
517
+ """验证函数调用的参数数量和类型"""
518
+ function_name = node.value
519
+ args = node.children
520
+ function_info = supported_functions.get(function_name)
521
+
522
+ if not function_info:
523
+ return [f"未知函数: {function_name}"]
524
+
525
+ # Custom validation for variadic functions with optional flags
526
+ if function_name == 'add':
527
+ return self._validate_add(args, is_in_group_arg)
528
+
529
+ errors = []
530
+
531
+ # Keyword-only enforcement for optional parameters.
532
+ # If enabled, only the required leading arguments can be positional.
533
+ keyword_only_from = function_info.get('keyword_only_from')
534
+ if keyword_only_from is None and function_info.get('keyword_only'):
535
+ keyword_only_from = function_info.get('min_args', 0)
536
+
537
+ # 检查参数数量
538
+ if len(args) < function_info['min_args']:
539
+ errors.append(f"函数 {function_name} 需要至少 {function_info['min_args']} 个参数,但只提供了 {len(args)}")
540
+ elif len(args) > function_info['max_args']:
541
+ errors.append(f"函数 {function_name} 最多接受 {function_info['max_args']} 个参数,但提供了 {len(args)}")
542
+
543
+ # 处理参数验证
544
+ # 跟踪已使用的位置参数索引
545
+ positional_index = 0
546
+
547
+ # 对于所有函数,支持命名参数
548
+ for arg in args:
549
+ if isinstance(arg, dict):
550
+ if arg['type'] == 'named':
551
+ # 命名参数
552
+ if 'param_names' in function_info and arg['name'] in function_info['param_names']:
553
+ # 查找参数在param_names中的索引
554
+ param_index = function_info['param_names'].index(arg['name'])
555
+ if param_index < len(function_info['arg_types']):
556
+ expected_type = function_info['arg_types'][param_index]
557
+ arg_errors = self._validate_arg_type(arg['value'], expected_type, param_index, function_name, is_in_group_arg)
558
+ errors.extend(arg_errors)
559
+ # 对于winsorize函数,支持std和clip参数
560
+ elif function_name == 'winsorize' and arg['name'] in ['std', 'clip']:
561
+ arg_errors = self._validate_arg_type(arg['value'], 'number', 0, function_name, is_in_group_arg)
562
+ errors.extend(arg_errors)
563
+ # 对于bucket函数,支持'range'和'buckets'参数
564
+ elif function_name == 'bucket' and arg['name'] in ['range', 'buckets']:
565
+ # range和buckets参数应该是string类型
566
+ arg_errors = self._validate_arg_type(arg['value'], 'string', 1, function_name, is_in_group_arg)
567
+ errors.extend(arg_errors)
568
+ else:
569
+ errors.append(f"函数 {function_name} 不存在参数 '{arg['name']}'")
570
+ elif arg['type'] == 'positional':
571
+ # 位置参数(字典形式)
572
+ if keyword_only_from is not None and positional_index >= keyword_only_from:
573
+ param_name = None
574
+ if 'param_names' in function_info and positional_index < len(function_info['param_names']):
575
+ param_name = function_info['param_names'][positional_index]
576
+ if param_name:
577
+ errors.append(f"函数 {function_name} 的第{positional_index+1}个参数必须使用命名参数 '{param_name}='")
578
+ else:
579
+ errors.append(f"函数 {function_name} 的第{positional_index+1}个参数必须使用命名参数")
580
+ else:
581
+ # 验证位置参数的类型
582
+ if positional_index < len(function_info['arg_types']):
583
+ expected_type = function_info['arg_types'][positional_index]
584
+ arg_errors = self._validate_arg_type(arg['value'], expected_type, positional_index, function_name, is_in_group_arg)
585
+ errors.extend(arg_errors)
586
+ positional_index += 1
587
+ else:
588
+ # 其他字典类型参数
589
+ errors.append(f"参数 {positional_index+1} 格式错误")
590
+ positional_index += 1
591
+ else:
592
+ # 位置参数(直接ASTNode形式)
593
+ if keyword_only_from is not None and positional_index >= keyword_only_from:
594
+ param_name = None
595
+ if 'param_names' in function_info and positional_index < len(function_info['param_names']):
596
+ param_name = function_info['param_names'][positional_index]
597
+ if param_name:
598
+ errors.append(f"函数 {function_name} 的第{positional_index+1}个参数必须使用命名参数 '{param_name}='")
599
+ else:
600
+ errors.append(f"函数 {function_name} 的第{positional_index+1}个参数必须使用命名参数")
601
+ else:
602
+ # 验证位置参数的类型
603
+ if positional_index < len(function_info['arg_types']):
604
+ expected_type = function_info['arg_types'][positional_index]
605
+ arg_errors = self._validate_arg_type(arg, expected_type, positional_index, function_name, is_in_group_arg)
606
+ errors.extend(arg_errors)
607
+ positional_index += 1
608
+
609
+ return errors
610
+
611
+ def _validate_arg_type(self, arg: ASTNode, expected_type: str, arg_index: int, function_name: str, is_in_group_arg: bool = False) -> List[str]:
612
+ """验证参数类型是否符合预期"""
613
+ errors = []
614
+
615
+ # Unit compatibility check
616
+ # bucket()/group_cartesian_product() output a derived category (grouping key).
617
+ # It can only be consumed where a category/grouping key is expected.
618
+ if self._is_derived_category(arg) and expected_type != 'category':
619
+ errors.append(
620
+ f"Incompatible unit for input of \"{function_name}\" at index {arg_index}, expected \"Unit[]\", found \"Unit[Group:1]\""
621
+ )
622
+ return errors
623
+
624
+ # 首先检查是否是group类型字段,如果是则只能用于Group类型函数
625
+ # 但是如果当前函数是group_xxx或在group函数的参数链中,则允许使用
626
+ if arg.node_type == 'category' and arg.value in group_fields:
627
+ if not (function_name.startswith('group_') or is_in_group_arg):
628
+ errors.append(f"Group类型字段 '{arg.value}' 只能用于Group类型函数的参数中")
629
+
630
+ # 然后验证参数类型是否符合预期
631
+ if expected_type == 'expression':
632
+ # 表达式可以是任何有效的AST节点
633
+ pass
634
+ elif expected_type == 'number':
635
+ if arg.node_type != 'number':
636
+ errors.append(f"参数 {arg_index+1} 应该是一个数字,但得到 {arg.node_type}")
637
+ elif expected_type == 'boolean':
638
+ # 布尔值可以是 true/false 或数字(0/1)
639
+ if arg.node_type not in {'boolean', 'number'}:
640
+ errors.append(f"参数 {arg_index+1} 应该是一个布尔值(true/false 或 0/1),但得到 {arg.node_type}")
641
+ elif expected_type == 'field':
642
+ if arg.node_type != 'field' and arg.node_type != 'category':
643
+ # 允许field或category作为字段参数
644
+ errors.append(f"参数 {arg_index+1} 应该是一个字段,但得到 {arg.node_type}")
645
+ elif arg.node_type == 'field' and not self._is_valid_field(arg.value):
646
+ errors.append(f"无效的字段名: {arg.value}")
647
+ elif expected_type == 'category':
648
+ if not function_name.startswith('group_'):
649
+ # 非group函数的category参数必须是category类型且在valid_categories中
650
+ if arg.node_type != 'category':
651
+ errors.append(f"参数 {arg_index+1} 应该是一个类别,但得到 {arg.node_type}")
652
+ elif arg.value not in valid_categories:
653
+ errors.append(f"无效的类别: {arg.value}")
654
+ # group函数的category参数可以是任何类型(field、category等),不进行类型校验
655
+
656
+ return errors
657
+
658
+ def _infer_unit(self, node: ASTNode) -> str:
659
+ """Infer the Unit kind of an AST node.
660
+
661
+ Returns:
662
+ 'unit' - regular numeric time-series Unit[]
663
+ 'scalar' - literals (numbers/booleans/strings)
664
+ 'category' - category/grouping keys (industry/sector or derived via bucket/cartesian)
665
+ """
666
+ if node is None:
667
+ return 'unit'
668
+
669
+ cache_key = id(node)
670
+ cached = self._unit_cache.get(cache_key)
671
+ if cached is not None:
672
+ return cached
673
+
674
+ unit = 'unit'
675
+
676
+ if node.node_type in {'number', 'boolean', 'string'}:
677
+ unit = 'scalar'
678
+ elif node.node_type in {'field', 'identifier'}:
679
+ unit = 'unit'
680
+ elif node.node_type == 'category':
681
+ unit = 'category'
682
+ elif node.node_type in {'unop', 'binop'}:
683
+ child_units = [self._infer_unit(child) for child in node.children if hasattr(child, 'node_type')]
684
+ unit = 'category' if 'category' in child_units else 'unit'
685
+ elif node.node_type == 'function':
686
+ fname = node.value
687
+ if fname in {'bucket', 'group_cartesian_product'}:
688
+ unit = 'category'
689
+ else:
690
+ first_arg = None
691
+ for child in node.children:
692
+ if isinstance(child, dict):
693
+ if child.get('type') == 'positional':
694
+ first_arg = child.get('value')
695
+ break
696
+ else:
697
+ first_arg = child
698
+ break
699
+ if hasattr(first_arg, 'node_type'):
700
+ unit = self._infer_unit(first_arg)
701
+ else:
702
+ unit = 'unit'
703
+
704
+ self._unit_cache[cache_key] = unit
705
+ return unit
706
+
707
+ def _is_derived_category(self, node: ASTNode) -> bool:
708
+ """Return True if node is a derived category/grouping key (e.g., bucket/cartesian output)."""
709
+ if node is None:
710
+ return False
711
+
712
+ cache_key = id(node)
713
+ cached = self._derived_category_cache.get(cache_key)
714
+ if cached is not None:
715
+ return cached
716
+
717
+ derived = False
718
+ if node.node_type == 'function' and node.value in {'bucket', 'group_cartesian_product'}:
719
+ derived = True
720
+ elif node.node_type in {'unop', 'binop'}:
721
+ derived = any(
722
+ self._is_derived_category(child)
723
+ for child in node.children
724
+ if hasattr(child, 'node_type')
725
+ )
726
+ elif node.node_type == 'function':
727
+ derived = any(
728
+ self._is_derived_category(child.get('value')) if isinstance(child, dict) else self._is_derived_category(child)
729
+ for child in node.children
730
+ )
731
+
732
+ self._derived_category_cache[cache_key] = derived
733
+ return derived
734
+
735
+ def _validate_add(self, args: List[Any], is_in_group_arg: bool = False) -> List[str]:
736
+ """Validate add(x, y, ..., filter=false).
737
+
738
+ Rules:
739
+ - At least 2 positional expression terms.
740
+ - Optional filter flag can be provided as:
741
+ - named argument: filter=<boolean>
742
+ - last positional argument: <boolean> or 0/1
743
+ """
744
+ errors: List[str] = []
745
+
746
+ if len(args) < 2:
747
+ return [f"函数 add 需要至少 2 个参数,但只提供了 {len(args)}"]
748
+
749
+ named_filter_nodes: List[ASTNode] = []
750
+ positional_nodes: List[ASTNode] = []
751
+
752
+ for arg in args:
753
+ if isinstance(arg, dict) and arg.get('type') == 'named':
754
+ name = arg.get('name')
755
+ value = arg.get('value')
756
+ if name != 'filter':
757
+ errors.append(f"函数 add 不存在参数 '{name}'")
758
+ continue
759
+ if not hasattr(value, 'node_type'):
760
+ errors.append("函数 add 的参数 filter 格式错误")
761
+ continue
762
+ named_filter_nodes.append(value)
763
+ elif isinstance(arg, dict) and arg.get('type') == 'positional':
764
+ value = arg.get('value')
765
+ if hasattr(value, 'node_type'):
766
+ positional_nodes.append(value)
767
+ else:
768
+ errors.append("函数 add 的位置参数格式错误")
769
+ elif hasattr(arg, 'node_type'):
770
+ positional_nodes.append(arg)
771
+ else:
772
+ errors.append("函数 add 的参数格式错误")
773
+
774
+ if len(named_filter_nodes) > 1:
775
+ errors.append("函数 add 的参数 'filter' 只能出现一次")
776
+
777
+ positional_filter_node: Optional[ASTNode] = None
778
+ # Only infer a positional filter flag when:
779
+ # - no named filter is provided
780
+ # - there are at least 3 positional args (x, y, filter)
781
+ # - the last arg is boolean or numeric 0/1
782
+ if not named_filter_nodes and len(positional_nodes) >= 3:
783
+ last = positional_nodes[-1]
784
+ if last.node_type == 'boolean' or (last.node_type == 'number' and last.value in {0, 1}):
785
+ positional_filter_node = positional_nodes.pop()
786
+
787
+ if len(positional_nodes) < 2:
788
+ errors.append(f"函数 add 需要至少 2 个输入项(不含filter),但只提供了 {len(positional_nodes)}")
789
+
790
+ for idx, node in enumerate(positional_nodes):
791
+ errors.extend(self._validate_arg_type(node, 'expression', idx, 'add', is_in_group_arg))
792
+
793
+ if positional_filter_node is not None and named_filter_nodes:
794
+ errors.append("函数 add 的 filter 不能同时用位置参数和命名参数传递")
795
+ if positional_filter_node is not None:
796
+ errors.extend(self._validate_arg_type(positional_filter_node, 'boolean', len(positional_nodes), 'add', is_in_group_arg))
797
+ if named_filter_nodes:
798
+ errors.extend(self._validate_arg_type(named_filter_nodes[0], 'boolean', len(positional_nodes), 'add', is_in_group_arg))
799
+
800
+ return errors
801
+
802
+ def validate_ast(self, ast: Optional[ASTNode], is_in_group_arg: bool = False) -> List[str]:
803
+ """递归验证抽象语法树"""
804
+ if not ast:
805
+ return ["无法解析表达式"]
806
+
807
+ errors = []
808
+
809
+ # 根据节点类型进行验证
810
+ if ast.node_type == 'function':
811
+ # 检查当前函数是否是group函数
812
+ is_group_function = ast.value.startswith('group_')
813
+ # 确定当前是否在group函数的参数链中
814
+ current_in_group_arg = is_in_group_arg or is_group_function
815
+ # 验证函数
816
+ function_errors = self.validate_function(ast, current_in_group_arg)
817
+ errors.extend(function_errors)
818
+
819
+ # 递归验证子节点时使用current_in_group_arg
820
+ for child in ast.children:
821
+ if isinstance(child, dict):
822
+ # 命名参数,验证其值
823
+ if 'value' in child and hasattr(child['value'], 'node_type'):
824
+ child_errors = self.validate_ast(child['value'], current_in_group_arg)
825
+ errors.extend(child_errors)
826
+ elif hasattr(child, 'node_type'):
827
+ child_errors = self.validate_ast(child, current_in_group_arg)
828
+ errors.extend(child_errors)
829
+ elif ast.node_type in ['unop', 'binop']:
830
+ # 对操作符的子节点进行验证
831
+ for child in ast.children:
832
+ if hasattr(child, 'node_type'):
833
+ child_errors = self.validate_ast(child, is_in_group_arg)
834
+ errors.extend(child_errors)
835
+ elif ast.node_type == 'field':
836
+ # 验证字段名
837
+ if not self._is_valid_field(ast.value):
838
+ errors.append(f"无效的字段名: {ast.value}")
839
+ else:
840
+ # 递归验证子节点
841
+ for child in ast.children:
842
+ if isinstance(child, dict):
843
+ # 命名参数,验证其值
844
+ if 'value' in child and hasattr(child['value'], 'node_type'):
845
+ child_errors = self.validate_ast(child['value'], is_in_group_arg)
846
+ errors.extend(child_errors)
847
+ elif hasattr(child, 'node_type'):
848
+ child_errors = self.validate_ast(child, is_in_group_arg)
849
+ errors.extend(child_errors)
850
+
851
+ return errors
852
+
853
+ def _process_semicolon_expression(self, expression: str) -> Tuple[bool, str]:
854
+ """处理带有分号的表达式,将其转换为不带分号的简化形式
855
+
856
+ Args:
857
+ expression: 要处理的表达式字符串
858
+
859
+ Returns:
860
+ Tuple[bool, str]: (是否成功, 转换后的表达式或错误信息)
861
+ """
862
+ # 检查表达式是否以分号结尾
863
+ if expression.strip().endswith(';'):
864
+ return False, "表达式不能以分号结尾"
865
+
866
+ # 分割表达式为语句列表
867
+ statements = [stmt.strip() for stmt in expression.split(';') if stmt.strip()]
868
+ if not statements:
869
+ return False, "表达式不能为空"
870
+
871
+ # 存储变量赋值
872
+ variables = {}
873
+
874
+ # 处理每个赋值语句(除了最后一个)
875
+ for i, stmt in enumerate(statements[:-1]):
876
+ # 检查是否包含赋值符号
877
+ if '=' not in stmt:
878
+ return False, f"第{i+1}个语句必须是赋值语句(使用=符号)"
879
+
880
+ # 检查是否是比较操作符(==, !=, <=, >=)
881
+ if any(op in stmt for op in ['==', '!=', '<=', '>=']):
882
+ # 如果包含比较操作符,需要确认是否有赋值符号
883
+ # 使用临时替换法:将比较操作符替换为临时标记,再检查是否还有=
884
+ temp_stmt = stmt
885
+ for op in ['==', '!=', '<=', '>=']:
886
+ temp_stmt = temp_stmt.replace(op, '---')
887
+
888
+ if '=' not in temp_stmt:
889
+ return False, f"第{i+1}个语句必须是赋值语句,不能只是比较表达式"
890
+
891
+ # 找到第一个=符号(不是比较操作符的一部分)
892
+ # 先将比较操作符替换为临时标记,再找=
893
+ temp_stmt = stmt
894
+ for op in ['==', '!=', '<=', '>=']:
895
+ temp_stmt = temp_stmt.replace(op, '---')
896
+
897
+ if '=' not in temp_stmt:
898
+ return False, f"第{i+1}个语句必须是赋值语句(使用=符号)"
899
+
900
+ # 找到实际的=位置
901
+ equals_pos = temp_stmt.index('=')
902
+
903
+ # 在原始语句中找到对应位置
904
+ real_equals_pos = 0
905
+ temp_count = 0
906
+ for char in stmt:
907
+ if temp_count == equals_pos:
908
+ break
909
+ if char in '!<>':
910
+ # 检查是否是比较操作符的一部分
911
+ if real_equals_pos + 1 < len(stmt) and stmt[real_equals_pos + 1] == '=':
912
+ # 是比较操作符,跳过两个字符
913
+ real_equals_pos += 2
914
+ temp_count += 3 # 因为替换成了三个字符的---
915
+ else:
916
+ real_equals_pos += 1
917
+ temp_count += 1
918
+ else:
919
+ real_equals_pos += 1
920
+ temp_count += 1
921
+
922
+ # 分割变量名和值
923
+ var_name = stmt[:real_equals_pos].strip()
924
+ var_value = stmt[real_equals_pos + 1:].strip()
925
+
926
+ # 检查变量名是否有效
927
+ if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', var_name):
928
+ return False, f"第{i+1}个语句的变量名'{var_name}'无效,只能包含字母、数字和下划线,且不能以数字开头"
929
+
930
+ var_name_lower = var_name.lower() # 变量名不区分大小写
931
+
932
+ # 检查变量名是否在后续表达式中使用
933
+ # 这里不需要,因为后面的表达式会检查
934
+
935
+ # 检查变量值中使用的变量是否已经定义
936
+ # 简单检查:提取所有可能的变量名
937
+ used_vars = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', var_value)
938
+ for used_var in used_vars:
939
+ used_var_lower = used_var.lower()
940
+ if used_var_lower not in variables:
941
+ # 检查是否是函数名
942
+ if used_var not in supported_functions:
943
+ # 对于单个字母或简单单词,不自动视为字段名,要求先定义
944
+ if len(used_var) <= 2:
945
+ return False, f"第{i+1}个语句中使用的变量'{used_var}'未在之前定义"
946
+ # 对于较长的字段名,仍然允许作为字段名
947
+ elif not self._is_valid_field(used_var):
948
+ return False, f"第{i+1}个语句中使用的变量'{used_var}'未在之前定义"
949
+
950
+ # 将之前定义的变量替换到当前值中
951
+ for existing_var, existing_val in variables.items():
952
+ # 使用单词边界匹配,避免替换到其他单词的一部分
953
+ var_value = re.sub(rf'\b{existing_var}\b', existing_val, var_value)
954
+
955
+ # 存储变量
956
+ variables[var_name_lower] = var_value
957
+
958
+ # 处理最后一个语句(实际的表达式)
959
+ final_stmt = statements[-1]
960
+
961
+ # 检查最后一个语句是否是赋值语句
962
+ if '=' in final_stmt:
963
+ # 替换比较操作符为临时标记,然后检查是否还有单独的=
964
+ temp_stmt = final_stmt
965
+ for op in ['==', '!=', '<=', '>=']:
966
+ temp_stmt = temp_stmt.replace(op, '---')
967
+
968
+ if '=' in temp_stmt:
969
+ return False, "最后一个语句不能是赋值语句"
970
+
971
+ # 检查最后一个语句中使用的变量是否已经定义
972
+ used_vars = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', final_stmt)
973
+ for used_var in used_vars:
974
+ used_var_lower = used_var.lower()
975
+ if used_var_lower not in variables:
976
+ # 检查是否是函数名
977
+ if used_var not in supported_functions:
978
+ # 在分号表达式中,所有非函数名的标识符都必须是变量,必须在之前定义
979
+ return False, f"最后一个语句中使用的变量'{used_var}'未在之前定义"
980
+
981
+ # 将变量替换到最后一个表达式中
982
+ final_expr = final_stmt
983
+ for var_name, var_value in variables.items():
984
+ final_expr = re.sub(rf'\b{var_name}\b', var_value, final_expr)
985
+
986
+ return True, final_expr
987
+
988
+ def check_expression(self, expression: str) -> Dict[str, Any]:
989
+ """
990
+ 检查表达式格式是否正确
991
+
992
+ Args:
993
+ expression: 要验证的表达式字符串
994
+
995
+ Returns:
996
+ 包含验证结果的字典
997
+ """
998
+ # 重置错误列表
999
+ self.errors = []
1000
+ # Reset unit inference cache for this expression
1001
+ self._unit_cache = {}
1002
+ self._derived_category_cache = {}
1003
+
1004
+ try:
1005
+ expression = expression.strip()
1006
+ if not expression:
1007
+ return {
1008
+ 'valid': False,
1009
+ 'errors': ['表达式不能为空'],
1010
+ 'tokens': [],
1011
+ 'ast': None
1012
+ }
1013
+
1014
+ # 处理带有分号的表达式
1015
+ if ';' in expression:
1016
+ success, result = self._process_semicolon_expression(expression)
1017
+ if not success:
1018
+ return {
1019
+ 'valid': False,
1020
+ 'errors': [result],
1021
+ 'tokens': [],
1022
+ 'ast': None
1023
+ }
1024
+ expression = result
1025
+
1026
+ # 重置词法分析器的行号
1027
+ self.lexer.lineno = 1
1028
+
1029
+ # 词法分析(用于调试)
1030
+ self.lexer.input(expression)
1031
+ tokens = []
1032
+ # 调试:打印识别的标记
1033
+ # print(f"\n调试 - 表达式: {expression}")
1034
+ # print("识别的标记:")
1035
+ for token in self.lexer:
1036
+ # print(f" - 类型: {token.type}, 值: '{token.value}', 位置: {token.lexpos}")
1037
+ tokens.append(token)
1038
+
1039
+ # 重新设置词法分析器的输入,以便语法分析器使用
1040
+ self.lexer.input(expression)
1041
+ self.lexer.lineno = 1
1042
+
1043
+ # 语法分析
1044
+ ast = self.parser.parse(expression, lexer=self.lexer)
1045
+
1046
+ # 验证AST
1047
+ validation_errors = self.validate_ast(ast)
1048
+
1049
+ # 合并所有错误
1050
+ all_errors = self.errors + validation_errors
1051
+
1052
+ # 检查括号是否匹配
1053
+ bracket_count = 0
1054
+ for char in expression:
1055
+ if char == '(':
1056
+ bracket_count += 1
1057
+ elif char == ')':
1058
+ bracket_count -= 1
1059
+ if bracket_count < 0:
1060
+ all_errors.append("括号不匹配: 右括号过多")
1061
+ break
1062
+ if bracket_count > 0:
1063
+ all_errors.append("括号不匹配: 左括号过多")
1064
+
1065
+ return {
1066
+ 'valid': len(all_errors) == 0,
1067
+ 'errors': all_errors,
1068
+ 'tokens': tokens,
1069
+ 'ast': ast
1070
+ }
1071
+ except Exception as e:
1072
+ return {
1073
+ 'valid': False,
1074
+ 'errors': [f"解析错误: {str(e)}"],
1075
+ 'tokens': [],
1076
+ 'ast': None
1077
+ }
1078
+
1079
+
1080
+
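
The largest addition in this release is the PLY-based expression validator shown above. As a rough orientation, the sketch below drives its public entry point, ExpressionValidator.check_expression(), on one well-formed and one malformed expression. This is a minimal sketch, not taken from the package: it assumes the file has been saved locally as validator.py so it can be imported directly, that the ply dependency is installed, and the example expressions themselves are illustrative.

# Minimal usage sketch (assumes the module above is saved as validator.py and ply is installed)
from validator import ExpressionValidator

v = ExpressionValidator()

# Well-formed: nested operators, a group field used inside a group_* operator,
# and the keyword-only optional argument 'dense' passed by name.
ok = v.check_expression(
    "group_rank(ts_decay_linear(ts_mean(close, 20), 5, dense=true), subindustry)"
)
print(ok['valid'], ok['errors'])   # expected: True []

# Malformed: ts_rank declares its optional third argument as keyword-only
# ('constant='), so passing it positionally should be reported as an error.
bad = v.check_expression("ts_rank(close, 20, 4)")
print(bad['valid'])                # expected: False
print(bad['errors'])               # one message asking for the argument to be passed as 'constant='

The rejection in the second call comes from the keyword_only / keyword_only_from flags in supported_functions, which validate_function() uses to decide how many leading arguments may be passed positionally.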