cnhkmcp-2.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. cnhkmcp/__init__.py +125 -0
  2. cnhkmcp/untracked/APP/.gitignore +32 -0
  3. cnhkmcp/untracked/APP/MODULAR_STRUCTURE.md +112 -0
  4. cnhkmcp/untracked/APP/README.md +309 -0
  5. cnhkmcp/untracked/APP/Tranformer/Transformer.py +2192 -0
  6. cnhkmcp/untracked/APP/Tranformer/ace.log +0 -0
  7. cnhkmcp/untracked/APP/Tranformer/ace_lib.py +1489 -0
  8. cnhkmcp/untracked/APP/Tranformer/helpful_functions.py +180 -0
  9. cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates.json +1786 -0
  10. cnhkmcp/untracked/APP/Tranformer/output/Alpha_candidates_/321/207/320/264/342/225/221/321/204/342/225/233/320/233.json +654 -0
  11. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_error.json +261 -0
  12. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_success.json +170 -0
  13. cnhkmcp/untracked/APP/Tranformer/output/Alpha_generated_expressions_/321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/320/237/320/277/321/207/320/253/342/224/244/321/206/320/236/320/265/321/210/342/225/234/342/225/234/321/205/320/225/320/265Machine_lib.json +22 -0
  14. cnhkmcp/untracked/APP/Tranformer/parsetab.py +60 -0
  15. cnhkmcp/untracked/APP/Tranformer/template_summary.txt +408 -0
  16. cnhkmcp/untracked/APP/Tranformer/transformer_config.json +7 -0
  17. cnhkmcp/untracked/APP/Tranformer/validator.py +889 -0
  18. cnhkmcp/untracked/APP/ace.log +65 -0
  19. cnhkmcp/untracked/APP/ace_lib.py +1489 -0
  20. cnhkmcp/untracked/APP/blueprints/__init__.py +6 -0
  21. cnhkmcp/untracked/APP/blueprints/feature_engineering.py +347 -0
  22. cnhkmcp/untracked/APP/blueprints/idea_house.py +221 -0
  23. cnhkmcp/untracked/APP/blueprints/inspiration_house.py +432 -0
  24. cnhkmcp/untracked/APP/blueprints/paper_analysis.py +570 -0
  25. cnhkmcp/untracked/APP/custom_templates/templates.json +1257 -0
  26. cnhkmcp/untracked/APP/give_me_idea/BRAIN_Alpha_Template_Expert_SystemPrompt.md +400 -0
  27. cnhkmcp/untracked/APP/give_me_idea/ace_lib.py +1489 -0
  28. cnhkmcp/untracked/APP/give_me_idea/alpha_data_specific_template_master.py +247 -0
  29. cnhkmcp/untracked/APP/give_me_idea/helpful_functions.py +180 -0
  30. cnhkmcp/untracked/APP/give_me_idea/what_is_Alpha_template.md +11 -0
  31. cnhkmcp/untracked/APP/helpful_functions.py +180 -0
  32. cnhkmcp/untracked/APP/hkSimulator/ace.log +0 -0
  33. cnhkmcp/untracked/APP/hkSimulator/ace_lib.py +1476 -0
  34. cnhkmcp/untracked/APP/hkSimulator/autosim_20251205_145240.log +0 -0
  35. cnhkmcp/untracked/APP/hkSimulator/autosim_20251215_030103.log +0 -0
  36. cnhkmcp/untracked/APP/hkSimulator/autosimulator.py +447 -0
  37. cnhkmcp/untracked/APP/hkSimulator/helpful_functions.py +180 -0
  38. cnhkmcp/untracked/APP/mirror_config.txt +20 -0
  39. cnhkmcp/untracked/APP/operaters.csv +129 -0
  40. cnhkmcp/untracked/APP/requirements.txt +53 -0
  41. cnhkmcp/untracked/APP/run_app.bat +28 -0
  42. cnhkmcp/untracked/APP/run_app.sh +34 -0
  43. cnhkmcp/untracked/APP/setup_tsinghua.bat +39 -0
  44. cnhkmcp/untracked/APP/setup_tsinghua.sh +43 -0
  45. cnhkmcp/untracked/APP/simulator/alpha_submitter.py +404 -0
  46. cnhkmcp/untracked/APP/simulator/simulator_wqb.py +618 -0
  47. cnhkmcp/untracked/APP/ssrn-3332513.pdf +109188 -19
  48. cnhkmcp/untracked/APP/static/brain.js +528 -0
  49. cnhkmcp/untracked/APP/static/decoder.js +1540 -0
  50. cnhkmcp/untracked/APP/static/feature_engineering.js +1729 -0
  51. cnhkmcp/untracked/APP/static/idea_house.js +937 -0
  52. cnhkmcp/untracked/APP/static/inspiration.js +465 -0
  53. cnhkmcp/untracked/APP/static/inspiration_house.js +868 -0
  54. cnhkmcp/untracked/APP/static/paper_analysis.js +390 -0
  55. cnhkmcp/untracked/APP/static/script.js +2942 -0
  56. cnhkmcp/untracked/APP/static/simulator.js +597 -0
  57. cnhkmcp/untracked/APP/static/styles.css +3127 -0
  58. cnhkmcp/untracked/APP/static/usage_widget.js +508 -0
  59. cnhkmcp/untracked/APP/templates/alpha_inspector.html +511 -0
  60. cnhkmcp/untracked/APP/templates/feature_engineering.html +960 -0
  61. cnhkmcp/untracked/APP/templates/idea_house.html +564 -0
  62. cnhkmcp/untracked/APP/templates/index.html +911 -0
  63. cnhkmcp/untracked/APP/templates/inspiration_house.html +861 -0
  64. cnhkmcp/untracked/APP/templates/paper_analysis.html +91 -0
  65. cnhkmcp/untracked/APP/templates/simulator.html +343 -0
  66. cnhkmcp/untracked/APP/templates/transformer_web.html +580 -0
  67. cnhkmcp/untracked/APP/usage.md +351 -0
  68. cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/ace_lib.py +1489 -0
  69. cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/brain_alpha_inspector.py +712 -0
  70. cnhkmcp/untracked/APP//321/207/342/225/235/320/250/321/205/320/230/320/226/321/204/342/225/225/320/220/321/211/320/221/320/243/321/206/320/261/320/265/helpful_functions.py +180 -0
  71. cnhkmcp/untracked/APP//321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +2393 -0
  72. cnhkmcp/untracked/arXiv_API_Tool_Manual.md +490 -0
  73. cnhkmcp/untracked/arxiv_api.py +229 -0
  74. cnhkmcp/untracked/forum_functions.py +998 -0
  75. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/forum_functions.py +407 -0
  76. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/platform_functions.py +2415 -0
  77. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272/user_config.json +31 -0
  78. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272//321/210/320/276/320/271AI/321/210/320/277/342/225/227/321/210/342/224/220/320/251/321/204/342/225/225/320/272/321/206/320/246/320/227/321/206/320/261/320/263/321/206/320/255/320/265/321/205/320/275/320/266/321/204/342/225/235/320/252/321/204/342/225/225/320/233/321/210/342/225/234/342/225/234/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270.md +101 -0
  79. cnhkmcp/untracked/mcp/321/206/320/246/320/227/321/204/342/225/227/342/225/242/321/210/320/276/342/225/221/321/205/320/255/320/253/321/207/320/231/320/2302_/321/205/320/266/320/222/321/206/320/256/320/254/321/205/320/236/320/257/321/207/320/231/320/230/321/205/320/240/320/277/321/205/320/232/320/270/321/204/342/225/225/320/235/321/204/342/225/221/320/226/321/206/342/225/241/320/237/321/210/320/267/320/230/321/205/320/251/320/270/321/205/342/226/221/342/226/222/321/210/320/277/320/245/321/210/342/224/220/320/251/321/204/342/225/225/320/272//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +190 -0
  80. cnhkmcp/untracked/platform_functions.py +2886 -0
  81. cnhkmcp/untracked/sample_mcp_config.json +11 -0
  82. cnhkmcp/untracked/user_config.json +31 -0
  83. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/320/237/320/222/321/210/320/220/320/223/321/206/320/246/320/227/321/206/320/261/320/263_BRAIN_Alpha_Test_Requirements_and_Tips.md +202 -0
  84. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_Alpha_explaination_workflow.md +56 -0
  85. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_BRAIN_6_Tips_Datafield_Exploration_Guide.md +194 -0
  86. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_BRAIN_Alpha_Improvement_Workflow.md +101 -0
  87. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_Dataset_Exploration_Expert_Manual.md +436 -0
  88. cnhkmcp/untracked//321/207/320/264/342/225/221/321/204/342/225/233/320/233/321/205/342/225/226/320/265/321/204/342/225/234/320/254/321/206/342/225/241/320/221_daily_report_workflow.md +128 -0
  89. cnhkmcp/untracked//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +190 -0
  90. cnhkmcp-2.0.1.dist-info/METADATA +187 -0
  91. cnhkmcp-2.0.1.dist-info/RECORD +95 -0
  92. cnhkmcp-2.0.1.dist-info/WHEEL +5 -0
  93. cnhkmcp-2.0.1.dist-info/entry_points.txt +2 -0
  94. cnhkmcp-2.0.1.dist-info/licenses/LICENSE +21 -0
  95. cnhkmcp-2.0.1.dist-info/top_level.txt +1 -0
cnhkmcp/untracked/APP/Tranformer/Transformer.py
@@ -0,0 +1,2192 @@
1
+ import requests
2
+ import json
3
+ import sys
4
+ import asyncio
5
+ import openai
6
+ import re
7
+ from typing import Optional, Union
8
+ try:
9
+ from .validator_hooks import is_valid_template_expr, has_empty_datafield_candidates
10
+ except Exception:
11
+ # Fallback for direct script execution
12
+ try:
13
+ from validator_hooks import is_valid_template_expr, has_empty_datafield_candidates
14
+ except Exception:
15
+ is_valid_template_expr = None
16
+ has_empty_datafield_candidates = None
17
+
18
+ # --- Validation wrappers to integrate into the pipeline ---
19
+ def _filter_valid_templates(
20
+ proposed_templates: dict,
21
+ operators_meta,
22
+ brain_session,
23
+ settings: dict,
24
+ parse_alpha_code_func,
25
+ ):
26
+ """Return dict of only templates that pass validation.
27
+
28
+ Safe no-op if validation helpers are unavailable.
29
+ """
30
+ if not is_valid_template_expr or not parse_alpha_code_func:
31
+ return proposed_templates
32
+ filtered = {}
33
+ for template_expr, template_expl in proposed_templates.items():
34
+ try:
35
+ if is_valid_template_expr(
36
+ template_expr,
37
+ operators_meta,
38
+ brain_session,
39
+ settings,
40
+ parse_alpha_code_func,
41
+ ):
42
+ filtered[template_expr] = template_expl
43
+ except Exception:
44
+ # Be conservative: drop on exceptions
45
+ continue
46
+ return filtered
47
+
48
+
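+ # Illustrative usage of _filter_valid_templates (argument names are assumptions,
+ # mirroring the pipeline below; parse_alpha_code is defined later in this module):
+ #   kept = _filter_valid_templates(llm_templates, operators_meta, session, settings, parse_alpha_code)
+ # Templates that fail validation, or that raise inside the validator, are dropped.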
49
+ def _should_skip_due_to_empty_candidates(populated_info: dict) -> bool:
50
+ """True if any data_field placeholder has zero candidates.
51
+
52
+ Safe no-op fallback when helper is missing.
53
+ """
54
+ if not has_empty_datafield_candidates:
55
+ return False
56
+ try:
57
+ return has_empty_datafield_candidates(populated_info)
58
+ except Exception:
59
+ return False
60
+ import logging
61
+ import pandas as pd
62
+ import os
63
+ from pathlib import Path
64
+ from urllib.parse import urljoin
65
+ import time
66
+ import threading
67
+ import itertools
68
+ import getpass
69
+ import io
70
+ import validator as val
71
+ from ace_lib import get_instrument_type_region_delay
72
+ # Force stdout/stderr to use utf-8 on Windows to avoid UnicodeEncodeError
73
+ if sys.platform.startswith('win'):
74
+ try:
75
+ sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
76
+ sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
77
+ except Exception:
78
+ pass
79
+
80
+ # These variables are set during interactive input
81
+ LLM_model_name = None
82
+ LLM_API_KEY = None
83
+ llm_base_url = None
84
+ username = None
85
+ password = None
86
+ DATA_CATEGORIES = None
87
+
88
+
89
+ template_summary = """# BRAIN论坛Alpha模板精华总结
90
+
91
+ 本文档旨在系统性地整理和总结优秀Alpha模板,它是一种可复用的标准化框架性表达式,它承载着特定的经济逻辑,并预留出若干 “配置项”(包括数据字段、算子、分组方式、衰减规则、中性化方案等),用于生成多个候选阿尔法因子。其典型流程为:数据清洗(数据回填、缩尾处理)→ 跨时间或跨标的维度进行转换 / 对比 → 排序 / 中性化处理 →(可选步骤)衰减调整 / 换手率优化。这种模板模式能够推动系统化的因子挖掘、复用与多元化配置,同时确保每一个因子都具备清晰可追溯的经济逻辑支撑。
92
+ 以下每个模板都附有其核心思想、变量说明、适用场景及原帖链接,方便您理解、应用和进一步探索。
93
+ 使用时请思考如何将下列模板与有的Alpha表达式结合,创造出新的模板来捕捉和发现市场规律,找到”好“公司和”坏“公司
94
+ **使用前请注意:**
95
+ * **过拟合风险**:部分模板可能存在过拟合风险,请谨慎使用,并结合IS-Ladder测试、多市场回测等方法进行验证。
96
+ * **参数调整**:模板中的参数(如时间窗口、数据集字段)需要根据您的具体研究目标和数据特性进行调整。
97
+ * **持续学习**:最好的模板是您自己创造的。希望本文档能激发您的灵感,而不是限制您的思维。
98
+
99
+ ---
100
+
101
+ ## From: Alpha Examples from Learn101
102
+
103
+ ### Momentum after news
104
+ **Hypothesis**: After news is released, if a stock takes a longer time to rise, it may show strong evidence of upward momentum, and it could be beneficial to take a long position in it.
105
+ **Expression**: `ts_backfill(vec_avg(nws12_prez_4l),504)`
106
+ **Settings**: Region: USA, Universe: TOP500, Delay: 1, Decay: 0, Neutralization: INDUSTRY, Truncation: 0.08, Pasteurization: ON
107
+ **Logic-chain deep dive**:
108
+ * **Time-series relativity (Step 4)**: A classic time-series signal. The use of `ts_backfill` implies the news data is sparse (Step 4.2.4) and needs gap-filling to keep the signal continuous.
109
+ * **Operator intent**: `vec_avg` aggregates the multi-dimensional news vector to extract core sentiment/intensity; `ts_backfill` carries the last view forward on no-news days until fresh news arrives.
110
+ **Optimization directions**:
111
+ * **Denoising (Step 0)**: News sentiment can contain extreme noise; consider adding `winsorize` or `rank` after `vec_avg`.
112
+ * **Subordinate signal**: Overlay the `Social Media Effect`. If news sentiment is positive but social-media buzz is low (less noise), scale the weight up; if social media is overheated, a reversal is possible.
113
+ * **Threshold trading (Step 5)**: Trade only when news sentiment deviates markedly from its mean, e.g. `trade_when(abs(zscore(news)) > 1.5, ...)`.
114
+
115
+ ### Pretax Income
116
+ **Hypothesis**: Pretax income is a good measure of a company's financial health and profitability.
117
+ **Expression**: `quantile(ts_rank(pretax_income,250))`
118
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 4, Neutralization: MARKET, Truncation: 0.01, Pasteurization: ON
119
+ **Logic-chain deep dive**:
120
+ * **Time-series relativity (Step 4)**: `ts_rank(..., 250)` compares current income against the past year, looking for "self-improvement" rather than "absolutely high income".
121
+ * **Distribution reshaping (Step 0)**: `quantile` forces the signal into a uniform distribution, removing the influence of extremes and keeping only relative order.
122
+ **Optimization directions**:
123
+ * **Range optimization (Step 2)**: Tiny income changes may be pure noise. Consider `ts_zscore` and trading only when it is >1 or <-1.
124
+ * **Subordinate signal**: Bring in `market_cap`. A new income high for a large cap may be more robust than for a small cap (quality premium).
125
+
126
+ ### Operating Earnings Yield
127
+ **Hypothesis**: If the operating income of a company is currently higher than its past 1 year history, buy the company's stock and vice-versa.
128
+ **Expression**: `ts_rank(operating_income,252)`
129
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 0, Neutralization: SUBINDUSTRY, Truncation: 0.08, Pasteurization: ON
130
+ **Logic-chain deep dive**:
131
+ * **Time-series relativity (Step 4)**: Pure time-series momentum logic. `ts_rank` maps the current value to its historical percentile, capturing an "improving results" trend.
132
+ **Optimization directions**:
133
+ * **Within-group comparison (Step 3)**: Account for industry cyclicality. Apply `group_zscore(operating_income, industry)` before `ts_rank` to strip out the industry cycle and keep only the stock's improvement relative to its industry.
134
+ * **Threshold (Step 5)**: `trade_when(ts_rank > 0.8, ...)` to go long only on stocks with clearly improving results.
135
+
136
+ ### Appreciation of liabilities
137
+ **Hypothesis**: An increase in the fair value of liabilities could indicate a higher cost than expected.
138
+ **Expression**: `-ts_rank(fn_liab_fair_val_l1_a,252)`
139
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 0, Neutralization: SUBINDUSTRY, Truncation: 0.08, Pasteurization: ON
140
+ **Logic-chain deep dive**:
141
+ * **Inverted signal**: The minus sign `-` marks this as a contrarian indicator (rising liabilities are bad news).
142
+ * **Time-series relativity**: Again based on `ts_rank`, tracking how fast liabilities grow relative to their own history.
143
+ **Optimization directions**:
144
+ * **Denoising**: Liability data can jump; apply `winsorize` first.
145
+ * **Subordinate signal**: Combine with `cash_flow`. If liabilities rise but cash flow also rises sharply (benign leverage), do not short.
146
+
147
+ ### Deferred Revenue
148
+ **Hypothesis**: Firms with high deferred revenue will surprise the market in the future when the deferred revenue is recognized.
149
+ **Expression**: `ts_backfill(fnd6_drc, 252)/assets`
150
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 0, Neutralization: SECTOR, Truncation: 1, Pasteurization: ON
151
+ **Logic-chain deep dive**:
152
+ * **Cross-sectional comparison (Step 3)**: Dividing by `assets` is a size adjustment that makes the value comparable across the cross-section.
153
+ * **Gap filling (Step 0)**: `ts_backfill` handles the low update frequency of financial-report data.
154
+ **Optimization directions**:
155
+ * **Industry neutrality (Step 3)**: Deferred revenue is common in software/services and rare in manufacturing. You must `group_zscore(..., sector)` or `neutralize`, otherwise you are simply long particular industries.
156
+ * **Time-series change (Step 4)**: Track the *growth rate* of deferred revenue with `ts_delta`, not just its level.
157
+
158
+ ### Reducing debt
159
+ **Hypothesis**: Take a long position in companies whose debt has decreased compared to the past.
160
+ **Expression**: `-ts_quantile(debt, 126)`
161
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 0, Neutralization: MARKET, Truncation: 0.01, Pasteurization: ON
162
+ **Logic-chain deep dive**:
163
+ * **Time-series relativity**: `ts_quantile`, like `ts_rank`, captures the debt-reduction trend.
164
+ **Optimization directions**:
165
+ * **Subordinate signal**: Combine with `interest_coverage`. Debt reduction matters most for companies with weak debt-service capacity (a distressed-reversal logic).
166
+
167
+ ### Power of leverage
168
+ **Hypothesis**: Companies with high liability-to-asset ratios often leverage debt as a strategic tool.
169
+ **Expression**: `liabilities/assets`
170
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 0, Neutralization: MARKET, Truncation: 0.01, Pasteurization: ON
171
+ **Logic-chain deep dive**:
172
+ * **Cross-sectional comparison (Step 3)**: A classic cross-sectional factor (the leverage ratio).
173
+ **Optimization directions**:
174
+ * **Nonlinearity (Step 1)**: Leverage usually has an inverted-U relationship (moderate leverage good, excessive leverage bad). Consider `bucket` segmentation, or `trade_when` to exclude extreme leverage.
175
+ * **Industry neutrality**: Banks/real estate are inherently highly levered; industry neutralization is mandatory.
176
+
177
+ ## From: Alpha Examples from Learn102
178
+
179
+ ### Social Media Effect
180
+ **Hypothesis**: Poorly performing stocks are discussed more in general on social media platforms.
181
+ **Expression**: `-scl12_buzz`
182
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 0, Neutralization: INDUSTRY, Truncation: 0.01, Pasteurization: ON
183
+ **Logic-chain deep dive**:
184
+ * **Contrarian indicator**: The minus sign implies "high attention = bad news" (possibly a firm mired in negative headlines).
185
+ * **Raw signal**: `buzz` is used directly, assuming a linear relationship.
186
+ **Optimization directions**:
187
+ * **Denoising (Step 0)**: Social-media data has many extremes; `log` or `winsorize` is essential.
188
+ * **Subordinate signal**: Combine with `sentiment`. High attention with positive sentiment may be good news; high attention with negative sentiment is the real shorting opportunity.
189
+ * **Threshold**: `trade_when(rank(buzz) > 0.9, ...)` to short only at extreme popularity.
190
+
191
+ ### Valuation Disconnect Swing Short
192
+ **Hypothesis**: A stock with high momentum and value score correlation suggests a disconnect between the stock's price and its intrinsic value.
193
+ **Expression**: `-ts_corr(ts_backfill(fscore_momentum,66),ts_backfill(fscore_value,66),756)`
194
+ **Settings**: Region: USA, Universe: TOP200, Delay: 1, Decay: 0, Neutralization: INDUSTRY, Truncation: 0.08, Pasteurization: ON
195
+ **Logic-chain deep dive**:
196
+ * **Higher-order statistic**: `ts_corr` captures the dynamic relationship between two factors rather than the factors themselves.
197
+ * **Logic**: High momentum-value correlation means price has detached from fundamentals (a bubble), hence the short (minus sign).
198
+ **Optimization directions**:
199
+ * **Window tuning**: 756 days (3 years) is very long and captures structural change. Try a shorter window (e.g., 126 days) to catch short-term divergence.
200
+
201
+ ### Network Dependence
202
+ **Hypothesis**: Long stocks of companies whose hub score of customers are low over the past two years.
203
+ **Expression**: `-ts_mean(pv13_ustomergraphrank_hub_rank,504)`
204
+ **Settings**: Region: USA, Universe: TOP1000, Delay: 1, Decay: 0, Neutralization: INDUSTRY, Truncation: 0.08, Pasteurization: ON
205
+ **Logic-chain deep dive**:
206
+ * **Supply-chain logic**: Overly high customer concentration/centrality can signal risk (dependence on large customers).
207
+ * **Smoothing (Step 4)**: `ts_mean(..., 504)` marks this as a very slow variable focused on long-term structure.
208
+ **Optimization directions**:
209
+ * **Subordinate signal**: Combine with `volatility`. High dependence + high volatility = extremely dangerous.
210
+
211
+ ## From: Alpha Examples from Learn103
212
+
213
+ ### News-driven Volatility
214
+ **Hypothesis**: Stocks of companies that face high differences in their prices after any news release can be subject to varying sentiments.
215
+ **Expression**: `(ts_arg_max(ts_backfill(news_session_range, 20), 60))`
216
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 0, Neutralization: SECTOR, Truncation: 0.08, Pasteurization: ON
217
+ **Logic-chain deep dive**:
218
+ * **Event driven (Step 4.2.3)**: `ts_arg_max` finds the day with the largest swing (the news day) within the past 60 days.
219
+ * **Operator intent**: The signal is not volatility itself, but "time elapsed since the largest swing".
220
+ **Optimization directions**:
221
+ * **Decay logic**: Combine with `days_from_last_change` or `exp_decay` so the signal fades over time.
222
+ * **Subordinate signal**: Overlay `IV Skew`. If volatility is high and skew is bearish, go short; if skew is bullish, go long.
223
+
224
+ ### Implied Volatility Spread as a predictor
225
+ **Hypothesis**: If the Call Open interest is higher than the Put Open interest, the stock may rise based on the intensity of the implied volatility spread.
226
+ **Expression**: `trade_when(pcr_oi_270 < 1, (implied_volatility_call_270-implied_volatility_put_270), -1)`
227
+ **Settings**: Region: USA, Universe: TOP3000, Delay: 1, Decay: 4, Neutralization: MARKET, Truncation: 0.08, Pasteurization: ON
228
+ **Logic-chain deep dive**:
229
+ * **Threshold trading (Step 5)**: `trade_when(pcr_oi < 1, ...)` is classic gating logic: the IV-spread signal is only used when call open interest exceeds put open interest (bullish positioning).
230
+ * **Conditional branch**: When the condition fails, the output is `-1` (short), an aggressive binary strategy.
231
+ **Optimization directions**:
232
+ * **Smoothing**: IV data is jumpy; apply `ts_mean` or `ts_decay_linear` to the spread.
233
+
234
+ ## Template Highlights from the Paper *151 Trading Strategies*
235
+
236
+ This part distills *151 Trading Strategies* by Zura Kakushadze and Juan Andrés Serur, extracting the equity strategies applicable to the BRAIN platform and generalizing them into reusable Alpha templates.
237
+
238
+ ---
239
+
240
+ ### 1. Risk-Adjusted Momentum Template
241
+
242
+ * **Template expression**: `ts_mean(ts_delay(returns, <skip_period>), <lookback_period>) / ts_std_dev(ts_delay(returns, <skip_period>), <lookback_period>)`
243
+ * **Core idea**: An improvement over the classic momentum factor. It computes a "time-series Sharpe ratio" over the lookback window: mean return divided by return volatility. Meanwhile, `ts_delay` skips the most recent stretch (skip_period, typically 21 days / 1 month) to avoid contamination by short-term reversal. The factor seeks "high-quality" momentum that is persistent and smooth (a pandas sketch follows this section).
244
+ * **Variables**:
245
+ * `<skip_period>`: Number of recent trading days to skip, e.g. `21`.
246
+ * `<lookback_period>`: Lookback window for the momentum, e.g. `252`.
247
+ * **Applicable scenarios**: Broadly applicable; suited to building robust momentum Alphas.
248
+ * **Logic-chain deep dive**:
249
+ * **Time-series standardization (Step 4)**: Numerator is mean return, denominator is volatility; essentially a rolling Sharpe ratio.
250
+ * **Denoising (Step 0)**: `ts_delay` skips the latest month, removing short-term reversal noise and keeping medium- to long-term momentum.
251
+ * **Optimization directions**:
252
+ * **Subordinate signal**: Overlay `turnover`. Momentum is more reliable at low turnover (price-volume confirmation).
253
+ * **Residualization**: First apply `regression_neut` to returns to strip the market, then compute pure idiosyncratic momentum.
254
+ * **Adapted from**: Section 3.1, "Price-momentum", `Rrisk.adj`
255
+
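+ A minimal pandas sketch of this template (an illustration, not BRAIN syntax; assumes `returns` is a date-by-stock DataFrame of daily returns):
+
+ ```python
+ import pandas as pd
+
+ def risk_adjusted_momentum(returns: pd.DataFrame, skip: int = 21, lookback: int = 252) -> pd.DataFrame:
+     """Rolling Sharpe-style momentum: mean/std of returns, skipping the most recent `skip` days."""
+     delayed = returns.shift(skip)              # ts_delay(returns, skip)
+     mu = delayed.rolling(lookback).mean()      # ts_mean(..., lookback)
+     sigma = delayed.rolling(lookback).std()    # ts_std_dev(..., lookback)
+     return mu / sigma
+ ```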
256
+ ### 2. Standardized Unexpected Earnings Template (SUE)
257
+
258
+ * **Template expression**: `(fnd_eps_q - ts_delay(fnd_eps_q, 4)) / ts_std_dev(fnd_eps_q - ts_delay(fnd_eps_q, 4), 8)`
259
+ * **Core idea**: Capture better-than-expected earnings growth. It takes the latest quarterly EPS minus the same quarter last year, standardized by the volatility of that increment over the past 8 quarters. A higher SUE means a bigger earnings surprise; this is the classic earnings-momentum factor (a pandas sketch follows this section).
260
+ * **Variables**:
261
+ * `fnd_eps_q`: Quarterly earnings-per-share (EPS) field.
262
+ * **Applicable scenarios**: `Fundamental` datasets, for event-driven Alphas.
263
+ * **Logic-chain deep dive**:
264
+ * **Seasonal adjustment**: `ts_delay(..., 4)` compares year-over-year quarters, removing seasonality.
265
+ * **Volatility standardization (Step 0)**: Dividing by the 8-quarter volatility converts the "surprise" into standard-deviation units (a z-score), comparable across firms with different volatility.
266
+ * **Optimization directions**:
267
+ * **Event decay (Step 4)**: Overlay `days_from_last_change` so the SUE signal decays with time since the report date.
268
+ * **Subordinate signal**: Overlay `Analyst Revision`. High SUE plus upward analyst revisions strengthens the signal.
269
+ * **Adapted from**: Section 3.2, "Earnings-momentum", SUE
270
+
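+ A minimal pandas sketch of SUE (illustrative, not BRAIN syntax; assumes `eps` is a quarter-by-stock DataFrame of quarterly EPS):
+
+ ```python
+ import pandas as pd
+
+ def sue(eps: pd.DataFrame, year_lag: int = 4, vol_window: int = 8) -> pd.DataFrame:
+     """Year-over-year EPS change, scaled by its own 8-quarter volatility."""
+     surprise = eps - eps.shift(year_lag)       # EPS vs. the same quarter last year
+     return surprise / surprise.rolling(vol_window).std()
+ ```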
271
+
272
+ ### 4. Implied Volatility Skew Momentum Template
273
+
274
+ * **Template expression**: `ts_delta(implied_volatility_call_<window>, <period>) - ts_delta(implied_volatility_put_<window>, <period>)`
275
+ * **Core idea**: Capture shifts in market sentiment. Rising call IV is typically associated with optimism, while rising put IV reflects pessimism or hedging demand. The template takes the change in call IV minus the change in put IV, going long improving sentiment and short deteriorating sentiment.
276
+ * **Variables**:
277
+ * `implied_volatility_call_<window>`: Call-option implied volatility for various tenors.
278
+ * `implied_volatility_put_<window>`: Put-option implied volatility for various tenors.
279
+ * `<period>`: Window for the IV change, e.g. `21` (monthly change).
280
+ * **Applicable scenarios**: `Option` datasets; for short- to medium-term sentiment shifts.
281
+ * **Logic-chain deep dive**:
282
+ * **Time-series change (Step 4)**: The focus is the *change* in IV (`ts_delta`), not its level.
283
+ * **Sentiment differential**: Call IV rising faster than put IV -> improving sentiment.
284
+ * **Optimization directions**:
285
+ * **Threshold**: `trade_when(abs(skew_delta) > threshold, ...)` to trade only on sharp sentiment shifts.
286
+ * **Event driven**: The strategy may fail ahead of earnings (when IV is elevated); filter with `days_to_earnings`.
287
+ * **Adapted from**: Section 3.5, "Implied volatility"
288
+
289
+ ### 5. Residual Momentum Template
290
+
291
+ * **Template expression**: `ts_mean(regression_neut(regression_neut(regression_neut(returns, <factor_1/>), <factor_2/>), <factor_3/>), <window/>)`
292
+ * **Core idea**: Purify the momentum signal. Raw momentum can carry exposure to market beta, size, value, and other factors. This template strips out everything explainable by common factors via successive neutralizations (e.g., `regression_neut` against `<factor_1/>`, `<factor_2/>`, `<factor_3/>` in turn), then computes momentum only on the unexplained "residual-equivalent" part.
293
+ * **Variables**:
294
+ * `<factor_1/>`, `<factor_2/>`, `<factor_3/>`: Common market factors, e.g. `mkt_beta`, `size_factor`, `value_factor`.
295
+ * `<window/>`: Window for the residual momentum.
296
+ * **Applicable scenarios**: Broadly applicable; a key step in factor purification and high-quality Alpha construction.
297
+ * **Logic-chain deep dive**:
298
+ * **Purification (Step 0)**: Successive `regression_neut` strips beta, size, value, and other style exposures.
299
+ * **Time-series momentum**: `ts_mean` over the stripped residual.
300
+ * **Optimization directions**:
301
+ * **Weighting**: Use `ts_decay_linear` instead of `ts_mean` to overweight recent residuals.
302
+ * **Within-group comparison**: Apply `group_rank` on the residual to find the strongest idiosyncratic momentum within each industry.
303
+ * **Adapted from**: Section 3.7, "Residual momentum"
304
+
305
+ ### 6. Weighted Regression Mean-Reversion Template
306
+
307
+ * **Template expression**: `reverse(regression_neut(multiply(returns, power(inverse(ts_std_dev(returns, <window/>)), 2)), <group_matrix/>))`
308
+ * **Core idea**: An enhancement of standard industry-neutral mean reversion. When neutralizing returns by industry, it assigns different weights to different stocks: lower-volatility stocks get higher weight, on the view that their return data is more "reliable" and should count more toward the industry mean.
309
+ * **Variables**:
310
+ * `<group_matrix>`: Dummy matrix of industries or groups.
311
+ * `weights`: Regression weights, usually a reliability measure such as `1/variance`.
312
+ * `<window>`: Window for the volatility estimate.
313
+ * **Applicable scenarios**: Any within-group neutralization or regression stripping, especially when signal quality or volatility varies widely within a group.
314
+ * **Logic-chain deep dive**:
315
+ * **Weighted least squares (WLS)**: `1/variance` weights treat low-volatility stocks as more informative.
316
+ * **Mean reversion**: `reverse` bets on reversal of the residual.
317
+ * **Optimization directions**:
318
+ * **Subordinate signal**: Add `liquidity` weights; liquid stocks revert faster.
319
+ * **Adapted from**: Section 3.10, "Mean-reversion – weighted regression"
320
+
321
+ ### 7. Moving Average Crossover Template
322
+
323
+ * **Template expression**: `sign(ts_mean(<price/>, <short_window>) - ts_mean(<price/>, <long_window>))`
324
+ * **Core idea**: The classic trend-following strategy. When the short moving average crosses above the long one (a "golden cross"), the short-term trend is strengthening: a buy signal. When it crosses below (a "death cross"), the trend is weakening: a sell signal (a pandas sketch follows this section).
325
+ * **Variables**:
326
+ * `<price/>`: A price field such as `close` or `vwap`.
327
+ * `<short_window>`: Short MA window, e.g. `10`, `20`.
328
+ * `<long_window>`: Long MA window, e.g. `50`, `100`.
329
+ * **Applicable scenarios**: Strongly trending markets or assets.
330
+ * **Logic-chain deep dive**:
331
+ * **Low-pass filter**: An MA essentially filters out high-frequency noise.
332
+ * **Binary signal**: `sign` outputs +1/-1, discarding magnitude information.
333
+ * **Optimization directions**:
334
+ * **Continuous version (Step 1)**: Drop `sign` and use the standardized difference (`zscore`) to keep magnitude information.
335
+ * **Subordinate signal**: Combine with `ADX` (trend strength); use the MA crossover only when the trend is strong.
336
+ * **Adapted from**: Section 3.12, "Two moving averages"
337
+
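+ A minimal pandas sketch of the crossover (illustrative; assumes `price` is a date-by-stock DataFrame):
+
+ ```python
+ import numpy as np
+ import pandas as pd
+
+ def ma_crossover(price: pd.DataFrame, short_window: int = 20, long_window: int = 100) -> pd.DataFrame:
+     """+1 where the short MA is above the long MA, -1 where below."""
+     fast = price.rolling(short_window).mean()
+     slow = price.rolling(long_window).mean()
+     return np.sign(fast - slow)
+ ```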
338
+
339
+
340
+ ### 9. Channel Breakout Template
341
+
342
+ * **Template expression**: `alpha = if_else(greater(close, ts_max(high, <window/>)), 1, if_else(less(close, ts_min(low, <window/>)), -1, 0)); reverse(alpha)`
343
+ * **Core idea**: A classic reversal strategy. It defines a price channel from the past N-day high and low. A break above the upper band signals an overheated market and a sell (-1); a break below the lower band signals an oversold market and a buy (+1).
344
+ * **Variables**:
345
+ * `<window>`: Window defining the channel, e.g. `20`.
346
+ * **Applicable scenarios**: Markets or stocks with mean-reverting behavior.
347
+ * **Logic-chain deep dive**:
348
+ * **Range breakout (Step 2)**: Classic "trade only in the tails" logic; the middle of the range maps to 0.
349
+ * **Reversal logic**: `reverse` bets that the breakout is a false breakout.
350
+ * **Optimization directions**:
351
+ * **Trend/reversal switching**: Condition on `volatility`: fade breakouts in low vol (false breakouts), follow them in high vol (true breakouts).
352
+ * **Adapted from**: Section 3.15, "Channel"
353
+
354
+
355
+ ### 11. Basic Value Factor Template
356
+
357
+ * **Template expression**: `group_rank(<book_value/> / <market_cap/>)`
358
+ * **Core idea**: The classic value strategy: buy "value" stocks whose book value is cheap relative to market value, and sell richly priced "growth" stocks. The core measure is the book-to-price (book-to-market) ratio.
359
+ * **Variables**:
360
+ * `<book_value/>`: Book value or book value per share field.
361
+ * `<market_cap/>`: Market cap or closing-price field.
362
+ * **Applicable scenarios**: `Fundamental` datasets; a building block for multi-factor models.
363
+ * **Logic-chain deep dive**:
364
+ * **Within-group comparison (Step 3)**: Value is not comparable across industries (e.g., tech vs. banks); `group_rank` is required.
365
+ * **Optimization directions**:
366
+ * **Denoising**: `winsorize` before `group_rank`.
367
+ * **Subordinate signal**: Overlay `Quality` (ROE) to avoid "value traps" (cheap but bad companies).
368
+ * **Adapted from**: Section 3.3, "Value"
369
+
370
+
371
+
372
+ ### 13. Pairs Trading Mean-Reversion Framework
373
+
374
+ * **Template expression**: `signal_A = (close_A - close_B) - ts_mean(close_A - close_B, <window>); reverse(signal_A)`
375
+ * **Core idea**: Find two historically highly correlated stocks (a "pair") and arbitrage when their spread deviates from its historical mean: short the rich leg, long the cheap leg, betting the spread reverts. A classic statistical-arbitrage, mean-reversion strategy (a pandas sketch follows this section).
376
+ * **Variables**:
377
+ * `close_A`, `close_B`: Price series of stocks A and B.
378
+ * `<window>`: Window for the historical mean of the spread.
379
+ * **Applicable scenarios**: Highly similar companies within the same industry; a foundation for market-neutral strategies.
380
+ * **Logic-chain deep dive**:
381
+ * **Cointegration**: Construct a stationary `Spread` series.
382
+ * **Mean reversion**: Bet the spread reverts to its mean.
383
+ * **Optimization directions**:
384
+ * **Dynamic thresholds**: Use `ts_std_dev(Spread)` to set entry bands (e.g., 2 standard deviations).
385
+ * **Stop-loss**: Add `trade_when(abs(Spread) > 4*std, 0, ...)` to guard against cointegration breakdown.
386
+ * **Adapted from**: Section 3.8, "Pairs trading"
387
+
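+ A minimal pandas sketch of the pair spread with z-score entry bands (illustrative; assumes `close_a` and `close_b` are aligned price Series):
+
+ ```python
+ import pandas as pd
+
+ def pair_signal(close_a: pd.Series, close_b: pd.Series, window: int = 60, entry: float = 2.0) -> pd.Series:
+     """Short the spread when rich, long when cheap, flat inside the bands."""
+     spread = close_a - close_b
+     z = (spread - spread.rolling(window).mean()) / spread.rolling(window).std()
+     return -z.where(z.abs() > entry, 0.0)      # trade only outside +/- `entry` std devs
+ ```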
388
+ ---
389
+
390
+ ## Supplementary Templates
391
+
392
+ ### A. Analyst Cross-Grouping Base (template name: 示例)
393
+ * **Core structure**: `financial_data = ts_backfill(<vec_func/>(<analyst_metric/>), 60); gp = group_cartesian_product(country, industry); <ts_operator/>(<group_operator/>(financial_data, gp), <window/>)`
394
+ * **Idea**: Vector-aggregate the analyst field first (`vec_avg`, `vec_kurtosis`, `vec_ir`, etc.), build country x industry groups with `group_cartesian_product`, then apply within-group standardization/neutralization plus time-series processing to form a stable cross-sectional signal.
395
+ * **Variable notes**: `analyst_metric` covers Analyst/SmartEstimate fields such as `mdl26_*` and `star_arm_*`; `vec_func` chooses the aggregation; `group_operator` scales or neutralizes within industry/country groups; `ts_operator` smooths over time (`ts_mean`, `ts_zscore`, etc.); `window` takes values in 20/60/90/200.
396
+ * **Applicable scenarios**: Analyst-sentiment and estimate-revision themes that want robust cross-sectional signals under country + industry grouping.
397
+ * **Logic-chain deep dive**:
398
+ * **Gap filling (Step 0)**: Analyst data is sparse; `ts_backfill` is essential.
399
+ * **Fine-grained grouping (Step 3)**: `group_cartesian_product` delivers country x industry neutralization, well suited to global strategies.
400
+ * **Optimization directions**:
401
+ * **Operator choice**: `vec_ir` (information ratio) reflects analyst consistency better than `vec_avg`.
402
+
403
+ ### B. Double Neutralization (template name: 双重中性化:以Analyst15为例)
404
+ * **Core structure**: As above: first `ts_backfill(vec_func(<Analyst15 field>), 60)`, then group by country x industry for within-group neutralization and time-series processing.
405
+ * **Idea**: For `anl15_*` growth/valuation/dividend fields, neutralize twice at the cross-section (after vector aggregation, then within groups) to strip shared industry/country exposure.
406
+ * **Variable notes**: The `anl15_*` set covers multi-horizon growth rates, PE, valuation, dividends, etc.; the choice of `vec_func` and `ts_operator` sets the smoothness; windows of 60-200 are recommended for stable backfilling.
407
+ * **Applicable scenarios**: Analyst15 estimate-revision and revaluation signals that must absorb both country and industry noise.
408
+ * **Logic-chain deep dive**:
409
+ * **Multiple stripping**: Removes style exposure thoroughly, in pursuit of pure Alpha.
410
+ * **Optimization directions**:
411
+ * **Ordering**: Industry neutralization first, then country, usually fits the fundamental logic better.
412
+
413
+ ### C. Between-Group Comparison (template name: 组间比较_GLB_topdiv)
414
+ * **Core structure**: Within `country x <group1/>` groups, apply `ts_zscore` and `group_zscore` to the backfilled vector aggregate, compute group statistics (`group_min/median/max/sum/count`), take the between-group residual `resid = <compare/>(alpha, alpha_gpm)`, then finish with within-group and time-series processing.
415
+ * **Idea**: Strip out the relative strength between peer groups (industry/subindustry/exchange) to get a "relative to group mean" residual signal, suited to cross-group Alpha mining.
416
+ * **Variable notes**: `analyst_field` comes from `fnd8_*` fundamental/cash-flow fields; `vec_op` can be `vec_max/avg/min`; `compare` can be `regression_neut` or `signed_power` to extract the residual; `t_window` of 20/60/200/600 controls smoothing and stability.
417
+ * **Applicable scenarios**: Relative-value comparison of GLB dividend/cash-flow factors (topdiv) in a country + industry framework; strategies focused on cross-group differences.
418
+ * **Logic-chain deep dive**:
419
+ * **Relative value**: The question is "am I strong within my group", not "what is my absolute level".
420
+ * **Optimization directions**:
421
+ * **Nonlinearity**: Compute the residual on `rank`s instead of raw values for robustness to outliers.
422
+
423
+ ### D. Between-Group Comparison, Analyst15 Version (template name: 组间比较_glb_topdiv_anl15)
424
+ * **Core structure**: Same as the previous template, with `analyst_field` replaced by the `anl15_*` growth/valuation/dividend fields.
425
+ * **Idea**: Model the between-group residual of Analyst15 growth and valuation estimates to capture relative over/under-valuation and estimate revisions at the industry/country level.
426
+ * **Variable notes**: `group1` can be industry/subindustry/sector/exchange; `compare` and `group_stats` as above; `ts_op` and `group_op` re-standardize and smooth after the residual.
427
+ * **Applicable scenarios**: Global (GLB) between-group relative-value or momentum signals built on Analyst15 estimate data.
428
+ * **Logic-chain deep dive**:
429
+ * **Expectation gap**: Find the stocks analysts under- or over-rate within each industry.
430
+ * **Optimization directions**:
431
+ * **Time-series overlay**: Combine with `ts_delta` to find the stocks whose estimates are improving fastest within their industry.
432
+
433
+ ### E. Advisor Analysis Example (template name: 顾问分析示例)
434
+ * **Core structure**: `financial_data = ts_backfill(<mixdata/>, 90); gp = industry; <ts_operator/>(<group_operator/>(financial_data, gp), <window/>)`
435
+ * **Idea**: Backfill the `anl69_*` fields over 90 days, standardize within industry groups, then smooth over time for a simple industry-neutral signal.
436
+ * **Variable notes**: `mixdata` covers `anl69_*` EPS/EBIT/cash dividends/target price/report dates, etc.; `ts_operator` can be `ts_zscore`, `ts_scale`, `ts_rank`, etc.; `window` offers 60/120/220/600 to tune the frequency.
437
+ * **Applicable scenarios**: Industry-relative estimate tracking and report-cadence/guidance monitoring driven by Analyst69 data.
438
+ * **Logic-chain deep dive**:
439
+ * **Standard pipeline**: Backfill -> cross-sectional standardization -> time-series smoothing: the standard three moves for building a robust factor.
440
+ * **Optimization directions**:
441
+ * **Event driven**: Shorten the `ts_mean` window around report dates to increase sensitivity.
442
+
443
+ ---
444
+
445
+ ## New Templates (CAPM and Valuation, Analyst Horizons, Options, Search Optimization)
446
+
447
+ ### 1. CAPM Residual Template (Market/Industry-Neutral Returns)
448
+ * **Expression**: `ts_regression(returns, group_mean(returns, log(ts_mean(cap,21)), sector), 252, rettype=0)`.
449
+ * **Core idea**: Regress out market/industry exposure and keep the excess-return residual as the Alpha (a numpy sketch of the residual idea follows this section).
450
+ * **Applicable scenarios**: A general-purpose opener; the regression residual can serve as the base layer for subsequent momentum or value signals.
451
+ * **Optimization**: Switch to `rettype=2` to get the beta slope for risk ranking or low/high-beta portfolios; add `winsorize` and `ts_backfill` preprocessing.
452
+
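+ A minimal numpy sketch of the residual idea (illustrative; a single-window OLS rather than BRAIN's rolling `ts_regression`):
+
+ ```python
+ import numpy as np
+
+ def residual(y: np.ndarray, x: np.ndarray) -> np.ndarray:
+     """Residual of y regressed on x (with intercept): the part x cannot explain."""
+     X = np.column_stack([np.ones_like(x), x])
+     beta, *_ = np.linalg.lstsq(X, y, rcond=None)
+     return y - X @ beta
+ ```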
453
+ ### 2. Generalized CAPM Residual (Any Feature)
454
+ * **Expression**: `data = winsorize(ts_backfill(<data>,63), std=4); gpm = group_mean(data, log(ts_mean(cap,21)), sector); resid = ts_regression(data, gpm, 252, rettype=0)`.
455
+ * **Core idea**: Remove the group-mean component from any feature, extracting its industry-relative idiosyncratic part.
456
+ * **Applicable scenarios**: Within-group residual purification of fundamental, sentiment, and alternative data.
457
+ * **Optimization**: `group_zscore` before the regression; smooth `resid` further with `ts_zscore` or `ts_mean`.
458
+
459
+ ### 3. CAPM Beta Ranking Template
460
+ * **Expression**: `target_data = winsorize(ts_backfill(<target>,63), std=4); market_data = winsorize(ts_backfill(<market>,63), std=4); beta = ts_regression(target_data, group_mean(market_data, log(ts_mean(cap,21)), sector), 252, rettype=2)`.
461
+ * **Core idea**: Extract the industry-relative beta as a risk/defensiveness ranking; low beta is defensive, high beta aggressive.
462
+ * **Optimization**: Group by industry or country; bucket by beta to go long low / short high, or invert for high-volatility swing trades.
463
+
464
+ ### 4. Actual-vs-Estimate Gap Template (Analyst Surprise)
465
+ * **Expression**: `group_zscore(subtract(group_zscore(<act>, industry), group_zscore(<est>, industry)), industry)`.
466
+ * **Core idea**: The industry-standardized gap between actual and estimated values captures positive or negative surprises.
467
+ * **Applicable scenarios**: analyst7/analyst14/earnings valuation-type fields.
468
+ * **Optimization**: Apply `ts_zscore` to the gap; with threshold trading, only open positions when |z|>1.5.
469
+
470
+ ### 5. Analyst Term-Structure Template (Near-vs-Far Estimate Slope)
471
+ * **Expression**: `group_zscore(subtract(group_zscore(anl14_mean_eps_<p1>, industry), group_zscore(anl14_mean_eps_<p2>, industry)), industry)`, where `<p1>/<p2>` are fp1/fp2/fy1/fy2, etc.
472
+ * **Core idea**: Compare the industry-relative slope between short- and long-horizon estimates to capture accelerating or fading expectations.
473
+ * **Applicable scenarios**: analyst14/15 horizon fields; suited to growth/inflection mining.
474
+ * **Optimization**: Extend to multi-horizon differences or track the slope change with `ts_delta`; `rank` or `winsorize` the slope.
475
+
476
+ ### 6. Option Greeks Net-Value Template
477
+ * **Expression**: `group_operator(<put_greek> - <call_greek>, <group>)`, where the Greek can be Delta/Gamma/Vega/Theta.
478
+ * **Core idea**: The within-group gap between bullish and bearish option sensitivities reflects implied sentiment or convexity differences.
479
+ * **Applicable scenarios**: Option datasets; sentiment/volatility signals under industry or market-cap grouping.
480
+ * **Optimization**: Weighted combinations of multiple Greeks; smooth the net value with `ts_mean`; down-weight or filter around events (earnings).
481
+
482
+ ### 7. IV Skew Momentum Extension
483
+ * **Expression**: `ts_delta(implied_volatility_call_<w>, <p>) - ts_delta(implied_volatility_put_<w>, <p>)`.
484
+ * **Core idea**: The gap between call and put IV changes catches sentiment turns; go long improving and short deteriorating sentiment.
485
+ * **Optimization**: Add a `trade_when(abs(skew)>thr)` threshold; shrink the window around earnings; industry-neutralize.
486
+
487
+ ### 8. Residual Momentum, Lightweight Version
488
+ * **Expression**: `res = regression_neut(returns, <common_factor_matrix>); ts_mean(res, <window>)`.
489
+ * **Core idea**: Strip market/style exposure first, then take momentum on the idiosyncratic return; lighter than the original multi-pass regression.
490
+ * **Optimization**: Use `ts_decay_linear` to overweight recent data; `group_rank` within industries for cross-sectional stability.
491
+
492
+ ### 9. Dividend/Cash-Flow Between-Group Residual (Simplified)
493
+ * **Expression**: `alpha = ts_zscore(ts_backfill(<cf_or_div_field>,90)); g = group_mean(alpha, <group>, <weight_opt>); resid = alpha - g; group_zscore(resid, <group>)`.
494
+ * **Core idea**: Backfill and smooth first, then take the residual against the group mean to capture relatively high/low dividend or cash-flow quality within the group.
495
+ * **Applicable scenarios**: fnd8/fnd6/topdiv dividend and cash-flow fields; industry/country grouping.
496
+ * **Optimization**: Weights can be log(cap) or inverse volatility; smooth `resid` further with `ts_mean`.
497
+
498
+ """
499
+
500
+ class SingleSession(requests.Session):
501
+ _instance = None
502
+ _lock = threading.Lock()
503
+ _relogin_lock = threading.Lock()
504
+ _initialized = False
505
+
506
+ def __new__(cls, *args, **kwargs):
507
+ if cls._instance is None:
508
+ with cls._lock:
509
+ if cls._instance is None:
510
+ cls._instance = super().__new__(cls)
511
+ return cls._instance
512
+
513
+ def __init__(self, *args, **kwargs):
514
+ if not self._initialized:
515
+ super(SingleSession, self).__init__(*args, **kwargs)
516
+ self._initialized = True
517
+
518
+ def get_relogin_lock(self):
519
+ return self._relogin_lock
520
+
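+ # Illustrative: SingleSession is a thread-safe singleton, so repeated construction
+ # returns the same authenticated session object:
+ #   assert SingleSession() is SingleSession()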
521
+ def load_template_summary(file_path: Optional[str] = None) -> str:
522
+ """
523
+ Loads the template summary from a file or returns the built-in template summary.
524
+
525
+ Args:
526
+ file_path: Optional path to a .txt or .md file containing the template summary.
527
+ If None or file doesn't exist, returns the built-in template summary.
528
+
529
+ Returns:
530
+ str: The template summary content.
531
+ """
532
+ if file_path:
533
+ try:
534
+ file_path_obj = Path(file_path)
535
+ if file_path_obj.exists() and file_path_obj.is_file():
536
+ with open(file_path_obj, 'r', encoding='utf-8') as f:
537
+ content = f.read()
538
+ print(f"✓ 成功从文件加载模板总结: {file_path}")
539
+ return content
540
+ else:
541
+ print(f"⚠ 警告: 文件不存在: {file_path},将使用内置模板总结")
542
+ except Exception as e:
543
+ print(f"⚠ 警告: 读取文件时出错: {e},将使用内置模板总结")
544
+
545
+ # Fall back to the built-in template summary
546
+ print("✓ Using the built-in template summary")
547
+ return template_summary
548
+
549
+
550
+ def get_credentials() -> tuple[str, str]:
551
+ """
552
+ Return the platform credentials collected during interactive input.
554
+
555
+ The username and password are module-level globals populated by
556
+ interactive_input(); this function simply returns them for session auth.
557
+
558
+ Returns:
559
+ tuple: A tuple containing the email and password.
560
+
561
+ Note:
562
+ Both values are None until interactive_input() has run.
563
+ """
564
+ # Use the module-level globals set by interactive_input()
565
+ global username, password
566
+ # Or hard-code your own BRAIN credentials here
566
+ return (username, password)
567
+
568
+ def get_token_from_auth_server() -> str:
569
+ # Use the module-level global set by interactive_input()
570
+ global LLM_API_KEY
571
+ # Returns your LLM Gateway token; note the default model is kimi-k2-turbo-preview
572
+ return LLM_API_KEY
573
+
574
+ def interactive_input() -> dict:
575
+ """
576
+ Interactive input routine that collects all required configuration.
577
+
578
+ Returns:
579
+ dict: A dictionary containing all configuration values.
580
+ """
581
+ print("\n" + "="*60)
582
+ print("Welcome to the Alpha Transformer interactive setup")
583
+ print("Enter one Alpha ID, and this program uses historically summarized Alpha templates to turn it into many more expressions")
584
+ print("72 transformations, like the Monkey King, to carry you through the clouds")
585
+ print("To switch models, supply the new model's URL and API key")
586
+ print("Models differ in quality; the default kimi may produce syntax errors, so inspect the generated template file")
587
+ print("Using a template document you summarized yourself is strongly recommended; it works much better")
588
+ print("="*60 + "\n")
589
+
590
+ config = {}
591
+
592
+ # 1. Ask for the LLM model name
593
+ print("[1/7] LLM model configuration")
594
+ print("To switch models, enter the new model's name")
595
+ default_model = "kimi-k2-turbo-preview"
596
+ model_input = input(f"Enter the LLM model name (press Enter for the default: {default_model}): ").strip()
597
+ config['LLM_model_name'] = model_input if model_input else default_model
598
+ print(f"✓ LLM model name: {config['LLM_model_name']}\n")
599
+
600
+ # 2. Ask for the LLM API key
601
+ print("[2/7] LLM API key configuration")
602
+ api_key = getpass.getpass("Enter the LLM API key (input is hidden): ").strip()
603
+ if not api_key:
604
+ print("⚠ Warning: the API key is empty; the program may not work")
605
+ config['LLM_API_KEY'] = api_key
606
+ print("✓ API key set\n")
607
+
608
+ # 3. Ask for the LLM base URL
609
+ print("[3/7] LLM base URL configuration")
610
+ print("Note: different models use different URLs")
611
+ default_url = "https://api.moonshot.cn/v1"
612
+ url_input = input(f"Enter the LLM base URL (press Enter for the default: {default_url}): ").strip()
613
+ config['llm_base_url'] = url_input if url_input else default_url
614
+ print(f"✓ LLM base URL: {config['llm_base_url']}\n")
615
+
616
+ # 4. Ask for the BRAIN platform username
617
+ print("[4/7] BRAIN platform credentials")
618
+ username_input = input("Enter your BRAIN platform username/email: ").strip()
619
+ if not username_input:
620
+ print("⚠ Warning: the username is empty; the program may not work")
621
+ config['username'] = username_input
622
+ print("✓ Username set\n")
623
+
624
+ # 5. Ask for the BRAIN platform password (part of step 4)
625
+ password_input = getpass.getpass("Enter your BRAIN platform password (input is hidden): ").strip()
626
+ if not password_input:
627
+ print("⚠ Warning: the password is empty; the program may not work")
628
+ config['password'] = password_input
629
+ print("✓ Password set\n")
630
+
631
+ # 6. Ask for the template-summary file path
632
+ print("[5/7] Template summary file configuration")
633
+ print("Using a template document you summarized yourself is strongly recommended; it works much better")
634
+ print("Note: if you have a template_summary .txt or .md file, enter its full path")
635
+ print(" Otherwise, press Enter to use the built-in template summary")
636
+ template_path = input("Enter the template summary file path (press Enter for the built-in template): ").strip()
637
+ config['template_summary_path'] = template_path if template_path else None
638
+ if template_path:
639
+ print(f"✓ Will try to load from file: {template_path}\n")
640
+ else:
641
+ print("✓ Using the built-in template summary\n")
642
+
643
+ # 7. Ask for the Alpha ID
644
+ print("[6/7] Alpha ID configuration")
645
+ alpha_id = input("Enter the Alpha ID to process: ").strip()
646
+ if not alpha_id:
647
+ print("❌ Error: the Alpha ID must not be empty")
648
+ sys.exit(1)
649
+ config['alpha_id'] = alpha_id
650
+ print(f"✓ Alpha ID: {alpha_id}\n")
651
+
652
+ # 8. Ask for the Top N parameter (data fields only)
653
+ print("[7/7] Candidate count configuration (Top N)")
654
+ print("Note: this controls how many data-field candidates are generated per placeholder")
655
+
656
+ # Datafield top_n
657
+ default_datafield_topn = 50
658
+ datafield_topn_input = input(f"Enter the data-field candidate count (press Enter for the default: {default_datafield_topn}): ").strip()
659
+ try:
660
+ config['top_n_datafield'] = int(datafield_topn_input) if datafield_topn_input else default_datafield_topn
661
+ except ValueError:
662
+ print(f"⚠ Warning: invalid input; using the default: {default_datafield_topn}")
663
+ config['top_n_datafield'] = default_datafield_topn
664
+ print(f"✓ Data-field candidate count: {config['top_n_datafield']}\n")
665
+
666
+ print("="*60)
667
+ print("配置完成!开始处理...")
668
+ print("="*60 + "\n")
669
+
670
+ return config
671
+
672
+
673
+
674
+ def expand_dict_columns(data: pd.DataFrame) -> pd.DataFrame:
675
+ """
676
+ Expand dictionary columns in a DataFrame into separate columns.
677
+
678
+ Args:
679
+ data (pandas.DataFrame): The input DataFrame with dictionary columns.
680
+
681
+ Returns:
682
+ pandas.DataFrame: A new DataFrame with expanded columns.
683
+ """
684
+ dict_columns = list(filter(lambda x: isinstance(data[x].iloc[0], dict), data.columns))
685
+ new_columns = pd.concat(
686
+ [data[col].apply(pd.Series).rename(columns=lambda x: f"{col}_{x}") for col in dict_columns],
687
+ axis=1,
688
+ )
689
+
690
+ data = pd.concat([data, new_columns], axis=1)
691
+ return data
692
+
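+ # Illustrative use of expand_dict_columns (the column shape is an example): a dict column
+ # such as data["category"] = [{"id": "fund", "name": "Fundamental"}, ...] is expanded into
+ # flat columns "category_id" and "category_name" alongside the originals.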
693
+ def start_session() -> SingleSession:
694
+ """
695
+ Start a new session with the WorldQuant BRAIN platform.
696
+
697
+ This function authenticates the user, handles biometric authentication if required,
698
+ and creates a new session.
699
+
700
+ Returns:
701
+ SingleSession: An authenticated session object.
702
+
703
+ Raises:
704
+ requests.exceptions.RequestException: If there's an error during the authentication process.
705
+ """
706
+ brain_api_url = "https://api.worldquantbrain.com"
707
+ s = SingleSession()
708
+ s.auth = get_credentials()
709
+ r = s.post(brain_api_url + "/authentication")
710
+ print(f"New session created (ID: {id(s)}) with authentication response: {r.status_code}, {r.json()} (新会话已创建)")
711
+ if r.status_code == requests.status_codes.codes.unauthorized:
712
+ if r.headers["WWW-Authenticate"] == "persona":
713
+ print(
714
+ "Complete biometrics authentication and press any key to continue (请完成生物识别认证并按任意键继续): \n"
715
+ + urljoin(r.url, r.headers["Location"])
716
+ + "\n"
717
+ )
718
+ input()
719
+ s.post(urljoin(r.url, r.headers["Location"]))
720
+ while True:
721
+ if s.post(urljoin(r.url, r.headers["Location"])).status_code != 201:
722
+ input(
723
+ "Biometrics authentication is not complete. Please try again and press any key when completed (生物识别认证未完成,请重试并按任意键): \n"
724
+ )
725
+ else:
726
+ break
727
+ else:
728
+ print("\nIncorrect email or password (邮箱或密码错误)\n")
729
+ return start_session()
730
+ return s
731
+
732
+ def get_data_categories(s: SingleSession) -> list[dict]:
733
+ """
734
+ Fetch and cache data categories from the BRAIN API.
735
+ """
736
+ global DATA_CATEGORIES
737
+ if DATA_CATEGORIES is not None:
738
+ return DATA_CATEGORIES
739
+
740
+ try:
741
+ brain_api_url = "https://api.worldquantbrain.com"
742
+ response = s.get(brain_api_url + "/data-categories")
743
+ response.raise_for_status()
744
+ data = response.json()
745
+ if isinstance(data, list):
746
+ DATA_CATEGORIES = data
747
+ elif isinstance(data, dict):
748
+ DATA_CATEGORIES = data.get('results', [])
749
+ else:
750
+ DATA_CATEGORIES = []
751
+ return DATA_CATEGORIES
752
+ except Exception as e:
753
+ print(f"Error fetching data categories: {e}")
754
+ return []
755
+
756
+ def get_datafields(
757
+ s: SingleSession,
758
+ instrument_type: str = "EQUITY",
759
+ region: str = "USA",
760
+ delay: int = 1,
761
+ universe: str = "TOP3000",
762
+ theme: str = "false",
763
+ dataset_id: str = "",
764
+ data_type: str = "MATRIX",
765
+ search: str = "",
766
+ category: Union[str, list] = "",
767
+ ) -> pd.DataFrame:
768
+ """
769
+ Retrieve available datafields based on specified parameters.
770
+
771
+ Args:
772
+ s (SingleSession): An authenticated session object.
773
+ instrument_type (str, optional): The type of instrument. Defaults to "EQUITY".
774
+ region (str, optional): The region. Defaults to "USA".
775
+ delay (int, optional): The delay. Defaults to 1.
776
+ universe (str, optional): The universe. Defaults to "TOP3000".
777
+ theme (str, optional): The theme. Defaults to "false".
778
+ dataset_id (str, optional): The ID of a specific dataset. Defaults to "".
779
+ data_type (str, optional): The type of data. Defaults to "MATRIX".
780
+ search (str, optional): A search string to filter datafields. Defaults to "".
781
+ category (str or list, optional): A category ID or list of IDs to filter datafields. Defaults to "".
782
+
783
+ Returns:
784
+ pandas.DataFrame: A DataFrame containing information about available datafields.
785
+ """
786
+ brain_api_url = "https://api.worldquantbrain.com"
787
+ type_param = f"&type={data_type}" if data_type != "ALL" else ""
788
+
789
+ url_template = (
790
+ brain_api_url
791
+ + "/data-fields?"
792
+ + f"&instrumentType={instrument_type}"
793
+ + f"&region={region}&delay={str(delay)}&universe={universe}{type_param}&limit=50"
794
+ )
795
+
796
+ if dataset_id:
797
+ url_template += f"&dataset.id={dataset_id}"
798
+
799
+ if len(search) > 0:
800
+ url_template += f"&search={search}"
801
+
802
+ url_template += "&offset={x}"
803
+
804
+ count = 0
805
+ if len(search) == 0:
806
+ try:
807
+ count = s.get(url_template.format(x=0)).json()["count"]
808
+ except Exception as e:
809
+ print(f"Error getting count: {e}")
810
+ return pd.DataFrame()
811
+ if count == 0:
812
+ print(
813
+ f"No fields found: region={region}, delay={str(delay)}, universe={universe}, "
814
+ f"type={data_type}, dataset.id={dataset_id}"
815
+ )
816
+ return pd.DataFrame()
817
+ else:
818
+ # The search endpoint returns no usable count, so probe a fixed page range
819
+ # (deeper when also filtering by category)
820
+ count = 500 if category else 100
823
+
824
+ max_try = 5
825
+ datafields_list = []
826
+ found_count = 0
827
+ target_found = 50 if category else count
828
+
829
+ for x in range(0, count, 50):
830
+ for _ in range(max_try):
831
+ try:
832
+ resp = s.get(url_template.format(x=x))
833
+ if resp.status_code == 200 and "results" in resp.json():
834
+ datafields = resp
835
+ break
836
+ except Exception:
837
+ pass
838
+ time.sleep(5)
839
+ else:
840
+ continue
841
+
842
+ results = datafields.json().get("results", [])
843
+ if not results:
844
+ break
845
+
846
+ if category:
847
+ if isinstance(category, list):
848
+ filtered_results = [
849
+ item for item in results
850
+ if isinstance(item.get('category'), dict) and item['category'].get('id') in category
851
+ ]
852
+ else:
853
+ filtered_results = [
854
+ item for item in results
855
+ if isinstance(item.get('category'), dict) and item['category'].get('id') == category
856
+ ]
857
+ datafields_list.append(filtered_results)
858
+ found_count += len(filtered_results)
859
+ if len(search) > 0 and found_count >= target_found:
860
+ break
861
+ else:
862
+ datafields_list.append(results)
863
+
864
+ datafields_list_flat = [item for sublist in datafields_list for item in sublist]
865
+
866
+ if not datafields_list_flat:
867
+ return pd.DataFrame()
868
+
869
+ datafields_df = pd.DataFrame(datafields_list_flat)
870
+ datafields_df = expand_dict_columns(datafields_df)
871
+ return datafields_df
872
+
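+ # Illustrative usage of get_datafields (the search term is an example):
+ #   df = get_datafields(s, region="USA", universe="TOP3000", search="eps")
+ # Returns one row per matching datafield, with nested dict columns expanded.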
873
+ def set_alpha_properties(
874
+ s: SingleSession,
875
+ alpha_id: str,
876
+ name: Optional[str] = None,
877
+ color: Optional[str] = None,
878
+ regular_desc: Optional[str] = None,
879
+ selection_desc: str = "None",
880
+ combo_desc: str = "None",
881
+ tags: Optional[list[str]] = None,
882
+ ) -> requests.Response:
883
+ """
884
+ Update the properties of an alpha.
885
+
886
+ Args:
887
+ s (SingleSession): An authenticated session object.
888
+ alpha_id (str): The ID of the alpha to update.
889
+ name (str, optional): The new name for the alpha. Defaults to None.
890
+ color (str, optional): The new color for the alpha. Defaults to None.
891
+ regular_desc (str, optional): Description for regular alpha. Defaults to None.
892
+ selection_desc (str, optional): Description for the selection part of a super alpha. Defaults to "None".
893
+ combo_desc (str, optional): Description for the combo part of a super alpha. Defaults to "None".
894
+ tags (list, optional): List of tags to apply to the alpha. Defaults to None.
895
+
896
+ Returns:
897
+ requests.Response: The response object from the API call.
898
+ """
899
+ brain_api_url = "https://api.worldquantbrain.com"
900
+ params = {}
901
+ if name is not None:
902
+ params["name"] = name
903
+ if color is not None:
904
+ params["color"] = color
905
+ if tags is not None:
906
+ params["tags"] = tags
907
+ if regular_desc is not None:
908
+ params.setdefault("regular", {})["description"] = regular_desc
909
+ if selection_desc != "None": # Assuming "None" is the default string value for selection_desc
910
+ params.setdefault("selection", {})["description"] = selection_desc
911
+ if combo_desc != "None": # Assuming "None" is the default string value for combo_desc
912
+ params.setdefault("combo", {})["description"] = combo_desc
913
+
914
+ response = s.patch(brain_api_url + "/alphas/" + alpha_id, json=params)
915
+
916
+ return response
917
+
918
+
919
+ def extract_placeholders(template_expression: str) -> list[str]:
920
+ """
921
+ Extracts placeholders from a template expression using regular expressions.
922
+ Placeholders are identified by text enclosed in angle brackets (e.g., `<data_field/>`).
923
+ """
924
+ # Only match placeholders of the form <name/>, where name is alphanumeric/underscore
925
+ return re.findall(r'(<[A-Za-z0-9_]+/>)', template_expression)
926
+
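+ # Illustrative: extract_placeholders("ts_rank(<data_field/>, <window/>)")
+ # -> ['<data_field/>', '<window/>']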
927
+ def parse_alpha_code(alpha_code: str, all_operators: list[dict]) -> tuple[list[str], list[str]]:
928
+ """
929
+ Parses the alpha code to extract operators and data fields.
930
+ """
931
+ # Remove C-style comments /* ... */
932
+ alpha_code = re.sub(r"/\*[\s\S]*?\*/", "", alpha_code)
933
+ # Remove Python-style comments # ...
934
+ alpha_code = re.sub(r"#.*", "", alpha_code)
935
+
936
+ operators_names = [op['name'] for op in all_operators]
937
+
938
+ found_operators = []
939
+ found_datafields = []
940
+
941
+ # Regex to find potential identifiers (operators or datafields)
942
+ # This regex looks for words that could be operators or datafields,
943
+ # excluding numbers and common programming constructs.
944
+ identifiers = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', alpha_code)
945
+
946
+ for identifier in identifiers:
947
+ if identifier in operators_names:
948
+ found_operators.append(identifier)
949
+ elif not (identifier.isdigit() or identifier.lower() in ['true', 'false', 'null', 'nan', 'if', 'else', 'for', 'while', 'return', 'and', 'or', 'not', 'in', 'is', 'try', 'except', 'finally', 'with', 'as', 'def', 'class', 'import', 'from', 'yield', 'lambda', 'global', 'nonlocal', 'break', 'continue', 'pass', 'async', 'await', 'raise', 'assert', 'del', 'print', 'input', 'len', 'min', 'max', 'sum', 'abs', 'round', 'int', 'float', 'str', 'list', 'dict', 'set', 'tuple', 'range', 'map', 'filter', 'zip', 'open', 'file', 'type', 'id', 'dir', 'help', 'object', 'super', 'issubclass', 'isinstance', 'hasattr', 'getattr', 'setattr', 'delattr', '__import__', 'None', 'True', 'False']):
950
+ found_datafields.append(identifier)
951
+
952
+ # Remove duplicates
953
+ found_operators = list(set(found_operators))
954
+ found_datafields = list(set(found_datafields))
955
+
956
+ return found_operators, found_datafields
957
+
958
+ async def generate_alpha_description(alpha_id: str, brain_session: SingleSession) -> str:
959
+ """
960
+ Generates and potentially enriches the description of a given Alpha ID from the WorldQuant BRAIN API.
961
+
962
+ Args:
963
+ alpha_id (str): The ID of the alpha to retrieve.
964
+ brain_session (SingleSession): The active BRAIN API session.
965
+ llm_client (openai.AsyncOpenAI): The authenticated OpenAI-compatible client.
966
+
967
+ Returns:
968
+ str: A JSON string containing the alpha's settings, expression, and potentially enriched description,
969
+ or an empty JSON string if an error occurs.
970
+ """
971
+
972
+ async def call_llm_new(prompt: str) -> dict:
973
+ # Use the module-level globals set by interactive_input()
974
+ global LLM_model_name, LLM_API_KEY, llm_base_url
975
+ try:
976
+ llm_api_key = get_token_from_auth_server()
977
+ llm_base_url_value = llm_base_url # 使用全局变量
978
+ llm_client = openai.AsyncOpenAI(base_url=llm_base_url_value, api_key=llm_api_key)
979
+ print("LLM Gateway Authentication successful. (LLM网关认证成功)")
980
+ except Exception as e:
981
+ print(f"LLM Gateway Authentication failed (LLM网关认证失败): {e}")
982
+ sys.exit(1)
983
+
984
+ print("--- Calling LLM to propose templates... (正在调用LLM生成模板...) ---")
985
+ try:
986
+ # Await the async create call
987
+ response = await llm_client.chat.completions.create(
988
+ model=LLM_model_name,
989
+ messages=[
990
+ {"role": "system", "content": "You are a quantitative finance expert and a helpful assistant designed to output JSON."},
991
+ {"role": "user", "content": prompt},
992
+ ],
993
+ # response_format={"type": "json_object"},
994
+ )
995
+
996
+ # The async client may return a nested structure. Try to extract content robustly.
997
+ content = None
998
+ if isinstance(response, dict):
999
+ # Some clients return raw dicts
1000
+ # Try common paths
1001
+ choices = response.get('choices')
1002
+ if choices and isinstance(choices, list):
1003
+ msg = choices[0].get('message') or choices[0]
1004
+ content = msg.get('content') if isinstance(msg, dict) else None
1005
+ elif 'content' in response:
1006
+ content = response.get('content')
1007
+ else:
1008
+ # Fallback: attempt attribute access
1009
+ try:
1010
+ content = response.choices[0].message.content
1011
+ except Exception:
1012
+ content = None
1013
+
1014
+ if content is None:
1015
+ # As a last resort, try to stringify the response
1016
+ content = str(response)
1017
+
1018
+ # If content is already a dict/list, return it directly; if it's a JSON string, parse it.
1019
+ if isinstance(content, (dict, list)):
1020
+ return content
1021
+ if isinstance(content, str):
1022
+ try:
1023
+ return json.loads(content)
1024
+ except json.JSONDecodeError:
1025
+ # Return wrapped string if not JSON
1026
+ return {"text": content}
1027
+
1028
+ return {}
1029
+ except Exception as e:
1030
+ print(f"Error calling LLM (调用LLM出错): {e}")
1031
+ return {}
1032
+
1033
+ try:
1034
+ brain_api_url = "https://api.worldquantbrain.com"
1035
+ alpha_url = f"{brain_api_url}/alphas/{alpha_id}"
1036
+ response = brain_session.get(alpha_url)
1037
+ response.raise_for_status() # Raise an exception for HTTP errors
1038
+
1039
+ alpha_data = response.json()
1040
+ settings = alpha_data.get('settings', {})
1041
+ expression_dict = alpha_data.get('regular', alpha_data.get('combo', None))
1042
+
1043
+ if not expression_dict or 'code' not in expression_dict:
1044
+ print(f"Error: Alpha expression code not found for Alpha ID (未找到Alpha表达式代码): {alpha_id}")
1045
+ return json.dumps({})
1046
+
1047
+ alpha_code = expression_dict['code']
1048
+ current_description = expression_dict.get('description', '')
1049
+
1050
+ # 1. Get all operators for parsing (no filter as per feedback)
1051
+ operators_data = get_brain_operators()
1052
+ all_operators = operators_data.get('operators', [])
1053
+
1054
+ # 2. Parse the code to get operators and datafields
1055
+ found_operators_names, found_datafields_names = parse_alpha_code(alpha_code, all_operators)
1056
+
1057
+ # 3. Get descriptions for operators
1058
+ operator_descriptions = {op['name']: op.get('description', 'No description available.') for op in all_operators if op['name'] in found_operators_names}
1059
+
1060
+ # 4. Get descriptions for datafields
1061
+ datafield_descriptions = {}
1062
+ if found_datafields_names:
1063
+ # Extract settings from alpha_data for the get_datafields call
1064
+ instrument_type = settings.get('instrumentType', 'EQUITY')
1065
+ region = settings.get('region', 'USA')
1066
+ universe = settings.get('universe', 'TOP3000')
1067
+ delay = settings.get('delay', 1)
1068
+
1069
+ for df_name in found_datafields_names:
1070
+ # get_datafields returns a DataFrame, so we need to process it
1071
+ datafield_df = get_datafields(s=brain_session, instrument_type=instrument_type, region=region, delay=delay, universe=universe, search=df_name)
1072
+ if not datafield_df.empty:
1073
+ # Assuming the first result is the most relevant
1074
+ datafield_descriptions[df_name] = datafield_df.iloc[0].get('description', 'No description available.')
1075
+ else:
1076
+ datafield_descriptions[df_name] = 'No description found.'
1077
+
1078
+ # 5. Use LLM to judge if current description is good
1079
+ judgment_prompt = f"""
1080
+ Given the following alpha code, its current description, and descriptions of its operators and datafields:
1081
+
1082
+ Alpha Code:
1083
+ {alpha_code}
1084
+
1085
+ Current Description:
1086
+ {current_description}
1087
+
1088
+ Operators and their descriptions:
1089
+ {json.dumps(operator_descriptions, indent=2)}
1090
+
1091
+ Datafields and their descriptions:
1092
+ {json.dumps(datafield_descriptions, indent=2)}
1093
+
1094
+ Alpha Settings:
1095
+ {json.dumps(settings, indent=2)}
1096
+
1097
+ Is the current description good enough? Respond with 'yes' or 'no' in a JSON object: {{"judgment": "yes/no"}}
1098
+ A "good" description should clearly explain the investment idea, rationale for data used, and rationale for operators used.
1099
+ """
1100
+
1101
+ judgment_response = await call_llm_new(judgment_prompt)
1102
+ is_description_good = judgment_response.get("judgment", "no").lower() == "yes"
1103
+
1104
+ new_description = current_description
1105
+ if not is_description_good:
1106
+ # 6. If not good, use another LLM to generate a new description
1107
+ generation_prompt = f"""
1108
+ Based on the following alpha code, its operators, datafields, and settings, generate a new, improved description.
1109
+ The description should clearly explain the investment idea, rationale for data used, and rationale for operators used.
1110
+ Format the output as:
1111
+ "Idea: xxxxx\\nRationale for data used: xxxxx\\nRationale for operators used: xxxxxxx"
1112
+
1113
+ Alpha Code:
1114
+ {alpha_code}
1115
+
1116
+ Operators and their descriptions:
1117
+ {json.dumps(operator_descriptions, indent=2)}
1118
+
1119
+ Datafields and their descriptions:
1120
+ {json.dumps(datafield_descriptions, indent=2)}
1121
+
1122
+ Alpha Settings:
1123
+ {json.dumps(settings, indent=2)}
1124
+ """
1125
+
1126
+ generated_description_response = await call_llm_new(generation_prompt)
1127
+ # Assuming LLM returns a string directly or a JSON with a 'description' key
1128
+ new_description = generated_description_response.get("description", generated_description_response)
1129
+ if isinstance(new_description, dict): # Handle cases where LLM might return a dict directly
1130
+ new_description = json.dumps(new_description, indent=2)
1131
+
1132
+ # 7. Override this new description and patch the alpha
1133
+ set_alpha_properties(
1134
+ s=brain_session,
1135
+ alpha_id=alpha_id,
1136
+ regular_desc=new_description
1137
+ )
1138
+ print(f"Alpha {alpha_id} description updated on platform. (Alpha描述已在平台更新)")
1139
+
1140
+ if 'regular' in alpha_data:
1141
+ alpha_data['regular']['description'] = new_description
1142
+ elif 'combo' in alpha_data:
1143
+ alpha_data['combo']['description'] = new_description
1144
+
1145
+ return json.dumps({
1146
+ 'settings': settings,
1147
+ 'expression': expression_dict
1148
+ })
1149
+
1150
+ except requests.exceptions.RequestException as e:
1151
+ print(f"Error during API request (API请求出错): {e}")
1152
+ return json.dumps({})
1153
+ except json.JSONDecodeError:
1154
+ print("Error: Could not decode JSON response from API. (无法解析API的JSON响应)")
1155
+ return json.dumps({})
1156
+ except Exception as e:
1157
+ print(f"An unexpected error occurred (发生意外错误): {e}")
1158
+ return json.dumps({})
1159
+
1160
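+ # Usage sketch (not in the original file): the block above is the tail of
+ # generate_alpha_description, which is awaited in main() below and returns a
+ # JSON string of the alpha's settings and expression ("YOUR_ALPHA_ID" is a
+ # placeholder).
+ #
+ #     details_json = await generate_alpha_description("YOUR_ALPHA_ID", brain_session=s)
+ #     details = json.loads(details_json)
+ #     # details -> {"settings": {...}, "expression": {"code": "...", "description": "..."}}
+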
+ def get_brain_operators(scope_filters: Optional[list[str]] = None) -> dict:
+     """
+     Retrieves the list of available operators from the WorldQuant BRAIN API,
+     optionally filtered by a list of scopes. If no scopes are provided, all operators are returned.
+
+     Args:
+         scope_filters (list[str], optional): A list of strings to filter operators by their scope (e.g., ["REGULAR", "TS_OPERATOR"]).
+             If None or empty, all operators are returned.
+
+     Returns:
+         dict: A dictionary containing the operators list and count,
+             or an empty dictionary if an error occurs.
+     """
+     try:
+         brain_api_url = "https://api.worldquantbrain.com"
+         session = start_session()
+         operators_url = f"{brain_api_url}/operators"
+         response = session.get(operators_url)
+         response.raise_for_status()  # Raise an exception for HTTP errors
+
+         operators_list = response.json()
+
+         if not isinstance(operators_list, list):
+             print(f"Error: Expected a list of operators, but received type (预期运算符列表,但收到类型): {type(operators_list)}")
+             return {}
+
+         if scope_filters:
+             filtered_operators = [
+                 op for op in operators_list
+                 if any(s_filter in op.get('scope', []) for s_filter in scope_filters)
+             ]
+             return {
+                 'operators': filtered_operators,
+                 'count': len(filtered_operators)
+             }
+         else:
+             return {
+                 'operators': operators_list,
+                 'count': len(operators_list)
+             }
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error during API request for operators (获取运算符时API请求出错): {e}")
+         return {}
+     except json.JSONDecodeError:
+         print("Error: Could not decode JSON response from operators API. (无法解析运算符API的JSON响应)")
+         return {}
+     except Exception as e:
+         print(f"An unexpected error occurred while getting operators (获取运算符时发生意外错误): {e}")
+         return {}
+
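+ # Usage sketch (illustrative; the scope values come from the docstring example
+ # and the calls elsewhere in this module):
+ #
+ #     all_ops = get_brain_operators()                              # every operator
+ #     regular_ops = get_brain_operators(scope_filters=["REGULAR"]) # filtered by scope
+ #     print(regular_ops.get('count'))
+ #     print([op['name'] for op in regular_ops.get('operators', [])][:5])
+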
+ async def call_llm(prompt: str, llm_client: openai.AsyncOpenAI, max_retries: int = 3) -> dict:
+     """
+     Interface with a Large Language Model to process prompts and get a JSON response.
+     Includes retry logic for JSON parsing errors.
+     """
+     # Declare the module-level globals in use
+     global LLM_model_name
+     if not llm_client:
+         print("LLM client not initialized. Please check authentication. (LLM客户端未初始化,请检查认证)")
+         return {}
+
+     print("--- Calling LLM... (正在调用LLM...) ---")
+
+     for attempt in range(max_retries):
+         try:
+             response = await llm_client.chat.completions.create(
+                 model=LLM_model_name,  # Or your preferred model
+                 messages=[
+                     {"role": "system", "content": "You are a quantitative finance expert and a helpful assistant designed to output JSON."},
+                     {"role": "user", "content": prompt},
+                 ],
+                 # response_format={"type": "json_object"},
+             )
+             content = response.choices[0].message.content
+
+             # Strip markdown code fences if present
+             if "```json" in content:
+                 content = content.split("```json")[1].split("```")[0].strip()
+             elif "```" in content:
+                 content = content.split("```")[1].split("```")[0].strip()
+
+             return json.loads(content)
+         except json.JSONDecodeError as e:
+             print(f"⚠ JSON Decode Error (Attempt {attempt + 1}/{max_retries}): {e}")
+             if attempt == max_retries - 1:
+                 print(f"❌ Failed to parse JSON after {max_retries} attempts. Raw content: {content[:100]}...")
+         except Exception as e:
+             print(f"⚠ LLM Call Error (Attempt {attempt + 1}/{max_retries}): {e}")
+             if attempt == max_retries - 1:
+                 print(f"❌ Failed to call LLM after {max_retries} attempts.")
+
+         # Wait before retrying (no need to sleep after the final attempt)
+         if attempt < max_retries - 1:
+             await asyncio.sleep(2)
+
+     return {}
+
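+ # Fence-stripping sketch (illustrative): call_llm tolerates models that wrap
+ # their JSON in markdown code fences.
+ #
+ #     raw = '```json\n{"placeholder_type": "data_field"}\n```'
+ #     # after stripping -> '{"placeholder_type": "data_field"}'
+ #     # json.loads(...)  -> {"placeholder_type": "data_field"}
+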
+ async def propose_alpha_templates(alpha_details: dict, template_summary: str, llm_client: openai.AsyncOpenAI) -> dict:
+     """
+     Uses an LLM to propose new alpha templates based on a seed alpha's details.
+
+     Args:
+         alpha_details (dict): The details of the seed alpha.
+         template_summary (str): A summary of alpha templates to guide the LLM.
+         llm_client (openai.AsyncOpenAI): The authenticated OpenAI-compatible client.
+
+     Returns:
+         dict: A dictionary of proposed alpha templates in JSON format.
+     """
+     if not alpha_details.get('expression'):
+         print("Error: Alpha expression is missing. (错误:缺少Alpha表达式)")
+         return {}
+     else:
+         print(f"current seed alpha detail (当前种子Alpha详情): {alpha_details.get('expression')}")
+         prompt = f"""
+ As a world-class BRAIN consultant, your task is to design new alpha templates based on an existing seed alpha.
+ You will be provided with the seed alpha's expression and a summary of successful alpha templates for inspiration.
+
+ **Seed Alpha Expression:**
+ {alpha_details['expression']}
+
+ **Inspiration: Summary of Alpha Templates:**
+ {template_summary}
+
+ **Your Task:**
+ Based on the structure and potential economic rationale of the seed alpha, and with the aid of the alpha template summary, propose 3-5 new, diverse alpha templates.
+
+ **Rules:**
+ 1. The proposed templates must be valid BRAIN alpha expressions.
+ 2. Use placeholders like `<data_field/>` for data fields and `<operator/>` for operators that can be programmatically replaced later.
+ 3. For each proposed template, provide a brief, clear explanation of its investment rationale.
+ 4. Return the output as a single, valid JSON object where keys are the proposed template strings and values are their corresponding explanations. Do not include any other text or formatting outside of the JSON object.
+ 5. Each proposed template should be related to the seed alpha {alpha_details}, either in its format or in its economic rationale (possibly in a different format). Make good use of the inspiration.
+ **Example Output Format:**
+ {{
+     "group_zscore(<ts_rank(<data_field/>, 60)/>, industry)": "A cross-sectional momentum signal, neutralized by industry, to capture relative strength within peer groups.",
+     "ts_delta(<data_field/>, 20)": "A simple short-term momentum operator applied to a data field."
+ }}
+
+ Now, generate the JSON object with your proposed templates.
+ """
+
+     try:
+         # print(f"Current template summary: {template_summary}")
+         proposed_templates = await call_llm(prompt, llm_client)
+         return proposed_templates
+     except Exception as e:
+         print(f"An error occurred while calling the LLM (调用LLM时发生错误): {e}")
+         return {}
+
+ async def propose_datafield_keywords(template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI) -> list[str]:
+     """
+     Uses an LLM to propose search keywords for finding data fields.
+     """
+     prompt = f"""
+ As a quantitative researcher, you need to find the best data fields for an alpha template placeholder.
+ Based on the template's logic and the placeholder's name, suggest a list of 3-5 concise search keywords to use with the WorldQuant BRAIN `get_datafields` tool.
+
+ **Alpha Template:**
+ `{template_expression}`
+
+ **Template Explanation:**
+ `{template_explanation}`
+
+ **Placeholder to Fill:**
+ `{placeholder}`
+
+ **Your Task:**
+ Provide a list of search keywords that are likely to yield relevant data fields for this placeholder. The keywords should be specific and diverse. Return the output as a single, valid JSON array of strings.
+
+ **Example Input:**
+ Placeholder: `<slow_moving_characteristic/>`
+ Explanation: "measures the time-series evolution of a fund's relative rank on a slow-moving characteristic (e.g., fund style, expense tier)"
+
+ **Example Output:**
+ ["fund style", "expense ratio", "management fee", "turnover", "aum"]
+
+ Now, generate the JSON array of search keywords for the given placeholder.
+ """
+     print(f"--- Calling LLM to get keywords for placeholder (正在调用LLM获取占位符关键词): {placeholder} ---")
+     response = await call_llm(prompt, llm_client)
+     # Accept either a direct list or a dict containing the list under a common key
+     if isinstance(response, list) and all(isinstance(item, str) for item in response):
+         return response
+     if isinstance(response, dict):
+         # Common keys that might contain the list
+         for key in ('keywords', 'data', 'result', 'items'):
+             if key in response and isinstance(response[key], list) and all(isinstance(i, str) for i in response[key]):
+                 return response[key]
+     print(f"Warning: LLM did not return a valid list of strings for keywords (警告:LLM未返回有效的关键词列表). Got: {response}")
+     return []
+
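+ # Accepted LLM response shapes for propose_datafield_keywords (sketch with
+ # illustrative values):
+ #
+ #     ["fund style", "expense ratio"]                  # bare array -> returned as-is
+ #     {"keywords": ["fund style", "expense ratio"]}    # wrapped    -> unwrapped
+ #     {"judgment": "yes"}                              # anything else -> [] plus a warning
+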
+ async def get_datafield_candidates(s: SingleSession, alpha_details: dict, template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI, top_n: int = 50, user_region: Optional[str] = None, user_universe: Optional[str] = None, user_delay: Optional[int] = None, user_category: Optional[Union[str, list]] = None, user_data_type: str = "MATRIX") -> list[dict]:
+     """
+     Gets candidate data fields for a placeholder by using an LLM to generate search keywords
+     and then calling the BRAIN API's get_datafields to retrieve the top N results for each keyword.
+     """
+     keywords = await propose_datafield_keywords(template_expression, template_explanation, placeholder, llm_client)
+     if not keywords:
+         print(f"Could not generate keywords for placeholder (无法生成占位符关键词): {placeholder}")
+         return []
+
+     print(f"LLM-proposed keywords for '{placeholder}' (LLM提议的关键词): {keywords}")
+
+     # Extract settings from alpha_details for the get_datafields call
+     settings = alpha_details.get('settings', {})
+     print("Alpha settings for datafield search (用于数据字段搜索的Alpha设置):")
+     instrument_type = settings.get('instrumentType', 'EQUITY')
+
+     if user_region is not None:
+         region = user_region
+     elif 'region' in settings:
+         region = settings['region']
+     else:
+         print("❌ Error: Could not determine 'region' for datafield search. It is missing in Alpha settings and not provided by user. (错误:无法确定数据搜索的地区,Alpha设置中缺失且用户未提供)")
+         return []
+     print(f"  Region: {region}")
+
+     if user_universe is not None:
+         universe = user_universe
+     elif 'universe' in settings:
+         universe = settings['universe']
+     else:
+         print("❌ Error: Could not determine 'universe' for datafield search. It is missing in Alpha settings and not provided by user. (错误:无法确定数据搜索的范围,Alpha设置中缺失且用户未提供)")
+         return []
+     print(f"  Universe: {universe}")
+
+     if user_delay is not None:
+         delay = user_delay
+     elif 'delay' in settings:
+         delay = settings['delay']
+     else:
+         print("❌ Error: Could not determine 'delay' for datafield search. It is missing in Alpha settings and not provided by user. (错误:无法确定数据搜索的Delay,Alpha设置中缺失且用户未提供)")
+         return []
+     print(f"  Delay: {delay}")
+
+     if user_category:
+         print(f"  Category Filter: {user_category}")
+
+     # Use asyncio.gather to make parallel API calls for efficiency
+     tasks = []
+     for keyword in keywords:
+         tasks.append(
+             asyncio.to_thread(get_datafields,
+                 s=s,
+                 instrument_type=instrument_type,
+                 region=region,
+                 delay=delay,
+                 universe=universe,
+                 search=keyword,
+                 category=user_category if user_category else "",
+                 data_type=user_data_type
+             )
+         )
+
+     results = await asyncio.gather(*tasks)
+
+     # Process the results to keep the top N from each keyword search
+     top_results_per_keyword = []
+     for res_df in results:
+         if not res_df.empty:
+             top_results_per_keyword.append(res_df.head(top_n))
+
+     candidate_datafields = []
+     if top_results_per_keyword:
+         # Concatenate the top N results from all keywords
+         combined_df = pd.concat(top_results_per_keyword, ignore_index=True)
+         # Remove duplicates from the combined list
+         combined_df.drop_duplicates(subset=['id'], inplace=True)
+         # Format the final list of candidates
+         candidate_datafields = combined_df[['id', 'description']].to_dict(orient='records')
+
+     return candidate_datafields
+
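+ # Concurrency sketch (illustrative): each keyword search above is a blocking
+ # requests-based call, so it is pushed onto a worker thread with
+ # asyncio.to_thread and all searches are awaited in parallel.
+ #
+ #     tasks = [asyncio.to_thread(get_datafields, s=s, region=region, delay=delay,
+ #                                universe=universe, search=kw)  # arguments abbreviated
+ #              for kw in keywords]
+ #     dfs = await asyncio.gather(*tasks)  # one DataFrame per keyword
+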
+ async def get_group_datafield_candidates(template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI, top_n: int = 3) -> list[dict]:
+     """
+     Uses an LLM to select suitable group data fields from a predefined list.
+     """
+     predefined_group_fields = ["industry", "subindustry", "sector", "market", "exchange"]
+
+     prompt = f"""
+ As a quantitative researcher, you need to select the most relevant group data fields for an alpha template placeholder.
+ Based on the template's logic and the placeholder's name, select {top_n} group fields from the following list that are most suitable: {predefined_group_fields}.
+
+ **Alpha Template:**
+ `{template_expression}`
+
+ **Template Explanation:**
+ `{template_explanation}`
+
+ **Placeholder to Fill:**
+ `{placeholder}`
+
+ **Your Task:**
+ Provide a list of selected group data fields. Return the output as a single, valid JSON array of strings.
+
+ **Example Output Format:**
+ ["industry", "sector"]
+
+ Now, generate the JSON array of selected group data fields.
+ """
+     print(f"--- Calling LLM to select group datafields for placeholder (正在调用LLM选择分组数据字段): {placeholder} ---")
+     response = await call_llm(prompt, llm_client)
+
+     if isinstance(response, list) and all(isinstance(item, str) for item in response):
+         return [{"name": field} for field in response[:top_n]]
+     print(f"Warning: LLM did not return a valid list of strings for group datafields (警告:LLM未返回有效的分组数据字段列表). Got: {response}")
+     return [{"name": field} for field in predefined_group_fields[:top_n]]  # Fall back to the defaults if the LLM fails
+
+ async def get_operator_candidates(template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI, top_n: int = 3) -> list[dict]:
+     """
+     Gets candidate operators for a placeholder by first fetching all REGULAR scope operators
+     and then using an LLM to select the most relevant ones.
+     """
+     operators_data = get_brain_operators(scope_filters=["REGULAR"])
+     all_operators = operators_data.get('operators', [])
+
+     if not all_operators:
+         print("No REGULAR scope operators found. (未找到REGULAR范围的运算符)")
+         return []
+
+     # Create a summary of the available operators for the LLM
+     operator_names_and_descriptions = "\n".join([f"- {op['name']}: {op.get('description', 'No description available.')}" for op in all_operators])
+
+     prompt = f"""
+ As a quantitative finance expert, you need to select the most relevant operators for an alpha template placeholder.
+ Based on the template's logic, its explanation, and the specific placeholder, select {top_n} operators from the provided list that are most suitable.
+
+ **Alpha Template:**
+ `{template_expression}`
+
+ **Template Explanation:**
+ `{template_explanation}`
+
+ **Placeholder to Fill:**
+ `{placeholder}`
+
+ **Available REGULAR Scope Operators:**
+ {operator_names_and_descriptions}
+
+ **Your Task:**
+ Provide a list of selected operator names. Return the output as a single, valid JSON array of strings.
+
+ **Example Output Format:**
+ ["ts_mean", "ts_rank", "ts_decay"]
+
+ Now, generate the JSON array of selected operators.
+ """
+     print(f"--- Calling LLM to select operator candidates for placeholder (正在调用LLM选择运算符候选): {placeholder} ---")
+     response = await call_llm(prompt, llm_client)
+
+     if isinstance(response, list) and all(isinstance(item, str) for item in response):
+         # Filter the full operator list to return the selected ones with their descriptions
+         selected_ops_details = []
+         for selected_name in response:
+             for op in all_operators:
+                 if op['name'] == selected_name:
+                     selected_ops_details.append({"name": op['name'], "description": op.get('description', '')})
+                     break
+         return selected_ops_details[:top_n]
+
+     print(f"Warning: LLM did not return a valid list of strings for operator candidates (警告:LLM未返回有效的运算符候选列表). Got: {response}")
+     # Fall back to a default set if the LLM fails
+     return [{"name": op['name'], "description": op.get('description', '')} for op in all_operators[:top_n]]
+
+ async def get_parameter_candidates(param_type: str, template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI) -> list[dict]:
+     """
+     Uses an LLM to suggest sensible numerical candidates for parameters.
+     """
+     param_description = (
+         "an integer value, typically a window length or count (e.g., `d` in `ts_mean(x, d)`)"
+         if param_type == "integer_parameter"
+         else "a floating-point number, typically a threshold or factor"
+     )
+
+     prompt = f"""
+ As a quantitative finance expert, you need to suggest sensible numerical candidates for a placeholder parameter.
+ Based on the alpha template's logic, its explanation, and the placeholder's type and context, propose 3-5 diverse numerical candidates.
+
+ **Alpha Template:**
+ `{template_expression}`
+
+ **Template Explanation:**
+ `{template_explanation}`
+
+ **Placeholder to Fill:**
+ `{placeholder}`
+
+ **Parameter Type:**
+ This placeholder represents {param_description}.
+
+ **Your Task:**
+ Provide a list of numerical candidates that are appropriate for this parameter. Return the output as a single, valid JSON array of numbers.
+
+ **Example Output (for integer_parameter):**
+ [10, 20, 60, 120, 252]
+
+ **Example Output (for float_parameter):**
+ [0.01, 0.05, 0.1, 0.2, 0.5]
+
+ Now, generate the JSON array of numerical candidates.
+ """
+     print(f"--- Calling LLM to suggest candidates for {param_type} placeholder (正在调用LLM建议参数候选): {placeholder} ---")
+     response = await call_llm(prompt, llm_client)
+
+     if isinstance(response, list) and all(isinstance(item, (int, float)) for item in response):
+         return [{"value": val} for val in response]
+     print(f"Warning: LLM did not return a valid list of numbers for {param_type} candidates (警告:LLM未返回有效的数字候选列表). Got: {response}")
+
+     # Fall back to defaults if the LLM fails
+     if param_type == "integer_parameter":
+         return [{"value": x} for x in [10, 20, 60, 120, 252]]
+     elif param_type == "float_parameter":
+         return [{"value": x} for x in [0.01, 0.05, 0.1, 0.2, 0.5]]
+     return []
+
+ async def judge_placeholder_type(placeholder: str, template_expression: str, template_explanation: str, operator_summary: str, llm_client: openai.AsyncOpenAI) -> str:
+     """
+     Uses an LLM to judge the type of a placeholder (e.g., "data_field", "integer_parameter", "group_operator").
+     """
+     prompt = f"""
+ As a world-class quantitative finance expert, your task is to classify the type of a placeholder within an alpha expression.
+ You will be provided with the alpha template, its explanation, the specific placeholder, and a comprehensive summary of available BRAIN operators and data field characteristics.
+
+ **Alpha Template:**
+ `{template_expression}`
+
+ **Template Explanation:**
+ `{template_explanation}`
+
+ **Placeholder to Classify:**
+ `{placeholder}`
+
+ **Available BRAIN Operators and Data Field Characteristics:**
+ {operator_summary}
+
+ **Your Task:**
+ Classify the `{placeholder}` based on the provided context. The classification should be one of the following types:
+ - "data_field": If the placeholder clearly represents a financial data series (e.g., price, volume, fundamental ratio).
+ - "group_data_field": If the placeholder represents a categorical field used for grouping or neutralization (e.g., `industry` in `group_zscore(x, industry)`).
+ - "operator": If the placeholder represents a BRAIN operator that performs a calculation or transformation.
+ - "integer_parameter": If the placeholder represents an integer value, typically a window length or count (e.g., `d` in `ts_mean(x, d)`).
+ - "float_parameter": If the placeholder represents a floating-point number, typically a threshold or factor.
+ - "string_parameter": If the placeholder represents a string value, like a group name (e.g., `industry` in `group_zscore(x, industry)`).
+ - "unknown": If the type cannot be determined from the context.
+
+ Return the classification as a single JSON object with a key "placeholder_type" and its corresponding value. Do not include any other text or formatting outside of the JSON object.
+
+ **Example Output Format:**
+ {{"placeholder_type": "data_field"}}
+ {{"placeholder_type": "integer_parameter"}}
+
+ Now, classify the placeholder.
+ """
+     print(f"--- Calling LLM to judge type for placeholder (正在调用LLM判断占位符类型): {placeholder} ---")
+
+     response = await call_llm(prompt, llm_client)
+     # call_llm may return a non-dict on malformed output; guard before .get()
+     if isinstance(response, dict):
+         return response.get("placeholder_type", "unknown")
+     return "unknown"
+
+ async def populate_template(s: SingleSession, alpha_details: dict, template_expression: str, template_explanation: str, operator_summary: str, llm_client: openai.AsyncOpenAI, top_n_datafield: int = 50, user_region: Optional[str] = None, user_universe: Optional[str] = None, user_delay: Optional[int] = None, user_category: Optional[Union[str, list]] = None, user_data_type: str = "MATRIX") -> dict:
+     """
+     Populates placeholders in an alpha template with candidate data fields, operators, or parameters.
+     """
+     placeholders = extract_placeholders(template_expression)
+
+     if not placeholders:
+         print("No placeholders found in the template. (模板中未找到占位符)")
+         return {}
+
+     print(f"Found placeholders in template (在模板中找到占位符): {placeholders}")
+
+     populated_placeholders = {}
+
+     for ph in placeholders:
+         # Use the LLM to judge the placeholder type
+         ph_type = await judge_placeholder_type(ph, template_expression, template_explanation, operator_summary, llm_client)
+         print(f"'{ph}' judged as type (判断类型为): {ph_type}")
+
+         if ph_type == "data_field":
+             candidates = await get_datafield_candidates(s, alpha_details, template_expression, template_explanation, ph, llm_client, top_n=top_n_datafield, user_region=user_region, user_universe=user_universe, user_delay=user_delay, user_category=user_category, user_data_type=user_data_type)
+             populated_placeholders[ph] = {"type": "data_field", "candidates": candidates}
+         elif ph_type == "group_data_field":
+             candidates = await get_group_datafield_candidates(template_expression, template_explanation, ph, llm_client)
+             populated_placeholders[ph] = {"type": "group_data_field", "candidates": candidates}
+         elif ph_type in ["operator", "group_operator", "ts_operator"]:
+             candidates = await get_operator_candidates(template_expression, template_explanation, ph, llm_client)
+             populated_placeholders[ph] = {"type": ph_type, "candidates": candidates}
+         elif ph_type in ["integer_parameter", "float_parameter"]:
+             candidates = await get_parameter_candidates(ph_type, template_expression, template_explanation, ph, llm_client)
+             populated_placeholders[ph] = {"type": ph_type, "candidates": candidates}
+         elif ph_type == "string_parameter":
+             # Add logic for string_parameter if needed; for now it returns no candidates
+             populated_placeholders[ph] = {"type": "string_parameter", "candidates": []}
+         else:
+             print(f"Could not determine type for placeholder (无法确定占位符类型): {ph} (LLM classified as {ph_type})")
+             populated_placeholders[ph] = {"type": "unknown", "candidates": []}
+
+     return populated_placeholders
+
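+ # Shape sketch of populate_template's return value (field names taken from the
+ # candidate helpers above; the concrete ids and values are assumed examples):
+ #
+ #     {
+ #         "<data_field/>": {"type": "data_field",
+ #                           "candidates": [{"id": "anl44_example", "description": "..."}]},
+ #         "<lookback/>":   {"type": "integer_parameter",
+ #                           "candidates": [{"value": 20}, {"value": 60}]},
+ #     }
+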
+ def get_datafield_prefix(datafield_name: str) -> str:
+     """Extracts the prefix from a datafield name (e.g., 'anl44_...' -> 'anl44')."""
+     if '_' in datafield_name:
+         return datafield_name.split('_')[0]
+     return datafield_name
+
+
+ async def generate_new_alphas(alpha_description, brain_session, template_summary: Optional[str] = None, top_n_datafield: int = 50, user_region: Optional[str] = None, user_universe: Optional[str] = None, user_delay: Optional[int] = None, user_category: Optional[Union[str, list]] = None, user_data_type: str = "MATRIX"):
+     """
+     Main function to generate new alpha templates based on a seed alpha.
+
+     Args:
+         alpha_description: The alpha description JSON string.
+         brain_session: The BRAIN session object.
+         template_summary: Optional template summary string. If None, the built-in summary is loaded.
+         top_n_datafield: Number of data field candidates to retrieve (default: 50).
+         user_region/user_universe/user_delay/user_category: Optional overrides for the datafield search scope.
+         user_data_type: Data type for the datafield search (MATRIX or VECTOR).
+     """
+     # Declare the module-level globals in use
+     global LLM_model_name, LLM_API_KEY, llm_base_url
+
+     # Load the template summary if not provided
+     if template_summary is None:
+         template_summary = load_template_summary()
+     # --- Load Operator Summary ---
+     # Flatten the REGULAR-scope operator list into a text summary for the LLM prompts
+     operators_meta = get_brain_operators(scope_filters=["REGULAR"]).get('operators', [])
+     operator_summary = "\n".join(f"- {op['name']}: {op.get('description', 'No description available.')}" for op in operators_meta)
+
+     try:
+         llm_api_key = get_token_from_auth_server()
+         llm_base_url_value = llm_base_url  # Use the module-level global
+         llm_client = openai.AsyncOpenAI(base_url=llm_base_url_value, api_key=llm_api_key)
+         print("✓ LLM Gateway authentication succeeded")
+     except Exception as e:
+         print(f"❌ LLM Gateway authentication failed: {e}")
+         sys.exit(1)
+
+     details = json.loads(alpha_description)
+
+     if not details:
+         print("Failed to retrieve details for Alpha (获取Alpha详情失败)")
+         sys.exit(1)
+
+     print("Alpha Details Retrieved (已获取Alpha详情):")
+     print(json.dumps(details, indent=4))
+
+     # --- Step 4: Propose New Alpha Templates ---
+     print("\n--- Proposing new alpha templates for Alpha (正在为Alpha提议新模板) ---")
+     proposed_templates = await propose_alpha_templates(details, template_summary, llm_client)
+
+     if not proposed_templates:
+         print("Failed to generate proposed alpha templates. (生成提议模板失败)")
+         sys.exit(1)
+
+     print("\n--- Proposed Alpha Templates (JSON); diversity depends on the model and the template summary document ---")
+     print(json.dumps(proposed_templates, indent=4))
+
+     # --- Validation: Drop templates with suspicious literal identifiers ---
+     try:
+         all_operators_meta = get_brain_operators().get('operators', [])
+         proposed_templates = _filter_valid_templates(
+             proposed_templates,
+             all_operators_meta,
+             brain_session,
+             details.get('settings', {}),
+             parse_alpha_code,
+         )
+     except Exception as e:
+         print(f"⚠ Template validation step raised an exception; skipping validation: {e}")
+
+     if not proposed_templates:
+         print("❌ All templates were discarded after validation; cannot continue.")
+         sys.exit(1)
+
+     # --- Step 6: Prepare for Output ---
+     # Ensure the output directory exists next to this script
+     output_dir = Path(__file__).parent / "output"
+     try:
+         output_dir.mkdir(parents=True, exist_ok=True)
+     except Exception as e:
+         print(f"Warning: could not create directory {output_dir}: {e}")
+
+     output_filepath = output_dir / "Alpha_candidates.json"
+
+     final_output = {}
+
+     # --- Step 5: Process all proposed templates and gather candidates ---
+     for template_expr, template_expl in proposed_templates.items():
+         print(f"\n--- Populating template (正在填充模板): '{template_expr}' ---")
+         try:
+             populated_info = await populate_template(brain_session, details, template_expr, template_expl, operator_summary, llm_client, top_n_datafield=top_n_datafield, user_region=user_region, user_universe=user_universe, user_delay=user_delay, user_category=user_category, user_data_type=user_data_type)
+
+             # Skip templates where any data_field placeholder has zero candidates
+             if _should_skip_due_to_empty_candidates(populated_info):
+                 print("⚠ This template has a data-field placeholder with no candidates; skipping it.")
+                 continue
+
+             final_output[template_expr] = {
+                 "template_explanation": template_expl,
+                 "seed_alpha_settings": details.get('settings', {}),
+                 "placeholder_candidates": populated_info
+             }
+
+             # --- Incremental Saving ---
+             try:
+                 with output_filepath.open('w', encoding='utf-8') as f:
+                     json.dump(final_output, f, indent=4)
+                 print(f"✓ Progress saved to {output_filepath.name}")
+             except IOError as e:
+                 print(f"⚠️ Warning: Failed to save progress: {e}")
+
+         except Exception as e:
+             print(f"❌ Error processing template '{template_expr}': {e}")
+             print("Skipping this template and continuing...")
+             continue
+
+     print("\n--- Final Consolidated Output (最终合并输出) ---")
+     print(json.dumps(final_output, indent=4))
+
+     generated_expressions = set()
+
+     for template_expression, template_data in final_output.items():
+         placeholder_candidates = template_data["placeholder_candidates"]
+         seed_alpha_settings = template_data["seed_alpha_settings"]
+
+         # Prepare a dictionary holding the candidate list for each placeholder
+         candidates_for_placeholders = {}
+         for placeholder, ph_info in placeholder_candidates.items():
+             # Extract only the 'id', 'value', or 'name' from each candidate
+             if ph_info["type"] == "data_field":
+                 candidates_for_placeholders[placeholder] = [c["id"] for c in ph_info["candidates"]]
+             elif ph_info["type"] in ["integer_parameter", "float_parameter"]:
+                 candidates_for_placeholders[placeholder] = [str(c["value"]) for c in ph_info["candidates"]]
+             elif ph_info["type"] == "group_data_field":
+                 candidates_for_placeholders[placeholder] = [c["name"] for c in ph_info["candidates"]]
+             elif ph_info["type"] == "operator":
+                 candidates_for_placeholders[placeholder] = [c["name"] for c in ph_info["candidates"]]
+             else:
+                 candidates_for_placeholders[placeholder] = []
+
+         # --- Step 3: Generate all alpha expression combinations from the candidates ---
+         placeholder_names = list(candidates_for_placeholders.keys())
+         all_combinations_values = list(itertools.product(*candidates_for_placeholders.values()))
+
+         for combination_values in all_combinations_values:
+
+             # --- ATOM Mode: only combine datafields sharing the same dataset prefix ---
+             datafield_values_in_combo = []
+             placeholder_types = {ph: info["type"] for ph, info in placeholder_candidates.items()}
+
+             for i, placeholder_name in enumerate(placeholder_names):
+                 if placeholder_types.get(placeholder_name) == 'data_field':
+                     datafield_values_in_combo.append(combination_values[i])
+
+             if len(datafield_values_in_combo) > 1:
+                 first_prefix = get_datafield_prefix(datafield_values_in_combo[0])
+                 if not all(get_datafield_prefix(df) == first_prefix for df in datafield_values_in_combo):
+                     continue  # Skip this combination since the prefixes do not match
+
+             current_expression = template_expression
+             for i, placeholder_name in enumerate(placeholder_names):
+                 current_expression = current_expression.replace(placeholder_name, combination_values[i])
+
+             # Check for duplicates before adding
+             if current_expression not in generated_expressions:
+                 generated_expressions.add(current_expression)
+
+     print(f"\n--- Total Unique Generated Alpha Expressions (生成的唯一Alpha表达式总数): {len(generated_expressions)} ---")
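+     # Combination sketch (illustrative values): itertools.product expands every
+     # placeholder's candidate list into a cross-product, and the ATOM-mode prefix
+     # filter above drops combinations mixing datafields from different datasets.
+     #
+     #     candidates_for_placeholders = {"<data_field/>": ["anl44_a", "anl44_b"],
+     #                                    "<lookback/>":   ["20", "60"]}
+     #     # product -> (anl44_a, 20), (anl44_a, 60), (anl44_b, 20), (anl44_b, 60)
+     #     # a combo like ("anl44_a", "mdl175_x") would be skipped: 'anl44' != 'mdl175'
+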
+     # Dump of all unique generated expressions (kept for reference, disabled):
+     # output_filepath = output_dir / "Alpha_generated_expressions.json"
+     # try:
+     #     with output_filepath.open('w', encoding='utf-8') as f:
+     #         json.dump(list(generated_expressions), f, indent=4)
+     #     print(f"\nGenerated expressions successfully written to {output_filepath} (生成的表达式已成功写入)")
+     # except IOError as e:
+     #     print(f"Error writing generated expressions to file {output_filepath} (写入生成的表达式出错): {e}")
+
+     validator = val.ExpressionValidator()
+     print("Starting expression syntax checks. Thanks to the community contribution; original post: https://support.worldquantbrain.com/hc/en-us/community/posts/36740689434391--check%E7%8E%8B-%E9%AA%8C%E8%AF%81%E8%A1%A8%E8%BE%BE%E5%BC%8F%E6%98%AF%E5%90%A6%E6%AD%A3%E7%A1%AE%E7%9A%84%E8%84%9A%E6%9C%AC-%E4%B8%83%E5%8D%81%E4%BA%8C%E5%8F%98%E9%BB%84%E9%87%91%E6%90%AD%E6%A1%A3?page=1#community_comment_36798176158999")
+     print("Note: this check only validates the formal correctness of expressions;\nit does not guarantee logical correctness or executability in actual use.\n")
+     print("Operators missing from the built-in function list cannot be checked; if needed, extend this source code (an AI assistant can help) to add them")
+
+     expressions_data = list(generated_expressions)
+     # Extract the expression list. expressions_data is always a list here, but the
+     # dict branch is kept for compatibility with JSON files shaped like {"expressions": [...]}
+     if isinstance(expressions_data, dict) and "expressions" in expressions_data:
+         expressions = expressions_data["expressions"]
+     elif isinstance(expressions_data, list):
+         expressions = expressions_data
+     else:
+         print("Error: invalid JSON format; an expression list is required")
+         return
+
+     # Validate the expressions
+     valid_expressions = []
+     invalid_expressions = []
+
+     print(f"Validating {len(expressions)} expressions...")
+     for i, expr in enumerate(expressions, 1):
+         if i % 10 == 0:
+             print(f"Validated {i}/{len(expressions)} expressions")
+
+         result = validator.check_expression(expr)
+         if result["valid"]:
+             valid_expressions.append(expr)
+         else:
+             invalid_expressions.append({"expression": expr, "errors": result["errors"]})
+
+     # Build the output file paths
+     name = "Alpha_generated_expressions"
+     valid_output_path = os.path.join(output_dir, f"{name}_success.json")
+     invalid_output_path = os.path.join(output_dir, f"{name}_error.json")
+
+     # Report and save the results as JSON
+     print("\nValidation complete!")
+     print(f"Valid expressions: {len(valid_expressions)}")
+     print(f"Invalid expressions: {len(invalid_expressions)}")
+
+     # Save the valid expressions
+     try:
+         with open(valid_output_path, 'w', encoding='utf-8') as f:
+             json.dump(valid_expressions, f, ensure_ascii=False, indent=2)
+         print(f"Valid expressions saved to: {valid_output_path}")
+     except Exception as e:
+         print(f"Error: failed to save valid expressions - {e}")
+
+     # Save the invalid expressions
+     try:
+         with open(invalid_output_path, 'w', encoding='utf-8') as f:
+             json.dump(invalid_expressions, f, ensure_ascii=False, indent=2)
+         print(f"Invalid expressions saved to: {invalid_output_path} (the file includes error details)")
+         print("Reviewing that file can inspire template fixes: locate the faulty template and edit it in the APP")
+     except Exception as e:
+         print(f"Error: failed to save invalid expressions - {e}")
+
+     print("Reminder: these files only verify the formal correctness of expressions;\nthey do not guarantee logical correctness or executability in actual use.\n")
+     print("Operators missing from the built-in function list cannot be checked; if needed, extend the validator source code (an AI assistant can help) to add them")
+
+     print("Different models perform differently; the default kimi model may produce Alpha syntax errors, so inspect the generated template files to screen them")
+     print("Next, download the completed templates and load them on the APP home page for parsing and syntax checks; after generating expressions, manually backtesting them is strongly recommended")
+
+
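+ # Validator result sketch (shape inferred from its use above; the exact error
+ # messages depend on the community ExpressionValidator implementation):
+ #
+ #     result = val.ExpressionValidator().check_expression("ts_mean(close, 20)")
+ #     # result["valid"]  -> True or False
+ #     # result["errors"] -> list of error messages when invalid
+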
+ async def main():
+     """
+     Main execution function.
+     """
+
+     # Check for a command-line argument pointing to a config file
+     if len(sys.argv) > 1:
+         config_path = sys.argv[1]
+         if os.path.exists(config_path):
+             try:
+                 with open(config_path, 'r', encoding='utf-8') as f:
+                     config = json.load(f)
+                 print(f"✓ Loaded config from command-line argument: {config_path}")
+                 # Ensure all required fields are present, or set defaults
+                 if 'top_n_datafield' not in config:
+                     config['top_n_datafield'] = 50
+                 if 'template_summary_path' not in config:
+                     config['template_summary_path'] = None
+             except Exception as e:
+                 print(f"❌ Failed to load the config file: {e}")
+                 sys.exit(1)
+         else:
+             print(f"❌ Config file does not exist: {config_path}")
+             sys.exit(1)
+     else:
+         # --- Step 0: Collect configuration interactively ---
+         print("Press Enter to load transformer_config.json from this folder; otherwise type any other key and press Enter to provide account information interactively")
+         input_str = input()
+         if input_str == "":
+             config_path = os.path.join(os.path.dirname(__file__), 'transformer_config.json')
+             with open(config_path, 'r') as f:
+                 config = json.load(f)
+             print("\n" + "="*60)
+             print("✓ Account configuration loaded from transformer_config.json")
+             print("="*60 + "\n")
+
+             # Continue collecting runtime parameters interactively
+             # 1. Ask for the template summary file path
+             print("[1/3] Template summary file configuration")
+             print("Using your own template summary document is strongly recommended; the results will be better")
+             print("Tip: if you have a template_summary .txt or .md file, enter its full path")
+             print("     If not, press Enter to use the built-in template summary")
+             template_path = input("Template summary file path (press Enter for the built-in summary): ").strip()
+             config['template_summary_path'] = template_path if template_path else None
+             if template_path:
+                 print(f"✓ Will try to load from file: {template_path}\n")
+             else:
+                 print("✓ Using the built-in template summary\n")
+
+             # 2. Ask for the Alpha ID
+             print("[2/3] Alpha ID configuration")
+             alpha_id = input("Enter the Alpha ID to process: ").strip()
+             if not alpha_id:
+                 print("❌ Error: the Alpha ID cannot be empty")
+                 sys.exit(1)
+             config['alpha_id'] = alpha_id
+             print(f"✓ Alpha ID: {alpha_id}\n")
+
+             # 3. Ask for the Top N parameter (data fields only)
+             print("[3/3] Candidate count configuration (Top N)")
+             print("Tip: this parameter controls how many data-field candidates are gathered per placeholder")
+             default_datafield_topn = 50
+             datafield_topn_input = input(f"Data-field candidate count (press Enter for the default: {default_datafield_topn}): ").strip()
+             try:
+                 config['top_n_datafield'] = int(datafield_topn_input) if datafield_topn_input else default_datafield_topn
+             except ValueError:
+                 print(f"⚠ Warning: invalid input; using the default: {default_datafield_topn}")
+                 config['top_n_datafield'] = default_datafield_topn
+             print(f"✓ Data-field candidate count: {config['top_n_datafield']}\n")
+
+             print("="*60)
+             print("Configuration complete! Starting...")
+             print("="*60 + "\n")
+         else:
+             config = interactive_input()
+
+     # Set the module-level globals
+     global LLM_model_name, LLM_API_KEY, llm_base_url, username, password
+     LLM_model_name = config['LLM_model_name']
+     LLM_API_KEY = config['LLM_API_KEY']
+     llm_base_url = config['llm_base_url']
+     username = config['username']
+     password = config['password']
+
+     # --- Step 1: Load the template summary ---
+     template_summary = load_template_summary(config.get('template_summary_path'))
+
+     # --- Step 2: Start the BRAIN session ---
+     print("--- Starting BRAIN session... ---")
+     s = start_session()
+
+     # --- Step 3: Authenticate the LLM Gateway ---
+     llm_client = None
+     print("--- Authenticating LLM Gateway... ---")
+     try:
+         llm_api_key = get_token_from_auth_server()
+         llm_base_url_value = llm_base_url
+         llm_client = openai.AsyncOpenAI(base_url=llm_base_url_value, api_key=llm_api_key)
+         print("✓ LLM Gateway authentication succeeded")
+     except Exception as e:
+         print(f"❌ LLM Gateway authentication failed: {e}")
+         sys.exit(1)
+
+     # --- Step 4: Fetch the Alpha details ---
+     alpha_id = config['alpha_id']
+     print(f"\n--- Fetching details for Alpha ID: {alpha_id}... ---")
+
+     # --- Step 4.5: Choose the datafield search scope ---
+     if len(sys.argv) > 1:
+         user_datafield_config = {
+             'user_region': config.get('user_region'),
+             'user_universe': config.get('user_universe'),
+             'user_delay': config.get('user_delay'),
+             'user_category': config.get('user_category'),
+             'user_data_type': config.get('user_data_type', 'MATRIX')
+         }
+     else:
+         user_datafield_config = interactive_datafield_selection(s)
+
+     details_str = await generate_alpha_description(alpha_id, brain_session=s)
+     await generate_new_alphas(
+         alpha_description=details_str,
+         brain_session=s,
+         template_summary=template_summary,
+         top_n_datafield=config.get('top_n_datafield', 50),
+         user_region=user_datafield_config.get('user_region'),
+         user_universe=user_datafield_config.get('user_universe'),
+         user_delay=user_datafield_config.get('user_delay'),
+         user_category=user_datafield_config.get('user_category'),
+         user_data_type=user_datafield_config.get('user_data_type', 'MATRIX')
+     )
+
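+ # Sketch of a transformer_config.json consistent with the keys read in main()
+ # above (every value below is a placeholder, not a real credential or endpoint):
+ #
+ #     {
+ #         "LLM_model_name": "kimi",
+ #         "LLM_API_KEY": "sk-placeholder",
+ #         "llm_base_url": "https://your-gateway.example/v1",
+ #         "username": "you@example.com",
+ #         "password": "******",
+ #         "alpha_id": "YOUR_ALPHA_ID",
+ #         "top_n_datafield": 50,
+ #         "template_summary_path": null,
+ #         "user_region": null,
+ #         "user_universe": null,
+ #         "user_delay": null,
+ #         "user_category": null,
+ #         "user_data_type": "MATRIX"
+ #     }
+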
+ def interactive_datafield_selection(s: SingleSession) -> dict:
+     """
+     Interactively ask the user for the datafield search configuration (Region, Universe, Delay).
+     """
+     print("\n" + "="*60)
+     print("[Extra configuration] Datafield search scope")
+     print("Fetching the valid Region/Universe/Delay combinations...")
+
+     try:
+         df = get_instrument_type_region_delay(s)
+     except Exception as e:
+         print(f"⚠ Failed to fetch the configuration options: {e}")
+         print("Falling back to the seed alpha's default settings")
+         return {}
+
+     # Filter for EQUITY only, as per the current logic
+     df_equity = df[df['InstrumentType'] == 'EQUITY']
+
+     if df_equity.empty:
+         print("No configuration options found for instrument type EQUITY.")
+         return {}
+
+     # 1. Select Region
+     regions = df_equity['Region'].unique().tolist()
+     print(f"\nAvailable regions: {regions}")
+     region_input = input("Enter a region (press Enter to use the seed alpha's default): ").strip()
+
+     selected_region = None
+     if region_input:
+         if region_input in regions:
+             selected_region = region_input
+         else:
+             print("⚠ Invalid input; the default will be used")
+
+     # 2. Select Delay
+     # If a region was selected, only list the delays valid for that region
+     if selected_region:
+         delays = df_equity[df_equity['Region'] == selected_region]['Delay'].unique().tolist()
+     else:
+         delays = df_equity['Delay'].unique().tolist()
+
+     print(f"\nAvailable delays: {delays}")
+     delay_input = input("Enter a delay (press Enter to use the seed alpha's default): ").strip()
+
+     selected_delay = None
+     if delay_input:
+         try:
+             d_val = int(delay_input)
+             if d_val in delays:
+                 selected_delay = d_val
+             else:
+                 print("⚠ Input not in the list; the default will be used")
+         except ValueError:
+             print("⚠ Invalid input; the default will be used")
+
+     # 3. Select Universe
+     # If a region and delay were selected, filter the universes accordingly
+     if selected_region and selected_delay is not None:
+         subset = df_equity[(df_equity['Region'] == selected_region) & (df_equity['Delay'] == selected_delay)]
+         if not subset.empty:
+             universes = subset.iloc[0]['Universe']
+         else:
+             universes = []
+     else:
+         # Just show all unique universes if we cannot filter precisely
+         universes = set()
+         for u_list in df_equity['Universe']:
+             universes.update(u_list)
+         universes = list(universes)
+
+     print(f"\nAvailable universes: {universes}")
+     universe_input = input("Enter a universe (press Enter to use the seed alpha's default): ").strip()
+
+     selected_universe = None
+     if universe_input:
+         if universe_input in universes:
+             selected_universe = universe_input
+         else:
+             print("⚠ Invalid input; the default will be used")
+
+     # 4. Select Category
+     print("\nFetching data categories...")
+     categories = get_data_categories(s)
+
+     selected_category = None
+     if categories:
+         print("\nAvailable categories:")
+         for i, cat in enumerate(categories):
+             print(f"{i+1}. {cat['name']} (ID: {cat['id']})")
+
+         cat_input = input("Enter category numbers or IDs (comma-separated; press Enter for no filter): ").strip()
+
+         if cat_input:
+             selected_categories = []
+             inputs = [x.strip() for x in cat_input.split(',')]
+
+             for inp in inputs:
+                 # Check whether the input is an index
+                 if inp.isdigit():
+                     idx = int(inp) - 1
+                     if 0 <= idx < len(categories):
+                         selected_categories.append(categories[idx]['id'])
+                         print(f"Selected category: {categories[idx]['name']}")
+                 else:
+                     # Check whether the input is an ID
+                     found = False
+                     for cat in categories:
+                         if cat['id'] == inp:
+                             selected_categories.append(cat['id'])
+                             print(f"Selected category: {cat['name']}")
+                             found = True
+                             break
+                     if not found:
+                         print(f"⚠ Invalid input: {inp}")
+
+             if selected_categories:
+                 selected_category = selected_categories
+             else:
+                 print("⚠ No valid category selected; no category filter will be applied")
+     else:
+         print("⚠ Could not fetch the category list; skipping category selection")
+
+     # 5. Select Data Type
+     print("\nAvailable data types: [MATRIX, VECTOR]")
+     data_type_input = input("Enter a data type (press Enter for the default, MATRIX): ").strip().upper()
+
+     selected_data_type = "MATRIX"
+     if data_type_input == "VECTOR":
+         print("⚠ Warning: make sure the prototype alpha uses vector operators correctly, otherwise data-type errors are very likely")
+         confirm = input("Confirm VECTOR? (y/n): ").strip().lower()
+         if confirm == 'y':
+             selected_data_type = "VECTOR"
+         else:
+             print("VECTOR cancelled; using the default MATRIX")
+     elif data_type_input and data_type_input != "MATRIX":
+         print("⚠ Invalid input; the default MATRIX will be used")
+
+     return {
+         'user_region': selected_region,
+         'user_universe': selected_universe,
+         'user_delay': selected_delay,
+         'user_category': selected_category,
+         'user_data_type': selected_data_type
+     }
+
+ if __name__ == "__main__":
+     # On Windows (Python 3.8+), prefer the selector event loop policy so that
+     # libraries which do not support the default Proactor loop keep working
+     if sys.platform.startswith('win') and sys.version_info[:2] >= (3, 8):
+         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
+     asyncio.run(main())