cnhkmcp 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. cnhkmcp/__init__.py +1 -1
  2. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/README.md +1 -1
  3. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/config.json +2 -2
  4. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/main.py +1 -1
  5. cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/vector_db/chroma.sqlite3 +0 -0
  6. cnhkmcp/untracked/APP/Tranformer/Transformer.py +2 -2
  7. cnhkmcp/untracked/APP/Tranformer/transformer_config.json +1 -1
  8. cnhkmcp/untracked/APP/blueprints/feature_engineering.py +2 -2
  9. cnhkmcp/untracked/APP/blueprints/inspiration_house.py +4 -4
  10. cnhkmcp/untracked/APP/blueprints/paper_analysis.py +3 -3
  11. cnhkmcp/untracked/APP/give_me_idea/BRAIN_Alpha_Template_Expert_SystemPrompt.md +34 -73
  12. cnhkmcp/untracked/APP/give_me_idea/alpha_data_specific_template_master.py +2 -2
  13. cnhkmcp/untracked/APP/give_me_idea/what_is_Alpha_template.md +366 -1
  14. cnhkmcp/untracked/APP/static/inspiration.js +345 -13
  15. cnhkmcp/untracked/APP/templates/index.html +11 -3
  16. cnhkmcp/untracked/APP/templates/transformer_web.html +1 -1
  17. cnhkmcp/untracked/APP/trailSomeAlphas/README.md +38 -0
  18. cnhkmcp/untracked/APP/trailSomeAlphas/ace.log +66 -0
  19. cnhkmcp/untracked/APP/trailSomeAlphas/enhance_template.py +588 -0
  20. cnhkmcp/untracked/APP/trailSomeAlphas/requirements.txt +3 -0
  21. cnhkmcp/untracked/APP/trailSomeAlphas/run_pipeline.py +1001 -0
  22. cnhkmcp/untracked/APP/trailSomeAlphas/run_pipeline_step_by_step.ipynb +5258 -0
  23. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/OUTPUT_TEMPLATE.md +325 -0
  24. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/SKILL.md +503 -0
  25. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/examples.md +244 -0
  26. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/output_report/ASI_delay1_analyst11_ideas.md +285 -0
  27. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/reference.md +399 -0
  28. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/SKILL.md +40 -0
  29. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/config.json +6 -0
  30. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709385783386000.json +388 -0
  31. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709386274840400.json +131 -0
  32. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709386838244700.json +1926 -0
  33. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709387369198500.json +31 -0
  34. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709387908905800.json +1926 -0
  35. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709388486243600.json +240 -0
  36. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709389024058600.json +1926 -0
  37. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709389549608700.json +41 -0
  38. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709390068714000.json +110 -0
  39. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709390591996900.json +36 -0
  40. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709391129137100.json +31 -0
  41. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709391691643500.json +41 -0
  42. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709392192099200.json +31 -0
  43. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709392703423500.json +46 -0
  44. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709393213729400.json +246 -0
  45. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710186683932500.json +388 -0
  46. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710187165414300.json +131 -0
  47. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710187665211700.json +1926 -0
  48. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710188149193400.json +31 -0
  49. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710188667627400.json +1926 -0
  50. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710189220822000.json +240 -0
  51. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710189726189500.json +1926 -0
  52. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710190248066100.json +41 -0
  53. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710190768298700.json +110 -0
  54. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710191282588100.json +36 -0
  55. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710191838960900.json +31 -0
  56. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710192396688000.json +41 -0
  57. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710192941922400.json +31 -0
  58. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710193473524600.json +46 -0
  59. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710194001961200.json +246 -0
  60. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710420975888800.json +46 -0
  61. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710421647590100.json +196 -0
  62. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710422131378500.json +5 -0
  63. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710422644184400.json +196 -0
  64. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710423702350600.json +196 -0
  65. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710424244661800.json +5 -0
  66. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_delay1.csv +211 -0
  67. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/final_expressions.json +7062 -0
  68. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/ace.log +3 -0
  69. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/ace_lib.py +1514 -0
  70. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/fetch_dataset.py +113 -0
  71. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/helpful_functions.py +180 -0
  72. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/implement_idea.py +236 -0
  73. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/merge_expression_list.py +90 -0
  74. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/parsetab.py +60 -0
  75. cnhkmcp/untracked/APP/trailSomeAlphas/skills/template_final_enhance/op/321/206/320/220/342/225/227/321/207/342/225/227/320/243.md +434 -0
  76. cnhkmcp/untracked/APP/trailSomeAlphas/skills/template_final_enhance/sample_prompt.md +62 -0
  77. cnhkmcp/untracked/APP/trailSomeAlphas/skills/template_final_enhance//321/205/320/235/320/245/321/205/320/253/320/260/321/205/320/275/320/240/321/206/320/220/320/255/321/210/320/220/320/223/321/211/320/220/342/225/227/321/210/342/225/233/320/241/321/211/320/243/342/225/233.md +354 -0
  78. cnhkmcp/untracked/APP/usage.md +2 -2
  79. cnhkmcp/untracked/APP//321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/231/320/243/321/205/342/225/235/320/220/321/206/320/230/320/241.py +388 -8
  80. cnhkmcp/untracked/skills/alpha-expression-verifier/scripts/validator.py +889 -0
  81. cnhkmcp/untracked/skills/brain-feature-implementation/scripts/implement_idea.py +4 -3
  82. cnhkmcp/untracked/skills/brain-improve-alpha-performance/arXiv_API_Tool_Manual.md +490 -0
  83. cnhkmcp/untracked/skills/brain-improve-alpha-performance/reference.md +1 -1
  84. cnhkmcp/untracked/skills/brain-improve-alpha-performance/scripts/arxiv_api.py +229 -0
  85. cnhkmcp/untracked//321/211/320/225/320/235/321/207/342/225/234/320/276/321/205/320/231/320/235/321/210/342/224/220/320/240/321/210/320/261/320/234/321/206/320/230/320/241_/321/205/320/276/320/231/321/210/320/263/320/225/321/205/342/224/220/320/225/321/210/320/266/320/221/321/204/342/225/233/320/255/321/210/342/225/241/320/246/321/205/320/234/320/225.py +35 -11
  86. cnhkmcp/vector_db/_manifest.json +1 -0
  87. cnhkmcp/vector_db/_meta.json +1 -0
  88. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.0.dist-info}/METADATA +1 -1
  89. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.0.dist-info}/RECORD +96 -30
  90. /cnhkmcp/untracked/{skills/expression_verifier → APP/trailSomeAlphas/skills/brain-feature-implementation}/scripts/validator.py +0 -0
  91. /cnhkmcp/untracked/skills/{expression_verifier → alpha-expression-verifier}/SKILL.md +0 -0
  92. /cnhkmcp/untracked/skills/{expression_verifier → alpha-expression-verifier}/scripts/verify_expr.py +0 -0
  93. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.0.dist-info}/WHEEL +0 -0
  94. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.0.dist-info}/entry_points.txt +0 -0
  95. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.0.dist-info}/licenses/LICENSE +0 -0
  96. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1001 @@
+import argparse
+import datetime as dt
+import json
+import os
+import re
+import subprocess
+import sys
+import csv
+import time
+from pathlib import Path
+
+import requests
+
+# Ensure UTF-8 stdout on Windows to avoid UnicodeEncodeError.
+try:
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+except Exception:
+    pass
+
+BASE_DIR = Path(__file__).resolve().parent
+SKILLS_DIR = BASE_DIR / "skills"
+FEATURE_ENGINEERING_DIR = SKILLS_DIR / "brain-data-feature-engineering"
+FEATURE_IMPLEMENTATION_DIR = SKILLS_DIR / "brain-feature-implementation"
+FEATURE_IMPLEMENTATION_SCRIPTS = FEATURE_IMPLEMENTATION_DIR / "scripts"
+
+sys.path.insert(0, str(FEATURE_IMPLEMENTATION_SCRIPTS))
+try:
+    import ace_lib  # type: ignore
+except Exception as exc:
+    raise SystemExit(f"Failed to import ace_lib from {FEATURE_IMPLEMENTATION_SCRIPTS}: {exc}")
+try:
+    from validator import ExpressionValidator  # type: ignore
+except Exception as exc:
+    raise SystemExit(f"Failed to import ExpressionValidator from {FEATURE_IMPLEMENTATION_SCRIPTS}: {exc}")
+
+def load_brain_credentials(config_path: Path) -> tuple[str, str]:
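+    # Expects a config.json of the form (placeholder values):
+    #   {"BRAIN_CREDENTIALS": {"email": "...", "password": "..."}}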
+    if not config_path.exists():
+        raise FileNotFoundError(f"Config not found: {config_path}")
+    with config_path.open("r", encoding="utf-8") as f:
+        data = json.load(f)
+    creds = data.get("BRAIN_CREDENTIALS", {})
+    email = creds.get("email")
+    password = creds.get("password")
+    if not email or not password:
+        raise ValueError("BRAIN_CREDENTIALS missing in config.json")
+    return email, password
+
+def load_brain_credentials_from_env_or_args(username: str | None, password: str | None, config_path: Path) -> tuple[str, str]:
+    env_user = os.environ.get("BRAIN_USERNAME") or os.environ.get("BRAIN_EMAIL")
+    env_pass = os.environ.get("BRAIN_PASSWORD")
+    final_user = username or env_user
+    final_pass = password or env_pass
+    if final_user and final_pass:
+        return final_user, final_pass
+    return load_brain_credentials(config_path)
+
+def start_brain_session(email: str, password: str):
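+    # Monkey-patch ace_lib.get_credentials so start_session() picks up the
+    # credentials resolved above instead of doing its own lookup.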
+    ace_lib.get_credentials = lambda: (email, password)
+    return ace_lib.start_session()
+
+def pick_first_present_column(df, candidates):
+    for c in candidates:
+        if c in df.columns:
+            return c
+    # Also try a case-insensitive match.
+    lower_map = {col.lower(): col for col in df.columns}
+    for c in candidates:
+        if c.lower() in lower_map:
+            return lower_map[c.lower()]
+    return None
+
+
+def select_dataset(datasets_df, data_category: str, dataset_id: str | None):
+    if dataset_id:
+        return dataset_id, None, None, datasets_df
+
+    category_col = pick_first_present_column(
+        datasets_df,
+        ["category", "data_category", "dataCategory", "category_name", "dataCategory_name"],
+    )
+
+    filtered = datasets_df
+    if category_col:
+        filtered = datasets_df[datasets_df[category_col].astype(str).str.lower() == data_category.lower()]
+
+    if filtered.empty:
+        filtered = datasets_df
+
+    id_col = pick_first_present_column(filtered, ["id", "dataset_id", "datasetId"])
+    name_col = pick_first_present_column(filtered, ["name", "dataset_name", "datasetName"])
+    desc_col = pick_first_present_column(filtered, ["description", "desc", "dataset_description"])
+
+    if not id_col:
+        raise ValueError("Unable to locate dataset id column from dataset list")
+
+    row = filtered.iloc[0]
+    return row[id_col], row.get(name_col) if name_col else None, row.get(desc_col) if desc_col else None, datasets_df
+
+
+def build_field_summary(fields_df, max_fields: int | None = None, default_sample_size: int = 50):
+    id_col = pick_first_present_column(fields_df, ["id", "field_id", "fieldId"])
+    desc_col = pick_first_present_column(fields_df, ["description", "desc"])
+
+    if max_fields is None:
+        # If the user did NOT specify --max-fields, randomly sample 50 rows for the prompt.
+        # If there are fewer than 50 rows, pass all of them.
+        total = int(fields_df.shape[0])
+        n = min(default_sample_size, total)
+        subset = fields_df if n >= total else fields_df.sample(n=n, random_state=42)
+    else:
+        # If the user specified --max-fields, pass the TOP N rows.
+        total = int(fields_df.shape[0])
+        n = min(int(max_fields), total)
+        subset = fields_df.head(n)
+
+    rows = []
+    for _, row in subset.iterrows():
+        rows.append(
+            {
+                "id": row.get(id_col),
+                "description": row.get(desc_col)
+            }
+        )
+    return rows, fields_df.shape[0]
+
+
+def read_text_optional(path: Path) -> str:
+    try:
+        return path.read_text(encoding="utf-8")
+    except Exception:
+        return ""
+
+
+def build_allowed_metric_suffixes(fields_df, max_suffixes: int = 300) -> list[str]:
+    """Derive a practical list of placeholder candidates from dataset field ids.
+
+    `implement_idea.py` matches `{variable}` by searching for that substring in the
+    field id and then using the *base* (everything before the first occurrence) to
+    align the other variables. In practice, good placeholders tend to be the
+    trailing 2-5 underscore-joined tokens.
+    """
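+
+    # Worked example (hypothetical ids): fields like "anl11_mean_eps_est" and
+    # "anl11_high_eps_est" share the suffix "eps_est"; the bases "anl11_mean_"
+    # and "anl11_high_" are what implement_idea.py uses to align variables.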
+
+    id_col = pick_first_present_column(fields_df, ["id", "field_id", "fieldId"])
+    if not id_col:
+        return []
+
+    field_ids = fields_df[id_col].dropna().astype(str).tolist()
+    dataset_code = detect_dataset_code(field_ids)
+
+    counts: dict[str, int] = {}
+    for raw in field_ids:
+        parts = [p for p in str(raw).split("_") if p]
+        if len(parts) < 2:
+            continue
+
+        # Collect suffix candidates from the tail.
+        # Prefer multi-token names, but allow single-token suffixes when they're
+        # specific enough (e.g., "inventories").
+        # IMPORTANT: never allow the "suffix" to equal the full id (that would
+        # encourage the LLM to emit {full_field_id}, violating the suffix-only rule).
+        for n in range(1, min(6, len(parts))):
+            suffix = "_".join(parts[-n:])
+            # Filter out overly generic / numeric suffixes.
+            if suffix.replace("_", "").isdigit():
+                continue
+            if dataset_code and suffix.lower().startswith(dataset_code.lower() + "_"):
+                continue
+            if n == 1 and len(suffix) < 8:
+                continue
+            if len(suffix) < 6:
+                continue
+            counts[suffix] = counts.get(suffix, 0) + 1
+
+    # Prefer suffixes that show up multiple times and have underscores.
+    ranked = sorted(
+        counts.items(),
+        key=lambda kv: (kv[1], kv[0].count("_"), len(kv[0])),
+        reverse=True,
+    )
+
+    suffixes: list[str] = []
+    for suffix, _ in ranked:
+        if suffix not in suffixes:
+            suffixes.append(suffix)
+        if len(suffixes) >= max_suffixes:
+            break
+    return suffixes
+
+
+def build_allowed_suffixes_from_ids(dataset_ids: list[str], max_suffixes: int = 300) -> list[str]:
+    """Build suffix candidates from downloaded dataset ids.
+
+    This is used to normalize/validate templates for `implement_idea.py`.
+    """
+
+    counts: dict[str, int] = {}
+    for raw in dataset_ids:
+        parts = [p for p in str(raw).split("_") if p]
+        if len(parts) < 2:
+            continue
+        for n in range(1, min(5, len(parts)) + 1):
+            suffix = "_".join(parts[-n:])
+            if suffix.replace("_", "").isdigit():
+                continue
+            if n == 1 and len(suffix) < 8:
+                continue
+            if len(suffix) < 6:
+                continue
+            counts[suffix] = counts.get(suffix, 0) + 1
+
+    ranked = sorted(
+        counts.items(),
+        key=lambda kv: (kv[1], kv[0].count("_"), len(kv[0])),
+        reverse=True,
+    )
+
+    suffixes: list[str] = []
+    for suffix, _ in ranked:
+        if suffix not in suffixes:
+            suffixes.append(suffix)
+        if len(suffixes) >= max_suffixes:
+            break
+    return suffixes
+
+
+def detect_dataset_code(dataset_ids: list[str]) -> str | None:
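+    # Majority vote over the first underscore-separated token, e.g. (hypothetical)
+    # ["anl11_high_est", "anl11_low_est"] -> "anl11".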
+    if not dataset_ids:
+        return None
+    counts: dict[str, int] = {}
+    for fid in dataset_ids:
+        tok = (str(fid).split("_", 1)[0] or "").strip()
+        if tok:
+            counts[tok] = counts.get(tok, 0) + 1
+    if not counts:
+        return None
+    return max(counts.items(), key=lambda kv: kv[1])[0]
+
+def ensure_metadata_block(markdown_text: str, dataset_id: str, region: str, delay: int) -> str:
+    """Ensure the ideas markdown contains the metadata block used by the pipeline."""
+
+    has_dataset = re.search(r"^\*\*Dataset\*\*:\s*\S+", markdown_text, flags=re.MULTILINE) is not None
+    has_region = re.search(r"^\*\*Region\*\*:\s*\S+", markdown_text, flags=re.MULTILINE) is not None
+    has_delay = re.search(r"^\*\*Delay\*\*:\s*\d+", markdown_text, flags=re.MULTILINE) is not None
+    if has_dataset and has_region and has_delay:
+        return markdown_text
+
+    block = [
+        "",
+        f"**Dataset**: {dataset_id}",
+        f"**Region**: {region}",
+        f"**Delay**: {delay}",
+        "",
+    ]
+
+    # Insert the block right after the first non-empty line (usually the title).
+    lines = markdown_text.splitlines()
+    insert_at = 0
+    for i, line in enumerate(lines[:10]):
+        if line.strip():
+            insert_at = i + 1
+            break
+    new_lines = lines[:insert_at] + block + lines[insert_at:]
+    return "\n".join(new_lines).lstrip("\n")
+
+def compress_to_known_suffix(var: str, allowed_suffixes: list[str]) -> str | None:
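+    # Longest match wins: with known suffixes ["eps_est", "est"] (hypothetical),
+    # "mean_eps_est" compresses to "eps_est".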
+    v = var.lower()
+    for sfx in sorted(allowed_suffixes, key=len, reverse=True):
+        if v.endswith(sfx.lower()):
+            return sfx
+    return None
+
+def placeholder_is_reasonably_matchable(var: str, dataset_ids: list[str]) -> bool:
+    """Heuristic check that a placeholder is likely to match real ids.
+
+    We avoid treating very short tokens as valid unless they match a token boundary.
+    """
+
+    v = var
+    if len(v) <= 3:
+        pat = re.compile(rf"(^|_){re.escape(v)}(_|$)", flags=re.IGNORECASE)
+        return any(pat.search(str(fid)) for fid in dataset_ids)
+    return any(v in str(fid) for fid in dataset_ids)
+
+def normalize_template_placeholders(
+    template: str,
+    dataset_ids: list[str],
+    allowed_suffixes: list[str],
+    dataset_code: str | None,
+) -> tuple[str, bool]:
+    """Normalize placeholders to suffix-only form, without dataset-specific aliasing.
+
+    - Strips the dataset code prefix (e.g. fnd72_*) when present.
+    - Compresses placeholders to the longest known suffix.
+    - Returns (normalized_template, is_valid).
+    """
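+
+    # Example (hypothetical): with dataset code "fnd72" and known suffix
+    # "sales_growth", "{fnd72_q_sales_growth}" normalizes to "{sales_growth}".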
+
+    vars_in_template = re.findall(r"\{([A-Za-z0-9_]+)\}", template)
+    if not vars_in_template:
+        return template, False
+
+    mapping: dict[str, str] = {}
+    for var in set(vars_in_template):
+        new_var = var
+        if dataset_code and new_var.lower().startswith(dataset_code.lower() + "_"):
+            new_var = new_var[len(dataset_code) + 1 :]
+
+        compressed = compress_to_known_suffix(new_var, allowed_suffixes)
+        if compressed:
+            new_var = compressed
+
+        mapping[var] = new_var
+
+    normalized = template
+    for src, dst in mapping.items():
+        normalized = normalized.replace("{" + src + "}", "{" + dst + "}")
+
+    # Validate: every placeholder should look matchable in real ids.
+    vars_after = re.findall(r"\{([A-Za-z0-9_]+)\}", normalized)
+    ok = all(placeholder_is_reasonably_matchable(v, dataset_ids) for v in vars_after)
+    return normalized, ok
+
+def build_prompt(
+    dataset_id: str,
+    dataset_name: str | None,
+    dataset_description: str | None,
+    data_category: str,
+    region: str,
+    delay: int,
+    universe: str,
+    fields_summary: list[dict],
+    field_count: int,
+    feature_engineering_skill_md: str,
+    feature_implementation_skill_md: str,
+    allowed_metric_suffixes: list[str],
+    allowed_operators,
+):
+    # NOTE: The user requested that we DO NOT invent our own system prompt.
+    # Instead, we embed the two skill specs as the authoritative instructions.
+    system_prompt = "\n".join(
+        [
+            "You are executing two skills in sequence:",
+            "1) brain-data-feature-engineering",
+            "2) brain-feature-implementation",
+            "The following SKILL.md documents are authoritative; follow them exactly.",
+            "",
+            "--- SKILL.md (brain-data-feature-engineering) ---",
+            feature_engineering_skill_md.strip(),
+            "",
+            "--- SKILL.md (brain-feature-implementation) ---",
+            feature_implementation_skill_md.strip(),
+            "------",
+            f'"allowed_operators": {allowed_operators}',
+            "-------",
+            f'"allowed_placeholders": {allowed_metric_suffixes}',
+            "",
+            "CRITICAL OUTPUT RULES (to ensure implement_idea.py can generate expressions):",
+            "- Every Implementation Example MUST be a Python format template using {variable}.",
+            "- Every {variable} MUST come from the allowed_placeholders list provided in user content.",
+            "- When you implement ideas, ONLY use operators from the allowed_operators provided.",
+            "- Do NOT include dataset codes/prefixes/horizons in {variable} (suffix-only).",
+            "- If you show raw field ids in tables, use backticks `like_this`, NOT {braces}.",
+            "- Include these metadata lines verbatim somewhere near the top:",
+            "  **Dataset**: <dataset_id>",
+            "  **Region**: <region>",
+            "  **Delay**: <delay>",
+        ]
+    )
+
+    user_prompt = {
+        "instructions": {
+            "output_format": "Fill OUTPUT_TEMPLATE.md with concrete content.",
+            "implementation_examples": (
+                "Each Implementation Example must be a template with {variable} placeholders. "
+                "Use only placeholders from allowed_placeholders. "
+                "Use suffix-only names; do not include dataset code/prefix/horizon."
+            ),
+            "no_code_fences": True,
+            "do_not_invent_placeholders": True,
+        },
+        "dataset_context": {
+            "dataset_id": dataset_id,
+            "dataset_name": dataset_name,
+            "dataset_description": dataset_description,
+            "category": data_category,
+            "region": region,
+            "delay": delay,
+            "universe": universe,
+            "field_count": field_count,
+        },
+        "fields": fields_summary,
+    }
+    # print(user_prompt)  # for debug
+    # print(system_prompt)  # for debug
+    return system_prompt, json.dumps(user_prompt, ensure_ascii=False, indent=2)
+
+
+def _vector_ratio_from_datafields_df(datafields_df) -> float:
+    if datafields_df is None or getattr(datafields_df, "empty", True):
+        return 0.0
+    dtype_col = pick_first_present_column(datafields_df, ["type", "dataType", "data_type"])
+    if not dtype_col:
+        return 0.0
+    counts = datafields_df[dtype_col].astype(str).value_counts().to_dict()
+    vector_count = counts.get("VECTOR", 0)
+    total = sum(counts.values())
+    return (vector_count / total) if total else 0.0
+
+
+def filter_operators_df(operators_df, keep_vector: bool):
+    """Apply user-confirmed operator filters.
+
+    Rules:
+    - Keep only scope == REGULAR
+    - Drop category == Group
+    - Keep category == Vector only if keep_vector is True
+    - Drop names matching /rank|neutral|normal|scal|zscore/i
+    """
+
+    df = operators_df.copy()
+
+    name_col = pick_first_present_column(df, ["name", "operator", "op", "id"])
+    scope_col = pick_first_present_column(df, ["scope", "scopes"])
+    category_col = pick_first_present_column(df, ["category", "group", "type"])
+    desc_col = pick_first_present_column(df, ["description", "desc", "help", "doc", "documentation"])
+    definition_col = pick_first_present_column(df, ["definition", "syntax"])
+
+    if scope_col:
+        df = df[df[scope_col].astype(str).str.upper() == "REGULAR"]
+
+    if category_col:
+        df = df[df[category_col].astype(str).str.lower() != "group"]
+        if not keep_vector:
+            df = df[df[category_col].astype(str).str.lower() != "vector"]
+
+    if name_col:
+        banned = re.compile(r"(?:rank|neutral|normal|scal|zscore)", flags=re.IGNORECASE)
+        df = df[~df[name_col].astype(str).str.contains(banned, na=False)]
+
+        # De-dup by operator name (only possible when a name column was found).
+        df = df.drop_duplicates(subset=[name_col]).reset_index(drop=True)
+
+    cols = [c for c in [name_col, category_col, scope_col, desc_col, definition_col] if c]
+    allowed = []
+    for _, row in df.iterrows():
+        item = {
+            "name": row.get(name_col) if name_col else None,
+            "category": row.get(category_col) if category_col else None,
+            "scope": row.get(scope_col) if scope_col else None,
+            "description": row.get(desc_col) if desc_col else None,
+            "definition": row.get(definition_col) if definition_col else None,
+        }
+        # Drop None keys to keep the prompt compact.
+        allowed.append({k: v for k, v in item.items() if v is not None})
+
+    return df, allowed, cols
+
+def call_moonshot(api_key: str, model: str, system_prompt: str, user_prompt: str, timeout_s: int = 120):
+    base_url = os.environ.get("MOONSHOT_BASE_URL", "https://api.moonshot.cn/v1")
+    url = f"{base_url.rstrip('/')}/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        # Default to streaming so the user can observe model progress.
+        "stream": True,
+    }
+
+    retries = int(os.environ.get("MOONSHOT_RETRIES", "2"))
+    backoff_s = float(os.environ.get("MOONSHOT_RETRY_BACKOFF", "2"))
+
+    def _stream_sse_and_collect(resp: requests.Response) -> str:
+        """Read an OpenAI-compatible SSE stream and print deltas live.
+
+        Still returns the full accumulated assistant content so existing callers
+        (which expect a string) keep working.
+        """
+
+        content_parts: list[str] = []
+        thinking_parts: list[str] = []
+        thinking = False
+
+        # decode_unicode=True makes requests yield text lines instead of raw bytes.
+        for raw_line in resp.iter_lines(decode_unicode=True):
+            if not raw_line:
+                continue
+            line = raw_line.strip()
+            if not line.startswith("data:"):
+                continue
+            data_str = line[5:].strip()
+            if data_str == "[DONE]":
+                break
+
+            try:
+                event = json.loads(data_str)
+            except Exception:
+                continue
+
+            choices = event.get("choices") or []
+            if not choices:
+                continue
+            choice0 = choices[0] if isinstance(choices[0], dict) else None
+            if not choice0:
+                continue
+
+            delta = choice0.get("delta") or {}
+            if not isinstance(delta, dict):
+                delta = {}
+
+            # Moonshot/Kimi exposes reasoning tokens as `reasoning_content`.
+            reasoning = delta.get("reasoning_content")
+            if reasoning:
+                if not thinking:
+                    thinking = True
+                    print("============= thinking started =============", flush=True)
+                thinking_parts.append(str(reasoning))
+                print(str(reasoning), end="", flush=True)
+
+            piece = delta.get("content")
+            if piece:
+                if thinking:
+                    thinking = False
+                    print("\n============= thinking finished =============", flush=True)
+                content_parts.append(str(piece))
+                print(str(piece), end="", flush=True)
+
+            finish_reason = choice0.get("finish_reason")
+            if finish_reason:
+                break
+
+        # If the stream ended while still "thinking", close the marker cleanly.
+        if thinking:
+            print("\n============= thinking finished =============", flush=True)
+
+        return "".join(content_parts)
+
+    last_exc: Exception | None = None
+    for attempt in range(retries + 1):
+        try:
+            resp = requests.post(url, headers=headers, json=payload, timeout=timeout_s, stream=True)
+            resp.encoding = "utf-8"
+            if resp.status_code >= 300:
+                raise RuntimeError(f"Moonshot API error {resp.status_code}: {resp.text}")
+
+            # Prefer SSE streaming when available.
+            ctype = (resp.headers.get("Content-Type") or "").lower()
+            if "text/event-stream" in ctype or payload.get("stream"):
+                return _stream_sse_and_collect(resp)
+
+            data = resp.json()
+            break
+        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as exc:
+            last_exc = exc
+            if attempt >= retries:
+                raise
+            time.sleep(backoff_s * (2**attempt))
+        except requests.exceptions.RequestException as exc:
+            # Other request-layer issues: retry a bit, but don't loop forever.
+            last_exc = exc
+            if attempt >= retries:
+                raise
+            time.sleep(backoff_s * (2**attempt))
+    else:
+        raise last_exc or RuntimeError("Moonshot request failed")
+
+    try:
+        return data["choices"][0]["message"]["content"]
+    except Exception as exc:
+        raise RuntimeError(f"Unexpected Moonshot response: {data}") from exc
+
+def save_ideas_report(content: str, region: str, delay: int, dataset_id: str) -> Path:
+    output_dir = FEATURE_ENGINEERING_DIR / "output_report"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    filename = f"{region}_delay{delay}_{dataset_id}_ideas.md"
+    output_path = output_dir / filename
+    output_path.write_text(content, encoding="utf-8")
+    return output_path
+
+def extract_templates(markdown_text: str) -> list[str]:
+    """Extract implementation templates from the ideas markdown.
+
+    For pipeline robustness, this function returns ONLY the template strings.
+    The recommended, higher-fidelity parser is `extract_template_blocks()`,
+    which returns both template + idea text per **Concept** block.
+    """
+
+    blocks = extract_template_blocks(markdown_text)
+    templates = [b["template"] for b in blocks if b.get("template")]
+    return sorted(set(t.strip() for t in templates if t and t.strip()))
+
+
+def extract_template_blocks(markdown_text: str) -> list[dict[str, str]]:
+    """Parse **Concept** blocks and extract {template, idea}.
+
+    A "block" is a section that starts with a line like:
+        **Concept**: ...
+    and contains a line like:
+        - **Implementation Example**: `...`
+
+    Output:
+        [{"template": <string>, "idea": <string>}, ...]
+
+    Notes:
+    - `template` is taken from inside backticks when present; otherwise uses the
+      remainder of the line after ':'.
+    - `idea` is the rest of the block text (including the concept line and
+      bullets) excluding the implementation example line.
+    """
+
+    concept_re = re.compile(r"^\*\*Concept\*\*\s*:\s*(.*)\s*$")
+    impl_re = re.compile(r"\*\*Implementation Example\*\*\s*:\s*(.*)$", flags=re.IGNORECASE)
+    backtick_re = re.compile(r"`([^`]*)`")
+    boundary_re = re.compile(r"^(?:-{3,}|#{1,6}\s+.*)\s*$")
+
+    lines = markdown_text.splitlines()
+    blocks: list[list[str]] = []
+    current: list[str] = []
+
+    def _flush():
+        nonlocal current
+        if current:
+            # Trim leading/trailing blank lines in the block.
+            while current and not current[0].strip():
+                current.pop(0)
+            while current and not current[-1].strip():
+                current.pop()
+            if current:
+                blocks.append(current)
+        current = []
+
+    for line in lines:
+        if concept_re.match(line.strip()):
+            _flush()
+            current = [line]
+            continue
+
+        # If we are inside a concept block and hit a section boundary (e.g. '---', '### Q2'),
+        # close the block so unrelated headings don't get included in the idea text.
+        if current and boundary_re.match(line.strip()):
+            _flush()
+            continue
+
+        if current:
+            current.append(line)
+
+    _flush()
+
+    out: list[dict[str, str]] = []
+    for block_lines in blocks:
+        template: str | None = None
+        impl_line_idx: int | None = None
+
+        # Find the implementation example line (or its continuation).
+        for i, raw in enumerate(block_lines):
+            m = impl_re.search(raw)
+            if not m:
+                continue
+
+            impl_line_idx = i
+            tail = (m.group(1) or "").strip()
+
+            # Case 1: the template is in backticks on the same line.
+            bt = backtick_re.search(tail)
+            if bt:
+                template = bt.group(1).strip()
+                break
+
+            # Case 2: the tail itself is the template.
+            if tail and ("{" in tail and "}" in tail):
+                template = tail.strip().strip("`")
+                break
+
+            # Case 3: the template is on one of the next few lines, often in backticks.
+            for j in range(i + 1, min(i + 4, len(block_lines))):
+                nxt = block_lines[j].strip()
+                if not nxt:
+                    continue
+                bt2 = backtick_re.search(nxt)
+                if bt2:
+                    template = bt2.group(1).strip()
+                    break
+                if "{" in nxt and "}" in nxt:
+                    template = nxt.strip().strip("`")
+                    break
+            break
+
+        if not template or "{" not in template or "}" not in template:
+            continue
+
+        # idea = all block text except the implementation example line itself.
+        idea_lines: list[str] = []
+        for i, raw in enumerate(block_lines):
+            if impl_line_idx is not None and i == impl_line_idx:
+                continue
+            idea_lines.append(raw)
+
+        idea = "\n".join(idea_lines).strip()
+        out.append({"template": template.strip(), "idea": idea})
+
+    return out
+
+def load_dataset_ids_from_csv(dataset_csv_path: Path) -> list[str]:
+    if not dataset_csv_path.exists():
+        return []
+    ids: list[str] = []
+    with dataset_csv_path.open("r", encoding="utf-8", newline="") as f:
+        reader = csv.DictReader(f)
+        if "id" not in (reader.fieldnames or []):
+            return []
+        for row in reader:
+            v = (row.get("id") or "").strip()
+            if v:
+                ids.append(v)
+    return ids
+
+def safe_dataset_id(dataset_id: str) -> str:
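+    # Keep only filename-safe characters so the id can name a data folder.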
+    return "".join([c for c in dataset_id if c.isalnum() or c in ("-", "_")])
+
+def run_script(args_list: list[str], cwd: Path):
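+    # Run a helper script as a child process; surface its stdout/stderr on failure.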
+    result = subprocess.run(args_list, cwd=cwd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(
+            "Command failed: "
+            + " ".join(args_list)
+            + f"\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+        )
+    return result.stdout
+
+def main():
+    parser = argparse.ArgumentParser(description="Run feature engineering + implementation pipeline")
+    parser.add_argument("--data-category", required=True, help="Dataset category (e.g., analyst, fundamental)")
+    parser.add_argument("--region", required=True, help="Region (e.g., USA, GLB, EUR)")
+    parser.add_argument("--delay", required=True, type=int, help="Delay (0 or 1)")
+    parser.add_argument("--universe", default="TOP3000", help="Universe (default: TOP3000)")
+    parser.add_argument("--dataset-id", required=True, help="Dataset id (required)")
+    parser.add_argument("--instrument-type", default="EQUITY", help="Instrument type (default: EQUITY)")
+    parser.add_argument("--ideas-file", default=None, help="Use existing ideas markdown instead of generating")
+    parser.add_argument(
+        "--regen-ideas",
+        action="store_true",
+        help="Force regenerating the ideas markdown even if the default ideas file already exists",
+    )
+    parser.add_argument("--moonshot-api-key", default=None, help="Moonshot API key (prefer env MOONSHOT_API_KEY)")
+    parser.add_argument("--moonshot-model", default="kimi-k2.5", help="Moonshot model (default: kimi-k2.5)")
+    parser.add_argument("--username", default=None, help="BRAIN username/email (overrides config/env)")
+    parser.add_argument("--password", default=None, help="BRAIN password (overrides config/env)")
+    parser.add_argument(
+        "--max-fields",
+        type=int,
+        default=None,
+        help="If set, pass the TOP N fields to the LLM; if omitted, randomly sample 50 (or all if <50)",
+    )
+    parser.add_argument(
+        "--no-operators-in-prompt",
+        action="store_true",
+        help="Do not include allowed_operators in the idea-generation prompt",
+    )
+    parser.add_argument(
+        "--max-operators",
+        type=int,
+        default=300,
+        help="Max filtered operators to include in the prompt (default: 300)",
+    )
+
+    args = parser.parse_args()
+
+    config_path = FEATURE_IMPLEMENTATION_DIR / "config.json"
+    email, password = load_brain_credentials_from_env_or_args(args.username, args.password, config_path)
+    session = start_brain_session(email, password)
+
+    ideas_path = None
+    if args.ideas_file:
+        ideas_path = Path(args.ideas_file).resolve()
+        if not ideas_path.exists():
+            raise FileNotFoundError(f"Ideas file not found: {ideas_path}")
+    else:
+        default_ideas = (
+            FEATURE_ENGINEERING_DIR
+            / "output_report"
+            / f"{args.region}_delay{args.delay}_{args.dataset_id}_ideas.md"
+        )
+        if default_ideas.exists() and not args.regen_ideas:
+            ideas_path = default_ideas
+        else:
+            datasets_df = ace_lib.get_datasets(
+                session,
+                instrument_type=args.instrument_type,
+                region=args.region,
+                delay=args.delay,
+                universe=args.universe,
+                theme="ALL",
+            )
+
+            dataset_name = None
+            dataset_description = None
+            id_col = pick_first_present_column(datasets_df, ["id", "dataset_id", "datasetId"])
+            name_col = pick_first_present_column(datasets_df, ["name", "dataset_name", "datasetName"])
+            desc_col = pick_first_present_column(datasets_df, ["description", "desc", "dataset_description"])
+            if id_col:
+                matched = datasets_df[datasets_df[id_col].astype(str) == str(args.dataset_id)]
+                if not matched.empty:
+                    row = matched.iloc[0]
+                    dataset_name = row.get(name_col) if name_col else None
+                    dataset_description = row.get(desc_col) if desc_col else None
+
+            fields_df = ace_lib.get_datafields(
+                session,
+                instrument_type=args.instrument_type,
+                region=args.region,
+                delay=args.delay,
+                universe=args.universe,
+                dataset_id=args.dataset_id,
+                data_type="ALL",
+            )
+
+            fields_summary, field_count = build_field_summary(fields_df, max_fields=args.max_fields)
+
+            feature_engineering_skill_md = read_text_optional(FEATURE_ENGINEERING_DIR / "SKILL.md")
+            feature_implementation_skill_md = read_text_optional(FEATURE_IMPLEMENTATION_DIR / "SKILL.md")
+            allowed_metric_suffixes = build_allowed_metric_suffixes(fields_df, max_suffixes=300)
+
+            allowed_operators = []
+            if not args.no_operators_in_prompt:
+                try:
+                    operators_df = ace_lib.get_operators(session)
+                    keep_vector = _vector_ratio_from_datafields_df(fields_df) > 0.5
+                    _, allowed_ops, _ = filter_operators_df(operators_df, keep_vector=keep_vector)
+                    if args.max_operators is not None and args.max_operators > 0:
+                        allowed_operators = allowed_ops[: args.max_operators]
+                    else:
+                        allowed_operators = allowed_ops
+                except Exception as exc:
+                    print(f"Warning: failed to fetch/filter operators; continuing without operators in the prompt. Error: {exc}", file=sys.stderr)
+
+            system_prompt, user_prompt = build_prompt(
+                dataset_id=args.dataset_id,
+                dataset_name=dataset_name,
+                dataset_description=dataset_description,
+                data_category=args.data_category,
+                region=args.region,
+                delay=args.delay,
+                universe=args.universe,
+                fields_summary=fields_summary,
+                field_count=field_count,
+                feature_engineering_skill_md=feature_engineering_skill_md,
+                feature_implementation_skill_md=feature_implementation_skill_md,
+                allowed_metric_suffixes=allowed_metric_suffixes,
+                allowed_operators=allowed_operators,
+            )
+
+            api_key = (
+                args.moonshot_api_key
+                or os.environ.get("MOONSHOT_API_KEY")
+            )
+            if not api_key:
+                raise ValueError("Moonshot API key missing. Set MOONSHOT_API_KEY or pass --moonshot-api-key")
+
+            report = call_moonshot(api_key, args.moonshot_model, system_prompt, user_prompt)
+            # Save first, then normalize placeholders after the dataset download.
+            ideas_path = save_ideas_report(report, args.region, args.delay, args.dataset_id)
+
+    ideas_text = ideas_path.read_text(encoding="utf-8")
+
+    # Ensure metadata exists for downstream parsing/reuse.
+    ideas_text = ensure_metadata_block(ideas_text, dataset_id=args.dataset_id, region=args.region, delay=args.delay)
+    ideas_path.write_text(ideas_text, encoding="utf-8")
+
+    # Parse metadata.
+    dataset_id_match = re.search(r"\*\*Dataset\*\*:\s*(\S+)", ideas_text)
+    dataset_id = dataset_id_match.group(1) if dataset_id_match else args.dataset_id
+
+    # Download the dataset for implementation.
+    fetch_script = FEATURE_IMPLEMENTATION_SCRIPTS / "fetch_dataset.py"
+    run_script(
+        [
+            sys.executable,
+            str(fetch_script),
+            "--datasetid",
+            dataset_id,
+            "--region",
+            args.region,
+            "--delay",
+            str(args.delay),
+            "--universe",
+            args.universe,
+            "--instrument-type",
+            args.instrument_type,
+        ],
+        cwd=FEATURE_IMPLEMENTATION_SCRIPTS,
+    )
+
+    dataset_folder = f"{safe_dataset_id(dataset_id)}_{args.region}_delay{args.delay}"
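+    # e.g. dataset "analyst11" with region "ASI" and delay 1 -> "analyst11_ASI_delay1",
+    # matching the shipped data/ folder layout.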
+    dataset_csv_path = FEATURE_IMPLEMENTATION_DIR / "data" / dataset_folder / f"{dataset_folder}.csv"
+    if not dataset_csv_path.exists():
+        raise RuntimeError(
+            "Dataset CSV was not created by fetch_dataset.py. "
+            f"Expected: {dataset_csv_path}"
+        )
+    dataset_ids = load_dataset_ids_from_csv(dataset_csv_path)
+    allowed_suffixes = build_allowed_suffixes_from_ids(dataset_ids, max_suffixes=300) if dataset_ids else []
+    dataset_code = detect_dataset_code(dataset_ids) if dataset_ids else None
+
+    # Extract {template, idea} pairs from **Concept** blocks.
+    block_pairs = extract_template_blocks(ideas_text)
+    if not block_pairs:
+        raise ValueError("No **Concept** blocks with **Implementation Example** found in the ideas file.")
+
+    normalized_pairs: list[tuple[str, str]] = []
+    for item in block_pairs:
+        t = str(item.get("template") or "").strip()
+        idea_text = str(item.get("idea") or "").strip()
+        if not t:
+            continue
+
+        if dataset_ids and allowed_suffixes:
+            normalized_t, ok = normalize_template_placeholders(t, dataset_ids, allowed_suffixes, dataset_code)
+            if not ok:
+                continue
+            normalized_pairs.append((normalized_t, idea_text))
+        else:
+            # No dataset ids to validate against; pass through.
+            normalized_pairs.append((t, idea_text))
+
+    if not normalized_pairs:
+        raise ValueError("No valid templates remain after normalization/validation.")
+
+    # De-dup by template; keep the first non-empty idea.
+    template_to_idea: dict[str, str] = {}
+    for t, idea_text in normalized_pairs:
+        if t not in template_to_idea or (not template_to_idea[t] and idea_text):
+            template_to_idea[t] = idea_text
+
+    templates = sorted(template_to_idea.keys())
+
+    implement_script = FEATURE_IMPLEMENTATION_SCRIPTS / "implement_idea.py"
+
+    for template in templates:
+        idea_text = template_to_idea.get(template, "")
+        run_script(
+            [
+                sys.executable,
+                str(implement_script),
+                "--template",
+                template,
+                "--dataset",
+                dataset_folder,
+                "--idea",
+                idea_text,
+            ],
+            cwd=FEATURE_IMPLEMENTATION_SCRIPTS,
+        )
+
+    merge_script = FEATURE_IMPLEMENTATION_SCRIPTS / "merge_expression_list.py"
+    run_script(
+        [
+            sys.executable,
+            str(merge_script),
+            "--dataset",
+            dataset_folder,
+        ],
+        cwd=FEATURE_IMPLEMENTATION_SCRIPTS,
+    )
+
+    final_path = FEATURE_IMPLEMENTATION_DIR / "data" / dataset_folder / "final_expressions.json"
+    if final_path.exists():
+        try:
+            raw = json.loads(final_path.read_text(encoding="utf-8"))
+        except Exception as exc:
+            raise RuntimeError(f"Failed to read final expressions: {final_path}. Error: {exc}")
+
+        expressions = raw if isinstance(raw, list) else []
+        validator = ExpressionValidator()
+        valid_expressions: list[str] = []
+        invalid_count = 0
+        for expr in expressions:
+            if not isinstance(expr, str) or not expr.strip():
+                invalid_count += 1
+                continue
+            result = validator.check_expression(expr.strip())
+            if result.get("valid"):
+                valid_expressions.append(expr.strip())
+            else:
+                invalid_count += 1
+
+        final_path.write_text(json.dumps(valid_expressions, ensure_ascii=False, indent=4), encoding="utf-8")
+        print(f"Filtered invalid expressions: {invalid_count}")
+    else:
+        print(f"Warning: final_expressions.json not found: {final_path}")
+
+    print(f"Ideas report: {ideas_path}")
+    print(f"Expressions: {FEATURE_IMPLEMENTATION_DIR / 'data' / dataset_folder / 'final_expressions.json'}")
+
+
+if __name__ == "__main__":
+    main()
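
A minimal invocation sketch for the new run_pipeline.py, with values assumed from the shipped sample data (dataset analyst11, region ASI, delay 1; MOONSHOT_API_KEY and BRAIN credentials must be supplied separately):

    python run_pipeline.py --data-category analyst --region ASI --delay 1 --dataset-id analyst11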