cnhkmcp-2.2.0-py3-none-any.whl → cnhkmcp-2.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. cnhkmcp/__init__.py +1 -1
  2. cnhkmcp/untracked/AI桌面插件/README.md +1 -1
  3. cnhkmcp/untracked/AI桌面插件/config.json +2 -2
  4. cnhkmcp/untracked/AI桌面插件/main.py +1 -1
  5. cnhkmcp/untracked/AI桌面插件/vector_db/chroma.sqlite3 +0 -0
  6. cnhkmcp/untracked/APP/Tranformer/Transformer.py +2 -2
  7. cnhkmcp/untracked/APP/Tranformer/transformer_config.json +1 -1
  8. cnhkmcp/untracked/APP/blueprints/feature_engineering.py +2 -2
  9. cnhkmcp/untracked/APP/blueprints/inspiration_house.py +4 -4
  10. cnhkmcp/untracked/APP/blueprints/paper_analysis.py +3 -3
  11. cnhkmcp/untracked/APP/give_me_idea/BRAIN_Alpha_Template_Expert_SystemPrompt.md +34 -73
  12. cnhkmcp/untracked/APP/give_me_idea/alpha_data_specific_template_master.py +2 -2
  13. cnhkmcp/untracked/APP/give_me_idea/what_is_Alpha_template.md +366 -1
  14. cnhkmcp/untracked/APP/simulator/wqb20260130130030.log +210 -0
  15. cnhkmcp/untracked/APP/simulator/wqb20260130131757.log +104 -0
  16. cnhkmcp/untracked/APP/simulator/wqb20260130172245.log +70 -0
  17. cnhkmcp/untracked/APP/static/inspiration.js +350 -14
  18. cnhkmcp/untracked/APP/templates/index.html +18 -3
  19. cnhkmcp/untracked/APP/templates/transformer_web.html +1 -1
  20. cnhkmcp/untracked/APP/trailSomeAlphas/README.md +38 -0
  21. cnhkmcp/untracked/APP/trailSomeAlphas/ace.log +66 -0
  22. cnhkmcp/untracked/APP/trailSomeAlphas/enhance_template.py +588 -0
  23. cnhkmcp/untracked/APP/trailSomeAlphas/requirements.txt +3 -0
  24. cnhkmcp/untracked/APP/trailSomeAlphas/run_pipeline.py +1051 -0
  25. cnhkmcp/untracked/APP/trailSomeAlphas/run_pipeline_step_by_step.ipynb +5258 -0
  26. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/OUTPUT_TEMPLATE.md +325 -0
  27. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/SKILL.md +503 -0
  28. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/examples.md +244 -0
  29. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/output_report/ASI_delay1_analyst11_ideas.md +285 -0
  30. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/output_report/GLB_delay1_fundamental72_ideas.md +362 -0
  31. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/reference.md +399 -0
  32. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/SKILL.md +40 -0
  33. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/config.json +6 -0
  34. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709385783386000.json +388 -0
  35. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709386274840400.json +131 -0
  36. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709386838244700.json +1926 -0
  37. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709387369198500.json +31 -0
  38. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709387908905800.json +1926 -0
  39. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709388486243600.json +240 -0
  40. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709389024058600.json +1926 -0
  41. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709389549608700.json +41 -0
  42. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709390068714000.json +110 -0
  43. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709390591996900.json +36 -0
  44. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709391129137100.json +31 -0
  45. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709391691643500.json +41 -0
  46. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709392192099200.json +31 -0
  47. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709392703423500.json +46 -0
  48. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769709393213729400.json +246 -0
  49. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710186683932500.json +388 -0
  50. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710187165414300.json +131 -0
  51. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710187665211700.json +1926 -0
  52. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710188149193400.json +31 -0
  53. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710188667627400.json +1926 -0
  54. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710189220822000.json +240 -0
  55. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710189726189500.json +1926 -0
  56. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710190248066100.json +41 -0
  57. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710190768298700.json +110 -0
  58. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710191282588100.json +36 -0
  59. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710191838960900.json +31 -0
  60. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710192396688000.json +41 -0
  61. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710192941922400.json +31 -0
  62. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710193473524600.json +46 -0
  63. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710194001961200.json +246 -0
  64. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710420975888800.json +46 -0
  65. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710421647590100.json +196 -0
  66. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710422131378500.json +5 -0
  67. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710422644184400.json +196 -0
  68. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710423702350600.json +196 -0
  69. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_1_idea_1769710424244661800.json +5 -0
  70. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/analyst11_ASI_delay1.csv +211 -0
  71. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/analyst11_ASI_delay1/final_expressions.json +7062 -0
  72. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/final_expressions.json +138 -0
  73. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759441444909600.json +38 -0
  74. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759441920092000.json +14 -0
  75. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759442418767100.json +14 -0
  76. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759442902507600.json +14 -0
  77. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759443377036200.json +10 -0
  78. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759443845377000.json +14 -0
  79. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759444313546700.json +10 -0
  80. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759444784598600.json +14 -0
  81. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759445274311200.json +14 -0
  82. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759445747421700.json +10 -0
  83. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759446222137800.json +22 -0
  84. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759446686222600.json +14 -0
  85. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759447154698500.json +10 -0
  86. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759447629677000.json +10 -0
  87. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759448102331200.json +10 -0
  88. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_1_idea_1769759448573382000.json +14 -0
  89. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/data/fundamental72_GLB_delay1/fundamental72_GLB_delay1.csv +330 -0
  90. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/ace.log +3 -0
  91. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/ace_lib.py +1514 -0
  92. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/fetch_dataset.py +119 -0
  93. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/helpful_functions.py +180 -0
  94. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/implement_idea.py +236 -0
  95. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/merge_expression_list.py +90 -0
  96. cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-feature-implementation/scripts/parsetab.py +60 -0
  97. cnhkmcp/untracked/APP/trailSomeAlphas/skills/template_final_enhance/op总结.md +434 -0
  98. cnhkmcp/untracked/APP/trailSomeAlphas/skills/template_final_enhance/sample_prompt.md +62 -0
  99. cnhkmcp/untracked/APP/trailSomeAlphas/skills/template_final_enhance/单因子思考逻辑链.md +354 -0
  100. cnhkmcp/untracked/APP/usage.md +2 -2
  101. cnhkmcp/untracked/APP/运行打开我.py +400 -9
  102. cnhkmcp/untracked/back_up/platform_functions.py +2 -2
  103. cnhkmcp/untracked/mcp文件论坛版2_如果原版启动不了浏览器就试这个/platform_functions.py +2 -2
  104. cnhkmcp/untracked/platform_functions.py +2 -2
  105. cnhkmcp/untracked/skills/alpha-expression-verifier/scripts/validator.py +889 -0
  106. cnhkmcp/untracked/skills/brain-feature-implementation/scripts/implement_idea.py +4 -3
  107. cnhkmcp/untracked/skills/brain-improve-alpha-performance/arXiv_API_Tool_Manual.md +490 -0
  108. cnhkmcp/untracked/skills/brain-improve-alpha-performance/reference.md +1 -1
  109. cnhkmcp/untracked/skills/brain-improve-alpha-performance/scripts/arxiv_api.py +229 -0
  110. cnhkmcp/untracked/配置前运行我_安装必要依赖包.py +35 -11
  111. cnhkmcp/vector_db/_manifest.json +1 -0
  112. cnhkmcp/vector_db/_meta.json +1 -0
  113. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.1.dist-info}/METADATA +1 -1
  114. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.1.dist-info}/RECORD +121 -33
  115. /cnhkmcp/untracked/{skills/expression_verifier → APP/trailSomeAlphas/skills/brain-feature-implementation}/scripts/validator.py +0 -0
  116. /cnhkmcp/untracked/skills/{expression_verifier → alpha-expression-verifier}/SKILL.md +0 -0
  117. /cnhkmcp/untracked/skills/{expression_verifier → alpha-expression-verifier}/scripts/verify_expr.py +0 -0
  118. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.1.dist-info}/WHEEL +0 -0
  119. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.1.dist-info}/entry_points.txt +0 -0
  120. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.1.dist-info}/licenses/LICENSE +0 -0
  121. {cnhkmcp-2.2.0.dist-info → cnhkmcp-2.3.1.dist-info}/top_level.txt +0 -0
cnhkmcp/untracked/APP/trailSomeAlphas/run_pipeline.py
@@ -0,0 +1,1051 @@
+ import argparse
+ import datetime as dt
+ import json
+ import os
+ import re
+ import shutil
+ import subprocess
+ import sys
+ import csv
+ import time
+ from pathlib import Path
+
+ import requests
+
+ # Ensure UTF-8 stdout on Windows to avoid UnicodeEncodeError
+ try:
+     sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+ except Exception:
+     pass
+
+ BASE_DIR = Path(__file__).resolve().parent
+ SKILLS_DIR = BASE_DIR / "skills"
+ FEATURE_ENGINEERING_DIR = SKILLS_DIR / "brain-data-feature-engineering"
+ FEATURE_IMPLEMENTATION_DIR = SKILLS_DIR / "brain-feature-implementation"
+ FEATURE_IMPLEMENTATION_SCRIPTS = FEATURE_IMPLEMENTATION_DIR / "scripts"
+
+ sys.path.insert(0, str(FEATURE_IMPLEMENTATION_SCRIPTS))
+ try:
+     import ace_lib  # type: ignore
+ except Exception as exc:
+     raise SystemExit(f"Failed to import ace_lib from {FEATURE_IMPLEMENTATION_SCRIPTS}: {exc}")
+ try:
+     from validator import ExpressionValidator  # type: ignore
+ except Exception as exc:
+     raise SystemExit(f"Failed to import ExpressionValidator from {FEATURE_IMPLEMENTATION_SCRIPTS}: {exc}")
+
+ def load_brain_credentials(config_path: Path) -> tuple[str, str]:
+     if not config_path.exists():
+         raise FileNotFoundError(f"Config not found: {config_path}")
+     with config_path.open("r", encoding="utf-8") as f:
+         data = json.load(f)
+     creds = data.get("BRAIN_CREDENTIALS", {})
+     email = creds.get("email")
+     password = creds.get("password")
+     if not email or not password:
+         raise ValueError("BRAIN_CREDENTIALS missing in config.json")
+     return email, password
+
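
For reference, `load_brain_credentials` expects the skill's `config.json` to carry a `BRAIN_CREDENTIALS` object. A minimal sketch of that shape, written as a Python dict literal (the values are placeholders, not real credentials):

```python
# Shape of config.json expected by load_brain_credentials (placeholder values):
BRAIN_CONFIG_EXAMPLE = {
    "BRAIN_CREDENTIALS": {
        "email": "user@example.com",
        "password": "example-password",
    }
}
```
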
+ def load_brain_credentials_from_env_or_args(username: str | None, password: str | None, config_path: Path) -> tuple[str, str]:
+     env_user = os.environ.get("BRAIN_USERNAME") or os.environ.get("BRAIN_EMAIL")
+     env_pass = os.environ.get("BRAIN_PASSWORD")
+     final_user = username or env_user
+     final_pass = password or env_pass
+     if final_user and final_pass:
+         return final_user, final_pass
+     return load_brain_credentials(config_path)
+
+ def start_brain_session(email: str, password: str):
+     ace_lib.get_credentials = lambda: (email, password)
+     return ace_lib.start_session()
+
+ def pick_first_present_column(df, candidates):
+     for c in candidates:
+         if c in df.columns:
+             return c
+     # also try case-insensitive
+     lower_map = {col.lower(): col for col in df.columns}
+     for c in candidates:
+         if c.lower() in lower_map:
+             return lower_map[c.lower()]
+     return None
+
+
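
A minimal sketch of `pick_first_present_column`'s fallback behavior (assumes pandas; the frame and column names are invented):

```python
import pandas as pd

df = pd.DataFrame(columns=["DatasetId", "Name"])

# The exact-match pass fails, but the case-insensitive pass maps "datasetId" -> "DatasetId".
pick_first_present_column(df, ["id", "dataset_id", "datasetId"])  # -> "DatasetId"
pick_first_present_column(df, ["description"])                    # -> None
```
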
+ def select_dataset(datasets_df, data_category: str, dataset_id: str | None):
+     if dataset_id:
+         return dataset_id, None, None, datasets_df
+
+     category_col = pick_first_present_column(
+         datasets_df,
+         ["category", "data_category", "dataCategory", "category_name", "dataCategory_name"],
+     )
+
+     filtered = datasets_df
+     if category_col:
+         filtered = datasets_df[datasets_df[category_col].astype(str).str.lower() == data_category.lower()]
+
+     if filtered.empty:
+         filtered = datasets_df
+
+     id_col = pick_first_present_column(filtered, ["id", "dataset_id", "datasetId"])
+     name_col = pick_first_present_column(filtered, ["name", "dataset_name", "datasetName"])
+     desc_col = pick_first_present_column(filtered, ["description", "desc", "dataset_description"])
+
+     if not id_col:
+         raise ValueError("Unable to locate dataset id column from dataset list")
+
+     row = filtered.iloc[0]
+     return row[id_col], row.get(name_col) if name_col else None, row.get(desc_col) if desc_col else None, datasets_df
+
+
+ def build_field_summary(fields_df, max_fields: int | None = None, default_sample_size: int = 50):
+     id_col = pick_first_present_column(fields_df, ["id", "field_id", "fieldId"])
+     desc_col = pick_first_present_column(fields_df, ["description", "desc"])
+
+     if max_fields is None:
+         # If user did NOT specify --max-fields, randomly sample 50 rows for the prompt.
+         # If there are fewer than 50 rows, pass all.
+         total = int(fields_df.shape[0])
+         n = min(default_sample_size, total)
+         subset = fields_df if n >= total else fields_df.sample(n=n, random_state=42)
+     else:
+         # If user specified --max-fields, pass the TOP N rows.
+         total = int(fields_df.shape[0])
+         n = min(int(max_fields), total)
+         subset = fields_df.head(n)
+
+     rows = []
+     for _, row in subset.iterrows():
+         rows.append(
+             {
+                 "id": row.get(id_col),
+                 "description": row.get(desc_col)
+             }
+         )
+     return rows, fields_df.shape[0]
+
+
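
To illustrate the two sampling modes, a sketch with an invented frame:

```python
import pandas as pd

fields = pd.DataFrame({"id": [f"field_{i}" for i in range(200)], "description": ["..."] * 200})

rows, total = build_field_summary(fields)                 # seeded random sample of 50
rows_top, _ = build_field_summary(fields, max_fields=10)  # first 10 rows, in order
len(rows), len(rows_top), total                           # -> (50, 10, 200)
```
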
+ def read_text_optional(path: Path) -> str:
+     try:
+         return path.read_text(encoding="utf-8")
+     except Exception:
+         return ""
+
+
+ def build_allowed_metric_suffixes(fields_df, max_suffixes: int = 300) -> list[str]:
+     """Derive a practical list of placeholder candidates from dataset field ids.
+
+     `implement_idea.py` matches `{variable}` by searching for that substring in the
+     field id and then using the *base* (everything before first occurrence) to
+     align the other variables. In practice, good placeholders tend to be the
+     trailing 2-5 underscore-joined tokens.
+     """
+
+     id_col = pick_first_present_column(fields_df, ["id", "field_id", "fieldId"])
+     if not id_col:
+         return []
+
+     field_ids = fields_df[id_col].dropna().astype(str).tolist()
+     dataset_code = detect_dataset_code(field_ids)
+
+     counts: dict[str, int] = {}
+     for raw in field_ids:
+         parts = [p for p in str(raw).split("_") if p]
+         if len(parts) < 2:
+             continue
+
+         # Collect suffix candidates from the tail.
+         # Prefer multi-token names, but allow single-token suffixes when they're
+         # specific enough (e.g., "inventories").
+         # IMPORTANT: never allow the "suffix" to equal the full id (that would
+         # encourage the LLM to emit {full_field_id}, violating the suffix-only rule).
+         for n in range(1, min(6, len(parts))):
+             suffix = "_".join(parts[-n:])
+             # Filter out overly-generic / numeric suffixes
+             if suffix.replace("_", "").isdigit():
+                 continue
+             if dataset_code and suffix.lower().startswith(dataset_code.lower() + "_"):
+                 continue
+             if n == 1 and len(suffix) < 8:
+                 continue
+             if len(suffix) < 6:
+                 continue
+             counts[suffix] = counts.get(suffix, 0) + 1
+
+     # Prefer suffixes that show up multiple times and have underscores
+     ranked = sorted(
+         counts.items(),
+         key=lambda kv: (kv[1], kv[0].count("_"), len(kv[0])),
+         reverse=True,
+     )
+
+     suffixes: list[str] = []
+     for suffix, _ in ranked:
+         if suffix not in suffixes:
+             suffixes.append(suffix)
+         if len(suffixes) >= max_suffixes:
+             break
+     return suffixes
+
+
+ def build_allowed_suffixes_from_ids(dataset_ids: list[str], max_suffixes: int = 300) -> list[str]:
+     """Build suffix candidates from downloaded dataset ids.
+
+     This is used to normalize/validate templates for `implement_idea.py`.
+     """
+
+     counts: dict[str, int] = {}
+     for raw in dataset_ids:
+         parts = [p for p in str(raw).split("_") if p]
+         if len(parts) < 2:
+             continue
+         for n in range(1, 6):
+             if len(parts) >= n:
+                 suffix = "_".join(parts[-n:])
+                 if suffix.replace("_", "").isdigit():
+                     continue
+                 if n == 1 and len(suffix) < 8:
+                     continue
+                 if len(suffix) < 6:
+                     continue
+                 counts[suffix] = counts.get(suffix, 0) + 1
+
+     ranked = sorted(
+         counts.items(),
+         key=lambda kv: (kv[1], kv[0].count("_"), len(kv[0])),
+         reverse=True,
+     )
+
+     suffixes: list[str] = []
+     for suffix, _ in ranked:
+         if suffix not in suffixes:
+             suffixes.append(suffix)
+         if len(suffixes) >= max_suffixes:
+             break
+     return suffixes
+
+
+ def detect_dataset_code(dataset_ids: list[str]) -> str | None:
+     if not dataset_ids:
+         return None
+     counts: dict[str, int] = {}
+     for fid in dataset_ids:
+         tok = (str(fid).split("_", 1)[0] or "").strip()
+         if tok:
+             counts[tok] = counts.get(tok, 0) + 1
+     if not counts:
+         return None
+     return max(counts.items(), key=lambda kv: kv[1])[0]
+
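
A hand-traced sketch of the two helpers above, with invented field ids (not from a real dataset):

```python
ids = ["fnd72_q_sales_growth", "fnd72_q_ebitda_growth", "fnd72_a_sales_growth"]

detect_dataset_code(ids)                 # -> "fnd72" (most common first token)
build_allowed_suffixes_from_ids(ids)[0]  # -> "sales_growth", the only suffix seen twice,
                                         #    so it ranks ahead of all count-1 candidates
```

Single tokens such as `growth` are filtered out here because a one-token suffix must be at least 8 characters long.
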
+ def ensure_metadata_block(markdown_text: str, dataset_id: str, region: str, delay: int) -> str:
+     """Ensure the ideas markdown contains the metadata block used by the pipeline."""
+
+     has_dataset = re.search(r"^\*\*Dataset\*\*:\s*\S+", markdown_text, flags=re.MULTILINE) is not None
+     has_region = re.search(r"^\*\*Region\*\*:\s*\S+", markdown_text, flags=re.MULTILINE) is not None
+     has_delay = re.search(r"^\*\*Delay\*\*:\s*\d+", markdown_text, flags=re.MULTILINE) is not None
+     if has_dataset and has_region and has_delay:
+         return markdown_text
+
+     block = [
+         "",
+         f"**Dataset**: {dataset_id}",
+         f"**Region**: {region}",
+         f"**Delay**: {delay}",
+         "",
+     ]
+
+     lines = markdown_text.splitlines()
+     insert_at = 0
+     for i, line in enumerate(lines[:10]):
+         if line.strip():
+             insert_at = i + 1
+             break
+     new_lines = lines[:insert_at] + block + lines[insert_at:]
+     return "\n".join(new_lines).lstrip("\n")
+
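
A sketch of the insertion behavior (the strings are illustrative; the dataset id mirrors the sample data shipped in this release):

```python
md = "# Ideas for fundamental72\nSome intro text."
print(ensure_metadata_block(md, dataset_id="fundamental72", region="GLB", delay=1))
# # Ideas for fundamental72
#
# **Dataset**: fundamental72
# **Region**: GLB
# **Delay**: 1
#
# Some intro text.
```

The block lands right after the first non-empty line, and a report that already carries all three markers is returned unchanged.
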
+ def compress_to_known_suffix(var: str, allowed_suffixes: list[str]) -> str | None:
+     v = var.lower()
+     for sfx in sorted(allowed_suffixes, key=len, reverse=True):
+         if v.endswith(sfx.lower()):
+             return sfx
+     return None
+
+ def placeholder_is_reasonably_matchable(var: str, dataset_ids: list[str]) -> bool:
+     """Heuristic check that a placeholder is likely to match real ids.
+
+     We avoid treating very short tokens as valid unless they match a token boundary.
+     """
+
+     v = var
+     if len(v) <= 3:
+         pat = re.compile(rf"(^|_){re.escape(v)}(_|$)", flags=re.IGNORECASE)
+         return any(pat.search(str(fid)) for fid in dataset_ids)
+     return any(v in str(fid) for fid in dataset_ids)
+
+ def normalize_template_placeholders(
+     template: str,
+     dataset_ids: list[str],
+     allowed_suffixes: list[str],
+     dataset_code: str | None,
+ ) -> tuple[str, bool]:
+     """Normalize placeholders to suffix-only form, without dataset-specific aliasing.
+
+     - Strips dataset code prefix (e.g. fnd72_*) when present.
+     - Compresses placeholders to the longest known suffix.
+     - Returns (normalized_template, is_valid).
+     """
+
+     vars_in_template = re.findall(r"\{([A-Za-z0-9_]+)\}", template)
+     if not vars_in_template:
+         return template, False
+
+     mapping: dict[str, str] = {}
+     for var in set(vars_in_template):
+         new_var = var
+         if dataset_code and new_var.lower().startswith(dataset_code.lower() + "_"):
+             new_var = new_var[len(dataset_code) + 1 :]
+
+         compressed = compress_to_known_suffix(new_var, allowed_suffixes)
+         if compressed:
+             new_var = compressed
+
+         mapping[var] = new_var
+
+     normalized = template
+     for src, dst in mapping.items():
+         normalized = normalized.replace("{" + src + "}", "{" + dst + "}")
+
+     # Validate: every placeholder should look matchable in real ids.
+     vars_after = re.findall(r"\{([A-Za-z0-9_]+)\}", normalized)
+     ok = all(placeholder_is_reasonably_matchable(v, dataset_ids) for v in vars_after)
+     return normalized, ok
+
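
An end-to-end sketch of placeholder normalization (ids and template invented):

```python
ids = ["fnd72_q_sales_growth", "fnd72_q_ebitda_growth"]
suffixes = build_allowed_suffixes_from_ids(ids)
code = detect_dataset_code(ids)  # "fnd72"

normalize_template_placeholders("ts_delta({fnd72_q_sales_growth}, 21)", ids, suffixes, code)
# -> ("ts_delta({q_sales_growth}, 21)", True)
```

The dataset-code prefix is stripped first, the remainder is compressed to the longest known suffix, and the result is accepted because `q_sales_growth` is a substring of a real id.
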
+ def build_prompt(
+     dataset_id: str,
+     dataset_name: str | None,
+     dataset_description: str | None,
+     data_category: str,
+     region: str,
+     delay: int,
+     universe: str,
+     data_type: str,
+     fields_summary: list[dict],
+     field_count: int,
+     feature_engineering_skill_md: str,
+     feature_implementation_skill_md: str,
+     allowed_metric_suffixes: list[str],
+     allowed_operators,
+ ):
+     # NOTE: The user requested that we DO NOT invent our own system prompt.
+     # Instead, we embed the two skill specs as the authoritative instructions.
+     prompt_lines = [
+         "You are executing two skills in sequence:",
+         "1) brain-data-feature-engineering",
+         "2) brain-feature-implementation",
+         "The following SKILL.md documents are authoritative; follow them exactly.",
+         "",
+         "--- SKILL.md (brain-data-feature-engineering) ---",
+         feature_engineering_skill_md.strip(),
+         "",
+         "--- SKILL.md (brain-feature-implementation) ---",
+         feature_implementation_skill_md.strip(),
+         "------",
+         f'"allowed_operators": {allowed_operators}',
+         "-------",
+         f'"allowed_placeholders": {allowed_metric_suffixes}',
+         "",
+     ]
+
+     if str(data_type).upper() == "VECTOR":
+         prompt_lines.append(
+             "Since all of the following data is vector-type data, you must first apply a vector operator to turn each field into a statistical feature before any other processing; the data cannot be used directly. For example, if datafieldA and datafieldB are vector-type fields, you can use vec_avg(datafieldA) - vec_avg(datafieldB), where the vec_avg() operator takes the average of the data on a given date. Likewise, vector operators can only be applied directly to vector-type fields and cannot be nested: vec_avg(vec_sum(datafield)) is invalid."
+         )
+
+     prompt_lines.extend(
+         [
+             "CRITICAL OUTPUT RULES (to ensure implement_idea.py can generate expressions):",
+             "- Every Implementation Example MUST be a Python format template using {variable}.",
+             "- Every {variable} MUST come from the allowed_placeholders list provided in user content.",
+             "- When you implement ideas, ONLY use operators from allowed_operators provided.",
+             "- Do NOT include dataset codes/prefixes/horizons in {variable} (suffix-only).",
+             "- If you show raw field ids in tables, use backticks `like_this`, NOT {braces}.",
+             "- Include these metadata lines verbatim somewhere near the top:",
+             " **Dataset**: <dataset_id>",
+             " **Region**: <region>",
+             " **Delay**: <delay>",
+         ]
+     )
+
+     system_prompt = "\n".join(prompt_lines)
+
+     user_prompt = {
+         "instructions": {
+             "output_format": "Fill OUTPUT_TEMPLATE.md with concrete content.",
+             "implementation_examples": (
+                 "Each Implementation Example must be a template with {variable} placeholders. "
+                 "Use only placeholders from allowed_placeholders. "
+                 "Use suffix-only names; do not include dataset code/prefix/horizon."
+             ),
+             "no_code_fences": True,
+             "do_not_invent_placeholders": True,
+         },
+         "dataset_context": {
+             "dataset_id": dataset_id,
+             "dataset_name": dataset_name,
+             "dataset_description": dataset_description,
+             "category": data_category,
+             "region": region,
+             "delay": delay,
+             "universe": universe,
+             "field_count": field_count,
+         },
+
+         "fields": fields_summary,
+     }
+     # print(user_prompt)  # for debug
+     # print(system_prompt)  # for debug
+     return system_prompt, json.dumps(user_prompt, ensure_ascii=False, indent=2)
+
+
+ def _vector_ratio_from_datafields_df(datafields_df) -> float:
+     if datafields_df is None or getattr(datafields_df, "empty", True):
+         return 0.0
+     dtype_col = pick_first_present_column(datafields_df, ["type", "dataType", "data_type"])
+     if not dtype_col:
+         return 0.0
+     counts = datafields_df[dtype_col].astype(str).value_counts().to_dict()
+     vector_count = counts.get("VECTOR", 0)
+     total = sum(counts.values())
+     return (vector_count / total) if total else 0.0
+
+
+ def filter_operators_df(operators_df, keep_vector: bool):
+     """Apply user-confirmed operator filters.
+
+     Rules:
+     - Keep only scope == REGULAR
+     - Drop category == Group
+     - Keep category == Vector only if keep_vector is True
+     - Drop names matching /rank|neutral|normal|scal|zscore/i
+     """
+
+     df = operators_df.copy()
+
+     name_col = pick_first_present_column(df, ["name", "operator", "op", "id"])
+     scope_col = pick_first_present_column(df, ["scope", "scopes"])
+     category_col = pick_first_present_column(df, ["category", "group", "type"])
+     desc_col = pick_first_present_column(df, ["description", "desc", "help", "doc", "documentation"])
+     definition_col = pick_first_present_column(df, ["definition", "syntax"])
+
+     if scope_col:
+         df = df[df[scope_col].astype(str).str.upper() == "REGULAR"]
+
+     if category_col:
+         df = df[df[category_col].astype(str).str.lower() != "group"]
+         if not keep_vector:
+             df = df[df[category_col].astype(str).str.lower() != "vector"]
+
+     if name_col:
+         banned = re.compile(r"(?:rank|neutral|normal|scal|zscore)", flags=re.IGNORECASE)
+         df = df[~df[name_col].astype(str).str.contains(banned, na=False)]
+
+     # de-dup by operator name
+     df = df.drop_duplicates(subset=[name_col]).reset_index(drop=True)
+
+     cols = [c for c in [name_col, category_col, scope_col, desc_col, definition_col] if c]
+     allowed = []
+     for _, row in df.iterrows():
+         item = {
+             "name": row.get(name_col) if name_col else None,
+             "category": row.get(category_col) if category_col else None,
+             "scope": row.get(scope_col) if scope_col else None,
+             "description": row.get(desc_col) if desc_col else None,
+             "definition": row.get(definition_col) if definition_col else None,
+         }
+         # drop None keys to keep prompt compact
+         allowed.append({k: v for k, v in item.items() if v is not None})
+
+     return df, allowed, cols
+
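
A sketch with an invented operators frame (real metadata comes from `ace_lib.get_operators`):

```python
import pandas as pd

ops = pd.DataFrame(
    [
        {"name": "ts_mean", "category": "Time Series", "scope": "REGULAR"},
        {"name": "rank", "category": "Cross Sectional", "scope": "REGULAR"},
        {"name": "group_mean", "category": "Group", "scope": "REGULAR"},
        {"name": "vec_avg", "category": "Vector", "scope": "REGULAR"},
    ]
)
_, allowed, _ = filter_operators_df(ops, keep_vector=False)
[op["name"] for op in allowed]  # -> ["ts_mean"]: "rank" hits the banned-name regex,
                                #    and the Group/Vector categories are dropped
```
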
+ def call_moonshot(api_key: str, model: str, system_prompt: str, user_prompt: str, timeout_s: int = 120):
+     base_url = os.environ.get("MOONSHOT_BASE_URL", "https://api.moonshot.cn/v1")
+     url = f"{base_url.rstrip('/')}/chat/completions"
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json",
+     }
+     payload = {
+         "model": model,
+         "messages": [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_prompt},
+         ],
+
+         # Default to streaming so the user can observe model progress.
+         "stream": True,
+     }
+
+     retries = int(os.environ.get("MOONSHOT_RETRIES", "2"))
+     backoff_s = float(os.environ.get("MOONSHOT_RETRY_BACKOFF", "2"))
+
+     def _stream_sse_and_collect(resp: requests.Response) -> str:
+         """Read OpenAI-compatible SSE stream and print deltas live.
+
+         Still returns the full accumulated assistant content so existing callers
+         (which expect a string) keep working.
+         """
+
+         content_parts: list[str] = []
+         thinking_parts: list[str] = []
+         thinking = False
+
+         # Ensure requests doesn't try to decode as bytes.
+         for raw_line in resp.iter_lines(decode_unicode=True):
+             if not raw_line:
+                 continue
+             line = raw_line.strip()
+             if not line.startswith("data:"):
+                 continue
+             data_str = line[5:].strip()
+             if data_str == "[DONE]":
+                 break
+
+             try:
+                 event = json.loads(data_str)
+             except Exception:
+                 continue
+
+             choices = event.get("choices") or []
+             if not choices:
+                 continue
+             choice0 = choices[0] if isinstance(choices[0], dict) else None
+             if not choice0:
+                 continue
+
+             delta = choice0.get("delta") or {}
+             if not isinstance(delta, dict):
+                 delta = {}
+
+             # Moonshot/Kimi exposes reasoning tokens as `reasoning_content`.
+             reasoning = delta.get("reasoning_content")
+             if reasoning:
+                 if not thinking:
+                     thinking = True
+                     print("=============开始思考=============", flush=True)  # banner: "thinking started"
+                 thinking_parts.append(str(reasoning))
+                 print(str(reasoning), end="", flush=True)
+
+             piece = delta.get("content")
+             if piece:
+                 if thinking:
+                     thinking = False
+                     print("\n=============思考结束=============", flush=True)  # banner: "thinking finished"
+                 content_parts.append(str(piece))
+                 print(str(piece), end="", flush=True)
+
+             finish_reason = choice0.get("finish_reason")
+             if finish_reason:
+                 break
+
+         # If the stream ended while still "thinking", close the marker cleanly.
+         if thinking:
+             print("\n=============思考结束=============", flush=True)  # banner: "thinking finished"
+
+         return "".join(content_parts)
+
+     last_exc: Exception | None = None
+     for attempt in range(retries + 1):
+         try:
+             resp = requests.post(url, headers=headers, json=payload, timeout=timeout_s, stream=True)
+             resp.encoding = "utf-8"
+             if resp.status_code >= 300:
+                 raise RuntimeError(f"Moonshot API error {resp.status_code}: {resp.text}")
+
+             # Prefer SSE streaming when available.
+             ctype = (resp.headers.get("Content-Type") or "").lower()
+             if "text/event-stream" in ctype or payload.get("stream"):
+                 return _stream_sse_and_collect(resp)
+
+             data = resp.json()
+             break
+         except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as exc:
+             last_exc = exc
+             if attempt >= retries:
+                 raise
+             time.sleep(backoff_s * (2**attempt))
+         except requests.exceptions.RequestException as exc:
+             # Other request-layer issues: retry a bit, but don't loop forever.
+             last_exc = exc
+             if attempt >= retries:
+                 raise
+             time.sleep(backoff_s * (2**attempt))
+     else:
+         raise last_exc or RuntimeError("Moonshot request failed")
+
+     try:
+         return data["choices"][0]["message"]["content"]
+     except Exception as exc:
+         raise RuntimeError(f"Unexpected Moonshot response: {data}") from exc
+ def save_ideas_report(content: str, region: str, delay: int, dataset_id: str) -> Path:
+     output_dir = FEATURE_ENGINEERING_DIR / "output_report"
+     output_dir.mkdir(parents=True, exist_ok=True)
+     filename = f"{region}_delay{delay}_{dataset_id}_ideas.md"
+     output_path = output_dir / filename
+     output_path.write_text(content, encoding="utf-8")
+     return output_path
+
+ def extract_templates(markdown_text: str) -> list[str]:
+     """Extract implementation templates from idea markdown.
+
+     For pipeline robustness, this function returns ONLY the template strings.
+     The recommended, higher-fidelity parser is `extract_template_blocks()`,
+     which returns both template + idea text per **Concept** block.
+     """
+
+     blocks = extract_template_blocks(markdown_text)
+     templates = [b["template"] for b in blocks if b.get("template")]
+     return sorted(set(t.strip() for t in templates if t and t.strip()))
+
+
+ def extract_template_blocks(markdown_text: str) -> list[dict[str, str]]:
+     """Parse **Concept** blocks and extract {template, idea}.
+
+     A "block" is a section that starts with a line like:
+         **Concept**: ...
+     and contains a line like:
+         - **Implementation Example**: `...`
+
+     Output:
+         [{"template": <string>, "idea": <string>}, ...]
+
+     Notes:
+     - `template` is taken from inside backticks when present; otherwise uses the
+       remainder of the line after ':'.
+     - `idea` is the rest of the block text (including the concept line and
+       bullets) excluding the implementation example line.
+     """
+
+     concept_re = re.compile(r"^\*\*Concept\*\*\s*:\s*(.*)\s*$")
+     impl_re = re.compile(r"\*\*Implementation Example\*\*\s*:\s*(.*)$", flags=re.IGNORECASE)
+     backtick_re = re.compile(r"`([^`]*)`")
+     boundary_re = re.compile(r"^(?:-{3,}|#{1,6}\s+.*)\s*$")
+
+     lines = markdown_text.splitlines()
+     blocks: list[list[str]] = []
+     current: list[str] = []
+
+     def _flush():
+         nonlocal current
+         if current:
+             # Trim leading/trailing blank lines in block.
+             while current and not current[0].strip():
+                 current.pop(0)
+             while current and not current[-1].strip():
+                 current.pop()
+             if current:
+                 blocks.append(current)
+         current = []
+
+     for line in lines:
+         if concept_re.match(line.strip()):
+             _flush()
+             current = [line]
+             continue
+
+         # If we are inside a concept block and hit a section boundary (e.g. '---', '### Q2'),
+         # close the block so unrelated headings don't get included in the idea text.
+         if current and boundary_re.match(line.strip()):
+             _flush()
+             continue
+
+         if current:
+             current.append(line)
+
+     _flush()
+
+     out: list[dict[str, str]] = []
+     for block_lines in blocks:
+         template: str | None = None
+         impl_line_idx: int | None = None
+
+         # Find the implementation example line (or its continuation).
+         for i, raw in enumerate(block_lines):
+             m = impl_re.search(raw)
+             if not m:
+                 continue
+
+             impl_line_idx = i
+             tail = (m.group(1) or "").strip()
+
+             # Case 1: template is in backticks on the same line.
+             bt = backtick_re.search(tail)
+             if bt:
+                 template = bt.group(1).strip()
+                 break
+
+             # Case 2: tail itself is the template.
+             if tail and ("{" in tail and "}" in tail):
+                 template = tail.strip().strip("`")
+                 break
+
+             # Case 3: template is on the next non-empty line, often in backticks.
+             for j in range(i + 1, min(i + 4, len(block_lines))):
+                 nxt = block_lines[j].strip()
+                 if not nxt:
+                     continue
+                 bt2 = backtick_re.search(nxt)
+                 if bt2:
+                     template = bt2.group(1).strip()
+                     break
+                 if "{" in nxt and "}" in nxt:
+                     template = nxt.strip().strip("`")
+                     break
+             break
+
+         if not template or "{" not in template or "}" not in template:
+             continue
+
+         # idea = all block text except the implementation example line itself.
+         idea_lines: list[str] = []
+         for i, raw in enumerate(block_lines):
+             if impl_line_idx is not None and i == impl_line_idx:
+                 continue
+             idea_lines.append(raw)
+
+         idea = "\n".join(idea_lines).strip()
+         out.append({"template": template.strip(), "idea": idea})
+
+     return out
+
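
A sketch of the block parser on a minimal ideas snippet (contents invented):

```python
md = """
**Concept**: Revenue momentum
- **Implementation Example**: `ts_delta({sales_growth}, 21)`
- Rationale: recent acceleration in sales.
---
"""
extract_template_blocks(md)
# -> [{"template": "ts_delta({sales_growth}, 21)",
#      "idea": "**Concept**: Revenue momentum\n- Rationale: recent acceleration in sales."}]
```

The `---` rule (and any heading) closes the open block, and the implementation-example line itself is excluded from the returned idea text.
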
+ def load_dataset_ids_from_csv(dataset_csv_path: Path) -> list[str]:
+     if not dataset_csv_path.exists():
+         return []
+     ids: list[str] = []
+     with dataset_csv_path.open("r", encoding="utf-8", newline="") as f:
+         reader = csv.DictReader(f)
+         if "id" not in (reader.fieldnames or []):
+             return []
+         for row in reader:
+             v = (row.get("id") or "").strip()
+             if v:
+                 ids.append(v)
+     return ids
+
+ def safe_dataset_id(dataset_id: str) -> str:
+     return "".join([c for c in dataset_id if c.isalnum() or c in ("-", "_")])
+
+ def run_script(args_list: list[str], cwd: Path):
+     result = subprocess.run(args_list, cwd=cwd, capture_output=True, text=True)
+     if result.returncode != 0:
+         raise RuntimeError(
+             "Command failed: "
+             + " ".join(args_list)
+             + f"\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+         )
+     return result.stdout
+
+
+ def delete_path_if_exists(path: Path):
+     """Best-effort delete a file or directory."""
+
+     try:
+         if not path.exists():
+             return
+         if path.is_dir():
+             shutil.rmtree(path, ignore_errors=True)
+         else:
+             path.unlink(missing_ok=True)
+     except Exception:
+         # Best-effort cleanup only; rerun should still proceed.
+         return
+
+ def main():
+     parser = argparse.ArgumentParser(description="Run feature engineering + implementation pipeline")
+     parser.add_argument("--data-category", required=True, help="Dataset category (e.g., analyst, fundamental)")
+     parser.add_argument("--region", required=True, help="Region (e.g., USA, GLB, EUR)")
+     parser.add_argument("--delay", required=True, type=int, help="Delay (0 or 1)")
+     parser.add_argument("--universe", default="TOP3000", help="Universe (default: TOP3000)")
+     parser.add_argument("--dataset-id", required=True, help="Dataset id (required)")
+     parser.add_argument("--instrument-type", default="EQUITY", help="Instrument type (default: EQUITY)")
+     parser.add_argument(
+         "--data-type",
+         default="MATRIX",
+         choices=["MATRIX", "VECTOR"],
+         help="Data type to request from BRAIN datafields (MATRIX or VECTOR). Default: MATRIX",
+     )
+     parser.add_argument("--ideas-file", default=None, help="Use existing ideas markdown instead of generating")
+     parser.add_argument(
+         "--regen-ideas",
+         action="store_true",
+         help="Force regenerating ideas markdown even if the default ideas file already exists",
+     )
+     parser.add_argument("--moonshot-api-key", default=None, help="Moonshot API key (prefer env MOONSHOT_API_KEY)")
+     parser.add_argument("--moonshot-model", default="kimi-k2.5", help="Moonshot model (default: kimi-k2.5)")
+     parser.add_argument("--username", default=None, help="BRAIN username/email (override config/env)")
+     parser.add_argument("--password", default=None, help="BRAIN password (override config/env)")
+     parser.add_argument(
+         "--max-fields",
+         type=int,
+         default=None,
+         help="If set, pass TOP N fields to LLM; if omitted, randomly sample 50 (or all if <50)",
+     )
+     parser.add_argument(
+         "--no-operators-in-prompt",
+         action="store_true",
+         help="Do not include allowed_operators in the idea-generation prompt",
+     )
+     parser.add_argument(
+         "--max-operators",
+         type=int,
+         default=300,
+         help="Max filtered operators to include in prompt (default: 300)",
+     )
+
+     args = parser.parse_args()
+
+     config_path = FEATURE_IMPLEMENTATION_DIR / "config.json"
+     email, password = load_brain_credentials_from_env_or_args(args.username, args.password, config_path)
+     session = start_brain_session(email, password)
+
+     # Always rerun cleanly: remove prior generated artifacts so we never reuse stale ideas/data.
+     # - If --ideas-file is provided, we treat it as user-managed input and do NOT delete it.
+     # - We DO delete the dataset-specific folder under feature-implementation/data.
+     if not args.ideas_file:
+         default_ideas = (
+             FEATURE_ENGINEERING_DIR
+             / "output_report"
+             / f"{args.region}_delay{args.delay}_{args.dataset_id}_ideas.md"
+         )
+         delete_path_if_exists(default_ideas)
+
+     guessed_dataset_folder = f"{safe_dataset_id(args.dataset_id)}_{args.region}_delay{args.delay}"
+     guessed_dataset_dir = FEATURE_IMPLEMENTATION_DIR / "data" / guessed_dataset_folder
+     delete_path_if_exists(guessed_dataset_dir)
+
+     ideas_path = None
+     if args.ideas_file:
+         ideas_path = Path(args.ideas_file).resolve()
+         if not ideas_path.exists():
+             raise FileNotFoundError(f"Ideas file not found: {ideas_path}")
+     else:
+         # Always regenerate ideas (never reuse an existing markdown report).
+         datasets_df = ace_lib.get_datasets(
+             session,
+             instrument_type=args.instrument_type,
+             region=args.region,
+             delay=args.delay,
+             universe=args.universe,
+             theme="ALL",
+         )
+
+         dataset_name = None
+         dataset_description = None
+         id_col = pick_first_present_column(datasets_df, ["id", "dataset_id", "datasetId"])
+         name_col = pick_first_present_column(datasets_df, ["name", "dataset_name", "datasetName"])
+         desc_col = pick_first_present_column(datasets_df, ["description", "desc", "dataset_description"])
+         if id_col:
+             matched = datasets_df[datasets_df[id_col].astype(str) == str(args.dataset_id)]
+             if not matched.empty:
+                 row = matched.iloc[0]
+                 dataset_name = row.get(name_col) if name_col else None
+                 dataset_description = row.get(desc_col) if desc_col else None
+
+         fields_df = ace_lib.get_datafields(
+             session,
+             instrument_type=args.instrument_type,
+             region=args.region,
+             delay=args.delay,
+             universe=args.universe,
+             dataset_id=args.dataset_id,
+             data_type=args.data_type,
+         )
+
+         fields_summary, field_count = build_field_summary(fields_df, max_fields=args.max_fields)
+
+         feature_engineering_skill_md = read_text_optional(FEATURE_ENGINEERING_DIR / "SKILL.md")
+         feature_implementation_skill_md = read_text_optional(FEATURE_IMPLEMENTATION_DIR / "SKILL.md")
+         allowed_metric_suffixes = build_allowed_metric_suffixes(fields_df, max_suffixes=300)
+
+         allowed_operators = []
+         if not args.no_operators_in_prompt:
+             try:
+                 operators_df = ace_lib.get_operators(session)
+                 keep_vector = _vector_ratio_from_datafields_df(fields_df) > 0.5
+                 _, allowed_ops, _ = filter_operators_df(operators_df, keep_vector=keep_vector)
+                 if args.max_operators is not None and args.max_operators > 0:
+                     allowed_operators = allowed_ops[: args.max_operators]
+                 else:
+                     allowed_operators = allowed_ops
+             except Exception as exc:
+                 print(f"Warning: failed to fetch/filter operators; continuing without operators in prompt. Error: {exc}", file=sys.stderr)
+
+         system_prompt, user_prompt = build_prompt(
+             dataset_id=args.dataset_id,
+             dataset_name=dataset_name,
+             dataset_description=dataset_description,
+             data_category=args.data_category,
+             region=args.region,
+             delay=args.delay,
+             universe=args.universe,
+             data_type=args.data_type,
+             fields_summary=fields_summary,
+             field_count=field_count,
+             feature_engineering_skill_md=feature_engineering_skill_md,
+             feature_implementation_skill_md=feature_implementation_skill_md,
+             allowed_metric_suffixes=allowed_metric_suffixes,
+             allowed_operators=allowed_operators,
+         )
+
+         api_key = (
+             args.moonshot_api_key
+             or os.environ.get("MOONSHOT_API_KEY")
+         )
+         if not api_key:
+             raise ValueError("Moonshot API key missing. Set MOONSHOT_API_KEY or pass --moonshot-api-key")
+
+         report = call_moonshot(api_key, args.moonshot_model, system_prompt, user_prompt)
+         # Save first, then normalize placeholders after dataset download.
+         ideas_path = save_ideas_report(report, args.region, args.delay, args.dataset_id)
+
+     ideas_text = ideas_path.read_text(encoding="utf-8")
+
+     # Ensure metadata exists for downstream parsing/reuse.
+     ideas_text = ensure_metadata_block(ideas_text, dataset_id=args.dataset_id, region=args.region, delay=args.delay)
+     ideas_path.write_text(ideas_text, encoding="utf-8")
+
+     # Parse metadata
+     dataset_id_match = re.search(r"\*\*Dataset\*\*:\s*(\S+)", ideas_text)
+     dataset_id = dataset_id_match.group(1) if dataset_id_match else args.dataset_id
+
+     # Download dataset for implementation
+     fetch_script = FEATURE_IMPLEMENTATION_SCRIPTS / "fetch_dataset.py"
+     run_script(
+         [
+             sys.executable,
+             str(fetch_script),
+             "--datasetid",
+             dataset_id,
+             "--region",
+             args.region,
+             "--delay",
+             str(args.delay),
+             "--universe",
+             args.universe,
+             "--instrument-type",
+             args.instrument_type,
+             "--data-type",
+             args.data_type,
+         ],
+         cwd=FEATURE_IMPLEMENTATION_SCRIPTS,
+     )
+
+     dataset_folder = f"{safe_dataset_id(dataset_id)}_{args.region}_delay{args.delay}"
+
+     # If the ideas file references a different dataset id than the CLI args,
+     # ensure we also clean that dataset folder before fetching.
+     if dataset_folder != guessed_dataset_folder:
+         delete_path_if_exists(FEATURE_IMPLEMENTATION_DIR / "data" / dataset_folder)
+
+     dataset_csv_path = FEATURE_IMPLEMENTATION_DIR / "data" / dataset_folder / f"{dataset_folder}.csv"
+     if not dataset_csv_path.exists():
+         raise RuntimeError(
+             "Dataset CSV was not created by fetch_dataset.py. "
+             f"Expected: {dataset_csv_path}"
+         )
+     dataset_ids = load_dataset_ids_from_csv(dataset_csv_path)
+     allowed_suffixes = build_allowed_suffixes_from_ids(dataset_ids, max_suffixes=300) if dataset_ids else []
+     dataset_code = detect_dataset_code(dataset_ids) if dataset_ids else None
+
+     # Extract {template, idea} pairs from **Concept** blocks.
+     block_pairs = extract_template_blocks(ideas_text)
+     if not block_pairs:
+         raise ValueError("No **Concept** blocks with **Implementation Example** found in the ideas file.")
+
+     normalized_pairs: list[tuple[str, str]] = []
+     for item in block_pairs:
+         t = str(item.get("template") or "").strip()
+         idea_text = str(item.get("idea") or "").strip()
+         if not t:
+             continue
+
+         if dataset_ids and allowed_suffixes:
+             normalized_t, ok = normalize_template_placeholders(t, dataset_ids, allowed_suffixes, dataset_code)
+             if not ok:
+                 continue
+             normalized_pairs.append((normalized_t, idea_text))
+         else:
+             # No dataset ids to validate against; pass through.
+             normalized_pairs.append((t, idea_text))
+
+     if not normalized_pairs:
+         raise ValueError("No valid templates remain after normalization/validation.")
+
+     # De-dup by template; keep the first non-empty idea.
+     template_to_idea: dict[str, str] = {}
+     for t, idea_text in normalized_pairs:
+         if t not in template_to_idea or (not template_to_idea[t] and idea_text):
+             template_to_idea[t] = idea_text
+
+     templates = sorted(template_to_idea.keys())
+
+     implement_script = FEATURE_IMPLEMENTATION_SCRIPTS / "implement_idea.py"
+
+     for template in templates:
+         idea_text = template_to_idea.get(template, "")
+         run_script(
+             [
+                 sys.executable,
+                 str(implement_script),
+                 "--template",
+                 template,
+                 "--dataset",
+                 dataset_folder,
+                 "--idea",
+                 idea_text,
+             ],
+             cwd=FEATURE_IMPLEMENTATION_SCRIPTS,
+         )
+
+     merge_script = FEATURE_IMPLEMENTATION_SCRIPTS / "merge_expression_list.py"
+     run_script(
+         [
+             sys.executable,
+             str(merge_script),
+             "--dataset",
+             dataset_folder,
+         ],
+         cwd=FEATURE_IMPLEMENTATION_SCRIPTS,
+     )
+
+     final_path = FEATURE_IMPLEMENTATION_DIR / "data" / dataset_folder / "final_expressions.json"
+     if final_path.exists():
+         try:
+             raw = json.loads(final_path.read_text(encoding="utf-8"))
+         except Exception as exc:
+             raise RuntimeError(f"Failed to read final expressions: {final_path}. Error: {exc}")
+
+         expressions = raw if isinstance(raw, list) else []
+         validator = ExpressionValidator()
+         valid_expressions: list[str] = []
+         invalid_count = 0
+         for expr in expressions:
+             if not isinstance(expr, str) or not expr.strip():
+                 invalid_count += 1
+                 continue
+             result = validator.check_expression(expr.strip())
+             if result.get("valid"):
+                 valid_expressions.append(expr.strip())
+             else:
+                 invalid_count += 1
+
+         final_path.write_text(json.dumps(valid_expressions, ensure_ascii=False, indent=4), encoding="utf-8")
+         print(f"Filtered invalid expressions: {invalid_count}")
+     else:
+         print(f"Warning: final_expressions.json not found: {final_path}")
+
+     print(f"Ideas report: {ideas_path}")
+     print(f"Expressions: {FEATURE_IMPLEMENTATION_DIR / 'data' / dataset_folder / 'final_expressions.json'}")
+
+
+ if __name__ == "__main__":
+     main()
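
Putting it together, a typical invocation (the dataset id and category here are illustrative, chosen to match the sample data folders shipped in this release) would be `python run_pipeline.py --data-category fundamental --region GLB --delay 1 --dataset-id fundamental72` with `MOONSHOT_API_KEY` set in the environment. The run writes the ideas report under `skills/brain-data-feature-engineering/output_report/` and the validated `final_expressions.json` under `skills/brain-feature-implementation/data/fundamental72_GLB_delay1/`.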