cnhkmcp 2.1.9__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnhkmcp/__init__.py +1 -1
- cnhkmcp/untracked/AI/321/206/320/231/320/243/321/205/342/225/226/320/265/321/204/342/225/221/342/225/221/BRAIN_AI/321/206/320/231/320/243/321/205/342/225/226/320/265/321/204/342/225/221/342/225/221Mac_Linux/321/207/320/231/320/230/321/206/320/254/320/274.zip +0 -0
- cnhkmcp/untracked/AI/321/206/320/231/320/243/321/205/342/225/226/320/265/321/204/342/225/221/342/225/221//321/205/320/237/320/234/321/205/320/227/342/225/227/321/205/320/276/320/231/321/210/320/263/320/225AI/321/206/320/231/320/243/321/205/342/225/226/320/265/321/204/342/225/221/342/225/221_Windows/321/207/320/231/320/230/321/206/320/254/320/274.exe +0 -0
- cnhkmcp/untracked/AI/321/206/320/261/320/234/321/211/320/255/320/262/321/206/320/237/320/242/321/204/342/225/227/342/225/242/vector_db/chroma.sqlite3 +0 -0
- cnhkmcp/untracked/skills/brain-data-feature-engineering/OUTPUT_TEMPLATE.md +325 -0
- cnhkmcp/untracked/skills/brain-data-feature-engineering/SKILL.md +263 -0
- cnhkmcp/untracked/skills/brain-data-feature-engineering/examples.md +244 -0
- cnhkmcp/untracked/skills/brain-data-feature-engineering/reference.md +493 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/SKILL.md +87 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/config.json +6 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/analyst15_GLB_delay1.csv +289 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/final_expressions.json +410 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588244.json +4 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588251.json +20 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588273.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588293.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588319.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588322.json +14 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588325.json +20 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588328.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588354.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588357.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588361.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588364.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588368.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588391.json +14 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588394.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588397.json +59 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588400.json +35 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588403.json +20 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588428.json +23 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588431.json +32 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588434.json +20 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588438.json +20 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588441.json +14 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/data/analyst15_GLB_delay1/idea_1768588468.json +20 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/scripts/ace_lib.py +1514 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/scripts/fetch_dataset.py +107 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/scripts/helpful_functions.py +180 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/scripts/implement_idea.py +164 -0
- cnhkmcp/untracked/skills/brain-feature-implementation/scripts/merge_expression_list.py +88 -0
- cnhkmcp/untracked/skills/planning-with-files/SKILL.md +211 -0
- cnhkmcp/untracked/skills/planning-with-files/examples.md +202 -0
- cnhkmcp/untracked/skills/planning-with-files/reference.md +218 -0
- cnhkmcp/untracked/skills/planning-with-files/scripts/check-complete.sh +44 -0
- cnhkmcp/untracked/skills/planning-with-files/scripts/init-session.sh +120 -0
- cnhkmcp/untracked/skills/planning-with-files/templates/findings.md +95 -0
- cnhkmcp/untracked/skills/planning-with-files/templates/progress.md +114 -0
- cnhkmcp/untracked/skills/planning-with-files/templates/task_plan.md +132 -0
- {cnhkmcp-2.1.9.dist-info → cnhkmcp-2.2.0.dist-info}/METADATA +1 -1
- {cnhkmcp-2.1.9.dist-info → cnhkmcp-2.2.0.dist-info}/RECORD +55 -10
- {cnhkmcp-2.1.9.dist-info → cnhkmcp-2.2.0.dist-info}/WHEEL +0 -0
- {cnhkmcp-2.1.9.dist-info → cnhkmcp-2.2.0.dist-info}/entry_points.txt +0 -0
- {cnhkmcp-2.1.9.dist-info → cnhkmcp-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {cnhkmcp-2.1.9.dist-info → cnhkmcp-2.2.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ cnhkmcp/untracked/skills/brain-feature-implementation/scripts/fetch_dataset.py
@@ -0,0 +1,107 @@
+import json
+import os
+import argparse
+import pandas as pd
+from pathlib import Path
+import sys
+
+print("Script started...", flush=True)
+
+# Ensure local imports work by adding the script directory to sys.path
+script_dir = Path(__file__).resolve().parent
+sys.path.append(str(script_dir))
+
+try:
+    import ace_lib
+except ImportError:
+    print("Error: Could not import 'ace_lib'. Make sure it is in the same directory.")
+    sys.exit(1)
+
+def load_config(config_path):
+    try:
+        with open(config_path, 'r') as f:
+            return json.load(f)
+    except Exception as e:
+        print(f"Error loading config file: {e}")
+        return None
+
+def main():
+    parser = argparse.ArgumentParser(description="Fetch dataset fields from WorldQuant BRAIN")
+    parser.add_argument("--datasetid", required=True, help="ID of the dataset to fetch (e.g., specific dataset ID)")
+    parser.add_argument("--region", default="USA", help="Region (default: USA)")
+    parser.add_argument("--delay", type=int, default=1, help="Delay (default: 1)")
+    parser.add_argument("--universe", default="TOP3000", help="Universe (default: TOP3000)")
+    parser.add_argument("--instrument-type", default="EQUITY", dest="instrument_type", help="Instrument Type (default: EQUITY)")
+
+    args = parser.parse_args()
+
+    # Determine paths relative to this script
+    # User requested: robust and no absolute paths hardcoded
+    workspace_dir = script_dir.parent
+    config_path = workspace_dir / "config.json"
+    data_dir = workspace_dir / "data"
+
+    # Ensure data directory exists
+    data_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load configuration
+    if not config_path.exists():
+        print(f"Error: Config file not found at {config_path}")
+        sys.exit(1)
+
+    config = load_config(config_path)
+    if not config:
+        sys.exit(1)
+
+    # Extract credentials
+    creds = config.get("BRAIN_CREDENTIALS", {})
+    email = creds.get("email")
+    password = creds.get("password")
+
+    if not email or not password:
+        print("Error: BRAIN_CREDENTIALS (email/password) not found in config.json")
+        sys.exit(1)
+
+    # Override ace_lib.get_credentials to use our config values
+    # ace_lib.start_session() internally calls get_credentials()
+    ace_lib.get_credentials = lambda: (email, password)
+
+    try:
+        print(f"Logging in as {email}...")
+        session = ace_lib.start_session()
+
+        print(f"Fetching datafields for dataset: {args.datasetid} (Region: {args.region}, Delay: {args.delay})...")
+
+        # Fetch datafields using the library function
+        df = ace_lib.get_datafields(
+            session,
+            dataset_id=args.datasetid,
+            region=args.region,
+            delay=args.delay,
+            universe=args.universe,
+            instrument_type=args.instrument_type
+        )
+
+        if df is None or df.empty:
+            print("Warning: No data found or empty response.")
+        else:
+            # Construct a safe filename and folder name
+            safe_dataset_id = "".join([c for c in args.datasetid if c.isalnum() or c in ('-','_')])
+            folder_name = f"{safe_dataset_id}_{args.region}_delay{args.delay}"
+            dataset_folder = data_dir / folder_name
+            dataset_folder.mkdir(parents=True, exist_ok=True)
+
+            filename = f"{folder_name}.csv"
+            output_path = dataset_folder / filename
+
+            print(f"Saving {len(df)} records to {output_path}...")
+            df.to_csv(output_path, index=False)
+            print("Success.")
+
+    except Exception as e:
+        print(f"An error occurred during execution: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    main()
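For orientation: fetch_dataset.py reads BRAIN credentials from the sibling config.json (the 6-line file listed above) under a BRAIN_CREDENTIALS key and names its output folder <datasetid>_<region>_delay<delay>, which is how the bundled data/analyst15_GLB_delay1 folder would have been produced. A minimal driver sketch with placeholder credentials (the email/password values and the workspace path are hypothetical; only the config keys and CLI flags come from the script above):

    # Hypothetical values throughout; config keys and flags mirror fetch_dataset.py.
    import json, subprocess, sys
    from pathlib import Path

    workspace = Path("cnhkmcp/untracked/skills/brain-feature-implementation")

    # config.json must provide BRAIN_CREDENTIALS.email / .password (placeholders here).
    (workspace / "config.json").write_text(json.dumps(
        {"BRAIN_CREDENTIALS": {"email": "user@example.com", "password": "********"}},
        indent=4))

    # Fetch the datafields; output lands in data/analyst15_GLB_delay1/analyst15_GLB_delay1.csv.
    subprocess.run(
        [sys.executable, str(workspace / "scripts" / "fetch_dataset.py"),
         "--datasetid", "analyst15", "--region", "GLB", "--delay", "1"],
        check=True,
    )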
--- /dev/null
+++ cnhkmcp/untracked/skills/brain-feature-implementation/scripts/helpful_functions.py
@@ -0,0 +1,180 @@
+import json
+import os
+from typing import Union
+
+import pandas as pd
+from pandas.io.formats.style import Styler
+
+brain_api_url = os.environ.get("BRAIN_API_URL", "https://api.worldquantbrain.com")
+brain_url = os.environ.get("BRAIN_URL", "https://platform.worldquantbrain.com")
+
+
+def make_clickable_alpha_id(alpha_id: str) -> str:
+    """
+    Create a clickable HTML link for an alpha ID.
+
+    Args:
+        alpha_id (str): The ID of the alpha.
+
+    Returns:
+        str: An HTML string containing a clickable link to the alpha's page on the platform.
+    """
+
+    url = brain_url + "/alpha/"
+    return f'<a href="{url}{alpha_id}">{alpha_id}</a>'
+
+
+def prettify_result(
+    result: list, detailed_tests_view: bool = False, clickable_alpha_id: bool = False
+) -> Union[pd.DataFrame, Styler]:
+    """
+    Combine and format simulation results into a single DataFrame for analysis.
+
+    Args:
+        result (list): A list of dictionaries containing simulation results.
+        detailed_tests_view (bool, optional): If True, include detailed test results. Defaults to False.
+        clickable_alpha_id (bool, optional): If True, make alpha IDs clickable. Defaults to False.
+
+    Returns:
+        pandas.DataFrame or pandas.io.formats.style.Styler: A DataFrame containing formatted results,
+        optionally with clickable alpha IDs.
+    """
+    list_of_is_stats = [result[x]["is_stats"] for x in range(len(result)) if result[x]["is_stats"] is not None]
+    is_stats_df = pd.concat(list_of_is_stats).reset_index(drop=True)
+    is_stats_df = is_stats_df.sort_values("fitness", ascending=False)
+
+    expressions = {
+        result[x]["alpha_id"]: (
+            {
+                "selection": result[x]["simulate_data"]["selection"],
+                "combo": result[x]["simulate_data"]["combo"],
+            }
+            if result[x]["simulate_data"]["type"] == "SUPER"
+            else result[x]["simulate_data"]["regular"]
+        )
+        for x in range(len(result))
+        if result[x]["is_stats"] is not None
+    }
+    expression_df = pd.DataFrame(list(expressions.items()), columns=["alpha_id", "expression"])
+
+    list_of_is_tests = [result[x]["is_tests"] for x in range(len(result)) if result[x]["is_tests"] is not None]
+    is_tests_df = pd.concat(list_of_is_tests, sort=True).reset_index(drop=True)
+    is_tests_df = is_tests_df[is_tests_df["result"] != "WARNING"]
+    if detailed_tests_view:
+        cols = ["limit", "result", "value"]
+        is_tests_df["details"] = is_tests_df[cols].to_dict(orient="records")
+        is_tests_df = is_tests_df.pivot(index="alpha_id", columns="name", values="details").reset_index()
+    else:
+        is_tests_df = is_tests_df.pivot(index="alpha_id", columns="name", values="result").reset_index()
+
+    alpha_stats = pd.merge(is_stats_df, expression_df, on="alpha_id")
+    alpha_stats = pd.merge(alpha_stats, is_tests_df, on="alpha_id")
+    alpha_stats = alpha_stats.drop(columns=alpha_stats.columns[(alpha_stats == "PENDING").any()])
+    alpha_stats.columns = alpha_stats.columns.str.replace("(?<=[a-z])(?=[A-Z])", "_", regex=True).str.lower()
+    if clickable_alpha_id:
+        return alpha_stats.style.format({"alpha_id": lambda x: make_clickable_alpha_id(str(x))})
+    return alpha_stats
+
+
+def concat_pnl(result: list) -> pd.DataFrame:
+    """
+    Combine PnL results from multiple alphas into a single DataFrame.
+
+    Args:
+        result (list): A list of dictionaries containing simulation results with PnL data.
+
+    Returns:
+        pandas.DataFrame: A DataFrame containing combined PnL data for all alphas.
+    """
+    list_of_pnls = [result[x]["pnl"] for x in range(len(result)) if result[x]["pnl"] is not None]
+    pnls_df = pd.concat(list_of_pnls).reset_index()
+
+    return pnls_df
+
+
+def concat_is_tests(result: list) -> pd.DataFrame:
+    """
+    Combine in-sample test results from multiple alphas into a single DataFrame.
+
+    Args:
+        result (list): A list of dictionaries containing simulation results with in-sample test data.
+
+    Returns:
+        pandas.DataFrame: A DataFrame containing combined in-sample test results for all alphas.
+    """
+    is_tests_list = [result[x]["is_tests"] for x in range(len(result)) if result[x]["is_tests"] is not None]
+    is_tests_df = pd.concat(is_tests_list, sort=True).reset_index(drop=True)
+    return is_tests_df
+
+
+def save_simulation_result(result: dict) -> None:
+    """
+    Save the simulation result to a JSON file in the 'simulation_results' folder.
+
+    Args:
+        result (dict): A dictionary containing the simulation result for an alpha.
+    """
+
+    alpha_id = result["id"]
+    region = result["settings"]["region"]
+    folder_path = "simulation_results/"
+    file_path = os.path.join(folder_path, f"{alpha_id}_{region}")
+
+    os.makedirs(folder_path, exist_ok=True)
+
+    with open(file_path, "w", encoding="utf-8") as file:
+        json.dump(result, file)
+
+
+def save_pnl(pnl_df: pd.DataFrame, alpha_id: str, region: str) -> None:
+    """
+    Save the PnL data for an alpha to a CSV file in the 'alphas_pnl' folder.
+
+    Args:
+        pnl_df (pandas.DataFrame): The DataFrame containing PnL data.
+        alpha_id (str): The ID of the alpha.
+        region (str): The region for which the PnL data was generated.
+    """
+
+    folder_path = "alphas_pnl/"
+    file_path = os.path.join(folder_path, f"{alpha_id}_{region}.csv")
+    os.makedirs(folder_path, exist_ok=True)
+
+    pnl_df.to_csv(file_path)
+
+
+def save_yearly_stats(yearly_stats: pd.DataFrame, alpha_id: str, region: str):
+    """
+    Save the yearly statistics for an alpha to a CSV file in the 'yearly_stats' folder.
+
+    Args:
+        yearly_stats (pandas.DataFrame): The DataFrame containing yearly statistics.
+        alpha_id (str): The ID of the alpha.
+        region (str): The region for which the statistics were generated.
+    """
+
+    folder_path = "yearly_stats/"
+    file_path = os.path.join(folder_path, f"{alpha_id}_{region}.csv")
+    os.makedirs(folder_path, exist_ok=True)
+
+    yearly_stats.to_csv(file_path, index=False)
+
+
+def expand_dict_columns(data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Expand dictionary columns in a DataFrame into separate columns.
+
+    Args:
+        data (pandas.DataFrame): The input DataFrame with dictionary columns.
+
+    Returns:
+        pandas.DataFrame: A new DataFrame with expanded columns.
+    """
+    dict_columns = list(filter(lambda x: isinstance(data[x].iloc[0], dict), data.columns))
+    new_columns = pd.concat(
+        [data[col].apply(pd.Series).rename(columns=lambda x: f"{col}_{x}") for col in dict_columns],
+        axis=1,
+    )
+
+    data = pd.concat([data, new_columns], axis=1)
+    return data
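helpful_functions.py mostly post-processes simulation results returned through ace_lib, so it is hard to exercise without a BRAIN session. One helper that stands alone is expand_dict_columns, which flattens dict-valued columns into <column>_<key> columns. A toy illustration with made-up data, assuming the module is importable from the scripts/ directory:

    import pandas as pd
    # assumes helpful_functions.py is on the import path (e.g. run from scripts/)
    from helpful_functions import expand_dict_columns

    df = pd.DataFrame({
        "alpha_id": ["abc123", "def456"],                 # made-up IDs
        "settings": [{"region": "GLB", "delay": 1},
                     {"region": "USA", "delay": 0}],
    })
    out = expand_dict_columns(df)
    print(out.columns.tolist())
    # ['alpha_id', 'settings', 'settings_region', 'settings_delay']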
--- /dev/null
+++ cnhkmcp/untracked/skills/brain-feature-implementation/scripts/implement_idea.py
@@ -0,0 +1,164 @@
+import pandas as pd
+from pathlib import Path
+import argparse
+import sys
+import re
+import json
+import time
+
+def load_data(dataset_name=None):
+    script_dir = Path(__file__).resolve().parent
+    workspace_dir = script_dir.parent
+
+    if not dataset_name:
+        data_root = workspace_dir / "data"
+        if not data_root.exists():
+            print("Error: Data directory not found.", file=sys.stderr)
+            sys.exit(1)
+
+        subdirs = [d for d in data_root.iterdir() if d.is_dir()]
+
+        if len(subdirs) == 1:
+            dataset_name = subdirs[0].name
+            print(f"Auto-detected dataset: {dataset_name}", file=sys.stderr)
+        elif len(subdirs) > 1:
+            print("Error: Multiple datasets found. Please specify --dataset.", file=sys.stderr)
+            print("Available datasets:", file=sys.stderr)
+            for d in subdirs:
+                print(f" {d.name}", file=sys.stderr)
+            sys.exit(1)
+        else:
+            print("Error: No dataset folders found inside data directory.", file=sys.stderr)
+            sys.exit(1)
+
+    dataset_dir = workspace_dir / "data" / dataset_name
+    data_path = dataset_dir / f"{dataset_name}.csv"
+
+    print(f"Loading data from {data_path}...", file=sys.stderr)
+    try:
+        df = pd.read_csv(data_path)
+        return df, dataset_dir
+    except FileNotFoundError:
+        print(f"Error: Data file not found at {data_path}. Please run fetch_dataset.py first.", file=sys.stderr)
+        sys.exit(1)
+
+def extract_keys_from_template(template):
+    return re.findall(r'\{([A-Za-z0-9_]+)\}', template)
+
+def match_single_horizon_auto(df, template):
+    """
+    Auto-detects metrics from template and finds matching fields.
+    """
+    metrics = extract_keys_from_template(template)
+    if not metrics:
+        print("Error: No variables found in template (use {variable} format).", file=sys.stderr)
+        return []
+
+    # Sort metrics by length descending to match most specific suffixes first
+    metrics = sorted(metrics, key=len, reverse=True)
+    primary = metrics[0]
+
+    # Try different separators or exact match
+    # we look for columns that end with the primary metric, optionally followed by numeric suffix (e.g. _1234)
+    # Regex: .*<primary>(?:_\d+)?$
+    import re
+    primary_regex = re.escape(primary) + r'(?:_\d+)?$'
+    candidates = df[df['id'].str.match(f'.*{primary_regex}')]['id'].unique().tolist()
+
+    results = []
+    seen = set()
+
+    # Try different separators or exact match
+    # We look for columns that contain the primary metric at any position
+    import re
+    primary_regex = re.escape(primary)
+    candidates = df[df['id'].str.contains(primary_regex, regex=True)]['id'].unique().tolist()
+
+    results = []
+    seen = set()
+
+    for cand in candidates:
+        # Determine base prefix
+        # We identify the prefix by taking everything before the first occurrence of the primary metric
+        match = re.search(re.escape(primary), cand)
+        if not match:
+            continue
+
+        # Base includes everything up to the metric (e.g., "dataset_prefix_")
+        base = cand[:match.start()]
+
+        # Verify other metrics exist with this base
+        field_map = {primary: cand}
+        all_found = True
+
+        for m in metrics[1:]:
+            # Construct target pattern for other metrics: Must start with the same base followed by the metric
+            # We allow any suffix after the metric (e.g. IDs, versions)
+            target_pattern = f"^{re.escape(base)}{re.escape(m)}"
+            target_matches = df[df['id'].str.match(target_pattern)]['id'].tolist()
+
+            if not target_matches:
+                all_found = False
+                break
+            # Use the first match found for the secondary metric
+            field_map[m] = target_matches[0]
+
+        if all_found:
+            try:
+                expr = template.format(**field_map)
+                if expr not in seen:
+                    seen.add(expr)
+                    # Create a readable label for the horizon/group
+                    if base:
+                        # Strip standard separators
+                        horizon_label = base.strip("_")
+                    else:
+                        horizon_label = "global"
+
+                    results.append((horizon_label, expr))
+            except KeyError as e:
+                continue
+
+    return results
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate Alpha Expressions based on patterns")
+    parser.add_argument("--template", required=True, help="Python format string (e.g. '{st_dev} / abs({mean})')")
+    parser.add_argument("--dataset", help="Name of the dataset folder. Auto-detected if only one exists.")
+
+    args = parser.parse_args()
+
+    df, dataset_dir = load_data(args.dataset)
+
+    results = match_single_horizon_auto(df, args.template)
+
+    # Output
+    expression_list = []
+    if not results:
+        print("No matching expressions found.")
+    else:
+        print(f"Generated {len(results)} expressions:\n")
+        # print(f"{'Context':<30} | Expression")
+        # print("-" * 120)
+
+        for context, expr in results:
+            # print(f"{context:<30} | {expr}")
+            expression_list.append(expr)
+
+    # Save results to JSON (Always save for debugging)
+    timestamp = int(time.time())
+    json_output = {
+        "template": args.template,
+        "expression_list": expression_list
+    }
+
+    output_file = dataset_dir / f"idea_{timestamp}.json"
+    try:
+        with open(output_file, 'w') as f:
+            json.dump(json_output, f, indent=4)
+        print(f"\nSaved idea configuration to: {output_file}")
+    except Exception as e:
+        print(f"Error saving JSON: {e}", file=sys.stderr)
+
+if __name__ == "__main__":
+    main()
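The core of implement_idea.py is the placeholder-matching loop: the longest placeholder in the template is taken as the primary metric, every datafield id containing it defines a candidate prefix, and the remaining placeholders must resolve to ids sharing that prefix. A self-contained sketch of that idea on made-up field ids (not the real analyst15 fields):

    # Simplified re-implementation for illustration only; ids and template are hypothetical.
    import re

    field_ids = [
        "anl_low_eps_mean", "anl_low_eps_stddev",
        "anl_high_eps_mean", "anl_high_eps_stddev",
    ]
    template = "{stddev} / abs({mean})"

    metrics = sorted(re.findall(r"\{([A-Za-z0-9_]+)\}", template), key=len, reverse=True)
    primary = metrics[0]                       # longest placeholder first, here "stddev"

    expressions = []
    for cand in (f for f in field_ids if primary in f):
        base = cand[:cand.index(primary)]      # shared prefix, e.g. "anl_low_eps_"
        field_map = {primary: cand}
        for m in metrics[1:]:
            match = next((f for f in field_ids if f.startswith(base + m)), None)
            if match is None:
                break                          # a sibling metric is missing: drop this prefix
            field_map[m] = match
        else:
            expressions.append(template.format(**field_map))

    print(expressions)
    # ['anl_low_eps_stddev / abs(anl_low_eps_mean)',
    #  'anl_high_eps_stddev / abs(anl_high_eps_mean)']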
--- /dev/null
+++ cnhkmcp/untracked/skills/brain-feature-implementation/scripts/merge_expression_list.py
@@ -0,0 +1,88 @@
+import json
+import argparse
+from pathlib import Path
+import sys
+
+def load_data_dir(dataset_name=None):
+    script_dir = Path(__file__).resolve().parent
+    workspace_dir = script_dir.parent
+
+    if not dataset_name:
+        data_root = workspace_dir / "data"
+        if not data_root.exists():
+            print("Error: Data directory not found.", file=sys.stderr)
+            sys.exit(1)
+
+        subdirs = [d for d in data_root.iterdir() if d.is_dir()]
+
+        if len(subdirs) == 1:
+            dataset_name = subdirs[0].name
+            print(f"Auto-detected dataset: {dataset_name}", file=sys.stderr)
+            return workspace_dir / "data" / dataset_name
+        elif len(subdirs) > 1:
+            print("Error: Multiple datasets found. Please specify --dataset.", file=sys.stderr)
+            sys.exit(1)
+        else:
+            print("Error: No dataset folders found inside data directory.", file=sys.stderr)
+            sys.exit(1)
+
+    return workspace_dir / "data" / dataset_name
+
+def main():
+    parser = argparse.ArgumentParser(description="Merge all generated expressions from idea JSON files.")
+    parser.add_argument("--dataset", help="Name of the dataset folder containing idea JSONs.")
+    parser.add_argument("--output", default="final_expressions.json", help="Output filename.")
+
+    args = parser.parse_args()
+
+    dataset_dir = load_data_dir(args.dataset)
+
+    if not dataset_dir.exists():
+        print(f"Error: Dataset directory {dataset_dir} does not exist.", file=sys.stderr)
+        sys.exit(1)
+
+    all_expressions = []
+
+    # Find all idea_*.json files
+    json_files = list(dataset_dir.glob("idea_*.json"))
+
+    if not json_files:
+        print(f"No idea_*.json files found in {dataset_dir}", file=sys.stderr)
+        sys.exit(0)
+
+    print(f"Found {len(json_files)} idea files. Merging...")
+
+    for jf in json_files:
+        try:
+            with open(jf, 'r') as f:
+                data = json.load(f)
+            exprs = data.get("expression_list", [])
+            if exprs:
+                all_expressions.extend(exprs)
+                print(f" + {jf.name}: {len(exprs)} expressions")
+            else:
+                print(f" - {jf.name}: 0 expressions")
+        except Exception as e:
+            print(f" ! Error reading {jf.name}: {e}", file=sys.stderr)
+
+    # Remove duplicates if desired? Usually we keep them or set them.
+    # Let's make unique to be safe, but preserve order as best as possible.
+    unique_expressions = []
+    seen = set()
+    for ex in all_expressions:
+        if ex not in seen:
+            unique_expressions.append(ex)
+            seen.add(ex)
+
+    output_path = dataset_dir / args.output
+
+    try:
+        with open(output_path, 'w') as f:
+            json.dump(unique_expressions, f, indent=4)
+        print(f"\nSuccessfully merged {len(unique_expressions)} unique expressions.")
+        print(f"Output saved to: {output_path}")
+    except Exception as e:
+        print(f"Error saving output: {e}", file=sys.stderr)
+
+if __name__ == "__main__":
+    main()
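Taken together, the three scripts form a small pipeline: fetch_dataset.py writes data/<dataset>/<dataset>.csv, implement_idea.py turns each template into an idea_<timestamp>.json in that folder, and merge_expression_list.py deduplicates them into final_expressions.json (the 410-line file listed above). A sketch of the last two steps, assuming the fetch step has already created a single dataset folder so --dataset can be auto-detected; the second template is hypothetical, the first is the example quoted in the script's own --template help text:

    import subprocess, sys
    from pathlib import Path

    scripts = Path("cnhkmcp/untracked/skills/brain-feature-implementation/scripts")

    # Each run writes data/analyst15_GLB_delay1/idea_<timestamp>.json
    for template in ["{st_dev} / abs({mean})",   # example from the --template help text
                     "rank({st_dev})"]:          # hypothetical second idea
        subprocess.run([sys.executable, str(scripts / "implement_idea.py"),
                        "--template", template], check=True)

    # Collect every idea_*.json into final_expressions.json (duplicates removed, order kept)
    subprocess.run([sys.executable, str(scripts / "merge_expression_list.py")], check=True)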