PyPI - cnhkmcp - Versions diffs - 2.3.0__py3-none-any.whl → 2.3.2__py3-none-any.whl - Mend

cnhkmcp 2.3.0__py3-none-any.whl → 2.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

cnhkmcp/untracked/APP/trailSomeAlphas/run_pipeline.py CHANGED Viewed

@@ -3,6 +3,7 @@ import datetime as dt
 import json
 import os
 import re
+import shutil
 import subprocess
 import sys
 import csv
@@ -327,6 +328,7 @@ def build_prompt(
     region: str,
     delay: int,
     universe: str,
+    data_type: str,
     fields_summary: list[dict],
     field_count: int,
     feature_engineering_skill_md: str,
@@ -336,8 +338,7 @@ def build_prompt(
 ):
     # NOTE: The user requested that we DO NOT invent our own system prompt.
     # Instead, we embed the two skill specs as the authoritative instructions.
-    system_prompt = "\n".join(
-        [
+    prompt_lines = [
             "You are executing two skills in sequence:",
             "1) brain-data-feature-engineering",
             "2) brain-feature-implementation",
@@ -353,6 +354,15 @@ def build_prompt(
             "-------",
             f'"allowed_placeholders": {allowed_metric_suffixes}',
             "",
+        ]
+    if str(data_type).upper() == "VECTOR":
+        prompt_lines.append(
+            "since all the following the data is vector type data, before you do any process, you should choose a vector operator to generate its statistical feature to use, the data cannot be directly use. for example, if datafieldA and datafieldB are vector type data, you can use vec_avg(datafieldA) -  vec_avg(datafieldB), where vec_avg() operator is used to generate the average of the data on a certain date. similarly, vector type operator can only be used on the vector type operator directly and cannot be nested, for example vec_avg(vec_sum(datafield)) is a false use."
+        )
+    prompt_lines.extend(
+        [
             "CRITICAL OUTPUT RULES (to ensure implement_idea.py can generate expressions):",
             "- Every Implementation Example MUST be a Python format template using {variable}.",
             "- Every {variable} MUST come from the allowed_placeholders list provided in user content.",
@@ -366,6 +376,8 @@ def build_prompt(
         ]
     )
+    system_prompt = "\n".join(prompt_lines)
     user_prompt = {
         "instructions": {
             "output_format": "Fill OUTPUT_TEMPLATE.md with concrete content.",
@@ -732,6 +744,21 @@ def run_script(args_list: list[str], cwd: Path):
         )
     return result.stdout
+def delete_path_if_exists(path: Path):
+    """Best-effort delete a file or directory."""
+    try:
+        if not path.exists():
+            return
+        if path.is_dir():
+            shutil.rmtree(path, ignore_errors=True)
+        else:
+            path.unlink(missing_ok=True)
+    except Exception:
+        # Best-effort cleanup only; rerun should still proceed.
+        return
 def main():
     parser = argparse.ArgumentParser(description="Run feature engineering + implementation pipeline")
     parser.add_argument("--data-category", required=True, help="Dataset category (e.g., analyst, fundamental)")
@@ -740,6 +767,12 @@ def main():
     parser.add_argument("--universe", default="TOP3000", help="Universe (default: TOP3000)")
     parser.add_argument("--dataset-id", required=True, help="Dataset id (required)")
     parser.add_argument("--instrument-type", default="EQUITY", help="Instrument type (default: EQUITY)")
+    parser.add_argument(
+        "--data-type",
+        default="MATRIX",
+        choices=["MATRIX", "VECTOR"],
+        help="Data type to request from BRAIN datafields (MATRIX or VECTOR). Default: MATRIX",
+    )
     parser.add_argument("--ideas-file", default=None, help="Use existing ideas markdown instead of generating")
     parser.add_argument(
         "--regen-ideas",
@@ -774,96 +807,105 @@ def main():
     email, password = load_brain_credentials_from_env_or_args(args.username, args.password, config_path)
     session = start_brain_session(email, password)
+    # Always rerun cleanly: remove prior generated artifacts so we never reuse stale ideas/data.
+    # - If --ideas-file is provided, we treat it as user-managed input and do NOT delete it.
+    # - We DO delete the dataset-specific folder under feature-implementation/data.
+    if not args.ideas_file:
+        default_ideas = (
+            FEATURE_ENGINEERING_DIR
+            / "output_report"
+            / f"{args.region}_delay{args.delay}_{args.dataset_id}_ideas.md"
+        )
+        delete_path_if_exists(default_ideas)
+    guessed_dataset_folder = f"{safe_dataset_id(args.dataset_id)}_{args.region}_delay{args.delay}"
+    guessed_dataset_dir = FEATURE_IMPLEMENTATION_DIR / "data" / guessed_dataset_folder
+    delete_path_if_exists(guessed_dataset_dir)
     ideas_path = None
     if args.ideas_file:
         ideas_path = Path(args.ideas_file).resolve()
         if not ideas_path.exists():
             raise FileNotFoundError(f"Ideas file not found: {ideas_path}")
     else:
-        default_ideas = (
-            FEATURE_ENGINEERING_DIR
-            / "output_report"
-            / f"{args.region}_delay{args.delay}_{args.dataset_id}_ideas.md"
+        # Always regenerate ideas (never reuse an existing markdown report).
+        datasets_df = ace_lib.get_datasets(
+            session,
+            instrument_type=args.instrument_type,
+            region=args.region,
+            delay=args.delay,
+            universe=args.universe,
+            theme="ALL",
         )
-        if default_ideas.exists() and not args.regen_ideas:
-            ideas_path = default_ideas
-        else:
-            datasets_df = ace_lib.get_datasets(
-                session,
-                instrument_type=args.instrument_type,
-                region=args.region,
-                delay=args.delay,
-                universe=args.universe,
-                theme="ALL",
-            )
-            dataset_name = None
-            dataset_description = None
-            id_col = pick_first_present_column(datasets_df, ["id", "dataset_id", "datasetId"])
-            name_col = pick_first_present_column(datasets_df, ["name", "dataset_name", "datasetName"])
-            desc_col = pick_first_present_column(datasets_df, ["description", "desc", "dataset_description"])
-            if id_col:
-                matched = datasets_df[datasets_df[id_col].astype(str) == str(args.dataset_id)]
-                if not matched.empty:
-                    row = matched.iloc[0]
-                    dataset_name = row.get(name_col) if name_col else None
-                    dataset_description = row.get(desc_col) if desc_col else None
-            fields_df = ace_lib.get_datafields(
-                session,
-                instrument_type=args.instrument_type,
-                region=args.region,
-                delay=args.delay,
-                universe=args.universe,
-                dataset_id=args.dataset_id,
-                data_type="ALL",
-            )
-            fields_summary, field_count = build_field_summary(fields_df, max_fields=args.max_fields)
-            feature_engineering_skill_md = read_text_optional(FEATURE_ENGINEERING_DIR / "SKILL.md")
-            feature_implementation_skill_md = read_text_optional(FEATURE_IMPLEMENTATION_DIR / "SKILL.md")
-            allowed_metric_suffixes = build_allowed_metric_suffixes(fields_df, max_suffixes=300)
-            allowed_operators = []
-            if not args.no_operators_in_prompt:
-                try:
-                    operators_df = ace_lib.get_operators(session)
-                    keep_vector = _vector_ratio_from_datafields_df(fields_df) > 0.5
-                    _, allowed_ops, _ = filter_operators_df(operators_df, keep_vector=keep_vector)
-                    if args.max_operators is not None and args.max_operators > 0:
-                        allowed_operators = allowed_ops[: args.max_operators]
-                    else:
-                        allowed_operators = allowed_ops
-                except Exception as exc:
-                    print(f"Warning: failed to fetch/filter operators; continuing without operators in prompt. Error: {exc}", file=sys.stderr)
-            system_prompt, user_prompt = build_prompt(
-                dataset_id=args.dataset_id,
-                dataset_name=dataset_name,
-                dataset_description=dataset_description,
-                data_category=args.data_category,
-                region=args.region,
-                delay=args.delay,
-                universe=args.universe,
-                fields_summary=fields_summary,
-                field_count=field_count,
-                feature_engineering_skill_md=feature_engineering_skill_md,
-                feature_implementation_skill_md=feature_implementation_skill_md,
-                allowed_metric_suffixes=allowed_metric_suffixes,
-                allowed_operators=allowed_operators,
-            )
-            api_key = (
-                args.moonshot_api_key
-                or os.environ.get("MOONSHOT_API_KEY")
-            )
-            if not api_key:
-                raise ValueError("Moonshot API key missing. Set MOONSHOT_API_KEY or pass --moonshot-api-key")
-            report = call_moonshot(api_key, args.moonshot_model, system_prompt, user_prompt)
-            # Save first, then normalize placeholders after dataset download.
-            ideas_path = save_ideas_report(report, args.region, args.delay, args.dataset_id)
+        dataset_name = None
+        dataset_description = None
+        id_col = pick_first_present_column(datasets_df, ["id", "dataset_id", "datasetId"])
+        name_col = pick_first_present_column(datasets_df, ["name", "dataset_name", "datasetName"])
+        desc_col = pick_first_present_column(datasets_df, ["description", "desc", "dataset_description"])
+        if id_col:
+            matched = datasets_df[datasets_df[id_col].astype(str) == str(args.dataset_id)]
+            if not matched.empty:
+                row = matched.iloc[0]
+                dataset_name = row.get(name_col) if name_col else None
+                dataset_description = row.get(desc_col) if desc_col else None
+        fields_df = ace_lib.get_datafields(
+            session,
+            instrument_type=args.instrument_type,
+            region=args.region,
+            delay=args.delay,
+            universe=args.universe,
+            dataset_id=args.dataset_id,
+            data_type=args.data_type,
+        )
+        fields_summary, field_count = build_field_summary(fields_df, max_fields=args.max_fields)
+        feature_engineering_skill_md = read_text_optional(FEATURE_ENGINEERING_DIR / "SKILL.md")
+        feature_implementation_skill_md = read_text_optional(FEATURE_IMPLEMENTATION_DIR / "SKILL.md")
+        allowed_metric_suffixes = build_allowed_metric_suffixes(fields_df, max_suffixes=300)
+        allowed_operators = []
+        if not args.no_operators_in_prompt:
+            try:
+                operators_df = ace_lib.get_operators(session)
+                keep_vector = _vector_ratio_from_datafields_df(fields_df) > 0.5
+                _, allowed_ops, _ = filter_operators_df(operators_df, keep_vector=keep_vector)
+                if args.max_operators is not None and args.max_operators > 0:
+                    allowed_operators = allowed_ops[: args.max_operators]
+                else:
+                    allowed_operators = allowed_ops
+            except Exception as exc:
+                print(f"Warning: failed to fetch/filter operators; continuing without operators in prompt. Error: {exc}", file=sys.stderr)
+        system_prompt, user_prompt = build_prompt(
+            dataset_id=args.dataset_id,
+            dataset_name=dataset_name,
+            dataset_description=dataset_description,
+            data_category=args.data_category,
+            region=args.region,
+            delay=args.delay,
+            universe=args.universe,
+            data_type=args.data_type,
+            fields_summary=fields_summary,
+            field_count=field_count,
+            feature_engineering_skill_md=feature_engineering_skill_md,
+            feature_implementation_skill_md=feature_implementation_skill_md,
+            allowed_metric_suffixes=allowed_metric_suffixes,
+            allowed_operators=allowed_operators,
+        )
+        api_key = (
+            args.moonshot_api_key
+            or os.environ.get("MOONSHOT_API_KEY")
+        )
+        if not api_key:
+            raise ValueError("Moonshot API key missing. Set MOONSHOT_API_KEY or pass --moonshot-api-key")
+        report = call_moonshot(api_key, args.moonshot_model, system_prompt, user_prompt)
+        # Save first, then normalize placeholders after dataset download.
+        ideas_path = save_ideas_report(report, args.region, args.delay, args.dataset_id)
     ideas_text = ideas_path.read_text(encoding="utf-8")
@@ -891,11 +933,19 @@ def main():
             args.universe,
             "--instrument-type",
             args.instrument_type,
+            "--data-type",
+            args.data_type,
         ],
         cwd=FEATURE_IMPLEMENTATION_SCRIPTS,
     )
     dataset_folder = f"{safe_dataset_id(dataset_id)}_{args.region}_delay{args.delay}"
+    # If the ideas file references a different dataset id than the CLI args,
+    # ensure we also clean that dataset folder before fetching.
+    if dataset_folder != guessed_dataset_folder:
+        delete_path_if_exists(FEATURE_IMPLEMENTATION_DIR / "data" / dataset_folder)
     dataset_csv_path = FEATURE_IMPLEMENTATION_DIR / "data" / dataset_folder / f"{dataset_folder}.csv"
     if not dataset_csv_path.exists():
         raise RuntimeError(

cnhkmcp/untracked/APP/trailSomeAlphas/skills/brain-data-feature-engineering/SKILL.md CHANGED Viewed

@@ -180,6 +180,7 @@ For each relevant question-field combination:
     ## Feature Concepts by Question Type
     ### Q1: "What is stable?" (Invariance Features)
     **Concept**: {stability_feature_1_name}
@@ -187,6 +188,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_1}
     - **Why This Feature**: {why_1}
     - **Logical Meaning**: {logical_meaning_1}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_1}
     - **Boundary Conditions**: {boundaries_1}
     - **Implementation Example**: `{implementation_1}`
@@ -196,6 +198,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_2}
     - **Why This Feature**: {why_2}
     - **Logical Meaning**: {logical_meaning_2}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_2}
     - **Boundary Conditions**: {boundaries_2}
     - **Implementation Example**: `{implementation_2}`
@@ -209,6 +212,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_3}
     - **Why This Feature**: {why_3}
     - **Logical Meaning**: {logical_meaning_3}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_3}
     - **Boundary Conditions**: {boundaries_3}
     - **Implementation Example**: `{implementation_3}`
@@ -218,6 +222,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_4}
     - **Why This Feature**: {why_4}
     - **Logical Meaning**: {logical_meaning_4}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_4}
     - **Boundary Conditions**: {boundaries_4}
     - **Implementation Example**: `{implementation_4}`
@@ -231,6 +236,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_5}
     - **Why This Feature**: {why_5}
     - **Logical Meaning**: {logical_meaning_5}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_5}
     - **Boundary Conditions**: {boundaries_5}
     - **Implementation Example**: `{implementation_5}`
@@ -240,6 +246,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_6}
     - **Why This Feature**: {why_6}
     - **Logical Meaning**: {logical_meaning_6}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_6}
     - **Boundary Conditions**: {boundaries_6}
     - **Implementation Example**: `{implementation_6}`
@@ -253,6 +260,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_7}
     - **Why This Feature**: {why_7}
     - **Logical Meaning**: {logical_meaning_7}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_7}
     - **Boundary Conditions**: {boundaries_7}
     - **Implementation Example**: `{implementation_7}`
@@ -262,6 +270,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_8}
     - **Why This Feature**: {why_8}
     - **Logical Meaning**: {logical_meaning_8}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_8}
     - **Boundary Conditions**: {boundaries_8}
     - **Implementation Example**: `{implementation_8}`
@@ -275,6 +284,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_9}
     - **Why This Feature**: {why_9}
     - **Logical Meaning**: {logical_meaning_9}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_9}
     - **Boundary Conditions**: {boundaries_9}
     - **Implementation Example**: `{implementation_9}`
@@ -284,6 +294,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_10}
     - **Why This Feature**: {why_10}
     - **Logical Meaning**: {logical_meaning_10}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_10}
     - **Boundary Conditions**: {boundaries_10}
     - **Implementation Example**: `{implementation_10}`
@@ -297,6 +308,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_11}
     - **Why This Feature**: {why_11}
     - **Logical Meaning**: {logical_meaning_11}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_11}
     - **Boundary Conditions**: {boundaries_11}
     - **Implementation Example**: `{implementation_11}`
@@ -306,6 +318,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_12}
     - **Why This Feature**: {why_12}
     - **Logical Meaning**: {logical_meaning_12}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_12}
     - **Boundary Conditions**: {boundaries_12}
     - **Implementation Example**: `{implementation_12}`
@@ -319,6 +332,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_13}
     - **Why This Feature**: {why_13}
     - **Logical Meaning**: {logical_meaning_13}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_13}
     - **Boundary Conditions**: {boundaries_13}
     - **Implementation Example**: `{implementation_13}`
@@ -328,6 +342,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_14}
     - **Why This Feature**: {why_14}
     - **Logical Meaning**: {logical_meaning_14}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_14}
     - **Boundary Conditions**: {boundaries_14}
     - **Implementation Example**: `{implementation_14}`
@@ -341,6 +356,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_15}
     - **Why This Feature**: {why_15}
     - **Logical Meaning**: {logical_meaning_15}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_15}
     - **Boundary Conditions**: {boundaries_15}
     - **Implementation Example**: `{implementation_15}`
@@ -350,6 +366,7 @@ For each relevant question-field combination:
     - **Definition**: {definition_16}
     - **Why This Feature**: {why_16}
     - **Logical Meaning**: {logical_meaning_16}
+    - **is filling nan necessary**: we have some operators to fill nan value like ts_backfill() or group_mean() etc. however, in some cases, if the nan value itself has some meaning, then we should not fill it blindly since it may introduce some bias. so before filling nan value, we should think about whether the nan value has some meaning in the specific scenario.
     - **Directionality**: {directionality_16}
     - **Boundary Conditions**: {boundaries_16}
     - **Implementation Example**: `{implementation_16}`

cnhkmcp 2.3.0__py3-none-any.whl → 2.3.2__py3-none-any.whl

cnhkmcp 2.3.0__py3-none-any.whl → 2.3.2__py3-none-any.whl