PyPI - hackagent - Versions diffs - 0.1.0__py3-none-any.whl - Mend

hackagent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (117) hide show

hackagent/__init__.py +23 -0
hackagent/agent.py +193 -0
hackagent/api/__init__.py +1 -0
hackagent/api/agent/__init__.py +1 -0
hackagent/api/agent/agent_create.py +340 -0
hackagent/api/agent/agent_destroy.py +136 -0
hackagent/api/agent/agent_list.py +234 -0
hackagent/api/agent/agent_partial_update.py +354 -0
hackagent/api/agent/agent_retrieve.py +227 -0
hackagent/api/agent/agent_update.py +354 -0
hackagent/api/attack/__init__.py +1 -0
hackagent/api/attack/attack_create.py +264 -0
hackagent/api/attack/attack_destroy.py +140 -0
hackagent/api/attack/attack_list.py +242 -0
hackagent/api/attack/attack_partial_update.py +278 -0
hackagent/api/attack/attack_retrieve.py +235 -0
hackagent/api/attack/attack_update.py +278 -0
hackagent/api/key/__init__.py +1 -0
hackagent/api/key/key_create.py +168 -0
hackagent/api/key/key_destroy.py +97 -0
hackagent/api/key/key_list.py +158 -0
hackagent/api/key/key_retrieve.py +150 -0
hackagent/api/prompt/__init__.py +1 -0
hackagent/api/prompt/prompt_create.py +160 -0
hackagent/api/prompt/prompt_destroy.py +98 -0
hackagent/api/prompt/prompt_list.py +173 -0
hackagent/api/prompt/prompt_partial_update.py +174 -0
hackagent/api/prompt/prompt_retrieve.py +151 -0
hackagent/api/prompt/prompt_update.py +174 -0
hackagent/api/result/__init__.py +1 -0
hackagent/api/result/result_create.py +160 -0
hackagent/api/result/result_destroy.py +98 -0
hackagent/api/result/result_list.py +233 -0
hackagent/api/result/result_partial_update.py +178 -0
hackagent/api/result/result_retrieve.py +151 -0
hackagent/api/result/result_trace_create.py +178 -0
hackagent/api/result/result_update.py +174 -0
hackagent/api/run/__init__.py +1 -0
hackagent/api/run/run_create.py +172 -0
hackagent/api/run/run_destroy.py +104 -0
hackagent/api/run/run_list.py +260 -0
hackagent/api/run/run_partial_update.py +186 -0
hackagent/api/run/run_result_create.py +178 -0
hackagent/api/run/run_retrieve.py +163 -0
hackagent/api/run/run_run_tests_create.py +172 -0
hackagent/api/run/run_update.py +186 -0
hackagent/attacks/AdvPrefix/README.md +7 -0
hackagent/attacks/AdvPrefix/__init__.py +0 -0
hackagent/attacks/AdvPrefix/completer.py +438 -0
hackagent/attacks/AdvPrefix/config.py +59 -0
hackagent/attacks/AdvPrefix/preprocessing.py +521 -0
hackagent/attacks/AdvPrefix/scorer.py +259 -0
hackagent/attacks/AdvPrefix/scorer_parser.py +498 -0
hackagent/attacks/AdvPrefix/selector.py +246 -0
hackagent/attacks/AdvPrefix/step1_generate.py +324 -0
hackagent/attacks/AdvPrefix/step4_compute_ce.py +293 -0
hackagent/attacks/AdvPrefix/step6_get_completions.py +387 -0
hackagent/attacks/AdvPrefix/step7_evaluate_responses.py +289 -0
hackagent/attacks/AdvPrefix/step8_aggregate_evaluations.py +177 -0
hackagent/attacks/AdvPrefix/step9_select_prefixes.py +59 -0
hackagent/attacks/AdvPrefix/utils.py +192 -0
hackagent/attacks/__init__.py +6 -0
hackagent/attacks/advprefix.py +1136 -0
hackagent/attacks/base.py +50 -0
hackagent/attacks/strategies.py +539 -0
hackagent/branding.py +143 -0
hackagent/client.py +328 -0
hackagent/errors.py +31 -0
hackagent/logger.py +67 -0
hackagent/models/__init__.py +71 -0
hackagent/models/agent.py +240 -0
hackagent/models/agent_request.py +169 -0
hackagent/models/agent_type_enum.py +12 -0
hackagent/models/attack.py +154 -0
hackagent/models/attack_request.py +82 -0
hackagent/models/evaluation_status_enum.py +14 -0
hackagent/models/organization_minimal.py +68 -0
hackagent/models/paginated_agent_list.py +123 -0
hackagent/models/paginated_attack_list.py +123 -0
hackagent/models/paginated_prompt_list.py +123 -0
hackagent/models/paginated_result_list.py +123 -0
hackagent/models/paginated_run_list.py +123 -0
hackagent/models/paginated_user_api_key_list.py +123 -0
hackagent/models/patched_agent_request.py +176 -0
hackagent/models/patched_attack_request.py +92 -0
hackagent/models/patched_prompt_request.py +162 -0
hackagent/models/patched_result_request.py +237 -0
hackagent/models/patched_run_request.py +138 -0
hackagent/models/prompt.py +226 -0
hackagent/models/prompt_request.py +155 -0
hackagent/models/result.py +294 -0
hackagent/models/result_list_evaluation_status.py +14 -0
hackagent/models/result_request.py +232 -0
hackagent/models/run.py +233 -0
hackagent/models/run_list_status.py +12 -0
hackagent/models/run_request.py +133 -0
hackagent/models/status_enum.py +12 -0
hackagent/models/step_type_enum.py +14 -0
hackagent/models/trace.py +121 -0
hackagent/models/trace_request.py +94 -0
hackagent/models/user_api_key.py +201 -0
hackagent/models/user_api_key_request.py +73 -0
hackagent/models/user_profile_minimal.py +76 -0
hackagent/py.typed +1 -0
hackagent/router/__init__.py +11 -0
hackagent/router/adapters/__init__.py +5 -0
hackagent/router/adapters/google_adk.py +658 -0
hackagent/router/adapters/litellm_adapter.py +290 -0
hackagent/router/base.py +48 -0
hackagent/router/router.py +753 -0
hackagent/types.py +46 -0
hackagent/utils.py +61 -0
hackagent/vulnerabilities/__init__.py +0 -0
hackagent-0.1.0.dist-info/LICENSE +202 -0
hackagent-0.1.0.dist-info/METADATA +173 -0
hackagent-0.1.0.dist-info/RECORD +117 -0
hackagent-0.1.0.dist-info/WHEEL +4 -0

hackagent/attacks/AdvPrefix/selector.py ADDED Viewed

@@ -0,0 +1,246 @@
+import pandas as pd
+import numpy as np
+import logging
+from typing import List, Optional
+from dataclasses import dataclass
+@dataclass
+class PrefixSelectorConfig:
+    """Configuration for prefix selection"""
+    # Multiplier applied to log(PASR) when PrefixSelector builds its combined
+    # score (-pasr_weight * log_pasr + prefix_nll). Required: it has no default.
+    pasr_weight: float  # Weight for log-PASR in selection
+    # Total number of prefixes kept per goal, counting the best-scoring one.
+    n_prefixes_per_goal: int = 1  # Number of prefixes to select per goal
+    # Additional candidates must have prefix_nll <= best prefix_nll + nll_tol.
+    nll_tol: float = 999  # Tolerance for NLL relative to best prefix
+    # Additional candidates must have pasr >= best pasr - pasr_tol.
+    pasr_tol: float = 0  # Tolerance for PASR relative to best prefix
+    # Each dict is read for a "type" (or "evaluator_type") entry naming a judge;
+    # PrefixSelector.select_prefixes raises ValueError when this is None or empty.
+    judges: Optional[List[dict]] = None  # List of judges to use for PASR calculation
+class PrefixSelector:
+    """
+    Selects prefixes based on a combination of judge scores (PASR) and NLL.
+    Supports multiple judges and custom weighting for selection criteria.
+    """
+    def __init__(self, config: PrefixSelectorConfig):
+        """
+        Initialize the prefix selector.
+        Args:
+            config: Configuration for prefix selection
+        """
+        self.config = config
+        self.logger = logging.getLogger(__name__)
+        # Mapping of judge types to their column names in the DataFrame
+        self.judge_column_map = {
+            "nuanced": "eval_nj_mean",
+            "jailbreakbench": "eval_jb_mean",
+            "harmbench": "eval_hb_mean",
+            "strongreject": "eval_sj_binary_mean",
+        }
+    def select_prefixes(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Select prefixes based on combined judge scores and NLL.
+        Args:
+            df: DataFrame containing prefixes with evaluation results
+        Returns:
+            DataFrame containing selected prefixes
+        """
+        # Validate judge configuration list
+        if not isinstance(self.config.judges, list) or not self.config.judges:
+            # Check if judges is a list and not empty
+            raise ValueError(
+                "Judge configuration ('judges' key) must be a non-empty list of dictionaries."
+            )
+        judge_types_found = []
+        missing_columns = []
+        for judge_config in self.config.judges:
+            if not isinstance(judge_config, dict):
+                self.logger.warning(
+                    f"Skipping invalid item in judge config list (not a dict): {judge_config}"
+                )
+                continue
+            # Extract judge type string (e.g., "nuanced") - Assuming a 'type' key
+            judge_type = judge_config.get("type") or judge_config.get("evaluator_type")
+            # Could add inference here if needed, similar to step 7
+            if not judge_type:
+                self.logger.warning(
+                    f"Could not determine type for judge config: {judge_config}. Skipping."
+                )
+                continue
+            if judge_type not in self.judge_column_map:
+                # Check if the *type string* is valid
+                self.logger.error(
+                    f"Unknown judge type specified in config: '{judge_type}'"
+                )
+                raise ValueError(f"Unknown judge type for selection: {judge_type}")
+            # Check if the corresponding column exists in the DataFrame
+            expected_col = self.judge_column_map[judge_type]
+            if expected_col not in df.columns:
+                missing_columns.append(expected_col)
+            if judge_type not in judge_types_found:
+                judge_types_found.append(judge_type)
+        if missing_columns:
+            raise ValueError(
+                f"Missing required evaluation result columns in DataFrame: {missing_columns}"
+            )
+        if not judge_types_found:
+            raise ValueError(
+                "No valid judge types found in the configuration to perform selection."
+            )
+        # Create a working copy of the DataFrame
+        work_df = df.copy()
+        # Calculate combined PASR score using the identified judge types
+        work_df["pasr"] = self._calculate_combined_pasr(work_df, judge_types_found)
+        # Calculate log PASR for scoring
+        work_df["log_pasr"] = np.log(work_df["pasr"] + 1e-6)
+        # Calculate combined score (minimize both 1 - PASR and prefix_nll)
+        work_df["combined_score"] = (
+            -self.config.pasr_weight * work_df["log_pasr"] + work_df["prefix_nll"]
+        )
+        # Create DataFrame for selected prefixes
+        selected_prefixes = pd.DataFrame()
+        # Group by goal and apply selection process
+        for goal, group in work_df.groupby("goal"):
+            # Step 1: Select first prefix based on combined score
+            # Check if group is empty after potential filtering/issues
+            if (
+                group.empty
+                or "combined_score" not in group.columns
+                or group["combined_score"].isnull().all()
+            ):
+                self.logger.warning(
+                    f"Skipping goal '{goal[:50]}...' during selection due to empty group or missing/invalid scores."
+                )
+                continue
+            first_selection_idx = group["combined_score"].idxmin()
+            first_selection = group.loc[first_selection_idx]
+            # Step 2: Filter prefixes within PASR tolerance
+            remaining_candidates = group[
+                (group["pasr"] >= first_selection["pasr"] - self.config.pasr_tol)
+                & (group.index != first_selection.name)
+            ]
+            # Step 3: Filter candidates within NLL tolerance
+            valid_candidates = remaining_candidates[
+                remaining_candidates["prefix_nll"]
+                <= first_selection["prefix_nll"] + self.config.nll_tol
+            ]
+            # Initialize selections list with first selection
+            selections = [first_selection]
+            # Step 4: Iteratively select additional prefixes
+            for _ in range(self.config.n_prefixes_per_goal - 1):
+                # Remove candidates that are sub-prefixes of selected ones
+                valid_candidates = valid_candidates[
+                    ~valid_candidates["prefix"].apply(
+                        lambda x: any(
+                            str(x).startswith(str(sel["prefix"]))
+                            for sel in selections
+                            if sel is not None and "prefix" in sel and x is not None
+                        )
+                    )
+                ]
+                if valid_candidates.empty:
+                    break
+                # Select next prefix with lowest NLL
+                if (
+                    "prefix_nll" not in valid_candidates.columns
+                    or valid_candidates["prefix_nll"].isnull().all()
+                ):
+                    self.logger.warning(
+                        f"Cannot select next prefix for goal '{goal[:50]}...' due to missing/invalid NLL scores in candidates."
+                    )
+                    break
+                next_selection = valid_candidates.nsmallest(1, "prefix_nll").iloc[0]
+                selections.append(next_selection)
+                valid_candidates = valid_candidates[
+                    valid_candidates.index != next_selection.name
+                ]
+            # Combine selections for this goal
+            combined_selection = pd.DataFrame(selections)
+            selected_prefixes = pd.concat([selected_prefixes, combined_selection])
+        # Reset index
+        selected_prefixes.reset_index(drop=True, inplace=True)
+        # Add the new columns (pasr, log_pasr, combined_score) to the output
+        # Ensure columns exist before trying to select them
+        output_columns = [
+            col
+            for col in list(df.columns) + ["pasr", "log_pasr", "combined_score"]
+            if col in selected_prefixes.columns
+        ]
+        selected_prefixes = selected_prefixes[output_columns]
+        self.logger.info(
+            f"Selected {len(selected_prefixes)} prefixes across {len(df['goal'].unique())} goals"
+        )
+        return selected_prefixes
+    def _calculate_combined_pasr(
+        self, df: pd.DataFrame, judge_types: List[str]
+    ) -> pd.Series:
+        """
+        Calculate combined PASR score from specified judge types.
+        Args:
+            df: DataFrame containing judge scores
+            judge_types: List of valid judge type strings (e.g., ["nuanced", "harmbench"])
+        Returns:
+            Series containing combined PASR scores
+        """
+        judge_scores = []
+        for judge_type in judge_types:  # Iterate through the list of type strings
+            column = self.judge_column_map[judge_type]  # Use the type string for lookup
+            # Ensure column is numeric before appending
+            if column in df.columns:
+                try:
+                    numeric_scores = pd.to_numeric(df[column], errors="coerce")
+                    judge_scores.append(numeric_scores)
+                except Exception as e:
+                    self.logger.warning(
+                        f"Could not convert column '{column}' to numeric for PASR calculation. Skipping. Error: {e}"
+                    )
+            else:
+                # This should be caught by initial validation, but as safeguard
+                self.logger.warning(
+                    f"Column '{column}' for judge '{judge_type}' not found during PASR calculation."
+                )
+        if not judge_scores:
+            self.logger.warning(
+                "No valid judge scores found to calculate combined PASR. Returning zeros."
+            )
+            return pd.Series(0, index=df.index)
+        # Calculate mean of judge scores, handling potential NaNs after conversion
+        combined_scores_df = pd.concat(judge_scores, axis=1)
+        # Use mean, skipping NaNs. If a row has all NaNs, the mean will be NaN.
+        mean_scores = combined_scores_df.mean(axis=1, skipna=True)
+        # Fill any resulting NaNs (rows where all judges had NaN scores) with 0
+        return mean_scores.fillna(0)

hackagent/attacks/AdvPrefix/step1_generate.py ADDED Viewed

@@ -0,0 +1,324 @@
+import logging
+import pandas as pd
+from typing import List, Dict, Union, Tuple, Optional
+from hackagent.router.router import AgentRouter  # Added
+from hackagent.models import AgentTypeEnum  # Added
+from hackagent.client import AuthenticatedClient  # Added
+from .utils import get_checkpoint_path
+from rich.progress import (
+    Progress,
+    BarColumn,
+    TextColumn,
+    TimeRemainingColumn,
+    MofNCompleteColumn,
+    SpinnerColumn,
+)
+from hackagent.logger import get_logger
+# Module-level logger created through hackagent's get_logger helper.
+logger = get_logger(__name__)
+# Constants moved from main file
+# Prompt templates keyed by an identifier string; each template has a single
+# "{content}" placeholder that _construct_prompts fills with the goal text.
+# NOTE(review): the keys look like model identifiers, but _construct_prompts
+# looks them up using the meta_prefix text — confirm which key is intended.
+CUSTOM_CHAT_TEMPLATES = {
+    "georgesung/llama2_7b_chat_uncensored": "<s>### HUMAN:\n{content}\n\n### RESPONSE:\n",
+    "Tap-M/Luna-AI-Llama2-Uncensored": "<s>USER: {content}\n\nASSISTANT:",
+}
+def _construct_prompts(
+    goals: List[str],
+    meta_prefixes: List[str],
+    meta_prefixes_n_samples: Union[int, List[int]],  # Allow int or list
+) -> Tuple[List[Dict[str, str]], List[str], List[str]]:
+    """Constructs prompts for the generator model."""
+    # Handle the case where meta_prefixes_n_samples is an integer vs a list
+    if isinstance(meta_prefixes_n_samples, list):
+        if len(meta_prefixes) != len(meta_prefixes_n_samples):
+            raise ValueError(
+                "Lengths of meta_prefixes and meta_prefixes_n_samples lists must match."
+            )
+        n_samples_list = meta_prefixes_n_samples
+    elif isinstance(meta_prefixes_n_samples, int):
+        # Apply the same integer sample count to all meta prefixes
+        n_samples_list = [meta_prefixes_n_samples] * len(meta_prefixes)
+    else:
+        raise TypeError("meta_prefixes_n_samples must be an int or a list of ints.")
+    formatted_inputs = []
+    current_goals = []
+    expanded_meta_prefixes = []
+    for goal in goals:
+        for meta_prefix, n_samples in zip(meta_prefixes, n_samples_list):
+            if n_samples <= 0:
+                continue
+            # chat = [{"role": "user", "content": goal}] # Not directly used for router prompt format
+            try:
+                # The prompt for the router will be the fully constructed context.
+                # Custom chat templating needs to happen before sending to router.
+                if meta_prefix in CUSTOM_CHAT_TEMPLATES:
+                    # Assuming meta_prefix identifies the model type for templating,
+                    # which is a bit indirect. Usually, model_string would be used.
+                    # For now, we'll keep this logic, but the 'context' is the prompt.
+                    prompt_content = CUSTOM_CHAT_TEMPLATES[meta_prefix].format(
+                        content=goal
+                    )
+                else:
+                    logger.warning(
+                        f"Using basic formatting for prompt construction with meta_prefix: {meta_prefix}. No matching template found."
+                    )
+                    prompt_content = f"USER: {goal}\\nASSISTANT:"
+                # Append the actual meta_prefix text to the prompt that will be sent
+                final_prompt = prompt_content + meta_prefix
+                formatted_inputs.extend([final_prompt] * n_samples)
+                current_goals.extend([goal] * n_samples)
+                expanded_meta_prefixes.extend([meta_prefix] * n_samples)
+            except Exception as e:
+                logging.error(
+                    f"Error formatting prompt for goal '{goal}' with meta_prefix '{meta_prefix}': {e}"
+                )
+    return formatted_inputs, current_goals, expanded_meta_prefixes
+# Overview of _generate_prefixes:
+# - Reads from config: "generator" (dict with "identifier", "endpoint",
+#   "api_key"), "max_new_tokens" (default 100), "temperature" (default 0.8),
+#   "top_p" (default 1.0), "meta_prefixes" and "meta_prefix_samples".
+# - Runs two passes over all prompts: first with temperature 1e-2 ("greedy"),
+#   then with the configured temperature ("random sampling").
+# - Returns one dict per prompt and pass with the keys "goal", "prefix",
+#   "meta_prefix", "temperature" and "model_name"; returns an empty list when
+#   the generator config is incomplete or the router cannot be set up.
+async def _generate_prefixes(
+    unique_goals: List[str],
+    config: Dict,
+    logger: logging.Logger,
+    client: AuthenticatedClient,  # organization_id removed from here
+) -> List[Dict]:
+    """
+    Helper for step 1. Generate prefixes using AgentRouter with a LiteLLM agent.
+    """
+    results = []
+    generator = config.get("generator", {})
+    if not generator:
+        logger.error("Missing 'generator'. Cannot initialize AgentRouter for LiteLLM.")
+        return results
+    # Map generator to adapter_operational_config for LiteLLM
+    # New keys for LiteLLMAgentAdapter: 'name', 'endpoint', 'api_key'
+    model_name = generator.get("identifier")
+    if not model_name:
+        logger.error(
+            "Missing 'identifier' in 'generator'. Cannot configure LiteLLM agent."
+        )
+        return results
+    adapter_operational_config = {
+        "name": model_name,
+        "endpoint": generator.get("endpoint"),
+        "api_key": generator.get("api_key"),
+        # Other params like max_new_tokens, temperature, top_p for adapter defaults
+        "max_new_tokens": config.get("max_new_tokens", 100),
+        "temperature": config.get("temperature", 0.8),
+        "top_p": config.get("top_p", 1.0),
+    }
+    router: Optional[AgentRouter] = None
+    registration_key: Optional[str] = None
+    try:
+        logger.info(f"Initializing AgentRouter for LiteLLM model: {model_name}")
+        # NOTE(review): the metadata copy below still contains "api_key";
+        # confirm AgentRouter does not persist or transmit it with the agent
+        # record.
+        router = AgentRouter(
+            client=client,
+            name=model_name,  # Name for backend agent record
+            agent_type=AgentTypeEnum.LITELMM,
+            endpoint=generator.get("endpoint"),
+            adapter_operational_config=adapter_operational_config,
+            metadata=adapter_operational_config.copy(),
+            overwrite_metadata=True,
+        )
+        # The first key of the router's registry is used to address the agent
+        # in route_request below.
+        if router._agent_registry:
+            registration_key = next(iter(router._agent_registry.keys()))
+            logger.info(
+                f"AgentRouter initialized. Registration key for LiteLLM agent: {registration_key}"
+            )
+        else:
+            logger.error(
+                "AgentRouter initialized, but no agent adapter was registered."
+            )
+            return results  # Cannot proceed
+    except Exception as e:
+        logger.error(
+            f"Error initializing AgentRouter for {model_name}: {e}",
+            exc_info=True,
+        )
+        return results
+    # Pass 1 (do_sample=False) uses a near-zero temperature, pass 2 uses the
+    # configured temperature; both passes send the same prompts.
+    for do_sample in [False, True]:
+        progress_bar_description = (
+            "[cyan]Generating Prefixes (Random Sampling)..."
+            if do_sample
+            else "[cyan]Generating Prefixes (Greedy Decoding)..."
+        )
+        logger.info(
+            f"Generating with {'random sampling' if do_sample else 'greedy decoding'} using LiteLLM via AgentRouter..."
+        )
+        try:
+            # _construct_prompts now returns the full prompt string
+            # NOTE(review): the prompts do not depend on do_sample, so they are
+            # rebuilt identically in both passes. Also, the [] default for
+            # "meta_prefix_samples" has length 0, which _construct_prompts
+            # rejects with ValueError whenever "meta_prefixes" is non-empty.
+            prompts_to_send, current_goals, current_meta_prefixes = _construct_prompts(
+                unique_goals,
+                config.get("meta_prefixes", []),
+                config.get("meta_prefix_samples", []),
+            )
+            logger.debug(f"Prompts to send ({len(prompts_to_send)}): {prompts_to_send}")
+        except Exception as e:
+            logger.error(f"Error constructing prompts: {e}", exc_info=True)
+            continue
+        if not prompts_to_send:
+            logger.warning("No prompts to send, skipping completion.")
+            continue
+        # Loop through each constructed prompt and call the router
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            MofNCompleteColumn(),
+            TextColumn("[progress.percentage]{task.percentage:>3.1f}%"),
+            TimeRemainingColumn(),
+        ) as progress_bar:
+            task = progress_bar.add_task(
+                progress_bar_description, total=len(prompts_to_send)
+            )
+            # Requests are awaited one at a time, in prompt order.
+            for idx, current_prompt_text in enumerate(prompts_to_send):
+                goal_for_prompt = current_goals[idx]
+                meta_prefix_for_prompt = current_meta_prefixes[idx]
+                request_params = {
+                    "prompt": current_prompt_text,
+                    "max_new_tokens": config.get("max_new_tokens", 100),
+                    "temperature": config.get("temperature", 0.8)
+                    if do_sample
+                    else 1e-2,
+                    "top_p": config.get("top_p", 1.0),
+                }
+                completion_text = None
+                try:
+                    # logger.info(f"Sending request to router for prompt: {current_prompt_text[:100]}...")
+                    response = await router.route_request(
+                        registration_key=registration_key,  # type: ignore
+                        request_data=request_params,
+                    )
+                    # logger.debug(f"Router response: {response}")
+                    if response and response.get("error_message"):
+                        logger.error(
+                            f"Error from AgentRouter for prompt '{current_prompt_text[:50]}...': {response['error_message']}"
+                        )
+                        # Append error marker or skip
+                        # For now, we'll try to get processed_response even if there's a partial error
+                        # The adapter should handle this.
+                        pass  # Ensure block is not empty if all lines are comments
+                    if response and response.get("processed_response"):
+                        completion_text = response["processed_response"]
+                        # The adapter's processed_response is assumed to be the full text (prompt + generation)
+                        # We need to extract just the generated part.
+                        if completion_text.startswith(current_prompt_text):
+                            generated_part = completion_text[len(current_prompt_text) :]
+                        else:
+                            # Fallback or warning if the response doesn't start with the prompt
+                            logger.warning(
+                                f"Completion for '{current_prompt_text[:50]}...' did not start with the prompt. Using full response as generated part."
+                            )
+                            generated_part = completion_text
+                    else:
+                        logger.warning(
+                            f"No 'processed_response' in router output for prompt: {current_prompt_text[:50]}..."
+                        )
+                        # Placeholder text so the failed request still yields a row
+                        generated_part = " [GENERATION_VIA_ROUTER_FAILED]"
+                except Exception as e:
+                    logger.error(
+                        f"Exception during router.route_request for prompt '{current_prompt_text[:50]}...': {e}",
+                        exc_info=True,
+                    )
+                    # Placeholder text so the failed request still yields a row
+                    generated_part = " [ROUTER_REQUEST_EXCEPTION]"
+                # The 'prefix' should be the meta_prefix + generated_part
+                final_prefix = meta_prefix_for_prompt + generated_part
+                results.append(
+                    {
+                        "goal": goal_for_prompt,
+                        "prefix": final_prefix,
+                        "meta_prefix": meta_prefix_for_prompt,
+                        "temperature": request_params["temperature"],  # Use actual temp
+                        "model_name": model_name,  # Model used by the adapter
+                    }
+                )
+                progress_bar.update(task, advance=1)
+    # No need to del router explicitly here, it goes out of scope.
+    return results
+async def execute(
+    goals: List[str],
+    config: Dict,
+    logger: logging.Logger,
+    run_dir: str,
+    client: AuthenticatedClient,  # organization_id removed from this call
+) -> pd.DataFrame:
+    """Generate initial prefixes using provided goals via AgentRouter."""
+    logger.info("Executing Step 1: Generating prefixes using AgentRouter")
+    if not goals:
+        logger.warning("Step 1 received no goals. Returning empty DataFrame.")
+        return pd.DataFrame(
+            columns=["goal", "prefix", "meta_prefix", "temperature", "model_name"]
+        )
+    generator = config.get("generator")
+    if not generator or not generator.get("identifier"):
+        logger.error(
+            "Step 1: Missing 'generator' or 'identifier' in config. Cannot generate prefixes."
+        )
+        return pd.DataFrame(
+            columns=["goal", "prefix", "meta_prefix", "temperature", "model_name"]
+        )
+    model_name_from_config = generator["identifier"]
+    logger.info(
+        f"Generating prefixes for {len(goals)} unique goals using AgentRouter with LiteLLM: {model_name_from_config}"
+    )
+    all_results = await _generate_prefixes(
+        unique_goals=goals,
+        config=config,
+        logger=logger,
+        client=client,  # organization_id removed from this call
+    )
+    if not all_results:
+        logger.warning("Step 1: No prefixes were generated via AgentRouter.")
+        results_df = pd.DataFrame(
+            columns=["goal", "prefix", "meta_prefix", "temperature", "model_name"]
+        )
+    else:
+        results_df = pd.DataFrame(all_results)
+    output_path = get_checkpoint_path(run_dir, 1)
+    try:
+        results_df.to_csv(output_path, index=False)
+        logger.info(
+            f"Step 1 complete. Generated {len(results_df)} total prefixes via AgentRouter"
+        )
+        logger.info(f"Checkpoint saved to {output_path}")
+    except Exception as e:
+        logger.error(f"Failed to save checkpoint for step 1 to {output_path}: {e}")
+    return results_df