ai-nk-cce 0.1.0 (ai_nk_cce-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. ai_nk_cce-0.1.0.dist-info/METADATA +118 -0
  2. ai_nk_cce-0.1.0.dist-info/RECORD +46 -0
  3. ai_nk_cce-0.1.0.dist-info/WHEEL +4 -0
  4. api/__init__.py +0 -0
  5. api/mpcdf_vllm.py +94 -0
  6. evals/nk_model.py +277 -0
  7. model/README.md +64 -0
  8. model/config/dataset_conv_v1.yml +9 -0
  9. model/config/dataset_conv_v2_m2.yml +9 -0
  10. model/config/dataset_conv_v3_m2_assembl_nearest.yml +9 -0
  11. model/config/dataset_debug.yml +9 -0
  12. model/config/dataset_v4_int_format.yml +9 -0
  13. model/config/dataset_v5.yml +9 -0
  14. model/config/inference.yml +7 -0
  15. model/config/train.yml +24 -0
  16. model/config/train_debug.yml +19 -0
  17. model/config/train_from_checkpoint.yml +24 -0
  18. model/config/train_from_checkpoint_debug.yml +19 -0
  19. model/config/train_grpo.yml +30 -0
  20. model/config/train_grpo_debug.yml +30 -0
  21. model/config/train_grpo_debug_vllm.yml +32 -0
  22. model/config.py +54 -0
  23. model/dataset.py +324 -0
  24. model/inference.py +51 -0
  25. model/nk_assistant.py +207 -0
  26. model/parser.py +70 -0
  27. model/run_slurm.py +335 -0
  28. model/score.ipynb +596 -0
  29. model/scripts/template.slurm +54 -0
  30. model/scripts/template_rl.slurm +54 -0
  31. model/train.py +293 -0
  32. nk_model/__init__.py +0 -0
  33. nk_model/assembler.py +112 -0
  34. nk_model/biased_prediction_agent.py +389 -0
  35. nk_model/dataset.py +434 -0
  36. nk_model/enums.py +21 -0
  37. nk_model/landscape_cache.py +149 -0
  38. nk_model/models.py +172 -0
  39. nk_model/nk_landscape.py +498 -0
  40. simulation/hill_climber_simulation.py +211 -0
  41. simulation/hill_climber_vs_ai_simulation.py +132 -0
  42. simulation/landscape_selection.py +179 -0
  43. utils/__init__.py +0 -0
  44. utils/binary_conversion.py +128 -0
  45. utils/logging.py +33 -0
  46. utils/utils.py +51 -0
nk_model/dataset.py ADDED
@@ -0,0 +1,434 @@
+ """Dataset generation utilities for NK landscape datasets.
+
+ This module provides functions to generate, validate, save, and load
+ NK landscape datasets for machine learning purposes.
+ """
+
+ import logging
+ import os
+ from datetime import datetime
+ from typing import Optional, Tuple
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ from src.nk_model.enums import ConvolutionMethod, NeighborhoodMethod
+ from src.nk_model.models import NKParams
+ from src.nk_model.nk_landscape import NKLandscape
+
+ logger = logging.getLogger(__name__)
+
+
+ def generate_landscape_dataset(  # noqa: C901
+     n_values: list[int],
+     k_values: list[int],
+     m_values: list[int],
+     power_scales: list[float],
+     landscapes_per_combo: int,
+     value_format: str = "1000.",
+     neighborhood_method: NeighborhoodMethod = NeighborhoodMethod.NEAREST,
+     convolution_method: ConvolutionMethod = ConvolutionMethod.SYMMETRIC,
+     use_notebook_tqdm: bool = False,
+ ) -> Tuple[pd.DataFrame, int]:
+     """Generate a dataset of NK landscapes with varying parameters.
+
+     Creates landscapes for all combinations of the provided parameters
+     and returns a DataFrame with landscape data including coordinates
+     and payoffs.
+
+     Args:
+         n_values: List of N values (number of components).
+         k_values: List of K values (interactions per component).
+         m_values: List of M values (number of convolutions).
+         power_scales: List of power scaling factors.
+         landscapes_per_combo: Number of landscapes to generate per
+             parameter combination.
+         value_format: Format string for max_val (default: "1000.").
+         neighborhood_method: Method for determining neighbors.
+         convolution_method: Method for convolution.
+         use_notebook_tqdm: If True, use tqdm.notebook for progress
+             bars (default: False).
+
+     Returns:
+         Tuple of (DataFrame, int) containing:
+             - DataFrame with columns:
+                 - landscape_uuid: Unique identifier for each landscape
+                 - n, k, m, power_scale: Parameter values
+                 - payoff: Payoff value for the coordinate
+                 - coord_1 through coord_N: Binary coordinate values
+             - Total number of landscapes generated
+
+     Raises:
+         ValueError: If invalid parameter combinations are provided.
+     """
+     if use_notebook_tqdm:
+         from tqdm.notebook import tqdm as notebook_tqdm
+
+         tqdm_func = notebook_tqdm
+     else:
+         tqdm_func = tqdm
+
+     # Calculate total parameter combinations
+     logger.info("Calculating total landscapes to generate...")
+     logger.debug("N values: %s", n_values)
+     logger.debug("K values: %s", k_values)
+     logger.debug("M values: %s", m_values)
+     logger.debug("Power scales: %s", power_scales)
+     logger.debug("Landscapes per combination: %d", landscapes_per_combo)
+
+     total_combos = sum(
+         1
+         for n in n_values
+         for k in k_values
+         for m in m_values
+         if k < n and m <= n
+         for _ in power_scales
+     )
+     total_landscapes = total_combos * landscapes_per_combo
+
+     logger.info(
+         "Will generate %d landscapes across %d parameter combinations",
+         total_landscapes,
+         total_combos,
+     )
+     logger.info(
+         "Landscapes per combination: %d",
+         landscapes_per_combo,
+     )
+
+     data = []
+
+     # Main progress bar for N values
+     for n in tqdm_func(n_values, desc="N values", position=0):
+         # Progress bar for K values
+         for k in tqdm_func(
+             k_values, desc=f"K values (N={n})", position=1, leave=False
+         ):
+             if k >= n:
+                 continue
+
+             # Progress bar for M values
+             for m in tqdm_func(
+                 m_values,
+                 desc=f"M values (N={n}, K={k})",
+                 position=2,
+                 leave=False,
+             ):
+                 if m > n:
+                     continue
+
+                 # Progress bar for power scales
+                 for power_scale in tqdm_func(
+                     power_scales,
+                     desc=f"Power scales (N={n}, K={k}, M={m})",
+                     position=3,
+                     leave=False,
+                 ):
+                     # Progress bar for individual landscapes
+                     for _ in tqdm_func(
+                         range(landscapes_per_combo),
+                         desc=(
+                             f"Landscapes "
+                             f"(N={n},K={k},M={m},P={power_scale})"
+                         ),
+                         position=4,
+                         leave=False,
+                     ):
+                         params = NKParams(
+                             n=n,
+                             k=k,
+                             m=m,
+                             power=power_scale,
+                             max_val=float(value_format),
+                             neighborhood=neighborhood_method,
+                             convolution=convolution_method,
+                             payoff_type="int",
+                         )
+
+                         landscape = NKLandscape(params)
+
+                         # For each item in the landscape
+                         for item in landscape.items:
+                             row = {
+                                 "landscape_uuid": landscape.uuid,
+                                 "n": n,
+                                 "k": k,
+                                 "m": m,
+                                 "power_scale": power_scale,
+                                 "payoff": item.payoff,
+                             }
+                             # Add each coordinate as a separate column
+                             for i, coord in enumerate(item.coordinates):
+                                 row[f"coord_{i+1}"] = int(coord)
+
+                             data.append(row)
+
+     df = pd.DataFrame(data)
+     logger.info(
+         "Generated dataset with %d rows and %d columns",
+         len(df),
+         len(df.columns),
+     )
+     logger.info(
+         "Successfully generated %d landscapes",
+         total_landscapes,
+     )
+     return df, total_landscapes
+
+
+ def validate_dataset(  # noqa: C901
+     df: pd.DataFrame,
+     expected_n: Optional[int] = None,
+     expected_landscapes: Optional[int] = None,
+     expected_k_values: Optional[list[int]] = None,
+ ) -> bool:
+     """Validate a generated NK landscape dataset.
+
+     Performs comprehensive checks on the dataset including:
+     - Row counts per landscape (should be 2^N)
+     - N value consistency
+     - K value distribution
+     - Payoff value ranges
+     - Coordinate validity (binary values)
+
+     Args:
+         df: DataFrame to validate.
+         expected_n: Expected N value (if None, uses unique N from df).
+         expected_landscapes: Expected number of unique landscapes.
+         expected_k_values: Expected K values for distribution check.
+
+     Returns:
+         True if all validations pass, False otherwise.
+
+     Note:
+         All validation issues are logged at appropriate levels.
+     """
+     logger.info("Running dataset validation checks...")
+
+     all_uuids = df["landscape_uuid"].unique()
+     total_uuids = len(all_uuids)
+
+     if expected_n is None:
+         expected_n = df["n"].unique()[0]
+         logger.debug("Inferred expected_n=%d from dataset", expected_n)
+
+     issues = []
+
+     # Check number of unique landscapes
+     if expected_landscapes is not None:
+         actual_landscapes = total_uuids
+         if actual_landscapes == expected_landscapes:
+             logger.info(
+                 "Found expected number of landscapes: %d",
+                 actual_landscapes,
+             )
+         else:
+             msg = (
+                 f"Found {actual_landscapes:,} landscapes, "
+                 f"expected {expected_landscapes:,}"
+             )
+             logger.error(msg)
+             issues.append(msg)
+
+     # Check row counts per landscape (should be 2^N)
+     logger.debug("Checking row counts per landscape...")
+     row_counts = df.groupby("landscape_uuid").size()
+     expected_rows = 2**expected_n
+     incorrect_counts = row_counts[row_counts != expected_rows]
+
+     if len(incorrect_counts) > 0:
+         msg = (
+             f"Found {len(incorrect_counts)} landscapes with "
+             f"wrong row count (expected {expected_rows})"
+         )
+         logger.error(msg)
+         for uuid, count in incorrect_counts.items():
+             logger.error(
+                 "UUID %s: %d rows (expected %d)", uuid, count, expected_rows
+             )
+         issues.append(msg)
+     else:
+         logger.info(
+             "All landscapes have correct number of rows (%d)",
+             expected_rows,
+         )
+
+     # Check N value consistency
+     logger.debug("Checking N value consistency...")
+     if not (df["n"] == expected_n).all():
+         msg = f"Not all N values are {expected_n}"
+         logger.error(msg)
+         logger.error("N value distribution:\n%s", df["n"].value_counts())
+         issues.append(msg)
+     else:
+         logger.info("All N values are %d", expected_n)
+
+     # Check K value distribution
+     if expected_k_values is not None:
+         logger.debug("Checking K value distribution...")
+         k_counts = df.groupby("k")["landscape_uuid"].nunique()
+         expected_per_k = total_uuids / len(expected_k_values)
+
+         logger.info(
+             "K value distribution (expected %.0f per K):",
+             expected_per_k,
+         )
+         for k, count in k_counts.items():
+             logger.info("K=%d: %d landscapes", k, count)
+             if count != expected_per_k:
+                 msg = (
+                     f"K={k} has {count:,} landscapes, "
+                     f"expected {expected_per_k:.0f}"
+                 )
+                 logger.error(msg)
+                 issues.append(msg)
+
+     # Check payoffs
+     logger.debug("Checking payoff values...")
+     payoffs = df["payoff"]
+     if payoffs.min() != 0:
+         msg = "No minimum payoff of 0 found"
+         logger.warning(msg)
+         issues.append(msg)
+
+     # Check max payoff (should match max_val from value_format)
+     max_payoff = payoffs.max()
+     if max_payoff < 0:
+         msg = "Negative payoffs found"
+         logger.error(msg)
+         issues.append(msg)
+     else:
+         logger.info(
+             "Payoff range: [%.2f, %.2f]",
+             payoffs.min(),
+             max_payoff,
+         )
+
+     # Check coordinates
+     logger.debug("Checking coordinate values...")
+     coord_cols = [col for col in df.columns if col.startswith("coord_")]
+     invalid_coords = []
+
+     for col in coord_cols:
+         unique_vals = df[col].unique()
+         if not all(val in [0, 1] for val in unique_vals):
+             msg = f"Column {col} has invalid values: {unique_vals}"
+             logger.error(msg)
+             invalid_coords.append(msg)
+
+     if invalid_coords:
+         issues.extend(invalid_coords)
+     else:
+         logger.info("All coordinates are binary (0s and 1s)")
+
+     # Summary
+     if issues:
+         logger.error("Validation failed with %d issue(s)", len(issues))
+         return False
+     else:
+         logger.info("All validation checks passed")
+         return True
+
+
+ def save_dataset(
+     df: pd.DataFrame,
+     output_dir: str = "data/landscapes",
+     n_values: Optional[list[int]] = None,
+     k_values: Optional[list[int]] = None,
+     m_values: Optional[list[int]] = None,
+     power_scales: Optional[list[float]] = None,
+     value_format: Optional[str] = None,
+     neighborhood_method: Optional[NeighborhoodMethod] = None,
+     convolution_method: Optional[ConvolutionMethod] = None,
+     total_landscapes: Optional[int] = None,
+     filename: Optional[str] = None,
+ ) -> str:
+     """Save a dataset DataFrame to a Parquet file.
+
+     Creates a descriptive filename based on parameters or uses the
+     provided filename. Creates output directory if it doesn't exist.
+
+     Args:
+         df: DataFrame to save.
+         output_dir: Directory to save the file (default: "data/landscapes").
+         n_values: N values used (for filename generation).
+         k_values: K values used (for filename generation).
+         m_values: M values used (for filename generation).
+         power_scales: Power scales used (for filename generation).
+         value_format: Value format string (for filename generation).
+         neighborhood_method: Neighborhood method (for filename).
+         convolution_method: Convolution method (for filename).
+         total_landscapes: Total number of landscapes (for filename).
+         filename: Custom filename (overrides auto-generation).
+
+     Returns:
+         Path to the saved file.
+
+     Note:
+         If filename is provided, it should not include the directory path.
+     """
+     os.makedirs(output_dir, exist_ok=True)
+
+     if filename is None:
+         # Generate descriptive filename
+         parts = []
+         if n_values:
+             parts.append(f"n{'-'.join(map(str, n_values))}")
+         if k_values:
+             parts.append(f"k{'-'.join(map(str, k_values))}")
+         if m_values:
+             parts.append(f"m{'-'.join(map(str, m_values))}")
+         if power_scales:
+             parts.append(f"p{'-'.join(map(str, power_scales))}")
+         if value_format:
+             parts.append(f"f{value_format}")
+         if neighborhood_method:
+             parts.append(f"nm{neighborhood_method.value}")
+         if convolution_method:
+             parts.append(f"cm{convolution_method.value}")
+         if total_landscapes:
+             parts.append(f"size{total_landscapes}")
+
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         parts.append(timestamp)
+
+         filename = "_".join(parts) + ".parquet"
+
+     filepath = os.path.join(output_dir, filename)
+
+     logger.info("Saving dataset to %s", filepath)
+     df.to_parquet(filepath, index=False)
+
+     file_size_mb = os.path.getsize(filepath) / (1024 * 1024)
+     logger.info("Saved dataset: %.2f MB", file_size_mb)
+
+     return filepath
+
+
+ def load_dataset(filepath: str) -> pd.DataFrame:
+     """Load a dataset from a Parquet file.
+
+     Args:
+         filepath: Path to the Parquet file.
+
+     Returns:
+         Loaded DataFrame.
+
+     Raises:
+         FileNotFoundError: If the file doesn't exist.
+     """
+     if not os.path.exists(filepath):
+         raise FileNotFoundError(f"Dataset file not found: {filepath}")
+
+     logger.info("Loading dataset from %s", filepath)
+     df = pd.read_parquet(filepath)
+
+     file_size_mb = os.path.getsize(filepath) / (1024 * 1024)
+     logger.info(
+         "Loaded dataset: %d rows × %d columns (%.2f MB)",
+         len(df),
+         len(df.columns),
+         file_size_mb,
+     )
+
+     return df
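
A minimal sketch of how these four functions chain together (generate → validate → save → load). It is not part of the package: the parameter grid is arbitrary, and the `src.nk_model` import prefix simply mirrors the imports in the module above, so adjust it to however the modules resolve in your installation.

from src.nk_model.dataset import (
    generate_landscape_dataset,
    load_dataset,
    save_dataset,
    validate_dataset,
)
from src.nk_model.enums import ConvolutionMethod, NeighborhoodMethod

# Small, arbitrary parameter grid; combinations with k >= n or m > n are skipped.
df, total = generate_landscape_dataset(
    n_values=[4],
    k_values=[1, 2],
    m_values=[2],
    power_scales=[1.0],
    landscapes_per_combo=3,
    neighborhood_method=NeighborhoodMethod.NEAREST,
    convolution_method=ConvolutionMethod.SYMMETRIC,
)

# Each landscape contributes 2**n rows (one per binary coordinate vector);
# validate_dataset logs any issues it finds and returns False if a check fails.
is_valid = validate_dataset(df, expected_n=4, expected_landscapes=total)

path = save_dataset(df, output_dir="data/landscapes", n_values=[4], total_landscapes=total)
df_again = load_dataset(path)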
nk_model/enums.py ADDED
@@ -0,0 +1,21 @@
+ from enum import Enum
+
+
+ class NeighborhoodMethod(Enum):
+     """
+     Method for determining the neighbors of a given node in the NK model.
+     """
+
+     RANDOM = "random"
+     NEAREST = "nearest"
+     RING = "ring"
+
+
+ class ConvolutionMethod(Enum):
+     """
+     Convolution method to apply to the NK model
+     landscape.
+     """
+
+     RANDOM = "random"
+     SYMMETRIC = "symmetric"
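
For orientation, a tiny illustration (not from the package) of how these enum members behave; their string values are what `generate_landscape_dataset` forwards into `NKParams`.

from src.nk_model.enums import ConvolutionMethod, NeighborhoodMethod

# Enum members round-trip through their string values, e.g. when the method
# name is read from a config file.
assert NeighborhoodMethod("ring") is NeighborhoodMethod.RING
assert ConvolutionMethod.SYMMETRIC.value == "symmetric"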
nk_model/landscape_cache.py ADDED
@@ -0,0 +1,149 @@
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, Literal, Optional
+
+ import numpy as np
+
+ from src.nk_model.models import Item, NKLandscapeCache, NKParams
+
+
+ class LandscapeCache:
+     """
+     A file-based cache for NKLandscape objects.
+
+     This class handles saving and loading landscape data to/from disk
+     using Pydantic models for simplified serialization.
+     The cache is stored in JSON format in the data directory.
+     """
+
+     def __init__(
+         self,
+         cache_dir: str = "data/landscape_cache",
+         cache_type: Literal["memory", "disk", "none"] = "memory",
+     ):
+         """
+         Initialize the landscape cache.
+
+         Args:
+             cache_dir: Directory to store the cache files
+             cache_type: Type of caching to use:
+                 - "none": No caching
+                 - "memory": Only in-memory caching (default)
+                 - "disk": Both memory and disk caching
+         """
+         self.cache_type = cache_type
+
+         # Set the cache file path
+         self.cache_file = Path(cache_dir) / "landscape_cache.json"
+         self.cache_file.parent.mkdir(parents=True, exist_ok=True)
+
+         # Load existing cache
+         self._cache: Dict[str, Dict[str, Any]] = self.load_from_disk()
+
+     def load_from_disk(self) -> Dict[str, Dict[str, Any]]:
+         """
+         Load the landscape cache from disk, if it exists.
+
+         Returns:
+             A dictionary mapping landscape UUIDs to their cached data,
+             or an empty dict if caching is disabled or no cache file exists.
+         """
+         if self.cache_type == "none" or not self.cache_file.exists():
+             return {}
+         with open(self.cache_file, "r") as f:
+             return json.load(f)
+
+     def get(self, uuid: str) -> Optional[NKLandscapeCache]:
+         """
+         Get a landscape from the cache.
+
+         Args:
+             uuid: The UUID of the landscape to retrieve
+
+         Returns:
+             NKLandscapeCache instance or None if not found
+         """
+         if uuid not in self._cache:
+             return None
+
+         cached_data = self._cache[uuid]
+
+         # Try loading as new format first
+         try:
+             return NKLandscapeCache(
+                 params=NKParams(**cached_data["params"]),
+                 items=[
+                     Item(
+                         coordinates=np.array(item["coordinates"]),
+                         payoff=item["payoff"],
+                     )
+                     for item in cached_data["items"]
+                 ],
+             )
+         except (KeyError, TypeError, ValueError):
+             # Fall back to legacy format if new format fails
+             return self._load_legacy_cache(cached_data)
+
+     def save(self, uuid: str, data: NKLandscapeCache) -> None:
+         """
+         Save a landscape to the cache.
+
+         Args:
+             uuid: The UUID of the landscape
+             data: NKLandscapeCache instance to cache
+         """
+         if self.cache_type == "none":
+             return
+
+         # Serialize using Pydantic model
+         data_dict = json.loads(data.model_dump_json())
+
+         # Save to in-memory cache
+         self._cache[uuid] = data_dict
+
+         # Save to disk if cache_type is "disk"
+         if self.cache_type == "disk":
+             with open(self.cache_file, "w") as f:
+                 json.dump(self._cache, f)
+
+     def clear(self) -> None:
+         """Clear the entire cache."""
+         if self.cache_type == "none":
+             return
+
+         self._cache = {}
+         if self.cache_type == "disk" and self.cache_file.exists():
+             self.cache_file.unlink()
+
+     def _load_legacy_cache(
+         self, cached_data: Dict[str, Any]
+     ) -> NKLandscapeCache:
+         """
+         Load cache data from legacy format.
+
+         Args:
+             cached_data: Dictionary containing legacy cache data
+
+         Returns:
+             NKLandscapeCache instance reconstructed from legacy format
+         """
+         # Extract params from old format
+         # Default to float for legacy cache (was the original behavior)
+         params = NKParams(
+             n=cached_data.get("nk_param_n", 0),
+             k=cached_data.get("nk_param_k", 0),
+             m=cached_data.get("nk_param_m", 0),
+             power=cached_data.get("payoff_scaling", 1.0),
+             max_val=cached_data.get("value_format", 1.0),
+             neighborhood=cached_data.get("neighborhood_method"),
+             convolution=cached_data.get("convolution_method"),
+             payoff_type="float",
+         )
+         items = [
+             Item(
+                 coordinates=np.array(item["coordinates"]),
+                 payoff=item["payoff"],
+             )
+             for item in cached_data["items"]
+         ]
+         return NKLandscapeCache(params=params, items=items)
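
A rough usage sketch for `LandscapeCache`, assuming `NKLandscape` exposes `uuid` and `items` as it does in `nk_model/dataset.py`, and that `NKParams` accepts the same fields used there; the `NKLandscapeCache(params=..., items=...)` call mirrors the constructor used in `get()` above, and the import prefix again follows the package's own `src.` convention. Illustrative only, not code from the package.

from src.nk_model.enums import ConvolutionMethod, NeighborhoodMethod
from src.nk_model.landscape_cache import LandscapeCache
from src.nk_model.models import NKLandscapeCache, NKParams
from src.nk_model.nk_landscape import NKLandscape

cache = LandscapeCache(cache_dir="data/landscape_cache", cache_type="disk")

# Field names mirror the NKParams calls in nk_model/dataset.py above.
params = NKParams(
    n=4,
    k=1,
    m=2,
    power=1.0,
    max_val=1000.0,
    neighborhood=NeighborhoodMethod.NEAREST,
    convolution=ConvolutionMethod.SYMMETRIC,
    payoff_type="int",
)
landscape = NKLandscape(params)

# Store the generated landscape under its UUID, then read it back.
cache.save(landscape.uuid, NKLandscapeCache(params=params, items=landscape.items))
cached = cache.get(landscape.uuid)
if cached is not None:
    print(f"Restored {len(cached.items)} items from cache")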