ai-nk-cce 0.1.0 (ai_nk_cce-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. ai_nk_cce-0.1.0.dist-info/METADATA +118 -0
  2. ai_nk_cce-0.1.0.dist-info/RECORD +46 -0
  3. ai_nk_cce-0.1.0.dist-info/WHEEL +4 -0
  4. api/__init__.py +0 -0
  5. api/mpcdf_vllm.py +94 -0
  6. evals/nk_model.py +277 -0
  7. model/README.md +64 -0
  8. model/config/dataset_conv_v1.yml +9 -0
  9. model/config/dataset_conv_v2_m2.yml +9 -0
  10. model/config/dataset_conv_v3_m2_assembl_nearest.yml +9 -0
  11. model/config/dataset_debug.yml +9 -0
  12. model/config/dataset_v4_int_format.yml +9 -0
  13. model/config/dataset_v5.yml +9 -0
  14. model/config/inference.yml +7 -0
  15. model/config/train.yml +24 -0
  16. model/config/train_debug.yml +19 -0
  17. model/config/train_from_checkpoint.yml +24 -0
  18. model/config/train_from_checkpoint_debug.yml +19 -0
  19. model/config/train_grpo.yml +30 -0
  20. model/config/train_grpo_debug.yml +30 -0
  21. model/config/train_grpo_debug_vllm.yml +32 -0
  22. model/config.py +54 -0
  23. model/dataset.py +324 -0
  24. model/inference.py +51 -0
  25. model/nk_assistant.py +207 -0
  26. model/parser.py +70 -0
  27. model/run_slurm.py +335 -0
  28. model/score.ipynb +596 -0
  29. model/scripts/template.slurm +54 -0
  30. model/scripts/template_rl.slurm +54 -0
  31. model/train.py +293 -0
  32. nk_model/__init__.py +0 -0
  33. nk_model/assembler.py +112 -0
  34. nk_model/biased_prediction_agent.py +389 -0
  35. nk_model/dataset.py +434 -0
  36. nk_model/enums.py +21 -0
  37. nk_model/landscape_cache.py +149 -0
  38. nk_model/models.py +172 -0
  39. nk_model/nk_landscape.py +498 -0
  40. simulation/hill_climber_simulation.py +211 -0
  41. simulation/hill_climber_vs_ai_simulation.py +132 -0
  42. simulation/landscape_selection.py +179 -0
  43. utils/__init__.py +0 -0
  44. utils/binary_conversion.py +128 -0
  45. utils/logging.py +33 -0
  46. utils/utils.py +51 -0
nk_model/dataset.py ADDED
@@ -0,0 +1,434 @@
+ """Dataset generation utilities for NK landscape datasets.
+
+ This module provides functions to generate, validate, save, and load
+ NK landscape datasets for machine learning purposes.
+ """
+
+ import logging
+ import os
+ from datetime import datetime
+ from typing import Optional, Tuple
+
+ import pandas as pd
+ from tqdm import tqdm
+
+ from src.nk_model.enums import ConvolutionMethod, NeighborhoodMethod
+ from src.nk_model.models import NKParams
+ from src.nk_model.nk_landscape import NKLandscape
+
+ logger = logging.getLogger(__name__)
+
+
+ def generate_landscape_dataset(  # noqa: C901
+     n_values: list[int],
+     k_values: list[int],
+     m_values: list[int],
+     power_scales: list[float],
+     landscapes_per_combo: int,
+     value_format: str = "1000.",
+     neighborhood_method: NeighborhoodMethod = NeighborhoodMethod.NEAREST,
+     convolution_method: ConvolutionMethod = ConvolutionMethod.SYMMETRIC,
+     use_notebook_tqdm: bool = False,
+ ) -> Tuple[pd.DataFrame, int]:
+     """Generate a dataset of NK landscapes with varying parameters.
+
+     Creates landscapes for all combinations of the provided parameters
+     and returns a DataFrame with landscape data including coordinates
+     and payoffs.
+
+     Args:
+         n_values: List of N values (number of components).
+         k_values: List of K values (interactions per component).
+         m_values: List of M values (number of convolutions).
+         power_scales: List of power scaling factors.
+         landscapes_per_combo: Number of landscapes to generate per
+             parameter combination.
+         value_format: Format string for max_val (default: "1000.").
+         neighborhood_method: Method for determining neighbors.
+         convolution_method: Method for convolution.
+         use_notebook_tqdm: If True, use tqdm.notebook for progress
+             bars (default: False).
+
+     Returns:
+         Tuple of (DataFrame, int) containing:
+             - DataFrame with columns:
+                 - landscape_uuid: Unique identifier for each landscape
+                 - n, k, m, power_scale: Parameter values
+                 - payoff: Payoff value for the coordinate
+                 - coord_1 through coord_N: Binary coordinate values
+             - Total number of landscapes generated
+
+     Raises:
+         ValueError: If invalid parameter combinations are provided.
+     """
+     if use_notebook_tqdm:
+         from tqdm.notebook import tqdm as notebook_tqdm
+
+         tqdm_func = notebook_tqdm
+     else:
+         tqdm_func = tqdm
+
+     # Calculate total parameter combinations
+     logger.info("Calculating total landscapes to generate...")
+     logger.debug("N values: %s", n_values)
+     logger.debug("K values: %s", k_values)
+     logger.debug("M values: %s", m_values)
+     logger.debug("Power scales: %s", power_scales)
+     logger.debug("Landscapes per combination: %d", landscapes_per_combo)
+
+     total_combos = sum(
+         1
+         for n in n_values
+         for k in k_values
+         for m in m_values
+         if k < n and m <= n
+         for _ in power_scales
+     )
+     total_landscapes = total_combos * landscapes_per_combo
+
+     logger.info(
+         "Will generate %d landscapes across %d parameter combinations",
+         total_landscapes,
+         total_combos,
+     )
+     logger.info(
+         "Landscapes per combination: %d",
+         landscapes_per_combo,
+     )
+
+     data = []
+
+     # Main progress bar for N values
+     for n in tqdm_func(n_values, desc="N values", position=0):
+         # Progress bar for K values
+         for k in tqdm_func(
+             k_values, desc=f"K values (N={n})", position=1, leave=False
+         ):
+             if k >= n:
+                 continue
+
+             # Progress bar for M values
+             for m in tqdm_func(
+                 m_values,
+                 desc=f"M values (N={n}, K={k})",
+                 position=2,
+                 leave=False,
+             ):
+                 if m > n:
+                     continue
+
+                 # Progress bar for power scales
+                 for power_scale in tqdm_func(
+                     power_scales,
+                     desc=f"Power scales (N={n}, K={k}, M={m})",
+                     position=3,
+                     leave=False,
+                 ):
+                     # Progress bar for individual landscapes
+                     for _ in tqdm_func(
+                         range(landscapes_per_combo),
+                         desc=(
+                             f"Landscapes "
+                             f"(N={n},K={k},M={m},P={power_scale})"
+                         ),
+                         position=4,
+                         leave=False,
+                     ):
+                         params = NKParams(
+                             n=n,
+                             k=k,
+                             m=m,
+                             power=power_scale,
+                             max_val=float(value_format),
+                             neighborhood=neighborhood_method,
+                             convolution=convolution_method,
+                             payoff_type="int",
+                         )
+
+                         landscape = NKLandscape(params)
+
+                         # For each item in the landscape
+                         for item in landscape.items:
+                             row = {
+                                 "landscape_uuid": landscape.uuid,
+                                 "n": n,
+                                 "k": k,
+                                 "m": m,
+                                 "power_scale": power_scale,
+                                 "payoff": item.payoff,
+                             }
+                             # Add each coordinate as a separate column
+                             for i, coord in enumerate(item.coordinates):
+                                 row[f"coord_{i+1}"] = int(coord)
+
+                             data.append(row)
+
+     df = pd.DataFrame(data)
+     logger.info(
+         "Generated dataset with %d rows and %d columns",
+         len(df),
+         len(df.columns),
+     )
+     logger.info(
+         "Successfully generated %d landscapes",
+         total_landscapes,
+     )
+     return df, total_landscapes
+
+
+ def validate_dataset(  # noqa: C901
+     df: pd.DataFrame,
+     expected_n: Optional[int] = None,
+     expected_landscapes: Optional[int] = None,
+     expected_k_values: Optional[list[int]] = None,
+ ) -> bool:
+     """Validate a generated NK landscape dataset.
+
+     Performs comprehensive checks on the dataset including:
+     - Row counts per landscape (should be 2^N)
+     - N value consistency
+     - K value distribution
+     - Payoff value ranges
+     - Coordinate validity (binary values)
+
+     Args:
+         df: DataFrame to validate.
+         expected_n: Expected N value (if None, uses unique N from df).
+         expected_landscapes: Expected number of unique landscapes.
+         expected_k_values: Expected K values for distribution check.
+
+     Returns:
+         True if all validations pass, False otherwise.
+
+     Note:
+         All validation issues are logged at appropriate levels.
+     """
+     logger.info("Running dataset validation checks...")
+
+     all_uuids = df["landscape_uuid"].unique()
+     total_uuids = len(all_uuids)
+
+     if expected_n is None:
+         expected_n = df["n"].unique()[0]
+         logger.debug("Inferred expected_n=%d from dataset", expected_n)
+
+     issues = []
+
+     # Check number of unique landscapes
+     if expected_landscapes is not None:
+         actual_landscapes = total_uuids
+         if actual_landscapes == expected_landscapes:
+             logger.info(
+                 "Found expected number of landscapes: %d",
+                 actual_landscapes,
+             )
+         else:
+             msg = (
+                 f"Found {actual_landscapes:,} landscapes, "
+                 f"expected {expected_landscapes:,}"
+             )
+             logger.error(msg)
+             issues.append(msg)
+
+     # Check row counts per landscape (should be 2^N)
+     logger.debug("Checking row counts per landscape...")
+     row_counts = df.groupby("landscape_uuid").size()
+     expected_rows = 2**expected_n
+     incorrect_counts = row_counts[row_counts != expected_rows]
+
+     if len(incorrect_counts) > 0:
+         msg = (
+             f"Found {len(incorrect_counts)} landscapes with "
+             f"wrong row count (expected {expected_rows})"
+         )
+         logger.error(msg)
+         for uuid, count in incorrect_counts.items():
+             logger.error(
+                 "UUID %s: %d rows (expected %d)", uuid, count, expected_rows
+             )
+         issues.append(msg)
+     else:
+         logger.info(
+             "All landscapes have correct number of rows (%d)",
+             expected_rows,
+         )
+
+     # Check N value consistency
+     logger.debug("Checking N value consistency...")
+     if not (df["n"] == expected_n).all():
+         msg = f"Not all N values are {expected_n}"
+         logger.error(msg)
+         logger.error("N value distribution:\n%s", df["n"].value_counts())
+         issues.append(msg)
+     else:
+         logger.info("All N values are %d", expected_n)
+
+     # Check K value distribution
+     if expected_k_values is not None:
+         logger.debug("Checking K value distribution...")
+         k_counts = df.groupby("k")["landscape_uuid"].nunique()
+         expected_per_k = total_uuids / len(expected_k_values)
+
+         logger.info(
+             "K value distribution (expected %.0f per K):",
+             expected_per_k,
+         )
+         for k, count in k_counts.items():
+             logger.info("K=%d: %d landscapes", k, count)
+             if count != expected_per_k:
+                 msg = (
+                     f"K={k} has {count:,} landscapes, "
+                     f"expected {expected_per_k:.0f}"
+                 )
+                 logger.error(msg)
+                 issues.append(msg)
+
+     # Check payoffs
+     logger.debug("Checking payoff values...")
+     payoffs = df["payoff"]
+     if payoffs.min() != 0:
+         msg = "No minimum payoff of 0 found"
+         logger.warning(msg)
+         issues.append(msg)
+
+     # Check max payoff (should match max_val from value_format)
+     max_payoff = payoffs.max()
+     if max_payoff < 0:
+         msg = "Negative payoffs found"
+         logger.error(msg)
+         issues.append(msg)
+     else:
+         logger.info(
+             "Payoff range: [%.2f, %.2f]",
+             payoffs.min(),
+             max_payoff,
+         )
+
+     # Check coordinates
+     logger.debug("Checking coordinate values...")
+     coord_cols = [col for col in df.columns if col.startswith("coord_")]
+     invalid_coords = []
+
+     for col in coord_cols:
+         unique_vals = df[col].unique()
+         if not all(val in [0, 1] for val in unique_vals):
+             msg = f"Column {col} has invalid values: {unique_vals}"
+             logger.error(msg)
+             invalid_coords.append(msg)
+
+     if invalid_coords:
+         issues.extend(invalid_coords)
+     else:
+         logger.info("All coordinates are binary (0s and 1s)")
+
+     # Summary
+     if issues:
+         logger.error("Validation failed with %d issue(s)", len(issues))
+         return False
+     else:
+         logger.info("All validation checks passed")
+         return True
+
+
+ def save_dataset(
+     df: pd.DataFrame,
+     output_dir: str = "data/landscapes",
+     n_values: Optional[list[int]] = None,
+     k_values: Optional[list[int]] = None,
+     m_values: Optional[list[int]] = None,
+     power_scales: Optional[list[float]] = None,
+     value_format: Optional[str] = None,
+     neighborhood_method: Optional[NeighborhoodMethod] = None,
+     convolution_method: Optional[ConvolutionMethod] = None,
+     total_landscapes: Optional[int] = None,
+     filename: Optional[str] = None,
+ ) -> str:
+     """Save a dataset DataFrame to a Parquet file.
+
+     Creates a descriptive filename based on parameters or uses the
+     provided filename. Creates output directory if it doesn't exist.
+
+     Args:
+         df: DataFrame to save.
+         output_dir: Directory to save the file (default: "data/landscapes").
+         n_values: N values used (for filename generation).
+         k_values: K values used (for filename generation).
+         m_values: M values used (for filename generation).
+         power_scales: Power scales used (for filename generation).
+         value_format: Value format string (for filename generation).
+         neighborhood_method: Neighborhood method (for filename).
+         convolution_method: Convolution method (for filename).
+         total_landscapes: Total number of landscapes (for filename).
+         filename: Custom filename (overrides auto-generation).
+
+     Returns:
+         Path to the saved file.
+
+     Note:
+         If filename is provided, it should not include the directory path.
+     """
+     os.makedirs(output_dir, exist_ok=True)
+
+     if filename is None:
+         # Generate descriptive filename
+         parts = []
+         if n_values:
+             parts.append(f"n{'-'.join(map(str, n_values))}")
+         if k_values:
+             parts.append(f"k{'-'.join(map(str, k_values))}")
+         if m_values:
+             parts.append(f"m{'-'.join(map(str, m_values))}")
+         if power_scales:
+             parts.append(f"p{'-'.join(map(str, power_scales))}")
+         if value_format:
+             parts.append(f"f{value_format}")
+         if neighborhood_method:
+             parts.append(f"nm{neighborhood_method.value}")
+         if convolution_method:
+             parts.append(f"cm{convolution_method.value}")
+         if total_landscapes:
+             parts.append(f"size{total_landscapes}")
+
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         parts.append(timestamp)
+
+         filename = "_".join(parts) + ".parquet"
+
+     filepath = os.path.join(output_dir, filename)
+
+     logger.info("Saving dataset to %s", filepath)
+     df.to_parquet(filepath, index=False)
+
+     file_size_mb = os.path.getsize(filepath) / (1024 * 1024)
+     logger.info("Saved dataset: %.2f MB", file_size_mb)
+
+     return filepath
+
+
+ def load_dataset(filepath: str) -> pd.DataFrame:
+     """Load a dataset from a Parquet file.
+
+     Args:
+         filepath: Path to the Parquet file.
+
+     Returns:
+         Loaded DataFrame.
+
+     Raises:
+         FileNotFoundError: If the file doesn't exist.
+     """
+     if not os.path.exists(filepath):
+         raise FileNotFoundError(f"Dataset file not found: {filepath}")
+
+     logger.info("Loading dataset from %s", filepath)
+     df = pd.read_parquet(filepath)
+
+     file_size_mb = os.path.getsize(filepath) / (1024 * 1024)
+     logger.info(
+         "Loaded dataset: %d rows × %d columns (%.2f MB)",
+         len(df),
+         len(df.columns),
+         file_size_mb,
+     )
+
+     return df
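
A minimal sketch of how these four functions chain together (generate → validate → save → load). It is not part of the package: the parameter grid is arbitrary, and the `src.nk_model` import prefix simply mirrors the imports in the module above, so adjust it to however the modules resolve in your installation.

from src.nk_model.dataset import (
    generate_landscape_dataset,
    load_dataset,
    save_dataset,
    validate_dataset,
)
from src.nk_model.enums import ConvolutionMethod, NeighborhoodMethod

# Small, arbitrary parameter grid; combinations with k >= n or m > n are skipped.
df, total = generate_landscape_dataset(
    n_values=[4],
    k_values=[1, 2],
    m_values=[2],
    power_scales=[1.0],
    landscapes_per_combo=3,
    neighborhood_method=NeighborhoodMethod.NEAREST,
    convolution_method=ConvolutionMethod.SYMMETRIC,
)

# Each landscape contributes 2**n rows (one per binary coordinate vector);
# validate_dataset logs any issues it finds and returns False if a check fails.
is_valid = validate_dataset(df, expected_n=4, expected_landscapes=total)

path = save_dataset(df, output_dir="data/landscapes", n_values=[4], total_landscapes=total)
df_again = load_dataset(path)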
nk_model/enums.py ADDED
@@ -0,0 +1,21 @@
+ from enum import Enum
+
+
+ class NeighborhoodMethod(Enum):
+     """
+     Method for determining the neighbors of a given node in the NK model.
+     """
+
+     RANDOM = "random"
+     NEAREST = "nearest"
+     RING = "ring"
+
+
+ class ConvolutionMethod(Enum):
+     """
+     Convolution method to apply to the NK model
+     landscape.
+     """
+
+     RANDOM = "random"
+     SYMMETRIC = "symmetric"
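
For orientation, a tiny illustration (not from the package) of how these enum members behave; their string values are what `generate_landscape_dataset` forwards into `NKParams`.

from src.nk_model.enums import ConvolutionMethod, NeighborhoodMethod

# Enum members round-trip through their string values, e.g. when the method
# name is read from a config file.
assert NeighborhoodMethod("ring") is NeighborhoodMethod.RING
assert ConvolutionMethod.SYMMETRIC.value == "symmetric"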
nk_model/landscape_cache.py ADDED
@@ -0,0 +1,149 @@
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, Literal, Optional
+
+ import numpy as np
+
+ from src.nk_model.models import Item, NKLandscapeCache, NKParams
+
+
+ class LandscapeCache:
+     """
+     A file-based cache for NKLandscape objects.
+
+     This class handles saving and loading landscape data to/from disk
+     using Pydantic models for simplified serialization.
+     The cache is stored in JSON format in the data directory.
+     """
+
+     def __init__(
+         self,
+         cache_dir: str = "data/landscape_cache",
+         cache_type: Literal["memory", "disk", "none"] = "memory",
+     ):
+         """
+         Initialize the landscape cache.
+
+         Args:
+             cache_dir: Directory to store the cache files
+             cache_type: Type of caching to use:
+                 - "none": No caching
+                 - "memory": Only in-memory caching (default)
+                 - "disk": Both memory and disk caching
+         """
+         self.cache_type = cache_type
+
+         # Set the cache file path
+         self.cache_file = Path(cache_dir) / "landscape_cache.json"
+         self.cache_file.parent.mkdir(parents=True, exist_ok=True)
+
+         # Load existing cache
+         self._cache: Dict[str, Dict[str, Any]] = self.load_from_disk()
+
+     def load_from_disk(self) -> Dict[str, Dict[str, Any]]:
+         """
+         Load the landscape cache from disk, if it exists.
+
+         Returns:
+             A dictionary mapping landscape UUIDs to their cached data,
+             or an empty dict if caching is disabled or no cache file exists.
+         """
+         if self.cache_type == "none" or not self.cache_file.exists():
+             return {}
+         with open(self.cache_file, "r") as f:
+             return json.load(f)
+
+     def get(self, uuid: str) -> Optional[NKLandscapeCache]:
+         """
+         Get a landscape from the cache.
+
+         Args:
+             uuid: The UUID of the landscape to retrieve
+
+         Returns:
+             NKLandscapeCache instance or None if not found
+         """
+         if uuid not in self._cache:
+             return None
+
+         cached_data = self._cache[uuid]
+
+         # Try loading as new format first
+         try:
+             return NKLandscapeCache(
+                 params=NKParams(**cached_data["params"]),
+                 items=[
+                     Item(
+                         coordinates=np.array(item["coordinates"]),
+                         payoff=item["payoff"],
+                     )
+                     for item in cached_data["items"]
+                 ],
+             )
+         except (KeyError, TypeError, ValueError):
+             # Fall back to legacy format if new format fails
+             return self._load_legacy_cache(cached_data)
+
+     def save(self, uuid: str, data: NKLandscapeCache) -> None:
+         """
+         Save a landscape to the cache.
+
+         Args:
+             uuid: The UUID of the landscape
+             data: NKLandscapeCache instance to cache
+         """
+         if self.cache_type == "none":
+             return
+
+         # Serialize using Pydantic model
+         data_dict = json.loads(data.model_dump_json())
+
+         # Save to in-memory cache
+         self._cache[uuid] = data_dict
+
+         # Save to disk if cache_type is "disk"
+         if self.cache_type == "disk":
+             with open(self.cache_file, "w") as f:
+                 json.dump(self._cache, f)
+
+     def clear(self) -> None:
+         """Clear the entire cache."""
+         if self.cache_type == "none":
+             return
+
+         self._cache = {}
+         if self.cache_type == "disk" and self.cache_file.exists():
+             self.cache_file.unlink()
+
+     def _load_legacy_cache(
+         self, cached_data: Dict[str, Any]
+     ) -> NKLandscapeCache:
+         """
+         Load cache data from legacy format.
+
+         Args:
+             cached_data: Dictionary containing legacy cache data
+
+         Returns:
+             NKLandscapeCache instance reconstructed from legacy format
+         """
+         # Extract params from old format
+         # Default to float for legacy cache (was the original behavior)
+         params = NKParams(
+             n=cached_data.get("nk_param_n", 0),
+             k=cached_data.get("nk_param_k", 0),
+             m=cached_data.get("nk_param_m", 0),
+             power=cached_data.get("payoff_scaling", 1.0),
+             max_val=cached_data.get("value_format", 1.0),
+             neighborhood=cached_data.get("neighborhood_method"),
+             convolution=cached_data.get("convolution_method"),
+             payoff_type="float",
+         )
+         items = [
+             Item(
+                 coordinates=np.array(item["coordinates"]),
+                 payoff=item["payoff"],
+             )
+             for item in cached_data["items"]
+         ]
+         return NKLandscapeCache(params=params, items=items)
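
A rough usage sketch for `LandscapeCache`, assuming `NKLandscape` exposes `uuid` and `items` as it does in `nk_model/dataset.py`, and that `NKParams` accepts the same fields used there; the `NKLandscapeCache(params=..., items=...)` call mirrors the constructor used in `get()` above, and the import prefix again follows the package's own `src.` convention. Illustrative only, not code from the package.

from src.nk_model.enums import ConvolutionMethod, NeighborhoodMethod
from src.nk_model.landscape_cache import LandscapeCache
from src.nk_model.models import NKLandscapeCache, NKParams
from src.nk_model.nk_landscape import NKLandscape

cache = LandscapeCache(cache_dir="data/landscape_cache", cache_type="disk")

# Field names mirror the NKParams calls in nk_model/dataset.py above.
params = NKParams(
    n=4,
    k=1,
    m=2,
    power=1.0,
    max_val=1000.0,
    neighborhood=NeighborhoodMethod.NEAREST,
    convolution=ConvolutionMethod.SYMMETRIC,
    payoff_type="int",
)
landscape = NKLandscape(params)

# Store the generated landscape under its UUID, then read it back.
cache.save(landscape.uuid, NKLandscapeCache(params=params, items=landscape.items))
cached = cache.get(landscape.uuid)
if cached is not None:
    print(f"Restored {len(cached.items)} items from cache")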