coreinsight-cli 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {coreinsight_cli-0.3.0/coreinsight_cli.egg-info → coreinsight_cli-0.3.2}/PKG-INFO +1 -1
  2. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/main.py +289 -29
  3. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/memory.py +81 -0
  4. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/prompts.py +9 -7
  5. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/sandbox.py +69 -1
  6. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2/coreinsight_cli.egg-info}/PKG-INFO +1 -1
  7. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/pyproject.toml +1 -1
  8. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/LICENSE +0 -0
  9. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/README.md +0 -0
  10. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/__init__.py +0 -0
  11. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/analyzer.py +0 -0
  12. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/config.py +0 -0
  13. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/demo/__init__.py +0 -0
  14. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/demo/bad_loop.py +0 -0
  15. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/demo/data_processor.py +0 -0
  16. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/demo/slow.cpp +0 -0
  17. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/embeddings.py +0 -0
  18. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/hardware.py +0 -0
  19. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/indexer.py +0 -0
  20. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/parser.py +0 -0
  21. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/profiler.py +0 -0
  22. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/scanner.py +0 -0
  23. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/tui.py +0 -0
  24. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/SOURCES.txt +0 -0
  25. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/dependency_links.txt +0 -0
  26. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/entry_points.txt +0 -0
  27. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/requires.txt +0 -0
  28. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/top_level.txt +0 -0
  29. {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coreinsight-cli
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Local-first AI performance profiler that mathematically verifies optimizations for Python, C++, and CUDA
5
5
  Author: Varun Jani
6
6
  License: GPL-3.0-or-later
@@ -72,12 +72,10 @@ def _check_speedup_success(success: bool, logs: str) -> bool:
72
72
  pass
73
73
  return False
74
74
 
75
-
76
75
  def _run_single_agent(
77
76
  func_name, original_code, language, context,
78
77
  hardware_target, sandbox, agent, tier_limits,
79
- stream_callback: Optional[Callable[[str], None]] = None,
80
- ):
78
+ stream_callback: Optional[Callable[[str], None]] = None):
81
79
  """
82
80
  Original single-agent pipeline.
83
81
  Returns (result, optimized_code, success, logs, plot_data, is_valid).
@@ -133,12 +131,10 @@ def _run_single_agent(
133
131
 
134
132
  return result, optimized_code, success, logs, plot_data, is_valid
135
133
 
136
-
137
134
  def _run_multi_agent(
138
135
  func_name, original_code, language, context,
139
136
  hardware_target, sandbox, multi_agents, tier_limits,
140
- stream_callback: Optional[Callable[[str], None]] = None,
141
- ):
137
+ stream_callback: Optional[Callable[[str], None]] = None):
142
138
  """
143
139
  Multi-agent pipeline.
144
140
  BottleneckAgent → analysis
@@ -209,21 +205,78 @@ def process_function(func: dict, language: str, agent: AnalyzerAgent, sandbox: C
209
205
  _log(func_name, "Fetching RAG context...")
210
206
  context = indexer.get_context_for_code(original_code) if indexer else ""
211
207
 
212
- # 0b. Memory lookup — skip LLM entirely if we've seen this pattern before
208
+ # 0b. Memory lookup — skip LLM if we've seen this pattern before,
209
+ # but validate the stored result before trusting it:
210
+ # Gate A: no optimized code stored → previous run was incomplete, re-run LLM
211
+ # Gate B: correctness < 50% last run → keep analysis, re-run correctness only
212
+ # Gate C: result is good → return as-is
213
213
  if memory:
214
214
  memory_hit = memory.lookup(original_code, language)
215
215
  if memory_hit:
216
216
  label = "exact match" if memory_hit.is_exact else f"similarity {memory_hit.similarity:.1%}"
217
- _log(func_name, f"⚡ Recalled from memory ({label}) — skipping LLM", style="bold cyan")
218
- recalled_result = {
219
- "severity": memory_hit.severity,
220
- "issue": memory_hit.issue,
221
- "reasoning": memory_hit.reasoning,
222
- "optimized_code": memory_hit.optimized_code,
223
- "suggestion": "",
224
- "bottlenecks": [],
225
- }
226
- return func_name, recalled_result, None, None, None, None, memory_hit, False
217
+
218
+ # Gate A: stored result has no optimized code — not useful, fall through to LLM
219
+ if not memory_hit.optimized_code:
220
+ _log(func_name, f"Memory hit ({label}) — no optimized code stored, re-running LLM", style="yellow")
221
+ memory_hit = None # fall through; LLM path runs below as normal
222
+
223
+ # Gate B: correctness was poor last time — re-run the correctness check only
224
+ elif memory_hit.total_cases > 0 and memory_hit.correctness_cases / memory_hit.total_cases < 0.5:
225
+ _log(
226
+ func_name,
227
+ f"Memory hit ({label}) — correctness was "
228
+ f"{memory_hit.correctness_cases}/{memory_hit.total_cases} last run, re-checking",
229
+ style="yellow",
230
+ )
231
+ recalled_result = {
232
+ "severity": memory_hit.severity,
233
+ "issue": memory_hit.issue,
234
+ "reasoning": memory_hit.reasoning,
235
+ "optimized_code": memory_hit.optimized_code,
236
+ "suggestion": "",
237
+ "bottlenecks": [],
238
+ }
239
+ new_verification = None
240
+ if not getattr(sandbox, "disabled", False):
241
+ stored_cases = memory.lookup_test_cases(original_code)
242
+ if stored_cases:
243
+ _log(func_name, "Re-running correctness sandbox with stored test cases...", style="dim")
244
+ correctness = sandbox.verify_correctness_only(
245
+ original_code=original_code,
246
+ optimized_code=memory_hit.optimized_code,
247
+ original_func_name=func_name,
248
+ optimized_func_name=func_name,
249
+ test_cases=stored_cases,
250
+ language=language,
251
+ context=context,
252
+ )
253
+ _log(func_name, f"Re-verification: {correctness.passed_cases}/{correctness.total_cases} passed", style="dim")
254
+ try:
255
+ from coreinsight.sandbox import VerificationResult, SpeedupVerification
256
+ new_verification = VerificationResult(
257
+ speedup=SpeedupVerification(
258
+ verified=True,
259
+ computed_speedups=[memory_hit.avg_speedup] if memory_hit.avg_speedup else [],
260
+ details=f"Speedup recalled from memory: {memory_hit.avg_speedup:.2f}x",
261
+ ),
262
+ correctness=correctness,
263
+ )
264
+ except Exception:
265
+ pass # verification display is non-critical
266
+ return func_name, recalled_result, None, None, new_verification, None, memory_hit, False
267
+
268
+ # Gate C: stored result is complete and correctness is acceptable
269
+ else:
270
+ _log(func_name, f"⚡ Recalled from memory ({label}) — skipping LLM", style="bold cyan")
271
+ recalled_result = {
272
+ "severity": memory_hit.severity,
273
+ "issue": memory_hit.issue,
274
+ "reasoning": memory_hit.reasoning,
275
+ "optimized_code": memory_hit.optimized_code,
276
+ "suggestion": "",
277
+ "bottlenecks": [],
278
+ }
279
+ return func_name, recalled_result, None, None, None, None, memory_hit, False
227
280
 
228
281
  # ── Route: single-agent vs multi-agent ──────────────────────────
229
282
  if agent_mode == "multi" and multi_agents:
@@ -244,8 +297,37 @@ def process_function(func: dict, language: str, agent: AnalyzerAgent, sandbox: C
244
297
  if result is None:
245
298
  return func_name, None, None, f"❌ Analysis error: {logs}", None, None, None, False
246
299
 
300
+ # Retry gate: Low severity or missing optimized code often means the model
301
+ # defaulted to "looks fine" rather than truly auditing.
302
+ # Retry up to 2 times before accepting the conclusion.
303
+ _MAX_ANALYSIS_RETRIES = 2
304
+ _retry = 0
305
+ while (result.get("severity") == "Low" or not optimized_code) and _retry < _MAX_ANALYSIS_RETRIES:
306
+ _retry += 1
307
+ _log(func_name, f"Low/missing result — retrying analysis ({_retry}/{_MAX_ANALYSIS_RETRIES})...", style="yellow")
308
+ if agent_mode == "multi" and multi_agents:
309
+ result, optimized_code, success, logs, plot_data, is_valid_optimization = \
310
+ _run_multi_agent(
311
+ func_name, original_code, language, context,
312
+ hardware_target, sandbox, multi_agents, tier_limits,
313
+ stream_callback=stream_callback,
314
+ )
315
+ else:
316
+ result, optimized_code, success, logs, plot_data, is_valid_optimization = \
317
+ _run_single_agent(
318
+ func_name, original_code, language, context,
319
+ hardware_target, sandbox, agent, tier_limits,
320
+ stream_callback=stream_callback,
321
+ )
322
+ if result is None:
323
+ break
324
+
325
+ if result is None:
326
+ return func_name, None, None, f"❌ Analysis error after {_retry} retries: {logs}", None, None, None, False
327
+
247
328
  if result.get("severity") == "Low" or not optimized_code:
248
- return func_name, None, None, " No critical bottlenecks detected. Code is optimal.", None, None, None, False
329
+ confirmed = f" (confirmed after {_retry} retries)" if _retry > 0 else ""
330
+ return func_name, None, None, f"✅ No significant bottlenecks found{confirmed}.", None, None, None, False
249
331
 
250
332
  # 3. Verification + AI-free hardware profiling
251
333
  verification = None
@@ -260,6 +342,8 @@ def process_function(func: dict, language: str, agent: AnalyzerAgent, sandbox: C
260
342
  else:
261
343
  _log(func_name, "Generating correctness test cases...")
262
344
  test_cases = agent.generate_test_cases(func_name, original_code, language, context, num_cases=tier_limits["num_test_cases"])
345
+ if memory:
346
+ memory.store_test_cases(original_code, test_cases)
263
347
  _log(func_name, "Running correctness verification in Docker sandbox...")
264
348
  verification = sandbox.verify(
265
349
  csv_output=logs,
@@ -290,11 +374,29 @@ def process_function(func: dict, language: str, agent: AnalyzerAgent, sandbox: C
290
374
 
291
375
  except Exception as e:
292
376
  err_str = str(e)
293
- if "context" in err_str.lower() and "limit" in err_str.lower():
294
- _log(func_name, f"Context limit hit: {e}", style="bold yellow")
377
+ err_low = err_str.lower()
378
+ if "context" in err_low and "limit" in err_low:
379
+ _log(func_name, "Context limit hit", style="bold yellow")
295
380
  return func_name, None, None, (
296
- f"⚠️ Context limit: {err_str}\n"
297
- f"Try a model with a larger context window, or split the function."
381
+ "⚠️ Context limit — try a model with a larger context window, "
382
+ "or split the function into smaller pieces."
383
+ ), None, None, None, False
384
+ if any(k in err_low for k in ("cannot connect", "connection refused", "docker")):
385
+ _log(func_name, "Docker unavailable", style="bold yellow")
386
+ return func_name, None, None, (
387
+ "⚠️ Docker is not running — start Docker Desktop and try again.\n"
388
+ " Skip the sandbox with: coreinsight analyze --no-docker <file>"
389
+ ), None, None, None, False
390
+ if "timeout" in err_low or "timed out" in err_low:
391
+ _log(func_name, "Sandbox timed out", style="bold yellow")
392
+ return func_name, None, None, (
393
+ "⚠️ Sandbox timed out — the benchmark likely contains an infinite loop.\n"
394
+ " The LLM analysis result above is still valid."
395
+ ), None, None, None, False
396
+ if "out of memory" in err_low or "oom" in err_low:
397
+ _log(func_name, "Sandbox OOM", style="bold yellow")
398
+ return func_name, None, None, (
399
+ "⚠️ Sandbox ran out of memory. Try --no-docker or reduce the file size."
298
400
  ), None, None, None, False
299
401
  _log(func_name, f"Failed: {e}", style="bold red")
300
402
  return func_name, None, None, f"❌ Analysis failed: {err_str}", None, None, None, False
@@ -689,7 +791,12 @@ def run_analysis(file_path: str, no_docker: bool = False, tui_console=None, stre
689
791
  console.print(f"[red]Initialization Error:[/red] {e}")
690
792
  sys.exit(1)
691
793
 
692
- mode_label = "[bold cyan]Multi-Agent[/bold cyan]" if agent_mode == "multi" else "[dim]Single-Agent[/dim]"
794
+ if agent_mode == "multi":
795
+ mode_label = "[bold cyan]Multi-Agent[/bold cyan]"
796
+ elif agent_mode == "auto":
797
+ mode_label = "[cyan]Auto[/cyan]"
798
+ else:
799
+ mode_label = "Single-Agent"
693
800
  console.print(f"[dim]⚙️ Agent mode: {mode_label}[/dim]")
694
801
 
695
802
  mem_count = memory.stats().get("count", 0)
@@ -760,7 +867,15 @@ def run_analysis(file_path: str, no_docker: bool = False, tui_console=None, stre
760
867
 
761
868
  except Exception as exc:
762
869
  with print_lock:
763
- console.print(f"[bold red]❌ Critical failure in thread processing {func['name']}:[/bold red] {exc}")
870
+ exc_low = str(exc).lower()
871
+ if any(k in exc_low for k in ("docker", "cannot connect", "connection refused")):
872
+ console.print(f"[bold yellow]⚠️ {func['name']}: Docker unavailable — start Docker Desktop and retry.[/bold yellow]")
873
+ elif "timeout" in exc_low or "timed out" in exc_low:
874
+ console.print(f"[bold yellow]⚠️ {func['name']}: Sandbox timed out.[/bold yellow]")
875
+ elif "out of memory" in exc_low or "oom" in exc_low:
876
+ console.print(f"[bold yellow]⚠️ {func['name']}: Sandbox ran out of memory.[/bold yellow]")
877
+ else:
878
+ console.print(f"[bold red]❌ {func['name']}: Unexpected error — {exc}[/bold red]")
764
879
 
765
880
  console.print(Panel.fit(f"✅ [bold green]Analysis Complete![/bold green] Final report saved to:\n{report_path.absolute()}"))
766
881
 
@@ -853,6 +968,136 @@ def run_demo(lang: str = "python", no_docker: bool = False, tui_console=None):
853
968
 
854
969
  run_analysis(str(demo_dir / entry_file), no_docker=no_docker, tui_console=tui_console)
855
970
 
971
+ def _run_test_cmd(func_name: str, no_docker: bool = False):
972
+ """
973
+ Re-run correctness verification for a stored optimized function.
974
+ LLM-free on all subsequent calls. On first call after an old analyze
975
+ run (before test case persistence was added), generates test cases
976
+ once via LLM and stores them so future calls need no LLM.
977
+ """
978
+ from coreinsight.memory import OptimizationMemory
979
+ from coreinsight.sandbox import CodeSandbox
980
+
981
+ mem = OptimizationMemory()
982
+ record = mem.lookup_by_name(func_name)
983
+
984
+ if not record:
985
+ console.print(
986
+ f"[yellow]No memory record found for '[bold]{func_name}[/bold]'.[/yellow]\n"
987
+ f"[dim]Run [cyan]coreinsight analyze[/cyan] on a file containing this function first.[/dim]"
988
+ )
989
+ return
990
+
991
+ language = record["language"]
992
+ original_code = record["original_code"]
993
+ optimized_code = record["optimized_code"]
994
+ test_cases = record["test_cases"]
995
+
996
+ console.print(Panel.fit(
997
+ f"Re-verifying [bold cyan]{func_name}[/bold cyan] ({language})",
998
+ border_style="cyan",
999
+ ))
1000
+
1001
+ if not optimized_code:
1002
+ console.print("[red]Optimized code not found in memory store.[/red]")
1003
+ return
1004
+
1005
+ # ── One-time LLM fallback for functions analyzed before test case persistence ──
1006
+ if not test_cases:
1007
+ console.print(
1008
+ "[yellow]⚠ No test cases stored for this function.[/yellow]\n"
1009
+ "[dim]Generating once via LLM — all future calls will be LLM-free...[/dim]"
1010
+ )
1011
+ try:
1012
+ from coreinsight.analyzer import AnalyzerAgent
1013
+ from coreinsight.config import get_model_tier, get_tier_limits
1014
+ config = load_config()
1015
+ provider = config.get("provider", "ollama")
1016
+ model_name = config.get("model_name", "llama3.2")
1017
+ api_keys = config.get("api_keys", {})
1018
+ model_tier = get_model_tier(provider, model_name)
1019
+ tier_limits = get_tier_limits(config)
1020
+ agent = AnalyzerAgent(
1021
+ provider=provider,
1022
+ model_name=model_name,
1023
+ api_keys=api_keys,
1024
+ model_tier=model_tier,
1025
+ )
1026
+ test_cases = agent.generate_test_cases(
1027
+ func_name, original_code, language,
1028
+ context="",
1029
+ num_cases=tier_limits["num_test_cases"],
1030
+ )
1031
+ except Exception as exc:
1032
+ console.print(f"[red]LLM error generating test cases: {exc}[/red]")
1033
+ return
1034
+
1035
+ if not test_cases:
1036
+ console.print(
1037
+ "[red]LLM returned no test cases. "
1038
+ "Check your provider config with [cyan]coreinsight configure[/cyan].[/red]"
1039
+ )
1040
+ return
1041
+
1042
+ mem.store_test_cases(original_code, test_cases)
1043
+ console.print(
1044
+ f"[dim]✓ Generated and stored {len(test_cases)} test case(s). "
1045
+ f"Future calls to [cyan]coreinsight test {func_name}[/cyan] need no LLM.[/dim]"
1046
+ )
1047
+
1048
+ # ── Correctness sandbox — no LLM from this point ──────────────────────
1049
+ sandbox = CodeSandbox(disabled=no_docker)
1050
+
1051
+ if language in ("cpp", "c++", "cuda"):
1052
+ # C++/CUDA correctness harness is embedded by HarnessAgent at analysis
1053
+ # time and cannot be reconstructed post-hoc. Show stored result instead.
1054
+ meta = record["meta"]
1055
+ passed_cases = int(meta.get("correctness_cases", 0))
1056
+ total_cases = int(meta.get("total_cases", 0))
1057
+ if total_cases > 0:
1058
+ all_passed = passed_cases == total_cases
1059
+ badge = "[bold green]✓ PASS[/bold green]" if all_passed else "[bold yellow]⚠ PARTIAL[/bold yellow]"
1060
+ console.print(
1061
+ f"{badge} — Stored result: "
1062
+ f"{passed_cases}/{total_cases} test cases passed at analysis time."
1063
+ )
1064
+ else:
1065
+ console.print(
1066
+ "[dim]No stored correctness result for this function.[/dim]"
1067
+ )
1068
+ console.print(
1069
+ "[dim]C++ re-verification requires re-running analysis. "
1070
+ "Full results in [cyan]coreinsight memory[/cyan].[/dim]"
1071
+ )
1072
+ return
1073
+
1074
+ console.print(f"[dim]Running {len(test_cases)} test case(s) in Docker sandbox...[/dim]")
1075
+
1076
+ result = sandbox.verify_correctness_only(
1077
+ original_code=original_code,
1078
+ optimized_code=optimized_code,
1079
+ original_func_name=func_name,
1080
+ optimized_func_name=func_name,
1081
+ test_cases=test_cases,
1082
+ language=language,
1083
+ )
1084
+
1085
+ if result.verified:
1086
+ console.print(
1087
+ f"[bold green]✓ PASS[/bold green] — "
1088
+ f"{result.passed_cases}/{result.total_cases} test cases passed."
1089
+ )
1090
+ else:
1091
+ console.print(
1092
+ f"[bold red]✗ FAIL[/bold red] — "
1093
+ f"{result.passed_cases}/{result.total_cases} test cases passed."
1094
+ )
1095
+ for failure in result.failures[:10]:
1096
+ console.print(f" [red]✗[/red] {failure}")
1097
+
1098
+ if result.details:
1099
+ console.print(f"[dim]{result.details}[/dim]")
1100
+
856
1101
  def _run_memory_cmd(clear: bool, export_path: str = None, export_fmt: str = "csv"):
857
1102
  from coreinsight.memory import OptimizationMemory, MEMORY_DIR
858
1103
  import shutil
@@ -915,6 +1160,7 @@ def _run_memory_cmd(clear: bool, export_path: str = None, export_fmt: str = "csv
915
1160
  table.add_column("Function", justify="left", style="bold white")
916
1161
  table.add_column("Language", justify="center", style="cyan", width=10)
917
1162
  table.add_column("Speedup", justify="right", style="bold green", width=9)
1163
+ table.add_column("Tests", justify="right", style="green", width=10)
918
1164
  table.add_column("Severity", justify="center", width=10)
919
1165
  table.add_column("Issue", justify="left", style="dim white")
920
1166
  table.add_column("HW Evidence",justify="left", style="dim", width=22)
@@ -935,19 +1181,23 @@ def _run_memory_cmd(clear: bool, export_path: str = None, export_fmt: str = "csv
935
1181
  )
936
1182
 
937
1183
  for i, (meta, rid) in enumerate(paired, start=1):
938
- sev = meta.get("severity", "High")
939
- sev_c = severity_colors.get(sev, "white")
940
- ts = meta.get("timestamp", "")[:19].replace("T", " ")
941
- hw = meta.get("profiler_summary", "") or "—"
942
- issue = (meta.get("issue", "") or "—")[:60]
1184
+ sev = meta.get("severity", "High")
1185
+ sev_c = severity_colors.get(sev, "white")
1186
+ ts = meta.get("timestamp", "")[:19].replace("T", " ")
1187
+ hw = meta.get("profiler_summary", "") or "—"
1188
+ issue = (meta.get("issue", "") or "—")[:60]
943
1189
  if len(meta.get("issue", "")) > 60:
944
1190
  issue += "…"
1191
+ passed_c = int(meta.get("correctness_cases", 0))
1192
+ total_c = int(meta.get("total_cases", 0))
1193
+ tests_str = f"{passed_c}/{total_c}" if total_c > 0 else "—"
945
1194
 
946
1195
  table.add_row(
947
1196
  str(i),
948
1197
  meta.get("func_name", rid[:12]),
949
1198
  meta.get("language", "?"),
950
1199
  f"{float(meta.get('avg_speedup', 0)):.2f}x",
1200
+ tests_str,
951
1201
  f"[{sev_c}]{sev}[/{sev_c}]",
952
1202
  issue,
953
1203
  hw,
@@ -1005,6 +1255,11 @@ def main_cli():
1005
1255
  scan_parser.add_argument("--dir", default=".", help="Directory to scan")
1006
1256
  scan_parser.add_argument("--top", type=int, default=10, help="Number of hotspots to show")
1007
1257
 
1258
+ test_parser = subparsers.add_parser("test", help="Re-run verification sandbox for a stored function")
1259
+ test_parser.add_argument("func_name", help="Name of the function to re-verify")
1260
+ test_parser.add_argument("--no-docker", dest="no_docker", action="store_true",
1261
+ help="Skip Docker (will report skipped)")
1262
+
1008
1263
  args = parser.parse_args()
1009
1264
 
1010
1265
  if args.command == "configure":
@@ -1025,6 +1280,11 @@ def main_cli():
1025
1280
  export_path=getattr(args, "export_path", None),
1026
1281
  export_fmt=getattr(args, "export_fmt", "csv"),
1027
1282
  )
1283
+ elif args.command == "test":
1284
+ _run_test_cmd(
1285
+ func_name=args.func_name,
1286
+ no_docker=getattr(args, "no_docker", False),
1287
+ )
1028
1288
  elif args.command == "scan":
1029
1289
  scanner = ProjectScanner(args.dir)
1030
1290
  scanner.scan_project(max_results=args.top)
@@ -47,6 +47,8 @@ class MemoryHit:
47
47
  language: str
48
48
  severity: str = "High"
49
49
  correctness_cases: int = 0
50
+ total_cases: int = 0
51
+ test_cases: list = field(default_factory=list)
50
52
  profiler_summary: str = ""
51
53
 
52
54
 
@@ -302,6 +304,7 @@ class OptimizationMemory:
302
304
  "reasoning": (result.get("reasoning") or "")[:1000],
303
305
  "severity": result.get("severity", "High"),
304
306
  "correctness_cases": verification.correctness.passed_cases,
307
+ "total_cases": verification.correctness.total_cases,
305
308
  "profiler_summary": profiler_summary[:200],
306
309
  "timestamp": datetime.now(timezone.utc).isoformat(),
307
310
  }
@@ -329,10 +332,87 @@ class OptimizationMemory:
329
332
  except Exception as exc:
330
333
  return {"count": 0, "error": str(exc)}
331
334
 
335
+ def lookup_test_cases(self, original_code: str) -> Optional[list]:
336
+ """
337
+ Return stored test cases for `original_code`, or None if not found.
338
+ Used to re-run correctness without regenerating via LLM.
339
+ """
340
+ if not self._ensure_db():
341
+ return None
342
+ h = self.ast_hash(original_code)
343
+ return self._load_test_cases(h)
344
+
345
+ def store_test_cases(self, original_code: str, test_cases: list) -> None:
346
+ """
347
+ Persist test cases for a function, keyed by AST hash.
348
+ Called from process_function immediately after test cases are generated,
349
+ so `coreinsight test` can re-run verification without the LLM.
350
+ """
351
+ if not self._ensure_db():
352
+ return
353
+ h = self.ast_hash(original_code)
354
+ with self._write_lock:
355
+ try:
356
+ self._code_dir.mkdir(parents=True, exist_ok=True)
357
+ self._save_test_cases(h, test_cases)
358
+ except Exception as exc:
359
+ logger.debug(f"store_test_cases failed: {exc}")
360
+
361
+ def lookup_by_name(self, func_name: str) -> Optional[dict]:
362
+ """
363
+ Find the most recent memory record whose func_name matches exactly.
364
+ Returns a dict with keys: func_name, language, original_code,
365
+ optimized_code, test_cases, meta. Returns None on no match.
366
+ """
367
+ if not self._ensure_db():
368
+ return None
369
+ try:
370
+ all_records = self._collection.get(
371
+ include=["metadatas", "documents"]
372
+ )
373
+ matches = [
374
+ (meta, doc, rid)
375
+ for meta, doc, rid in zip(
376
+ all_records.get("metadatas", []),
377
+ all_records.get("documents", []),
378
+ all_records.get("ids", []),
379
+ )
380
+ if meta.get("func_name") == func_name
381
+ ]
382
+ if not matches:
383
+ return None
384
+ # Most recent first
385
+ matches.sort(key=lambda x: x[0].get("timestamp", ""), reverse=True)
386
+ meta, original_code, h = matches[0]
387
+ return {
388
+ "func_name": func_name,
389
+ "language": meta.get("language", ""),
390
+ "original_code": original_code or "",
391
+ "optimized_code": self._load_code(h) or "",
392
+ "test_cases": self._load_test_cases(h) or [],
393
+ "meta": meta,
394
+ }
395
+ except Exception as exc:
396
+ logger.debug(f"lookup_by_name failed: {exc}")
397
+ return None
398
+
332
399
  # ------------------------------------------------------------------ #
333
400
  # Internal helpers
334
401
  # ------------------------------------------------------------------ #
335
402
 
403
+ def _save_test_cases(self, h: str, cases: list) -> None:
404
+ path = self._code_dir / f"{h}.test_cases.json"
405
+ path.write_text(json.dumps(cases), encoding="utf-8")
406
+
407
+ def _load_test_cases(self, h: str) -> Optional[list]:
408
+ path = self._code_dir / f"{h}.test_cases.json"
409
+ if not path.exists():
410
+ return None
411
+ try:
412
+ return json.loads(path.read_text(encoding="utf-8"))
413
+ except Exception:
414
+ return None
415
+
336
416
  def _save_code(self, h: str, language: str, code: str) -> None:
337
417
  ext = {"python": "py", "cpp": "cpp", "c++": "cpp", "cuda": "cu"}.get(language, "txt")
338
418
  path = self._code_dir / f"{h}.{ext}"
@@ -364,5 +444,6 @@ class OptimizationMemory:
364
444
  language= meta.get("language", ""),
365
445
  severity= meta.get("severity", "High"),
366
446
  correctness_cases= int(meta.get("correctness_cases", 0)),
447
+ total_cases= int(meta.get("total_cases", 0)),
367
448
  profiler_summary= meta.get("profiler_summary", ""),
368
449
  )
@@ -90,7 +90,8 @@ GRADING RUBRIC AND INSTRUCTIONS (APPLY ONLY THE SPECIFIC RUBRIC FOR {language}):
90
90
  INSTRUCTIONS:
91
91
  1. Actively hunt for Medium, High, and Critical issues based ONLY on the specific {language} rubric above. Do not hallucinate GPU concepts for Python code unless PyTorch/CUDA is explicitly used.
92
92
  2. If you find an issue, you MUST explain the hardware-level or interpreter-level reasoning clearly (e.g., CPU cache misses, GIL contention, memory latency).
93
- 3. CODE GENERATION MANDATE: You MUST provide the completely rewritten, optimized function in the `optimized_code` field. The code must be raw, syntactically correct {language} code ready to be compiled/run. Do NOT leave this field empty. Do NOT wrap the code in markdown backticks (e.g., ```cpp) inside the JSON string.
93
+ 3. SEVERITY BIAS: When uncertain between two severity levels, always choose the higher one. A false negative (missing a real bottleneck) is always worse than a false positive. Only assign Low severity if you can explicitly prove the algorithm is already optimal for the target hardware — state the time complexity, memory access pattern, and why no better approach exists. "No obvious issues" is NOT sufficient justification for Low.
94
+ 4. CODE GENERATION MANDATE: You MUST provide the completely rewritten, optimized function in the `optimized_code` field. The code must be raw, syntactically correct {language} code ready to be compiled/run. Do NOT leave this field empty. Do NOT wrap the code in markdown backticks (e.g., ```cpp) inside the JSON string.
94
95
  """
95
96
 
96
97
  # ---------------------------------------------------------------------------
@@ -143,10 +144,10 @@ GRADING RUBRIC (apply only the {language} section):
143
144
  - Low: Trivial stylistic issues only.
144
145
 
145
146
  INSTRUCTIONS:
146
- 1. Identify the single most impactful bottleneck — do not list everything, find the root cause.
147
- 2. Explain the hardware-level or interpreter-level reasoning precisely.
148
- 3. Set `optimized_code` to null code generation happens in a separate agent.
149
- 4. If the code is genuinely optimal, set severity to Low and explain why.
147
+ 1. Identify the single most impactful bottleneck — do not list everything, find the root cause. If no Critical or High issue exists, identify the most significant Medium issue. Do NOT default to Low out of uncertainty.
148
+ 2. Explain the hardware-level or interpreter-level reasoning precisely — name the specific mechanism (e.g., "O(N²) comparisons cause cache thrashing on arrays larger than L2 cache", "GIL held across network I/O blocks all threads").
149
+ 3. CRITICAL: Set `optimized_code` to null. Any non-null value in this field will corrupt the pipeline. Code generation is handled by a separate agent.
150
+ 4. SEVERITY BIAS: When uncertain between two severity levels, always choose the higher one. Only assign Low if you can explicitly prove algorithmic optimality — state the time complexity, memory access pattern, and why no better approach exists for the target hardware. "No obvious issues" is NOT sufficient justification for Low.
150
151
 
151
152
  {format_instructions}
152
153
  """
@@ -254,8 +255,9 @@ REQUIREMENTS:
254
255
  1. Rewrite ONLY the function named {func_name} — preserve its signature exactly.
255
256
  2. Fix the identified bottleneck using the suggestion as your guide.
256
257
  3. The function must be self-contained and correct.
257
- 4. Raw {language} code only no explanation, no markdown fences, no JSON.
258
- 5. Do NOT rename the function.
258
+ 4. VERIFICATION: Before outputting, mentally confirm: does the rewrite directly eliminate the identified bottleneck? If the issue was O(N²), confirm the new complexity is O(N log N) or better. If the issue was a Python loop, confirm it is vectorized with NumPy/PyTorch. If the issue was a deep copy, confirm it is eliminated. Do not output a rewrite that only partially addresses the bottleneck.
259
+ 5. Raw {language} code only — no explanation, no markdown fences, no JSON.
260
+ 6. Do NOT rename the function.
259
261
  """
260
262
 
261
263
  # ── Per-tier addenda for multi-agent harness (same scaffolding pattern) ──────
@@ -150,6 +150,35 @@ class VerificationResult:
150
150
  return "\n".join(lines)
151
151
 
152
152
 
153
+ def _format_sandbox_error(exc: Exception, language: str = "") -> str:
154
+ """Map raw Docker / OS exceptions to user-friendly one-liners."""
155
+ msg = str(exc).lower()
156
+ if "timeout" in msg or "timed out" in msg or "read timeout" in msg:
157
+ return (
158
+ "⚠️ Sandbox timed out — the benchmark likely contains an infinite loop "
159
+ "or extremely slow path. Try --no-docker to skip the sandbox."
160
+ )
161
+ if "out of memory" in msg or "oom" in msg or ("memory" in msg and "kill" in msg):
162
+ return (
163
+ "⚠️ Sandbox ran out of memory (OOM). "
164
+ "Reduce N sizes in the harness or use --no-docker."
165
+ )
166
+ if "no such image" in msg or "pull access" in msg or "not found" in msg:
167
+ lang_label = f" ({language})" if language else ""
168
+ return (
169
+ f"⚠️ Sandbox Docker image not found{lang_label}. "
170
+ "It should have been built on first run — try `docker images` to check."
171
+ )
172
+ if "cannot connect" in msg or "connection refused" in msg or "docker" in msg:
173
+ return (
174
+ "⚠️ Docker is not running. "
175
+ "Start Docker Desktop (or the Docker daemon) and try again."
176
+ )
177
+ if "permission denied" in msg:
178
+ return "⚠️ Sandbox permission error — Docker may lack access to the temp directory."
179
+ return f"⚠️ Sandbox error: {exc}"
180
+
181
+
153
182
  class CodeSandbox:
154
183
  def __init__(self, disabled: bool = False):
155
184
  self.disabled = disabled
@@ -277,7 +306,7 @@ class CodeSandbox:
277
306
  return False, f"Missing CSV output (exit {exit_code}).\nFull output:\n{raw_logs}", None
278
307
 
279
308
  except Exception as e:
280
- return False, f"Sandbox error: {str(e)}", None
309
+ return False, _format_sandbox_error(e, language), None
281
310
 
282
311
  finally:
283
312
  if container:
@@ -312,6 +341,45 @@ class CodeSandbox:
312
341
  )
313
342
  return VerificationResult(speedup=speedup_result, correctness=correctness_result)
314
343
 
344
+ def verify_correctness_only(
345
+ self,
346
+ original_code: str,
347
+ optimized_code: str,
348
+ original_func_name: str,
349
+ optimized_func_name: str,
350
+ test_cases: List[Dict[str, Any]],
351
+ language: str = "python",
352
+ timeout_seconds: int = 60,
353
+ context: str = "",
354
+ ) -> CorrectnessVerification:
355
+ """
356
+ Re-run correctness sandbox only — no speedup check, no LLM.
357
+ Used by `coreinsight test <function_name>`.
358
+
359
+ C++ and CUDA are not supported: their correctness harness is a
360
+ main() block embedded by HarnessAgent at analysis time and cannot
361
+ be reconstructed from stored test cases alone.
362
+ """
363
+ if self.disabled:
364
+ return CorrectnessVerification(verified=False, details=SANDBOX_SKIPPED_MSG)
365
+ if not self.client:
366
+ return CorrectnessVerification(verified=False, details="Docker unavailable.")
367
+ if language in ("cpp", "c++", "cuda"):
368
+ return CorrectnessVerification(
369
+ verified=False,
370
+ details=(
371
+ f"Re-verification not supported for {language}: "
372
+ "correctness harness is embedded at analysis time. "
373
+ "See stored pass rate in `coreinsight memory`."
374
+ ),
375
+ )
376
+ return self._verify_correctness(
377
+ original_code, optimized_code,
378
+ original_func_name, optimized_func_name,
379
+ test_cases, language, timeout_seconds,
380
+ context=context,
381
+ )
382
+
315
383
  def _verify_speedup(self, csv_output: str) -> SpeedupVerification:
316
384
  result = SpeedupVerification(verified=False)
317
385
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coreinsight-cli
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Local-first AI performance profiler that mathematically verifies optimizations for Python, C++, and CUDA
5
5
  Author: Varun Jani
6
6
  License: GPL-3.0-or-later
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "coreinsight-cli"
7
- version = "0.3.0"
7
+ version = "0.3.2"
8
8
  description = "Local-first AI performance profiler that mathematically verifies optimizations for Python, C++, and CUDA"
9
9
  license = {text = "GPL-3.0-or-later"}
10
10
  authors = [
File without changes