coreinsight-cli 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coreinsight_cli-0.3.0/coreinsight_cli.egg-info → coreinsight_cli-0.3.2}/PKG-INFO +1 -1
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/main.py +289 -29
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/memory.py +81 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/prompts.py +9 -7
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/sandbox.py +69 -1
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2/coreinsight_cli.egg-info}/PKG-INFO +1 -1
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/pyproject.toml +1 -1
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/LICENSE +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/README.md +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/__init__.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/analyzer.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/config.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/demo/__init__.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/demo/bad_loop.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/demo/data_processor.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/demo/slow.cpp +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/embeddings.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/hardware.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/indexer.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/parser.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/profiler.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/scanner.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight/tui.py +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/SOURCES.txt +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/dependency_links.txt +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/entry_points.txt +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/requires.txt +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/top_level.txt +0 -0
- {coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/setup.cfg +0 -0
|
@@ -72,12 +72,10 @@ def _check_speedup_success(success: bool, logs: str) -> bool:
|
|
|
72
72
|
pass
|
|
73
73
|
return False
|
|
74
74
|
|
|
75
|
-
|
|
76
75
|
def _run_single_agent(
|
|
77
76
|
func_name, original_code, language, context,
|
|
78
77
|
hardware_target, sandbox, agent, tier_limits,
|
|
79
|
-
stream_callback: Optional[Callable[[str], None]] = None
|
|
80
|
-
):
|
|
78
|
+
stream_callback: Optional[Callable[[str], None]] = None):
|
|
81
79
|
"""
|
|
82
80
|
Original single-agent pipeline.
|
|
83
81
|
Returns (result, optimized_code, success, logs, plot_data, is_valid).
|
|
@@ -133,12 +131,10 @@ def _run_single_agent(
|
|
|
133
131
|
|
|
134
132
|
return result, optimized_code, success, logs, plot_data, is_valid
|
|
135
133
|
|
|
136
|
-
|
|
137
134
|
def _run_multi_agent(
|
|
138
135
|
func_name, original_code, language, context,
|
|
139
136
|
hardware_target, sandbox, multi_agents, tier_limits,
|
|
140
|
-
stream_callback: Optional[Callable[[str], None]] = None
|
|
141
|
-
):
|
|
137
|
+
stream_callback: Optional[Callable[[str], None]] = None):
|
|
142
138
|
"""
|
|
143
139
|
Multi-agent pipeline.
|
|
144
140
|
BottleneckAgent → analysis
|
|
@@ -209,21 +205,78 @@ def process_function(func: dict, language: str, agent: AnalyzerAgent, sandbox: C
|
|
|
209
205
|
_log(func_name, "Fetching RAG context...")
|
|
210
206
|
context = indexer.get_context_for_code(original_code) if indexer else ""
|
|
211
207
|
|
|
212
|
-
# 0b. Memory lookup — skip LLM
|
|
208
|
+
# 0b. Memory lookup — skip LLM if we've seen this pattern before,
|
|
209
|
+
# but validate the stored result before trusting it:
|
|
210
|
+
# Gate A: no optimized code stored → previous run was incomplete, re-run LLM
|
|
211
|
+
# Gate B: correctness < 50% last run → keep analysis, re-run correctness only
|
|
212
|
+
# Gate C: result is good → return as-is
|
|
213
213
|
if memory:
|
|
214
214
|
memory_hit = memory.lookup(original_code, language)
|
|
215
215
|
if memory_hit:
|
|
216
216
|
label = "exact match" if memory_hit.is_exact else f"similarity {memory_hit.similarity:.1%}"
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
"
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
217
|
+
|
|
218
|
+
# Gate A: stored result has no optimized code — not useful, fall through to LLM
|
|
219
|
+
if not memory_hit.optimized_code:
|
|
220
|
+
_log(func_name, f"Memory hit ({label}) — no optimized code stored, re-running LLM", style="yellow")
|
|
221
|
+
memory_hit = None # fall through; LLM path runs below as normal
|
|
222
|
+
|
|
223
|
+
# Gate B: correctness was poor last time — re-run the correctness check only
|
|
224
|
+
elif memory_hit.total_cases > 0 and memory_hit.correctness_cases / memory_hit.total_cases < 0.5:
|
|
225
|
+
_log(
|
|
226
|
+
func_name,
|
|
227
|
+
f"Memory hit ({label}) — correctness was "
|
|
228
|
+
f"{memory_hit.correctness_cases}/{memory_hit.total_cases} last run, re-checking",
|
|
229
|
+
style="yellow",
|
|
230
|
+
)
|
|
231
|
+
recalled_result = {
|
|
232
|
+
"severity": memory_hit.severity,
|
|
233
|
+
"issue": memory_hit.issue,
|
|
234
|
+
"reasoning": memory_hit.reasoning,
|
|
235
|
+
"optimized_code": memory_hit.optimized_code,
|
|
236
|
+
"suggestion": "",
|
|
237
|
+
"bottlenecks": [],
|
|
238
|
+
}
|
|
239
|
+
new_verification = None
|
|
240
|
+
if not getattr(sandbox, "disabled", False):
|
|
241
|
+
stored_cases = memory.lookup_test_cases(original_code)
|
|
242
|
+
if stored_cases:
|
|
243
|
+
_log(func_name, "Re-running correctness sandbox with stored test cases...", style="dim")
|
|
244
|
+
correctness = sandbox.verify_correctness_only(
|
|
245
|
+
original_code=original_code,
|
|
246
|
+
optimized_code=memory_hit.optimized_code,
|
|
247
|
+
original_func_name=func_name,
|
|
248
|
+
optimized_func_name=func_name,
|
|
249
|
+
test_cases=stored_cases,
|
|
250
|
+
language=language,
|
|
251
|
+
context=context,
|
|
252
|
+
)
|
|
253
|
+
_log(func_name, f"Re-verification: {correctness.passed_cases}/{correctness.total_cases} passed", style="dim")
|
|
254
|
+
try:
|
|
255
|
+
from coreinsight.sandbox import VerificationResult, SpeedupVerification
|
|
256
|
+
new_verification = VerificationResult(
|
|
257
|
+
speedup=SpeedupVerification(
|
|
258
|
+
verified=True,
|
|
259
|
+
computed_speedups=[memory_hit.avg_speedup] if memory_hit.avg_speedup else [],
|
|
260
|
+
details=f"Speedup recalled from memory: {memory_hit.avg_speedup:.2f}x",
|
|
261
|
+
),
|
|
262
|
+
correctness=correctness,
|
|
263
|
+
)
|
|
264
|
+
except Exception:
|
|
265
|
+
pass # verification display is non-critical
|
|
266
|
+
return func_name, recalled_result, None, None, new_verification, None, memory_hit, False
|
|
267
|
+
|
|
268
|
+
# Gate C: stored result is complete and correctness is acceptable
|
|
269
|
+
else:
|
|
270
|
+
_log(func_name, f"⚡ Recalled from memory ({label}) — skipping LLM", style="bold cyan")
|
|
271
|
+
recalled_result = {
|
|
272
|
+
"severity": memory_hit.severity,
|
|
273
|
+
"issue": memory_hit.issue,
|
|
274
|
+
"reasoning": memory_hit.reasoning,
|
|
275
|
+
"optimized_code": memory_hit.optimized_code,
|
|
276
|
+
"suggestion": "",
|
|
277
|
+
"bottlenecks": [],
|
|
278
|
+
}
|
|
279
|
+
return func_name, recalled_result, None, None, None, None, memory_hit, False
|
|
227
280
|
|
|
228
281
|
# ── Route: single-agent vs multi-agent ──────────────────────────
|
|
229
282
|
if agent_mode == "multi" and multi_agents:
|
|
@@ -244,8 +297,37 @@ def process_function(func: dict, language: str, agent: AnalyzerAgent, sandbox: C
|
|
|
244
297
|
if result is None:
|
|
245
298
|
return func_name, None, None, f"❌ Analysis error: {logs}", None, None, None, False
|
|
246
299
|
|
|
300
|
+
# Retry gate: Low severity or missing optimized code often means the model
|
|
301
|
+
# defaulted to "looks fine" rather than truly auditing.
|
|
302
|
+
# Retry up to 2 times before accepting the conclusion.
|
|
303
|
+
_MAX_ANALYSIS_RETRIES = 2
|
|
304
|
+
_retry = 0
|
|
305
|
+
while (result.get("severity") == "Low" or not optimized_code) and _retry < _MAX_ANALYSIS_RETRIES:
|
|
306
|
+
_retry += 1
|
|
307
|
+
_log(func_name, f"Low/missing result — retrying analysis ({_retry}/{_MAX_ANALYSIS_RETRIES})...", style="yellow")
|
|
308
|
+
if agent_mode == "multi" and multi_agents:
|
|
309
|
+
result, optimized_code, success, logs, plot_data, is_valid_optimization = \
|
|
310
|
+
_run_multi_agent(
|
|
311
|
+
func_name, original_code, language, context,
|
|
312
|
+
hardware_target, sandbox, multi_agents, tier_limits,
|
|
313
|
+
stream_callback=stream_callback,
|
|
314
|
+
)
|
|
315
|
+
else:
|
|
316
|
+
result, optimized_code, success, logs, plot_data, is_valid_optimization = \
|
|
317
|
+
_run_single_agent(
|
|
318
|
+
func_name, original_code, language, context,
|
|
319
|
+
hardware_target, sandbox, agent, tier_limits,
|
|
320
|
+
stream_callback=stream_callback,
|
|
321
|
+
)
|
|
322
|
+
if result is None:
|
|
323
|
+
break
|
|
324
|
+
|
|
325
|
+
if result is None:
|
|
326
|
+
return func_name, None, None, f"❌ Analysis error after {_retry} retries: {logs}", None, None, None, False
|
|
327
|
+
|
|
247
328
|
if result.get("severity") == "Low" or not optimized_code:
|
|
248
|
-
|
|
329
|
+
confirmed = f" (confirmed after {_retry} retries)" if _retry > 0 else ""
|
|
330
|
+
return func_name, None, None, f"✅ No significant bottlenecks found{confirmed}.", None, None, None, False
|
|
249
331
|
|
|
250
332
|
# 3. Verification + AI-free hardware profiling
|
|
251
333
|
verification = None
|
|
@@ -260,6 +342,8 @@ def process_function(func: dict, language: str, agent: AnalyzerAgent, sandbox: C
|
|
|
260
342
|
else:
|
|
261
343
|
_log(func_name, "Generating correctness test cases...")
|
|
262
344
|
test_cases = agent.generate_test_cases(func_name, original_code, language, context, num_cases=tier_limits["num_test_cases"])
|
|
345
|
+
if memory:
|
|
346
|
+
memory.store_test_cases(original_code, test_cases)
|
|
263
347
|
_log(func_name, "Running correctness verification in Docker sandbox...")
|
|
264
348
|
verification = sandbox.verify(
|
|
265
349
|
csv_output=logs,
|
|
@@ -290,11 +374,29 @@ def process_function(func: dict, language: str, agent: AnalyzerAgent, sandbox: C
|
|
|
290
374
|
|
|
291
375
|
except Exception as e:
|
|
292
376
|
err_str = str(e)
|
|
293
|
-
|
|
294
|
-
|
|
377
|
+
err_low = err_str.lower()
|
|
378
|
+
if "context" in err_low and "limit" in err_low:
|
|
379
|
+
_log(func_name, "Context limit hit", style="bold yellow")
|
|
295
380
|
return func_name, None, None, (
|
|
296
|
-
|
|
297
|
-
|
|
381
|
+
"⚠️ Context limit — try a model with a larger context window, "
|
|
382
|
+
"or split the function into smaller pieces."
|
|
383
|
+
), None, None, None, False
|
|
384
|
+
if any(k in err_low for k in ("cannot connect", "connection refused", "docker")):
|
|
385
|
+
_log(func_name, "Docker unavailable", style="bold yellow")
|
|
386
|
+
return func_name, None, None, (
|
|
387
|
+
"⚠️ Docker is not running — start Docker Desktop and try again.\n"
|
|
388
|
+
" Skip the sandbox with: coreinsight analyze --no-docker <file>"
|
|
389
|
+
), None, None, None, False
|
|
390
|
+
if "timeout" in err_low or "timed out" in err_low:
|
|
391
|
+
_log(func_name, "Sandbox timed out", style="bold yellow")
|
|
392
|
+
return func_name, None, None, (
|
|
393
|
+
"⚠️ Sandbox timed out — the benchmark likely contains an infinite loop.\n"
|
|
394
|
+
" The LLM analysis result above is still valid."
|
|
395
|
+
), None, None, None, False
|
|
396
|
+
if "out of memory" in err_low or "oom" in err_low:
|
|
397
|
+
_log(func_name, "Sandbox OOM", style="bold yellow")
|
|
398
|
+
return func_name, None, None, (
|
|
399
|
+
"⚠️ Sandbox ran out of memory. Try --no-docker or reduce the file size."
|
|
298
400
|
), None, None, None, False
|
|
299
401
|
_log(func_name, f"Failed: {e}", style="bold red")
|
|
300
402
|
return func_name, None, None, f"❌ Analysis failed: {err_str}", None, None, None, False
|
|
@@ -689,7 +791,12 @@ def run_analysis(file_path: str, no_docker: bool = False, tui_console=None, stre
|
|
|
689
791
|
console.print(f"[red]Initialization Error:[/red] {e}")
|
|
690
792
|
sys.exit(1)
|
|
691
793
|
|
|
692
|
-
|
|
794
|
+
if agent_mode == "multi":
|
|
795
|
+
mode_label = "[bold cyan]Multi-Agent[/bold cyan]"
|
|
796
|
+
elif agent_mode == "auto":
|
|
797
|
+
mode_label = "[cyan]Auto[/cyan]"
|
|
798
|
+
else:
|
|
799
|
+
mode_label = "Single-Agent"
|
|
693
800
|
console.print(f"[dim]⚙️ Agent mode: {mode_label}[/dim]")
|
|
694
801
|
|
|
695
802
|
mem_count = memory.stats().get("count", 0)
|
|
@@ -760,7 +867,15 @@ def run_analysis(file_path: str, no_docker: bool = False, tui_console=None, stre
|
|
|
760
867
|
|
|
761
868
|
except Exception as exc:
|
|
762
869
|
with print_lock:
|
|
763
|
-
|
|
870
|
+
exc_low = str(exc).lower()
|
|
871
|
+
if any(k in exc_low for k in ("docker", "cannot connect", "connection refused")):
|
|
872
|
+
console.print(f"[bold yellow]⚠️ {func['name']}: Docker unavailable — start Docker Desktop and retry.[/bold yellow]")
|
|
873
|
+
elif "timeout" in exc_low or "timed out" in exc_low:
|
|
874
|
+
console.print(f"[bold yellow]⚠️ {func['name']}: Sandbox timed out.[/bold yellow]")
|
|
875
|
+
elif "out of memory" in exc_low or "oom" in exc_low:
|
|
876
|
+
console.print(f"[bold yellow]⚠️ {func['name']}: Sandbox ran out of memory.[/bold yellow]")
|
|
877
|
+
else:
|
|
878
|
+
console.print(f"[bold red]❌ {func['name']}: Unexpected error — {exc}[/bold red]")
|
|
764
879
|
|
|
765
880
|
console.print(Panel.fit(f"✅ [bold green]Analysis Complete![/bold green] Final report saved to:\n{report_path.absolute()}"))
|
|
766
881
|
|
|
@@ -853,6 +968,136 @@ def run_demo(lang: str = "python", no_docker: bool = False, tui_console=None):
|
|
|
853
968
|
|
|
854
969
|
run_analysis(str(demo_dir / entry_file), no_docker=no_docker, tui_console=tui_console)
|
|
855
970
|
|
|
971
|
+
def _run_test_cmd(func_name: str, no_docker: bool = False):
    """
    Re-run correctness verification for a stored optimized function.

    Looks up the most recent memory record named `func_name` and then:

    * C++/CUDA records: prints the pass/total counts stored at analysis
      time and returns. No LLM call and no sandbox are involved, because
      the C++ harness cannot be rebuilt after the fact.
    * Other records: runs the stored test cases through
      `CodeSandbox.verify_correctness_only`. If the record has no stored
      test cases, they are generated once via the LLM and persisted so
      later calls need no LLM.

    Args:
        func_name: Name of the function whose stored optimization is re-verified.
        no_docker: Passed to `CodeSandbox(disabled=...)`.

    Returns:
        None. All results are printed to the console.
    """
    from coreinsight.memory import OptimizationMemory
    from coreinsight.sandbox import CodeSandbox

    mem = OptimizationMemory()
    record = mem.lookup_by_name(func_name)

    if not record:
        console.print(
            f"[yellow]No memory record found for '[bold]{func_name}[/bold]'.[/yellow]\n"
            f"[dim]Run [cyan]coreinsight analyze[/cyan] on a file containing this function first.[/dim]"
        )
        return

    language = record["language"]
    original_code = record["original_code"]
    optimized_code = record["optimized_code"]
    test_cases = record["test_cases"]

    console.print(Panel.fit(
        f"Re-verifying [bold cyan]{func_name}[/bold cyan] ({language})",
        border_style="cyan",
    ))

    if not optimized_code:
        console.print("[red]Optimized code not found in memory store.[/red]")
        return

    if language in ("cpp", "c++", "cuda"):
        # C++/CUDA correctness harness is embedded by HarnessAgent at analysis
        # time and cannot be reconstructed post-hoc. Show stored result instead.
        # This branch runs BEFORE the LLM fallback and sandbox construction:
        # it uses neither test cases nor the sandbox, so an LLM outage or a
        # missing Docker daemon must not prevent showing the stored result.
        meta = record["meta"]
        passed_cases = int(meta.get("correctness_cases", 0))
        total_cases = int(meta.get("total_cases", 0))
        if total_cases > 0:
            all_passed = passed_cases == total_cases
            badge = "[bold green]✓ PASS[/bold green]" if all_passed else "[bold yellow]⚠ PARTIAL[/bold yellow]"
            console.print(
                f"{badge} — Stored result: "
                f"{passed_cases}/{total_cases} test cases passed at analysis time."
            )
        else:
            console.print(
                "[dim]No stored correctness result for this function.[/dim]"
            )
        console.print(
            "[dim]C++ re-verification requires re-running analysis. "
            "Full results in [cyan]coreinsight memory[/cyan].[/dim]"
        )
        return

    # ── One-time LLM fallback for functions analyzed before test case persistence ──
    if not test_cases:
        console.print(
            "[yellow]⚠ No test cases stored for this function.[/yellow]\n"
            "[dim]Generating once via LLM — all future calls will be LLM-free...[/dim]"
        )
        try:
            from coreinsight.analyzer import AnalyzerAgent
            from coreinsight.config import get_model_tier, get_tier_limits
            config = load_config()
            provider = config.get("provider", "ollama")
            model_name = config.get("model_name", "llama3.2")
            api_keys = config.get("api_keys", {})
            model_tier = get_model_tier(provider, model_name)
            tier_limits = get_tier_limits(config)
            agent = AnalyzerAgent(
                provider=provider,
                model_name=model_name,
                api_keys=api_keys,
                model_tier=model_tier,
            )
            test_cases = agent.generate_test_cases(
                func_name, original_code, language,
                context="",
                num_cases=tier_limits["num_test_cases"],
            )
        except Exception as exc:
            # Top-level CLI boundary: report the failure instead of a traceback.
            console.print(f"[red]LLM error generating test cases: {exc}[/red]")
            return

        if not test_cases:
            console.print(
                "[red]LLM returned no test cases. "
                "Check your provider config with [cyan]coreinsight configure[/cyan].[/red]"
            )
            return

        # Persist so the next invocation takes the LLM-free path.
        mem.store_test_cases(original_code, test_cases)
        console.print(
            f"[dim]✓ Generated and stored {len(test_cases)} test case(s). "
            f"Future calls to [cyan]coreinsight test {func_name}[/cyan] need no LLM.[/dim]"
        )

    # ── Correctness sandbox — no LLM from this point ──────────────────────
    sandbox = CodeSandbox(disabled=no_docker)

    console.print(f"[dim]Running {len(test_cases)} test case(s) in Docker sandbox...[/dim]")

    result = sandbox.verify_correctness_only(
        original_code=original_code,
        optimized_code=optimized_code,
        original_func_name=func_name,
        optimized_func_name=func_name,
        test_cases=test_cases,
        language=language,
    )

    if result.verified:
        console.print(
            f"[bold green]✓ PASS[/bold green] — "
            f"{result.passed_cases}/{result.total_cases} test cases passed."
        )
    else:
        console.print(
            f"[bold red]✗ FAIL[/bold red] — "
            f"{result.passed_cases}/{result.total_cases} test cases passed."
        )
        # Cap the listing so a badly failing run does not flood the terminal.
        for failure in result.failures[:10]:
            console.print(f" [red]✗[/red] {failure}")

    if result.details:
        console.print(f"[dim]{result.details}[/dim]")
|
|
1100
|
+
|
|
856
1101
|
def _run_memory_cmd(clear: bool, export_path: str = None, export_fmt: str = "csv"):
|
|
857
1102
|
from coreinsight.memory import OptimizationMemory, MEMORY_DIR
|
|
858
1103
|
import shutil
|
|
@@ -915,6 +1160,7 @@ def _run_memory_cmd(clear: bool, export_path: str = None, export_fmt: str = "csv
|
|
|
915
1160
|
table.add_column("Function", justify="left", style="bold white")
|
|
916
1161
|
table.add_column("Language", justify="center", style="cyan", width=10)
|
|
917
1162
|
table.add_column("Speedup", justify="right", style="bold green", width=9)
|
|
1163
|
+
table.add_column("Tests", justify="right", style="green", width=10)
|
|
918
1164
|
table.add_column("Severity", justify="center", width=10)
|
|
919
1165
|
table.add_column("Issue", justify="left", style="dim white")
|
|
920
1166
|
table.add_column("HW Evidence",justify="left", style="dim", width=22)
|
|
@@ -935,19 +1181,23 @@ def _run_memory_cmd(clear: bool, export_path: str = None, export_fmt: str = "csv
|
|
|
935
1181
|
)
|
|
936
1182
|
|
|
937
1183
|
for i, (meta, rid) in enumerate(paired, start=1):
|
|
938
|
-
sev
|
|
939
|
-
sev_c
|
|
940
|
-
ts
|
|
941
|
-
hw
|
|
942
|
-
issue
|
|
1184
|
+
sev = meta.get("severity", "High")
|
|
1185
|
+
sev_c = severity_colors.get(sev, "white")
|
|
1186
|
+
ts = meta.get("timestamp", "")[:19].replace("T", " ")
|
|
1187
|
+
hw = meta.get("profiler_summary", "") or "—"
|
|
1188
|
+
issue = (meta.get("issue", "") or "—")[:60]
|
|
943
1189
|
if len(meta.get("issue", "")) > 60:
|
|
944
1190
|
issue += "…"
|
|
1191
|
+
passed_c = int(meta.get("correctness_cases", 0))
|
|
1192
|
+
total_c = int(meta.get("total_cases", 0))
|
|
1193
|
+
tests_str = f"{passed_c}/{total_c}" if total_c > 0 else "—"
|
|
945
1194
|
|
|
946
1195
|
table.add_row(
|
|
947
1196
|
str(i),
|
|
948
1197
|
meta.get("func_name", rid[:12]),
|
|
949
1198
|
meta.get("language", "?"),
|
|
950
1199
|
f"{float(meta.get('avg_speedup', 0)):.2f}x",
|
|
1200
|
+
tests_str,
|
|
951
1201
|
f"[{sev_c}]{sev}[/{sev_c}]",
|
|
952
1202
|
issue,
|
|
953
1203
|
hw,
|
|
@@ -1005,6 +1255,11 @@ def main_cli():
|
|
|
1005
1255
|
scan_parser.add_argument("--dir", default=".", help="Directory to scan")
|
|
1006
1256
|
scan_parser.add_argument("--top", type=int, default=10, help="Number of hotspots to show")
|
|
1007
1257
|
|
|
1258
|
+
test_parser = subparsers.add_parser("test", help="Re-run verification sandbox for a stored function")
|
|
1259
|
+
test_parser.add_argument("func_name", help="Name of the function to re-verify")
|
|
1260
|
+
test_parser.add_argument("--no-docker", dest="no_docker", action="store_true",
|
|
1261
|
+
help="Skip Docker (will report skipped)")
|
|
1262
|
+
|
|
1008
1263
|
args = parser.parse_args()
|
|
1009
1264
|
|
|
1010
1265
|
if args.command == "configure":
|
|
@@ -1025,6 +1280,11 @@ def main_cli():
|
|
|
1025
1280
|
export_path=getattr(args, "export_path", None),
|
|
1026
1281
|
export_fmt=getattr(args, "export_fmt", "csv"),
|
|
1027
1282
|
)
|
|
1283
|
+
elif args.command == "test":
|
|
1284
|
+
_run_test_cmd(
|
|
1285
|
+
func_name=args.func_name,
|
|
1286
|
+
no_docker=getattr(args, "no_docker", False),
|
|
1287
|
+
)
|
|
1028
1288
|
elif args.command == "scan":
|
|
1029
1289
|
scanner = ProjectScanner(args.dir)
|
|
1030
1290
|
scanner.scan_project(max_results=args.top)
|
|
@@ -47,6 +47,8 @@ class MemoryHit:
|
|
|
47
47
|
language: str
|
|
48
48
|
severity: str = "High"
|
|
49
49
|
correctness_cases: int = 0
|
|
50
|
+
total_cases: int = 0
|
|
51
|
+
test_cases: list = field(default_factory=list)
|
|
50
52
|
profiler_summary: str = ""
|
|
51
53
|
|
|
52
54
|
|
|
@@ -302,6 +304,7 @@ class OptimizationMemory:
|
|
|
302
304
|
"reasoning": (result.get("reasoning") or "")[:1000],
|
|
303
305
|
"severity": result.get("severity", "High"),
|
|
304
306
|
"correctness_cases": verification.correctness.passed_cases,
|
|
307
|
+
"total_cases": verification.correctness.total_cases,
|
|
305
308
|
"profiler_summary": profiler_summary[:200],
|
|
306
309
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
307
310
|
}
|
|
@@ -329,10 +332,87 @@ class OptimizationMemory:
|
|
|
329
332
|
except Exception as exc:
|
|
330
333
|
return {"count": 0, "error": str(exc)}
|
|
331
334
|
|
|
335
|
+
def lookup_test_cases(self, original_code: str) -> Optional[list]:
    """
    Fetch the test cases previously persisted for `original_code`.

    The lookup key is the AST hash of the code. Returns None when the
    store is unavailable or nothing was saved for that hash. Lets callers
    re-run correctness checks without asking the LLM for new cases.
    """
    if self._ensure_db():
        digest = self.ast_hash(original_code)
        return self._load_test_cases(digest)
    return None
|
|
344
|
+
|
|
345
|
+
def store_test_cases(self, original_code: str, test_cases: list) -> None:
    """
    Write `test_cases` to disk under the AST hash of `original_code`.

    process_function calls this right after generating test cases, which
    is what allows `coreinsight test` to re-verify later without the LLM.
    A no-op when the store is unavailable. The write is done while holding
    `_write_lock`; any failure is logged at debug level and swallowed.
    """
    if self._ensure_db():
        digest = self.ast_hash(original_code)
        with self._write_lock:
            try:
                # The code directory may not exist yet on a fresh store.
                self._code_dir.mkdir(parents=True, exist_ok=True)
                self._save_test_cases(digest, test_cases)
            except Exception as err:
                logger.debug(f"store_test_cases failed: {err}")
|
|
360
|
+
|
|
361
|
+
def lookup_by_name(self, func_name: str) -> Optional[dict]:
    """
    Return the newest memory record whose stored func_name equals `func_name`.

    The result is a dict with keys: func_name, language, original_code,
    optimized_code, test_cases, meta. None is returned when the store is
    unavailable, when nothing matches, or when the lookup raises (the
    error is logged at debug level).
    """
    if not self._ensure_db():
        return None
    try:
        fetched = self._collection.get(include=["metadatas", "documents"])
        rows = zip(
            fetched.get("metadatas", []),
            fetched.get("documents", []),
            fetched.get("ids", []),
        )
        candidates = [row for row in rows if row[0].get("func_name") == func_name]
        if not candidates:
            return None
        # Newest timestamp wins; on a tie the earliest-listed record is kept.
        newest_meta, source_code, record_id = max(
            candidates, key=lambda row: row[0].get("timestamp", "")
        )
        optimized = self._load_code(record_id) or ""
        cases = self._load_test_cases(record_id) or []
        return {
            "func_name": func_name,
            "language": newest_meta.get("language", ""),
            "original_code": source_code or "",
            "optimized_code": optimized,
            "test_cases": cases,
            "meta": newest_meta,
        }
    except Exception as exc:
        logger.debug(f"lookup_by_name failed: {exc}")
        return None
|
|
398
|
+
|
|
332
399
|
# ------------------------------------------------------------------ #
|
|
333
400
|
# Internal helpers
|
|
334
401
|
# ------------------------------------------------------------------ #
|
|
335
402
|
|
|
403
|
+
def _save_test_cases(self, h: str, cases: list) -> None:
|
|
404
|
+
path = self._code_dir / f"{h}.test_cases.json"
|
|
405
|
+
path.write_text(json.dumps(cases), encoding="utf-8")
|
|
406
|
+
|
|
407
|
+
def _load_test_cases(self, h: str) -> Optional[list]:
|
|
408
|
+
path = self._code_dir / f"{h}.test_cases.json"
|
|
409
|
+
if not path.exists():
|
|
410
|
+
return None
|
|
411
|
+
try:
|
|
412
|
+
return json.loads(path.read_text(encoding="utf-8"))
|
|
413
|
+
except Exception:
|
|
414
|
+
return None
|
|
415
|
+
|
|
336
416
|
def _save_code(self, h: str, language: str, code: str) -> None:
|
|
337
417
|
ext = {"python": "py", "cpp": "cpp", "c++": "cpp", "cuda": "cu"}.get(language, "txt")
|
|
338
418
|
path = self._code_dir / f"{h}.{ext}"
|
|
@@ -364,5 +444,6 @@ class OptimizationMemory:
|
|
|
364
444
|
language= meta.get("language", ""),
|
|
365
445
|
severity= meta.get("severity", "High"),
|
|
366
446
|
correctness_cases= int(meta.get("correctness_cases", 0)),
|
|
447
|
+
total_cases= int(meta.get("total_cases", 0)),
|
|
367
448
|
profiler_summary= meta.get("profiler_summary", ""),
|
|
368
449
|
)
|
|
@@ -90,7 +90,8 @@ GRADING RUBRIC AND INSTRUCTIONS (APPLY ONLY THE SPECIFIC RUBRIC FOR {language}):
|
|
|
90
90
|
INSTRUCTIONS:
|
|
91
91
|
1. Actively hunt for Medium, High, and Critical issues based ONLY on the specific {language} rubric above. Do not hallucinate GPU concepts for Python code unless PyTorch/CUDA is explicitly used.
|
|
92
92
|
2. If you find an issue, you MUST explain the hardware-level or interpreter-level reasoning clearly (e.g., CPU cache misses, GIL contention, memory latency).
|
|
93
|
-
3.
|
|
93
|
+
3. SEVERITY BIAS: When uncertain between two severity levels, always choose the higher one. A false negative (missing a real bottleneck) is always worse than a false positive. Only assign Low severity if you can explicitly prove the algorithm is already optimal for the target hardware — state the time complexity, memory access pattern, and why no better approach exists. "No obvious issues" is NOT sufficient justification for Low.
|
|
94
|
+
4. CODE GENERATION MANDATE: You MUST provide the completely rewritten, optimized function in the `optimized_code` field. The code must be raw, syntactically correct {language} code ready to be compiled/run. Do NOT leave this field empty. Do NOT wrap the code in markdown backticks (e.g., ```cpp) inside the JSON string.
|
|
94
95
|
"""
|
|
95
96
|
|
|
96
97
|
# ---------------------------------------------------------------------------
|
|
@@ -143,10 +144,10 @@ GRADING RUBRIC (apply only the {language} section):
|
|
|
143
144
|
- Low: Trivial stylistic issues only.
|
|
144
145
|
|
|
145
146
|
INSTRUCTIONS:
|
|
146
|
-
1. Identify the single most impactful bottleneck — do not list everything, find the root cause.
|
|
147
|
-
2. Explain the hardware-level or interpreter-level reasoning precisely.
|
|
148
|
-
3. Set `optimized_code` to null
|
|
149
|
-
4.
|
|
147
|
+
1. Identify the single most impactful bottleneck — do not list everything, find the root cause. If no Critical or High issue exists, identify the most significant Medium issue. Do NOT default to Low out of uncertainty.
|
|
148
|
+
2. Explain the hardware-level or interpreter-level reasoning precisely — name the specific mechanism (e.g., "O(N²) comparisons cause cache thrashing on arrays larger than L2 cache", "GIL held across network I/O blocks all threads").
|
|
149
|
+
3. CRITICAL: Set `optimized_code` to null. Any non-null value in this field will corrupt the pipeline. Code generation is handled by a separate agent.
|
|
150
|
+
4. SEVERITY BIAS: When uncertain between two severity levels, always choose the higher one. Only assign Low if you can explicitly prove algorithmic optimality — state the time complexity, memory access pattern, and why no better approach exists for the target hardware. "No obvious issues" is NOT sufficient justification for Low.
|
|
150
151
|
|
|
151
152
|
{format_instructions}
|
|
152
153
|
"""
|
|
@@ -254,8 +255,9 @@ REQUIREMENTS:
|
|
|
254
255
|
1. Rewrite ONLY the function named {func_name} — preserve its signature exactly.
|
|
255
256
|
2. Fix the identified bottleneck using the suggestion as your guide.
|
|
256
257
|
3. The function must be self-contained and correct.
|
|
257
|
-
4.
|
|
258
|
-
5.
|
|
258
|
+
4. VERIFICATION: Before outputting, mentally confirm: does the rewrite directly eliminate the identified bottleneck? If the issue was O(N²), confirm the new complexity is O(N log N) or better. If the issue was a Python loop, confirm it is vectorized with NumPy/PyTorch. If the issue was a deep copy, confirm it is eliminated. Do not output a rewrite that only partially addresses the bottleneck.
|
|
259
|
+
5. Raw {language} code only — no explanation, no markdown fences, no JSON.
|
|
260
|
+
6. Do NOT rename the function.
|
|
259
261
|
"""
|
|
260
262
|
|
|
261
263
|
# ── Per-tier addenda for multi-agent harness (same scaffolding pattern) ──────
|
|
@@ -150,6 +150,35 @@ class VerificationResult:
|
|
|
150
150
|
return "\n".join(lines)
|
|
151
151
|
|
|
152
152
|
|
|
153
|
+
def _format_sandbox_error(exc: Exception, language: str = "") -> str:
|
|
154
|
+
"""Map raw Docker / OS exceptions to user-friendly one-liners."""
|
|
155
|
+
msg = str(exc).lower()
|
|
156
|
+
if "timeout" in msg or "timed out" in msg or "read timeout" in msg:
|
|
157
|
+
return (
|
|
158
|
+
"⚠️ Sandbox timed out — the benchmark likely contains an infinite loop "
|
|
159
|
+
"or extremely slow path. Try --no-docker to skip the sandbox."
|
|
160
|
+
)
|
|
161
|
+
if "out of memory" in msg or "oom" in msg or ("memory" in msg and "kill" in msg):
|
|
162
|
+
return (
|
|
163
|
+
"⚠️ Sandbox ran out of memory (OOM). "
|
|
164
|
+
"Reduce N sizes in the harness or use --no-docker."
|
|
165
|
+
)
|
|
166
|
+
if "no such image" in msg or "pull access" in msg or "not found" in msg:
|
|
167
|
+
lang_label = f" ({language})" if language else ""
|
|
168
|
+
return (
|
|
169
|
+
f"⚠️ Sandbox Docker image not found{lang_label}. "
|
|
170
|
+
"It should have been built on first run — try `docker images` to check."
|
|
171
|
+
)
|
|
172
|
+
if "cannot connect" in msg or "connection refused" in msg or "docker" in msg:
|
|
173
|
+
return (
|
|
174
|
+
"⚠️ Docker is not running. "
|
|
175
|
+
"Start Docker Desktop (or the Docker daemon) and try again."
|
|
176
|
+
)
|
|
177
|
+
if "permission denied" in msg:
|
|
178
|
+
return "⚠️ Sandbox permission error — Docker may lack access to the temp directory."
|
|
179
|
+
return f"⚠️ Sandbox error: {exc}"
|
|
180
|
+
|
|
181
|
+
|
|
153
182
|
class CodeSandbox:
|
|
154
183
|
def __init__(self, disabled: bool = False):
|
|
155
184
|
self.disabled = disabled
|
|
@@ -277,7 +306,7 @@ class CodeSandbox:
|
|
|
277
306
|
return False, f"Missing CSV output (exit {exit_code}).\nFull output:\n{raw_logs}", None
|
|
278
307
|
|
|
279
308
|
except Exception as e:
|
|
280
|
-
return False,
|
|
309
|
+
return False, _format_sandbox_error(e, language), None
|
|
281
310
|
|
|
282
311
|
finally:
|
|
283
312
|
if container:
|
|
@@ -312,6 +341,45 @@ class CodeSandbox:
|
|
|
312
341
|
)
|
|
313
342
|
return VerificationResult(speedup=speedup_result, correctness=correctness_result)
|
|
314
343
|
|
|
344
|
+
def verify_correctness_only(
    self,
    original_code: str,
    optimized_code: str,
    original_func_name: str,
    optimized_func_name: str,
    test_cases: List[Dict[str, Any]],
    language: str = "python",
    timeout_seconds: int = 60,
    context: str = "",
) -> CorrectnessVerification:
    """
    Run only the correctness check in the sandbox (no speedup check, no LLM).

    Backs the `coreinsight test <function_name>` command.

    C++ and CUDA cannot be re-verified here: their correctness harness is
    a main() block that HarnessAgent embeds at analysis time, so it cannot
    be rebuilt from the stored test cases alone. For those languages an
    unverified result with an explanatory message is returned.

    An unverified result is likewise returned when the sandbox is disabled
    or when no Docker client is available; otherwise the call is delegated
    to `_verify_correctness` with the arguments passed through unchanged.
    """
    # Guard clauses: sandbox switched off, or no Docker client to use.
    if self.disabled:
        return CorrectnessVerification(verified=False, details=SANDBOX_SKIPPED_MSG)
    if not self.client:
        return CorrectnessVerification(verified=False, details="Docker unavailable.")

    # Languages whose harness only exists at analysis time.
    harness_embedded_languages = ("cpp", "c++", "cuda")
    if language in harness_embedded_languages:
        unsupported_details = (
            f"Re-verification not supported for {language}: "
            "correctness harness is embedded at analysis time. "
            "See stored pass rate in `coreinsight memory`."
        )
        return CorrectnessVerification(verified=False, details=unsupported_details)

    return self._verify_correctness(
        original_code,
        optimized_code,
        original_func_name,
        optimized_func_name,
        test_cases,
        language,
        timeout_seconds,
        context=context,
    )
|
|
382
|
+
|
|
315
383
|
def _verify_speedup(self, csv_output: str) -> SpeedupVerification:
|
|
316
384
|
result = SpeedupVerification(verified=False)
|
|
317
385
|
try:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "coreinsight-cli"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.3.2"
|
|
8
8
|
description = "Local-first AI performance profiler that mathematically verifies optimizations for Python, C++, and CUDA"
|
|
9
9
|
license = {text = "GPL-3.0-or-later"}
|
|
10
10
|
authors = [
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{coreinsight_cli-0.3.0 → coreinsight_cli-0.3.2}/coreinsight_cli.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|