coreinsight-cli 0.2.8__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {coreinsight_cli-0.2.8/coreinsight_cli.egg-info → coreinsight_cli-0.2.9}/PKG-INFO +1 -1
  2. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/analyzer.py +3 -1
  3. coreinsight_cli-0.2.9/coreinsight/embeddings.py +103 -0
  4. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/indexer.py +2 -53
  5. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/main.py +9 -4
  6. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/memory.py +60 -59
  7. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/profiler.py +265 -13
  8. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/sandbox.py +18 -12
  9. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/tui.py +3 -7
  10. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9/coreinsight_cli.egg-info}/PKG-INFO +1 -1
  11. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight_cli.egg-info/SOURCES.txt +1 -2
  12. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/pyproject.toml +1 -1
  13. coreinsight_cli-0.2.8/coreinsight/Dockerfile.cpp-sandbox +0 -2
  14. coreinsight_cli-0.2.8/coreinsight/Dockerfile.python-sandbox +0 -3
  15. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/LICENSE +0 -0
  16. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/README.md +0 -0
  17. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/__init__.py +0 -0
  18. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/config.py +0 -0
  19. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/demo/__init__.py +0 -0
  20. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/demo/bad_loop.py +0 -0
  21. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/demo/data_processor.py +0 -0
  22. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/demo/slow.cpp +0 -0
  23. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/hardware.py +0 -0
  24. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/parser.py +0 -0
  25. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/prompts.py +0 -0
  26. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight/scanner.py +0 -0
  27. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight_cli.egg-info/dependency_links.txt +0 -0
  28. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight_cli.egg-info/entry_points.txt +0 -0
  29. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight_cli.egg-info/requires.txt +0 -0
  30. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/coreinsight_cli.egg-info/top_level.txt +0 -0
  31. {coreinsight_cli-0.2.8 → coreinsight_cli-0.2.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coreinsight-cli
3
- Version: 0.2.8
3
+ Version: 0.2.9
4
4
  Summary: Local-first AI performance profiler that mathematically verifies optimizations for Python, C++, and CUDA
5
5
  Author: Varun Jani
6
6
  License: GPL-3.0-or-later
@@ -805,7 +805,9 @@ class HarnessAgent:
805
805
  is_valid = self._check_speedup(success, logs)
806
806
  retries += 1
807
807
 
808
- if is_valid and retries > 0:
808
+ if getattr(sandbox, 'disabled', False):
809
+ pass # skipped intentionally — don't annotate as failed
810
+ elif is_valid and retries > 0:
809
811
  logs = f"(Succeeded after {retries} retries)\n" + logs
810
812
  elif not is_valid:
811
813
  logs = f"(Failed after {retries} retries)\n" + logs
@@ -0,0 +1,103 @@
1
+ """
2
+ coreinsight/embeddings.py — Shared embedding utility
3
+
4
+ Single source of truth for embedding model loading used by both
5
+ memory.py (OptimizationMemory) and indexer.py (RepoIndexer).
6
+
7
+ Tries to load all-MiniLM-L6-v2 from local cache first.
8
+ Falls back to a deterministic hash-based embedder when offline
9
+ or when the model has not yet been downloaded.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import hashlib
14
+ import logging
15
+ import math
16
+ import os
17
+ from pathlib import Path
18
+ from typing import List, Tuple
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # All models cached here — never hits the network if already present
23
+ MODEL_CACHE_DIR = Path.home() / ".coreinsight" / "models"
24
+ MODEL_NAME = "all-MiniLM-L6-v2"
25
+
26
+
27
+ class _HashEmbeddingFunction:
28
+ """
29
+ Deterministic offline fallback embedder.
30
+
31
+ Produces a 384-dim float vector from token overlap — no downloads,
32
+ no GPU, no network. Semantic quality is lower than MiniLM but RAG
33
+ and memory lookup still work via keyword/structural matching.
34
+
35
+ Run `coreinsight index` once while online to cache the real model.
36
+ """
37
+ DIM = 384
38
+
39
+ def __call__(self, input: List[str]) -> List[List[float]]:
40
+ results = []
41
+ for text in input:
42
+ tokens = text.lower().split()
43
+ vec = [0.0] * self.DIM
44
+ for tok in tokens:
45
+ h = int(hashlib.sha256(tok.encode()).hexdigest(), 16)
46
+ vec[h % self.DIM] += 1.0
47
+ # L2 normalise so cosine distance works correctly
48
+ mag = math.sqrt(sum(x * x for x in vec)) or 1.0
49
+ results.append([x / mag for x in vec])
50
+ return results
51
+
52
+
53
+ def load_embedding_fn() -> Tuple[object, str]:
54
+ """
55
+ Load the sentence-transformer embedding function.
56
+
57
+ Returns:
58
+ (embedding_fn, label) where label is a human-readable string
59
+ indicating which embedder is active — shown in CLI output.
60
+
61
+ Strategy:
62
+ 1. Pin HuggingFace cache to ~/.coreinsight/models so the model
63
+ is never re-downloaded on subsequent runs.
64
+ 2. Probe the model with a dummy call to force-load weights now
65
+ rather than silently failing later during indexing or lookup.
66
+ 3. On any failure (network error, disk full, offline) fall back
67
+ to _HashEmbeddingFunction with a visible warning.
68
+ """
69
+ MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
70
+
71
+ # Pin cache dirs — must be set before chromadb.utils imports torch
72
+ os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(MODEL_CACHE_DIR))
73
+ os.environ.setdefault("HF_HUB_CACHE", str(MODEL_CACHE_DIR))
74
+ # Allow download when online; callers that want strict offline can
75
+ # set HF_HUB_OFFLINE=1 in their environment before importing.
76
+ os.environ.setdefault("HF_HUB_OFFLINE", "0")
77
+
78
+ try:
79
+ from chromadb.utils import embedding_functions as _ef
80
+
81
+ fn = _ef.SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)
82
+
83
+ # Force-load now so we catch errors here, not mid-analysis.
84
+ fn(["probe"])
85
+
86
+ label = f"{MODEL_NAME} (cached)"
87
+ logger.debug(f"Embedding model loaded: {label}")
88
+ return fn, label
89
+
90
+ except Exception as exc:
91
+ logger.warning(
92
+ f"SentenceTransformer unavailable ({exc}). "
93
+ f"Using offline hash embedder — semantic quality reduced. "
94
+ f"Run `coreinsight index` once while online to cache the model."
95
+ )
96
+ from rich.console import Console as _Console
97
+ _Console().print(
98
+ "[yellow]⚠ Embedding model unavailable (offline or not yet downloaded). "
99
+ "Using keyword-based fallback — RAG and memory recall will work but with "
100
+ "reduced semantic accuracy. "
101
+ "Run [cyan]coreinsight index[/cyan] once while online to cache the model.[/yellow]"
102
+ )
103
+ return _HashEmbeddingFunction(), "hash-based (offline fallback)"
@@ -9,62 +9,11 @@ import chromadb
9
9
  from chromadb.utils import embedding_functions
10
10
 
11
11
  from coreinsight.parser import CodeParser
12
+ from coreinsight.embeddings import load_embedding_fn
12
13
 
13
14
  console = Console()
14
15
  logger = logging.getLogger(__name__)
15
16
 
16
- # Local model cache — never hits the network if model is already here
17
- _MODEL_CACHE_DIR = Path.home() / ".coreinsight" / "models"
18
-
19
-
20
- class _HashEmbeddingFunction:
21
- """
22
- Deterministic offline fallback embedder.
23
- Produces a 384-dim float vector from token overlap — no downloads, no GPU.
24
- Semantic quality is lower than MiniLM but RAG still works via keyword matching.
25
- """
26
- DIM = 384
27
-
28
- def __call__(self, input: list[str]) -> list[list[float]]:
29
- results = []
30
- for text in input:
31
- tokens = text.lower().split()
32
- vec = [0.0] * self.DIM
33
- for tok in tokens:
34
- h = int(hashlib.sha256(tok.encode()).hexdigest(), 16)
35
- vec[h % self.DIM] += 1.0
36
- # L2 normalise
37
- mag = math.sqrt(sum(x * x for x in vec)) or 1.0
38
- results.append([x / mag for x in vec])
39
- return results
40
-
41
-
42
- def _load_embedding_fn():
43
- """
44
- Try to load SentenceTransformer from local cache.
45
- Falls back to _HashEmbeddingFunction if offline or model not cached.
46
- """
47
- _MODEL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
48
- os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(_MODEL_CACHE_DIR))
49
- os.environ.setdefault("HF_HUB_OFFLINE", "0") # allow download when online
50
-
51
- try:
52
- fn = embedding_functions.SentenceTransformerEmbeddingFunction(
53
- model_name="all-MiniLM-L6-v2",
54
- )
55
- # Probe: actually load the model now so we catch network errors here
56
- # rather than silently later during indexing.
57
- fn(["probe"])
58
- return fn, "all-MiniLM-L6-v2 (cached)"
59
- except Exception as e:
60
- logger.warning(f"SentenceTransformer unavailable ({e}). Using offline hash embedder — semantic quality reduced.")
61
- console.print(
62
- "[yellow]⚠ Embedding model unavailable (offline or not yet downloaded). "
63
- "Using keyword-based fallback — RAG will work but with reduced semantic accuracy. "
64
- "Run [cyan]coreinsight index[/cyan] once while online to cache the model.[/yellow]"
65
- )
66
- return _HashEmbeddingFunction(), "hash-based (offline fallback)"
67
-
68
17
 
69
18
  class RepoIndexer:
70
19
  def __init__(self, repo_path: str):
@@ -82,7 +31,7 @@ class RepoIndexer:
82
31
  return True
83
32
  try:
84
33
  self._chroma_client = chromadb.PersistentClient(path=str(self.db_path))
85
- self._embedding_fn, self._embedding_label = _load_embedding_fn()
34
+ self._embedding_fn, self._embedding_label = load_embedding_fn()
86
35
  self._collection = self._chroma_client.get_or_create_collection(
87
36
  name="codebase_context",
88
37
  embedding_function=self._embedding_fn,
@@ -741,7 +741,7 @@ def run_analysis(file_path: str, no_docker: bool = False, tui_console=None):
741
741
  finally:
742
742
  console = _prev_console
743
743
 
744
- def run_demo(lang: str = "python", no_docker: bool = False):
744
+ def run_demo(lang: str = "python", no_docker: bool = False, tui_console=None):
745
745
  import shutil
746
746
  import importlib.resources
747
747
 
@@ -804,11 +804,16 @@ def run_demo(lang: str = "python", no_docker: bool = False):
804
804
  # For Python: auto-index so RAG cross-file context is showcased
805
805
  if lang == "python":
806
806
  console.print("[dim]Auto-indexing demo files to showcase RAG cross-file context...[/dim]")
807
- from coreinsight.indexer import RepoIndexer as _RepoIndexer
808
- _RepoIndexer(str(demo_dir)).index_repository()
807
+ try:
808
+ from coreinsight.indexer import RepoIndexer as _RepoIndexer
809
+ _RepoIndexer(str(demo_dir)).index_repository()
810
+ except Exception as _idx_err:
811
+ # Non-fatal — SQLite write conflicts can occur when running
812
+ # through the TUI. RAG context will be empty for this run.
813
+ console.print(f"[dim yellow]Indexing skipped (will retry next run): {_idx_err}[/dim yellow]")
809
814
  console.print()
810
815
 
811
- run_analysis(str(demo_dir / entry_file), no_docker=no_docker)
816
+ run_analysis(str(demo_dir / entry_file), no_docker=no_docker, tui_console=tui_console)
812
817
 
813
818
  def _run_memory_cmd(clear: bool, export_path: str = None, export_fmt: str = "csv"):
814
819
  from coreinsight.memory import OptimizationMemory, MEMORY_DIR
@@ -21,12 +21,13 @@ from datetime import datetime, timezone
21
21
  from pathlib import Path
22
22
  from typing import Any, Dict, List, Optional
23
23
 
24
+ from coreinsight.embeddings import load_embedding_fn
25
+
24
26
  logger = logging.getLogger(__name__)
25
27
 
26
28
  MEMORY_DIR = Path.home() / ".coreinsight" / "memory_db"
27
29
  CODE_DIR = MEMORY_DIR / "code"
28
30
  COLLECTION = "optimization_memory"
29
- EMBED_MODEL = "all-MiniLM-L6-v2" # same model as RepoIndexer — no extra download
30
31
 
31
32
  # ChromaDB uses cosine *distance* (lower = more similar).
32
33
  # 0.15 distance ≈ 0.85 cosine similarity for this embedding model.
@@ -54,17 +55,19 @@ class OptimizationMemory:
54
55
  Local vector database of verified optimizations.
55
56
 
56
57
  Reads are thread-safe (ChromaDB handles concurrent queries).
57
- Writes are called from the main thread after each future completes,
58
- so no write contention across worker threads.
58
+ Writes are serialized via _write_lock since store() can be called
59
+ from concurrent threads in process_function's as_completed loop.
59
60
  """
60
61
 
61
62
  def __init__(self, memory_dir: Path = MEMORY_DIR) -> None:
62
- self._memory_dir = memory_dir
63
- self._code_dir = memory_dir / "code"
64
- self._client = None
65
- self._collection = None
66
- self._embed_fn = None
67
- self._init_error = ""
63
+ import threading
64
+ self._memory_dir = memory_dir
65
+ self._code_dir = memory_dir / "code"
66
+ self._client = None
67
+ self._collection = None
68
+ self._embed_fn = None
69
+ self._init_error = ""
70
+ self._write_lock = threading.Lock()
68
71
 
69
72
  # ------------------------------------------------------------------ #
70
73
  # Lazy init — avoids slow import at startup
@@ -78,13 +81,11 @@ class OptimizationMemory:
78
81
  try:
79
82
  try:
80
83
  import chromadb
81
- from chromadb.utils import embedding_functions
82
84
  except Exception as sqlite_exc:
83
85
  self._init_error = (
84
86
  f"ChromaDB unavailable (likely outdated SQLite): {sqlite_exc}. "
85
87
  "Optimization memory disabled. "
86
- "Fix: pip install pysqlite3-binary and add the following to the top of memory.py:\n"
87
- " import pysqlite3, sys; sys.modules['sqlite3'] = pysqlite3"
88
+ "Fix: pip install coreinsight-cli[compat]"
88
89
  )
89
90
  return False
90
91
 
@@ -92,9 +93,8 @@ class OptimizationMemory:
92
93
  self._code_dir.mkdir(parents=True, exist_ok=True)
93
94
 
94
95
  self._client = chromadb.PersistentClient(path=str(self._memory_dir))
95
- self._embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
96
- model_name=EMBED_MODEL
97
- )
96
+ self._embed_fn, _embed_label = load_embedding_fn()
97
+ logger.debug(f"Memory embedder: {_embed_label}")
98
98
  self._collection = self._client.get_or_create_collection(
99
99
  name=COLLECTION,
100
100
  embedding_function=self._embed_fn,
@@ -273,52 +273,53 @@ class OptimizationMemory:
273
273
  """
274
274
  if not self._ensure_db():
275
275
  return False
276
- try:
277
- h = self.ast_hash(original_code)
278
- opt_code = result.get("optimized_code", "") or ""
279
- avg_speedup = 0.0
280
- if verification.speedup.computed_speedups:
281
- avg_speedup = (
282
- sum(verification.speedup.computed_speedups)
283
- / len(verification.speedup.computed_speedups)
276
+ with self._write_lock:
277
+ try:
278
+ h = self.ast_hash(original_code)
279
+ opt_code = result.get("optimized_code", "") or ""
280
+ avg_speedup = 0.0
281
+ if verification.speedup.computed_speedups:
282
+ avg_speedup = (
283
+ sum(verification.speedup.computed_speedups)
284
+ / len(verification.speedup.computed_speedups)
285
+ )
286
+
287
+ profiler_summary = ""
288
+ if profiler_result and profiler_result.available and profiler_result.metrics:
289
+ parts = [
290
+ f"{m.name}: {m.delta}"
291
+ for m in profiler_result.metrics[:2]
292
+ ]
293
+ profiler_summary = " | ".join(parts)
294
+
295
+ self._save_code(h, language, opt_code)
296
+
297
+ meta = {
298
+ "func_name": func_name,
299
+ "language": language,
300
+ "avg_speedup": round(avg_speedup, 4),
301
+ "issue": (result.get("issue") or "")[:500],
302
+ "reasoning": (result.get("reasoning") or "")[:1000],
303
+ "severity": result.get("severity", "High"),
304
+ "correctness_cases": verification.correctness.passed_cases,
305
+ "profiler_summary": profiler_summary[:200],
306
+ "timestamp": datetime.now(timezone.utc).isoformat(),
307
+ }
308
+
309
+ self._collection.upsert(
310
+ ids=[h],
311
+ documents=[original_code],
312
+ metadatas=[meta],
284
313
  )
314
+ logger.info(
315
+ f"Memory: stored '{func_name}' "
316
+ f"(hash={h[:8]}…, speedup={avg_speedup:.2f}x)"
317
+ )
318
+ return True
285
319
 
286
- profiler_summary = ""
287
- if profiler_result and profiler_result.available and profiler_result.metrics:
288
- parts = [
289
- f"{m.name}: {m.delta}"
290
- for m in profiler_result.metrics[:2]
291
- ]
292
- profiler_summary = " | ".join(parts)
293
-
294
- self._save_code(h, language, opt_code)
295
-
296
- meta = {
297
- "func_name": func_name,
298
- "language": language,
299
- "avg_speedup": round(avg_speedup, 4),
300
- "issue": (result.get("issue") or "")[:500],
301
- "reasoning": (result.get("reasoning") or "")[:1000],
302
- "severity": result.get("severity", "High"),
303
- "correctness_cases": verification.correctness.passed_cases,
304
- "profiler_summary": profiler_summary[:200],
305
- "timestamp": datetime.now(timezone.utc).isoformat(),
306
- }
307
-
308
- self._collection.upsert(
309
- ids=[h],
310
- documents=[original_code],
311
- metadatas=[meta],
312
- )
313
- logger.info(
314
- f"Memory: stored '{func_name}' "
315
- f"(hash={h[:8]}…, speedup={avg_speedup:.2f}x)"
316
- )
317
- return True
318
-
319
- except Exception as exc:
320
- logger.debug(f"Memory store failed: {exc}")
321
- return False
320
+ except Exception as exc:
321
+ logger.debug(f"Memory store failed: {exc}")
322
+ return False
322
323
 
323
324
  def stats(self) -> Dict[str, Any]:
324
325
  if not self._ensure_db():
@@ -156,6 +156,78 @@ def _fmt_int(n: int) -> str:
156
156
  return f"{n:,}"
157
157
 
158
158
 
159
+ def _parse_nsys_stats(output: str) -> Dict[str, Any]:
160
+ """
161
+ Parse `nsys profile --stats=true` stdout into structured metrics.
162
+ Extracts kernel timing and memory throughput from the summary tables.
163
+ """
164
+ result: Dict[str, Any] = {}
165
+
166
+ # ── Kernel statistics ────────────────────────────────────────────────
167
+ # Header: Time(%) Total Time (ns) Instances Avg (ns) ... Name
168
+ kernel_section = False
169
+ kernels = []
170
+ for line in output.splitlines():
171
+ if "CUDA Kernel Statistics" in line or "GPU Kernel Summary" in line:
172
+ kernel_section = True
173
+ continue
174
+ if kernel_section:
175
+ if line.strip() == "" or line.startswith("="):
176
+ if kernels:
177
+ kernel_section = False
178
+ continue
179
+ # Skip header/separator lines
180
+ if "Time(%)" in line or "----" in line:
181
+ continue
182
+ parts = line.split()
183
+ if len(parts) >= 7:
184
+ try:
185
+ kernels.append({
186
+ "pct": float(parts[0]),
187
+ "total_ns": float(parts[1].replace(",", "")),
188
+ "instances": int(parts[2].replace(",", "")),
189
+ "avg_ns": float(parts[3].replace(",", "")),
190
+ "name": " ".join(parts[7:]) if len(parts) > 7 else parts[-1],
191
+ })
192
+ except (ValueError, IndexError):
193
+ continue
194
+
195
+ if kernels:
196
+ # Top kernel by total time
197
+ top = max(kernels, key=lambda k: k["total_ns"])
198
+ result["top_kernel_name"] = top["name"]
199
+ result["top_kernel_avg_ns"] = top["avg_ns"]
200
+ result["top_kernel_total_ns"] = top["total_ns"]
201
+ result["top_kernel_instances"]= top["instances"]
202
+ result["total_kernel_ns"] = sum(k["total_ns"] for k in kernels)
203
+
204
+ # ── Memory throughput ────────────────────────────────────────────────
205
+ # Look for "Memory Throughput" or HtoD/DtoH transfer lines
206
+ mem_section = False
207
+ total_mem_ns = 0.0
208
+ for line in output.splitlines():
209
+ if "Memory Operation" in line or "Memory Throughput" in line:
210
+ mem_section = True
211
+ continue
212
+ if mem_section:
213
+ if line.strip() == "" or line.startswith("="):
214
+ mem_section = False
215
+ continue
216
+ if "Time(%)" in line or "----" in line:
217
+ continue
218
+ parts = line.split()
219
+ if len(parts) >= 3:
220
+ try:
221
+ total_mem_ns += float(parts[1].replace(",", ""))
222
+ except (ValueError, IndexError):
223
+ continue
224
+
225
+ if total_mem_ns:
226
+ result["total_mem_transfer_ns"] = total_mem_ns
227
+
228
+ return result
229
+
230
+
159
231
  def _parse_perf_stat(stderr: str) -> Dict[str, float]:
160
232
  """Extract hardware counter values from `perf stat` stderr output."""
161
233
  targets = {
@@ -284,7 +356,14 @@ class HardwareProfiler:
284
356
  source_dir=source_dir,
285
357
  )
286
358
  if language in ("cuda", "cu", "cuh"):
287
- return self._profile_cuda(detected)
359
+ return self._profile_cuda(
360
+ detected,
361
+ original_code=original_code,
362
+ optimized_code=optimized_code,
363
+ func_name=func_name,
364
+ original_file_content=original_file_content,
365
+ source_dir=source_dir,
366
+ )
288
367
  except Exception as exc:
289
368
  logger.debug(f"HardwareProfiler.profile exception: {exc}", exc_info=True)
290
369
  return ProfilerResult(
@@ -710,17 +789,190 @@ class HardwareProfiler:
710
789
  return metrics or None
711
790
 
712
791
  # ------------------------------------------------------------------ #
713
- # CUDA path (v0.2.0: nsys / nvprof)
792
+ # CUDA path nsys CLI profiling
714
793
  # ------------------------------------------------------------------ #
715
794
 
716
- def _profile_cuda(self, detected: Dict[str, bool]) -> ProfilerResult:
717
- if detected.get("nsys"):
718
- note = "nsys detected."
719
- elif detected.get("nvprof"):
720
- note = "nvprof detected."
721
- else:
722
- note = "No CUDA profiling tools found (install nsys from CUDA Toolkit)."
723
- return ProfilerResult(
724
- available=False, tool="none", language="cuda",
725
- error=f"{note} CUDA profiling coming in v0.2.0.",
726
- )
795
+ def _profile_cuda(
796
+ self,
797
+ detected: Dict[str, bool],
798
+ original_code: str = "",
799
+ optimized_code: str = "",
800
+ func_name: str = "",
801
+ original_file_content: str = "",
802
+ source_dir: str = "",
803
+ ) -> ProfilerResult:
804
+ result = ProfilerResult(available=False, tool="nsys", language="cuda")
805
+
806
+ if not detected.get("nsys"):
807
+ if detected.get("nvprof"):
808
+ result.error = (
809
+ "nvprof detected but not yet supported — install nsys "
810
+ "from CUDA Toolkit 11.0+ for hardware profiling."
811
+ )
812
+ else:
813
+ result.error = (
814
+ "No CUDA profiling tools found on PATH. "
815
+ "Install nsys: https://developer.nvidia.com/nsight-systems"
816
+ )
817
+ return result
818
+
819
+ if not shutil.which("nvcc"):
820
+ result.error = "nvcc not found — required to compile CUDA sources for profiling."
821
+ return result
822
+
823
+ if not original_file_content:
824
+ result.error = "No CUDA source content available for profiling."
825
+ return result
826
+
827
+ stats_per_label: Dict[str, Dict[str, Any]] = {}
828
+
829
+ # Build optimized source by appending the optimized kernel —
830
+ # last __global__ definition with the same name wins at link time
831
+ # only if we can safely substitute; otherwise skip optimized run.
832
+ sources = [("original", original_file_content)]
833
+ if optimized_code and func_name:
834
+ opt_src = (
835
+ original_file_content.strip()
836
+ + "\n\n// --- CoreInsight optimized replacement ---\n"
837
+ + optimized_code.strip()
838
+ )
839
+ sources.append(("optimized", opt_src))
840
+
841
+ tmp = tempfile.mkdtemp()
842
+ try:
843
+ for label, src in sources:
844
+ src_path = os.path.join(tmp, f"{label}.cu")
845
+ bin_path = os.path.join(tmp, label)
846
+
847
+ with open(src_path, "w") as fh:
848
+ fh.write(src)
849
+
850
+ # Compile
851
+ compile_proc = subprocess.run(
852
+ ["nvcc", "-O3", "-arch=native", src_path, "-o", bin_path],
853
+ capture_output=True, text=True, timeout=120,
854
+ )
855
+ if compile_proc.returncode != 0:
856
+ # Try without -arch=native (older nvcc versions)
857
+ compile_proc = subprocess.run(
858
+ ["nvcc", "-O3", src_path, "-o", bin_path],
859
+ capture_output=True, text=True, timeout=120,
860
+ )
861
+ if compile_proc.returncode != 0:
862
+ logger.debug(
863
+ f"CUDA compile failed for {label}:\n"
864
+ f"{compile_proc.stderr[:400]}"
865
+ )
866
+ result.error = (
867
+ f"nvcc compilation failed for {label} version.\n"
868
+ f"{compile_proc.stderr[:300]}"
869
+ )
870
+ return result
871
+
872
+ # Profile with nsys
873
+ nsys_out_base = os.path.join(tmp, f"nsys_{label}")
874
+ try:
875
+ nsys_proc = subprocess.run(
876
+ [
877
+ "nsys", "profile",
878
+ "--stats=true",
879
+ "--force-overwrite=true",
880
+ "-o", nsys_out_base,
881
+ bin_path,
882
+ ],
883
+ capture_output=True, text=True, timeout=300,
884
+ )
885
+ # nsys writes stats to stdout; combined output in stderr too
886
+ combined = nsys_proc.stdout + nsys_proc.stderr
887
+ parsed = _parse_nsys_stats(combined)
888
+
889
+ if not parsed:
890
+ logger.debug(
891
+ f"nsys: no stats parsed for {label}.\n"
892
+ f"nsys stdout: {nsys_proc.stdout[:300]}\n"
893
+ f"nsys stderr: {nsys_proc.stderr[:300]}"
894
+ )
895
+ result.error = (
896
+ f"nsys ran but produced no parseable stats for {label}. "
897
+ f"Ensure the binary launches at least one CUDA kernel."
898
+ )
899
+ return result
900
+
901
+ stats_per_label[label] = parsed
902
+
903
+ except subprocess.TimeoutExpired:
904
+ result.error = "nsys profiling timed out (300s)."
905
+ return result
906
+ except Exception as exc:
907
+ result.error = f"nsys execution error: {exc}"
908
+ return result
909
+
910
+ except Exception as exc:
911
+ logger.debug(f"CUDA profiling error: {exc}")
912
+ result.error = f"CUDA profiling failed: {exc}"
913
+ return result
914
+ finally:
915
+ shutil.rmtree(tmp, ignore_errors=True)
916
+
917
+ if "original" not in stats_per_label:
918
+ result.error = "No profiling data collected."
919
+ return result
920
+
921
+ orig_s = stats_per_label["original"]
922
+ opt_s = stats_per_label.get("optimized", orig_s)
923
+
924
+ metrics: List[ProfilerMetric] = []
925
+
926
+ # ── Kernel timing ─────────────────────────────────────────────
927
+ orig_ns = orig_s.get("top_kernel_avg_ns", 0.0)
928
+ opt_ns = opt_s.get("top_kernel_avg_ns", orig_ns)
929
+ if orig_ns:
930
+ metrics.append(ProfilerMetric(
931
+ name=f"Kernel avg time [{orig_s.get('top_kernel_name', 'top kernel')}]",
932
+ original=f"{orig_ns / 1000:.2f} µs",
933
+ optimized=f"{opt_ns / 1000:.2f} µs",
934
+ delta=_pct_delta(orig_ns, opt_ns),
935
+ note="lower is better",
936
+ ))
937
+
938
+ orig_total = orig_s.get("total_kernel_ns", 0.0)
939
+ opt_total = opt_s.get("total_kernel_ns", orig_total)
940
+ if orig_total:
941
+ metrics.append(ProfilerMetric(
942
+ name="Total kernel time",
943
+ original=f"{orig_total / 1e6:.3f} ms",
944
+ optimized=f"{opt_total / 1e6:.3f} ms",
945
+ delta=_pct_delta(orig_total, opt_total),
946
+ note="lower is better",
947
+ ))
948
+
949
+ orig_inst = orig_s.get("top_kernel_instances", 0)
950
+ if orig_inst:
951
+ metrics.append(ProfilerMetric(
952
+ name="Kernel launches",
953
+ original=str(orig_inst),
954
+ optimized=str(opt_s.get("top_kernel_instances", orig_inst)),
955
+ delta="—",
956
+ note="",
957
+ ))
958
+
959
+ # ── Memory transfers ──────────────────────────────────────────
960
+ orig_mem = orig_s.get("total_mem_transfer_ns", 0.0)
961
+ opt_mem = opt_s.get("total_mem_transfer_ns", orig_mem)
962
+ if orig_mem:
963
+ metrics.append(ProfilerMetric(
964
+ name="Total memory transfer time",
965
+ original=f"{orig_mem / 1e6:.3f} ms",
966
+ optimized=f"{opt_mem / 1e6:.3f} ms",
967
+ delta=_pct_delta(orig_mem, opt_mem),
968
+ note="lower is better",
969
+ ))
970
+
971
+ if not metrics:
972
+ result.error = "nsys ran but no timing metrics could be extracted."
973
+ return result
974
+
975
+ result.available = True
976
+ result.host_tool_name = "nsys"
977
+ result.host_tool_metrics = metrics
978
+ return result
@@ -18,9 +18,11 @@ SANDBOX_IMAGES = {
18
18
  "cpp": "coreinsight-cpp-sandbox:latest",
19
19
  }
20
20
 
21
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
22
+
21
23
  DOCKERFILES = {
22
- "python": "Dockerfile.python-sandbox",
23
- "cpp": "Dockerfile.cpp-sandbox",
24
+ "python": os.path.join(ROOT_DIR, "docker", "Dockerfile.python-sandbox"),
25
+ "cpp": os.path.join(ROOT_DIR, "docker", "Dockerfile.cpp-sandbox"),
24
26
  }
25
27
 
26
28
  # ---------------------------------------------------------------------------
@@ -188,17 +190,21 @@ class CodeSandbox:
188
190
  label = "Python" if lang == "python" else "C++"
189
191
  console.print(f"[yellow]First run: building {label} sandbox image (one-time, ~30s)...[/yellow]")
190
192
 
191
- dockerfile_path = importlib.resources.files("coreinsight").joinpath(DOCKERFILES[lang])
192
- with importlib.resources.as_file(dockerfile_path) as dockerfile:
193
- _, logs = self.client.images.build(
194
- path=str(dockerfile.parent),
195
- dockerfile=dockerfile.name,
196
- tag=SANDBOX_IMAGES[lang],
197
- rm=True,
193
+ dockerfile_full = DOCKERFILES[lang]
194
+ if not os.path.exists(dockerfile_full):
195
+ raise FileNotFoundError(
196
+ f"Dockerfile not found at {dockerfile_full}. "
197
+ f"Expected docker/ directory at project root."
198
198
  )
199
- for chunk in logs:
200
- if "stream" in chunk:
201
- logger.debug(chunk["stream"].strip())
199
+ _, logs = self.client.images.build(
200
+ path=os.path.dirname(dockerfile_full),
201
+ dockerfile=os.path.basename(dockerfile_full),
202
+ tag=SANDBOX_IMAGES[lang],
203
+ rm=True,
204
+ )
205
+ for chunk in logs:
206
+ if "stream" in chunk:
207
+ logger.debug(chunk["stream"].strip())
202
208
 
203
209
  console.print(f"[green]✓ {label} sandbox image built successfully.[/green]")
204
210
 
@@ -778,19 +778,15 @@ class CoreInsightApp(App):
778
778
  log.write,
779
779
  "\n[bold cyan]Running built-in Python demo...[/bold cyan]\n"
780
780
  )
781
-
782
- # Temporarily patch the demo's console output into the TUI
783
- import coreinsight.main as _main
784
- _prev = _main.console
785
- _main.console = tui_console
786
781
  try:
787
- run_demo(lang="python", no_docker=no_docker)
782
+ # Pass tui_console directly — run_demo forwards it to run_analysis
783
+ # which handles the global console swap cleanly via try/finally
784
+ run_demo(lang="python", no_docker=no_docker, tui_console=tui_console)
788
785
  except SystemExit:
789
786
  pass
790
787
  except Exception as exc:
791
788
  self.call_from_thread(log.write, f"[red]Demo error: {exc}[/red]")
792
789
  finally:
793
- _main.console = _prev
794
790
  self._busy = False
795
791
  self.call_from_thread(self._set_status, "Demo complete.")
796
792
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coreinsight-cli
3
- Version: 0.2.8
3
+ Version: 0.2.9
4
4
  Summary: Local-first AI performance profiler that mathematically verifies optimizations for Python, C++, and CUDA
5
5
  Author: Varun Jani
6
6
  License: GPL-3.0-or-later
@@ -1,11 +1,10 @@
1
1
  LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
- coreinsight/Dockerfile.cpp-sandbox
5
- coreinsight/Dockerfile.python-sandbox
6
4
  coreinsight/__init__.py
7
5
  coreinsight/analyzer.py
8
6
  coreinsight/config.py
7
+ coreinsight/embeddings.py
9
8
  coreinsight/hardware.py
10
9
  coreinsight/indexer.py
11
10
  coreinsight/main.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "coreinsight-cli"
7
- version = "0.2.8"
7
+ version = "0.2.9"
8
8
  description = "Local-first AI performance profiler that mathematically verifies optimizations for Python, C++, and CUDA"
9
9
  license = {text = "GPL-3.0-or-later"}
10
10
  authors = [
@@ -1,2 +0,0 @@
1
- FROM gcc:latest
2
- WORKDIR /workspace
@@ -1,3 +0,0 @@
1
- FROM python:3.11-slim
2
- RUN pip install --no-cache-dir numpy pandas scipy matplotlib
3
- WORKDIR /workspace
File without changes