PyPI - coreinsight-cli - Versions diffs - 0.2.6__tar.gz → 0.2.8__tar.gz - Mend

coreinsight-cli 0.2.6__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

{coreinsight_cli-0.2.6/coreinsight_cli.egg-info → coreinsight_cli-0.2.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: coreinsight-cli
-Version: 0.2.6
+Version: 0.2.8
 Summary: Local-first AI performance profiler that mathematically verifies optimizations for Python, C++, and CUDA
 Author: Varun Jani
 License: GPL-3.0-or-later
@@ -32,6 +32,7 @@ Requires-Dist: langchain-anthropic>=0.1.0
 Requires-Dist: pydantic>=2.0
 Requires-Dist: chromadb>=0.5.0
 Requires-Dist: sentence-transformers>=3.0.0
+Requires-Dist: textual>=0.60.0
 Requires-Dist: psutil>=5.9
 Provides-Extra: compat
 Requires-Dist: pysqlite3-binary>=0.5.0; extra == "compat"

{coreinsight_cli-0.2.6 → coreinsight_cli-0.2.8}/coreinsight/analyzer.py RENAMED Viewed

@@ -14,6 +14,35 @@ from langchain_anthropic import ChatAnthropic
 from coreinsight.prompts import SYSTEM_PROMPT, ANALYSIS_TEMPLATE, HARNESS_ADDENDUM
+# Phrases that appear at the start of a truncated LLM response
+_TRUNCATION_HINTS = (
+    "context length",
+    "context_length_exceeded",
+    "maximum context",
+    "token limit",
+    "finish_reason: length",
+    "finish_reason\":\"length",
+)
+def _is_truncated(raw: str) -> bool:
+    """
+    Returns True if the raw LLM output looks like it was cut off mid-generation.
+    Catches both explicit error messages and structural truncation signs.
+    """
+    if not raw or len(raw.strip()) < 20:
+        return True
+    low = raw.lower()
+    if any(hint in low for hint in _TRUNCATION_HINTS):
+        return True
+    stripped = raw.strip()
+    # JSON truncation: opened but never closed
+    if stripped.startswith("{") and not stripped.endswith("}"):
+        return True
+    # Code truncation: opens a block but ends mid-statement
+    if stripped.endswith(("...", "/*", "//", "\"", "'")):
+        return True
+    return False
 logger = logging.getLogger(__name__)
@@ -163,12 +192,15 @@ class AnalyzerAgent:
             self.json_llm = self.base_llm
         elif provider == "local_server":
-            base_url = api_keys.get("local_url", "http://localhost:1234/v1")
+            from coreinsight.prompts import ModelTier
+            base_url   = api_keys.get("local_url", "http://localhost:1234/v1")
+            _max_tokens = 2048 if model_tier == ModelTier.SMALL else 4096
             self.base_llm = ChatOpenAI(
                 model=model_name,
                 api_key="not-needed",
                 base_url=base_url,
                 temperature=0.1,
+                max_tokens=_max_tokens,
                 model_kwargs={"response_format": {"type": "json_object"}},
             )
             self.json_llm = self.base_llm
@@ -196,11 +228,20 @@ class AnalyzerAgent:
             self.json_llm = self.base_llm
         else:  # Ollama default
+            from coreinsight.prompts import ModelTier
+            # Small models (7B) typically have 4096 native context.
+            # Asking for more causes silent degradation or OOM on the host.
+            # Medium/large local models can handle 8192 comfortably.
+            _ctx = 4096 if model_tier == ModelTier.SMALL else 8192
+            # num_predict: small models need room for JSON + code in one shot.
+            # Capping at 2048 for small prevents runaway generation that hits
+            # the limit mid-JSON and returns truncated garbage.
+            _predict = 2048 if model_tier == ModelTier.SMALL else 4096
             self.base_llm = ChatOllama(
                 model=model_name,
                 temperature=0.1,
-                num_predict=4096,
-                num_ctx=8192,
+                num_predict=_predict,
+                num_ctx=_ctx,
             )
             self.json_llm = self.base_llm.bind(format="json")
@@ -258,14 +299,31 @@ class AnalyzerAgent:
     def _invoke_code_chain(self, template: str, variables: dict, language: str) -> str:
         """Shared invocation + extraction logic for harness and fix chains."""
         chain = PromptTemplate.from_template(template) | self.base_llm
-        result = chain.invoke(variables)
+        try:
+            result = chain.invoke(variables)
+        except Exception as e:
+            err = str(e).lower()
+            if any(h in err for h in _TRUNCATION_HINTS):
+                raise RuntimeError(
+                    f"Model hit its context limit. Try a smaller file, fewer functions, "
+                    f"or a model with a larger context window. Detail: {e}"
+                ) from e
+            raise
         raw = result.content if hasattr(result, "content") else str(result)
-        # Handle Anthropic returning a list of content blocks
         if isinstance(raw, list):
             raw = "\n".join(
                 item["text"] if isinstance(item, dict) and "text" in item else str(item)
                 for item in raw
             )
+        if _is_truncated(raw):
+            logger.warning(
+                f"LLM output appears truncated (len={len(raw)}). "
+                f"Model likely hit its context/predict limit."
+            )
+            raise RuntimeError(
+                "Model output was truncated — hit context or token limit. "
+                "Try a model with a larger context window, or reduce the function size."
+            )
         return self._extract_executable_code(raw)
     def generate_harness(
@@ -421,12 +479,14 @@ def _build_llm(provider: str, model_name: str, api_keys: dict):
         return llm, llm
     if provider == "local_server":
-        base_url = api_keys.get("local_url", "http://localhost:1234/v1")
+        base_url    = api_keys.get("local_url", "http://localhost:1234/v1")
+        _max_tokens = api_keys.pop("_predict", 4096)  # reuse same key as Ollama path
         llm = ChatOpenAI(
             model=model_name,
             api_key="not-needed",
             base_url=base_url,
             temperature=0.1,
+            max_tokens=_max_tokens,
             model_kwargs={"response_format": {"type": "json_object"}},
         )
         return llm, llm
@@ -452,16 +512,33 @@ def _build_llm(provider: str, model_name: str, api_keys: dict):
         )
         return llm, llm
-    # Ollama default
+    # Ollama default — context and predict budget are passed in from the
+    # calling agent which knows its own model_tier.
+    # Default to medium-safe values; callers override via kwargs if needed.
+    _ctx     = api_keys.pop("_ctx",     8192)
+    _predict = api_keys.pop("_predict", 4096)
     base = ChatOllama(
         model=model_name,
         temperature=0.1,
-        num_predict=4096,
-        num_ctx=8192,
+        num_predict=_predict,
+        num_ctx=_ctx,
     )
     return base, base.bind(format="json")
+def _build_llm_tiered(provider: str, model_name: str, api_keys: dict, model_tier: str):
+    """Wraps _build_llm with tier-aware context settings for local providers."""
+    from coreinsight.prompts import ModelTier
+    keys = dict(api_keys or {})
+    if provider == "ollama":
+        keys["_ctx"]     = 4096 if model_tier == ModelTier.SMALL else 8192
+        keys["_predict"] = 2048 if model_tier == ModelTier.SMALL else 4096
+    elif provider == "local_server":
+        # max_tokens controls response length — context window is server-side
+        keys["_predict"] = 2048 if model_tier == ModelTier.SMALL else 4096
+    return _build_llm(provider, model_name, keys)
 class BottleneckAgent:
     """
     Agent 1 — analysis only.
@@ -480,7 +557,7 @@ class BottleneckAgent:
         from coreinsight.prompts import BOTTLENECK_TEMPLATE, SYSTEM_PROMPT
         self.model_tier = model_tier
         self.parser     = JsonOutputParser(pydantic_object=AuditResult)
-        self._base_llm, self._json_llm = _build_llm(provider, model_name, api_keys)
+        self._base_llm, self._json_llm = _build_llm_tiered(provider, model_name, api_keys, model_tier)
         self._prompt = PromptTemplate(
             template=BOTTLENECK_TEMPLATE,
@@ -544,7 +621,7 @@ class OptimizerAgent:
     ) -> None:
         from coreinsight.prompts import OPTIMIZER_TEMPLATE
         self.model_tier = model_tier
-        self._base_llm, _ = _build_llm(provider, model_name, api_keys)
+        self._base_llm, _ = _build_llm_tiered(provider, model_name, api_keys, model_tier)
         self._template = OPTIMIZER_TEMPLATE
     def _extract_code(self, raw: str) -> str:
@@ -620,7 +697,7 @@ class HarnessAgent:
             HARNESS_ADDENDUM_MULTI,
         )
         self.model_tier      = model_tier
-        self._base_llm, _    = _build_llm(provider, model_name, api_keys)
+        self._base_llm, _    = _build_llm_tiered(provider, model_name, api_keys, model_tier)
         self._harness_tmpl   = HARNESS_TEMPLATE_MULTI + HARNESS_ADDENDUM_MULTI.get(model_tier, "")
         self._fix_tmpl       = FIX_TEMPLATE_MULTI     + HARNESS_ADDENDUM_MULTI.get(model_tier, "")
@@ -638,14 +715,28 @@ class HarnessAgent:
     def _invoke(self, template: str, variables: dict) -> str:
         chain  = PromptTemplate.from_template(template) | self._base_llm
-        result = chain.invoke(variables)
-        raw    = result.content if hasattr(result, "content") else str(result)
+        try:
+            result = chain.invoke(variables)
+        except Exception as e:
+            err = str(e).lower()
+            if any(h in err for h in _TRUNCATION_HINTS):
+                raise RuntimeError(
+                    f"Model hit its context limit during harness generation. "
+                    f"Detail: {e}"
+                ) from e
+            raise
+        raw = result.content if hasattr(result, "content") else str(result)
         if isinstance(raw, list):
             raw = "\n".join(
                 item["text"] if isinstance(item, dict) and "text" in item
                 else str(item)
                 for item in raw
             )
+        if _is_truncated(raw):
+            raise RuntimeError(
+                "Harness output was truncated — model hit its token limit. "
+                "Switching to fix loop with truncation note."
+            )
         return self._extract_code(raw)
     def _check_speedup(self, success: bool, logs: str) -> bool:
@@ -738,7 +829,7 @@ class TestCaseAgent:
         model_tier: str,
     ) -> None:
         self.model_tier   = model_tier
-        self._base_llm, _ = _build_llm(provider, model_name, api_keys)
+        self._base_llm, _ = _build_llm_tiered(provider, model_name, api_keys, model_tier)
     def generate(
         self,

{coreinsight_cli-0.2.6 → coreinsight_cli-0.2.8}/coreinsight/config.py RENAMED Viewed

@@ -18,6 +18,7 @@ FREE_TIER_LIMITS = {
     "max_retries":       2,
     "num_test_cases":    8,
     "hardware_profiling": False,
+    "max_files": 2,
 }
 PRO_TIER_LIMITS = {
@@ -25,6 +26,7 @@ PRO_TIER_LIMITS = {
     "max_retries":       5,
     "num_test_cases":    15,
     "hardware_profiling": True,
+    "max_files": None,
 }
 SMALL_MODELS  = ["llama3.2:3b", "llama3.2:1b", "codellama:7b", "llama3:7b", "mistral:7b"]
@@ -168,6 +170,7 @@ def run_configure(pro_key: str = None, agent_mode: str = None):
     if provider == "ollama":
         config["model_name"] = Prompt.ask("Ollama model name", default=config.get("model_name", "llama3.2"))
     elif provider == "local_server":
+        from rich.panel import Panel
         console.print(Panel(
             "[bold]Local inference server setup[/bold]\n\n"
             "CoreInsight talks to any OpenAI-compatible local server.\n"

coreinsight-cli 0.2.6__tar.gz → 0.2.8__tar.gz

coreinsight-cli 0.2.6__tar.gz → 0.2.8__tar.gz