PyPI - specsmith - Versions diffs - 0.7.0.dev236__tar.gz → 0.10.0__tar.gz - Mend

specsmith 0.7.0.dev236__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (197) hide show

{specsmith-0.7.0.dev236/src/specsmith.egg-info → specsmith-0.10.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: specsmith
-Version: 0.7.0.dev236
+Version: 0.10.0
 Summary: Applied Epistemic Engineering toolkit — AEE agent sessions, execution profiles, FPGA/HDL governance, tool installer, 50+ CLI commands.
 Author: BitConcepts
 License-Expression: MIT
@@ -88,6 +88,25 @@ specsmith treats belief systems like code: codable, testable, and deployable. It
 epistemically-governed projects, stress-tests requirements as BeliefArtifacts, runs
 cryptographically-sealed trace vaults, and orchestrates AI agents under formal AEE governance.
+**0.10.0 — Multi-Agent + BYOE.** A `/plan` goes to the architect, `/fix`
+goes to the coder, `/review` goes to a reviewer that runs on a different
+model family. Each *profile* is a `(provider, model, endpoint?, fallback_chain)`
+bundle stored in `~/.specsmith/agents.json`; an *activity routing table*
+maps slash commands and AEE phases to profiles; **BYOE endpoints**
+(`~/.specsmith/endpoints.json`) let you point a profile at any
+OpenAI-v1-compatible backend you self-host (vLLM, llama.cpp `server`,
+LM Studio, TGI, ...). Cross-family **diversity guard**, capability
+filtering, transient-failure fallback chains, and TraceVault decision
+seals on every `/agent` pin are wired in by default. See
+[`docs/site/agents.md`](docs/site/agents.md) for the five-minute walkthrough.
+```bash
+specsmith agents preset apply default       # frontier coder + cross-family reviewer
+specsmith endpoints add --id home-vllm \
+  --base-url http://10.0.0.4:8000/v1 --auth bearer-keyring
+specsmith run --agent opus-reviewer         # one-shot per-session pin
+```
 It also co-installs the standalone `epistemic` Python library for direct use in any project:
 ```python

{specsmith-0.7.0.dev236 → specsmith-0.10.0}/README.md RENAMED Viewed

@@ -16,6 +16,25 @@ specsmith treats belief systems like code: codable, testable, and deployable. It
 epistemically-governed projects, stress-tests requirements as BeliefArtifacts, runs
 cryptographically-sealed trace vaults, and orchestrates AI agents under formal AEE governance.
+**0.10.0 — Multi-Agent + BYOE.** A `/plan` goes to the architect, `/fix`
+goes to the coder, `/review` goes to a reviewer that runs on a different
+model family. Each *profile* is a `(provider, model, endpoint?, fallback_chain)`
+bundle stored in `~/.specsmith/agents.json`; an *activity routing table*
+maps slash commands and AEE phases to profiles; **BYOE endpoints**
+(`~/.specsmith/endpoints.json`) let you point a profile at any
+OpenAI-v1-compatible backend you self-host (vLLM, llama.cpp `server`,
+LM Studio, TGI, ...). Cross-family **diversity guard**, capability
+filtering, transient-failure fallback chains, and TraceVault decision
+seals on every `/agent` pin are wired in by default. See
+[`docs/site/agents.md`](docs/site/agents.md) for the five-minute walkthrough.
+```bash
+specsmith agents preset apply default       # frontier coder + cross-family reviewer
+specsmith endpoints add --id home-vllm \
+  --base-url http://10.0.0.4:8000/v1 --auth bearer-keyring
+specsmith run --agent opus-reviewer         # one-shot per-session pin
+```
 It also co-installs the standalone `epistemic` Python library for direct use in any project:
 ```python

{specsmith-0.7.0.dev236 → specsmith-0.10.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "specsmith"
-version = "0.7.0.dev236"
+version = "0.10.0"
 description = "Applied Epistemic Engineering toolkit — AEE agent sessions, execution profiles, FPGA/HDL governance, tool installer, 50+ CLI commands."
 readme = "README.md"
 license = "MIT"
@@ -171,6 +171,9 @@ module = [
     "specsmith.importer",
     "specsmith.agent.providers.gemini",
     "specsmith.agent.runner",
+    "specsmith.agent.profiles",
+    "specsmith.agent.fallback",
+    "specsmith.agent.core",
     "specsmith.agent.cleanup",
     "specsmith.agent.orchestrator",
     "specsmith.agent.repl",

{specsmith-0.7.0.dev236 → specsmith-0.10.0}/src/specsmith/__init__.py RENAMED Viewed

@@ -8,4 +8,4 @@ from importlib.metadata import version as _pkg_version
 try:
     __version__: str = _pkg_version("specsmith")
 except PackageNotFoundError:  # running from source without install
-    __version__ = "0.3.6"  # fallback: keep in sync with pyproject.toml
+    __version__ = "0.10.0"  # fallback: keep in sync with pyproject.toml

{specsmith-0.7.0.dev236 → specsmith-0.10.0}/src/specsmith/agent/chat_runner.py RENAMED Viewed

@@ -53,6 +53,14 @@ class ChatRunResult:
     files_changed: list[str] = field(default_factory=list)
     verdict: VerifierVerdict | None = None
     raw_text: str = ""
+    # C1: per-turn token + cost accounting. Populated by the provider
+    # driver when it can read counters from the response (e.g. Ollama and
+    # Anthropic expose them). When a count comes back as zero, ``_finalize``
+    # substitutes a deterministic char-based estimate so the TokenMeter chip
+    # is never zero on endpoints that don't surface usage in streaming mode.
+    tokens_in: int = 0
+    tokens_out: int = 0
+    cost_usd: float = 0.0
     def to_dict(self) -> dict[str, Any]:
         return {
@@ -61,6 +69,9 @@ class ChatRunResult:
             "files_changed": list(self.files_changed),
             "confidence": self.verdict.confidence if self.verdict else 0.0,
             "equilibrium": self.verdict.equilibrium if self.verdict else False,
+            "tokens_in": int(self.tokens_in),
+            "tokens_out": int(self.tokens_out),
+            "cost_usd": float(self.cost_usd),
         }
@@ -80,43 +91,122 @@ def run_chat(
     history: list[dict[str, Any]] | None = None,
     confidence_target: float = 0.7,
     rules_prefix: str = "",
+    endpoint_id: str | None = None,
 ) -> ChatRunResult | None:
-    """Drive a real LLM turn. Return ``None`` if no provider is reachable."""
+    """Drive a real LLM turn. Return ``None`` if no provider is reachable.
+    When ``endpoint_id`` is set, the BYOE store (REQ-142) is consulted and
+    the resolved :class:`Endpoint` short-circuits the provider chain via
+    the new :func:`_run_openai_compat` driver. Any error while resolving the
+    endpoint or streaming from it falls back to the legacy auto-detect chain,
+    so an offline or misconfigured endpoint never breaks ``specsmith chat``.
+    """
     history = history or []
     messages = _build_messages(utterance, history, rules_prefix)
+    # REQ-142: explicit endpoint override.
+    if endpoint_id:
+        try:
+            from specsmith.agent.endpoints import EndpointStore
+            endpoint = EndpointStore.load().resolve(endpoint_id)
+        except Exception:  # noqa: BLE001 - any failure → fall back to auto-detect
+            endpoint = None
+        if endpoint is not None:
+            try:
+                full_text, usage = _run_openai_compat(
+                    messages, emitter, msg_block, endpoint=endpoint
+                )
+            except Exception:  # noqa: BLE001 - degrade to auto-detect
+                full_text, usage = None, _UsageDelta()
+            if full_text is not None:
+                return _finalize(
+                    full_text,
+                    "openai_compat",
+                    project_dir,
+                    confidence_target,
+                    messages=messages,
+                    usage=usage,
+                )
     # Order matters: Ollama first because it's local-first and free.
     for provider in (_run_ollama, _run_anthropic, _run_openai, _run_gemini):
         try:
-            full_text = provider(messages, emitter, msg_block)
+            full_text, usage = provider(messages, emitter, msg_block)
         except Exception:  # noqa: BLE001 - any failure → next provider
             continue
         if full_text is None:
             continue
-        return _finalize(full_text, provider.__name__, project_dir, confidence_target)
+        return _finalize(
+            full_text,
+            provider.__name__,
+            project_dir,
+            confidence_target,
+            messages=messages,
+            usage=usage,
+        )
     return None
+@dataclass
+class _UsageDelta:
+    """Per-turn token + cost counters reported by a provider driver.
+    All fields default to ``0`` so callers can construct a zero-value
+    instance without caring whether the provider supports usage tracking.
+    """
+    tokens_in: int = 0
+    tokens_out: int = 0
+    cost_usd: float = 0.0
 def _finalize(
     full_text: str,
     provider_fn_name: str,
     project_dir: Path,
     confidence_target: float,
+    *,
+    messages: list[dict[str, str]] | None = None,
+    usage: _UsageDelta | None = None,
 ) -> ChatRunResult:
     sections = _parse_output_contract(full_text)
     files_changed = _split_files_list(sections.get("files_changed", ""))
     report = report_from_chat_sections(sections, files_changed=files_changed)
     verdict = score(report, confidence_target=confidence_target)
     summary = (sections.get("plan") or full_text.strip()[:200]).strip() or verdict.summary
+    # C1: when the provider didn't report exact counts, estimate from text.
+    # The four-chars-per-token rule of thumb is OpenAI's published guidance
+    # and matches Ollama / Anthropic / Gemini within ~10% across the model
+    # families we ship today — close enough for the TokenMeter chip and
+    # the ``credits record`` ledger event.
+    if usage is None:
+        usage = _UsageDelta()
+    if usage.tokens_in == 0 and messages is not None:
+        usage.tokens_in = _estimate_tokens("\n".join(m.get("content", "") for m in messages))
+    if usage.tokens_out == 0:
+        usage.tokens_out = _estimate_tokens(full_text)
     return ChatRunResult(
         provider=provider_fn_name.removeprefix("_run_"),
         summary=summary,
         files_changed=files_changed,
         verdict=verdict,
         raw_text=full_text,
+        tokens_in=int(usage.tokens_in),
+        tokens_out=int(usage.tokens_out),
+        cost_usd=float(usage.cost_usd),
     )
+def _estimate_tokens(text: str) -> int:
+    """Rough char→token heuristic (4 chars/token, floor at 1 if non-empty)."""
+    if not text:
+        return 0
+    return max(1, len(text) // 4)
 # ---------------------------------------------------------------------------
 # Provider drivers — each returns (full assembled text or None, _UsageDelta)
 # ---------------------------------------------------------------------------
@@ -126,13 +216,14 @@ def _run_ollama(
     messages: list[dict[str, str]],
     emitter: EventEmitter,
     block_id: str,
-) -> str | None:
+) -> tuple[str | None, _UsageDelta]:
     """Stream from a local Ollama daemon using only stdlib."""
     host = os.environ.get("OLLAMA_HOST", DEFAULT_OLLAMA_HOST).rstrip("/")
     model = os.environ.get("SPECSMITH_OLLAMA_MODEL", DEFAULT_OLLAMA_MODEL)
+    usage = _UsageDelta()
     if not _ollama_alive(host):
-        return None
+        return None, usage
     payload = json.dumps({"model": model, "messages": messages, "stream": True}).encode("utf-8")
     req = Request(  # noqa: S310 - URL is a hardcoded localhost default
@@ -157,8 +248,13 @@ def _run_ollama(
                 emitter.token(block_id, chunk)
                 pieces.append(chunk)
             if obj.get("done"):
+                # C1: Ollama exposes prompt_eval_count + eval_count on the
+                # final ``done`` message. Cost is zero for local models.
+                usage.tokens_in = int(obj.get("prompt_eval_count") or 0)
+                usage.tokens_out = int(obj.get("eval_count") or 0)
+                usage.cost_usd = 0.0
                 break
-    return "".join(pieces) if pieces else None
+    return ("".join(pieces) if pieces else None), usage
 def _ollama_alive(host: str) -> bool:
@@ -173,14 +269,15 @@ def _run_anthropic(
     messages: list[dict[str, str]],
     emitter: EventEmitter,
     block_id: str,
-) -> str | None:
+) -> tuple[str | None, _UsageDelta]:
     """Use the anthropic SDK if installed and a key is configured."""
+    usage = _UsageDelta()
     if not os.environ.get("ANTHROPIC_API_KEY"):
-        return None
+        return None, usage
     try:
         import anthropic
     except ImportError:
-        return None
+        return None, usage
     system = "\n".join(m["content"] for m in messages if m["role"] == "system")
     user_msgs = [m for m in messages if m["role"] != "system"]
@@ -197,62 +294,183 @@ def _run_anthropic(
             if text:
                 emitter.token(block_id, text)
                 pieces.append(text)
-    return "".join(pieces) if pieces else None
+        # C1: pull final usage off the SDK's `final_message`. Cost is the
+        # caller's problem (rate-limit module knows the model price); we
+        # report tokens here and let the credits ledger compute USD.
+        try:
+            final = stream.get_final_message()
+            usage.tokens_in = int(getattr(final.usage, "input_tokens", 0) or 0)
+            usage.tokens_out = int(getattr(final.usage, "output_tokens", 0) or 0)
+        except Exception:  # noqa: BLE001 - usage is best-effort
+            pass
+    return ("".join(pieces) if pieces else None), usage
 def _run_openai(
     messages: list[dict[str, str]],
     emitter: EventEmitter,
     block_id: str,
-) -> str | None:
+) -> tuple[str | None, _UsageDelta]:
     """Use the openai SDK if installed and a key is configured."""
+    usage = _UsageDelta()
     if not os.environ.get("OPENAI_API_KEY"):
-        return None
+        return None, usage
     try:
         from openai import OpenAI
     except ImportError:
-        return None
+        return None, usage
     client = OpenAI()
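+    # ``stream_options.include_usage`` makes the final SSE chunk carry a
+    # populated ``usage`` block (otherwise streaming responses emit it as
+    # ``None``). NOTE(review): SDK versions that predate ``stream_options``
+    # likely raise ``TypeError`` for the unknown kwarg rather than ignoring
+    # it; ``run_chat`` would then skip this driver like any other failure.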
     stream = client.chat.completions.create(
         model=os.environ.get("OPENAI_MODEL", "gpt-4o-mini"),
         messages=messages,
         stream=True,
+        stream_options={"include_usage": True},
     )
     pieces: list[str] = []
     for chunk in stream:
-        text = (chunk.choices[0].delta.content or "") if chunk.choices else ""
-        if text:
-            emitter.token(block_id, text)
-            pieces.append(text)
-    return "".join(pieces) if pieces else None
+        if chunk.choices:
+            text = chunk.choices[0].delta.content or ""
+            if text:
+                emitter.token(block_id, text)
+                pieces.append(text)
+        usage_obj = getattr(chunk, "usage", None)
+        if usage_obj is not None:
+            usage.tokens_in = int(getattr(usage_obj, "prompt_tokens", 0) or 0)
+            usage.tokens_out = int(getattr(usage_obj, "completion_tokens", 0) or 0)
+    return ("".join(pieces) if pieces else None), usage
+def _run_openai_compat(
+    messages: list[dict[str, str]],
+    emitter: EventEmitter,
+    block_id: str,
+    *,
+    endpoint: Any,
+) -> tuple[str | None, _UsageDelta]:
+    """Stream from a user-registered OpenAI-v1-compatible endpoint (REQ-142).
+    Uses raw stdlib HTTP so the openai SDK is not a hard dependency for
+    BYOE. Sends a streaming ``/chat/completions`` request, decodes the
+    Server-Sent-Events ``data:`` lines, and forwards each ``content``
+    delta as a ``token`` event on ``block_id``.
+    """
+    usage = _UsageDelta()
+    base_url = endpoint.base_url.rstrip("/")
+    url = f"{base_url}/chat/completions"
+    model = endpoint.default_model or os.environ.get("SPECSMITH_OPENAI_COMPAT_MODEL", "")
+    if not model:
+        # The endpoint did not pin a default model and the env override is
+        # absent. We cannot fabricate one; fall back to the auto-detect chain.
+        return None, usage
+    headers: dict[str, str] = {
+        "Content-Type": "application/json",
+        "Accept": "text/event-stream",
+    }
+    try:
+        token = endpoint.resolve_token()
+    except Exception:  # noqa: BLE001 - fall back to auto-detect chain
+        return None, usage
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    body = json.dumps(
+        {
+            "model": model,
+            "messages": messages,
+            "stream": True,
+            # Many vLLM/llama.cpp builds honour OpenAI's stream_options;
+            # the request is harmless if they don't.
+            "stream_options": {"include_usage": True},
+        }
+    ).encode("utf-8")
+    req = Request(url, data=body, headers=headers, method="POST")  # noqa: S310 - user-supplied
+    ctx = None
+    if not endpoint.verify_tls and url.startswith("https://"):
+        import ssl
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False
+        ctx.verify_mode = ssl.CERT_NONE
+    pieces: list[str] = []
+    try:
+        with urlopen(req, timeout=120, context=ctx) as resp:  # noqa: S310 - user-supplied
+            for raw_line in resp:
+                line = raw_line.decode("utf-8", errors="replace").rstrip("\n\r")
+                if not line.startswith("data:"):
+                    continue
+                payload = line[len("data:") :].strip()
+                if not payload or payload == "[DONE]":
+                    if payload == "[DONE]":
+                        break
+                    continue
+                try:
+                    obj = json.loads(payload)
+                except ValueError:
+                    continue
+                choices = obj.get("choices") or []
+                usage_obj = obj.get("usage")
+                if usage_obj:
+                    usage.tokens_in = int(usage_obj.get("prompt_tokens") or 0)
+                    usage.tokens_out = int(usage_obj.get("completion_tokens") or 0)
+                if not choices:
+                    continue
+                delta = (choices[0] or {}).get("delta") or {}
+                chunk = str(delta.get("content") or "")
+                if chunk:
+                    emitter.token(block_id, chunk)
+                    pieces.append(chunk)
+    except (URLError, TimeoutError, OSError):
+        return None, usage
+    return ("".join(pieces) if pieces else None), usage
 def _run_gemini(
     messages: list[dict[str, str]],
     emitter: EventEmitter,
     block_id: str,
-) -> str | None:
+) -> tuple[str | None, _UsageDelta]:
     """Use google-genai SDK if installed and a key is configured."""
+    usage = _UsageDelta()
     if not os.environ.get("GOOGLE_API_KEY"):
-        return None
+        return None, usage
     try:
         from google import genai
     except ImportError:
-        return None
+        return None, usage
     client = genai.Client()
     prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
     pieces: list[str] = []
+    last_chunk: Any = None
     for chunk in client.models.generate_content_stream(
         model=os.environ.get("GEMINI_MODEL", "gemini-2.5-flash"),
         contents=prompt,
     ):
+        last_chunk = chunk
         text = getattr(chunk, "text", "") or ""
         if text:
             emitter.token(block_id, text)
             pieces.append(text)
-    return "".join(pieces) if pieces else None
+    # Gemini exposes ``usage_metadata`` on the final chunk. Field names
+    # vary across SDK versions; we accept the union.
+    meta = getattr(last_chunk, "usage_metadata", None) if last_chunk else None
+    if meta is not None:
+        usage.tokens_in = int(
+            getattr(meta, "prompt_token_count", 0) or getattr(meta, "input_token_count", 0) or 0
+        )
+        usage.tokens_out = int(
+            getattr(meta, "candidates_token_count", 0)
+            or getattr(meta, "output_token_count", 0)
+            or 0
+        )
+    return ("".join(pieces) if pieces else None), usage
 # ---------------------------------------------------------------------------

specsmith-0.10.0/src/specsmith/agent/core.py ADDED Viewed

@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
+"""Shared agent runtime primitives (REQ-145).
+Hosts low-level enums and dataclasses that span :mod:`specsmith.agent.runner`,
+:mod:`specsmith.serve`, :mod:`specsmith.agent.profiles`, and
+:mod:`specsmith.agent.fallback` without forcing them to import each other.
+The historical ``cli.py`` referenced ``ModelTier`` from this module before
+it existed in the source tree (the file was lost in an earlier refactor),
+which produced an ``ImportError`` the moment ``specsmith run`` was
+invoked. Restoring the symbol here is the prerequisite for the bridge
+``ready`` event handshake to land before the VS Code extension's 20 s
+startup timeout fires.
+"""
+from __future__ import annotations
+import enum
+from dataclasses import dataclass, field
+from typing import Any
+class ModelTier(str, enum.Enum):
+    """Capability tier for an LLM call.
+    Ordered cheapest → most capable so that a fallback chain can iterate
+    in declaration order without external metadata.
+    """
+    FAST = "fast"
+    BALANCED = "balanced"
+    POWERFUL = "powerful"
+    @classmethod
+    def parse(
+        cls,
+        value: str | ModelTier | None,
+        default: ModelTier | None = None,
+    ) -> ModelTier:
+        """Tolerant parser used by CLI option handlers."""
+        if value is None or value == "":
+            return default or cls.BALANCED
+        if isinstance(value, cls):
+            return value
+        try:
+            return cls(str(value).strip().lower())
+        except ValueError:
+            return default or cls.BALANCED
+@dataclass
+class AgentState:
+    """Mutable per-session metrics surfaced via ``specsmith serve``'s
+    ``GET /api/status`` endpoint and the VS Code TokenMeter chip.
+    Field names mirror what :class:`specsmith.serve._AgentThread` reads off
+    ``runner._state``; do not rename without updating that consumer.
+    """
+    provider_name: str = ""
+    model_name: str = ""
+    profile_id: str = ""
+    session_tokens: int = 0
+    tokens_in: int = 0
+    tokens_out: int = 0
+    total_cost_usd: float = 0.0
+    tool_calls_made: int = 0
+    elapsed_minutes: float = 0.0
+    by_profile: dict[str, dict[str, Any]] = field(default_factory=dict)
+    def credit(
+        self,
+        *,
+        profile_id: str,
+        tokens_in: int = 0,
+        tokens_out: int = 0,
+        cost_usd: float = 0.0,
+        tool_calls: int = 0,
+    ) -> None:
+        """Aggregate one turn's metrics into the running totals."""
+        self.tokens_in += int(tokens_in)
+        self.tokens_out += int(tokens_out)
+        self.session_tokens = self.tokens_in + self.tokens_out
+        self.total_cost_usd += float(cost_usd)
+        self.tool_calls_made += int(tool_calls)
+        bucket = self.by_profile.setdefault(
+            profile_id or "(default)",
+            {"tokens_in": 0, "tokens_out": 0, "cost_usd": 0.0, "tool_calls": 0, "turns": 0},
+        )
+        bucket["tokens_in"] += int(tokens_in)
+        bucket["tokens_out"] += int(tokens_out)
+        bucket["cost_usd"] = round(bucket["cost_usd"] + float(cost_usd), 6)
+        bucket["tool_calls"] += int(tool_calls)
+        bucket["turns"] += 1
+__all__ = ["AgentState", "ModelTier"]

specsmith 0.7.0.dev236__tar.gz → 0.10.0__tar.gz

specsmith 0.7.0.dev236__tar.gz → 0.10.0__tar.gz