PyPI - specsmith - Versions diffs - 0.10.1.dev287__tar.gz → 0.10.1.dev292__tar.gz - Mend

specsmith 0.10.1.dev287__tar.gz → 0.10.1.dev292__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (229) hide show

{specsmith-0.10.1.dev287 → specsmith-0.10.1.dev292}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: specsmith
-Version: 0.10.1.dev287
+Version: 0.10.1.dev292
 Summary: Applied Epistemic Engineering toolkit — AEE agent sessions, execution profiles, FPGA/HDL governance, tool installer, 50+ CLI commands.
 Author: BitConcepts
 License-Expression: MIT

{specsmith-0.10.1.dev287 → specsmith-0.10.1.dev292}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "specsmith"
-version = "0.10.1.dev287"
+version = "0.10.1.dev292"
 description = "Applied Epistemic Engineering toolkit — AEE agent sessions, execution profiles, FPGA/HDL governance, tool installer, 50+ CLI commands."
 readme = "README.md"
 license = "MIT"

specsmith-0.10.1.dev292/src/specsmith/agent/hf_sync.py ADDED Viewed

@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
+"""HuggingFace Open LLM Leaderboard sync for model intelligence (REQ-223).
+Fetches benchmark scores from the HuggingFace API and populates
+`.specsmith/model_scores.json` so that `rank_models_for_role()` uses
+real data instead of hardcoded baselines.
+Usage:
+    from specsmith.agent.hf_sync import sync_scores
+    results = sync_scores()  # returns dict of model_id -> {benchmark: score}
+"""
+from __future__ import annotations
+import json
+import time
+import urllib.request
+from pathlib import Path
+from typing import Any
+# Base URL of the HuggingFace Hub REST API. fetch_hf_model_info() appends
+# "/models/<model_id>" to it to request the metadata of a single model.
+HF_API_BASE = "https://huggingface.co/api"
+# Models we track (subset of popular models with known benchmark data).
+# sync_scores() uses this list when the caller supplies no models. IDs without
+# a "/" are only looked up in BASELINE_SCORES; "org/name" IDs are fetched from
+# the HF API first and fall back to BASELINE_SCORES when no scores are found.
+# NOTE(review): "mistralai/Mistral-Large-Latest" looks like an API alias rather
+# than a Hub repository ID — confirm that it resolves on huggingface.co.
+TRACKED_MODELS: list[str] = [
+    "gpt-4.1",
+    "gpt-4.1-mini",
+    "gpt-4o",
+    "gpt-4o-mini",
+    "claude-sonnet-4-20250514",
+    "claude-3.5-sonnet",
+    "gemini-2.5-pro",
+    "gemini-2.5-flash",
+    "Qwen/Qwen2.5-Coder-32B-Instruct",
+    "Qwen/Qwen2.5-Coder-7B-Instruct",
+    "mistralai/Mistral-Large-Latest",
+    "meta-llama/Llama-3.3-70B-Instruct",
+    "deepseek-ai/DeepSeek-V3",
+]
+# File name of the score cache kept in the project's ".specsmith" directory
+SCORES_FILENAME = "model_scores.json"
+def _scores_path(project_dir: str | Path = ".") -> Path:
+    """Return the absolute path of the score cache file for *project_dir*."""
+    project_root = Path(project_dir).resolve()
+    return project_root.joinpath(".specsmith", SCORES_FILENAME)
+def load_cached_scores(project_dir: str | Path = ".") -> dict[str, Any]:
+    """Load cached model scores from disk.
+    Returns the parsed top-level JSON object of the cache file, or an empty
+    dict when the file is missing, unreadable, not valid JSON, or when its
+    top-level value is not a JSON object.
+    """
+    path = _scores_path(project_dir)
+    if not path.is_file():
+        return {}
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # Unreadable file, bad encoding or malformed JSON: treat as "no cache".
+        return {}
+    # A corrupted or hand-edited cache may hold a list/string/number. Callers
+    # use dict methods on the result, so anything else counts as "no cache".
+    if not isinstance(payload, dict):
+        return {}
+    return payload
+def save_scores(scores: dict[str, Any], project_dir: str | Path = ".") -> None:
+    """Write *scores* to the score cache file, stamped with the current UTC time.
+    The parent directory is created when it does not exist yet. The file
+    holds a JSON object with the keys "synced_at" and "models".
+    """
+    target = _scores_path(project_dir)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+    payload = {"synced_at": stamp, "models": scores}
+    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
+    target.write_text(serialized, encoding="utf-8")
+def fetch_hf_model_info(model_id: str, timeout: int = 10) -> dict[str, Any]:
+    """Fetch model metadata from HuggingFace API.
+    Args:
+        model_id: Repository ID of the model, e.g. "org/name".
+        timeout: Timeout in seconds passed to `urlopen`.
+    Returns:
+        The decoded JSON object. On any failure (network or HTTP error,
+        invalid JSON, non-object payload) returns an empty dict.
+    """
+    from urllib.parse import quote
+    try:
+        # Percent-encode the ID but keep the "org/name" separator, so that
+        # characters such as spaces, "?" or "#" cannot change the request
+        # path or smuggle in a query string.
+        url = f"{HF_API_BASE}/models/{quote(model_id, safe='/')}"
+        req = urllib.request.Request(url, headers={"Accept": "application/json"})
+        with urllib.request.urlopen(req, timeout=timeout) as resp:  # noqa: S310
+            payload = json.loads(resp.read())
+    except Exception:  # noqa: BLE001
+        # Deliberately best-effort: an empty result signals "no data".
+        return {}
+    # Callers use dict methods on the result; reject lists, strings, etc.
+    return payload if isinstance(payload, dict) else {}
+def _extract_benchmark_scores(model_info: dict[str, Any]) -> dict[str, float]:
+    """Extract benchmark scores from HF model card metadata.
+    Collects result entries from `cardData["eval_results"]` and from the
+    `results` lists under `cardData["model-index"]` (the layout used by
+    model card metadata on the Hub). Each entry contributes one score per
+    metric that has a name (or, failing that, a type) and a numeric value.
+    Returns a dict of benchmark_name -> score, where benchmark_name is
+    "<dataset name>/<metric name>" or just the metric name when the dataset
+    has no name. Malformed entries are skipped instead of raising.
+    """
+    card_data = model_info.get("cardData")
+    if not isinstance(card_data, dict):
+        # Missing, null or unexpectedly typed card data: nothing to extract.
+        return {}
+    entries: list[Any] = []
+    eval_results = card_data.get("eval_results")
+    if isinstance(eval_results, list):
+        entries.extend(eval_results)
+    model_index = card_data.get("model-index")
+    if isinstance(model_index, list):
+        for model_entry in model_index:
+            results = model_entry.get("results") if isinstance(model_entry, dict) else None
+            if isinstance(results, list):
+                entries.extend(results)
+    scores: dict[str, float] = {}
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        dataset = entry.get("dataset", {})
+        # A null dataset must not turn into the literal name "None".
+        name = dataset.get("name", "") if isinstance(dataset, dict) else str(dataset or "")
+        metrics = entry.get("metrics")
+        if not isinstance(metrics, list):
+            continue
+        for metric in metrics:
+            if not isinstance(metric, dict):
+                continue
+            # model-index metrics always carry "type"; "name" is optional.
+            metric_name = metric.get("name") or metric.get("type") or ""
+            value = metric.get("value")
+            if not metric_name or value is None:
+                continue
+            try:
+                score = float(value)
+            except (TypeError, ValueError):
+                continue
+            key = f"{name}/{metric_name}" if name else str(metric_name)
+            scores[key] = score
+    return scores
+def sync_scores(
+    project_dir: str | Path = ".",
+    models: list[str] | None = None,
+    timeout: int = 10,
+) -> dict[str, Any]:
+    """Sync model scores from HuggingFace and persist them.
+    Model IDs containing "/" are looked up via the HF API; when that yields
+    benchmark scores they are used as-is. Every other case (ID without "/",
+    failed fetch, no scores on the card) falls back to the entry in
+    BASELINE_SCORES, stored under the key "baseline_composite". Models with
+    neither are left out. The result is written with `save_scores()`.
+    Args:
+        project_dir: Project root that holds the ".specsmith" directory.
+        models: Model IDs to sync; TRACKED_MODELS when None or empty.
+        timeout: Per-request timeout in seconds.
+    Returns:
+        Dict of model_id -> {benchmark: score}.
+    """
+    from specsmith.agent.model_intelligence import BASELINE_SCORES
+    collected: dict[str, Any] = {}
+    for model_id in models or TRACKED_MODELS:
+        fetched: dict[str, float] = {}
+        # Only "org/name" IDs are queried; bare names skip the network call.
+        if "/" in model_id:
+            info = fetch_hf_model_info(model_id, timeout=timeout)
+            if info:
+                fetched = _extract_benchmark_scores(info)
+        if fetched:
+            collected[model_id] = fetched
+            continue
+        # No real data available: use the curated baseline, if there is one.
+        baseline = BASELINE_SCORES.get(model_id)
+        if baseline:
+            collected[model_id] = {"baseline_composite": baseline}
+    save_scores(collected, project_dir)
+    return collected
+def is_stale(project_dir: str | Path = ".", max_age_hours: int = 24) -> bool:
+    """Check if cached scores are older than max_age_hours.
+    Returns True when the cache has no usable "synced_at" timestamp (missing
+    cache, missing or non-string value, unparseable text) or when the
+    timestamp is more than *max_age_hours* hours in the past.
+    """
+    from datetime import datetime, timezone
+    cached = load_cached_scores(project_dir)
+    synced_at = cached.get("synced_at", "") if isinstance(cached, dict) else ""
+    # A hand-edited or corrupted cache may hold a number/null here; calling
+    # .replace() on it would raise AttributeError, so treat it as stale.
+    if not isinstance(synced_at, str) or not synced_at:
+        return True
+    try:
+        synced = datetime.fromisoformat(synced_at.replace("Z", "+00:00"))
+        # Raises TypeError when the stored stamp carries no UTC offset.
+        age = datetime.now(timezone.utc) - synced
+        return age.total_seconds() > max_age_hours * 3600
+    except (ValueError, TypeError):
+        return True
+# Names exported by `from specsmith.agent.hf_sync import *`. TRACKED_MODELS,
+# HF_API_BASE and the underscore-prefixed helpers are not part of this list.
+__all__ = [
+    "fetch_hf_model_info",
+    "is_stale",
+    "load_cached_scores",
+    "save_scores",
+    "sync_scores",
+]

specsmith-0.10.1.dev292/src/specsmith/agent/spawner.py ADDED Viewed

@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
+"""Sub-agent spawner — spawn isolated agent workers with tool subsets.
+ARCHITECTURE.md §13 Phase 2: Multi-Agent Layer.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+@dataclass
+class SpawnedAgent:
+    """Record describing one spawned sub-agent: identity, tools and outcome."""
+    id: str
+    role: str
+    tools: list[str]
+    status: str = "idle"  # idle, running, completed, failed
+    result: dict[str, Any] = field(default_factory=dict)
+    def to_dict(self) -> dict[str, Any]:
+        """Return the fields as a plain dict (values are not copied)."""
+        exported = ("id", "role", "tools", "status", "result")
+        return {key: getattr(self, key) for key in exported}
+# Tool subsets for different agent roles. SubAgentSpawner.spawn() uses the
+# entry for a role when the caller passes no tools; a role without an entry
+# gets an empty tool list.
+# NOTE(review): "reviewer" and "tester" include "run_shell"; whether that tool
+# can modify files is not visible here — confirm it fits the isolation goal.
+ROLE_TOOLS: dict[str, list[str]] = {
+    "coder": ["read_file", "write_file", "run_shell", "apply_diff"],
+    "reviewer": ["read_file", "run_shell", "git_diff"],
+    "tester": ["read_file", "run_shell", "run_tests"],
+    "architect": ["read_file", "write_file"],
+    "researcher": ["read_file", "search_web", "search_repo"],
+}
+class SubAgentSpawner:
+    """Spawn and manage isolated agent workers.
+    Each spawned agent gets a restricted tool subset based on its role,
+    preventing accidental cross-domain actions (e.g., a reviewer can't
+    write files). Agents are kept in memory, keyed by their generated ID,
+    for the lifetime of the spawner instance.
+    """
+    def __init__(self) -> None:
+        self._agents: dict[str, SpawnedAgent] = {}
+        self._counter = 0
+    def spawn(self, role: str, tools: list[str] | None = None) -> SpawnedAgent:
+        """Spawn a new sub-agent with the given role and tool set.
+        Args:
+            role: Role name; also used to look up default tools in ROLE_TOOLS.
+            tools: Explicit tool names. When None or empty, the role's
+                defaults are used (an empty list for unknown roles).
+        Returns:
+            The newly registered agent, with ID "agent-<role>-<NNN>".
+        """
+        self._counter += 1
+        agent_id = f"agent-{role}-{self._counter:03d}"
+        # NOTE(review): an explicit empty list is falsy, so it also selects
+        # the role defaults — confirm that "no tools" is not a wanted case.
+        effective_tools = tools or ROLE_TOOLS.get(role, [])
+        # Give every agent its own list: sharing the ROLE_TOOLS entry (or the
+        # caller's list) would let a change to agent.tools leak into the
+        # defaults of all agents spawned afterwards.
+        agent = SpawnedAgent(id=agent_id, role=role, tools=list(effective_tools))
+        self._agents[agent_id] = agent
+        return agent
+    def get(self, agent_id: str) -> SpawnedAgent | None:
+        """Get a spawned agent by ID, or None when the ID is unknown."""
+        return self._agents.get(agent_id)
+    def list_active(self) -> list[SpawnedAgent]:
+        """List all agents that are not completed/failed."""
+        return [a for a in self._agents.values() if a.status in ("idle", "running")]
+    def list_all(self) -> list[SpawnedAgent]:
+        """List all spawned agents."""
+        return list(self._agents.values())
+    def complete(self, agent_id: str, result: dict[str, Any]) -> None:
+        """Mark an agent as completed with its result; unknown IDs are ignored."""
+        agent = self._agents.get(agent_id)
+        if agent:
+            agent.status = "completed"
+            agent.result = result
+    def fail(self, agent_id: str, error: str) -> None:
+        """Mark an agent as failed, storing *error*; unknown IDs are ignored."""
+        agent = self._agents.get(agent_id)
+        if agent:
+            agent.status = "failed"
+            agent.result = {"error": error}
+# Names exported by `from specsmith.agent.spawner import *`
+__all__ = ["ROLE_TOOLS", "SpawnedAgent", "SubAgentSpawner"]

specsmith-0.10.1.dev292/src/specsmith/agent/teams.py ADDED Viewed

@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
+"""Team definitions for multi-agent coordination.
+ARCHITECTURE.md §13 Phase 2: predefined agent team compositions.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+@dataclass
+class TeamMember:
+    """A role slot within a team."""
+    # Role name of the slot, e.g. "coder" or "reviewer"
+    role: str
+    # NOTE(review): presumably tells whether the team needs this slot filled;
+    # nothing in this module reads it apart from TeamDefinition.to_dict()
+    required: bool = True
+    # NOTE(review): presumably replaces the role's default tool list when
+    # set; it is not read anywhere in this module — confirm against callers
+    tools_override: list[str] | None = None
+@dataclass
+class TeamDefinition:
+    """Named group of agent role slots intended to work together."""
+    id: str
+    name: str
+    description: str
+    members: list[TeamMember] = field(default_factory=list)
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain-dict view of the team.
+        Members are reduced to their "role" and "required" values; a
+        member's tools_override is not part of the output.
+        """
+        member_rows = []
+        for member in self.members:
+            member_rows.append({"role": member.role, "required": member.required})
+        summary: dict[str, Any] = {
+            "id": self.id,
+            "name": self.name,
+            "description": self.description,
+        }
+        summary["members"] = member_rows
+        return summary
+# Pre-defined teams. Every member below uses the TeamMember defaults
+# (required=True, no tools_override).
+# NOTE(review): the roles "ip-analyst", "strategist" and "drafter" have no
+# entry in ROLE_TOOLS (agent/spawner.py), so SubAgentSpawner.spawn() would give
+# them an empty tool list unless tools are passed explicitly — confirm.
+# Two-member team: one coder and one reviewer
+PAIR_REVIEW = TeamDefinition(
+    id="pair-review",
+    name="Pair Review",
+    description="Coder + Reviewer pair for code changes with built-in review",
+    members=[
+        TeamMember(role="coder"),
+        TeamMember(role="reviewer"),
+    ],
+)
+# Three-member team: architect, coder and tester
+FULL_STACK = TeamDefinition(
+    id="full-stack",
+    name="Full Stack",
+    description="Architect + Coder + Tester trio for complete feature development",
+    members=[
+        TeamMember(role="architect"),
+        TeamMember(role="coder"),
+        TeamMember(role="tester"),
+    ],
+)
+# Three-member team for patent work: IP analyst, researcher and strategist
+IP_ANALYSIS = TeamDefinition(
+    id="ip-analysis",
+    name="IP Analysis",
+    description="IP Analyst + Researcher + Strategist for patent work",
+    members=[
+        TeamMember(role="ip-analyst"),
+        TeamMember(role="researcher"),
+        TeamMember(role="strategist"),
+    ],
+)
+# Three-member team for specification writing: architect, drafter and reviewer
+SPEC_DRAFT = TeamDefinition(
+    id="spec-draft",
+    name="Specification Drafting",
+    description="Architect + Drafter + Reviewer for specification writing",
+    members=[
+        TeamMember(role="architect"),
+        TeamMember(role="drafter"),
+        TeamMember(role="reviewer"),
+    ],
+)
+# Registry used by get_team() and list_teams(); each key repeats the `id` of
+# the team it maps to, so both must be kept in sync when a team is added.
+BUILTIN_TEAMS: dict[str, TeamDefinition] = {
+    "pair-review": PAIR_REVIEW,
+    "full-stack": FULL_STACK,
+    "ip-analysis": IP_ANALYSIS,
+    "spec-draft": SPEC_DRAFT,
+}
+def get_team(team_id: str) -> TeamDefinition | None:
+    """Look up a built-in team by its ID; returns None for an unknown ID."""
+    if team_id in BUILTIN_TEAMS:
+        return BUILTIN_TEAMS[team_id]
+    return None
+def list_teams() -> list[TeamDefinition]:
+    """Return every built-in team as a new list, in registry order."""
+    return [team for team in BUILTIN_TEAMS.values()]
+# Names exported by `from specsmith.agent.teams import *`; the individual team
+# constants (PAIR_REVIEW, ...) are reachable through BUILTIN_TEAMS.
+__all__ = ["BUILTIN_TEAMS", "TeamDefinition", "TeamMember", "get_team", "list_teams"]

specsmith-0.10.1.dev292/src/specsmith/eval/__init__.py ADDED Viewed

@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
+"""Eval-Driven Development framework (ARCHITECTURE.md §13 Phase 1).
+Provides structured eval suites that test AI model capabilities against
+concrete tasks. Used for model intelligence scoring, regression testing,
+and provider qualification.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+@dataclass
+class EvalCase:
+    """A single evaluation case — one prompt + expected behavior."""
+    id: str
+    name: str
+    role: str  # which agent role this tests
+    prompt: str
+    expected_keywords: list[str] = field(default_factory=list)
+    max_tokens: int = 1024
+    timeout_seconds: int = 30
+    tags: list[str] = field(default_factory=list)
+    def to_dict(self) -> dict[str, Any]:
+        """Return all fields of the case as a plain dict.
+        Includes `timeout_seconds`, so a case with a non-default timeout
+        does not lose that setting when it is serialized.
+        """
+        return {
+            "id": self.id,
+            "name": self.name,
+            "role": self.role,
+            "prompt": self.prompt,
+            "expected_keywords": self.expected_keywords,
+            "max_tokens": self.max_tokens,
+            "timeout_seconds": self.timeout_seconds,
+            "tags": self.tags,
+        }
+@dataclass
+class EvalResult:
+    """Outcome of one eval case run against one model/provider."""
+    case_id: str
+    passed: bool
+    score: float  # 0.0–1.0
+    latency_ms: float
+    model: str
+    provider: str
+    output_preview: str = ""
+    error: str = ""
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain-dict view of the result.
+        The score is rounded to 3 decimals, the latency to 1 decimal, and
+        the output preview is cut to its first 200 characters.
+        """
+        rounded_score = round(self.score, 3)
+        rounded_latency = round(self.latency_ms, 1)
+        preview = self.output_preview[:200]
+        return dict(
+            case_id=self.case_id,
+            passed=self.passed,
+            score=rounded_score,
+            latency_ms=rounded_latency,
+            model=self.model,
+            provider=self.provider,
+            output_preview=preview,
+            error=self.error,
+        )
+@dataclass
+class EvalSuite:
+    """Named group of eval cases, with optional tags."""
+    id: str
+    name: str
+    description: str
+    cases: list[EvalCase] = field(default_factory=list)
+    tags: list[str] = field(default_factory=list)
+    def to_dict(self) -> dict[str, Any]:
+        """Return a summary dict: the cases appear only as "case_count"."""
+        summary: dict[str, Any] = {}
+        summary["id"] = self.id
+        summary["name"] = self.name
+        summary["description"] = self.description
+        summary["case_count"] = len(self.cases)
+        summary["tags"] = self.tags
+        return summary
+@dataclass
+class EvalReport:
+    """Totals and per-case results of one eval suite run."""
+    suite_id: str
+    total: int
+    passed: int
+    failed: int
+    avg_score: float
+    avg_latency_ms: float
+    results: list[EvalResult] = field(default_factory=list)
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain-dict view of the report.
+        The average score is rounded to 3 decimals, the average latency to
+        1 decimal, and each result is converted with its own `to_dict()`.
+        """
+        report: dict[str, Any] = {
+            "suite_id": self.suite_id,
+            "total": self.total,
+            "passed": self.passed,
+            "failed": self.failed,
+        }
+        report["avg_score"] = round(self.avg_score, 3)
+        report["avg_latency_ms"] = round(self.avg_latency_ms, 1)
+        report["results"] = list(map(lambda item: item.to_dict(), self.results))
+        return report
+# Names exported by `from specsmith.eval import *`
+__all__ = ["EvalCase", "EvalReport", "EvalResult", "EvalSuite"]

specsmith-0.10.1.dev292/src/specsmith/eval/builtins.py ADDED Viewed

@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
+"""Built-in eval suites for core agent capabilities."""
+from __future__ import annotations
+from specsmith.eval import EvalCase, EvalSuite
+# Eval case for the "coder" role: asks for a typed, documented Fibonacci
+# function. NOTE(review): expected_keywords are presumably matched against the
+# model output by the eval runner, which is not part of this module.
+CODE_GEN = EvalCase(
+    id="eval-code-gen-001",
+    name="Python function generation",
+    role="coder",
+    prompt=(
+        "Write a Python function `fibonacci(n: int) -> list[int]` that returns "
+        "the first n Fibonacci numbers. Include type hints and a docstring."
+    ),
+    expected_keywords=["def fibonacci", "list[int]", "return"],
+    tags=["code", "python"],
+)
+# Eval case for the "architect" role: asks for risks and alternatives to a
+# single-SQLite design for a multi-tenant SaaS with 10,000 concurrent users.
+ARCHITECTURE_REVIEW = EvalCase(
+    id="eval-arch-001",
+    name="Architecture review",
+    role="architect",
+    prompt=(
+        "Review this architecture decision: 'We will use a monolithic SQLite database "
+        "for a multi-tenant SaaS application serving 10,000 concurrent users.' "
+        "Identify risks and suggest alternatives."
+    ),
+    expected_keywords=["scalability", "concurrent", "alternative"],
+    tags=["architecture", "review"],
+)
+# Eval case for the "tester" role: asks for pytest tests of a two-integer
+# `add` function, including negative, zero and large-value edge cases.
+TEST_GEN = EvalCase(
+    id="eval-test-gen-001",
+    name="Test generation",
+    role="tester",
+    prompt=(
+        "Write pytest tests for a function `add(a: int, b: int) -> int` that adds "
+        "two integers. Cover edge cases: negative numbers, zero, large values."
+    ),
+    expected_keywords=["def test_", "assert", "add"],
+    tags=["test", "pytest"],
+)
+# Eval case for the "ip-analyst" role: asks for the key limitations of a
+# sorting-method patent claim and for potential design-arounds.
+PATENT_CLAIM = EvalCase(
+    id="eval-patent-001",
+    name="Patent claim analysis",
+    role="ip-analyst",
+    prompt=(
+        "Analyze this patent claim: 'A method for sorting data records comprising: "
+        "receiving a dataset, applying a comparison function, and outputting sorted "
+        "records in ascending order.' Identify the key limitations and suggest "
+        "potential design-arounds."
+    ),
+    expected_keywords=["limitation", "claim", "design"],
+    tags=["patent", "ip"],
+)
+# Eval case for the "classifier" role: a variable-rename request that is
+# expected to be labelled "refactor". max_tokens is lowered to 128 (the
+# EvalCase default is 1024).
+INTENT_CLASSIFY = EvalCase(
+    id="eval-classify-001",
+    name="Intent classification",
+    role="classifier",
+    prompt=(
+        "Classify the following user request into one of these categories: "
+        "[code_change, bug_fix, documentation, question, refactor]. "
+        "Request: 'Can you rename the variable foo to bar in utils.py?'"
+    ),
+    expected_keywords=["refactor"],
+    max_tokens=128,
+    tags=["classify", "intent"],
+)
+# Pre-built suites. CORE_SUITE bundles the five cases defined above, one each
+# for the roles coder, architect, tester, ip-analyst and classifier.
+CORE_SUITE = EvalSuite(
+    id="core",
+    name="Core Capabilities",
+    description="Tests fundamental AI capabilities across 5 roles",
+    cases=[CODE_GEN, ARCHITECTURE_REVIEW, TEST_GEN, PATENT_CLAIM, INTENT_CLASSIFY],
+    tags=["core", "smoke"],
+)
+# Registry used by get_suite() and list_suites(); each key repeats the `id` of
+# the suite it maps to.
+ALL_SUITES: dict[str, EvalSuite] = {
+    "core": CORE_SUITE,
+}
+def get_suite(suite_id: str) -> EvalSuite | None:
+    """Look up a built-in suite by its ID; returns None for an unknown ID."""
+    if suite_id in ALL_SUITES:
+        return ALL_SUITES[suite_id]
+    return None
+def list_suites() -> list[EvalSuite]:
+    """Return every built-in suite as a new list, in registry order."""
+    return [suite for suite in ALL_SUITES.values()]

specsmith 0.10.1.dev287__tar.gz → 0.10.1.dev292__tar.gz

specsmith 0.10.1.dev287__tar.gz → 0.10.1.dev292__tar.gz