sandboxy 0.0.3__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. {sandboxy-0.0.3 → sandboxy-0.0.5}/PKG-INFO +103 -27
  2. {sandboxy-0.0.3 → sandboxy-0.0.5}/README.md +100 -26
  3. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/components/ModelSelector.tsx +66 -17
  4. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/hooks/useScenarioRun.ts +21 -4
  5. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/lib/api.ts +97 -2
  6. sandboxy-0.0.5/local-ui/src/pages/DashboardPage.tsx +416 -0
  7. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/RunPage.tsx +110 -4
  8. {sandboxy-0.0.3 → sandboxy-0.0.5}/pyproject.toml +6 -1
  9. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/agents/llm_prompt.py +85 -14
  10. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/app.py +2 -1
  11. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/routes/local.py +216 -20
  12. sandboxy-0.0.5/sandboxy/api/routes/providers.py +369 -0
  13. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/cli/main.py +663 -31
  14. sandboxy-0.0.5/sandboxy/mlflow/__init__.py +38 -0
  15. sandboxy-0.0.5/sandboxy/mlflow/artifacts.py +184 -0
  16. sandboxy-0.0.5/sandboxy/mlflow/config.py +90 -0
  17. sandboxy-0.0.5/sandboxy/mlflow/exporter.py +445 -0
  18. sandboxy-0.0.5/sandboxy/mlflow/metrics.py +115 -0
  19. sandboxy-0.0.5/sandboxy/mlflow/tags.py +140 -0
  20. sandboxy-0.0.5/sandboxy/mlflow/tracing.py +126 -0
  21. sandboxy-0.0.5/sandboxy/providers/__init__.py +68 -0
  22. sandboxy-0.0.5/sandboxy/providers/config.py +243 -0
  23. sandboxy-0.0.5/sandboxy/providers/local.py +498 -0
  24. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/registry.py +107 -13
  25. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/loader.py +44 -2
  26. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/runner.py +57 -2
  27. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/unified.py +27 -3
  28. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/tools/yaml_tools.py +18 -0
  29. sandboxy-0.0.5/sandboxy/ui/dist/assets/index-CLxxjJuD.js +367 -0
  30. sandboxy-0.0.5/sandboxy/ui/dist/assets/index-DBB7ehs6.css +1 -0
  31. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/ui/dist/index.html +2 -2
  32. sandboxy-0.0.5/tests/integration/test_mlflow_integration.py +245 -0
  33. sandboxy-0.0.5/tests/unit/mlflow/__init__.py +1 -0
  34. sandboxy-0.0.5/tests/unit/mlflow/test_artifacts.py +206 -0
  35. sandboxy-0.0.5/tests/unit/mlflow/test_config.py +127 -0
  36. sandboxy-0.0.5/tests/unit/mlflow/test_metrics.py +131 -0
  37. sandboxy-0.0.5/tests/unit/mlflow/test_tags.py +209 -0
  38. sandboxy-0.0.3/local-ui/src/pages/DashboardPage.tsx +0 -163
  39. sandboxy-0.0.3/sandboxy/providers/__init__.py +0 -34
  40. sandboxy-0.0.3/sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
  41. sandboxy-0.0.3/sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
  42. {sandboxy-0.0.3 → sandboxy-0.0.5}/.env.example +0 -0
  43. {sandboxy-0.0.3 → sandboxy-0.0.5}/.github/workflows/ci.yml +0 -0
  44. {sandboxy-0.0.3 → sandboxy-0.0.5}/.github/workflows/publish.yml +0 -0
  45. {sandboxy-0.0.3 → sandboxy-0.0.5}/.gitignore +0 -0
  46. {sandboxy-0.0.3 → sandboxy-0.0.5}/CONTRIBUTING.md +0 -0
  47. {sandboxy-0.0.3 → sandboxy-0.0.5}/LICENSE +0 -0
  48. {sandboxy-0.0.3 → sandboxy-0.0.5}/Makefile +0 -0
  49. {sandboxy-0.0.3 → sandboxy-0.0.5}/docs/yaml-tools.md +0 -0
  50. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/index.html +0 -0
  51. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/package-lock.json +0 -0
  52. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/package.json +0 -0
  53. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/postcss.config.js +0 -0
  54. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/App.tsx +0 -0
  55. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/components/Layout.tsx +0 -0
  56. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/components/ResultDisplay.tsx +0 -0
  57. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/hooks/useScenarioBuilder.ts +0 -0
  58. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/hooks/useToolBuilder.ts +0 -0
  59. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/index.css +0 -0
  60. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/main.tsx +0 -0
  61. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/BuilderPage.tsx +0 -0
  62. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/DatasetPage.tsx +0 -0
  63. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/ResultsPage.tsx +0 -0
  64. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/ToolBuilderPage.tsx +0 -0
  65. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/tailwind.config.js +0 -0
  66. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/tsconfig.json +0 -0
  67. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/tsconfig.node.json +0 -0
  68. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/vite.config.ts +0 -0
  69. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/__init__.py +0 -0
  70. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/agents/__init__.py +0 -0
  71. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/agents/base.py +0 -0
  72. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/agents/loader.py +0 -0
  73. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/__init__.py +0 -0
  74. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/routes/__init__.py +0 -0
  75. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/routes/agents.py +0 -0
  76. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/routes/tools.py +0 -0
  77. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/cli/__init__.py +0 -0
  78. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/cli/type_detector.py +0 -0
  79. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/config.py +0 -0
  80. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/__init__.py +0 -0
  81. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/async_runner.py +0 -0
  82. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/mdl_parser.py +0 -0
  83. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/runner.py +0 -0
  84. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/safe_eval.py +0 -0
  85. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/state.py +0 -0
  86. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/datasets/__init__.py +0 -0
  87. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/datasets/loader.py +0 -0
  88. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/datasets/runner.py +0 -0
  89. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/errors.py +0 -0
  90. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/local/context.py +0 -0
  91. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/local/results.py +0 -0
  92. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/logging.py +0 -0
  93. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/mcp/__init__.py +0 -0
  94. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/mcp/client.py +0 -0
  95. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/mcp/wrapper.py +0 -0
  96. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/anthropic_provider.py +0 -0
  97. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/base.py +0 -0
  98. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/http_client.py +0 -0
  99. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/openai_provider.py +0 -0
  100. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/openrouter.py +0 -0
  101. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/__init__.py +0 -0
  102. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/comparison.py +0 -0
  103. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/session/__init__.py +0 -0
  104. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/session/manager.py +0 -0
  105. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/tools/__init__.py +0 -0
  106. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/tools/base.py +0 -0
  107. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/tools/loader.py +0 -0
  108. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/ui/__init__.py +0 -0
  109. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/utils/__init__.py +0 -0
  110. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/utils/time.py +0 -0
  111. {sandboxy-0.0.3 → sandboxy-0.0.5}/scenarios/customer_service.yml +0 -0
  112. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/__init__.py +0 -0
  113. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/conftest.py +0 -0
  114. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/factories.py +0 -0
  115. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/integration/__init__.py +0 -0
  116. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/integration/api/__init__.py +0 -0
  117. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/mocks/__init__.py +0 -0
  118. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/mocks/providers.py +0 -0
  119. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/__init__.py +0 -0
  120. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/agents/__init__.py +0 -0
  121. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/agents/test_base.py +0 -0
  122. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/agents/test_llm_prompt.py +0 -0
  123. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/agents/test_loader.py +0 -0
  124. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/__init__.py +0 -0
  125. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_async_runner.py +0 -0
  126. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_mdl_parser.py +0 -0
  127. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_runner.py +0 -0
  128. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_safe_eval.py +0 -0
  129. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_state.py +0 -0
  130. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/providers/test_openrouter.py +0 -0
  131. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/tools/__init__.py +0 -0
  132. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/tools/test_base.py +0 -0
  133. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/tools/test_loader.py +0 -0
  134. {sandboxy-0.0.3 → sandboxy-0.0.5}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sandboxy
3
- Version: 0.0.3
3
+ Version: 0.0.5
4
4
  Summary: Open-source agent simulation and benchmarking platform
5
5
  Project-URL: Homepage, https://github.com/sandboxy-ai/sandboxy
6
6
  Project-URL: Repository, https://github.com/sandboxy-ai/sandboxy
@@ -39,6 +39,8 @@ Requires-Dist: pytest-xdist>=3.5.0; extra == 'dev'
39
39
  Requires-Dist: pytest>=8.0; extra == 'dev'
40
40
  Requires-Dist: respx>=0.21.0; extra == 'dev'
41
41
  Requires-Dist: ruff>=0.1; extra == 'dev'
42
+ Provides-Extra: mlflow
43
+ Requires-Dist: mlflow>=3.0; extra == 'mlflow'
42
44
  Description-Content-Type: text/markdown
43
45
 
44
46
  # Sandboxy
@@ -118,7 +120,37 @@ Opens a browser with a local UI for browsing scenarios, running them, and viewin
118
120
 
119
121
  ## Writing Scenarios
120
122
 
121
- Scenarios are YAML files that define agent interactions:
123
+ Scenarios are YAML files that define agent interactions. Sandboxy supports two modes:
124
+
125
+ ### Single-turn mode
126
+
127
+ Use `prompt:` for simple request/response scenarios without tool use:
128
+
129
+ ```yaml
130
+ id: simple-qa
131
+ name: "Simple Q&A"
132
+
133
+ system_prompt: |
134
+ You are a helpful assistant.
135
+
136
+ prompt: |
137
+ What is the capital of France?
138
+
139
+ evaluation:
140
+ max_score: 100
141
+ goals:
142
+ - id: correct_answer
143
+ name: "Correct Answer"
144
+ points: 100
145
+ detection:
146
+ type: agent_contains
147
+ patterns:
148
+ - "Paris"
149
+ ```
150
+
151
+ ### Agentic mode
152
+
153
+ Use `steps:` for multi-turn scenarios with tool support:
122
154
 
123
155
  ```yaml
124
156
  id: customer-support
@@ -129,35 +161,45 @@ system_prompt: |
129
161
  You are a customer support agent for TechCo.
130
162
  Be helpful but follow company policy.
131
163
 
132
- user_prompt: |
133
- I want a refund for my purchase. Order #12345.
164
+ steps:
165
+ - id: user_request
166
+ action: inject_user
167
+ params:
168
+ content: "I want a refund for my purchase. Order #12345."
169
+ - id: agent_response
170
+ action: await_agent
134
171
 
135
- # Define tools the agent can use
172
+ # Tools are only available in agentic mode (with steps)
136
173
  tools:
137
- - name: lookup_order
174
+ lookup_order:
138
175
  description: "Look up order details"
139
- params:
140
- order_id:
141
- type: string
142
- required: true
143
- returns: "Order details for {{order_id}}"
144
-
145
- # Evaluation criteria
146
- goals:
147
- - name: acknowledged_request
148
- description: "Agent acknowledged the refund request"
149
- check:
150
- type: contains
151
- value: "refund"
152
-
153
- - name: looked_up_order
154
- description: "Agent used the lookup tool"
155
- check:
156
- type: tool_called
157
- tool: lookup_order
158
-
159
- scoring:
176
+ actions:
177
+ call:
178
+ params:
179
+ order_id:
180
+ type: string
181
+ required: true
182
+ returns: "Order details for {{order_id}}"
183
+
184
+ evaluation:
160
185
  max_score: 100
186
+ goals:
187
+ - id: acknowledged_request
188
+ name: "Acknowledged Request"
189
+ description: "Agent acknowledged the refund request"
190
+ points: 50
191
+ detection:
192
+ type: agent_contains
193
+ patterns:
194
+ - "refund"
195
+
196
+ - id: looked_up_order
197
+ name: "Looked Up Order"
198
+ description: "Agent used the lookup tool"
199
+ points: 50
200
+ detection:
201
+ type: tool_called
202
+ tool: lookup_order
161
203
  ```
162
204
 
163
205
  ## CLI Reference
@@ -204,6 +246,39 @@ sandboxy list-models --search claude
204
246
  sandboxy list-models --free
205
247
  ```
206
248
 
249
+ ## MLflow Integration
250
+
251
+ Export scenario run results to MLflow for experiment tracking and model comparison.
252
+
253
+ ```bash
254
+ # Install with MLflow support
255
+ pip install sandboxy[mlflow]
256
+
257
+ # Export run to MLflow
258
+ sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
259
+
260
+ # Custom experiment name
261
+ sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
262
+ ```
263
+
264
+ Or enable in scenario YAML:
265
+
266
+ ```yaml
267
+ id: my-scenario
268
+ name: "My Test"
269
+
270
+ mlflow:
271
+ enabled: true
272
+ experiment: "agent-evals"
273
+ tags:
274
+ team: "support"
275
+
276
+ system_prompt: |
277
+ ...
278
+ ```
279
+
280
+ See `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
281
+
207
282
  ## Configuration
208
283
 
209
284
  Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -213,6 +288,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
213
288
  | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
214
289
  | `OPENAI_API_KEY` | Direct OpenAI access |
215
290
  | `ANTHROPIC_API_KEY` | Direct Anthropic access |
291
+ | `MLFLOW_TRACKING_URI` | MLflow tracking server URI |
216
292
 
217
293
  ## Project Structure
218
294
 
@@ -75,7 +75,37 @@ Opens a browser with a local UI for browsing scenarios, running them, and viewin
75
75
 
76
76
  ## Writing Scenarios
77
77
 
78
- Scenarios are YAML files that define agent interactions:
78
+ Scenarios are YAML files that define agent interactions. Sandboxy supports two modes:
79
+
80
+ ### Single-turn mode
81
+
82
+ Use `prompt:` for simple request/response scenarios without tool use:
83
+
84
+ ```yaml
85
+ id: simple-qa
86
+ name: "Simple Q&A"
87
+
88
+ system_prompt: |
89
+ You are a helpful assistant.
90
+
91
+ prompt: |
92
+ What is the capital of France?
93
+
94
+ evaluation:
95
+ max_score: 100
96
+ goals:
97
+ - id: correct_answer
98
+ name: "Correct Answer"
99
+ points: 100
100
+ detection:
101
+ type: agent_contains
102
+ patterns:
103
+ - "Paris"
104
+ ```
105
+
106
+ ### Agentic mode
107
+
108
+ Use `steps:` for multi-turn scenarios with tool support:
79
109
 
80
110
  ```yaml
81
111
  id: customer-support
@@ -86,35 +116,45 @@ system_prompt: |
86
116
  You are a customer support agent for TechCo.
87
117
  Be helpful but follow company policy.
88
118
 
89
- user_prompt: |
90
- I want a refund for my purchase. Order #12345.
119
+ steps:
120
+ - id: user_request
121
+ action: inject_user
122
+ params:
123
+ content: "I want a refund for my purchase. Order #12345."
124
+ - id: agent_response
125
+ action: await_agent
91
126
 
92
- # Define tools the agent can use
127
+ # Tools are only available in agentic mode (with steps)
93
128
  tools:
94
- - name: lookup_order
129
+ lookup_order:
95
130
  description: "Look up order details"
96
- params:
97
- order_id:
98
- type: string
99
- required: true
100
- returns: "Order details for {{order_id}}"
101
-
102
- # Evaluation criteria
103
- goals:
104
- - name: acknowledged_request
105
- description: "Agent acknowledged the refund request"
106
- check:
107
- type: contains
108
- value: "refund"
109
-
110
- - name: looked_up_order
111
- description: "Agent used the lookup tool"
112
- check:
113
- type: tool_called
114
- tool: lookup_order
115
-
116
- scoring:
131
+ actions:
132
+ call:
133
+ params:
134
+ order_id:
135
+ type: string
136
+ required: true
137
+ returns: "Order details for {{order_id}}"
138
+
139
+ evaluation:
117
140
  max_score: 100
141
+ goals:
142
+ - id: acknowledged_request
143
+ name: "Acknowledged Request"
144
+ description: "Agent acknowledged the refund request"
145
+ points: 50
146
+ detection:
147
+ type: agent_contains
148
+ patterns:
149
+ - "refund"
150
+
151
+ - id: looked_up_order
152
+ name: "Looked Up Order"
153
+ description: "Agent used the lookup tool"
154
+ points: 50
155
+ detection:
156
+ type: tool_called
157
+ tool: lookup_order
118
158
  ```
119
159
 
120
160
  ## CLI Reference
@@ -161,6 +201,39 @@ sandboxy list-models --search claude
161
201
  sandboxy list-models --free
162
202
  ```
163
203
 
204
+ ## MLflow Integration
205
+
206
+ Export scenario run results to MLflow for experiment tracking and model comparison.
207
+
208
+ ```bash
209
+ # Install with MLflow support
210
+ pip install sandboxy[mlflow]
211
+
212
+ # Export run to MLflow
213
+ sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
214
+
215
+ # Custom experiment name
216
+ sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
217
+ ```
218
+
219
+ Or enable in scenario YAML:
220
+
221
+ ```yaml
222
+ id: my-scenario
223
+ name: "My Test"
224
+
225
+ mlflow:
226
+ enabled: true
227
+ experiment: "agent-evals"
228
+ tags:
229
+ team: "support"
230
+
231
+ system_prompt: |
232
+ ...
233
+ ```
234
+
235
+ See `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
236
+
164
237
  ## Configuration
165
238
 
166
239
  Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -170,6 +243,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
170
243
  | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
171
244
  | `OPENAI_API_KEY` | Direct OpenAI access |
172
245
  | `ANTHROPIC_API_KEY` | Direct Anthropic access |
246
+ | `MLFLOW_TRACKING_URI` | MLflow tracking server URI |
173
247
 
174
248
  ## Project Structure
175
249
 
@@ -1,7 +1,17 @@
1
1
  import { useState, useRef, useEffect } from 'react'
2
- import { ChevronDown, Check, X, Search } from 'lucide-react'
2
+ import { ChevronDown, Check, X, Search, Monitor } from 'lucide-react'
3
3
  import { ModelInfo } from '../lib/api'
4
4
 
5
+ // Badge component for local models
6
+ function LocalBadge() {
7
+ return (
8
+ <span className="inline-flex items-center gap-1 px-1.5 py-0.5 bg-emerald-500/20 border border-emerald-500/40 rounded text-xs text-emerald-400">
9
+ <Monitor size={10} />
10
+ Local
11
+ </span>
12
+ )
13
+ }
14
+
5
15
  interface ModelSelectorProps {
6
16
  models: ModelInfo[]
7
17
  value: string
@@ -43,16 +53,31 @@ export function ModelSelector({ models, value, onChange, disabled, placeholder =
43
53
 
44
54
  // Group models by provider
45
55
  const groupedModels = filteredModels.reduce((acc, model) => {
46
- const provider = model.id.split('/')[0] || 'other'
56
+ // Use provider_name for local models, otherwise extract from id
57
+ const provider = model.provider_name || model.id.split('/')[0] || 'other'
47
58
  if (!acc[provider]) acc[provider] = []
48
59
  acc[provider].push(model)
49
60
  return acc
50
61
  }, {} as Record<string, ModelInfo[]>)
51
62
 
52
- const providerOrder = ['openai', 'anthropic', 'google', 'x-ai', 'deepseek', 'meta-llama', 'mistralai', 'qwen', 'perplexity']
63
+ // Local providers first, then cloud providers in preferred order
64
+ const cloudProviderOrder = ['openai', 'anthropic', 'google', 'x-ai', 'deepseek', 'meta-llama', 'mistralai', 'qwen', 'perplexity']
65
+
66
+ // Check if a provider group has local models
67
+ const isLocalProvider = (provider: string) => {
68
+ return groupedModels[provider]?.some(m => m.is_local)
69
+ }
70
+
53
71
  const sortedProviders = Object.keys(groupedModels).sort((a, b) => {
54
- const aIdx = providerOrder.indexOf(a)
55
- const bIdx = providerOrder.indexOf(b)
72
+ // Local providers always come first
73
+ const aIsLocal = isLocalProvider(a)
74
+ const bIsLocal = isLocalProvider(b)
75
+ if (aIsLocal && !bIsLocal) return -1
76
+ if (!aIsLocal && bIsLocal) return 1
77
+
78
+ // Within same category, sort by preference
79
+ const aIdx = cloudProviderOrder.indexOf(a)
80
+ const bIdx = cloudProviderOrder.indexOf(b)
56
81
  if (aIdx === -1 && bIdx === -1) return a.localeCompare(b)
57
82
  if (aIdx === -1) return 1
58
83
  if (bIdx === -1) return -1
@@ -70,9 +95,10 @@ export function ModelSelector({ models, value, onChange, disabled, placeholder =
70
95
  } ${open ? 'ring-2 ring-orange-400' : ''}`}
71
96
  >
72
97
  {selectedModel ? (
73
- <div className="flex items-center justify-between flex-1 min-w-0">
98
+ <div className="flex items-center justify-between flex-1 min-w-0 gap-2">
74
99
  <span className="text-slate-100 truncate">{selectedModel.name}</span>
75
- <span className="text-xs text-slate-500 ml-2 shrink-0">{selectedModel.price}</span>
100
+ {selectedModel.is_local && <LocalBadge />}
101
+ <span className="text-xs text-slate-500 shrink-0">{selectedModel.price}</span>
76
102
  </div>
77
103
  ) : (
78
104
  <span className="text-slate-500">{placeholder}</span>
@@ -101,8 +127,9 @@ export function ModelSelector({ models, value, onChange, disabled, placeholder =
101
127
  <div className="overflow-y-auto flex-1">
102
128
  {sortedProviders.map(provider => (
103
129
  <div key={provider}>
104
- <div className="px-3 py-1.5 text-xs font-medium text-slate-500 uppercase bg-slate-900 sticky top-0">
130
+ <div className="px-3 py-1.5 text-xs font-medium text-slate-500 uppercase bg-slate-900 sticky top-0 flex items-center gap-2">
105
131
  {provider}
132
+ {isLocalProvider(provider) && <LocalBadge />}
106
133
  </div>
107
134
  {groupedModels[provider].map(model => (
108
135
  <button
@@ -119,8 +146,9 @@ export function ModelSelector({ models, value, onChange, disabled, placeholder =
119
146
  : 'hover:bg-slate-800 text-slate-100'
120
147
  }`}
121
148
  >
122
- <div className="flex-1 min-w-0">
149
+ <div className="flex-1 min-w-0 flex items-center gap-2">
123
150
  <div className="truncate">{model.name}</div>
151
+ {model.is_local && !isLocalProvider(provider) && <LocalBadge />}
124
152
  </div>
125
153
  <span className="text-xs text-slate-500 shrink-0">{model.price}</span>
126
154
  {model.id === value && <Check size={16} className="text-orange-400 shrink-0" />}
@@ -188,18 +216,32 @@ export function MultiModelSelector({ models, selected, onChange, disabled }: Mul
188
216
  m.id.toLowerCase().includes(search.toLowerCase())
189
217
  )
190
218
 
191
- // Group models by provider
219
+ // Group models by provider (use provider_name for local models)
192
220
  const groupedModels = filteredModels.reduce((acc, model) => {
193
- const provider = model.id.split('/')[0] || 'other'
221
+ const provider = model.provider_name || model.id.split('/')[0] || 'other'
194
222
  if (!acc[provider]) acc[provider] = []
195
223
  acc[provider].push(model)
196
224
  return acc
197
225
  }, {} as Record<string, ModelInfo[]>)
198
226
 
199
- const providerOrder = ['openai', 'anthropic', 'google', 'x-ai', 'deepseek', 'meta-llama', 'mistralai', 'qwen', 'perplexity']
227
+ // Local providers first, then cloud providers in preferred order
228
+ const cloudProviderOrder = ['openai', 'anthropic', 'google', 'x-ai', 'deepseek', 'meta-llama', 'mistralai', 'qwen', 'perplexity']
229
+
230
+ // Check if a provider group has local models
231
+ const isLocalProvider = (provider: string) => {
232
+ return groupedModels[provider]?.some(m => m.is_local)
233
+ }
234
+
200
235
  const sortedProviders = Object.keys(groupedModels).sort((a, b) => {
201
- const aIdx = providerOrder.indexOf(a)
202
- const bIdx = providerOrder.indexOf(b)
236
+ // Local providers always come first
237
+ const aIsLocal = isLocalProvider(a)
238
+ const bIsLocal = isLocalProvider(b)
239
+ if (aIsLocal && !bIsLocal) return -1
240
+ if (!aIsLocal && bIsLocal) return 1
241
+
242
+ // Within same category, sort by preference
243
+ const aIdx = cloudProviderOrder.indexOf(a)
244
+ const bIdx = cloudProviderOrder.indexOf(b)
203
245
  if (aIdx === -1 && bIdx === -1) return a.localeCompare(b)
204
246
  if (aIdx === -1) return 1
205
247
  if (bIdx === -1) return -1
@@ -216,8 +258,13 @@ export function MultiModelSelector({ models, selected, onChange, disabled }: Mul
216
258
  return (
217
259
  <span
218
260
  key={modelId}
219
- className="flex items-center gap-1.5 px-2.5 py-1 bg-orange-500/20 border border-orange-400/40 rounded-full text-sm text-slate-100"
261
+ className={`flex items-center gap-1.5 px-2.5 py-1 rounded-full text-sm text-slate-100 ${
262
+ model?.is_local
263
+ ? 'bg-emerald-500/20 border border-emerald-400/40'
264
+ : 'bg-orange-500/20 border border-orange-400/40'
265
+ }`}
220
266
  >
267
+ {model?.is_local && <Monitor size={12} className="text-emerald-400" />}
221
268
  {model?.name || modelId}
222
269
  <button
223
270
  type="button"
@@ -268,8 +315,9 @@ export function MultiModelSelector({ models, selected, onChange, disabled }: Mul
268
315
  <div className="overflow-y-auto flex-1">
269
316
  {sortedProviders.map(provider => (
270
317
  <div key={provider}>
271
- <div className="px-3 py-1.5 text-xs font-medium text-slate-500 uppercase bg-slate-900 sticky top-0">
318
+ <div className="px-3 py-1.5 text-xs font-medium text-slate-500 uppercase bg-slate-900 sticky top-0 flex items-center gap-2">
272
319
  {provider}
320
+ {isLocalProvider(provider) && <LocalBadge />}
273
321
  </div>
274
322
  {groupedModels[provider].map(model => {
275
323
  const isSelected = selected.includes(model.id)
@@ -289,8 +337,9 @@ export function MultiModelSelector({ models, selected, onChange, disabled }: Mul
289
337
  }`}>
290
338
  {isSelected && <Check size={12} className="text-slate-900" />}
291
339
  </div>
292
- <div className="flex-1 min-w-0">
340
+ <div className="flex-1 min-w-0 flex items-center gap-2">
293
341
  <div className="truncate">{model.name}</div>
342
+ {model.is_local && !isLocalProvider(provider) && <LocalBadge />}
294
343
  </div>
295
344
  <span className="text-xs text-slate-500 shrink-0">{model.price}</span>
296
345
  </button>
@@ -7,13 +7,20 @@ import { api, RunScenarioResponse, CompareModelsResponse } from '../lib/api'
7
7
 
8
8
  export type RunState = 'idle' | 'running' | 'completed' | 'error'
9
9
 
10
+ export interface MlflowOptions {
11
+ enabled: boolean
12
+ trackingUri?: string
13
+ experiment?: string
14
+ tracing?: boolean
15
+ }
16
+
10
17
  export interface UseScenarioRunResult {
11
18
  state: RunState
12
19
  result: RunScenarioResponse | null
13
20
  comparison: CompareModelsResponse | null
14
21
  error: string | null
15
- runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>) => Promise<void>
16
- compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>) => Promise<void>
22
+ runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
23
+ compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
17
24
  reset: () => void
18
25
  }
19
26
 
@@ -33,7 +40,8 @@ export function useScenarioRun(): UseScenarioRunResult {
33
40
  const runScenario = useCallback(async (
34
41
  scenarioId: string,
35
42
  model: string,
36
- variables?: Record<string, unknown>
43
+ variables?: Record<string, unknown>,
44
+ mlflow?: MlflowOptions
37
45
  ) => {
38
46
  reset()
39
47
  setState('running')
@@ -43,6 +51,10 @@ export function useScenarioRun(): UseScenarioRunResult {
43
51
  scenario_id: scenarioId,
44
52
  model,
45
53
  variables,
54
+ mlflow_export: mlflow?.enabled,
55
+ mlflow_tracking_uri: mlflow?.trackingUri,
56
+ mlflow_experiment: mlflow?.experiment,
57
+ mlflow_tracing: mlflow?.tracing,
46
58
  })
47
59
 
48
60
  if (response.error) {
@@ -62,7 +74,8 @@ export function useScenarioRun(): UseScenarioRunResult {
62
74
  scenarioId: string,
63
75
  models: string[],
64
76
  runsPerModel: number = 1,
65
- variables?: Record<string, unknown>
77
+ variables?: Record<string, unknown>,
78
+ mlflow?: MlflowOptions
66
79
  ) => {
67
80
  reset()
68
81
  setState('running')
@@ -73,6 +86,10 @@ export function useScenarioRun(): UseScenarioRunResult {
73
86
  models,
74
87
  runs_per_model: runsPerModel,
75
88
  variables,
89
+ mlflow_export: mlflow?.enabled,
90
+ mlflow_tracking_uri: mlflow?.trackingUri,
91
+ mlflow_experiment: mlflow?.experiment,
92
+ mlflow_tracing: mlflow?.tracing,
76
93
  })
77
94
 
78
95
  setState('completed')
@@ -44,6 +44,8 @@ export interface ModelInfo {
44
44
  id: string
45
45
  name: string
46
46
  price: string
47
+ is_local?: boolean
48
+ provider_name?: string
47
49
  }
48
50
 
49
51
  export interface RunScenarioRequest {
@@ -53,6 +55,10 @@ export interface RunScenarioRequest {
53
55
  max_turns?: number
54
56
  max_tokens?: number
55
57
  temperature?: number
58
+ mlflow_export?: boolean
59
+ mlflow_tracking_uri?: string
60
+ mlflow_experiment?: string
61
+ mlflow_tracing?: boolean
56
62
  }
57
63
 
58
64
  export interface HistoryMessage {
@@ -112,6 +118,10 @@ export interface CompareModelsRequest {
112
118
  runs_per_model?: number
113
119
  variables?: Record<string, unknown>
114
120
  max_turns?: number
121
+ mlflow_export?: boolean
122
+ mlflow_tracking_uri?: string
123
+ mlflow_experiment?: string
124
+ mlflow_tracing?: boolean
115
125
  }
116
126
 
117
127
  export interface ModelStats {
@@ -205,6 +215,8 @@ export interface RunDatasetRequest {
205
215
  max_tokens?: number
206
216
  temperature?: number
207
217
  parallel?: number
218
+ mlflow_enabled?: boolean
219
+ mlflow_experiment?: string
208
220
  }
209
221
 
210
222
  export interface CaseResultInfo {
@@ -235,7 +247,7 @@ export interface RunDatasetResponse {
235
247
  }
236
248
 
237
249
  class ApiClient {
238
- private async fetch<T>(url: string, options?: RequestInit): Promise<T> {
250
+ protected async fetch<T>(url: string, options?: RequestInit): Promise<T> {
239
251
  const response = await fetch(`${API_BASE}${url}`, {
240
252
  ...options,
241
253
  headers: {
@@ -350,4 +362,87 @@ class ApiClient {
350
362
  }
351
363
  }
352
364
 
353
- export const api = new ApiClient()
365
+ // --- Provider Types ---
366
+
367
+ export interface ProviderSummary {
368
+ name: string
369
+ type: string
370
+ base_url: string
371
+ enabled: boolean
372
+ status: 'connected' | 'disconnected' | 'error' | 'unknown'
373
+ model_count: number
374
+ models: string[]
375
+ }
376
+
377
+ export interface ProviderListResponse {
378
+ providers: ProviderSummary[]
379
+ }
380
+
381
+ export interface LocalModelInfoResponse {
382
+ id: string
383
+ name: string
384
+ context_length: number
385
+ supports_tools: boolean
386
+ is_local: boolean
387
+ }
388
+
389
+ export interface ProviderDetailResponse {
390
+ config: Record<string, unknown>
391
+ status: {
392
+ status: string
393
+ last_checked: string | null
394
+ available_models: string[]
395
+ latency_ms: number | null
396
+ error_message: string | null
397
+ }
398
+ models: LocalModelInfoResponse[]
399
+ }
400
+
401
+ export interface AddProviderRequest {
402
+ name: string
403
+ type: 'ollama' | 'lmstudio' | 'vllm' | 'openai-compatible'
404
+ base_url: string
405
+ api_key?: string | null
406
+ models?: string[]
407
+ default_params?: Record<string, unknown>
408
+ }
409
+
410
+ export interface TestConnectionResponse {
411
+ success: boolean
412
+ latency_ms: number | null
413
+ models_found: string[]
414
+ error: string | null
415
+ }
416
+
417
+ // Extend ApiClient with provider methods
418
+ class ApiClientWithProviders extends ApiClient {
419
+ async listProviders(): Promise<ProviderSummary[]> {
420
+ const response = await this.fetch<ProviderListResponse>('/providers')
421
+ return response.providers
422
+ }
423
+
424
+ async addProvider(request: AddProviderRequest): Promise<ProviderSummary> {
425
+ return this.fetch<ProviderSummary>('/providers', {
426
+ method: 'POST',
427
+ body: JSON.stringify(request),
428
+ })
429
+ }
430
+
431
+ async getProvider(name: string): Promise<ProviderDetailResponse> {
432
+ return this.fetch<ProviderDetailResponse>(`/providers/${encodeURIComponent(name)}`)
433
+ }
434
+
435
+ async deleteProvider(name: string): Promise<void> {
436
+ await this.fetch<void>(`/providers/${encodeURIComponent(name)}`, {
437
+ method: 'DELETE',
438
+ })
439
+ }
440
+
441
+ async testProvider(name: string): Promise<TestConnectionResponse> {
442
+ return this.fetch<TestConnectionResponse>(`/providers/${encodeURIComponent(name)}/test`, {
443
+ method: 'POST',
444
+ })
445
+ }
446
+ }
447
+
448
+ export const api = new ApiClientWithProviders()