npm - triagent - Versions diffs - 0.1.0-alpha13 → 0.1.0-alpha18 - Mend

triagent 0.1.0-alpha13 → 0.1.0-alpha18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

package/package.json +3 -4
package/src/cli/config.ts +96 -0
package/src/index.ts +201 -3
package/src/integrations/elasticsearch/client.ts +210 -0
package/src/integrations/grafana/client.ts +186 -0
package/src/integrations/kubernetes/multi-cluster.ts +199 -0
package/src/integrations/kubernetes/types.ts +24 -0
package/src/integrations/loki/client.ts +219 -0
package/src/integrations/prometheus/client.ts +163 -0
package/src/integrations/slack/client.ts +265 -0
package/src/integrations/teams/client.ts +199 -0
package/src/mastra/agents/debugger.ts +152 -108
package/src/mastra/tools/approval-store.ts +180 -0
package/src/mastra/tools/cli.ts +94 -2
package/src/mastra/tools/cost.ts +389 -0
package/src/mastra/tools/logs.ts +210 -0
package/src/mastra/tools/network.ts +253 -0
package/src/mastra/tools/prometheus.ts +221 -0
package/src/mastra/tools/remediation.ts +365 -0
package/src/mastra/tools/runbook.ts +186 -0
package/src/server/routes/history.ts +207 -0
package/src/server/routes/notifications.ts +236 -0
package/src/server/webhook.ts +36 -2
package/src/storage/index.ts +3 -0
package/src/storage/investigation-history.ts +277 -0
package/src/storage/runbook-index.ts +330 -0
package/src/storage/types.ts +72 -0
package/src/tui/app.tsx +492 -76
package/src/tui/components/approval-dialog.tsx +156 -0
package/src/tui/components/approval-modal.tsx +278 -0
package/src/tui/components/index.ts +38 -0
package/src/tui/components/styled-span.tsx +24 -0
package/src/tui/components/timeline.tsx +223 -0
package/src/tui/components/toast.tsx +101 -0

package/src/mastra/agents/debugger.ts CHANGED Viewed

@@ -1,123 +1,160 @@
 import { Agent } from "@mastra/core/agent";
 import { z } from "zod";
 import { cliTool } from "../tools/cli.js";
-import { gitTool } from "../tools/git.js";
-import { filesystemTool } from "../tools/filesystem.js";
-import { loadTriagentMd } from "../../cli/config.js";
+import { loadTriagentMd, loadRunbookMd } from "../../cli/config.js";
 import type { Config } from "../../config.js";
-const DEBUGGER_INSTRUCTIONS = `You are an expert Kubernetes debugging agent named Triagent. Your role is to investigate and diagnose issues in Kubernetes clusters by analyzing resources, logs, code, and git history.
-## Your Capabilities
-1. **CLI Access** (cli tool):
-   - Run any shell command including kubectl, grep, awk, jq, curl, etc.
-   - Pipe commands together for powerful filtering and processing
-   - Examples:
-     - \`kubectl get pods -A | grep inventory\`
-     - \`kubectl logs deploy/myapp --tail 100 | grep -i error\`
-     - \`kubectl get pods -o json | jq '.items[].metadata.name'\`
-     - \`kubectl describe pod mypod | grep -A10 Events\`
-2. **Code Analysis** (filesystem tool):
-   - Read source code files
-   - List directory structures
-   - Search for patterns in code
-3. **Git History** (git tool):
-   - View recent commits
-   - Compare changes between commits
-   - Show specific commit details
-   - Blame files to find who changed what
+const DEBUGGER_INSTRUCTIONS = `You are an expert Kubernetes debugging agent named Triagent. Your role is to investigate and diagnose issues in Kubernetes clusters using CLI tools.
+## Your Tool
+You have access to a single powerful tool: **cli** - Execute any shell command. Use pipes, redirects, and command composition to accomplish complex tasks.
+## CLI Capabilities
+### Kubernetes (kubectl)
+\`\`\`bash
+# Resource discovery
+kubectl get pods -A | grep -i <service>
+kubectl get deploy,svc,pods -A -o wide
+kubectl get pods -l app=<name> -n <namespace>
+# Logs and events
+kubectl logs deploy/<name> --tail 100 | grep -i error
+kubectl logs <pod> -c <container> --since=1h
+kubectl get events -A --sort-by='.lastTimestamp' | head -30
+# Debugging
+kubectl describe pod <name> -n <namespace>
+kubectl get pod <name> -o yaml | grep -A20 status
+kubectl top pods -n <namespace>
+kubectl exec -it <pod> -- sh -c "command"
+# Network debugging
+kubectl exec <pod> -- nslookup <service>
+kubectl exec <pod> -- nc -zv <host> <port>
+kubectl get networkpolicy -A
+kubectl get endpoints <service> -n <namespace>
+\`\`\`
+### Git
+\`\`\`bash
+git log --oneline -20
+git log --since="2 hours ago" --oneline
+git diff HEAD~5
+git show <commit>
+git blame <file>
+git log -p -- <file>
+\`\`\`
+### Filesystem
+\`\`\`bash
+ls -la <path>
+cat <file>
+head -100 <file>
+grep -r "pattern" <path>
+find . -name "*.yaml" -exec grep -l "keyword" {} \\;
+\`\`\`
+### Prometheus (via promtool or curl)
+\`\`\`bash
+# Query metrics
+curl -s "http://prometheus:9090/api/v1/query?query=up" | jq .
+curl -s "http://prometheus:9090/api/v1/query?query=container_cpu_usage_seconds_total{pod=~'myapp.*'}" | jq '.data.result[]'
+# Get alerts
+curl -s "http://prometheus:9090/api/v1/alerts" | jq '.data.alerts[] | {alertname: .labels.alertname, state: .state}'
+# Check targets
+curl -s "http://prometheus:9090/api/v1/targets" | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
+\`\`\`
+### Loki (via logcli)
+\`\`\`bash
+# Query logs
+logcli query '{namespace="production"}' --limit=100
+logcli query '{app="myapp"} |= "error"' --since=1h
+logcli query '{namespace="production"} | json | level="error"' --limit=50
+# Tail logs
+logcli query '{app="myapp"}' --tail
+\`\`\`
+### Resource Analysis
+\`\`\`bash
+# Resource usage with jq
+kubectl get pods -o json | jq '.items[] | {name: .metadata.name, cpu: .spec.containers[].resources.requests.cpu, memory: .spec.containers[].resources.requests.memory}'
+# Count pods by status
+kubectl get pods -A -o json | jq '.items | group_by(.status.phase) | map({status: .[0].status.phase, count: length})'
+\`\`\`
 ## Resource Discovery Strategy
-When asked to find resources for a service (e.g., "inventory service"), DO NOT simply try one label like \`app=inventory\` and give up if not found. Instead, use a systematic discovery approach:
-1. **Search by partial name match using grep**:
-   - \`kubectl get pods -A | grep -i inventory\`
-   - \`kubectl get deploy,svc -A | grep -i inventory\`
-   - This finds resources with "inventory" anywhere in the name (e.g., \`inventory-api\`, \`svc-inventory\`)
-2. **If grep returns no results, list all resources to browse**:
-   - \`kubectl get pods,deploy,svc -A\` to see everything
-   - \`kubectl get pods -n <namespace>\` if namespace is known
+When asked to find resources for a service (e.g., "inventory service"), use systematic discovery:
-3. **Try common label patterns**:
-   - \`kubectl get pods -A -l app=inventory\`
-   - \`kubectl get pods -A -l app.kubernetes.io/name=inventory\`
-   - \`kubectl get pods -A -l component=inventory\`
-4. **Follow the resource chain**:
-   - Found a Service? \`kubectl describe svc <name> | grep Selector\` then find pods with that selector
-   - Found a Deployment? \`kubectl get pods -l app=<deployment-name>\`
-   - Use \`kubectl get endpoints <svc-name>\` to see which pods back a service
-5. **Check events for context**:
-   - \`kubectl get events -A --sort-by='.lastTimestamp' | grep -i inventory\`
-   - \`kubectl get events -A --sort-by='.lastTimestamp' | head -20\` for recent cluster activity
-6. **When you find a potential match**:
-   - \`kubectl describe <resource> <name>\` to confirm it's the right one
-   - Check related resources (pods for a deployment, endpoints for a service)
-Always report what you searched for and what you found, even if it's not an exact match. The user can confirm if you found the right resource.
+1. **Search by name**: \`kubectl get pods,deploy,svc -A | grep -i inventory\`
+2. **Try label patterns**: \`kubectl get pods -A -l app=inventory\` or \`app.kubernetes.io/name=inventory\`
+3. **Follow the chain**: Service → Endpoints → Pods → Containers
+4. **Check events**: \`kubectl get events -A --sort-by='.lastTimestamp' | grep -i inventory\`
 ## Investigation Process
-When given an incident, follow this systematic approach:
-1. **Understand the Issue**: Parse the incident description to identify:
-   - What service/component is affected
-   - What symptoms are being observed
-   - When the issue started (if known)
-2. **Discover Relevant Resources**:
-   - Use the Resource Discovery Strategy above to find the affected resources
-   - Don't assume exact names or labels - search broadly first
-   - Follow the resource chain (Service → Deployment → Pods → Containers)
-3. **Check Cluster State**:
-   - Get pod status for discovered resources
-   - Check for recent events related to those resources
-   - Look at resource usage
-4. **Analyze Logs**:
-   - Fetch logs from affected pods (use \`--tail 100\` to get recent logs)
-   - Look for errors, exceptions, or unusual patterns
-   - If multiple containers, check each one
-5. **Investigate Recent Changes**:
-   - Check git log for recent commits
-   - Review diffs of suspicious changes
-   - Correlate timing with when issues started
-6. **Examine Code**:
-   - Read relevant configuration files
-   - Check application code if needed
-   - Look for misconfigurations
-7. **Synthesize Findings**:
-   - Identify the root cause
-   - List affected resources
-   - Provide actionable recommendations
+1. **Understand**: Parse incident for affected service, symptoms, timing
+2. **Discover**: Find affected resources using grep and label selectors
+3. **Check State**: Pod status, events, resource usage
+4. **Analyze Logs**: kubectl logs with grep for errors
+5. **Check Changes**: git log, git diff for recent commits
+6. **Examine Config**: Read manifests and application config
+7. **Synthesize**: Root cause, evidence, recommendations
 ## Output Format
-Always provide your findings in a clear, structured format:
-- **Summary**: Brief overview of the issue
-- **Root Cause**: The identified cause of the problem
-- **Evidence**: Specific data that supports your conclusion
-- **Affected Resources**: List of impacted K8s resources
-- **Recent Changes**: Relevant commits that might be related
-- **Recommendations**: Specific steps to remediate the issue
+Provide findings in a structured format:
+- **Summary**: Brief overview
+- **Root Cause**: Identified cause
+- **Evidence**: Supporting data
+- **Affected Resources**: Impacted K8s resources
+- **Recent Changes**: Relevant commits
+- **Recommendations**: Remediation steps
+## Write Operations - AUTOMATIC UI APPROVAL
+**IMPORTANT: Do NOT ask the user for permission in text. Just execute write commands directly.**
+The CLI tool automatically detects write operations and triggers a UI-based approval prompt. Your job is to:
+1. **Execute write commands immediately** without asking "Would you like to proceed?" or similar
+2. The UI will show an approval dialog to the user
+3. If approved, you'll receive an approval token
+4. Retry the command with the provided \`approvalToken\`
+**WRONG approach:**
+\`\`\`
+"Would you like me to scale the deployment? This requires your approval."
+[Waiting for user to type "yes"]
+\`\`\`
+**CORRECT approach:**
+\`\`\`
+[Just execute the command]
+kubectl scale deployment/myapp --replicas=2 -n prod
+[UI shows approval prompt, user approves]
+[Receive token, retry with token]
+\`\`\`
+**Write operations (automatically detected):**
+- Kubernetes: \`kubectl delete|apply|create|patch|scale|rollout|drain|cordon\`
+- Git: \`git commit|push|merge|rebase|reset\`
+- File system: \`rm|mv|cp|mkdir|chmod\`
+When you receive an approval token in the user's message, extract it and retry the command with \`approvalToken: "<token>"\`.
 ## Important Guidelines
-- Be thorough but efficient - don't run unnecessary commands
-- Focus on actionable insights
-- If unsure, state your confidence level
-- Prioritize quick wins that can restore service
+- Use command composition with pipes for efficiency
+- Be thorough but don't run unnecessary commands
+- State confidence level when unsure
+- Prioritize quick wins to restore service
 - Consider both application and infrastructure issues`;
 export const InvestigationResultSchema = z.object({
@@ -167,10 +204,19 @@ export async function createDebuggerAgent(config: Config) {
   // Load user instructions from ~/.config/triagent/TRIAGENT.md if present
   const userInstructions = await loadTriagentMd();
-  // Combine user instructions with default instructions
-  const instructions = userInstructions
-    ? `## User-Provided Instructions\n\n${userInstructions}\n\n---\n\n${DEBUGGER_INSTRUCTIONS}`
-    : DEBUGGER_INSTRUCTIONS;
+  // Load runbook from ~/.config/triagent/RUNBOOK.md if present
+  const runbook = await loadRunbookMd();
+  // Build instructions with optional user content and runbook
+  let instructions = DEBUGGER_INSTRUCTIONS;
+  if (userInstructions) {
+    instructions = `## User-Provided Instructions\n\n${userInstructions}\n\n---\n\n${instructions}`;
+  }
+  if (runbook) {
+    instructions = `${instructions}\n\n---\n\n## Runbook\n\nRefer to this runbook for standard operating procedures:\n\n${runbook}`;
+  }
   // Construct model config with API key and optional base URL
   const modelId = `${config.aiProvider}/${config.aiModel}` as const;
@@ -187,8 +233,6 @@ export async function createDebuggerAgent(config: Config) {
     model: modelConfig as any, // Mastra handles model routing
     tools: {
       cli: cliTool,
-      git: gitTool,
-      filesystem: filesystemTool,
     },
   });
 }

package/src/mastra/tools/approval-store.ts ADDED Viewed

@@ -0,0 +1,180 @@
+import { randomBytes } from "crypto";
+export interface PendingApproval { // A command awaiting user approval, as created by requestApproval()
+  id: string; // key of this request in the store's pending map
+  command: string; // exact command text the approval is bound to
+  token: string; // returned by approve(); validateToken() accepts it only for this command
+  riskLevel: "low" | "medium" | "high" | "critical"; // computed by classifyRisk(command)
+  createdAt: Date;
+  expiresAt: Date; // createdAt + EXPIRATION_MS
+}
+export interface ApprovalStore {
+  /** Register a command as awaiting approval and return the new pending entry (id, token, risk level, expiry) */
+  requestApproval(command: string): PendingApproval;
+  /** Approve a pending request; returns its token, or null when the id is unknown or the request has expired */
+  approve(id: string): string | null;
+  /** Reject a pending request by discarding it; an unknown id is a no-op */
+  reject(id: string): void;
+  /** Check that the token was approved for exactly this command and is unexpired; a successful check consumes the token (one-time use) */
+  validateToken(command: string, token: string): boolean;
+  /** Get pending approval by ID; undefined when the id is unknown or the request has expired */
+  getPending(id: string): PendingApproval | undefined;
+  /** Get all pending approvals that have not yet expired */
+  getAllPending(): PendingApproval[];
+  /** Drop expired pending requests and expired approved tokens */
+  clearExpired(): void;
+}
+// Risk patterns, tested from most to least severe by classifyRisk(); the first tier with a matching regex wins
+const CRITICAL_PATTERNS = [
+  /\bkubectl\s+delete\s+(namespace|ns|node|pv|pvc|clusterrole)/i, // resource kind must directly follow "delete"
+  /\brm\s+-rf?\s+\/(?!tmp)/i, // rm -r / rm -rf on an absolute path that does not start with /tmp
+  /\bgit\s+push\s+.*--force/i, // also matches --force-with-lease; "git push -f" falls through to HIGH_PATTERNS
+  /\bhelm\s+(uninstall|delete)\b/i,
+];
+const HIGH_PATTERNS = [
+  /\bkubectl\s+delete\b/i, // any kubectl delete not already caught by CRITICAL_PATTERNS
+  /\bkubectl\s+apply\s+-f\s+http/i, // apply from URL
+  /\bkubectl\s+drain\b/i,
+  /\bkubectl\s+cordon\b/i,
+  /\bgit\s+reset\s+--hard/i,
+  /\bgit\s+push\b/i,
+  /\bhelm\s+(install|upgrade)\b/i,
+];
+const MEDIUM_PATTERNS = [
+  /\bkubectl\s+scale\b/i,
+  /\bkubectl\s+rollout\s+(restart|undo)/i,
+  /\bkubectl\s+(apply|create|patch)\b/i, // "apply -f http..." is already classified high above
+  /\bgit\s+(commit|merge|rebase)/i,
+];
+function classifyRisk(command: string): PendingApproval["riskLevel"] { // Map a command to the most severe risk tier whose patterns match it
+  if (CRITICAL_PATTERNS.some(p => p.test(command))) return "critical";
+  if (HIGH_PATTERNS.some(p => p.test(command))) return "high";
+  if (MEDIUM_PATTERNS.some(p => p.test(command))) return "medium";
+  return "low"; // no pattern in any tier matched
+}
+function generateToken(): string { // Approval token: 16 random bytes from crypto.randomBytes
+  return randomBytes(16).toString("hex"); // hex-encoded, 32 characters
+}
+function generateId(): string { // Approval request id: 8 random bytes from crypto.randomBytes
+  return randomBytes(8).toString("hex"); // hex-encoded, 16 characters
+}
+const EXPIRATION_MS = 10 * 60 * 1000; // 10 minutes: lifetime of a pending request, inherited by its token once approved
+// In-memory ApprovalStore implementation; a single instance is exported below as approvalStore
+class ApprovalStoreImpl implements ApprovalStore {
+  private pending: Map<string, PendingApproval> = new Map(); // keyed by approval id
+  private approvedTokens: Map<string, { command: string; expiresAt: Date }> = new Map(); // keyed by token
+  requestApproval(command: string): PendingApproval {
+    // Clean up expired entries first
+    this.clearExpired();
+    const id = generateId();
+    const token = generateToken();
+    const now = new Date();
+    const expiresAt = new Date(now.getTime() + EXPIRATION_MS);
+    const approval: PendingApproval = {
+      id,
+      command,
+      token,
+      riskLevel: classifyRisk(command),
+      createdAt: now,
+      expiresAt,
+    };
+    this.pending.set(id, approval);
+    return approval; // NOTE(review): the returned object already carries the token, before any approval has happened
+  }
+  approve(id: string): string | null {
+    const pending = this.pending.get(id);
+    if (!pending) return null; // unknown id (never requested, rejected, or already approved)
+    // Check if expired; an expired request is evicted and cannot be approved
+    if (new Date() > pending.expiresAt) {
+      this.pending.delete(id);
+      return null;
+    }
+    // Move to approved tokens, keyed by token; the token keeps the request's original expiry
+    this.approvedTokens.set(pending.token, {
+      command: pending.command,
+      expiresAt: pending.expiresAt,
+    });
+    // Remove from pending
+    this.pending.delete(id);
+    return pending.token;
+  }
+  reject(id: string): void {
+    this.pending.delete(id); // no token is ever registered for a rejected request
+  }
+  validateToken(command: string, token: string): boolean {
+    const approved = this.approvedTokens.get(token);
+    if (!approved) return false; // token was never approved, or has already been consumed
+    // Check expiration; an expired token is evicted
+    if (new Date() > approved.expiresAt) {
+      this.approvedTokens.delete(token);
+      return false;
+    }
+    // Token must match the exact command (strict string equality); on mismatch the token is left in place, not consumed
+    if (approved.command !== command) return false;
+    // Token is valid - consume it (one-time use)
+    this.approvedTokens.delete(token);
+    return true;
+  }
+  getPending(id: string): PendingApproval | undefined {
+    const pending = this.pending.get(id);
+    if (pending && new Date() > pending.expiresAt) { // evict an expired entry on lookup
+      this.pending.delete(id);
+      return undefined;
+    }
+    return pending;
+  }
+  getAllPending(): PendingApproval[] {
+    this.clearExpired(); // purge first so only unexpired entries are returned
+    return Array.from(this.pending.values());
+  }
+  clearExpired(): void {
+    const now = new Date();
+    for (const [id, pending] of this.pending) { // sweep pending requests
+      if (now > pending.expiresAt) {
+        this.pending.delete(id);
+      }
+    }
+    for (const [token, approved] of this.approvedTokens) { // sweep approved but unused tokens
+      if (now > approved.expiresAt) {
+        this.approvedTokens.delete(token);
+      }
+    }
+  }
+}
+// Export singleton instance, typed as the ApprovalStore interface rather than the concrete class
+export const approvalStore: ApprovalStore = new ApprovalStoreImpl();

package/src/mastra/tools/cli.ts CHANGED Viewed

@@ -1,11 +1,52 @@
 import { createTool } from "@mastra/core/tools";
 import { z } from "zod";
 import { execCommand } from "../../sandbox/bashlet.js";
+import { approvalStore, type PendingApproval } from "./approval-store.js";
 interface CliOutput {
   success: boolean;
   output: string;
   error?: string;
+  requiresApproval?: boolean; // set to true when a write command was not run because approval is needed
+  command?: string; // the command that is awaiting approval
+  // Token-based approval fields
+  approvalId?: string; // id of the pending request created in approvalStore
+  riskLevel?: PendingApproval["riskLevel"]; // risk tier reported for the pending request
+}
+// Write command patterns that require user approval; a command counts as a write if ANY pattern matches anywhere in it
+const WRITE_COMMAND_PATTERNS = [
+  // Kubernetes write operations
+  /\bkubectl\s+(delete|apply|create|patch|edit|replace|set|label|annotate|taint|cordon|uncordon|drain)\b/i,
+  /\bkubectl\s+rollout\s+(restart|undo|pause|resume)\b/i,
+  /\bkubectl\s+scale\b/i,
+  /\bkubectl\s+exec\b.*\s+--\s+.*(rm|mv|cp|chmod|chown|kill|pkill|shutdown|reboot|dd|mkfs|fdisk)\b/i, // NOTE(review): no \b before the alternation, so a word merely ending in "rm", "dd", etc. also matches
+  // Git write operations
+  /\bgit\s+(commit|push|merge|rebase|reset|checkout|stash|tag|branch\s+-[dD]|cherry-pick|revert|am|pull)\b/i,
+  // File system write operations
+  /\b(rm|rmdir|mv|cp|mkdir|touch|chmod|chown|ln)\s+/i,
+  /\b(cat|echo|printf)\s+.*[>|]/, // Redirect (>) or pipe (|) after cat/echo/printf; a plain pipe such as "cat f | grep x" also matches
+  /\btee\s+/i,
+  /\bsed\s+-i/i, // In-place sed
+  // Package managers
+  /\b(apt|apt-get|yum|dnf|brew|npm|yarn|pip|cargo)\s+(install|remove|uninstall|update|upgrade)\b/i,
+  // Service management
+  /\b(systemctl|service)\s+(start|stop|restart|enable|disable)\b/i,
+  // Docker/container write operations
+  /\bdocker\s+(rm|rmi|stop|kill|prune|system\s+prune)\b/i,
+  /\bdocker-compose\s+(down|rm|stop)\b/i,
+  // Helm write operations
+  /\bhelm\s+(install|upgrade|uninstall|delete|rollback)\b/i,
+];
+function isWriteCommand(command: string): boolean { // True when the command matches at least one WRITE_COMMAND_PATTERNS regex
+  return WRITE_COMMAND_PATTERNS.some(pattern => pattern.test(command));
 }
 function filterSensitiveData(output: string): string {
@@ -25,6 +66,9 @@ export const cliTool = createTool({
 Use this to run any CLI commands including kubectl, grep, awk, jq, curl, etc.
 Supports pipes and command chaining.
+IMPORTANT: Write/modify commands require user approval before execution.
+The tool will detect write operations and pause for confirmation.
 Examples:
 - List all pods: kubectl get pods -A
 - Find pods by name: kubectl get pods -A | grep inventory
@@ -35,10 +79,58 @@ Examples:
   inputSchema: z.object({
     command: z.string().describe("The shell command to execute"),
+    approvalToken: z.string().optional().describe("Approval token from user confirmation. Required for write operations."),
+  }),
+  outputSchema: z.object({
+    success: z.boolean(),
+    output: z.string(),
+    error: z.string().optional(),
+    requiresApproval: z.boolean().optional(),
+    command: z.string().optional(),
+    approvalId: z.string().optional(),
+    riskLevel: z.enum(["low", "medium", "high", "critical"]).optional(),
   }),
-  execute: async ({ command }): Promise<CliOutput> => {
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  execute: (async (inputData: any): Promise<CliOutput> => {
+    const { command, approvalToken } = inputData;
     try {
+      // Check if this is a write command
+      if (isWriteCommand(command)) {
+        // If token provided, validate it
+        if (approvalToken) {
+          const isValid = approvalStore.validateToken(command, approvalToken);
+          if (!isValid) {
+            // Invalid or expired token - request new approval
+            const pending = approvalStore.requestApproval(command);
+            return {
+              success: false,
+              output: "",
+              requiresApproval: true,
+              command: command,
+              approvalId: pending.id,
+              riskLevel: pending.riskLevel,
+              error: `⚠️ APPROVAL TOKEN INVALID OR EXPIRED\n\nCommand: ${command}\nRisk Level: ${pending.riskLevel.toUpperCase()}\nApproval ID: ${pending.id}\n\nPlease wait for user to approve this operation. A new approval token will be provided.`,
+            };
+          }
+          // Token valid - proceed with execution
+        } else {
+          // No token - request approval
+          const pending = approvalStore.requestApproval(command);
+          return {
+            success: false,
+            output: "",
+            requiresApproval: true,
+            command: command,
+            approvalId: pending.id,
+            riskLevel: pending.riskLevel,
+            error: `⚠️ WRITE OPERATION DETECTED - APPROVAL REQUIRED\n\nCommand: ${command}\nRisk Level: ${pending.riskLevel.toUpperCase()}\nApproval ID: ${pending.id}\n\nThis command will modify state. Waiting for user approval...`,
+          };
+        }
+      }
       const result = await execCommand(command);
       if (result.exitCode !== 0) {
@@ -61,5 +153,5 @@ Examples:
         error: error instanceof Error ? error.message : String(error),
       };
     }
-  },
+  }) as any,
 });