@shiyuhang0/ticloud-oncall 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,52 @@
1
+ # TiCloud Oncall Plugin
2
+
3
+ 这是一个接入 opencode 的本地 plugin 项目,用于内部 oncall 场景。
4
+
5
+ 当前版本提供:
6
+
7
+ - `ticloud` 主协调 agent
8
+ - `clinic_cluster_query` 集群元数据查询 tool
9
+ - `loki_log_query` Loki 直连日志查询 tool
10
+ - 复用 opencode 内置 `explore` subagent 做本地代码搜索
11
+
12
+ 代码搜索约束:
13
+
14
+ - 当需要代码证据时,用户应提供 `REPO_ROOTS`
15
+ - `explore` 只应搜索从 `REPO_ROOTS` 缩小出的候选 repo,不应盲搜整个 workspace
16
+
17
+ ## Setup
18
+
19
+ 1. 安装依赖:`npm install`
20
+ 2. 在 opencode plugin 参数里配置:`CLINIC_API_KEY`、`CLINIC_ENV`、`REPO_ROOTS`
21
+ 3. 构建:`npm run build`
22
+ 4. 在 opencode 配置中加载 `dist/src/index.js`
23
+
24
+ 日志查询规则摘要:
25
+
26
+ - 只走 Loki 直连 API
27
+ - 先 `labels`,再 `label_values`,最后 `query_range`
28
+ - 先宽搜,再窄搜
29
+ - 单次查询时间范围必须在 7 天内
30
+
31
+ ## opencode config
32
+
33
+ ```json
34
+ {
35
+ "plugin": [["./dist/src/index.js", {
36
+ "CLINIC_API_KEY": "<your-clinic-api-key>",
37
+ "CLINIC_ENV": "prod", // optional, defaults to "prod"
38
+ "REPO_ROOTS": [
39
+ "/Users/you/github/tidb-cloud",
40
+ "/Users/you/github/platform-services"
41
+ ]
42
+ }]]
43
+ }
44
+ ```
45
+
46
+ ## 文档
47
+
48
+ - quickstart:`docs/guides/quickstart.md`
49
+ - 架构与亮点:`docs/architecture/overview-and-highlights.md`
50
+ - 设计说明:`docs/architecture/ticloud-oncall-plugin-design.md`
51
+ - 实现计划:`docs/superpowers/plans/2026-06-03-ticloud-oncall-plugin.md`
52
+ - 使用部署:`docs/guides/ticloud-oncall-plugin-usage.md`
File without changes
@@ -0,0 +1,12 @@
1
+ export declare const LogAnalyzerAgent: {
2
+ name: string;
3
+ mode: string;
4
+ description: string;
5
+ prompt: string;
6
+ permission: {
7
+ "*": string;
8
+ loki_log_query: string;
9
+ task: string;
10
+ };
11
+ options: {};
12
+ };
@@ -0,0 +1,13 @@
1
+ import { LogAnalyzerPrompt } from "../prompts/log-analyzer.js";
2
/**
 * Permission map for the log worker: the `*` wildcard and `task` are set to
 * "deny", and only `loki_log_query` is set to "allow".
 */
const logAnalyzerPermission = {
    "*": "deny",
    loki_log_query: "allow",
    task: "deny",
};

/**
 * Agent definition registered under the `log-analyzer` key.
 * It runs in "subagent" mode and is driven by `LogAnalyzerPrompt`.
 */
export const LogAnalyzerAgent = {
    name: "log-analyzer",
    mode: "subagent",
    description: "Specialized worker for Clinic Loki log queries",
    prompt: LogAnalyzerPrompt,
    permission: logAnalyzerPermission,
    options: {},
};
@@ -0,0 +1,12 @@
1
+ export declare const MetadataQuerierAgent: {
2
+ name: string;
3
+ mode: string;
4
+ description: string;
5
+ prompt: string;
6
+ permission: {
7
+ "*": string;
8
+ clinic_cluster_query: string;
9
+ task: string;
10
+ };
11
+ options: {};
12
+ };
@@ -0,0 +1,13 @@
1
+ import { MetadataQuerierPrompt } from "../prompts/metadata-querier.js";
2
/**
 * Permission map for the metadata worker: the `*` wildcard and `task` are set
 * to "deny", and only `clinic_cluster_query` is set to "allow".
 */
const metadataQuerierPermission = {
    "*": "deny",
    clinic_cluster_query: "allow",
    task: "deny",
};

/**
 * Agent definition registered under the `metadata-querier` key.
 * It runs in "subagent" mode and is driven by `MetadataQuerierPrompt`.
 */
export const MetadataQuerierAgent = {
    name: "metadata-querier",
    mode: "subagent",
    description: "Specialized worker for Clinic cluster metadata lookups",
    prompt: MetadataQuerierPrompt,
    permission: metadataQuerierPermission,
    options: {},
};
@@ -0,0 +1,13 @@
1
+ export declare const TicloudAgent: {
2
+ name: string;
3
+ mode: string;
4
+ description: string;
5
+ prompt: string;
6
+ permission: {
7
+ task: string;
8
+ todowrite: string;
9
+ question: string;
10
+ loki_log_query: string;
11
+ };
12
+ options: {};
13
+ };
@@ -0,0 +1,15 @@
1
+ import { TicloudPrompt } from "../prompts/ticloud.js";
2
/**
 * Permission map for the coordinator agent.
 * `task`, `todowrite` and `question` are allowed; `loki_log_query` is denied.
 * No entry is set for `clinic_cluster_query` (a "deny" entry for it was
 * commented out in the original definition).
 */
const ticloudPermission = {
    task: "allow",
    todowrite: "allow",
    question: "allow",
    loki_log_query: "deny",
};

/**
 * Agent definition for the `ticloud` coordinator, declared with mode "primary".
 * `prompt` holds the default `TicloudPrompt`.
 */
export const TicloudAgent = {
    name: "ticloud",
    mode: "primary",
    description: "Internal oncall coordinator for TiCloud incidents",
    prompt: TicloudPrompt,
    permission: ticloudPermission,
    options: {},
};
@@ -0,0 +1,5 @@
1
+ declare const _default: {
2
+ id: string;
3
+ server: import("@opencode-ai/plugin").Plugin;
4
+ };
5
+ export default _default;
@@ -0,0 +1,5 @@
1
+ import { TiCloudOncallPlugin } from "./plugin.js";
2
/**
 * Plugin module descriptor: a stable `id` plus the `server` entry point
 * implemented by `TiCloudOncallPlugin`.
 */
const ticloudOncallModule = {
    id: "ticloud-oncall",
    server: TiCloudOncallPlugin,
};

export default ticloudOncallModule;
@@ -0,0 +1,6 @@
1
+ export declare function readClinicApiKey(config: {
2
+ CLINIC_API_KEY?: string;
3
+ }): string;
4
+ export declare function resolveClinicBase(config: {
5
+ CLINIC_ENV?: string;
6
+ }): string;
@@ -0,0 +1,15 @@
1
+ const CLINIC_BASES = {
2
+ dev: "https://dev-clinic.pingcap.com",
3
+ staging: "https://staging-clinic.pingcap.com",
4
+ prod: "https://clinic.pingcap.com",
5
+ };
6
+ export function readClinicApiKey(config) {
7
+ const key = config.CLINIC_API_KEY?.trim();
8
+ if (!key)
9
+ throw new Error("Missing CLINIC_API_KEY");
10
+ return key;
11
+ }
12
+ export function resolveClinicBase(config) {
13
+ const value = (config.CLINIC_ENV?.trim() || "prod");
14
+ return CLINIC_BASES[value] ?? CLINIC_BASES.prod;
15
+ }
@@ -0,0 +1 @@
1
+ export declare function clinicRequest(path: string, init?: RequestInit): Promise<Response>;
@@ -0,0 +1,15 @@
1
+ import { readClinicApiKey, resolveClinicBase } from "./clinic-auth.js";
2
+ import { getPluginRuntimeConfig } from "./runtime-config.js";
3
/**
 * Sends an authenticated request to the Clinic API.
 *
 * The base URL and API key are read from the plugin runtime configuration.
 * `Authorization` and `Content-Type` defaults are applied first; headers
 * supplied through `init.headers` override them.
 *
 * @param path Path (plus optional query string) resolved against the Clinic base URL.
 * @param init Optional fetch options; every field except `headers` is passed through unchanged.
 * @returns The raw `Response`; callers are responsible for checking `response.ok`.
 * @throws Error when the runtime config or API key is missing, or when `path`
 *         resolves to a different origin than the Clinic base URL.
 */
export async function clinicRequest(path, init = {}) {
    const runtime = getPluginRuntimeConfig();
    const base = resolveClinicBase(runtime);
    const apiKey = readClinicApiKey(runtime);
    const target = new URL(path, base);
    // An absolute or protocol-relative `path` makes `new URL` ignore `base`.
    // Refuse it so the bearer token is never sent to a foreign host.
    if (target.origin !== new URL(base).origin)
        throw new Error(`Refusing to send Clinic credentials to ${target.origin}`);
    const headers = new Headers({
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
    });
    // `Headers` accepts plain objects, tuple arrays and other `Headers`
    // instances, and matches names case-insensitively. Object spread handles
    // neither, so caller overrides could be lost or duplicated.
    new Headers(init.headers ?? {}).forEach((value, name) => {
        headers.set(name, value);
    });
    return fetch(target, { ...init, headers });
}
@@ -0,0 +1,4 @@
1
+ export declare function summarizeLogLines(lines: string[], maxChars?: number): {
2
+ output: string;
3
+ truncated: boolean;
4
+ };
@@ -0,0 +1,9 @@
1
/**
 * Joins log lines with newlines and caps the result at `maxChars` UTF-16 code units.
 *
 * @param lines Log lines to join.
 * @param maxChars Maximum length of the joined text before truncation (default 4000).
 *                 Negative values are treated as 0.
 * @returns `{ output, truncated }`. When truncated, `output` is the kept prefix
 *          followed by "\n...truncated...".
 */
export function summarizeLogLines(lines, maxChars = 4000) {
    const joined = lines.join("\n");
    if (joined.length <= maxChars)
        return { output: joined, truncated: false };
    // Clamp so a negative limit cannot turn `slice` into "drop from the end".
    let cut = Math.max(0, Math.floor(maxChars));
    if (cut > 0) {
        const lastUnit = joined.charCodeAt(cut - 1);
        // Do not end on a high surrogate: that would leave half of an
        // astral character (e.g. an emoji) in the output.
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff)
            cut -= 1;
    }
    return {
        output: joined.slice(0, cut) + "\n...truncated...",
        truncated: true,
    };
}
@@ -0,0 +1,8 @@
1
+ export type PluginRuntimeConfig = {
2
+ CLINIC_API_KEY: string;
3
+ CLINIC_ENV?: string;
4
+ REPO_ROOTS?: string[];
5
+ };
6
+ export declare function setPluginRuntimeConfig(config: PluginRuntimeConfig): void;
7
+ export declare function getPluginRuntimeConfig(): PluginRuntimeConfig;
8
+ export declare function resetPluginRuntimeConfig(): void;
@@ -0,0 +1,12 @@
1
/** Module-level slot holding the configuration most recently passed to `setPluginRuntimeConfig`. */
let activeConfig;

/**
 * Stores `config` as the current plugin runtime configuration.
 * The object is kept by reference, not copied.
 */
export function setPluginRuntimeConfig(config) {
    activeConfig = config;
}

/**
 * Returns the stored runtime configuration.
 *
 * @throws Error("Plugin runtime config is not initialized") when nothing is stored.
 */
export function getPluginRuntimeConfig() {
    if (activeConfig)
        return activeConfig;
    throw new Error("Plugin runtime config is not initialized");
}

/** Clears the stored runtime configuration. */
export function resetPluginRuntimeConfig() {
    activeConfig = undefined;
}
@@ -0,0 +1 @@
1
+ export declare function toUnixNanoseconds(input: string): string;
@@ -0,0 +1,6 @@
1
/**
 * Converts a timestamp string to Unix time in nanoseconds.
 *
 * The input is parsed with `Date.parse`, so precision is limited to
 * milliseconds; the result is that millisecond value multiplied by 1,000,000.
 *
 * @param input Timestamp text understood by `Date.parse`.
 * @returns Nanoseconds since the Unix epoch, as a decimal string.
 * @throws Error(`Invalid timestamp: ...`) when the input cannot be parsed.
 */
export function toUnixNanoseconds(input) {
    const epochMillis = Date.parse(input);
    if (!Number.isNaN(epochMillis)) {
        const nanos = BigInt(epochMillis) * 1000000n;
        return nanos.toString();
    }
    throw new Error(`Invalid timestamp: ${input}`);
}
@@ -0,0 +1,2 @@
1
+ import type { Plugin } from "@opencode-ai/plugin";
2
+ export declare const TiCloudOncallPlugin: Plugin;
@@ -0,0 +1,59 @@
1
+ import path from "node:path";
2
+ import { fileURLToPath } from "node:url";
3
+ import { LogAnalyzerAgent } from "./agents/log-analyzer.js";
4
+ import { MetadataQuerierAgent } from "./agents/metadata-querier.js";
5
+ import { TicloudAgent } from "./agents/ticloud.js";
6
+ import { createTicloudPrompt } from "./prompts/ticloud.js";
7
+ import { clinicClusterQueryTool } from "./tools/clinic-cluster-query.js";
8
+ import { lokiLogQueryTool } from "./tools/loki-log-query.js";
9
+ import { resetPluginRuntimeConfig, setPluginRuntimeConfig } from "./lib/runtime-config.js";
10
// Directory containing this module, and the bundled skills folder one level above it.
const moduleDir = path.dirname(fileURLToPath(import.meta.url));
const skillsDir = path.resolve(moduleDir, "../bundled-skills");

/**
 * Plugin entry point.
 *
 * Stores the plugin options as runtime configuration (only when a
 * `CLINIC_API_KEY` is present) and returns the hook object: the two tools, a
 * `config` hook that registers agents and the bundled skills path, a pair of
 * tool-execution hooks that flag repeated identical tool calls, and a
 * `dispose` hook that clears the stored runtime configuration.
 *
 * @param _input Unused.
 * @param options Plugin options; read for `CLINIC_API_KEY` and `REPO_ROOTS`.
 */
export const TiCloudOncallPlugin = async (_input, options = {}) => {
    const runtimeConfig = options;
    // sessionID -> { signature, count } describing the most recent tool call
    // seen for that session and how many times in a row it has occurred.
    const lastCallBySession = new Map();
    if (runtimeConfig.CLINIC_API_KEY) {
        setPluginRuntimeConfig(runtimeConfig);
    }

    // Two calls count as identical when tool name and arguments serialize equally.
    const signatureOf = (toolName, args) => JSON.stringify([toolName, args]);

    const registerAgentsAndSkills = async (config) => {
        const mutableConfig = config;
        config.agent = config.agent ?? {};
        mutableConfig.skills = mutableConfig.skills ?? {};
        const existingPaths = mutableConfig.skills.paths ?? [];
        // The Set removes duplicates so the bundled directory is listed once.
        mutableConfig.skills.paths = [...new Set([...existingPaths, skillsDir])];
        mutableConfig.pluginRuntime = runtimeConfig;
        // The coordinator prompt is rebuilt so it lists the configured REPO_ROOTS.
        config.agent.ticloud = {
            ...TicloudAgent,
            prompt: createTicloudPrompt(runtimeConfig.REPO_ROOTS),
        };
        config.agent["log-analyzer"] = LogAnalyzerAgent;
        config.agent["metadata-querier"] = MetadataQuerierAgent;
    };

    const countRepeatedCall = async (input, output) => {
        const signature = signatureOf(input.tool, output.args);
        const previous = lastCallBySession.get(input.sessionID);
        const count = previous?.signature === signature ? previous.count + 1 : 1;
        lastCallBySession.set(input.sessionID, { signature, count });
    };

    const flagRepeatedCall = async (input, output) => {
        const signature = signatureOf(input.tool, input.args);
        const tracked = lastCallBySession.get(input.sessionID);
        // Annotate only from the third consecutive identical call onwards.
        if (tracked && tracked.signature === signature && tracked.count >= 3) {
            output.metadata = {
                ...(output.metadata ?? {}),
                loop_guard: "Repeated identical tool call detected",
            };
        }
    };

    return {
        tool: {
            loki_log_query: lokiLogQueryTool,
            clinic_cluster_query: clinicClusterQueryTool,
        },
        config: registerAgentsAndSkills,
        "tool.execute.before": countRepeatedCall,
        "tool.execute.after": flagRepeatedCall,
        dispose: async () => {
            resetPluginRuntimeConfig();
        },
    };
};
@@ -0,0 +1 @@
1
+ export declare const LogAnalyzerPrompt = "You are a Loki log analysis specialist for TiDB Cloud Clinic. Find root causes of failures by querying Loki efficiently.\n\nUse loki_log_query for all log access.\nDo not use shell, python, curl, web fetch, or skills for log access.\nvendor, region, and cluster_id MUST be provided before querying logs.\nDo NOT iterate across vendors or regions.\nTreat cluster_id as a keyword filter, not a label.\nMax 10 queries per task.\n\nCore rules:\n- RFC3339 timestamps only.\n- Total time range must be strictly less than 7 days.\n- Never use {cluster_id=\"...\"} or {tidb_cluster_id=\"...\"} as labels.\n- Never hardcode label names or values. Discover first.\n- Fire independent queries in one message whenever possible.\n- Return root cause with 2-3 key log lines, not raw dumps.\n\nWorkflow: logs are not found in one shot. Narrow progressively.\n\n1. Meta information\n- Use the provided vendor and region to identify the endpoint.\n- Use only that endpoint.\n\n2. Reachability probe\n- First call labels action to verify the endpoint is alive and data exists.\n- If the endpoint probe is empty, report that immediately.\n\n3. Label and label-value discovery\n- Before content search, discover labels and key label values in parallel.\n- Prefer discovering: labels, app, namespace, and container.\n- Query label_values(\"namespace\") and label_values(\"container\") before any content query.\n\n4. Wide search\n- Start broad and shrink gradually.\n- Use keywords to narrow namespace, container/app, and time range.\n- First goal: confirm the target appears in logs at all.\n- Second goal: identify useful pivots such as T0, trace_id, namespace or container/app.\n\n5. 
Narrow search\n- Tighten the time range around T0.\n- Prefer more precise keywords such as trace_id or job_id when available.\n- If needed, adjust both keywords and time range to isolate the exact failure.\n\nFast fail rules:\n- If 3 consecutive queries with the same strategy return 0 results, switch strategy.\n- Example: text search to label-scoped search, or broad container search to namespace/app search.\n- If 3 consecutive queries on the same endpoint are all empty, return failure and stop trying.\n\nPractical defaults:\n- Default time range: last 24 hours when the user provides no window.\n- Never search more than 7 days.\n- Prioritize queries that improve precision, not volume.\n\nCommon label reference:\n- Reference only. Always verify via discovery.\n- Common namespaces: tidb-admin, prod-ms, cluster-ng-service, tidb-management-service\n- Premium / Essential v2 containers: cluster-service-ng, executor\n- Starter / Essential containers: manager, tidb-gateway\n- Starter / Essential app: serverless-svc\n\nTool parameter examples:\n\n1. 
Export failure with export_id\n- Probe: loki_log_query({ action: \"query_range\", vendor: \"alicloud\", region: \"ap-northeast-1\", cluster_tier: \"starter\", start_time: \"2026-06-03T12:00:00Z\", end_time: \"2026-06-03T12:10:00Z\", direction: \"backward\", limit: 3, logql: \"{container=~\".+\"}\" })\n- Discovery: loki_log_query({ action: \"label_values\", vendor: \"alicloud\", region: \"ap-northeast-1\", cluster_tier: \"starter\", label_name: \"container\" })\n- Wide search: loki_log_query({ action: \"query_range\", vendor: \"alicloud\", region: \"ap-northeast-1\", cluster_tier: \"starter\", start_time: \"2026-06-03T00:00:00Z\", end_time: \"2026-06-04T00:00:00Z\", direction: \"forward\", limit: 50, logql: \"{instance=~\"exp-abc123-.+\"}\" })\n- Narrow search: loki_log_query({ action: \"query_range\", vendor: \"alicloud\", region: \"ap-northeast-1\", cluster_tier: \"starter\", start_time: \"2026-06-03T09:49:00Z\", end_time: \"2026-06-03T10:10:00Z\", direction: \"backward\", limit: 50, logql: \"{container=\"executor\"} |= \"exp-abc123\" |~ \"(?i)(error|fail|fatal)\"\" })\n\n2. 
OOM (no job_id)\n- Probe: loki_log_query({ action: \"query_range\", vendor: \"aws\", region: \"us-east-1\", cluster_tier: \"starter\", start_time: \"2026-06-03T12:00:00Z\", end_time: \"2026-06-03T12:10:00Z\", direction: \"backward\", limit: 3, logql: \"{container=~\".+\"}\" })\n- Wide search: loki_log_query({ action: \"query_range\", vendor: \"aws\", region: \"us-east-1\", cluster_tier: \"starter\", start_time: \"2026-06-03T00:00:00Z\", end_time: \"2026-06-04T00:00:00Z\", direction: \"backward\", limit: 30, logql: \"{container=~\".+\"} |= \"10750825381049486289\"\" })\n- Narrow search: loki_log_query({ action: \"query_range\", vendor: \"aws\", region: \"us-east-1\", cluster_tier: \"starter\", start_time: \"2026-06-03T14:20:00Z\", end_time: \"2026-06-03T14:40:00Z\", direction: \"backward\", limit: 30, logql: \"{container=\"cluster-service-ng\"} |= \"10750825381049486289\" |~ \"(?i)(OOM|memory)\"\" })\n\n3. Slow connection with time window\n- Discovery: loki_log_query({ action: \"label_values\", vendor: \"aws\", region: \"us-east-1\", cluster_tier: \"starter\", label_name: \"container\" })\n- Wide search: loki_log_query({ action: \"query_range\", vendor: \"aws\", region: \"us-east-1\", cluster_tier: \"starter\", start_time: \"2026-06-03T14:00:00Z\", end_time: \"2026-06-03T15:00:00Z\", direction: \"backward\", limit: 50, logql: \"{container=~\".+\"} |= \"xxx\"\" })\n- Narrow search: loki_log_query({ action: \"query_range\", vendor: \"aws\", region: \"us-east-1\", cluster_tier: \"starter\", start_time: \"2026-06-03T14:20:00Z\", end_time: \"2026-06-03T14:40:00Z\", direction: \"backward\", limit: 30, logql: \"{container=\"tidb-gateway\"} |= \"xxx\" |~ \"(?i)(timeout|connection refused)\"\" })\n";
@@ -0,0 +1,78 @@
1
/**
 * System prompt for the `log-analyzer` subagent.
 *
 * Inner quotes in the `logql` examples are written as `\\"` on purpose: in a
 * template literal a plain `\"` collapses to `"`, which would show the model
 * examples such as `logql: "{container=~".+"}"` with nested, unescaped quotes.
 */
export const LogAnalyzerPrompt = `You are a Loki log analysis specialist for TiDB Cloud Clinic. Find root causes of failures by querying Loki efficiently.

Use loki_log_query for all log access.
Do not use shell, python, curl, web fetch, or skills for log access.
vendor, region, and cluster_id MUST be provided before querying logs.
Do NOT iterate across vendors or regions.
Treat cluster_id as a keyword filter, not a label.
Max 10 queries per task.

Core rules:
- RFC3339 timestamps only.
- Total time range must be strictly less than 7 days.
- Never use {cluster_id="..."} or {tidb_cluster_id="..."} as labels.
- Never hardcode label names or values. Discover first.
- Fire independent queries in one message whenever possible.
- Return root cause with 2-3 key log lines, not raw dumps.

Workflow: logs are not found in one shot. Narrow progressively.

1. Meta information
- Use the provided vendor and region to identify the endpoint.
- Use only that endpoint.

2. Reachability probe
- First call labels action to verify the endpoint is alive and data exists.
- If the endpoint probe is empty, report that immediately.

3. Label and label-value discovery
- Before content search, discover labels and key label values in parallel.
- Prefer discovering: labels, app, namespace, and container.
- Query label_values("namespace") and label_values("container") before any content query.

4. Wide search
- Start broad and shrink gradually.
- Use keywords to narrow namespace, container/app, and time range.
- First goal: confirm the target appears in logs at all.
- Second goal: identify useful pivots such as T0, trace_id, namespace or container/app.

5. Narrow search
- Tighten the time range around T0.
- Prefer more precise keywords such as trace_id or job_id when available.
- If needed, adjust both keywords and time range to isolate the exact failure.

Fast fail rules:
- If 3 consecutive queries with the same strategy return 0 results, switch strategy.
- Example: text search to label-scoped search, or broad container search to namespace/app search.
- If 3 consecutive queries on the same endpoint are all empty, return failure and stop trying.

Practical defaults:
- Default time range: last 24 hours when the user provides no window.
- Never search more than 7 days.
- Prioritize queries that improve precision, not volume.

Common label reference:
- Reference only. Always verify via discovery.
- Common namespaces: tidb-admin, prod-ms, cluster-ng-service, tidb-management-service
- Premium / Essential v2 containers: cluster-service-ng, executor
- Starter / Essential containers: manager, tidb-gateway
- Starter / Essential app: serverless-svc

Tool parameter examples:

1. Export failure with export_id
- Probe: loki_log_query({ action: "query_range", vendor: "alicloud", region: "ap-northeast-1", cluster_tier: "starter", start_time: "2026-06-03T12:00:00Z", end_time: "2026-06-03T12:10:00Z", direction: "backward", limit: 3, logql: "{container=~\\".+\\"}" })
- Discovery: loki_log_query({ action: "label_values", vendor: "alicloud", region: "ap-northeast-1", cluster_tier: "starter", label_name: "container" })
- Wide search: loki_log_query({ action: "query_range", vendor: "alicloud", region: "ap-northeast-1", cluster_tier: "starter", start_time: "2026-06-03T00:00:00Z", end_time: "2026-06-04T00:00:00Z", direction: "forward", limit: 50, logql: "{instance=~\\"exp-abc123-.+\\"}" })
- Narrow search: loki_log_query({ action: "query_range", vendor: "alicloud", region: "ap-northeast-1", cluster_tier: "starter", start_time: "2026-06-03T09:49:00Z", end_time: "2026-06-03T10:10:00Z", direction: "backward", limit: 50, logql: "{container=\\"executor\\"} |= \\"exp-abc123\\" |~ \\"(?i)(error|fail|fatal)\\"" })

2. OOM (no job_id)
- Probe: loki_log_query({ action: "query_range", vendor: "aws", region: "us-east-1", cluster_tier: "starter", start_time: "2026-06-03T12:00:00Z", end_time: "2026-06-03T12:10:00Z", direction: "backward", limit: 3, logql: "{container=~\\".+\\"}" })
- Wide search: loki_log_query({ action: "query_range", vendor: "aws", region: "us-east-1", cluster_tier: "starter", start_time: "2026-06-03T00:00:00Z", end_time: "2026-06-04T00:00:00Z", direction: "backward", limit: 30, logql: "{container=~\\".+\\"} |= \\"10750825381049486289\\"" })
- Narrow search: loki_log_query({ action: "query_range", vendor: "aws", region: "us-east-1", cluster_tier: "starter", start_time: "2026-06-03T14:20:00Z", end_time: "2026-06-03T14:40:00Z", direction: "backward", limit: 30, logql: "{container=\\"cluster-service-ng\\"} |= \\"10750825381049486289\\" |~ \\"(?i)(OOM|memory)\\"" })

3. Slow connection with time window
- Discovery: loki_log_query({ action: "label_values", vendor: "aws", region: "us-east-1", cluster_tier: "starter", label_name: "container" })
- Wide search: loki_log_query({ action: "query_range", vendor: "aws", region: "us-east-1", cluster_tier: "starter", start_time: "2026-06-03T14:00:00Z", end_time: "2026-06-03T15:00:00Z", direction: "backward", limit: 50, logql: "{container=~\\".+\\"} |= \\"xxx\\"" })
- Narrow search: loki_log_query({ action: "query_range", vendor: "aws", region: "us-east-1", cluster_tier: "starter", start_time: "2026-06-03T14:20:00Z", end_time: "2026-06-03T14:40:00Z", direction: "backward", limit: 30, logql: "{container=\\"tidb-gateway\\"} |= \\"xxx\\" |~ \\"(?i)(timeout|connection refused)\\"" })
`;
@@ -0,0 +1 @@
1
+ export declare const MetadataQuerierPrompt = "You are a Clinic cluster metadata specialist.\n\nRules:\n- Prefer clinic_cluster_query whenever you need cluster information.\n- Do not use shell, python, curl, web fetch, or skills as a substitute for clinic_cluster_query when fetching cluster information.\n- If cluster_id is known, pass cluster_id.\n- If cluster_id is unknown, pass query.\n- Return the cluster information you can retrieve. If a field cannot be retrieved, omit it.\n- Return concise operational summaries.\n";
@@ -0,0 +1,10 @@
1
/**
 * System prompt for the `metadata-querier` subagent, assembled line by line.
 * The trailing empty entry produces the final newline.
 */
export const MetadataQuerierPrompt = [
    "You are a Clinic cluster metadata specialist.",
    "",
    "Rules:",
    "- Prefer clinic_cluster_query whenever you need cluster information.",
    "- Do not use shell, python, curl, web fetch, or skills as a substitute for clinic_cluster_query when fetching cluster information.",
    "- If cluster_id is known, pass cluster_id.",
    "- If cluster_id is unknown, pass query.",
    "- Return the cluster information you can retrieve. If a field cannot be retrieved, omit it.",
    "- Return concise operational summaries.",
    "",
].join("\n");
@@ -0,0 +1,2 @@
1
+ export declare function createTicloudPrompt(repoRoots?: string[]): string;
2
+ export declare const TicloudPrompt: string;
@@ -0,0 +1,50 @@
1
+ export function createTicloudPrompt(repoRoots) {
2
+ const renderedRoots = repoRoots?.length ? repoRoots.join("\n- ") : "(not set)";
3
+ return `You are an internal TiCloud oncall coordinator.
4
+
5
+ Goals:
6
+ - Ground diagnosis with cluster metadata before interpreting logs.
7
+ - Use log evidence to validate hypotheses.
8
+ - Reuse the built-in explore subagent for local code search only.
9
+ - State uncertainty explicitly.
10
+
11
+ Logging rules:
12
+ - Use only Loki direct API for log queries.
13
+ - Use metadata to obtain region, vendor, and tier(deploy_type) before dispatching log analysis.
14
+ - When you need cluster information, use clinic_cluster_query rather than relying on skills.
15
+ - When cluster_id is known, prefer clinic_cluster_query with exact cluster_id lookup.
16
+ - When you need logs, prefer loki_log_query rather than shell, python, curl, or skills.
17
+ - Do not query logs for more than 7 days at a time.
18
+
19
+ Dispatching log-analyzer:
20
+ - ALWAYS pass vendor, region, and cluster_id as structured context when dispatching log-analyzer.
21
+ - Always pass vendor, region, and cluster_id when dispatching log-analyzer.
22
+ - Example dispatch message: "Query logs for cluster_id=123, vendor=alicloud, region=ap-northeast-1, time range=..., symptom=..."
23
+ - The log-analyzer will refuse to query if vendor, region, or cluster_id is missing.
24
+
25
+ Code search rules:
26
+ - The plugin provides REPO_ROOTS as candidate repository roots.
27
+ - If REPO_ROOTS is missing or the candidate repository set cannot be identified confidently, ask the user to provide it.
28
+ - When dispatching explore for code search, search only inside the narrowed candidate repositories selected from REPO_ROOTS.
29
+ - Only dispatch explore when the candidate repo set is reasonably small.
30
+ - Do not search the whole workspace when the repository path is unknown.
31
+ - Configured REPO_ROOTS:
32
+ - ${renderedRoots}
33
+
34
+ Repo search heuristics:
35
+ - Prefer code search terms extracted from logs, especially error phrases, app names, container names, namespace names, and unique stack trace fragments.
36
+ - If logs already reveal likely app/container names, pass those as search hints to explore.
37
+ - Use metadata first, then logs, then code; code search should be a targeted follow-up, not a blind scan.
38
+ - Treat REPO_ROOTS as a candidate repository roots pool and narrow it before code search.
39
+
40
+ Workflow:
41
+ 1. Identify missing context.
42
+ 2. Dispatch metadata-querier when cluster identity, region, vendor, or tier is unclear. MUST complete this before log analysis.
43
+ 3. Dispatch log-analyzer ONLY after metadata is resolved. Pass vendor, region, cluster_id, and cluster_tier explicitly.
44
+ 4. Use metadata and log clues to narrow candidate repositories from REPO_ROOTS.
45
+ 5. Ask for REPO_ROOTS before code search if it is missing or still too ambiguous.
46
+ 6. Dispatch explore only when code evidence is needed, and constrain the search to the narrowed candidate repositories.
47
+ 7. Summarize findings, evidence, likely cause, and next steps.
48
+ `;
49
+ }
50
+ export const TicloudPrompt = createTicloudPrompt();
@@ -0,0 +1,3 @@
1
+ import { type ToolDefinition } from "@opencode-ai/plugin";
2
+ export declare function resetClinicClusterQueryCache(): void;
3
+ export declare const clinicClusterQueryTool: ToolDefinition;
@@ -0,0 +1,127 @@
1
+ import { tool } from "@opencode-ai/plugin";
2
+ import { clinicRequest } from "../lib/clinic-client.js";
3
/** clusterID -> orgID, filled from cluster payloads that carry both values. */
const orgIDCache = new Map();

/** Maximum number of clusters rendered by `formatClusterList`. */
const MAX_LISTED_CLUSTERS = 10;

/** Empties the clusterID -> orgID cache. */
export function resetClinicClusterQueryCache() {
    orgIDCache.clear();
}

/** Returns the first non-nullish value among `keys` on `record`, or undefined. */
function firstDefined(record, keys) {
    for (const key of keys) {
        const value = record?.[key];
        if (value !== undefined && value !== null)
            return value;
    }
    return undefined;
}

/**
 * Extracts the cluster array from a response payload, looking at `items`,
 * `data.clusters` and `clusters` in that order. Anything that is not an array
 * (for example an error payload) yields an empty list.
 */
function getClusterItems(payload) {
    const items = payload?.items ?? payload?.data?.clusters ?? payload?.clusters;
    return Array.isArray(items) ? items : [];
}

/** Cluster identifier under any of the field spellings this module recognizes. */
function getClusterID(cluster) {
    return firstDefined(cluster, ["id", "clusterID", "clusterId", "cluster_id"]);
}

function getClusterName(cluster) {
    return firstDefined(cluster, ["clusterName", "name"]);
}

function getClusterStatus(cluster) {
    return firstDefined(cluster, ["clusterStatus", "status"]);
}

function getClusterDeployType(cluster) {
    return firstDefined(cluster, ["clusterDeployType", "deployType"]);
}

function getClusterDeployTypeV2(cluster) {
    return firstDefined(cluster, ["clusterDeployTypeV2", "deployTypeV2"]);
}

function getClusterProvider(cluster) {
    return firstDefined(cluster, ["clusterProviderName", "provider"]);
}

function getClusterRegion(cluster) {
    return firstDefined(cluster, ["clusterRegionName", "region"]);
}

/** Remembers the cluster's orgID when both the cluster ID and `orgID` are present. */
function cacheOrgID(cluster) {
    const clusterID = getClusterID(cluster);
    const orgID = cluster?.orgID;
    if (clusterID && orgID)
        orgIDCache.set(clusterID, orgID);
}

/**
 * Renders one cluster as newline-separated `key=value` pairs.
 * `clusterID` and `name` are always present ("unknown" when missing); every
 * other field is emitted only when it has a truthy value.
 */
function formatCluster(cluster) {
    const optionalFields = [
        ["status", getClusterStatus(cluster)],
        ["deployTypeV2", getClusterDeployTypeV2(cluster)],
        ["deployType", getClusterDeployType(cluster)],
        ["provider", getClusterProvider(cluster)],
        ["region", getClusterRegion(cluster)],
        ["version", cluster?.clusterVersion ?? cluster?.version],
        ["projectID", cluster?.projectID],
        ["tenantID", cluster?.tenantID],
        ["parentID", cluster?.parentID],
    ];
    return [
        `clusterID=${getClusterID(cluster) ?? "unknown"}`,
        `name=${getClusterName(cluster) ?? "unknown"}`,
        ...optionalFields
            .filter(([, value]) => Boolean(value))
            .map(([label, value]) => `${label}=${value}`),
    ].join("\n");
}

/**
 * Renders a cluster search response.
 *
 * - no clusters: "No matching clusters found."
 * - exactly one cluster with an ID: the detailed `formatCluster` output
 * - otherwise: one `- id: name ...` line per cluster, capped at
 *   MAX_LISTED_CLUSTERS, followed by a note stating how many were left out so
 *   the reader does not mistake the cap for the complete result.
 */
function formatClusterList(payload) {
    const items = getClusterItems(payload);
    if (!items.length)
        return "No matching clusters found.";
    if (items.length === 1 && getClusterID(items[0]))
        return formatCluster(items[0]);
    const lines = items.slice(0, MAX_LISTED_CLUSTERS).map((item) => [
        `- ${getClusterID(item) ?? "unknown"}: ${getClusterName(item) ?? "unknown"}`,
        getClusterDeployTypeV2(item) ? `type=${getClusterDeployTypeV2(item)}` : undefined,
        getClusterProvider(item) ? `provider=${getClusterProvider(item)}` : undefined,
        getClusterRegion(item) ? `region=${getClusterRegion(item)}` : undefined,
        item?.tenantID ? `tenantID=${item.tenantID}` : undefined,
    ]
        .filter(Boolean)
        .join(" "));
    const omitted = items.length - lines.length;
    if (omitted > 0)
        lines.push(`... ${omitted} more cluster(s) not shown; refine the query to narrow the results.`);
    return lines.join("\n");
}
73
/**
 * Looks up a single cluster (deleted ones included) through the dashboard
 * search endpoint and caches its orgID when the payload carries one.
 *
 * @param clusterID Cluster identifier to match exactly.
 * @returns The matching cluster object, or undefined when none matches.
 * @throws Error when the Clinic API answers with a non-2xx status, so that an
 *         authentication or server failure is not mistaken for "no such cluster".
 */
async function searchClusterByID(clusterID) {
    const params = new URLSearchParams({
        cluster_id: clusterID,
        show_deleted: "true",
        limit: "1",
    });
    const response = await clinicRequest(`/clinic/api/v1/dashboard/clusters?${params}`);
    if (!response.ok)
        throw new Error(`Clinic cluster lookup failed with HTTP ${response.status}`);
    const data = await response.json();
    const wanted = String(clusterID);
    // Compare as strings so an ID delivered as a JSON number still matches.
    const cluster = getClusterItems(data).find((item) => {
        const id = getClusterID(item);
        return id !== undefined && id !== null && String(id) === wanted;
    });
    if (cluster)
        cacheOrgID(cluster);
    return cluster;
}
85
/**
 * Returns the orgID for a cluster, preferring the in-memory cache and falling
 * back to a cluster search. A freshly found orgID is cached for later calls.
 *
 * @param clusterID Cluster identifier.
 * @returns The orgID, or undefined when the search yields no cluster or no orgID.
 */
async function resolveOrgID(clusterID) {
    const known = orgIDCache.get(clusterID);
    if (!known) {
        const found = await searchClusterByID(clusterID);
        const orgID = found?.orgID;
        if (orgID)
            orgIDCache.set(clusterID, orgID);
        return orgID;
    }
    return known;
}
95
/**
 * Fetches the most detailed cluster record available.
 *
 * Starts from the dashboard search summary and, when an orgID is known,
 * upgrades it with the per-org cluster detail endpoint. If the detail request
 * fails or returns nothing usable, the summary is returned instead.
 *
 * @param clusterID Cluster identifier.
 * @returns The detail record, the summary as a fallback, or undefined when the
 *          cluster cannot be found.
 */
async function fetchClusterDetails(clusterID) {
    const summary = await searchClusterByID(clusterID);
    if (!summary)
        return undefined;
    // Use the summary's orgID or a previously cached one. Going through another
    // search here would only repeat the request that was just made.
    const orgID = summary.orgID ?? orgIDCache.get(clusterID);
    if (!orgID)
        return summary;
    const detailPath = `/clinic/api/v1/orgs/${encodeURIComponent(orgID)}/clusters/${encodeURIComponent(clusterID)}`;
    const response = await clinicRequest(detailPath);
    // An error body must not be formatted as if it were a cluster.
    if (!response.ok)
        return summary;
    const detail = await response.json().catch(() => undefined);
    const cluster = detail?.data ?? detail;
    if (!cluster)
        return summary;
    cacheOrgID(cluster);
    return cluster;
}
107
/**
 * Tool that queries cluster metadata from the Clinic API.
 *
 * With `cluster_id` it returns the detailed record for that cluster; otherwise
 * it runs a free-text `query` search and returns the formatted result list.
 * Throws when neither argument is provided, or when the search endpoint
 * answers with a non-2xx status (previously such a response was reported as
 * "No matching clusters found.").
 */
export const clinicClusterQueryTool = tool({
    description: "Query TiDB Cloud cluster metadata via Clinic API",
    args: {
        cluster_id: tool.schema.string().optional(),
        query: tool.schema.string().optional(),
    },
    async execute(args) {
        // Whitespace-only values are treated as absent.
        const clusterID = args.cluster_id?.trim();
        if (clusterID) {
            const cluster = await fetchClusterDetails(clusterID);
            return cluster ? formatCluster(cluster) : "No matching clusters found.";
        }
        const query = args.query?.trim();
        if (!query)
            throw new Error("query requires cluster_id or query");
        const params = new URLSearchParams({ query });
        const response = await clinicRequest(`/clinic/api/v1/dashboard/clusters?${params}`);
        if (!response.ok)
            throw new Error(`Clinic cluster search failed with HTTP ${response.status}`);
        return formatClusterList(await response.json());
    },
});
@@ -0,0 +1,2 @@
1
+ import { type ToolDefinition } from "@opencode-ai/plugin";
2
+ export declare const lokiLogQueryTool: ToolDefinition;
@@ -0,0 +1,119 @@
1
+ import { tool } from "@opencode-ai/plugin";
2
+ import { summarizeLogLines } from "../lib/result-format.js";
3
+ import { toUnixNanoseconds } from "../lib/time.js";
4
+ const SEVEN_DAYS_MS = 7 * 24 * 60 * 60 * 1000;
5
/**
 * Report whether a cluster tier string is one of the premium-class tiers.
 *
 * @param {string | undefined} tier - Tier name to classify.
 * @returns {boolean} `true` only for exactly "premium" or "nextgen".
 */
function isPremiumTier(tier) {
    return ["premium", "nextgen"].includes(tier);
}
8
/**
 * Build the ordered list of Loki API base URLs to try for a region and vendor.
 *
 * An explicit variant ("standard", "ng" or "premium") yields exactly that one
 * endpoint. Any other variant value falls back to the tier: premium-class
 * tiers (per `isPremiumTier`) yield the "ng" endpoint followed by the
 * "premium" endpoint, everything else yields the standard endpoint.
 *
 * @param {string} region - Region segment of the hostname.
 * @param {string} vendor - Cloud vendor segment of the hostname.
 * @param {string} variant - Requested endpoint variant.
 * @param {string | undefined} tier - Cluster tier, consulted only when the variant is not explicit.
 * @returns {string[]} Base URLs in the order they should be attempted.
 */
function buildBaseCandidates(region, vendor, variant, tier) {
    // The three endpoint flavours differ only by a suffix on the region segment.
    const endpointFor = (regionSuffix) => `https://www.ds.${region}${regionSuffix}.${vendor}.observability.tidbcloud.com/loki/self-monitoring/loki/api/v1`;
    switch (variant) {
        case "standard":
            return [endpointFor("")];
        case "ng":
            return [endpointFor("-ng")];
        case "premium":
            return [endpointFor("-premium")];
        default:
            return isPremiumTier(tier)
                ? [endpointFor("-ng"), endpointFor("-premium")]
                : [endpointFor("")];
    }
}
22
/**
 * Try each candidate Loki base URL in order and return the first successful response.
 *
 * @param {string[]} candidates - Base API URLs to try, in priority order.
 * @param {string} pathname - API path relative to each base (for example "labels").
 * @param {URLSearchParams} searchParams - Query parameters applied to every attempt.
 * @returns {Promise<Response>} The first response whose `ok` flag is true.
 * @throws {Error} When no candidate succeeds; the message lists every attempted
 *   URL together with its HTTP status and body excerpt, or its network error.
 */
async function requestWithFallback(candidates, pathname, searchParams) {
    const failures = [];
    // The query string is identical for every candidate, so serialize it once.
    const query = searchParams.toString();
    for (const base of candidates) {
        // The trailing slash makes `pathname` resolve underneath the base path
        // instead of replacing its last segment.
        const url = new URL(pathname, `${base}/`);
        url.search = query;
        try {
            const response = await fetch(url);
            if (response.ok) {
                return response;
            }
            // Reading the body is best-effort; an unreadable body is reported as empty.
            const body = await response.text().catch(() => "");
            failures.push(`${url}: HTTP ${response.status} - ${body.slice(0, 500)}`);
        }
        catch (err) {
            failures.push(`${url}: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    throw new Error(`Failed to reach Loki endpoint. Errors:\n${failures.join("\n")}`);
}
43
/**
 * Validate an optional start/end timestamp pair for a Loki direct query.
 *
 * Nothing is checked unless both values are provided. When both are present
 * they must parse with `Date.parse`, the end must not precede the start, and
 * the span must not exceed `SEVEN_DAYS_MS`.
 *
 * @param {string | undefined} startTime - Start of the range.
 * @param {string | undefined} endTime - End of the range.
 * @returns {void}
 * @throws {Error} With an "Invalid input: ..." message when any check fails.
 */
function validateTimeRange(startTime, endTime) {
    const bothGiven = Boolean(startTime) && Boolean(endTime);
    if (!bothGiven) {
        return;
    }
    const [from, to] = [startTime, endTime].map((stamp) => Date.parse(stamp));
    if ([from, to].some((ms) => Number.isNaN(ms))) {
        throw new Error("Invalid input: start_time and end_time must be valid timestamps.");
    }
    const spanMs = to - from;
    if (spanMs < 0) {
        throw new Error("Invalid input: end_time must be later than start_time.");
    }
    if (spanMs > SEVEN_DAYS_MS) {
        throw new Error("Invalid input: Loki direct query time range must be within 7 days.");
    }
}
56
/**
 * Render a Loki labels / label-values response as newline-separated text.
 *
 * @param {{ data?: string[] } | null | undefined} payload - Parsed JSON response body.
 * @returns {string} One entry of `payload.data` per line, or "No labels found."
 *   when the list is missing or empty.
 */
function formatLabelList(payload) {
    const names = payload?.data ?? [];
    return names.length ? names.join("\n") : "No labels found.";
}
62
/**
 * Render a Loki `query_range` response as text.
 *
 * Each stream in `payload.data.result` contributes a header line
 * ("stream N" plus its `key=value` labels when it has any) followed by one
 * "timestamp => line" entry per value. The collected lines are passed through
 * `summarizeLogLines` and its `output` is appended under a stream-count banner.
 *
 * @param {any} payload - Parsed JSON response body.
 * @returns {string} Formatted text, or a hint message when no streams matched.
 */
function formatLokiPayload(payload) {
    const streams = payload?.data?.result ?? [];
    if (!streams.length) {
        return "No log lines matched the query. Check time range, labels, or whether the filter is too narrow.";
    }
    const collected = [];
    streams.forEach((stream, position) => {
        const ordinal = position + 1;
        const labelPairs = Object.entries(stream.stream ?? {}).map(([key, value]) => `${key}=${value}`);
        const labelText = labelPairs.join(" ");
        const entries = (stream.values ?? []).map((entry) => `${entry[0]} => ${entry[1]}`);
        collected.push(labelText ? `stream ${ordinal}: ${labelText}` : `stream ${ordinal}`, ...entries);
    });
    const summary = summarizeLogLines(collected);
    return `Matched ${streams.length} streams\n\n${summary.output}`;
}
77
/**
 * Tool definition that talks to the Loki HTTP API directly.
 *
 * Supported actions:
 * - "labels": list label names.
 * - "label_values": list the values of `label_name` (required for this action).
 * - "query_range": run `logql` between `start_time` and `end_time` (all three required).
 *
 * `start_time` / `end_time` are validated with `validateTimeRange` and, when
 * present, sent as `start` / `end` after conversion by `toUnixNanoseconds`.
 * Candidate endpoints come from `buildBaseCandidates` and are tried in order
 * by `requestWithFallback`.
 *
 * Throws an "Invalid input: ..." Error when a required argument for the chosen
 * action is missing or the time range is rejected.
 */
export const lokiLogQueryTool = tool({
    description: "Query Loki direct API for labels, label values, or query ranges.",
    args: {
        action: tool.schema.enum(["labels", "label_values", "query_range"]),
        region: tool.schema.string(),
        vendor: tool.schema.enum(["aws", "alicloud", "gcp", "azure"]),
        endpoint_variant: tool.schema.enum(["standard", "ng", "premium", "auto"]).default("auto"),
        cluster_tier: tool.schema.string().optional(),
        label_name: tool.schema.string().optional(),
        logql: tool.schema.string().optional(),
        start_time: tool.schema.string().optional(),
        end_time: tool.schema.string().optional(),
        limit: tool.schema.number().int().positive().default(100),
        direction: tool.schema.enum(["forward", "backward"]).default("backward"),
    },
    async execute(args) {
        validateTimeRange(args.start_time, args.end_time);
        const candidates = buildBaseCandidates(args.region, args.vendor, args.endpoint_variant, args.cluster_tier);
        const params = new URLSearchParams();
        if (args.start_time)
            params.set("start", toUnixNanoseconds(args.start_time));
        if (args.end_time)
            params.set("end", toUnixNanoseconds(args.end_time));
        if (args.action === "labels") {
            const response = await requestWithFallback(candidates, "labels", params);
            return formatLabelList(await response.json());
        }
        if (args.action === "label_values") {
            if (!args.label_name)
                throw new Error("Invalid input: label_name is required for label_values.");
            // Encode the label name so characters such as "/", "?" or "#" stay
            // inside its own path segment instead of altering the request URL.
            const labelSegment = encodeURIComponent(args.label_name);
            const response = await requestWithFallback(candidates, `label/${labelSegment}/values`, params);
            return formatLabelList(await response.json());
        }
        // Remaining action is "query_range".
        if (!args.logql || !args.start_time || !args.end_time) {
            throw new Error("Invalid input: query_range requires logql, start_time, and end_time.");
        }
        params.set("query", args.logql);
        params.set("limit", String(args.limit));
        params.set("direction", args.direction ?? "backward");
        const response = await requestWithFallback(candidates, "query_range", params);
        return formatLokiPayload(await response.json());
    },
});
package/package.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "name": "@shiyuhang0/ticloud-oncall",
3
+ "version": "0.1.0",
4
+ "description": "TiDB Cloud oncall plugin for opencode",
5
+ "type": "module",
6
+ "main": "dist/src/index.js",
7
+ "types": "dist/src/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "import": "./dist/src/index.js",
11
+ "types": "./dist/src/index.d.ts"
12
+ }
13
+ },
14
+ "files": [
15
+ "dist/src",
16
+ "bundled-skills"
17
+ ],
18
+ "scripts": {
19
+ "build": "tsc -p tsconfig.json",
20
+ "typecheck": "tsc --noEmit -p tsconfig.json",
21
+ "test": "vitest run",
22
+ "prepublishOnly": "npm run build"
23
+ },
24
+ "dependencies": {
25
+ "@opencode-ai/plugin": "^1.15.13",
26
+ "zod": "^3.25.76"
27
+ },
28
+ "devDependencies": {
29
+ "@types/node": "^22.15.30",
30
+ "typescript": "^5.8.3",
31
+ "vitest": "^2.1.8"
32
+ },
33
+ "publishConfig": {
34
+ "access": "public"
35
+ }
36
+ }