npm - @argosvix/mcp-server - Versions diffs - 0.28.3-alpha.1 → 0.30.0-alpha.1 - Mend

@argosvix/mcp-server 0.28.3-alpha.1 → 0.30.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/resources.d.ts.map +1 -1
package/dist/resources.js +26 -8
package/dist/resources.js.map +1 -1
package/dist/resources.test.js +43 -0
package/dist/resources.test.js.map +1 -1
package/dist/tools.d.ts.map +1 -1
package/dist/tools.js +182 -7
package/dist/tools.js.map +1 -1
package/dist/tools.test.js +30 -1
package/dist/tools.test.js.map +1 -1
package/dist/version.d.ts +1 -1
package/dist/version.js +1 -1
package/package.json +1 -1

package/dist/tools.js (changed)

@@ -51,6 +51,7 @@ const TOOL_ARG_ALLOWLIST = {
         "sleepMinutes",
         "enabled",
         "conditions",
+        "evalCriterionId",
     ],
     // 2026-06-03 v1.6 #13-4 = update_alert / delete_alert tools (= axis 1 強化)。
     // backend PATCH/DELETE /v1/alerts/:id の wrap。 alertType は immutable (= backend
@@ -200,6 +201,13 @@ const TOOL_ARG_ALLOWLIST = {
     // 2026-06-02 Codex round 2 🔴 fix = idempotencyKey 必須 path (= AI agent が
     // retry した時に backend で dedup)、 client が opaque string 64 char で carry。
     run_eval: ["name", "recentCount", "label", "promptRegistryId", "idempotencyKey"],
+    // 2026-06-15 #2 Phase D4 = golden dataset (期待出力つき固定テストセット) ツール。
+    // list/get/create は CRUD、 run は対象モデルで実行 → judge → eval_scores (= 回帰 A/B)。
+    list_eval_datasets: [],
+    get_eval_dataset: ["datasetId"],
+    create_eval_dataset: ["name", "description", "items", "frozen"],
+    run_eval_dataset: ["datasetId", "targetModel", "judgeModel", "idempotencyKey"],
+    delete_eval_dataset: ["datasetId"],
     // 2026-06-06 axis 4 Tier 2 = 自律 AI ops 第一弾。 mutation 軸なので dryRun 必須、
     // backend で audit emit + UPDATE 順序 + idempotency carry (= R35 narrative)。
     purge_expired_plaintext: ["olderThanDays", "dryRun", "approvalId"],
@@ -377,7 +385,8 @@ export const tools = [
                     type: "string",
                     description: "監視する指標。 cost_threshold=単発コスト閾値 (USD) / monthly_budget=月次予算 (USD) / " +
                         "error_rate=エラー率(%) / latency_degradation=レイテンシ劣化 (ms) / " +
-                        "anomaly_cost / anomaly_latency / anomaly_error_rate=異常検知 (= windowMinutes は 60 固定)",
+                        "anomaly_cost / anomaly_latency / anomaly_error_rate=異常検知 (= windowMinutes は 60 固定) / " +
+                        "eval_score=eval スコア低下 (= evalCriterionId 必須、 直近 window の平均 score が thresholdValue 未満で発火)",
                     enum: [
                         "cost_threshold",
                         "error_rate",
@@ -386,6 +395,7 @@ export const tools = [
                         "anomaly_cost",
                         "anomaly_latency",
                         "anomaly_error_rate",
+                        "eval_score",
                     ],
                 },
                 thresholdValue: {
@@ -456,6 +466,11 @@ export const tools = [
                     description: "作成直後に有効化するか。 デフォルト true。",
                     default: true,
                 },
+                evalCriterionId: {
+                    type: "integer",
+                    description: "alertType=eval_score のとき必須。 監視する eval criterion の id (= list_eval_criteria.criteria[].id)。 直近 window の平均 score が thresholdValue 未満で発火する。",
+                    minimum: 1,
+                },
                 conditions: {
                     type: "object",
                     description: "v1.5 multi-condition alert (= 複合条件)。 指定すると alertType + thresholdValue + " +
@@ -2018,8 +2033,8 @@ export const tools = [
     },
     {
         name: "aggregate_calls",
-        description: "calls の 集計 cube を 取得 (= POST /v1/query/aggregate)。 groupBy (= provider / model / day / hour / minute / tag) × metric (= cost / latency / tokens / count / error_rate) で 1 call で AI agent が 「今月の cost を model 別 に集計」 narrative carry。 " +
-            "tag mode は tagKey 必須 (= alphanumeric + _ - のみ、 例: 'env' / 'feature')。 hour mode は 168h / minute mode は 60min まで (= 超過 400)。 cost = SUM(cost_usd) / latency = AVG(latency_ms) / tokens = SUM(total_tokens) / count = COUNT(*) / error_rate = error ÷ total。 " +
+        description: "calls の 集計 cube を 取得 (= POST /v1/query/aggregate)。 groupBy (= provider / model / day / hour / minute / tag / error) × metric (= cost / latency / tokens / input_tokens / output_tokens / count / error_rate) で 1 call で AI agent が 「今月の cost を model 別 に集計」 narrative carry。 " +
+            "tag mode は tagKey 必須 (= alphanumeric + _ - のみ、 例: 'env' / 'feature')。 error mode は エラー行のみを error 文字列で種類別集計 (= どのエラーが何件か。 metric=count 推奨)。 hour mode は 168h / minute mode は 60min まで (= 超過 400)。 cost = SUM(cost_usd) / latency = AVG(latency_ms) / tokens = SUM(total_tokens) / input_tokens = SUM(prompt_tokens) / output_tokens = SUM(completion_tokens) / count = COUNT(*) / error_rate = error ÷ total。 " +
             "返却 = { groups: [{key, value, count}], total: {value, count} } 形式。 軸 1 操作系 + 自律 AI ops の 分析 narrative の coverage 拡張。",
         inputSchema: {
             type: "object",
@@ -2035,14 +2050,14 @@ export const tools = [
                 },
                 groupBy: {
                     type: "string",
-                    description: "集約軸 (= 'provider' / 'model' / 'day' / 'hour' / 'minute' / 'tag')、 default = 'provider'。 hour は 168h / minute は 60min まで",
-                    enum: ["provider", "model", "day", "hour", "minute", "tag"],
+                    description: "集約軸 (= 'provider' / 'model' / 'day' / 'hour' / 'minute' / 'tag' / 'error')、 default = 'provider'。 hour は 168h / minute は 60min まで。 error はエラー行のみを種類別に集計",
+                    enum: ["provider", "model", "day", "hour", "minute", "tag", "error"],
                     default: "provider",
                 },
                 metric: {
                     type: "string",
-                    description: "metric 種別 (= 'cost' / 'latency' / 'tokens' / 'count' / 'error_rate')、 default = 'cost'",
-                    enum: ["cost", "latency", "tokens", "count", "error_rate"],
+                    description: "metric 種別 (= 'cost' / 'latency' / 'tokens' / 'input_tokens' / 'output_tokens' / 'count' / 'error_rate')、 default = 'cost'",
+                    enum: ["cost", "latency", "tokens", "input_tokens", "output_tokens", "count", "error_rate"],
                     default: "cost",
                 },
                 provider: {
@@ -2307,6 +2322,129 @@ export const tools = [
             },
         },
     },
+    {
+        name: "list_eval_datasets",
+        description: "自 account の golden dataset 一覧 (= GET /v1/eval-datasets)。 各 dataset は name / 説明 / item 件数 / frozen 状態を持つ。 golden dataset = 期待出力つきの固定テストセットで、 run_eval_dataset で対象モデルに通して回帰 A/B を測る母集団。",
+        inputSchema: {
+            type: "object",
+            additionalProperties: false,
+            properties: {},
+        },
+    },
+    {
+        name: "get_eval_dataset",
+        description: "指定 dataset の detail + items 全件を取得 (= GET /v1/eval-datasets/:id)。 datasetId は list_eval_datasets.datasets[].id。",
+        inputSchema: {
+            type: "object",
+            additionalProperties: false,
+            required: ["datasetId"],
+            properties: {
+                datasetId: {
+                    type: "integer",
+                    description: "対象 dataset の id (= list_eval_datasets.datasets[].id)",
+                    minimum: 1,
+                },
+            },
+        },
+    },
+    {
+        name: "create_eval_dataset",
+        description: "golden dataset を新規作成 (= POST /v1/eval-datasets、 Pro+ 限定)。 items に期待出力つきテストケースを最大 20 件渡せる。 dataset は account あたり最大 50 件。 frozen=true で母集団を凍結 (= 以後 item 改変・解凍不可、 回帰判定の比較可能性を固定)。",
+        inputSchema: {
+            type: "object",
+            additionalProperties: false,
+            required: ["name"],
+            properties: {
+                name: {
+                    type: "string",
+                    description: "dataset 名 (1-100 文字、 account 内で一意)",
+                    minLength: 1,
+                    maxLength: 100,
+                },
+                description: {
+                    type: "string",
+                    description: "任意の説明 (<= 500 文字)",
+                    maxLength: 500,
+                },
+                items: {
+                    type: "array",
+                    description: "テストケース (最大 20 件)。 各 inputText を対象モデルに入力し、 expectedOutput を judge の [REFERENCE ANSWER] として採点に使う。",
+                    maxItems: 20,
+                    items: {
+                        type: "object",
+                        additionalProperties: false,
+                        required: ["inputText"],
+                        properties: {
+                            inputText: {
+                                type: "string",
+                                description: "モデルへの入力 (1-4000 文字)",
+                                minLength: 1,
+                                maxLength: 4000,
+                            },
+                            expectedOutput: {
+                                type: "string",
+                                description: "期待する出力 (任意、 <= 4000 文字)。 judge に参照解として渡す。",
+                                maxLength: 4000,
+                            },
+                        },
+                    },
+                },
+                frozen: {
+                    type: "boolean",
+                    description: "true = 母集団凍結 (= 以後 item 改変・解凍不可)。 省略 = false。",
+                },
+            },
+        },
+    },
+    {
+        name: "run_eval_dataset",
+        description: "golden dataset を対象モデルで実行して回帰判定する (= POST /v1/eval-datasets/:id/run、 Pro+ 限定)。 各 item の inputText を targetModel に通し、 出力を既定 criteria + expectedOutput で gpt-4o-mini に採点させて eval_scores に記録する。 結果は compare_eval_runs で run 間比較できる。 実行記録は本番 cost/分析/アラート集計からは除外される。 cost: item 数 × criteria 数の LLM call。 OPENAI_API_KEY 未 provision 環境では 503。",
+        inputSchema: {
+            type: "object",
+            additionalProperties: false,
+            required: ["datasetId", "targetModel"],
+            properties: {
+                datasetId: {
+                    type: "integer",
+                    description: "実行する dataset の id (= list_eval_datasets.datasets[].id)",
+                    minimum: 1,
+                },
+                targetModel: {
+                    type: "string",
+                    description: "回帰を測りたい対象モデル (= 価格表に載っている OpenAI モデルのみ、 例 'gpt-4o-mini')。 未知モデルは 400。",
+                    minLength: 1,
+                    maxLength: 128,
+                },
+                judgeModel: {
+                    type: "string",
+                    description: "採点モデル (省略 = gpt-4o-mini)。 価格表に載っている OpenAI モデルのみ。",
+                    maxLength: 128,
+                },
+                idempotencyKey: {
+                    type: "string",
+                    description: "retry dedup 用の opaque key (= UUID 推奨、 200 char cap)。 同 key の再 POST は既存 run を返す (= 二重課金防止)。",
+                    minLength: 1,
+                    maxLength: 200,
+                },
+            },
+        },
+    },
+    {
+        name: "delete_eval_dataset",
+        description: "golden dataset を削除 (= DELETE /v1/eval-datasets/:id、 Pro+ 限定)。 items は連鎖削除される。 過去の eval run / score は残る。",
+        inputSchema: {
+            type: "object",
+            additionalProperties: false,
+            required: ["datasetId"],
+            properties: {
+                datasetId: {
+                    type: "integer",
+                    description: "削除する dataset の id",
+                    minimum: 1,
+                },
+            },
+        },
+    },
 ];
 export async function dispatchTool(input) {
     const { name, args, apiKey, apiBase } = input;
@@ -2588,6 +2726,43 @@ export async function dispatchTool(input) {
                 }
                 return await callApi(apiBase, `/v1/eval-criteria/${encodeURIComponent(criterionId)}`, {}, apiKey, { method: "DELETE" });
             }
+            case "list_eval_datasets": {
+                return await callApi(apiBase, "/v1/eval-datasets", {}, apiKey);
+            }
+            case "get_eval_dataset": {
+                const datasetId = validateAnnotationId(safeArgs["datasetId"]);
+                if (!datasetId) {
+                    return errorResponse("datasetId required (positive integer up to 10 digits)");
+                }
+                return await callApi(apiBase, `/v1/eval-datasets/${encodeURIComponent(datasetId)}`, {}, apiKey);
+            }
+            case "create_eval_dataset": {
+                if (typeof safeArgs["name"] !== "string") {
+                    return errorResponse("name required (string)");
+                }
+                return await callApi(apiBase, "/v1/eval-datasets", {}, apiKey, {
+                    method: "POST",
+                    jsonBody: safeArgs,
+                });
+            }
+            case "run_eval_dataset": {
+                const datasetId = validateAnnotationId(safeArgs["datasetId"]);
+                if (!datasetId) {
+                    return errorResponse("datasetId required (positive integer up to 10 digits)");
+                }
+                if (typeof safeArgs["targetModel"] !== "string") {
+                    return errorResponse("targetModel required (string)");
+                }
+                const { datasetId: _omitDsId, ...body } = safeArgs;
+                return await callApi(apiBase, `/v1/eval-datasets/${encodeURIComponent(datasetId)}/run`, {}, apiKey, { method: "POST", jsonBody: body });
+            }
+            case "delete_eval_dataset": {
+                const datasetId = validateAnnotationId(safeArgs["datasetId"]);
+                if (!datasetId) {
+                    return errorResponse("datasetId required (positive integer up to 10 digits)");
+                }
+                return await callApi(apiBase, `/v1/eval-datasets/${encodeURIComponent(datasetId)}`, {}, apiKey, { method: "DELETE" });
+            }
             case "test_webhook": {
                 if (typeof safeArgs["url"] !== "string") {
                     return errorResponse("url required (https://...)");