npm - @einja/dev-cli - Versions diffs - 0.1.40 → 0.1.44 - Mend

@einja/dev-cli 0.1.40 → 0.1.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (207) hide show

package/presets/default/.claude/skills/einja-skill-creator/scripts/run_eval.py ADDED Viewed

@@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+"""スキルdescriptionのトリガー評価を実行。
+スキルのdescriptionが一連のクエリに対してClaudeのスキル使用を
+トリガーするかどうかをテストする。結果をJSONで出力。
+"""
+import argparse
+import json
+import os
+import select
+import subprocess
+import sys
+import time
+import uuid
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+try:
+    from scripts.utils import parse_skill_md
+except ImportError:
+    from utils import parse_skill_md
+def find_project_root() -> Path:
+    """cwdから上方向にウォークし、.claude/を探してプロジェクトルートを見つける。
+    Claude Codeがプロジェクトルートを発見する方法を模倣し、
+    作成するコマンドファイルがclaude -pの検索対象に入るようにする。
+    """
+    current = Path.cwd()
+    for parent in [current, *current.parents]:
+        if (parent / ".claude").is_dir():
+            return parent
+    return current
+def run_single_query(
+    query: str,
+    skill_name: str,
+    skill_description: str,
+    timeout: int,
+    project_root: str,
+    model: str | None = None,
+) -> bool:
+    """単一のクエリを実行し、スキルがトリガーされたかどうかを返す。
+    .claude/commands/にコマンドファイルを作成してClaudeのavailable_skillsリストに
+    表示させ、`claude -p`で生のクエリを実行する。
+    --include-partial-messagesを使用してストリームイベント（content_block_start）から
+    早期にトリガーを検出する。
+    """
+    unique_id = uuid.uuid4().hex[:8]
+    clean_name = f"{skill_name}-skill-{unique_id}"
+    project_commands_dir = Path(project_root) / ".claude" / "commands"
+    command_file = project_commands_dir / f"{clean_name}.md"
+    try:
+        project_commands_dir.mkdir(parents=True, exist_ok=True)
+        # description内のクォートでの破損を避けるためYAMLブロックスカラーを使用
+        indented_desc = "\n  ".join(skill_description.split("\n"))
+        command_content = (
+            f"---\n"
+            f"description: |\n"
+            f"  {indented_desc}\n"
+            f"---\n\n"
+            f"# {skill_name}\n\n"
+            f"This skill handles: {skill_description}\n"
+        )
+        command_file.write_text(command_content)
+        cmd = [
+            "claude",
+            "-p", query,
+            "--output-format", "stream-json",
+            "--verbose",
+            "--include-partial-messages",
+        ]
+        if model:
+            cmd.extend(["--model", model])
+        # Claude Codeセッション内でclaude -pのネストを許可するためCLAUDECODE環境変数を除去
+        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+            cwd=project_root,
+            env=env,
+        )
+        triggered = False
+        start_time = time.time()
+        buffer = ""
+        # ストリームイベント検出用の状態追跡
+        pending_tool_name = None
+        accumulated_json = ""
+        try:
+            while time.time() - start_time < timeout:
+                if process.poll() is not None:
+                    remaining = process.stdout.read()
+                    if remaining:
+                        buffer += remaining.decode("utf-8", errors="replace")
+                    break
+                ready, _, _ = select.select([process.stdout], [], [], 1.0)
+                if not ready:
+                    continue
+                chunk = os.read(process.stdout.fileno(), 8192)
+                if not chunk:
+                    break
+                buffer += chunk.decode("utf-8", errors="replace")
+                while "\n" in buffer:
+                    line, buffer = buffer.split("\n", 1)
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        event = json.loads(line)
+                    except json.JSONDecodeError:
+                        continue
+                    # ストリームイベントによる早期検出
+                    if event.get("type") == "stream_event":
+                        se = event.get("event", {})
+                        se_type = se.get("type", "")
+                        if se_type == "content_block_start":
+                            cb = se.get("content_block", {})
+                            if cb.get("type") == "tool_use":
+                                tool_name = cb.get("name", "")
+                                if tool_name in ("Skill", "Read"):
+                                    pending_tool_name = tool_name
+                                    accumulated_json = ""
+                                else:
+                                    return False
+                        elif se_type == "content_block_delta" and pending_tool_name:
+                            delta = se.get("delta", {})
+                            if delta.get("type") == "input_json_delta":
+                                accumulated_json += delta.get("partial_json", "")
+                                if clean_name in accumulated_json:
+                                    return True
+                        elif se_type in ("content_block_stop", "message_stop"):
+                            if pending_tool_name:
+                                return clean_name in accumulated_json
+                            if se_type == "message_stop":
+                                return False
+                    # フォールバック: 完全なassistantメッセージ
+                    elif event.get("type") == "assistant":
+                        message = event.get("message", {})
+                        for content_item in message.get("content", []):
+                            if content_item.get("type") != "tool_use":
+                                continue
+                            tool_name = content_item.get("name", "")
+                            tool_input = content_item.get("input", {})
+                            if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
+                                triggered = True
+                            elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
+                                triggered = True
+                            return triggered
+                    elif event.get("type") == "result":
+                        return triggered
+        finally:
+            # 任意の終了パス（return、例外、タイムアウト）でプロセスをクリーンアップ
+            if process.poll() is None:
+                process.kill()
+                process.wait()
+        return triggered
+    finally:
+        if command_file.exists():
+            command_file.unlink()
+def run_eval(
+    eval_set: list[dict],
+    skill_name: str,
+    description: str,
+    num_workers: int,
+    timeout: int,
+    project_root: Path,
+    runs_per_query: int = 1,
+    trigger_threshold: float = 0.5,
+    model: str | None = None,
+) -> dict:
+    """評価セット全体を実行し、結果を返す。"""
+    results = []
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        future_to_info = {}
+        for item in eval_set:
+            for run_idx in range(runs_per_query):
+                future = executor.submit(
+                    run_single_query,
+                    item["query"],
+                    skill_name,
+                    description,
+                    timeout,
+                    str(project_root),
+                    model,
+                )
+                future_to_info[future] = (item, run_idx)
+        query_triggers: dict[str, list[bool]] = {}
+        query_items: dict[str, dict] = {}
+        for future in as_completed(future_to_info):
+            item, _ = future_to_info[future]
+            query = item["query"]
+            query_items[query] = item
+            if query not in query_triggers:
+                query_triggers[query] = []
+            try:
+                query_triggers[query].append(future.result())
+            except Exception as e:
+                print(f"警告: クエリが失敗しました: {e}", file=sys.stderr)
+                query_triggers[query].append(False)
+    for query, triggers in query_triggers.items():
+        item = query_items[query]
+        trigger_rate = sum(triggers) / len(triggers)
+        should_trigger = item["should_trigger"]
+        if should_trigger:
+            did_pass = trigger_rate >= trigger_threshold
+        else:
+            did_pass = trigger_rate < trigger_threshold
+        results.append({
+            "query": query,
+            "should_trigger": should_trigger,
+            "trigger_rate": trigger_rate,
+            "triggers": sum(triggers),
+            "runs": len(triggers),
+            "pass": did_pass,
+        })
+    passed = sum(1 for r in results if r["pass"])
+    total = len(results)
+    return {
+        "skill_name": skill_name,
+        "description": description,
+        "results": results,
+        "summary": {
+            "total": total,
+            "passed": passed,
+            "failed": total - passed,
+        },
+    }
+def main():
+    parser = argparse.ArgumentParser(description="スキルdescriptionのトリガー評価を実行")
+    parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
+    parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
+    parser.add_argument("--description", default=None, help="テスト用descriptionの上書き")
+    parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数")
+    parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト（秒）")
+    parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数")
+    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値")
+    parser.add_argument("--model", default=None, help="claude -pに使用するモデル（デフォルト: ユーザー設定のモデル）")
+    parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
+    args = parser.parse_args()
+    eval_set = json.loads(Path(args.eval_set).read_text())
+    skill_path = Path(args.skill_path)
+    if not (skill_path / "SKILL.md").exists():
+        print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
+        sys.exit(1)
+    name, original_description, content = parse_skill_md(skill_path)
+    description = args.description or original_description
+    project_root = find_project_root()
+    if args.verbose:
+        print(f"評価中: {description}", file=sys.stderr)
+    output = run_eval(
+        eval_set=eval_set,
+        skill_name=name,
+        description=description,
+        num_workers=args.num_workers,
+        timeout=args.timeout,
+        project_root=project_root,
+        runs_per_query=args.runs_per_query,
+        trigger_threshold=args.trigger_threshold,
+        model=args.model,
+    )
+    if args.verbose:
+        summary = output["summary"]
+        print(f"結果: {summary['passed']}/{summary['total']} パス", file=sys.stderr)
+        for r in output["results"]:
+            status = "PASS" if r["pass"] else "FAIL"
+            rate_str = f"{r['triggers']}/{r['runs']}"
+            print(f"  [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr)
+    print(json.dumps(output, indent=2))
+if __name__ == "__main__":
+    main()

package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py ADDED Viewed

@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+"""評価＋改善ループを実行。全パスまたは最大イテレーション到達まで繰り返す。
+run_eval.pyとimprove_description.pyをループで組み合わせ、
+履歴を追跡し最良のdescriptionを返す。
+過学習防止のためtrain/test分割（fraction指定）に対応。
+"""
+import argparse
+import json
+import random
+import sys
+import tempfile
+import time
+import webbrowser
+from pathlib import Path
+try:
+    from scripts.generate_report import generate_html
+    from scripts.improve_description import improve_description
+    from scripts.run_eval import find_project_root, run_eval
+    from scripts.utils import parse_skill_md
+except ImportError:
+    from generate_report import generate_html
+    from improve_description import improve_description
+    from run_eval import find_project_root, run_eval
+    from utils import parse_skill_md
+import anthropic
+def split_eval_set(
+    eval_set: list[dict],
+    holdout: float,
+    seed: int = 42,
+) -> tuple[list[dict], list[dict]]:
+    """評価セットをトレーニングとテストに分割する（fraction指定）。
+    holdoutは全体に対する割合（例: 0.4 = 40%）。
+    holdoutが0の場合、全データをトレーニングに使用する。
+    should_trigger=Trueとshould_trigger=Falseの両方から
+    均等にホールドアウトする（stratified split）。
+    """
+    if holdout <= 0:
+        return eval_set, []
+    rng = random.Random(seed)
+    # should_triggerで分離
+    trigger = [e for e in eval_set if e.get("should_trigger", True)]
+    no_trigger = [e for e in eval_set if not e.get("should_trigger", True)]
+    # 各グループをシャッフル
+    rng.shuffle(trigger)
+    rng.shuffle(no_trigger)
+    # 分割点を割合で計算
+    n_trigger_test = max(1, int(len(trigger) * holdout))
+    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))
+    # 分割
+    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
+    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
+    return train_set, test_set
+def run_loop(
+    eval_set: list[dict],
+    skill_path: Path,
+    description_override: str | None,
+    num_workers: int,
+    timeout: int,
+    max_iterations: int,
+    runs_per_query: int,
+    trigger_threshold: float,
+    holdout: float,
+    seed: int | None,
+    model: str | None,
+    improve_model: str,
+    verbose: bool,
+    live_report_path: Path | None = None,
+    log_dir: Path | None = None,
+) -> dict:
+    """Run the evaluate-then-improve loop and return the best description found.
+    Each iteration evaluates the current description on train and test queries in
+    one `run_eval` call, records the scores in `history`, and, unless every train
+    query passed or `max_iterations` was reached, asks `improve_description` for
+    a new description based on the train results only.
+    Args:
+        eval_set: Evaluation items; split into train/test when `holdout` > 0.
+        skill_path: Skill directory, parsed with `parse_skill_md`.
+        description_override: Starting description; falls back to the parsed one.
+        num_workers, timeout, runs_per_query, trigger_threshold, model:
+            Forwarded to `run_eval`.
+        max_iterations: Upper bound on the number of evaluation rounds.
+        holdout: Hold-out fraction for the test set; 0 or less disables the split.
+        seed: Seed for the split; 42 is used when None.
+        improve_model: Model name forwarded to `improve_description`.
+        verbose: Write progress and per-query statistics to stderr.
+        live_report_path: If set, an auto-refreshing HTML report is rewritten
+            there after every iteration.
+        log_dir: Forwarded to `improve_description`.
+    Returns:
+        A dict with the exit reason, original/best/final descriptions, the best
+        scores, split sizes, and the full per-iteration history.
+    """
+    project_root = find_project_root()
+    name, original_description, content = parse_skill_md(skill_path)
+    current_description = description_override or original_description
+    # Train/test split (only when holdout is greater than 0)
+    if holdout > 0:
+        train_set, test_set = split_eval_set(eval_set, holdout, seed if seed is not None else 42)
+        if verbose:
+            print(f"分割: トレーニング {len(train_set)} 件, テスト {len(test_set)} 件 (holdout={holdout})", file=sys.stderr)
+    else:
+        train_set = eval_set
+        test_set = []
+    client = anthropic.Anthropic()
+    history: list[dict] = []
+    exit_reason = "unknown"
+    for iteration in range(1, max_iterations + 1):
+        if verbose:
+            print(f"\n{'='*60}", file=sys.stderr)
+            print(f"イテレーション {iteration}/{max_iterations}", file=sys.stderr)
+            print(f"description: {current_description}", file=sys.stderr)
+            print(f"{'='*60}", file=sys.stderr)
+        # Evaluate train + test together in one parallel batch (for efficiency)
+        all_queries = train_set + test_set
+        t0 = time.time()
+        all_results = run_eval(
+            eval_set=all_queries,
+            skill_name=name,
+            description=current_description,
+            num_workers=num_workers,
+            timeout=timeout,
+            project_root=project_root,
+            runs_per_query=runs_per_query,
+            trigger_threshold=trigger_threshold,
+            model=model,
+        )
+        eval_elapsed = time.time() - t0
+        # Route each result to train or test by matching its query text.
+        # Anything whose query is not in the train set is treated as a test result.
+        train_queries_set = {q["query"] for q in train_set}
+        train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
+        test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
+        train_passed = sum(1 for r in train_result_list if r["pass"])
+        train_total = len(train_result_list)
+        train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
+        train_results = {"results": train_result_list, "summary": train_summary}
+        if test_set:
+            test_passed = sum(1 for r in test_result_list if r["pass"])
+            test_total = len(test_result_list)
+            test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
+            test_results = {"results": test_result_list, "summary": test_summary}
+        else:
+            test_results = None
+            test_summary = None
+        # Build the history entry (keeps backward compatibility with the report generator)
+        history.append({
+            "iteration": iteration,
+            "description": current_description,
+            "train_passed": train_summary["passed"],
+            "train_failed": train_summary["failed"],
+            "train_total": train_summary["total"],
+            "train_results": train_results["results"],
+            "test_passed": test_summary["passed"] if test_summary else None,
+            "test_failed": test_summary["failed"] if test_summary else None,
+            "test_total": test_summary["total"] if test_summary else None,
+            "test_results": test_results["results"] if test_results else None,
+            # Un-prefixed duplicates of the train values, kept for the report generator
+            "passed": train_summary["passed"],
+            "failed": train_summary["failed"],
+            "total": train_summary["total"],
+            "results": train_results["results"],
+        })
+        # Refresh the live report (when a path was given)
+        if live_report_path:
+            partial_output = {
+                "original_description": original_description,
+                "best_description": current_description,
+                "best_score": "in progress",
+                "iterations_run": len(history),
+                "holdout": holdout,
+                "train_size": len(train_set),
+                "test_size": len(test_set),
+                "history": history,
+            }
+            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
+            if verbose:
+                print(f"レポートを更新しました: {live_report_path}", file=sys.stderr)
+        if verbose:
+            # Prints run-level precision/recall/accuracy, then one line per query.
+            # Counts are per run, not per query: a query run 3 times contributes 3.
+            def print_eval_stats(label: str, results: list[dict], elapsed: float) -> None:
+                pos = [r for r in results if r.get("should_trigger", True)]
+                neg = [r for r in results if not r.get("should_trigger", True)]
+                tp = sum(r["triggers"] for r in pos)
+                pos_runs = sum(r["runs"] for r in pos)
+                fn = pos_runs - tp
+                fp = sum(r["triggers"] for r in neg)
+                neg_runs = sum(r["runs"] for r in neg)
+                tn = neg_runs - fp
+                total = tp + tn + fp + fn
+                # With an empty denominator, precision and recall are reported as 100%.
+                precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
+                recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
+                accuracy = (tp + tn) / total if total > 0 else 0.0
+                print(
+                    f"{label}: {tp+tn}/{total} 正解, "
+                    f"precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
+                    file=sys.stderr,
+                )
+                for r in results:
+                    status = "PASS" if r["pass"] else "FAIL"
+                    rate_str = f"{r['triggers']}/{r['runs']}"
+                    print(
+                        f"  [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}",
+                        file=sys.stderr,
+                    )
+            print_eval_stats("Train", train_results["results"], eval_elapsed)
+            if test_summary:
+                # Train and test were timed together above, so 0 is passed as the test time.
+                print_eval_stats("Test ", test_results["results"], 0)  # type: ignore[index]
+        # Stop once every train query passes (test is only used to monitor overfitting).
+        # NOTE(review): an empty train set also satisfies this check on the first iteration.
+        if train_summary["failed"] == 0:
+            exit_reason = f"all_passed (iteration {iteration})"
+            if verbose:
+                print(f"\nイテレーション {iteration} でtrain全クエリパス！ループを終了します。", file=sys.stderr)
+            break
+        if iteration == max_iterations:
+            exit_reason = f"max_iterations ({max_iterations})"
+            if verbose:
+                print(f"\n最大イテレーション数到達 ({max_iterations})。", file=sys.stderr)
+            break
+        # Improve the description (using train results only)
+        if verbose:
+            print(f"\ndescriptionを改善中...", file=sys.stderr)
+        t0 = time.time()
+        # To prevent overfitting, strip every "test_" key so the improving model never sees test scores
+        blinded_history = [
+            {k: v for k, v in h.items() if not k.startswith("test_")}
+            for h in history
+        ]
+        new_description = improve_description(
+            client=client,
+            skill_name=name,
+            skill_content=content,
+            current_description=current_description,
+            eval_results=train_results,
+            history=blinded_history,
+            model=improve_model,
+            log_dir=log_dir,
+            iteration=iteration,
+        )
+        improve_elapsed = time.time() - t0
+        if verbose:
+            print(f"新しいdescription ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
+        current_description = new_description
+    # Pick the best description (by test score when a test set exists, otherwise by train score).
+    # max() returns the first maximal entry, so ties go to the earliest iteration.
+    # NOTE(review): history is empty when max_iterations < 1, in which case max() raises ValueError.
+    if test_set:
+        best = max(history, key=lambda h: h["test_passed"] or 0)
+        best_score = f"{best['test_passed']}/{best['test_total']}"
+    else:
+        best = max(history, key=lambda h: h["train_passed"])
+        best_score = f"{best['train_passed']}/{best['train_total']}"
+    if verbose:
+        print(f"\n終了理由: {exit_reason}", file=sys.stderr)
+        print(f"最良スコア: {best_score} (イテレーション {best['iteration']})", file=sys.stderr)
+    return {
+        "exit_reason": exit_reason,
+        "original_description": original_description,
+        "best_description": best["description"],
+        "best_score": best_score,
+        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
+        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
+        "final_description": current_description,
+        "iterations_run": len(history),
+        "holdout": holdout,
+        "train_size": len(train_set),
+        "test_size": len(test_set),
+        "history": history,
+    }
+def main() -> None:
+    parser = argparse.ArgumentParser(description="評価＋改善ループを実行")
+    parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
+    parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
+    parser.add_argument("--description", default=None, help="開始descriptionを上書き")
+    parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数（デフォルト: 10）")
+    parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト秒数（デフォルト: 30）")
+    parser.add_argument("--max-iterations", type=int, default=5, help="最大イテレーション数（デフォルト: 5）")
+    parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数（デフォルト: 3）")
+    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値（デフォルト: 0.5）")
+    parser.add_argument("--holdout", type=float, default=0.4, help="テスト用ホールドアウト割合（0で無効、デフォルト: 0.4）")
+    parser.add_argument("--seed", type=int, default=None, help="train/test分割のランダムシード")
+    parser.add_argument("--model", default=None, help="評価時にclaude -pに使用するモデル")
+    parser.add_argument("--improve-model", default="claude-sonnet-4-20250514", help="description改善に使用するモデル（デフォルト: claude-sonnet-4-20250514）")
+    parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
+    parser.add_argument("--report", default="auto", help="HTMLレポートの出力先パス（'auto'で一時ファイル自動起動、'none'で無効）")
+    parser.add_argument("--results-dir", default=None, help="タイムスタンプ付きサブディレクトリに全出力（results.json, report.html, logs）を保存")
+    parser.add_argument("--log-dir", default=None, help="改善トランスクリプトのログディレクトリ（--results-dirより優先）")
+    args = parser.parse_args()
+    eval_set = json.loads(Path(args.eval_set).read_text())
+    skill_path = Path(args.skill_path)
+    if not (skill_path / "SKILL.md").exists():
+        print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
+        sys.exit(1)
+    name, _, _ = parse_skill_md(skill_path)
+    # ライブレポートパスのセットアップ
+    if args.report != "none":
+        if args.report == "auto":
+            timestamp = time.strftime("%Y%m%d_%H%M%S")
+            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
+        else:
+            live_report_path = Path(args.report)
+        # ブラウザで即座に開けるよう初期HTMLを書き込む
+        live_report_path.write_text("<html><body><h1>最適化ループを開始しています...</h1><meta http-equiv='refresh' content='5'></body></html>")
+        webbrowser.open(str(live_report_path))
+    else:
+        live_report_path = None
+    # 出力ディレクトリの決定（run_loop実行前に作成してlogsを保存可能にする）
+    if args.results_dir:
+        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
+        results_dir = Path(args.results_dir) / timestamp
+        results_dir.mkdir(parents=True, exist_ok=True)
+    else:
+        results_dir = None
+    # --log-dir が明示指定されていればそちらを優先、なければ results_dir/logs
+    if args.log_dir:
+        log_dir: Path | None = Path(args.log_dir)
+    elif results_dir:
+        log_dir = results_dir / "logs"
+    else:
+        log_dir = None
+    output = run_loop(
+        eval_set=eval_set,
+        skill_path=skill_path,
+        description_override=args.description,
+        num_workers=args.num_workers,
+        timeout=args.timeout,
+        max_iterations=args.max_iterations,
+        runs_per_query=args.runs_per_query,
+        trigger_threshold=args.trigger_threshold,
+        holdout=args.holdout,
+        seed=args.seed,
+        model=args.model,
+        improve_model=args.improve_model,
+        verbose=args.verbose,
+        live_report_path=live_report_path,
+        log_dir=log_dir,
+    )
+    # JSON出力
+    json_output = json.dumps(output, indent=2)
+    print(json_output)
+    if results_dir:
+        (results_dir / "results.json").write_text(json_output)
+    # 最終HTMLレポートの書き込み（auto_refreshオフ）
+    if live_report_path:
+        live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
+        print(f"\nレポート: {live_report_path}", file=sys.stderr)
+    if results_dir and live_report_path:
+        (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))
+    if results_dir:
+        print(f"結果を保存しました: {results_dir}", file=sys.stderr)
+if __name__ == "__main__":
+    main()