@einja/dev-cli 0.1.41 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +0 -1
  2. package/dist/cli.js +0 -1
  3. package/dist/cli.js.map +1 -1
  4. package/dist/commands/sync.d.ts.map +1 -1
  5. package/dist/commands/sync.js +1 -20
  6. package/dist/commands/sync.js.map +1 -1
  7. package/dist/commands/task-loop/lib/github-client.test.js.map +1 -1
  8. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js +2 -2
  9. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js.map +1 -1
  10. package/dist/lib/preset-update/file-copier.js +3 -3
  11. package/dist/lib/preset-update/file-copier.js.map +1 -1
  12. package/dist/lib/sync/file-filter.js +2 -2
  13. package/dist/lib/sync/file-filter.js.map +1 -1
  14. package/dist/lib/sync/file-filter.test.js +20 -0
  15. package/dist/lib/sync/file-filter.test.js.map +1 -1
  16. package/dist/lib/sync/marker-processor.js.map +1 -1
  17. package/dist/lib/sync/metadata-manager.js +1 -1
  18. package/dist/lib/sync/metadata-manager.js.map +1 -1
  19. package/dist/lib/sync/metadata-manager.test.js +3 -2
  20. package/dist/lib/sync/metadata-manager.test.js.map +1 -1
  21. package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -1
  22. package/dist/lib/sync/project-private-synchronizer.js +5 -1
  23. package/dist/lib/sync/project-private-synchronizer.js.map +1 -1
  24. package/dist/types/index.d.ts +0 -1
  25. package/dist/types/index.d.ts.map +1 -1
  26. package/package.json +1 -1
  27. package/presets/default/.claude/agents/einja/backend-architect.md +17 -1
  28. package/presets/default/.claude/agents/einja/codex-agent.md +1 -1
  29. package/presets/default/.claude/agents/einja/design-engineer.md +1 -1
  30. package/presets/default/.claude/agents/einja/docs/docs-updater.md +3 -93
  31. package/presets/default/.claude/agents/einja/frontend-architect.md +17 -1
  32. package/presets/default/.claude/agents/einja/frontend-coder.md +1 -1
  33. package/presets/default/.claude/agents/einja/{specs/spec-design-generator.md → issue-specs/design-generator.md} +12 -7
  34. package/presets/default/.claude/agents/einja/{specs/spec-qa-generator.md → issue-specs/qa-generator.md} +6 -4
  35. package/presets/default/.claude/agents/einja/{specs/spec-requirements-generator.md → issue-specs/requirements-generator.md} +5 -5
  36. package/presets/default/.claude/agents/einja/{specs/spec-tasks-generator.md → issue-specs/tasks-generator.md} +13 -14
  37. package/presets/default/.claude/agents/einja/{specs/spec-tasks-validator.md → issue-specs/tasks-validator.md} +9 -9
  38. package/presets/default/.claude/agents/einja/issue-specs/ui-design-generator.md +114 -0
  39. package/presets/default/.claude/agents/einja/task/task-executer.md +9 -3
  40. package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +2 -2
  41. package/presets/default/.claude/agents/einja/task/task-qa.md +3 -3
  42. package/presets/default/.claude/agents/einja/task/task-reviewer.md +13 -1
  43. package/presets/default/.claude/commands/einja/einja-sync.md +119 -44
  44. package/presets/default/.claude/commands/einja/issue-exec.md +29 -19
  45. package/presets/default/.claude/commands/einja/sync-cursor-commands.md +6 -6
  46. package/presets/default/.claude/commands/einja/{update-docs-by-task-specs.md → update-docs-by-issue-specs.md} +58 -58
  47. package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +5 -1
  48. package/presets/default/.claude/settings.json +14 -4
  49. package/presets/default/.claude/skills/{einja-general-context-loader → _einja-general-context-loader}/SKILL.md +2 -2
  50. package/presets/default/.claude/skills/{einja-output-format → _einja-output-format}/SKILL.md +1 -1
  51. package/presets/default/.claude/skills/_einja-project-overview/SKILL.md +29 -0
  52. package/presets/default/.claude/skills/{einja-spec-context-loader → _einja-spec-context-loader}/SKILL.md +5 -5
  53. package/presets/default/.claude/skills/einja-coding-standards/references/testing-strategy.md +899 -0
  54. package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +1 -1
  55. package/presets/default/.claude/skills/einja-create-pr/SKILL.md +138 -0
  56. package/presets/default/.claude/skills/einja-infra-maintenance/SKILL.md +779 -0
  57. package/presets/default/.claude/{commands/einja/spec-create.md → skills/einja-issue-spec-create/SKILL.md} +47 -24
  58. package/presets/default/.claude/skills/einja-issue-spec-generator/SKILL.md +105 -0
  59. package/presets/default/.claude/skills/einja-issue-spec-generator/references/format-rules.md +35 -0
  60. package/presets/default/.claude/skills/einja-issue-spec-validator/SKILL.md +130 -0
  61. package/presets/default/.claude/skills/einja-issue-spec-validator/references/validation-rules.md +52 -0
  62. package/presets/default/.claude/skills/einja-npm-release/SKILL.md +242 -0
  63. package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +68 -12
  64. package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +368 -121
  65. package/presets/default/.claude/skills/einja-skill-creator/scripts/compare_runs.py +154 -0
  66. package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +14 -7
  67. package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +2 -7
  68. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +263 -183
  69. package/presets/default/.claude/skills/einja-skill-first/SKILL.md +265 -0
  70. package/presets/default/.claude/skills/einja-subagent-question-protocol/SKILL.md +98 -0
  71. package/presets/default/.claude/skills/einja-task-commit/SKILL.md +7 -7
  72. package/presets/default/.claude/{commands/einja/task-exec.md → skills/einja-task-exec/SKILL.md} +3 -78
  73. package/presets/default/.claude/skills/einja-task-qa/SKILL.md +4 -4
  74. package/presets/default/.claude/skills/einja-task-qa/references/troubleshooting.md +1 -1
  75. package/presets/default/.claude/skills/einja-task-qa/references/usage-patterns.md +2 -2
  76. package/presets/default/.claude/skills/einja-team-exec/SKILL.md +165 -0
  77. package/presets/default/CLAUDE.md.template +21 -6
  78. package/presets/default/docs/einja/instructions/deployment-setup.md +1 -1
  79. package/presets/default/docs/einja/instructions/issue-exec-workflow.md +11 -11
  80. package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +1 -1
  81. package/presets/default/docs/einja/instructions/setup-flow.md +279 -0
  82. package/presets/default/docs/einja/instructions/task-execute.md +42 -42
  83. package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +1 -1
  84. package/presets/default/docs/einja/steering/branch-strategy.md +1 -1
  85. package/presets/default/docs/einja/steering/development-workflow.md +93 -25
  86. package/presets/default/docs/einja/steering/infrastructure/deployment.md +107 -0
  87. package/presets/default/docs/einja/steering/task-management.md +9 -13
  88. package/presets/default/scripts/ensure-serena.sh +2 -2
  89. package/presets/default/scripts/env-rotate-secrets.ts +66 -6
  90. package/presets/default/scripts/init-github.ts +363 -0
  91. package/presets/default/scripts/init.sh +11 -5
  92. package/presets/default/scripts/setup-dev.ts +16 -1
  93. package/dist/lib/sync/backup-manager.d.ts +0 -50
  94. package/dist/lib/sync/backup-manager.d.ts.map +0 -1
  95. package/dist/lib/sync/backup-manager.js +0 -117
  96. package/dist/lib/sync/backup-manager.js.map +0 -1
  97. package/dist/lib/sync/backup-manager.test.d.ts +0 -2
  98. package/dist/lib/sync/backup-manager.test.d.ts.map +0 -1
  99. package/dist/lib/sync/backup-manager.test.js +0 -155
  100. package/dist/lib/sync/backup-manager.test.js.map +0 -1
  101. package/presets/default/.claude/agents/einja/git/conflict-resolver.md +0 -152
  102. package/presets/default/.claude/hooks/einja/validate-git-commit.sh +0 -239
  103. package/presets/default/.claude/skills/einja-project-overview/SKILL.md +0 -39
@@ -3,14 +3,16 @@
3
3
 
4
4
  run_eval.pyとimprove_description.pyをループで組み合わせ、
5
5
  履歴を追跡し最良のdescriptionを返す。
6
- 過学習防止のためtrain/test分割に対応。
6
+ 過学習防止のためtrain/test分割(fraction指定)に対応。
7
7
  """
8
8
 
9
9
  import argparse
10
10
  import json
11
11
  import random
12
12
  import sys
13
+ import tempfile
13
14
  import time
15
+ import webbrowser
14
16
  from pathlib import Path
15
17
 
16
18
  try:
@@ -29,90 +31,87 @@ import anthropic
29
31
 
30
32
  def split_eval_set(
31
33
  eval_set: list[dict],
32
- holdout: int,
33
- seed: int | None = None,
34
+ holdout: float,
35
+ seed: int = 42,
34
36
  ) -> tuple[list[dict], list[dict]]:
35
- """評価セットをトレーニングとテストに分割する。
37
+ """評価セットをトレーニングとテストに分割する(fraction指定)。
36
38
 
39
+ holdoutは全体に対する割合(例: 0.4 = 40%)。
37
40
  holdoutが0の場合、全データをトレーニングに使用する。
38
41
  should_trigger=Trueとshould_trigger=Falseの両方から
39
- 均等にホールドアウトする。
42
+ 均等にホールドアウトする(stratified split)。
40
43
  """
41
44
  if holdout <= 0:
42
45
  return eval_set, []
43
46
 
44
47
  rng = random.Random(seed)
45
48
 
46
- positive = [item for item in eval_set if item.get("should_trigger", True)]
47
- negative = [item for item in eval_set if not item.get("should_trigger", True)]
49
+ # should_triggerで分離
50
+ trigger = [e for e in eval_set if e.get("should_trigger", True)]
51
+ no_trigger = [e for e in eval_set if not e.get("should_trigger", True)]
48
52
 
49
- # 正例と負例から均等にホールドアウト
50
- pos_holdout = holdout // 2
51
- neg_holdout = holdout - pos_holdout
53
+ # 各グループをシャッフル
54
+ rng.shuffle(trigger)
55
+ rng.shuffle(no_trigger)
52
56
 
53
- # 上限調整
54
- pos_holdout = min(pos_holdout, len(positive) - 1) if len(positive) > 1 else 0
55
- neg_holdout = min(neg_holdout, len(negative) - 1) if len(negative) > 1 else 0
57
+ # 分割点を割合で計算
58
+ n_trigger_test = max(1, int(len(trigger) * holdout))
59
+ n_no_trigger_test = max(1, int(len(no_trigger) * holdout))
56
60
 
57
- rng.shuffle(positive)
58
- rng.shuffle(negative)
59
-
60
- test_set = positive[:pos_holdout] + negative[:neg_holdout]
61
- train_set = positive[pos_holdout:] + negative[neg_holdout:]
61
+ # 分割
62
+ test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
63
+ train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
62
64
 
63
65
  return train_set, test_set
64
66
 
65
67
 
66
68
  def run_loop(
67
- eval_set_path: str,
68
- skill_path: str,
69
- max_iterations: int = 10,
70
- num_workers: int = 10,
71
- timeout: int = 30,
72
- runs_per_query: int = 3,
73
- trigger_threshold: float = 0.5,
74
- holdout: int = 0,
75
- seed: int | None = None,
76
- model: str | None = None,
77
- improve_model: str = "claude-sonnet-4-20250514",
78
- verbose: bool = False,
79
- report_path: str | None = None,
80
- log_dir: str | None = None,
69
+ eval_set: list[dict],
70
+ skill_path: Path,
71
+ description_override: str | None,
72
+ num_workers: int,
73
+ timeout: int,
74
+ max_iterations: int,
75
+ runs_per_query: int,
76
+ trigger_threshold: float,
77
+ holdout: float,
78
+ seed: int | None,
79
+ model: str | None,
80
+ improve_model: str,
81
+ verbose: bool,
82
+ live_report_path: Path | None = None,
83
+ log_dir: Path | None = None,
81
84
  ) -> dict:
82
85
  """評価+改善ループのメイン関数。"""
83
- eval_set = json.loads(Path(eval_set_path).read_text())
84
- skill_dir = Path(skill_path)
85
-
86
- if not (skill_dir / "SKILL.md").exists():
87
- print(f"エラー: {skill_dir} にSKILL.mdが見つかりません", file=sys.stderr)
88
- sys.exit(1)
89
-
90
- name, original_description, content = parse_skill_md(skill_dir)
91
86
  project_root = find_project_root()
87
+ name, original_description, content = parse_skill_md(skill_path)
88
+ current_description = description_override or original_description
92
89
 
93
- # train/test分割
94
- train_set, test_set = split_eval_set(eval_set, holdout, seed)
95
-
96
- if verbose:
97
- print(f"スキル: {name}", file=sys.stderr)
98
- print(f"トレーニングクエリ: {len(train_set)}, テストクエリ: {len(test_set)}", file=sys.stderr)
99
- print(f"最大イテレーション: {max_iterations}", file=sys.stderr)
100
- print(f"オリジナルdescription: {original_description}", file=sys.stderr)
90
+ # train/test分割(holdoutが0より大きい場合のみ)
91
+ if holdout > 0:
92
+ train_set, test_set = split_eval_set(eval_set, holdout, seed if seed is not None else 42)
93
+ if verbose:
94
+ print(f"分割: トレーニング {len(train_set)} 件, テスト {len(test_set)} 件 (holdout={holdout})", file=sys.stderr)
95
+ else:
96
+ train_set = eval_set
97
+ test_set = []
101
98
 
102
99
  client = anthropic.Anthropic()
103
- current_description = original_description
104
100
  history: list[dict] = []
105
- improve_history: list[dict] = []
106
- log_path = Path(log_dir) if log_dir else None
101
+ exit_reason = "unknown"
107
102
 
108
- for iteration in range(max_iterations):
103
+ for iteration in range(1, max_iterations + 1):
109
104
  if verbose:
110
- print(f"\n--- イテレーション {iteration} ---", file=sys.stderr)
111
- print(f"description: {current_description[:100]}...", file=sys.stderr)
112
-
113
- # トレーニング評価
114
- train_results = run_eval(
115
- eval_set=train_set,
105
+ print(f"\n{'='*60}", file=sys.stderr)
106
+ print(f"イテレーション {iteration}/{max_iterations}", file=sys.stderr)
107
+ print(f"description: {current_description}", file=sys.stderr)
108
+ print(f"{'='*60}", file=sys.stderr)
109
+
110
+ # train + test を一括で並行評価(効率化)
111
+ all_queries = train_set + test_set
112
+ t0 = time.time()
113
+ all_results = run_eval(
114
+ eval_set=all_queries,
116
115
  skill_name=name,
117
116
  description=current_description,
118
117
  num_workers=num_workers,
@@ -122,161 +121,227 @@ def run_loop(
122
121
  trigger_threshold=trigger_threshold,
123
122
  model=model,
124
123
  )
124
+ eval_elapsed = time.time() - t0
125
+
126
+ # クエリの一致でtrain/testに結果を振り分け
127
+ train_queries_set = {q["query"] for q in train_set}
128
+ train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set]
129
+ test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set]
130
+
131
+ train_passed = sum(1 for r in train_result_list if r["pass"])
132
+ train_total = len(train_result_list)
133
+ train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total}
134
+ train_results = {"results": train_result_list, "summary": train_summary}
125
135
 
126
- # テスト評価(テストセットがある場合)
127
- test_results = None
128
136
  if test_set:
129
- test_results = run_eval(
130
- eval_set=test_set,
131
- skill_name=name,
132
- description=current_description,
133
- num_workers=num_workers,
134
- timeout=timeout,
135
- project_root=project_root,
136
- runs_per_query=runs_per_query,
137
- trigger_threshold=trigger_threshold,
138
- model=model,
139
- )
140
-
141
- # 履歴エントリの構築
142
- entry: dict = {
137
+ test_passed = sum(1 for r in test_result_list if r["pass"])
138
+ test_total = len(test_result_list)
139
+ test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total}
140
+ test_results = {"results": test_result_list, "summary": test_summary}
141
+ else:
142
+ test_results = None
143
+ test_summary = None
144
+
145
+ # 履歴エントリの構築(レポートジェネレーターとの後方互換性を保持)
146
+ history.append({
147
+ "iteration": iteration,
143
148
  "description": current_description,
144
- "train_passed": train_results["summary"]["passed"],
145
- "train_failed": train_results["summary"]["failed"],
146
- "train_total": train_results["summary"]["total"],
149
+ "train_passed": train_summary["passed"],
150
+ "train_failed": train_summary["failed"],
151
+ "train_total": train_summary["total"],
147
152
  "train_results": train_results["results"],
148
- }
149
- if test_results:
150
- entry["test_passed"] = test_results["summary"]["passed"]
151
- entry["test_failed"] = test_results["summary"]["failed"]
152
- entry["test_total"] = test_results["summary"]["total"]
153
- entry["test_results"] = test_results["results"]
154
-
155
- history.append(entry)
153
+ "test_passed": test_summary["passed"] if test_summary else None,
154
+ "test_failed": test_summary["failed"] if test_summary else None,
155
+ "test_total": test_summary["total"] if test_summary else None,
156
+ "test_results": test_results["results"] if test_results else None,
157
+ # レポートジェネレーター後方互換
158
+ "passed": train_summary["passed"],
159
+ "failed": train_summary["failed"],
160
+ "total": train_summary["total"],
161
+ "results": train_results["results"],
162
+ })
163
+
164
+ # ライブレポートを更新(指定されている場合)
165
+ if live_report_path:
166
+ partial_output = {
167
+ "original_description": original_description,
168
+ "best_description": current_description,
169
+ "best_score": "in progress",
170
+ "iterations_run": len(history),
171
+ "holdout": holdout,
172
+ "train_size": len(train_set),
173
+ "test_size": len(test_set),
174
+ "history": history,
175
+ }
176
+ live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
177
+ if verbose:
178
+ print(f"レポートを更新しました: {live_report_path}", file=sys.stderr)
156
179
 
157
180
  if verbose:
158
- train_s = f"{train_results['summary']['passed']}/{train_results['summary']['total']}"
159
- msg = f"トレーニングスコア: {train_s}"
160
- if test_results:
161
- test_s = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
162
- msg += f", テストスコア: {test_s}"
163
- print(msg, file=sys.stderr)
164
-
165
- # レポート更新
166
- if report_path:
167
- output_data = {"history": history, "holdout": holdout}
168
- report_html = generate_html(output_data, auto_refresh=True, skill_name=name)
169
- Path(report_path).write_text(report_html)
181
+ def print_eval_stats(label: str, results: list[dict], elapsed: float) -> None:
182
+ pos = [r for r in results if r.get("should_trigger", True)]
183
+ neg = [r for r in results if not r.get("should_trigger", True)]
184
+ tp = sum(r["triggers"] for r in pos)
185
+ pos_runs = sum(r["runs"] for r in pos)
186
+ fn = pos_runs - tp
187
+ fp = sum(r["triggers"] for r in neg)
188
+ neg_runs = sum(r["runs"] for r in neg)
189
+ tn = neg_runs - fp
190
+ total = tp + tn + fp + fn
191
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
192
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
193
+ accuracy = (tp + tn) / total if total > 0 else 0.0
194
+ print(
195
+ f"{label}: {tp+tn}/{total} 正解, "
196
+ f"precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
197
+ file=sys.stderr,
198
+ )
199
+ for r in results:
200
+ status = "PASS" if r["pass"] else "FAIL"
201
+ rate_str = f"{r['triggers']}/{r['runs']}"
202
+ print(
203
+ f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}",
204
+ file=sys.stderr,
205
+ )
206
+
207
+ print_eval_stats("Train", train_results["results"], eval_elapsed)
208
+ if test_summary:
209
+ print_eval_stats("Test ", test_results["results"], 0) # type: ignore[index]
210
+
211
+ # train全パスなら終了(testは過学習モニタリング用のみ)
212
+ if train_summary["failed"] == 0:
213
+ exit_reason = f"all_passed (iteration {iteration})"
170
214
  if verbose:
171
- print(f"レポートを更新しました: {report_path}", file=sys.stderr)
172
-
173
- # 全パスなら終了
174
- if train_results["summary"]["failed"] == 0:
175
- if test_results is None or test_results["summary"]["failed"] == 0:
176
- if verbose:
177
- print("全クエリパス。ループを終了します。", file=sys.stderr)
178
- break
215
+ print(f"\nイテレーション {iteration} でtrain全クエリパス!ループを終了します。", file=sys.stderr)
216
+ break
179
217
 
180
- # 最終イテレーションでなければ改善
181
- if iteration < max_iterations - 1:
218
+ if iteration == max_iterations:
219
+ exit_reason = f"max_iterations ({max_iterations})"
182
220
  if verbose:
183
- print("descriptionを改善中...", file=sys.stderr)
184
-
185
- new_description = improve_description(
186
- client=client,
187
- skill_name=name,
188
- skill_content=content,
189
- current_description=current_description,
190
- eval_results=train_results,
191
- history=improve_history,
192
- model=improve_model,
193
- test_results=test_results,
194
- log_dir=log_path,
195
- iteration=iteration,
196
- )
197
-
198
- # 改善履歴を更新
199
- improve_entry: dict = {
200
- "description": current_description,
201
- "train_passed": train_results["summary"]["passed"],
202
- "train_total": train_results["summary"]["total"],
203
- "results": train_results["results"],
204
- }
205
- if test_results:
206
- improve_entry["test_passed"] = test_results["summary"]["passed"]
207
- improve_entry["test_total"] = test_results["summary"]["total"]
208
- improve_history.append(improve_entry)
221
+ print(f"\n最大イテレーション数到達 ({max_iterations})。", file=sys.stderr)
222
+ break
209
223
 
210
- current_description = new_description
224
+ # descriptionを改善(train結果のみ使用)
225
+ if verbose:
226
+ print(f"\ndescriptionを改善中...", file=sys.stderr)
227
+
228
+ t0 = time.time()
229
+ # 過学習防止のため、改善モデルにtest_スコアを見せないようにブラインド処理
230
+ blinded_history = [
231
+ {k: v for k, v in h.items() if not k.startswith("test_")}
232
+ for h in history
233
+ ]
234
+ new_description = improve_description(
235
+ client=client,
236
+ skill_name=name,
237
+ skill_content=content,
238
+ current_description=current_description,
239
+ eval_results=train_results,
240
+ history=blinded_history,
241
+ model=improve_model,
242
+ log_dir=log_dir,
243
+ iteration=iteration,
244
+ )
245
+ improve_elapsed = time.time() - t0
211
246
 
212
- if verbose:
213
- print(f"新しいdescription: {new_description[:100]}...", file=sys.stderr)
214
-
215
- # 最良のdescriptionを選択(テスト > トレーニングで優先)
216
- best_idx = 0
217
- best_test = -1
218
- best_train = -1
219
- for i, h in enumerate(history):
220
- t_passed = h.get("test_passed", -1)
221
- tr_passed = h.get("train_passed", h.get("passed", 0))
222
- if t_passed > best_test or (t_passed == best_test and tr_passed > best_train):
223
- best_test = t_passed
224
- best_train = tr_passed
225
- best_idx = i
226
-
227
- best = history[best_idx]
228
-
229
- # 最終レポート(auto_refreshオフ)
230
- if report_path:
231
- output_data = {"history": history, "holdout": holdout}
232
- report_html = generate_html(output_data, auto_refresh=False, skill_name=name)
233
- Path(report_path).write_text(report_html)
234
-
235
- output = {
236
- "skill_name": name,
247
+ if verbose:
248
+ print(f"新しいdescription ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
249
+
250
+ current_description = new_description
251
+
252
+ # 最良のdescriptionを選択(testセットあり→test優先、なし→train)
253
+ if test_set:
254
+ best = max(history, key=lambda h: h["test_passed"] or 0)
255
+ best_score = f"{best['test_passed']}/{best['test_total']}"
256
+ else:
257
+ best = max(history, key=lambda h: h["train_passed"])
258
+ best_score = f"{best['train_passed']}/{best['train_total']}"
259
+
260
+ if verbose:
261
+ print(f"\n終了理由: {exit_reason}", file=sys.stderr)
262
+ print(f"最良スコア: {best_score} (イテレーション {best['iteration']})", file=sys.stderr)
263
+
264
+ return {
265
+ "exit_reason": exit_reason,
237
266
  "original_description": original_description,
238
267
  "best_description": best["description"],
239
- "best_iteration": best_idx,
240
- "history": history,
268
+ "best_score": best_score,
269
+ "best_train_score": f"{best['train_passed']}/{best['train_total']}",
270
+ "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
271
+ "final_description": current_description,
272
+ "iterations_run": len(history),
241
273
  "holdout": holdout,
274
+ "train_size": len(train_set),
275
+ "test_size": len(test_set),
276
+ "history": history,
242
277
  }
243
278
 
244
- if verbose:
245
- print(f"\n最良のdescription (イテレーション {best_idx}): {best['description']}", file=sys.stderr)
246
- train_s = f"{best['train_passed']}/{best['train_total']}"
247
- msg = f"最良スコア - トレーニング: {train_s}"
248
- if best.get("test_passed") is not None:
249
- test_s = f"{best['test_passed']}/{best['test_total']}"
250
- msg += f", テスト: {test_s}"
251
- print(msg, file=sys.stderr)
252
-
253
- return output
254
279
 
255
-
256
- def main():
280
+ def main() -> None:
257
281
  parser = argparse.ArgumentParser(description="評価+改善ループを実行")
258
282
  parser.add_argument("--eval-set", required=True, help="評価セットJSONファイルへのパス")
259
283
  parser.add_argument("--skill-path", required=True, help="スキルディレクトリへのパス")
260
- parser.add_argument("--max-iterations", type=int, default=10, help="最大イテレーション数(デフォルト: 10)")
284
+ parser.add_argument("--description", default=None, help="開始descriptionを上書き")
261
285
  parser.add_argument("--num-workers", type=int, default=10, help="並行ワーカー数(デフォルト: 10)")
262
286
  parser.add_argument("--timeout", type=int, default=30, help="クエリごとのタイムアウト秒数(デフォルト: 30)")
287
+ parser.add_argument("--max-iterations", type=int, default=5, help="最大イテレーション数(デフォルト: 5)")
263
288
  parser.add_argument("--runs-per-query", type=int, default=3, help="クエリごとの実行回数(デフォルト: 3)")
264
289
  parser.add_argument("--trigger-threshold", type=float, default=0.5, help="トリガー率の閾値(デフォルト: 0.5)")
265
- parser.add_argument("--holdout", type=int, default=0, help="テスト用ホールドアウトクエリ数(デフォルト: 0)")
290
+ parser.add_argument("--holdout", type=float, default=0.4, help="テスト用ホールドアウト割合(0で無効、デフォルト: 0.4)")
266
291
  parser.add_argument("--seed", type=int, default=None, help="train/test分割のランダムシード")
267
292
  parser.add_argument("--model", default=None, help="評価時にclaude -pに使用するモデル")
268
293
  parser.add_argument("--improve-model", default="claude-sonnet-4-20250514", help="description改善に使用するモデル(デフォルト: claude-sonnet-4-20250514)")
269
294
  parser.add_argument("--verbose", action="store_true", help="進捗をstderrに出力")
270
- parser.add_argument("--report", default=None, help="HTMLレポートの出力先パス(ライブ更新あり)")
271
- parser.add_argument("--log-dir", default=None, help="改善トランスクリプトのログディレクトリ")
295
+ parser.add_argument("--report", default="auto", help="HTMLレポートの出力先パス('auto'で一時ファイル自動起動、'none'で無効)")
296
+ parser.add_argument("--results-dir", default=None, help="タイムスタンプ付きサブディレクトリに全出力(results.json, report.html, logs)を保存")
297
+ parser.add_argument("--log-dir", default=None, help="改善トランスクリプトのログディレクトリ(--results-dirより優先)")
272
298
  args = parser.parse_args()
273
299
 
300
+ eval_set = json.loads(Path(args.eval_set).read_text())
301
+ skill_path = Path(args.skill_path)
302
+
303
+ if not (skill_path / "SKILL.md").exists():
304
+ print(f"エラー: {skill_path} にSKILL.mdが見つかりません", file=sys.stderr)
305
+ sys.exit(1)
306
+
307
+ name, _, _ = parse_skill_md(skill_path)
308
+
309
+ # ライブレポートパスのセットアップ
310
+ if args.report != "none":
311
+ if args.report == "auto":
312
+ timestamp = time.strftime("%Y%m%d_%H%M%S")
313
+ live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
314
+ else:
315
+ live_report_path = Path(args.report)
316
+ # ブラウザで即座に開けるよう初期HTMLを書き込む
317
+ live_report_path.write_text("<html><body><h1>最適化ループを開始しています...</h1><meta http-equiv='refresh' content='5'></body></html>")
318
+ webbrowser.open(str(live_report_path))
319
+ else:
320
+ live_report_path = None
321
+
322
+ # 出力ディレクトリの決定(run_loop実行前に作成してlogsを保存可能にする)
323
+ if args.results_dir:
324
+ timestamp = time.strftime("%Y-%m-%d_%H%M%S")
325
+ results_dir = Path(args.results_dir) / timestamp
326
+ results_dir.mkdir(parents=True, exist_ok=True)
327
+ else:
328
+ results_dir = None
329
+
330
+ # --log-dir が明示指定されていればそちらを優先、なければ results_dir/logs
331
+ if args.log_dir:
332
+ log_dir: Path | None = Path(args.log_dir)
333
+ elif results_dir:
334
+ log_dir = results_dir / "logs"
335
+ else:
336
+ log_dir = None
337
+
274
338
  output = run_loop(
275
- eval_set_path=args.eval_set,
276
- skill_path=args.skill_path,
277
- max_iterations=args.max_iterations,
339
+ eval_set=eval_set,
340
+ skill_path=skill_path,
341
+ description_override=args.description,
278
342
  num_workers=args.num_workers,
279
343
  timeout=args.timeout,
344
+ max_iterations=args.max_iterations,
280
345
  runs_per_query=args.runs_per_query,
281
346
  trigger_threshold=args.trigger_threshold,
282
347
  holdout=args.holdout,
@@ -284,11 +349,26 @@ def main():
284
349
  model=args.model,
285
350
  improve_model=args.improve_model,
286
351
  verbose=args.verbose,
287
- report_path=args.report,
288
- log_dir=args.log_dir,
352
+ live_report_path=live_report_path,
353
+ log_dir=log_dir,
289
354
  )
290
355
 
291
- print(json.dumps(output, indent=2))
356
+ # JSON出力
357
+ json_output = json.dumps(output, indent=2)
358
+ print(json_output)
359
+ if results_dir:
360
+ (results_dir / "results.json").write_text(json_output)
361
+
362
+ # 最終HTMLレポートの書き込み(auto_refreshオフ)
363
+ if live_report_path:
364
+ live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
365
+ print(f"\nレポート: {live_report_path}", file=sys.stderr)
366
+
367
+ if results_dir and live_report_path:
368
+ (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))
369
+
370
+ if results_dir:
371
+ print(f"結果を保存しました: {results_dir}", file=sys.stderr)
292
372
 
293
373
 
294
374
  if __name__ == "__main__":