npm - @nahisaho/satori - Versions diffs - 0.1.0 - Mend

@nahisaho/satori 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

package/src/.github/skills/scientific-pca-tsne/SKILL.md ADDED Viewed

@@ -0,0 +1,235 @@
+---
+name: scientific-pca-tsne
+description: |
+  PCA・t-SNE・UMAP による次元削減と空間マッピングのスキル。化学空間・特徴量空間・
+  多技法融合空間の可視化を行う際に使用。Scientific Skills Exp-02, 03, 05, 07, 11, 13
+  で汎用的に使用されたパターン。
+---
+# Scientific Dimensionality Reduction & Space Mapping
+高次元データを 2D/3D 空間に射影して構造を可視化するスキル。
+PCA（線形）、t-SNE（非線形、局所構造保存）、UMAP（非線形、大域+局所）の
+3 手法を使い分ける。
+## When to Use
+- 高次元データの構造を 2D で可視化したいとき
+- 材料・化合物・サンプルのクラスター構造を発見したいとき
+- 多技法の測定データを統合したいとき
+- 主成分の寄与率や負荷量を解釈したいとき
+## Quick Start
+## 手法選択ガイド
+| 手法 | 特徴 | 推奨場面 |
+|---|---|---|
+| PCA | 線形・解釈容易・寄与率あり | 最初の概観、PC 負荷量の解釈 |
+| t-SNE | 非線形・局所構造保存 | クラスター分離の可視化 |
+| UMAP | 非線形・大域+局所 | 大規模データ、連続的な勾配の可視化 |
+## 標準パイプライン
+### 1. PCA + スクリープロット
+```python
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+def pca_analysis(df, feature_cols, n_components=None, figsize=(14, 5)):
+    """PCA を実行し、スクリープロットと PC1-PC2 散布図を描画する。"""
+    scaler = StandardScaler()
+    X_sc = scaler.fit_transform(df[feature_cols])
+    if n_components is None:
+        n_components = min(len(feature_cols), 10)
+    pca = PCA(n_components=n_components)
+    pcs = pca.fit_transform(X_sc)
+    # スクリープロット
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
+    cumvar = np.cumsum(pca.explained_variance_ratio_) * 100
+    ax1.bar(range(1, n_components + 1), pca.explained_variance_ratio_ * 100,
+            color="steelblue", edgecolor="black")
+    ax1.plot(range(1, n_components + 1), cumvar, "ro-", linewidth=2)
+    ax1.set_xlabel("Principal Component")
+    ax1.set_ylabel("Explained Variance (%)")
+    ax1.set_title("Scree Plot", fontweight="bold")
+    ax1.axhline(y=80, color="gray", linestyle="--", alpha=0.5)
+    # PC1 vs PC2
+    ax2.scatter(pcs[:, 0], pcs[:, 1], alpha=0.6, s=30, edgecolors="k", linewidth=0.3)
+    ax2.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)")
+    ax2.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)")
+    ax2.set_title("PCA Projection", fontweight="bold")
+    plt.tight_layout()
+    plt.savefig("figures/pca_screeplot.png", dpi=300, bbox_inches="tight")
+    plt.close()
+    return pca, pcs
+def pca_loading_plot(pca, feature_names, figsize=(10, 8)):
+    """PC1-PC2 の負荷量（loading）バイプロットを描画する。"""
+    loadings = pca.components_[:2].T
+    fig, ax = plt.subplots(figsize=figsize)
+    for i, name in enumerate(feature_names):
+        ax.arrow(0, 0, loadings[i, 0], loadings[i, 1],
+                head_width=0.02, head_length=0.01, fc="red", ec="red", alpha=0.7)
+        ax.text(loadings[i, 0] * 1.1, loadings[i, 1] * 1.1, name,
+               fontsize=8, ha="center")
+    ax.set_xlabel(f"PC1 Loading")
+    ax.set_ylabel(f"PC2 Loading")
+    ax.set_title("PCA Loading Plot (Biplot)", fontweight="bold")
+    ax.axhline(0, color="gray", linewidth=0.5)
+    ax.axvline(0, color="gray", linewidth=0.5)
+    plt.tight_layout()
+    plt.savefig("figures/pca_loadings.png", dpi=300, bbox_inches="tight")
+    plt.close()
+```
+### 2. t-SNE 可視化
+```python
+from sklearn.manifold import TSNE
+def tsne_visualization(X_scaled, labels, label_name="Group",
+                       perplexity=30, random_state=42, figsize=(8, 8)):
+    """t-SNE 2D 射影を描画する。"""
+    tsne = TSNE(n_components=2, perplexity=perplexity,
+                random_state=random_state, n_iter=1000)
+    coords = tsne.fit_transform(X_scaled)
+    fig, ax = plt.subplots(figsize=figsize)
+    for label in sorted(set(labels)):
+        mask = labels == label
+        ax.scatter(coords[mask, 0], coords[mask, 1],
+                  label=label, alpha=0.7, s=40, edgecolors="k", linewidth=0.3)
+    ax.set_xlabel("t-SNE 1")
+    ax.set_ylabel("t-SNE 2")
+    ax.set_title("t-SNE Projection", fontweight="bold")
+    ax.legend(title=label_name, bbox_to_anchor=(1.05, 1))
+    plt.tight_layout()
+    plt.savefig("figures/tsne_projection.png", dpi=300, bbox_inches="tight")
+    plt.close()
+    return coords
+```
+### 3. PCA + t-SNE 並列パネル（Exp-11, 13 パターン）
+```python
+def pca_tsne_panel(df, feature_cols, hue_col, figsize=(16, 7)):
+    """PCA と t-SNE を横並びで描画する。"""
+    scaler = StandardScaler()
+    X_sc = scaler.fit_transform(df[feature_cols])
+    pca = PCA(n_components=2)
+    pcs = pca.fit_transform(X_sc)
+    tsne = TSNE(n_components=2, perplexity=30, random_state=42)
+    tsne_coords = tsne.fit_transform(X_sc)
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
+    labels = df[hue_col].values
+    for label in sorted(set(labels)):
+        mask = labels == label
+        ax1.scatter(pcs[mask, 0], pcs[mask, 1], label=label,
+                   alpha=0.7, s=40, edgecolors="k", linewidth=0.3)
+        ax2.scatter(tsne_coords[mask, 0], tsne_coords[mask, 1], label=label,
+                   alpha=0.7, s=40, edgecolors="k", linewidth=0.3)
+    ev = pca.explained_variance_ratio_ * 100
+    ax1.set_xlabel(f"PC1 ({ev[0]:.1f}%)")
+    ax1.set_ylabel(f"PC2 ({ev[1]:.1f}%)")
+    ax1.set_title("(A) PCA", fontweight="bold")
+    ax1.legend(title=hue_col)
+    ax2.set_xlabel("t-SNE 1")
+    ax2.set_ylabel("t-SNE 2")
+    ax2.set_title("(B) t-SNE", fontweight="bold")
+    ax2.legend(title=hue_col)
+    plt.tight_layout()
+    plt.savefig("figures/pca_tsne_panel.png", dpi=300, bbox_inches="tight")
+    plt.close()
+    # 座標を保存
+    coords_df = pd.DataFrame({
+        "PC1": pcs[:, 0], "PC2": pcs[:, 1],
+        "tSNE1": tsne_coords[:, 0], "tSNE2": tsne_coords[:, 1],
+        hue_col: labels,
+    })
+    coords_df.to_csv("results/pca_tsne_coordinates.csv", index=False)
+    return pca, tsne_coords
+```
+### 4. 階層的クラスタリング + Silhouette 最適 k
+```python
+from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
+from sklearn.metrics import silhouette_score
+def hierarchical_clustering(X_scaled, method="ward", max_k=10, figsize=(12, 5)):
+    """階層的クラスタリングと最適クラスター数の決定。"""
+    Z = linkage(X_scaled, method=method)
+    # Silhouette スコアで最適 k を探索
+    sil_scores = []
+    for k in range(2, max_k + 1):
+        labels = fcluster(Z, k, criterion="maxclust")
+        sil = silhouette_score(X_scaled, labels)
+        sil_scores.append({"k": k, "silhouette": sil})
+    sil_df = pd.DataFrame(sil_scores)
+    best_k = sil_df.loc[sil_df["silhouette"].idxmax(), "k"]
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
+    # デンドログラム
+    dendrogram(Z, ax=ax1, truncate_mode="lastp", p=30, leaf_rotation=90)
+    ax1.set_title("Dendrogram", fontweight="bold")
+    ax1.set_ylabel("Distance")
+    # Silhouette スコア
+    ax2.plot(sil_df["k"], sil_df["silhouette"], "bo-", linewidth=2)
+    ax2.axvline(best_k, color="red", linestyle="--", label=f"Best k={int(best_k)}")
+    ax2.set_xlabel("Number of Clusters (k)")
+    ax2.set_ylabel("Silhouette Score")
+    ax2.set_title("Optimal k (Silhouette)", fontweight="bold")
+    ax2.legend()
+    plt.tight_layout()
+    plt.savefig("figures/clustering_analysis.png", dpi=300, bbox_inches="tight")
+    plt.close()
+    return Z, int(best_k)
+```
+## References
+### Output Files
+| ファイル | 形式 |
+|---|---|
+| `results/pca_tsne_coordinates.csv` | CSV |
+| `figures/pca_screeplot.png` | PNG |
+| `figures/pca_loadings.png` | PNG |
+| `figures/tsne_projection.png` | PNG |
+| `figures/pca_tsne_panel.png` | PNG |
+| `figures/clustering_analysis.png` | PNG |
+#### 参照実験
+- **Exp-02**: 化学空間 PCA/t-SNE（EGFR 阻害剤）
+- **Exp-03**: PCA / UMAP / t-SNE 3 手法比較
+- **Exp-07**: PLS-DA + PCA メタボロミクス
+- **Exp-11**: ラマンスペクトルの PCA/t-SNE + 階層的クラスタリング
+- **Exp-13**: 多技法融合 PCA/t-SNE（XRD+AFM+電気+光学）

package/src/.github/skills/scientific-pipeline-scaffold/SKILL.md ADDED Viewed

@@ -0,0 +1,331 @@
+---
+name: scientific-pipeline-scaffold
+description: |
+  科学データ解析パイプラインの基盤スキル。ディレクトリ構造の自動構築、再現性のためのシード管理、
+  進捗ログ出力、実行時間計測、JSON サマリー生成、ダッシュボード総括図の作成を行う際に使用。
+  全 13 実験に共通する足場パターンを統合。
+---
+# Scientific Pipeline Scaffold
+全 Exp-01〜13 に共通する「足場（scaffold）」パターンを統合したスキル。
+新しい実験パイプラインを立ち上げる際に最初に適用し、再現性・可読性・構造を保証する。
+## When to Use
+- 新しい解析実験のスクリプトを立ち上げるとき
+- 再現可能な実験パイプラインを構築するとき
+- 解析結果を JSON サマリーとしてエクスポートしたいとき
+- 総括ダッシュボード図を自動生成したいとき
+## Quick Start
+## 1. スクリプトヘッダーテンプレート
+```python
+#!/usr/bin/env python3
+"""
+Exp-XX: [実験タイトル]
+Scientific Skills Series
+Description:
+    [1-2行の概要]
+Author: [名前]
+Date: [YYYY-MM-DD]
+"""
+import warnings
+warnings.filterwarnings("ignore")
+import matplotlib
+matplotlib.use("Agg")  # headless 環境対応
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pathlib import Path
+import json
+import time
+# === Reproducibility settings ===
+# Single seed shared by the script; here it seeds NumPy's global RNG.
+SEED = 42
+np.random.seed(SEED)
+# === Directory layout ===
+# All paths are resolved relative to the directory holding this script.
+BASE_DIR = Path(__file__).resolve().parent
+DATA_DIR = BASE_DIR / "data"
+FIG_DIR = BASE_DIR / "figures"
+RESULTS_DIR = BASE_DIR / "results"
+# Create data/, figures/ and results/ if they do not exist yet.
+for d in [DATA_DIR, FIG_DIR, RESULTS_DIR]:
+    d.mkdir(parents=True, exist_ok=True)
+# === Publication-quality plot style ===
+# Global matplotlib defaults: sans-serif fonts, no top/right spines,
+# 300 dpi tight-bbox saves on a white figure background.
+plt.rcParams.update({
+    "font.family": "sans-serif",
+    "font.sans-serif": ["Arial", "Helvetica", "DejaVu Sans"],
+    "font.size": 10,
+    "axes.titlesize": 12,
+    "axes.labelsize": 11,
+    "axes.spines.top": False,
+    "axes.spines.right": False,
+    "savefig.dpi": 300,
+    "savefig.bbox": "tight",
+    "figure.facecolor": "white",
+})
+```
+## 2. メイン関数テンプレート
+```python
+def main():
+    """メインパイプライン実行関数。"""
+    start_time = time.time()
+    print("=" * 60)
+    print("Exp-XX: [実験タイトル]")
+    print("=" * 60)
+    # ──── Step 1: データ読み込み / 生成 ────
+    print("\n[Step 1] データ読み込み...")
+    # df = pd.read_csv(DATA_DIR / "dataset.csv")
+    # df = generate_dataset()
+    # df.to_csv(DATA_DIR / "dataset.csv", index=False)
+    # ──── Step 2: EDA ────
+    print("\n[Step 2] 探索的データ解析...")
+    # → scientific-eda-correlation スキル参照
+    # ──── Step 3: 前処理 ────
+    print("\n[Step 3] 前処理...")
+    # → scientific-data-preprocessing スキル参照
+    # ──── Step 4: モデル学習 ────
+    print("\n[Step 4] モデル学習...")
+    # → scientific-ml-regression / ml-classification スキル参照
+    # ──── Step 5: 可視化 ────
+    print("\n[Step 5] 可視化...")
+    # → scientific-publication-figures スキル参照
+    # ──── Step 6: サマリー ────
+    elapsed = time.time() - start_time
+    print(f"\n[Step 6] サマリー生成... (elapsed: {elapsed:.1f}s)")
+    # → generate_summary() 呼び出し
+    print("\n" + "=" * 60)
+    print(f"完了！ ({elapsed:.1f} 秒)")
+    print("=" * 60)
+if __name__ == "__main__":
+    main()
+```
+## 3. 進捗ログユーティリティ
+```python
+class StepLogger:
+    """段階的進捗ログを管理するユーティリティ。"""
+    def __init__(self, experiment_name):
+        self.experiment_name = experiment_name
+        self.step_count = 0
+        self.start_time = time.time()
+        self.step_times = {}
+        print("=" * 60)
+        print(f"{experiment_name}")
+        print("=" * 60)
+    def step(self, description):
+        """新しい Step を開始する。"""
+        self.step_count += 1
+        step_start = time.time()
+        if self.step_count > 1:
+            prev = self.step_count - 1
+            self.step_times[prev] = time.time() - self._current_step_start
+        self._current_step_start = step_start
+        print(f"\n[Step {self.step_count}] {description}...")
+    def finish(self):
+        """パイプライン完了を記録する。"""
+        elapsed = time.time() - self.start_time
+        if self.step_count > 0:
+            self.step_times[self.step_count] = time.time() - self._current_step_start
+        print(f"\n{'=' * 60}")
+        print(f"完了！ (合計 {elapsed:.1f} 秒)")
+        for step_num, t in self.step_times.items():
+            print(f"  Step {step_num}: {t:.1f}s")
+        print(f"{'=' * 60}")
+        return elapsed
+```
+## 4. JSON サマリー生成
+全実験で共通する `analysis_summary.json` のスキーマ。
+```python
+def generate_summary(results_dict, experiment_name, elapsed_seconds,
+                     output_path=None):
+    """
+    標準フォーマットの analysis_summary.json を生成する。
+    results_dict はドメイン固有の結果を含む辞書。
+    この関数がメタ情報を自動追加する。
+    """
+    import datetime
+    summary = {
+        "experiment": experiment_name,
+        "timestamp": datetime.datetime.now().isoformat(),
+        "elapsed_seconds": round(elapsed_seconds, 2),
+        "environment": {
+            "python": __import__("sys").version,
+            "seed": SEED,
+        },
+        "data": {
+            "n_samples": results_dict.get("n_samples"),
+            "n_features": results_dict.get("n_features"),
+            "source": results_dict.get("data_source", "simulation"),
+        },
+        "results": results_dict,
+    }
+    if output_path is None:
+        output_path = RESULTS_DIR / "analysis_summary.json"
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(summary, f, indent=2, ensure_ascii=False, default=str)
+    print(f"  → Summary saved: {output_path}")
+    return summary
+```
+### JSON サマリー標準キー一覧
+| キー | 型 | 説明 | 必須 |
+|---|---|---|---|
+| `experiment` | str | 実験名 | ✅ |
+| `timestamp` | str | ISO 8601 日時 | ✅ |
+| `elapsed_seconds` | float | 実行時間 | ✅ |
+| `environment.python` | str | Python バージョン | ✅ |
+| `environment.seed` | int | 乱数シード | ✅ |
+| `data.n_samples` | int | サンプル数 | ✅ |
+| `data.n_features` | int | 特徴量数 | ○ |
+| `data.source` | str | データソース | ○ |
+| `results.best_model` | str | 最良モデル名 | ○ |
+| `results.best_r2` / `best_auc` | float | 最良スコア | ○ |
+| `results.n_figures` | int | 生成図数 | ○ |
+## 5. 総括ダッシュボードパネル
+```python
+import matplotlib.gridspec as gridspec
+def create_summary_panel(panel_data, experiment_name, figsize=(20, 14)):
+    """
+    解析結果の総括ダッシュボードを 1 枚の Figure にまとめる。
+    panel_data: [
+        {"type": "table", "title": "...", "data": df_or_dict},
+        {"type": "plot_func", "title": "...", "func": callable, "kwargs": {}},
+        {"type": "text", "title": "...", "text": "..."},
+        {"type": "metrics_bar", "title": "...", "names": [...], "values": [...]},
+    ]
+    """
+    n_panels = len(panel_data)
+    ncols = min(3, n_panels)
+    nrows = (n_panels + ncols - 1) // ncols
+    fig = plt.figure(figsize=figsize)
+    gs = gridspec.GridSpec(nrows, ncols, figure=fig, hspace=0.4, wspace=0.3)
+    for i, panel in enumerate(panel_data):
+        row, col = divmod(i, ncols)
+        ax = fig.add_subplot(gs[row, col])
+        ptype = panel["type"]
+        title = panel.get("title", f"Panel {chr(65 + i)}")
+        if ptype == "metrics_bar":
+            ax.barh(panel["names"], panel["values"],
+                   color="steelblue", edgecolor="black")
+            ax.set_xlabel("Value")
+        elif ptype == "text":
+            ax.text(0.05, 0.95, panel["text"], transform=ax.transAxes,
+                   fontsize=9, verticalalignment="top", fontfamily="monospace",
+                   bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5))
+            ax.axis("off")
+        elif ptype == "table":
+            ax.axis("off")
+            if isinstance(panel["data"], pd.DataFrame):
+                tbl = ax.table(cellText=panel["data"].values,
+                              colLabels=panel["data"].columns,
+                              cellLoc="center", loc="center")
+                tbl.auto_set_font_size(False)
+                tbl.set_fontsize(8)
+            elif isinstance(panel["data"], dict):
+                rows = [[k, str(v)] for k, v in panel["data"].items()]
+                tbl = ax.table(cellText=rows, colLabels=["Metric", "Value"],
+                              cellLoc="center", loc="center")
+                tbl.auto_set_font_size(False)
+                tbl.set_fontsize(9)
+        elif ptype == "plot_func":
+            panel["func"](ax=ax, **panel.get("kwargs", {}))
+        # パネルラベル (A, B, C, ...)
+        ax.set_title(f"({chr(65 + i)}) {title}", fontsize=11, fontweight="bold")
+    fig.suptitle(f"Summary: {experiment_name}", fontsize=14, fontweight="bold")
+    plt.savefig(FIG_DIR / "summary_panel.png", dpi=300, bbox_inches="tight")
+    plt.close()
+```
+## 6. 共通ユーティリティ
+```python
+def save_fig(fig, filename, dpi=300, formats=("png",)):
+    """図を保存してクローズする共通関数。"""
+    for fmt in formats:
+        fig.savefig(FIG_DIR / f"{filename}.{fmt}",
+                   dpi=dpi, bbox_inches="tight",
+                   facecolor="white", edgecolor="none")
+    plt.close(fig)
+    print(f"  → Figure saved: {filename}")
+def save_results(df, filename, index=False):
+    """DataFrame を results/ に保存する共通関数。"""
+    path = RESULTS_DIR / filename
+    df.to_csv(path, index=index)
+    print(f"  → Results saved: {filename}")
+```
+## ディレクトリ構造の標準
+```
+Exp-XX/
+├── exp_analysis.py          # メインスクリプト
+├── qiita-exp-analysis.md    # Qiita 記事
+├── data/
+│   └── dataset.csv          # 入力データ
+├── figures/
+│   ├── Fig01_*.png          # 個別図（Fig + 連番 + 説明）
+│   ├── Fig02_*.png
+│   └── summary_panel.png    # 総括ダッシュボード
+└── results/
+    ├── descriptive_statistics.csv
+    ├── model_metrics.csv
+    ├── feature_importance.csv
+    └── analysis_summary.json  # JSON サマリー
+```
+## References
+- **全 13 実験**: ディレクトリ構造、シード管理、warnings 抑制
+- **Exp-12, 13**: `main()` + 実行時間計測 + JSON サマリー + 総括パネル
+- **Exp-10**: `save_fig()` / `write_summary()` ユーティリティ