npm - @nahisaho/satori - Versions diffs - 0.23.0 → 0.25.0 - Mend

@nahisaho/satori 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/src/.github/skills/scientific-data-profiling/SKILL.md ADDED Viewed

@@ -0,0 +1,247 @@
+---
+name: scientific-data-profiling
+description: |
+  データプロファイリング・品質スキル。ydata-profiling 自動 EDA・
+  Great Expectations データバリデーション・データ品質スコア・
+  型推論・相関検出・外れ値フラグ・データカタログ生成。
+---
+# Scientific Data Profiling
+データセットの包括的プロファイリング・品質評価・
+自動 EDA レポートパイプラインを提供する。
+## When to Use
+- 新しいデータセットの全体像を素早く把握するとき
+- データ品質スコアを算出して品質基準をチェックするとき
+- ydata-profiling で自動 EDA レポートを生成するとき
+- Great Expectations スタイルのデータバリデーションルールを定義・検証するとき
+- データカタログ (辞書) を自動生成するとき
+- 相関・外れ値・欠損を一括診断するとき
+---
+## Quick Start
+## 1. ydata-profiling 自動 EDA
+```python
+import numpy as np
+import pandas as pd
+def auto_profile_report(df, title="Data Profile Report",
+                        minimal=False, output="profile_report.html"):
+    """
+    ydata-profiling 自動 EDA レポート。
+    Parameters:
+        df: pd.DataFrame — 入力データ
+        title: str — レポートタイトル
+        minimal: bool — 軽量モード
+        output: str — 出力 HTML パス
+    """
+    from ydata_profiling import ProfileReport
+    profile = ProfileReport(
+        df, title=title, minimal=minimal,
+        correlations={"pearson": {"calculate": True},
+                      "spearman": {"calculate": True},
+                      "kendall": {"calculate": True}},
+        missing_diagrams={"bar": True, "matrix": True, "heatmap": True})
+    profile.to_file(output)
+    # サマリー抽出
+    desc = profile.get_description()
+    summary = {
+        "n_rows": len(df),
+        "n_cols": len(df.columns),
+        "n_numeric": len(df.select_dtypes(include=[np.number]).columns),
+        "n_categorical": len(df.select_dtypes(include=["object", "category"]).columns),
+        "total_missing": int(df.isnull().sum().sum()),
+        "missing_pct": float(df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100),
+        "n_duplicates": int(df.duplicated().sum()),
+    }
+    print(f"Profile Report → {output}")
+    print(f"  {summary['n_rows']} rows × {summary['n_cols']} cols, "
+          f"{summary['missing_pct']:.1f}% missing, "
+          f"{summary['n_duplicates']} duplicates")
+    return {"report_path": output, "summary": summary}
+```
+## 2. データ品質スコア
+```python
+def data_quality_score(df, rules=None):
+    """
+    データ品質スコア算出 (0-100)。
+    Parameters:
+        df: pd.DataFrame — 入力データ
+        rules: dict | None — カスタムルール
+    """
+    scores = {}
+    # 1. 完全性 (Completeness) — 非欠損率
+    completeness = 1.0 - df.isnull().sum().sum() / (len(df) * len(df.columns))
+    scores["completeness"] = completeness
+    # 2. 一意性 (Uniqueness) — 非重複率
+    uniqueness = 1.0 - df.duplicated().sum() / len(df) if len(df) > 0 else 1.0
+    scores["uniqueness"] = uniqueness
+    # 3. 一貫性 (Consistency) — 型一貫性
+    type_consistent = 0
+    for col in df.columns:
+        non_null = df[col].dropna()
+        if len(non_null) == 0:
+            type_consistent += 1
+            continue
+        try:
+            inferred = pd.api.types.infer_dtype(non_null, skipna=True)
+            if inferred not in ["mixed", "mixed-integer"]:
+                type_consistent += 1
+        except Exception:
+            pass
+    consistency = type_consistent / len(df.columns) if len(df.columns) > 0 else 1.0
+    scores["consistency"] = consistency
+    # 4. 適時性 (Timeliness) — 日付カラムの新しさ
+    date_cols = df.select_dtypes(include=["datetime64"]).columns
+    if len(date_cols) > 0:
+        max_date = df[date_cols[0]].max()
+        freshness = 1.0  # Placeholder
+        scores["timeliness"] = freshness
+    else:
+        scores["timeliness"] = 1.0
+    # 5. 妥当性 (Validity) — 数値カラムの有限性
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    if len(numeric_cols) > 0:
+        finite_rate = df[numeric_cols].apply(lambda x: np.isfinite(x.dropna()).mean()).mean()
+        scores["validity"] = float(finite_rate)
+    else:
+        scores["validity"] = 1.0
+    # 総合スコア
+    weights = {"completeness": 0.3, "uniqueness": 0.2,
+               "consistency": 0.2, "timeliness": 0.1, "validity": 0.2}
+    total_score = sum(scores[k] * weights[k] for k in weights) * 100
+    # カスタムルール
+    rule_results = []
+    if rules:
+        for rule_name, rule_fn in rules.items():
+            try:
+                passed = rule_fn(df)
+                rule_results.append({"rule": rule_name, "passed": passed})
+            except Exception as e:
+                rule_results.append({"rule": rule_name, "passed": False,
+                                     "error": str(e)})
+    print(f"Data Quality Score: {total_score:.1f}/100")
+    for k, v in scores.items():
+        print(f"  {k}: {v:.3f}")
+    return {"total_score": total_score, "dimension_scores": scores,
+            "rule_results": rule_results}
+```
+## 3. Great Expectations バリデーション
+```python
+def great_expectations_validate(df, expectations=None):
+    """
+    Great Expectations スタイルのデータバリデーション。
+    Parameters:
+        df: pd.DataFrame — 入力データ
+        expectations: list[dict] | None — バリデーションルール
+    """
+    if expectations is None:
+        expectations = _auto_generate_expectations(df)
+    results = []
+    for exp in expectations:
+        exp_type = exp["type"]
+        col = exp.get("column")
+        kwargs = exp.get("kwargs", {})
+        try:
+            if exp_type == "expect_column_to_exist":
+                success = col in df.columns
+            elif exp_type == "expect_column_values_to_not_be_null":
+                max_pct = kwargs.get("mostly", 1.0)
+                non_null_pct = df[col].notnull().mean()
+                success = non_null_pct >= max_pct
+            elif exp_type == "expect_column_values_to_be_between":
+                min_val, max_val = kwargs["min_value"], kwargs["max_value"]
+                vals = df[col].dropna()
+                success = bool((vals >= min_val).all() and (vals <= max_val).all())
+            elif exp_type == "expect_column_values_to_be_unique":
+                success = not df[col].duplicated().any()
+            elif exp_type == "expect_column_values_to_be_in_set":
+                valid_set = set(kwargs["value_set"])
+                success = df[col].dropna().isin(valid_set).all()
+            elif exp_type == "expect_table_row_count_to_be_between":
+                success = kwargs["min_value"] <= len(df) <= kwargs["max_value"]
+            else:
+                success = None
+            results.append({"expectation": exp_type, "column": col,
+                            "success": success})
+        except Exception as e:
+            results.append({"expectation": exp_type, "column": col,
+                            "success": False, "error": str(e)})
+    results_df = pd.DataFrame(results)
+    n_pass = results_df["success"].sum()
+    n_total = len(results_df)
+    print(f"Validation: {n_pass}/{n_total} expectations passed "
+          f"({n_pass/n_total*100:.0f}%)")
+    return results_df
+def _auto_generate_expectations(df):
+    """自動でバリデーションルールを推論。"""
+    expectations = []
+    for col in df.columns:
+        expectations.append({"type": "expect_column_to_exist", "column": col})
+        expectations.append({
+            "type": "expect_column_values_to_not_be_null",
+            "column": col,
+            "kwargs": {"mostly": 0.9}})
+        if df[col].dtype in [np.float64, np.int64]:
+            q1, q3 = df[col].quantile([0.01, 0.99])
+            iqr = q3 - q1
+            expectations.append({
+                "type": "expect_column_values_to_be_between",
+                "column": col,
+                "kwargs": {"min_value": float(q1 - 3 * iqr),
+                           "max_value": float(q3 + 3 * iqr)}})
+    return expectations
+```
+---
+## パイプライン統合
+```
+[データ取得] → data-profiling → eda-correlation
+                (品質診断)       (探索的解析)
+                     │                ↓
+          missing-data-analysis  anomaly-detection
+            (欠損補完)             (異常検知)
+```
+## パイプライン出力
+| ファイル | 説明 | 次スキル |
+|---------|------|---------|
+| `profile_report.html` | ydata-profiling レポート | → EDA |
+| `quality_score.json` | データ品質スコア | → 品質管理 |
+| `validation_results.csv` | バリデーション結果 | → データ修正 |

package/src/.github/skills/scientific-federated-learning/SKILL.md ADDED Viewed

@@ -0,0 +1,241 @@
+---
+name: scientific-federated-learning
+description: |
+  連合学習スキル。Flower フレームワークによる FL パイプライン・
+  FedAvg/FedProx/FedOpt 集約戦略・差分プライバシー (DP-SGD)・
+  非 IID データ分割・通信効率化。
+---
+# Scientific Federated Learning
+プライバシー保護型分散機械学習を実現する連合学習パイプラインを提供する。
+## When to Use
+- 複数施設・組織のデータを集約せずにモデル学習するとき
+- 医療データ・個人情報を含むデータで ML を行うとき
+- 差分プライバシーを適用した学習が必要なとき
+- 非 IID データ分割下での連合学習を設計するとき
+- 通信効率を考慮した分散学習を構築するとき
+---
+## Quick Start
+## 1. Flower 連合学習パイプライン
+```python
+import flwr as fl
+import numpy as np
+from typing import Dict, List, Tuple, Optional
+def create_fl_client(model, train_loader, val_loader,
+                     device="cpu"):
+    """
+    Flower クライアント生成。
+    Parameters:
+        model: nn.Module — PyTorch モデル
+        train_loader: DataLoader — 訓練データ
+        val_loader: DataLoader — 検証データ
+        device: str — "cpu" / "cuda"
+    """
+    import torch
+    class SatoriFlClient(fl.client.NumPyClient):
+        def get_parameters(self, config):
+            return [val.cpu().numpy()
+                    for val in model.parameters()]
+        def set_parameters(self, parameters):
+            for param, new_val in zip(model.parameters(), parameters):
+                param.data = torch.tensor(new_val).to(device)
+        def fit(self, parameters, config):
+            self.set_parameters(parameters)
+            model.train()
+            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+            criterion = torch.nn.CrossEntropyLoss()
+            epochs = config.get("local_epochs", 1)
+            for _ in range(epochs):
+                for X, y in train_loader:
+                    X, y = X.to(device), y.to(device)
+                    optimizer.zero_grad()
+                    loss = criterion(model(X), y)
+                    loss.backward()
+                    optimizer.step()
+            return self.get_parameters(config), len(train_loader.dataset), {}
+        def evaluate(self, parameters, config):
+            self.set_parameters(parameters)
+            model.eval()
+            criterion = torch.nn.CrossEntropyLoss()
+            total_loss, correct, total = 0.0, 0, 0
+            with torch.no_grad():
+                for X, y in val_loader:
+                    X, y = X.to(device), y.to(device)
+                    preds = model(X)
+                    total_loss += criterion(preds, y).item() * len(y)
+                    correct += (preds.argmax(1) == y).sum().item()
+                    total += len(y)
+            return total_loss / total, total, {"accuracy": correct / total}
+    return SatoriFlClient()
+def create_fl_strategy(algorithm="fedavg", min_clients=2,
+                       fraction_fit=1.0, fraction_evaluate=1.0,
+                       proximal_mu=0.1):
+    """
+    連合学習集約戦略の選択。
+    Parameters:
+        algorithm: str — "fedavg" / "fedprox" / "fedopt" / "fedadam"
+        min_clients: int — 最小クライアント数
+        fraction_fit: float — 学習参加率
+        fraction_evaluate: float — 評価参加率
+        proximal_mu: float — FedProx 近接項の強度
+    """
+    common = dict(
+        min_fit_clients=min_clients,
+        min_evaluate_clients=min_clients,
+        min_available_clients=min_clients,
+        fraction_fit=fraction_fit,
+        fraction_evaluate=fraction_evaluate,
+    )
+    strategies = {
+        "fedavg": fl.server.strategy.FedAvg(**common),
+        "fedprox": fl.server.strategy.FedProx(
+            proximal_mu=proximal_mu, **common),
+        "fedadam": fl.server.strategy.FedAdam(
+            eta=1e-1, eta_l=1e-1, tau=1e-9, **common),
+    }
+    strategy = strategies.get(algorithm, strategies["fedavg"])
+    print(f"FL Strategy: {algorithm} | min_clients={min_clients}")
+    return strategy
+```
+## 2. 差分プライバシー (DP-SGD)
+```python
+def apply_differential_privacy(model, train_loader,
+                               target_epsilon=1.0,
+                               target_delta=1e-5,
+                               max_grad_norm=1.0,
+                               noise_multiplier=1.1,
+                               epochs=10, lr=1e-3):
+    """
+    Opacus DP-SGD による差分プライバシー学習。
+    Parameters:
+        model: nn.Module — PyTorch モデル
+        train_loader: DataLoader — 訓練データ
+        target_epsilon: float — プライバシーバジェット ε
+        target_delta: float — プライバシーパラメータ δ
+        max_grad_norm: float — 勾配クリッピングノルム
+        noise_multiplier: float — ノイズ乗数 σ
+        epochs: int — 学習エポック数
+        lr: float — 学習率
+    """
+    import torch
+    from opacus import PrivacyEngine
+    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
+    privacy_engine = PrivacyEngine()
+    model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
+        module=model,
+        optimizer=optimizer,
+        data_loader=train_loader,
+        epochs=epochs,
+        target_epsilon=target_epsilon,
+        target_delta=target_delta,
+        max_grad_norm=max_grad_norm,
+    )
+    criterion = torch.nn.CrossEntropyLoss()
+    history = []
+    for epoch in range(epochs):
+        model.train()
+        total_loss = 0
+        for X, y in train_loader:
+            optimizer.zero_grad()
+            loss = criterion(model(X), y)
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+        epsilon = privacy_engine.get_epsilon(delta=target_delta)
+        history.append({"epoch": epoch + 1,
+                        "loss": total_loss / len(train_loader),
+                        "epsilon": epsilon})
+        print(f"Epoch {epoch+1}: loss={total_loss/len(train_loader):.4f}, "
+              f"ε={epsilon:.2f}")
+    import pandas as pd
+    return pd.DataFrame(history)
+```
+## 3. 非 IID データ分割
+```python
+def create_non_iid_splits(dataset_labels, n_clients=5,
+                          alpha=0.5, seed=42):
+    """
+    Dirichlet 分布ベースの非 IID データ分割。
+    Parameters:
+        dataset_labels: np.ndarray — 全データのラベル配列
+        n_clients: int — クライアント数
+        alpha: float — Dirichlet α (小さいほど偏りが大きい)
+        seed: int — 乱数シード
+    """
+    rng = np.random.default_rng(seed)
+    n_classes = len(np.unique(dataset_labels))
+    client_indices = [[] for _ in range(n_clients)]
+    for c in range(n_classes):
+        class_idx = np.where(dataset_labels == c)[0]
+        proportions = rng.dirichlet(np.repeat(alpha, n_clients))
+        split_points = (np.cumsum(proportions) * len(class_idx)).astype(int)
+        splits = np.split(class_idx, split_points[:-1])
+        for i, split in enumerate(splits):
+            client_indices[i].extend(split.tolist())
+    # 分布サマリー
+    for i, indices in enumerate(client_indices):
+        labels = dataset_labels[indices]
+        unique, counts = np.unique(labels, return_counts=True)
+        dist = dict(zip(unique.tolist(), counts.tolist()))
+        print(f"Client {i}: {len(indices)} samples, dist={dist}")
+    return client_indices
+```
+---
+## パイプライン統合
+```
+[プライバシー要件] → federated-learning → model-monitoring
+                      (連合学習)              (モデル監視)
+                           │
+                    deep-learning ← transfer-learning
+                      (基盤 NN)       (転移学習)
+```
+## パイプライン出力
+| ファイル | 説明 | 次スキル |
+|---------|------|---------|
+| `fl_strategy_config.json` | FL 集約設定 | → サーバー起動 |
+| `dp_training_history.csv` | DP 学習履歴 | → model-monitoring |
+| `client_splits.json` | 非 IID 分割情報 | → FL クライアント |