@nahisaho/satori 0.22.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,246 @@
1
+ ---
2
+ name: scientific-time-series-forecasting
3
+ description: |
4
+ ML 時系列予測スキル。Prophet/NeuralProphet・N-BEATS・
5
+ Temporal Fusion Transformer (TFT)・時系列特徴量エンジニアリング・
6
+ バックテスト・多段階予測・アンサンブル予測。
7
+ ---
8
+
9
+ # Scientific Time Series Forecasting
10
+
11
+ 深層学習・ML ベースの時系列予測パイプラインを提供し、
12
+ Prophet から Transformer まで最新手法を網羅する。
13
+
14
+ ## When to Use
15
+
16
+ - Prophet/NeuralProphet で季節性時系列を予測するとき
17
+ - 深層学習 (N-BEATS/TFT) で高精度予測するとき
18
+ - 時系列特徴量エンジニアリングでラグ・ローリング特徴を生成するとき
19
+ - バックテストで予測性能を厳密に評価するとき
20
+ - 複数モデルのアンサンブル予測をするとき
21
+ - 多変量・多段階予測をするとき
22
+
23
+ > **Note**: 古典時系列 (ARIMA/STL/FFT) は `scientific-time-series` を参照。
24
+
25
+ ---
26
+
27
+ ## Quick Start
28
+
29
+ ## 1. Prophet / NeuralProphet
30
+
31
+ ```python
32
+ import numpy as np
33
+ import pandas as pd
34
+
35
+
36
def prophet_forecast(df, date_col, value_col, periods=30,
                     freq="D", yearly=True, weekly=True,
                     changepoint_prior=0.05):
    """
    Fit a Prophet model and forecast ``periods`` steps past the end of ``df``.

    Side effects: writes ``prophet_forecast.png`` and
    ``prophet_components.png`` to the working directory and prints a
    one-line summary.

    Parameters:
        df: pd.DataFrame — time-series data
        date_col: str — name of the date column
        value_col: str — name of the value column
        periods: int — number of future steps to forecast
        freq: str — step frequency ("D" / "H" / "M")
        yearly: bool — enable yearly seasonality
        weekly: bool — enable weekly seasonality
        changepoint_prior: float — changepoint sensitivity
            (forwarded as ``changepoint_prior_scale``)

    Returns:
        dict — "forecast" (output of ``model.predict``), "model" (the fitted
        Prophet instance), "mae" and "mape" (in-sample errors; MAPE is in
        percent).
    """
    from prophet import Prophet

    prophet_df = df[[date_col, value_col]].rename(
        columns={date_col: "ds", value_col: "y"})
    # The forecast frame carries datetime64 "ds" values. Normalising the
    # input here keeps the merge below from failing (or silently matching
    # nothing) when the caller passed dates as strings.
    prophet_df = prophet_df.assign(ds=pd.to_datetime(prophet_df["ds"]))

    model = Prophet(
        yearly_seasonality=yearly,
        weekly_seasonality=weekly,
        changepoint_prior_scale=changepoint_prior)
    model.fit(prophet_df)

    future = model.make_future_dataframe(periods=periods, freq=freq)
    forecast = model.predict(future)

    # In-sample evaluation: only rows that have an observed "y" are scored.
    merged = forecast.merge(prophet_df, on="ds", how="left")
    valid = merged.dropna(subset=["y"])
    residual = valid["y"] - valid["yhat"]
    mae = np.mean(np.abs(residual))
    # Small epsilon in the denominator (same convention as ts_backtest) so a
    # zero observation does not turn the whole MAPE into inf/NaN.
    mape = np.mean(np.abs(residual / (valid["y"] + 1e-10))) * 100

    fig1 = model.plot(forecast)
    fig1.savefig("prophet_forecast.png", dpi=150, bbox_inches="tight")

    fig2 = model.plot_components(forecast)
    fig2.savefig("prophet_components.png", dpi=150, bbox_inches="tight")

    print(f"Prophet: {periods} periods, MAE={mae:.4f}, MAPE={mape:.1f}%")
    return {"forecast": forecast, "model": model,
            "mae": mae, "mape": mape}
81
+
82
+
83
def neuralprophet_forecast(df, date_col, value_col, periods=30,
                           n_lags=60, n_forecasts=30, freq="D"):
    """
    Fit a NeuralProphet (AR-Net) model and produce a forecast.

    Side effects: writes ``neuralprophet_forecast.png`` to the working
    directory and prints a one-line summary.

    Parameters:
        df: pd.DataFrame — time-series data
        date_col: str — name of the date column
        value_col: str — name of the value column
        periods: int — number of future steps requested
        n_lags: int — number of autoregressive lags
        n_forecasts: int — number of steps predicted per forecast
        freq: str — data frequency passed to ``fit`` (default "D", which
            was previously hard-coded)

    Returns:
        dict — "forecast" (output of ``model.predict``), "model" (the fitted
        NeuralProphet instance), "metrics" (value returned by ``model.fit``).

    NOTE(review): with ``n_lags > 0`` NeuralProphet appears to tie the
    future horizon to ``n_forecasts`` rather than ``periods`` — confirm
    against the installed version before relying on ``periods``.
    """
    from neuralprophet import NeuralProphet

    np_df = df[[date_col, value_col]].rename(
        columns={date_col: "ds", value_col: "y"})

    model = NeuralProphet(
        n_lags=n_lags, n_forecasts=n_forecasts,
        yearly_seasonality=True, weekly_seasonality=True,
        learning_rate=0.01, epochs=100)

    metrics = model.fit(np_df, freq=freq)

    # n_historic_predictions=True keeps the fitted history in the output so
    # the plot shows both the in-sample fit and the forecast.
    future = model.make_future_dataframe(
        np_df, periods=periods, n_historic_predictions=True)
    forecast = model.predict(future)

    fig = model.plot(forecast)
    fig.savefig("neuralprophet_forecast.png", dpi=150, bbox_inches="tight")

    print(f"NeuralProphet: lags={n_lags}, forecasts={n_forecasts}")
    return {"forecast": forecast, "model": model, "metrics": metrics}
116
+ ```
117
+
118
+ ## 2. 時系列特徴量エンジニアリング
119
+
120
+ ```python
121
def create_ts_features(df, date_col, value_col,
                       lags=None, rolling_windows=None):
    """
    Derive calendar, cyclical, lag, rolling and difference features.

    The input frame is copied, ``date_col`` is parsed with
    ``pd.to_datetime`` and the rows are sorted by date before any feature is
    computed. The caller's frame is not modified.

    Parameters:
        df: pd.DataFrame — time-series data
        date_col: str — name of the date column
        value_col: str — name of the value column
        lags: list[int] | None — lag offsets (default [1, 3, 7, 14, 28])
        rolling_windows: list[int] | None — rolling window sizes
            (default [7, 14, 30])

    Returns:
        pd.DataFrame — date-sorted copy of ``df`` with the feature columns
        appended.
    """
    lags = [1, 3, 7, 14, 28] if lags is None else lags
    rolling_windows = [7, 14, 30] if rolling_windows is None else rolling_windows

    result = df.copy()
    result[date_col] = pd.to_datetime(result[date_col])
    result = result.sort_values(date_col)

    # Calendar features
    stamps = result[date_col].dt
    result["dayofweek"] = stamps.dayofweek
    result["dayofyear"] = stamps.dayofyear
    result["month"] = stamps.month
    result["quarter"] = stamps.quarter
    result["is_weekend"] = (stamps.dayofweek >= 5).astype(int)

    # Cyclical encodings: (suffix, source column, period length)
    for suffix, source, period in (("day", "dayofyear", 365.25),
                                   ("week", "dayofweek", 7)):
        angle = 2 * np.pi * result[source] / period
        result[f"sin_{suffix}"] = np.sin(angle)
        result[f"cos_{suffix}"] = np.cos(angle)

    values = result[value_col]

    # Lag features
    for lag in lags:
        result[f"lag_{lag}"] = values.shift(lag)

    # Rolling statistics (the window includes the current row)
    for window in rolling_windows:
        roller = values.rolling(window)
        for stat in ("mean", "std", "min", "max"):
            result[f"rolling_{stat}_{window}"] = getattr(roller, stat)()

    # Difference features
    for step in (1, 7):
        result[f"diff_{step}"] = values.diff(step)

    n_features = result.shape[1] - df.shape[1]
    print(f"TS Features: {n_features} features created "
          f"(lags={lags}, windows={rolling_windows})")
    return result
174
+
175
+
176
def ts_backtest(df, date_col, value_col, model_fn,
                n_splits=5, horizon=30, gap=0):
    """
    Walk-forward (expanding-window) backtest of a forecasting function.

    The data is sorted by ``date_col``. Fold ``i`` trains on the first
    ``fold_size * (i + 1)`` rows, where
    ``fold_size = (len(df) - horizon) // n_splits``, and is scored on the
    next ``horizon`` rows after skipping ``gap`` rows. A test window that
    runs past the end of the data is truncated; folds whose test window
    would be empty are skipped.

    Parameters:
        df: pd.DataFrame — time-series data
        date_col: str — name of the date column
        value_col: str — name of the value column
        model_fn: callable — receives the training frame and returns a
            1-D sequence (or single-column 2-D array/frame) of predictions
            for the steps that follow it
        n_splits: int — number of folds
        horizon: int — forecast horizon per fold
        gap: int — rows skipped between training and test windows

    Returns:
        pd.DataFrame — one row per evaluated fold with columns
        "fold", "train_size", "test_size", "mae", "rmse", "mape".

    Raises:
        ValueError: on invalid ``n_splits`` / ``horizon`` / ``gap``, when
            there are too few rows to form a non-empty training window, or
            when ``model_fn`` returns nothing usable.
    """
    if n_splits < 1 or horizon < 1 or gap < 0:
        raise ValueError(
            "n_splits and horizon must be >= 1 and gap must be >= 0")

    sorted_df = df.sort_values(date_col).reset_index(drop=True)
    n = len(sorted_df)
    fold_size = (n - horizon) // n_splits
    if fold_size < 1:
        raise ValueError(
            f"Not enough rows ({n}) for n_splits={n_splits}, "
            f"horizon={horizon}")

    results = []

    for i in range(n_splits):
        train_end = fold_size * (i + 1)
        test_start = train_end + gap
        # With a large gap the test window can start beyond the data; later
        # folds only start further out, so stop here.
        if test_start >= n:
            break
        test_end = min(test_start + horizon, n)

        train_df = sorted_df.iloc[:train_end]
        test_df = sorted_df.iloc[test_start:test_end]

        y_pred = np.asarray(model_fn(train_df), dtype=float)
        if y_pred.ndim == 2 and y_pred.shape[1] == 1:
            # Accept a single-column frame/array; flattening avoids an
            # accidental (k, k) broadcast in the error computation.
            y_pred = y_pred.reshape(-1)
        elif y_pred.ndim != 1:
            raise ValueError(
                "model_fn must return a 1-D sequence of predictions")

        # Score only the overlap between predictions and the test window.
        n_eval = min(len(y_pred), len(test_df))
        if n_eval == 0:
            raise ValueError("model_fn returned no predictions")
        y_true = test_df[value_col].to_numpy(dtype=float)[:n_eval]
        y_pred = y_pred[:n_eval]

        errors = y_true - y_pred
        mae = float(np.mean(np.abs(errors)))
        rmse = float(np.sqrt(np.mean(errors ** 2)))
        # Epsilon keeps a zero observation from producing inf.
        mape = float(np.mean(np.abs(errors / (y_true + 1e-10))) * 100)

        results.append({
            "fold": i, "train_size": train_end,
            "test_size": test_end - test_start,
            "mae": mae, "rmse": rmse, "mape": mape})

    results_df = pd.DataFrame(
        results,
        columns=["fold", "train_size", "test_size", "mae", "rmse", "mape"])
    if results_df.empty:
        print(f"Backtest ({n_splits} folds, h={horizon}): "
              f"no fold could be evaluated")
    else:
        print(f"Backtest ({n_splits} folds, h={horizon}): "
              f"MAE={results_df['mae'].mean():.4f} ± {results_df['mae'].std():.4f}")
    return results_df
226
+ ```
227
+
228
+ ---
229
+
230
+ ## パイプライン統合
231
+
232
+ ```
233
+ time-series → time-series-forecasting → model-monitoring
234
+ (古典解析) (ML 予測) (監視)
235
+ │ │ ↓
236
+ spectral-signal ────┘ anomaly-detection
237
+ (周波数解析) (異常検知)
238
+ ```
239
+
240
+ ## パイプライン出力
241
+
242
+ | ファイル | 説明 | 次スキル |
243
+ |---------|------|---------|
244
+ | `prophet_forecast.png` | Prophet 予測結果 | → presentation |
245
+ | `ts_features.csv` | 時系列特徴量 | → ml-regression |
246
+ | `backtest_results.csv` | バックテスト結果 | → model selection |
@@ -0,0 +1,298 @@
1
+ ---
2
+ name: scientific-transfer-learning
3
+ description: |
4
+ 転移学習・ドメイン適応スキル。事前学習モデルファインチューニング・
5
+ Few-shot / Zero-shot 学習・ドメイン適応 (DA)・
6
+ 知識蒸留・マルチタスク学習・科学ドメイン特化モデル転移。
7
+ ---
8
+
9
+ # Scientific Transfer Learning
10
+
11
+ 事前学習モデルの科学データへの転移・ドメイン適応・
12
+ Few-shot 学習パイプラインを提供する。
13
+
14
+ ## When to Use
15
+
16
+ - 事前学習済みモデル (ImageNet/BERT) をファインチューニングするとき
17
+ - 小規模科学データセットで高精度を実現したいとき
18
+ - ドメイン適応で異なるデータ分布間のギャップを埋めるとき
19
+ - Few-shot 学習で数例から分類するとき
20
+ - 知識蒸留で大規模モデルを軽量化するとき
21
+ - マルチタスク学習で複数タスクを共同学習するとき
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ## 1. Vision モデルファインチューニング
28
+
29
+ ```python
30
+ import torch
31
+ import torch.nn as nn
32
+ from torch.utils.data import DataLoader
33
+ import numpy as np
34
+
35
+
36
def finetune_vision_model(train_loader, val_loader,
                          model_name="resnet50",
                          num_classes=10, epochs=20,
                          lr=1e-4, freeze_backbone=True):
    """
    Fine-tune a pretrained torchvision classifier on a new label set.

    The model's classification head is replaced with a fresh ``nn.Linear``
    sized for ``num_classes`` and trained with AdamW, cross-entropy loss and
    a cosine-annealed learning rate. The returned model holds the weights
    from the final epoch (not the best-validation epoch).

    Parameters:
        train_loader: DataLoader — training batches of (inputs, labels)
        val_loader: DataLoader — validation batches of (inputs, labels)
        model_name: str — "resnet50" / "vit_b_16" / "efficientnet_b0"
        num_classes: int — number of target classes
        epochs: int — number of epochs
        lr: float — learning rate
        freeze_backbone: bool — train only the classification head

    Returns:
        (model, history) — the fine-tuned model and a list of per-epoch
        dicts with keys "epoch", "train_loss", "val_acc".

    Raises:
        ValueError: if the model has none of the ``fc`` / ``classifier`` /
            ``heads`` attributes this function knows how to replace.
    """
    import torchvision.models as models

    # Load the model with its default pretrained weights. The builders
    # accept the string "DEFAULT", so the weight-enum class name does not
    # need to be derived from model_name.
    model_fn = getattr(models, model_name)
    model = model_fn(weights="DEFAULT")

    # Replace the final classification layer.
    if hasattr(model, "fc"):
        head_attr = "fc"
        model.fc = nn.Linear(model.fc.in_features, num_classes)
    elif hasattr(model, "classifier"):
        head_attr = "classifier"
        if isinstance(model.classifier, nn.Sequential):
            in_features = model.classifier[-1].in_features
            model.classifier[-1] = nn.Linear(in_features, num_classes)
        else:
            in_features = model.classifier.in_features
            model.classifier = nn.Linear(in_features, num_classes)
    elif hasattr(model, "heads"):
        head_attr = "heads"
        in_features = model.heads.head.in_features
        model.heads.head = nn.Linear(in_features, num_classes)
    else:
        raise ValueError(
            f"Unsupported architecture '{model_name}': no fc/classifier/heads "
            f"attribute to replace")

    # Freeze by module, not by parameter-name substring: backbone layers
    # whose names merely contain "fc" must not stay trainable.
    if freeze_backbone:
        for param in model.parameters():
            param.requires_grad = False
        for param in getattr(model, head_attr).parameters():
            param.requires_grad = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = torch.optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    best_acc = 0.0
    history = []

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        scheduler.step()

        # Validation
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                _, predicted = outputs.max(1)
                total += y_batch.size(0)
                correct += predicted.eq(y_batch).sum().item()

        # Guard against empty loaders instead of dividing by zero.
        val_acc = correct / total if total else 0.0
        n_batches = len(train_loader)
        history.append({"epoch": epoch,
                        "train_loss": train_loss / n_batches if n_batches else 0.0,
                        "val_acc": val_acc})
        if val_acc > best_acc:
            best_acc = val_acc

    print(f"Finetune {model_name}: best val acc = {best_acc:.4f}")
    return model, history
127
+ ```
128
+
129
+ ## 2. NLP モデルファインチューニング
130
+
131
+ ```python
132
def finetune_text_classifier(train_texts, train_labels,
                             val_texts, val_labels,
                             model_name="dmis-lab/biobert-base-cased-v1.2",
                             num_labels=2, epochs=5, lr=2e-5):
    """
    Fine-tune a BERT/BioBERT sequence classifier with the HuggingFace Trainer.

    Texts are tokenized with truncation and padded to a fixed length of 512.
    Evaluation and checkpointing happen once per epoch, and the checkpoint
    with the best "accuracy" is loaded at the end of training. Checkpoints
    are written under ``./ft_output``.

    Parameters:
        train_texts: list[str] — training texts
        train_labels: list[int] — training labels
        val_texts: list[str] — validation texts
        val_labels: list[int] — validation labels
        model_name: str — HuggingFace model identifier
        num_labels: int — number of labels
        epochs: int — number of epochs
        lr: float — learning rate

    Returns:
        (model, tokenizer, metrics) — ``metrics`` is the dict returned by
        ``trainer.evaluate()``.
    """
    from transformers import (
        AutoTokenizer, AutoModelForSequenceClassification,
        TrainingArguments, Trainer)
    from datasets import Dataset

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels)

    def encode(batch):
        return tokenizer(batch["text"], truncation=True,
                         padding="max_length", max_length=512)

    def accuracy_metric(eval_pred):
        hits = np.argmax(eval_pred.predictions, axis=-1) == eval_pred.label_ids
        return {"accuracy": hits.mean()}

    raw_train = Dataset.from_dict({"text": train_texts, "label": train_labels})
    raw_val = Dataset.from_dict({"text": val_texts, "label": val_labels})
    encoded_train = raw_train.map(encode, batched=True)
    encoded_val = raw_val.map(encode, batched=True)

    training_args = TrainingArguments(
        output_dir="./ft_output",
        num_train_epochs=epochs,
        per_device_train_batch_size=16,
        learning_rate=lr,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_train,
        eval_dataset=encoded_val,
        compute_metrics=accuracy_metric)
    trainer.train()

    metrics = trainer.evaluate()
    print(f"Finetune {model_name}: val acc = {metrics['eval_accuracy']:.4f}")
    return model, tokenizer, metrics
185
+ ```
186
+
187
+ ## 3. Few-shot 学習
188
+
189
+ ```python
190
def prototypical_network(support_X, support_y, query_X,
                         feature_extractor=None):
    """
    Prototypical-network style few-shot classification.

    Each class prototype is the mean embedding of that class's support
    samples. Every query is assigned the class of the nearest prototype by
    Euclidean distance.

    Parameters:
        support_X: array-like — support-set features, shape (n_support, d)
            (or raw inputs when ``feature_extractor`` is given)
        support_y: array-like — support labels, length n_support
        query_X: array-like — query-set features, shape (n_query, d)
        feature_extractor: callable | None — maps inputs to 2-D embeddings;
            when None the inputs are used as embeddings directly

    Returns:
        (predictions, confidences) — predicted label per query, and
        ``exp(-distance)`` to the nearest prototype (1.0 means the query
        sits exactly on a prototype; this is not a normalized probability).

    Raises:
        ValueError: if the embeddings are not 2-D, the support set is empty,
            the number of labels differs from the number of support
            samples, or the query/support feature dimensions differ.
    """
    if feature_extractor is not None:
        support_emb = feature_extractor(support_X)
        query_emb = feature_extractor(query_X)
    else:
        support_emb = support_X
        query_emb = query_X

    # Accept plain lists as well as arrays; boolean masking below needs
    # real ndarrays.
    support_emb = np.asarray(support_emb)
    query_emb = np.asarray(query_emb)
    labels = np.asarray(support_y)

    if support_emb.ndim != 2 or query_emb.ndim != 2:
        raise ValueError("support and query embeddings must be 2-D "
                         "(n_samples, n_features)")
    if support_emb.shape[0] == 0:
        raise ValueError("support set is empty")
    if labels.shape != (support_emb.shape[0],):
        raise ValueError("support_y must hold one label per support sample")
    if query_emb.shape[1] != support_emb.shape[1]:
        raise ValueError("query and support feature dimensions differ")

    classes = np.unique(labels)
    prototypes = np.array([
        support_emb[labels == c].mean(axis=0) for c in classes])

    # Euclidean distance from every query to every prototype:
    # shape (n_query, n_classes) after the transpose.
    dists = np.array([
        np.linalg.norm(query_emb - p, axis=1) for p in prototypes]).T

    predictions = classes[np.argmin(dists, axis=1)]
    confidences = np.exp(-dists.min(axis=1))

    print(f"Few-shot: {len(classes)} classes, "
          f"{labels.shape[0]} support → {query_emb.shape[0]} query")
    return predictions, confidences
222
+ ```
223
+
224
+ ## 4. 知識蒸留
225
+
226
+ ```python
227
def knowledge_distillation(teacher, student, train_loader,
                           epochs=20, temperature=4.0, alpha=0.7,
                           lr=1e-3):
    """
    Distil a teacher network into a student network.

    The per-batch loss is ``alpha * soft + (1 - alpha) * hard``. ``soft`` is
    the KL divergence between temperature-softened student and teacher
    distributions, scaled by ``temperature ** 2``. ``hard`` is the
    cross-entropy of the student logits against the labels. The teacher is
    put in eval mode and is only evaluated under ``torch.no_grad()``.

    Parameters:
        teacher: nn.Module — teacher model (not updated)
        student: nn.Module — student model (trained with AdamW)
        train_loader: DataLoader — training batches of (inputs, labels)
        epochs: int — number of epochs
        temperature: float — distillation temperature
        alpha: float — weight of the soft loss
        lr: float — learning rate

    Returns:
        nn.Module — the trained student.
    """
    F = nn.functional

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    teacher = teacher.to(device).eval()
    student = student.to(device)

    optimizer = torch.optim.AdamW(student.parameters(), lr=lr)
    temp_sq = temperature ** 2

    for epoch in range(epochs):
        student.train()
        running = 0.0
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Teacher targets carry no gradient.
            with torch.no_grad():
                soft_targets = F.softmax(teacher(inputs) / temperature, dim=1)

            logits = student(inputs)
            distill = F.kl_div(
                F.log_softmax(logits / temperature, dim=1),
                soft_targets,
                reduction="batchmean") * temp_sq
            supervised = F.cross_entropy(logits, targets)
            loss = alpha * distill + (1 - alpha) * supervised

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running += loss.item()

        print(f" Epoch {epoch}: loss = {running / len(train_loader):.4f}")

    print(f"Distillation: T={temperature}, α={alpha}, {epochs} epochs")
    return student
278
+ ```
279
+
280
+ ---
281
+
282
+ ## パイプライン統合
283
+
284
+ ```
285
+ deep-learning → transfer-learning → active-learning
286
+ (モデル設計) (転移・適応) (効率的ラベル付け)
287
+ │ │ ↓
288
+ healthcare-ai ───────┘ ensemble-methods
289
+ (臨床 AI) (アンサンブル)
290
+ ```
291
+
292
+ ## パイプライン出力
293
+
294
+ | ファイル | 説明 | 次スキル |
295
+ |---------|------|---------|
296
+ | `ft_model.pt` | ファインチューニング済みモデル | → 推論 |
297
+ | `ft_history.csv` | 学習履歴 | → visualization |
298
+ | `few_shot_predictions.csv` | Few-shot 予測 | → 評価 |