@nahisaho/satori 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENCE +0 -0
- package/README.md +191 -0
- package/bin/satori.js +95 -0
- package/package.json +29 -0
- package/src/.github/skills/scientific-academic-writing/SKILL.md +361 -0
- package/src/.github/skills/scientific-academic-writing/assets/acs_article.md +199 -0
- package/src/.github/skills/scientific-academic-writing/assets/elsevier_article.md +244 -0
- package/src/.github/skills/scientific-academic-writing/assets/ieee_transactions.md +212 -0
- package/src/.github/skills/scientific-academic-writing/assets/imrad_standard.md +181 -0
- package/src/.github/skills/scientific-academic-writing/assets/nature_article.md +179 -0
- package/src/.github/skills/scientific-academic-writing/assets/qiita_technical_article.md +385 -0
- package/src/.github/skills/scientific-academic-writing/assets/science_research_article.md +169 -0
- package/src/.github/skills/scientific-bioinformatics/SKILL.md +220 -0
- package/src/.github/skills/scientific-biosignal-processing/SKILL.md +357 -0
- package/src/.github/skills/scientific-causal-inference/SKILL.md +347 -0
- package/src/.github/skills/scientific-cheminformatics/SKILL.md +196 -0
- package/src/.github/skills/scientific-data-preprocessing/SKILL.md +413 -0
- package/src/.github/skills/scientific-data-simulation/SKILL.md +244 -0
- package/src/.github/skills/scientific-doe/SKILL.md +360 -0
- package/src/.github/skills/scientific-eda-correlation/SKILL.md +141 -0
- package/src/.github/skills/scientific-feature-importance/SKILL.md +208 -0
- package/src/.github/skills/scientific-image-analysis/SKILL.md +310 -0
- package/src/.github/skills/scientific-materials-characterization/SKILL.md +368 -0
- package/src/.github/skills/scientific-meta-analysis/SKILL.md +352 -0
- package/src/.github/skills/scientific-metabolomics/SKILL.md +326 -0
- package/src/.github/skills/scientific-ml-classification/SKILL.md +265 -0
- package/src/.github/skills/scientific-ml-regression/SKILL.md +215 -0
- package/src/.github/skills/scientific-multi-omics/SKILL.md +303 -0
- package/src/.github/skills/scientific-network-analysis/SKILL.md +257 -0
- package/src/.github/skills/scientific-pca-tsne/SKILL.md +235 -0
- package/src/.github/skills/scientific-pipeline-scaffold/SKILL.md +331 -0
- package/src/.github/skills/scientific-process-optimization/SKILL.md +215 -0
- package/src/.github/skills/scientific-publication-figures/SKILL.md +208 -0
- package/src/.github/skills/scientific-sequence-analysis/SKILL.md +389 -0
- package/src/.github/skills/scientific-spectral-signal/SKILL.md +227 -0
- package/src/.github/skills/scientific-statistical-testing/SKILL.md +240 -0
- package/src/.github/skills/scientific-survival-clinical/SKILL.md +239 -0
- package/src/.github/skills/scientific-time-series/SKILL.md +291 -0
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-sequence-analysis
|
|
3
|
+
description: |
|
|
4
|
+
ゲノム配列解析スキル。コドン使用頻度(RSCU/CAI)、ペアワイズアラインメント
|
|
5
|
+
(Needleman-Wunsch/Smith-Waterman)、系統解析(Jukes-Cantor/UPGMA/ブートストラップ)、
|
|
6
|
+
ORF 探索、CpG 島検出、制限酵素マッピング、タンパク質特性(pI/GRAVY/疎水性プロファイル)
|
|
7
|
+
の解析テンプレート。Scientific Skills Exp-09 で確立したパターン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Sequence Analysis
|
|
11
|
+
|
|
12
|
+
DNA / RNA / タンパク質の配列解析パイプライン。bioinformatics スキルが scRNA-seq や
|
|
13
|
+
バルク RNA-seq のオミクスレベル解析を扱うのに対し、本スキルは個々の配列レベル
|
|
14
|
+
(分子生物学レベル)の解析に特化する。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- DNA / タンパク質配列の特性解析が必要なとき
|
|
19
|
+
- コドン使用頻度・バイアスを分析するとき
|
|
20
|
+
- ペアワイズ/マルチプルアラインメント、系統樹を作成するとき
|
|
21
|
+
- ORF 探索、CpG 島検出、制限酵素マッピングが必要なとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. 塩基組成解析
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from collections import Counter
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
def sequence_composition(sequence, seq_type="dna"):
    """
    Compute the character composition of a nucleotide or protein sequence.

    Args:
        sequence: Sequence string; it is upper-cased before counting.
        seq_type: "dna" adds GC and AT content, "rna" adds GC content only.
            Any other value returns just the generic composition.

    Returns:
        dict with "length", "composition" (Counter of characters) and
        "frequency_pct" (percentage per character), plus "gc_content_pct"
        for DNA/RNA and "at_content_pct" for DNA. For an empty sequence the
        frequencies are empty and the content percentages are 0.0.
    """
    seq = sequence.upper()
    counts = Counter(seq)
    total = len(seq)

    def pct(n):
        # An empty sequence would otherwise raise ZeroDivisionError.
        return n / total * 100 if total else 0.0

    result = {
        "length": total,
        "composition": counts,
        "frequency_pct": {k: pct(v) for k, v in counts.items()},
    }

    if seq_type in ("dna", "rna"):
        result["gc_content_pct"] = pct(counts.get("G", 0) + counts.get("C", 0))
    if seq_type == "dna":
        result["at_content_pct"] = pct(counts.get("A", 0) + counts.get("T", 0))

    return result
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## 2. コドン使用頻度(RSCU / CAI)
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from Bio.Data import CodonTable
|
|
64
|
+
|
|
65
|
+
# Standard genetic code, DNA alphabet, "*" marks a stop codon.
# The table is generated instead of spelled out. Keys are produced grouped by
# second base, then first base, then third base (each in T, C, A, G order),
# and _CODON_AMINO_ACIDS lists the translations in exactly that order.
_CODON_BASES = "TCAG"
_CODON_AMINO_ACIDS = (
    "FFLL" "LLLL" "IIIM" "VVVV"   # second base T
    "SSSS" "PPPP" "TTTT" "AAAA"   # second base C
    "YY**" "HHQQ" "NNKK" "DDEE"   # second base A
    "CC*W" "RRRR" "SSRR" "GGGG"   # second base G
)
CODON_TABLE = {
    first + second + third: _CODON_AMINO_ACIDS[16 * i2 + 4 * i1 + i3]
    for i2, second in enumerate(_CODON_BASES)
    for i1, first in enumerate(_CODON_BASES)
    for i3, third in enumerate(_CODON_BASES)
}
|
|
83
|
+
|
|
84
|
+
def compute_rscu(cds_sequence):
    """
    Compute Relative Synonymous Codon Usage (RSCU) for a coding sequence.

    RSCU = observed / expected, where expected is the total count of the
    amino acid divided by the number of its synonymous codons.
    RSCU = 1 means even usage, > 1 preferred, < 1 under-used.

    The sequence is upper-cased, "U" is read as "T", and codons are taken in
    frame from position 0 (a trailing partial codon is ignored).

    Returns:
        dict mapping every non-stop codon in CODON_TABLE to its RSCU;
        codons of amino acids absent from the sequence map to 0.
    """
    normalized = cds_sequence.upper().replace("U", "T")
    usage = Counter(
        normalized[pos:pos + 3] for pos in range(0, len(normalized) - 2, 3)
    )

    # Group synonymous codons by amino acid, leaving out stop codons.
    families = {}
    for triplet, residue in CODON_TABLE.items():
        if residue != "*":
            families.setdefault(residue, []).append(triplet)

    rscu = {}
    for members in families.values():
        family_total = sum(usage.get(t, 0) for t in members)
        expected = family_total / len(members)
        for triplet in members:
            rscu[triplet] = usage.get(triplet, 0) / expected if expected > 0 else 0

    return rscu
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def codon_adaptation_index(cds_sequence, reference_rscu):
    """
    Compute the Codon Adaptation Index (CAI) of a coding sequence.

    CAI = exp(mean(ln w_i)) with w_i = RSCU_i / RSCU_max, where RSCU_max is
    the largest reference RSCU within the codon's synonymous family.

    Args:
        cds_sequence: Coding sequence; upper-cased, "U" read as "T", codons
            taken in frame from position 0.
        reference_rscu: dict codon -> RSCU of the reference gene set; codons
            missing from it count as 0.

    Returns:
        The CAI, or 0 when no codon contributes. Stop codons, triplets not in
        CODON_TABLE and codons with w == 0 are left out of the mean (ln(0)
        is undefined).
    """
    seq = cds_sequence.upper().replace("U", "T")
    codons = [seq[i:i + 3] for i in range(0, len(seq) - 2, 3)]

    families = {}
    for codon, aa in CODON_TABLE.items():
        if aa != "*":
            families.setdefault(aa, []).append(codon)

    # Relative adaptiveness w per codon.
    relative_adaptiveness = {}
    for synonymous in families.values():
        family_max = max(reference_rscu.get(c, 0) for c in synonymous)
        if family_max <= 0:
            # The reference never uses this amino acid. Dividing by the zero
            # maximum would raise ZeroDivisionError, so its codons simply do
            # not contribute.
            continue
        for c in synonymous:
            relative_adaptiveness[c] = reference_rscu.get(c, 0) / family_max

    log_w = [
        np.log(w)
        for w in (relative_adaptiveness.get(c, 0) for c in codons)
        if w > 0
    ]
    return np.exp(np.mean(log_w)) if log_w else 0
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## 3. ペアワイズアラインメント
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
def needleman_wunsch(seq1, seq2, match=2, mismatch=-1, gap=-2):
    """
    Globally align two sequences with Needleman-Wunsch dynamic programming.

    A linear gap penalty is used. When several moves score equally the
    preference is diagonal, then up (gap in seq2), then left (gap in seq1).

    Args:
        seq1, seq2: Sequences to align (either may be empty).
        match: Score for identical characters.
        mismatch: Score for differing characters.
        gap: Score added per gap position.

    Returns:
        aligned_seq1, aligned_seq2, score -- gaps are written as "-" and
        score is the bottom-right DP cell (a NumPy float).
    """
    n, m = len(seq1), len(seq2)
    dp = np.zeros((n + 1, m + 1))
    traceback = np.zeros((n + 1, m + 1), dtype=int)  # 0=diag, 1=up, 2=left

    # Border cells are reachable only through gaps. Recording that move keeps
    # the traceback from reading the zero-initialised border as "diagonal"
    # and walking to negative indices.
    for i in range(1, n + 1):
        dp[i, 0] = i * gap
        traceback[i, 0] = 1
    for j in range(1, m + 1):
        dp[0, j] = j * gap
        traceback[0, j] = 2

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            s = match if seq1[i - 1] == seq2[j - 1] else mismatch
            scores = (dp[i - 1, j - 1] + s, dp[i - 1, j] + gap, dp[i, j - 1] + gap)
            best = int(np.argmax(scores))  # first maximum wins
            dp[i, j] = scores[best]
            traceback[i, j] = best

    # Walk back from the bottom-right corner, building both rows reversed.
    a1, a2 = [], []
    i, j = n, m
    while i > 0 or j > 0:
        move = traceback[i, j]
        if move == 0:
            a1.append(seq1[i - 1])
            a2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif move == 1:
            a1.append(seq1[i - 1])
            a2.append("-")
            i -= 1
        else:
            a1.append("-")
            a2.append(seq2[j - 1])
            j -= 1

    return "".join(reversed(a1)), "".join(reversed(a2)), dp[n][m]
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## 4. 系統解析
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
def jukes_cantor_distance(seq1, seq2):
    """
    Jukes-Cantor distance between two aligned sequences: d = -3/4 ln(1 - 4p/3).

    p is the fraction of differing positions among the columns where neither
    sequence has a gap ("-"). Only the overlapping prefix of the two
    sequences is compared. Returns infinity once p reaches 0.75, and p is
    taken as 0 when there is no gap-free column.
    """
    compared = 0
    differing = 0
    for left, right in zip(seq1, seq2):
        if left == "-" or right == "-":
            continue
        compared += 1
        if left != right:
            differing += 1

    if compared > 0:
        p = differing / compared
    else:
        p = 0

    if p >= 0.75:
        return float("inf")
    return -0.75 * np.log(1 - 4 * p / 3)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def upgma_tree(distance_matrix, names):
    """
    Build a UPGMA (Unweighted Pair Group Method with Arithmetic Mean) tree.

    Args:
        distance_matrix: dict of dicts keyed by the integer positions of
            `names` (distance_matrix[i][j] is the distance between names[i]
            and names[j]). It is not modified.
        names: Leaf labels; an empty list raises IndexError.

    Returns:
        Newick string terminated by ";". Each branch length is the height of
        the parent node minus the height of the child, formatted with four
        decimals.
    """
    n = len(names)
    # Copy the inner rows as well: distances to new clusters are written into
    # them, and a shallow copy would leak those writes into the caller's data.
    dm = {key: dict(row) for key, row in distance_matrix.items()}
    clusters = {i: names[i] for i in range(n)}
    sizes = {i: 1 for i in range(n)}
    heights = {i: 0.0 for i in range(n)}  # leaves sit at height 0

    while len(clusters) > 1:
        keys = list(clusters)
        # Start from the first pair so a valid pair is always selected, even
        # when every remaining distance is infinite.
        merge_i, merge_j = keys[0], keys[1]
        min_dist = dm[merge_i][merge_j]
        for a, key_a in enumerate(keys):
            for key_b in keys[a + 1:]:
                if dm[key_a][key_b] < min_dist:
                    min_dist = dm[key_a][key_b]
                    merge_i, merge_j = key_a, key_b

        height = min_dist / 2
        # Subtract the child's own height so internal children do not get
        # their depth counted twice.
        new_name = (
            f"({clusters[merge_i]}:{height - heights[merge_i]:.4f},"
            f"{clusters[merge_j]}:{height - heights[merge_j]:.4f})"
        )
        new_id = max(clusters) + 1
        new_size = sizes[merge_i] + sizes[merge_j]

        del clusters[merge_i]
        del clusters[merge_j]

        # Distance from the new cluster to each survivor: size-weighted mean
        # of the distances from the two merged clusters.
        new_row = {}
        for k in clusters:
            d = (sizes[merge_i] * dm[merge_i].get(k, 0) +
                 sizes[merge_j] * dm[merge_j].get(k, 0)) / new_size
            new_row[k] = d
            dm.setdefault(k, {})[new_id] = d
        dm[new_id] = new_row

        clusters[new_id] = new_name
        sizes[new_id] = new_size
        heights[new_id] = height

    return list(clusters.values())[0] + ";"
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
## 5. ORF 探索
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
def find_orfs(sequence, min_length_aa=100):
    """
    Search all six reading frames for open reading frames (ORFs).

    An ORF starts at an ATG and is translated until the first stop codon or
    the end of the strand. After an ATG has been scanned, the search resumes
    past the end of that translation, so nested start codons are not
    reported separately. Triplets missing from CODON_TABLE translate to "X".

    Args:
        sequence: DNA sequence (upper-cased internally).
        min_length_aa: Minimum number of translated residues to report.

    Returns:
        list of dict, longest first, with keys "strand" ("+" or "-"),
        "frame" (1-3), "start", "end", "length_aa" and "protein_seq".
        "start"/"end" are 1-based positions on the scanned strand, so for
        "-" they refer to the reverse complement. "end" includes the stop
        codon when one was found, otherwise it is the last translated base.
        "protein_seq" holds at most the first 50 residues, followed by "..."
        when the protein is longer.
    """
    seq = sequence.upper()
    reverse_comp = seq.translate(str.maketrans("ATGC", "TACG"))[::-1]

    orfs = []
    for strand, s in (("+", seq), ("-", reverse_comp)):
        for frame in range(3):
            i = frame
            while i < len(s) - 2:
                if s[i:i + 3] != "ATG":
                    i += 3
                    continue

                # ORF start: translate up to the next stop codon.
                protein = []
                found_stop = False
                j = i
                while j < len(s) - 2:
                    aa = CODON_TABLE.get(s[j:j + 3], "X")
                    if aa == "*":
                        found_stop = True
                        break
                    protein.append(aa)
                    j += 3

                if len(protein) >= min_length_aa:
                    preview = "".join(protein[:50])
                    if len(protein) > 50:
                        preview += "..."
                    orfs.append({
                        "strand": strand,
                        "frame": frame + 1,
                        "start": i + 1,
                        # With a stop codon, j is its 0-based start and j + 3
                        # its 1-based end. Without one, translation stopped at
                        # j, which is already the 1-based end of the last codon.
                        "end": j + 3 if found_stop else j,
                        "length_aa": len(protein),
                        "protein_seq": preview,
                    })
                i = j + 3

    return sorted(orfs, key=lambda x: x["length_aa"], reverse=True)
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## 6. CpG 島検出
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
def detect_cpg_islands(sequence, window=200, step=1,
                       min_gc=0.50, min_obs_exp=0.60, min_length=200):
    """
    Detect CpG islands with a sliding window.

    Criteria (Gardiner-Garden & Frommer, 1987), applied to each window:
      - GC content >= min_gc (default 50%)
      - CpG observed/expected >= min_obs_exp (default 0.60), where
        observed/expected = count(CG) * window / (count(C) * count(G))
    Consecutive passing windows are merged into one island, which is kept
    when its length is >= min_length (default 200 bp).

    Args:
        sequence: DNA sequence (upper-cased internally).
        window: Window size in bases.
        step: Offset between successive windows.

    Returns:
        list of dict with 1-based inclusive "start" and "end" and "length",
        covering the first through the last passing window of each run.
        Sequences shorter than one window yield an empty list.
    """
    seq = sequence.upper()
    islands = []

    def record(run_start, run_last):
        # The island spans from the first passing window's start to the
        # last passing window's end.
        end = run_last + window
        length = end - run_start
        if length >= min_length:
            islands.append({"start": run_start + 1, "end": end,
                            "length": length})

    run_start = None  # 0-based start of the first window in the open run
    run_last = None   # 0-based start of the latest passing window

    # "+ 1" so that the final full window is evaluated too.
    for i in range(0, len(seq) - window + 1, step):
        w = seq[i:i + window]
        c_count = w.count("C")
        g_count = w.count("G")
        gc = (c_count + g_count) / window
        if c_count and g_count:
            obs_exp = w.count("CG") * window / (c_count * g_count)
        else:
            obs_exp = 0

        if gc >= min_gc and obs_exp >= min_obs_exp:
            if run_start is None:
                run_start = i
            run_last = i
        elif run_start is not None:
            record(run_start, run_last)
            run_start = None

    # An island that reaches the end of the sequence is still open here.
    if run_start is not None:
        record(run_start, run_last)

    return islands
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## 7. タンパク質特性
|
|
343
|
+
|
|
344
|
+
```python
|
|
345
|
+
# Molecular weight of each free amino acid in Da, keyed by one-letter code.
AMINO_ACID_MW = {
    "A": 89.09, "R": 174.20, "N": 132.12, "D": 133.10, "C": 121.16,
    "E": 147.13, "Q": 146.15, "G": 75.07, "H": 155.16, "I": 131.17,
    "L": 131.17, "K": 146.19, "M": 149.21, "F": 165.19, "P": 115.13,
    "S": 105.09, "T": 119.12, "W": 204.23, "Y": 181.19, "V": 117.15,
}

# Kyte-Doolittle hydropathy index per residue (positive = hydrophobic).
KYTE_DOOLITTLE = {
    "A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5,
    "E": -3.5, "Q": -3.5, "G": -0.4, "H": -3.2, "I": 4.5,
    "L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6,
    "S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2,
}


def protein_properties(protein_seq):
    """
    Compute basic physicochemical properties of a protein sequence.

    Args:
        protein_seq: One-letter amino-acid sequence (upper-cased internally).
            Letters missing from the lookup tables contribute 0 to the
            weight and to the hydropathy average.

    Returns:
        dict with "length_aa", "molecular_weight_Da" (sum of residue weights
        minus one water, 18.015 Da, per peptide bond), "gravy" (mean
        Kyte-Doolittle index) and "hydrophobic_pct" (percentage of residues
        with a positive index). All numeric values are 0.0 for an empty
        sequence.
    """
    seq = protein_seq.upper()
    if not seq:
        # Without this guard the formulas give +18.015 Da, a NaN mean and a
        # ZeroDivisionError.
        return {
            "length_aa": 0,
            "molecular_weight_Da": 0.0,
            "gravy": 0.0,
            "hydrophobic_pct": 0.0,
        }

    hydropathy = [KYTE_DOOLITTLE.get(aa, 0) for aa in seq]
    mw = sum(AMINO_ACID_MW.get(aa, 0) for aa in seq) - 18.015 * (len(seq) - 1)

    return {
        "length_aa": len(seq),
        "molecular_weight_Da": mw,
        "gravy": np.mean(hydropathy),
        "hydrophobic_pct": sum(1 for h in hydropathy if h > 0) / len(seq) * 100,
    }
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
## References
|
|
374
|
+
|
|
375
|
+
### Output Files
|
|
376
|
+
|
|
377
|
+
| ファイル | 形式 |
|
|
378
|
+
|---|---|
|
|
379
|
+
| `results/sequence_composition.csv` | CSV |
|
|
380
|
+
| `results/rscu_analysis.csv` | CSV |
|
|
381
|
+
| `results/orf_predictions.csv` | CSV |
|
|
382
|
+
| `results/cpg_islands.csv` | CSV |
|
|
383
|
+
| `figures/codon_usage_heatmap.png` | PNG |
|
|
384
|
+
| `figures/phylogenetic_tree.png` | PNG |
|
|
385
|
+
| `figures/hydrophobicity_profile.png` | PNG |
|
|
386
|
+
|
|
387
|
+
#### 参照実験
|
|
388
|
+
|
|
389
|
+
- **Exp-09**: コドン使用頻度、ペアワイズアラインメント、系統解析、ORF 探索、CpG 島検出
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-spectral-signal
|
|
3
|
+
description: |
|
|
4
|
+
分光スペクトルおよび生体信号の前処理・解析スキル。ベースライン補正、フィルタリング、
|
|
5
|
+
ピーク検出、帯域パワー解析を行う際に使用。
|
|
6
|
+
Scientific Skills Exp-08(ECG/EEG)、Exp-11(ラマン分光)で確立したパターン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Spectral & Signal Processing
|
|
10
|
+
|
|
11
|
+
分光スペクトル(ラマン、IR、UV-Vis など)および生体信号(ECG, EEG)の
|
|
12
|
+
前処理・解析パイプラインスキル。ノイズ除去、ベースライン補正、ピーク検出、
|
|
13
|
+
周波数解析の標準ワークフローを提供する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- スペクトルデータの前処理(ベースライン補正、平滑化、正規化)
|
|
18
|
+
- 時系列信号のフィルタリング(バンドパス、ノッチ)
|
|
19
|
+
- ピーク検出と定量化
|
|
20
|
+
- 周波数帯域パワー解析(EEG δ/θ/α/β/γ)
|
|
21
|
+
- スペクトル類似度解析と分類
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
## スペクトル前処理パイプライン
|
|
26
|
+
|
|
27
|
+
### 1. ALS ベースライン補正(Exp-11)
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
from scipy import sparse
|
|
32
|
+
from scipy.sparse.linalg import spsolve
|
|
33
|
+
|
|
34
|
+
def baseline_correction_als(y, lam=1e5, p=0.01, niter=10):
    """
    Estimate a baseline with Asymmetric Least Squares (ALS).

    Used to remove fluorescence background. Each iteration solves a weighted
    least-squares problem with a second-difference smoothness penalty, then
    reweights the points: samples above the current baseline get weight p,
    samples below it get 1 - p.

    Args:
        y: 1-D signal.
        lam: Smoothing parameter (larger gives a smoother baseline).
        p: Asymmetry parameter (smaller keeps the baseline closer to the
            lower envelope of the signal).
        niter: Number of reweighting iterations; must be at least 1.

    Returns:
        The estimated baseline, same length as y.

    Raises:
        ValueError: If niter is smaller than 1.
    """
    if niter < 1:
        raise ValueError("niter must be at least 1")

    L = len(y)
    D = sparse.diags([1, -2, 1], [0, -1, -2], shape=(L, L - 2))
    # The smoothness penalty does not depend on the weights, so build it once
    # instead of once per iteration.
    penalty = lam * D.dot(D.transpose())
    w = np.ones(L)
    for _ in range(niter):
        W = sparse.diags(w)
        z = spsolve(W + penalty, w * y)
        w = p * (y > z) + (1 - p) * (y < z)
    return z
|
|
50
|
+
|
|
51
|
+
def remove_baseline(spectrum, wavenumbers, lam=1e5, p=0.01):
    """
    Return the spectrum with its ALS-estimated baseline subtracted.

    `wavenumbers` is accepted but not used by this function; `lam` and `p`
    are forwarded to baseline_correction_als.
    """
    als_options = {"lam": lam, "p": p}
    estimated_baseline = baseline_correction_als(spectrum, **als_options)
    corrected = spectrum - estimated_baseline
    return corrected
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### 2. Savitzky-Golay 平滑化
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from scipy.signal import savgol_filter
|
|
61
|
+
|
|
62
|
+
def smooth_spectrum(spectrum, window_length=11, polyorder=3):
    """
    Smooth a spectrum with a Savitzky-Golay filter.

    `window_length` and `polyorder` are passed straight to
    scipy.signal.savgol_filter.
    """
    filter_settings = {"window_length": window_length, "polyorder": polyorder}
    smoothed = savgol_filter(spectrum, **filter_settings)
    return smoothed
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### 3. 正規化
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
def normalize_spectrum(spectrum, method="minmax"):
    """
    Normalize a spectrum.

    Args:
        spectrum: 1-D intensities. Plain lists and tuples are converted to a
            NumPy array; other inputs are used as given.
        method: "minmax" (scale to the 0-1 range), "snv" (Standard Normal
            Variate: subtract the mean, divide by the standard deviation) or
            "area" (divide by the trapezoidal integral of the absolute
            values, unit sample spacing).

    Returns:
        The normalized spectrum. A 1e-10 offset in each denominator avoids
        division by zero on flat input.

    Raises:
        ValueError: If `method` is not one of the names above.
    """
    if isinstance(spectrum, (list, tuple)):
        spectrum = np.asarray(spectrum, dtype=float)

    if method == "minmax":
        lowest = spectrum.min()
        return (spectrum - lowest) / (spectrum.max() - lowest + 1e-10)
    if method == "snv":
        return (spectrum - spectrum.mean()) / (spectrum.std() + 1e-10)
    if method == "area":
        # NumPy 2.0 renamed trapz to trapezoid and deprecated the old name;
        # prefer the new one when it exists.
        integrate = getattr(np, "trapezoid", None) or np.trapz
        return spectrum / (integrate(np.abs(spectrum)) + 1e-10)
    raise ValueError(f"Unknown method: {method}")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### 4. ピーク検出
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from scipy.signal import find_peaks
|
|
89
|
+
|
|
90
|
+
def detect_peaks(spectrum, wavenumbers=None, height=None,
                 prominence=0.05, distance=10, width=None):
    """
    Detect peaks in a spectrum with scipy.signal.find_peaks.

    Args:
        spectrum: 1-D intensities.
        wavenumbers: Optional axis values aligned with `spectrum`; a list or
            tuple is converted to a NumPy array so it can be indexed by the
            peak indices.
        height, prominence, distance, width: Forwarded to find_peaks.

    Returns:
        peaks, peak_positions, properties -- the peak indices, the axis
        values at those indices (the indices themselves when `wavenumbers`
        is None), and the properties dict returned by find_peaks.
    """
    peaks, properties = find_peaks(
        spectrum, height=height, prominence=prominence,
        distance=distance, width=width
    )

    if wavenumbers is None:
        peak_positions = peaks
    else:
        if isinstance(wavenumbers, (list, tuple)):
            # Python sequences cannot be indexed with an index array.
            wavenumbers = np.asarray(wavenumbers)
        peak_positions = wavenumbers[peaks]

    return peaks, peak_positions, properties
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## 生体信号処理パイプライン
|
|
110
|
+
|
|
111
|
+
### 5. バンドパス/ノッチフィルタ(Exp-08)
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from scipy.signal import butter, sosfilt, iirnotch
|
|
115
|
+
|
|
116
|
+
def bandpass_filter(signal, fs, lowcut, highcut, order=4):
    """
    Apply a Butterworth band-pass filter.

    The cut-offs are normalized by the Nyquist frequency (fs / 2), the filter
    is designed as second-order sections, and it is applied with sosfilt.
    """
    nyquist = fs * 0.5
    normalized_edges = [edge / nyquist for edge in (lowcut, highcut)]
    sections = butter(order, normalized_edges, btype="band", output="sos")
    filtered = sosfilt(sections, signal)
    return filtered
|
|
121
|
+
|
|
122
|
+
def notch_filter(signal, fs, freq=50.0, Q=30.0):
    """
    Notch filter for removing mains interference (50/60 Hz).

    Designs an IIR notch at `freq` with quality factor `Q` and applies it
    with filtfilt.
    """
    from scipy.signal import filtfilt

    numerator, denominator = iirnotch(freq, Q, fs)
    return filtfilt(numerator, denominator, signal)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### 6. 周波数帯域パワー解析(EEG)
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from scipy.signal import welch
|
|
133
|
+
|
|
134
|
+
# Named EEG bands as (low, high) frequency edges. The unit follows the
# sampling rate given to the analysis functions (presumably Hz).
EEG_BANDS = dict(
    delta=(0.5, 4),
    theta=(4, 8),
    alpha=(8, 13),
    beta=(13, 30),
    gamma=(30, 100),
)
|
|
141
|
+
|
|
142
|
+
def band_power(signal, fs, band, method="welch"):
    """
    Compute the power of a signal inside one frequency band.

    The power spectral density is estimated with Welch's method (segments of
    up to 256 samples) and integrated over the band with the trapezoidal
    rule.

    Args:
        signal: 1-D time series.
        fs: Sampling rate.
        band: (low, high) band edges, both inclusive.
        method: Currently ignored; Welch's method is always used.

    Returns:
        The integrated PSD over the band.
    """
    freqs, psd = welch(signal, fs=fs, nperseg=min(len(signal), 256))
    low, high = band
    in_band = np.logical_and(freqs >= low, freqs <= high)
    # NumPy 2.0 renamed trapz to trapezoid and deprecated the old name;
    # prefer the new one when it exists.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    return integrate(psd[in_band], freqs[in_band])
|
|
148
|
+
|
|
149
|
+
def eeg_band_powers(signal, fs, bands=None):
    """
    Compute the power of every EEG band in one call.

    Args:
        signal: 1-D time series.
        fs: Sampling rate.
        bands: Mapping name -> (low, high); defaults to EEG_BANDS.

    Returns:
        powers, relative -- the absolute power per band, and each band's
        share of the summed band powers.
    """
    band_table = EEG_BANDS if bands is None else bands
    powers = {
        label: band_power(signal, fs, (low, high))
        for label, (low, high) in band_table.items()
    }
    total = sum(powers.values())
    relative = {}
    for label, value in powers.items():
        relative[label] = value / total
    return powers, relative
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### 7. R 波検出 & HRV 解析(ECG)
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
def detect_r_peaks(ecg_signal, fs, height_factor=0.5, distance_ms=300):
    """
    Detect R waves in an ECG signal.

    Args:
        ecg_signal: 1-D ECG samples.
        fs: Sampling rate in Hz.
        height_factor: A peak must reach this fraction of the signal maximum.
        distance_ms: Minimum spacing between detected peaks, in milliseconds.

    Returns:
        Array of sample indices of the detected R peaks.
    """
    # find_peaks rejects distance < 1, which integer truncation produces for
    # low sampling rates or short spacings; clamp to one sample.
    min_distance = max(1, int(distance_ms * fs / 1000))
    threshold = height_factor * np.max(ecg_signal)
    peaks, _ = find_peaks(ecg_signal, height=threshold, distance=min_distance)
    return peaks
|
|
170
|
+
|
|
171
|
+
def hrv_analysis(r_peaks, fs):
    """
    Derive heart-rate-variability metrics from R-peak positions.

    Args:
        r_peaks: Sample indices of the R peaks.
        fs: Sampling rate in Hz.

    Returns:
        dict with "mean_RR" and "SDNN" (sample standard deviation) of the
        R-R intervals in ms, "RMSSD" (root mean square of successive
        interval differences), "pNN50" (successive differences above 50 ms,
        as a percentage of the number of R-R intervals) and "mean_HR"
        (60000 / mean_RR).
    """
    rr_ms = np.diff(r_peaks) / fs * 1000  # R-R intervals in ms
    successive = np.diff(rr_ms)
    mean_rr = np.mean(rr_ms)
    nn50 = np.sum(np.abs(successive) > 50)

    metrics = {}
    metrics["mean_RR"] = mean_rr
    metrics["SDNN"] = np.std(rr_ms, ddof=1)
    metrics["RMSSD"] = np.sqrt(np.mean(successive ** 2))
    metrics["pNN50"] = nn50 / len(rr_ms) * 100
    metrics["mean_HR"] = 60000 / mean_rr
    return metrics
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### 8. スペクトル類似度解析(Exp-11)
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from scipy.spatial.distance import cosine
|
|
187
|
+
from scipy.cluster.hierarchy import linkage, dendrogram
|
|
188
|
+
from scipy.spatial.distance import squareform
|
|
189
|
+
|
|
190
|
+
def spectral_similarity_matrix(spectra_dict, method="cosine"):
    """
    Compute a pairwise similarity matrix between spectra.

    Args:
        spectra_dict: {name: spectrum_array}; all spectra must have the same
            length.
        method: "cosine" (1 - cosine distance) or "pearson" (correlation
            coefficient).

    Returns:
        pandas.DataFrame indexed and labelled by the spectrum names, in the
        dict's order.

    Raises:
        ValueError: If `method` is not "cosine" or "pearson".
    """
    # Imported locally so this snippet does not rely on a module-level
    # pandas import.
    import pandas as pd

    if method not in ("cosine", "pearson"):
        # Previously an unknown method silently produced an all-zero matrix.
        raise ValueError(f"Unknown method: {method}")

    names = list(spectra_dict.keys())
    n = len(names)
    sim_matrix = np.zeros((n, n))

    for i, name_i in enumerate(names):
        for j, name_j in enumerate(names):
            if method == "cosine":
                sim_matrix[i, j] = 1 - cosine(spectra_dict[name_i],
                                              spectra_dict[name_j])
            else:
                sim_matrix[i, j] = np.corrcoef(spectra_dict[name_i],
                                               spectra_dict[name_j])[0, 1]

    return pd.DataFrame(sim_matrix, index=names, columns=names)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
## References
|
|
212
|
+
|
|
213
|
+
### Output Files
|
|
214
|
+
|
|
215
|
+
| ファイル | 形式 |
|
|
216
|
+
|---|---|
|
|
217
|
+
| `results/peak_detection_results.csv` | CSV |
|
|
218
|
+
| `results/hrv_metrics.csv` | CSV |
|
|
219
|
+
| `results/spectral_similarity.csv` | CSV |
|
|
220
|
+
| `figures/spectrum_processed.png` | PNG |
|
|
221
|
+
| `figures/peak_detection.png` | PNG |
|
|
222
|
+
| `figures/eeg_band_powers.png` | PNG |
|
|
223
|
+
|
|
224
|
+
#### 参照実験
|
|
225
|
+
|
|
226
|
+
- **Exp-08**: ECG/EEG 生体信号処理(フィルタ、R 波検出、HRV、帯域パワー)
|
|
227
|
+
- **Exp-11**: ラマン分光(ALS ベースライン、ピーク検出、類似度行列、クラスタリング)
|