py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of py2ls might be problematic. Click here for more details.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/ImageLoader.py +621 -0
- py2ls/__init__.py +7 -5
- py2ls/apptainer2ls.py +3940 -0
- py2ls/batman.py +164 -42
- py2ls/bio.py +2595 -0
- py2ls/cell_image_clf.py +1632 -0
- py2ls/container2ls.py +4635 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/email/email_html_template.html +88 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
- py2ls/data/mygenes_fields_241022.txt +355 -0
- py2ls/data/re_common_pattern.json +173 -0
- py2ls/data/sns_info.json +74 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/styles/stylelib/.DS_Store +0 -0
- py2ls/data/styles/stylelib/grid.mplstyle +15 -0
- py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
- py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
- py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
- py2ls/data/styles/stylelib/light.mplstyl +6 -0
- py2ls/data/styles/stylelib/muted.mplstyle +6 -0
- py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature.mplstyle +31 -0
- py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
- py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
- py2ls/data/styles/stylelib/paper.mplstyle +290 -0
- py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
- py2ls/data/styles/stylelib/retro.mplstyle +4 -0
- py2ls/data/styles/stylelib/sans.mplstyle +10 -0
- py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
- py2ls/data/styles/stylelib/science.mplstyle +48 -0
- py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
- py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
- py2ls/data/tiles.csv +146 -0
- py2ls/data/usages_pd.json +1417 -0
- py2ls/data/usages_sns.json +31 -0
- py2ls/docker2ls.py +5446 -0
- py2ls/ec2ls.py +61 -0
- py2ls/fetch_update.py +145 -0
- py2ls/ich2ls.py +1955 -296
- py2ls/im2.py +8242 -0
- py2ls/image_ml2ls.py +2100 -0
- py2ls/ips.py +33909 -3418
- py2ls/ml2ls.py +7700 -0
- py2ls/mol.py +289 -0
- py2ls/mount2ls.py +1307 -0
- py2ls/netfinder.py +873 -351
- py2ls/nl2ls.py +283 -0
- py2ls/ocr.py +1581 -458
- py2ls/plot.py +10394 -314
- py2ls/rna2ls.py +311 -0
- py2ls/ssh2ls.md +456 -0
- py2ls/ssh2ls.py +5933 -0
- py2ls/ssh2ls_v01.py +2204 -0
- py2ls/stats.py +66 -172
- py2ls/temp20251124.py +509 -0
- py2ls/translator.py +2 -0
- py2ls/utils/decorators.py +3564 -0
- py2ls/utils_bio.py +3453 -0
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/rna2ls.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
rna_master_tools.py
|
|
4
|
+
Single-file advanced RNA-seq toolkit + CLI.
|
|
5
|
+
|
|
6
|
+
Usage examples:
|
|
7
|
+
python rna_master_tools.py run-rmats --b1 a1.bam,a2.bam --b2 b1.bam,b2.bam --gtf ref.gtf --outdir rmats_out
|
|
8
|
+
python rna_master_tools.py parse-rmats --in rmats_out/SE.MATS.JCEC.txt --out rmats_parsed.tsv
|
|
9
|
+
python rna_master_tools.py run-starfusion --left left.fq --right right.fq --genome_lib_dir GRCh38_ctat_lib --outdir fusion_out
|
|
10
|
+
python rna_master_tools.py compute-wgcna --counts counts_normalized.tsv --out modules.tsv
|
|
11
|
+
python rna_master_tools.py biomarkers --counts counts_normalized.tsv --labels samplesheet.tsv --out biomarker_report.tsv
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import sys
|
|
16
|
+
import argparse
|
|
17
|
+
import logging
|
|
18
|
+
import subprocess
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
import matplotlib.pyplot as plt
|
|
22
|
+
|
|
23
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
24
|
+
from sklearn.model_selection import train_test_split
|
|
25
|
+
from sklearn.cluster import AgglomerativeClustering
|
|
26
|
+
from sklearn.metrics import classification_report
|
|
27
|
+
|
|
28
|
+
# Configure the root logger at import time: INFO level, timestamped records.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-level logger; run_cmd() reports commands and failures through it.
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
# -------------------------
|
|
32
|
+
# Utilities
|
|
33
|
+
# -------------------------
|
|
34
|
+
def ensure_dir(path):
    """Create directory ``path`` (with parents) if needed and return it.

    An already existing directory is not an error. The argument is handed
    back unchanged so the call can be used inline when building paths.
    """
    os.makedirs(name=path, exist_ok=True)
    return path
|
|
37
|
+
|
|
38
|
+
def run_cmd(cmd, check=True):
    """Run an external command and capture its output as text.

    Args:
        cmd: Sequence holding the program name followed by its arguments.
            Every item is converted with ``str()`` so paths and numbers can
            be passed directly.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.

    Returns:
        The ``subprocess.CompletedProcess`` with ``stdout`` and ``stderr``
        captured as strings.

    Raises:
        subprocess.CalledProcessError: Non-zero exit status while ``check``
            is true (stderr is logged before re-raising).
        FileNotFoundError: The executable could not be found (logged before
            re-raising).
    """
    import shlex  # local import: only needed to format the log line

    argv = [str(part) for part in cmd]
    # Quote each argument so the logged line is unambiguous and can be pasted
    # into a shell even when arguments contain spaces.
    logger.info("Run cmd: %s", " ".join(shlex.quote(part) for part in argv))
    try:
        res = subprocess.run(argv, check=check, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.error("Command failed: %s", e.stderr)
        raise
    except FileNotFoundError:
        logger.error("Command not found: %s", argv[0])
        raise

    logger.debug("stdout: %s", res.stdout[:200])
    logger.debug("stderr: %s", res.stderr[:200])
    if res.returncode != 0:
        # Only reachable with check=False; make the failure visible anyway.
        logger.warning(
            "Command exited with status %d: %s", res.returncode, res.stderr[:200]
        )
    return res
|
|
48
|
+
|
|
49
|
+
def read_table(path, index_col=None):
    """Load a tab-separated file into a pandas DataFrame.

    ``index_col`` is forwarded to ``pandas.read_csv``; with the default
    ``None`` no column is used as the index.
    """
    options = {"sep": "\t", "index_col": index_col}
    frame = pd.read_csv(path, **options)
    return frame
|
|
51
|
+
|
|
52
|
+
def write_table(df, path):
    """Write ``df`` to ``path`` as a tab-separated file, index included."""
    df.to_csv(path_or_buf=path, sep="\t", index=True)
|
|
54
|
+
|
|
55
|
+
# -------------------------
|
|
56
|
+
# rMATS wrappers & parsers
|
|
57
|
+
# -------------------------
|
|
58
|
+
def _rmats_bam_list_file(bam_arg, dest):
    """Return a text file listing the BAMs in ``bam_arg``, writing ``dest`` if needed."""
    alignment_exts = (".bam", ".sam", ".cram")
    # An existing non-alignment file is taken to be a ready-made rMATS list file.
    if os.path.isfile(bam_arg) and not bam_arg.lower().endswith(alignment_exts):
        return bam_arg
    paths = [p.strip() for p in bam_arg.split(",") if p.strip()]
    with open(dest, "w") as handle:
        handle.write(",".join(paths))
    return dest


def run_rmats_cli(b1_list, b2_list, gtf, outdir, readlen=100, threads=8, rmats_cmd="rmats.py"):
    """Run the rMATS command-line tool on two groups of BAM files.

    rMATS expects ``--b1``/``--b2`` to point at text files that contain a
    comma-separated list of BAM paths, so the lists given here are written
    to ``<outdir>/b1_bams.txt`` and ``<outdir>/b2_bams.txt`` first.

    Args:
        b1_list: Comma-separated BAM paths for group 1, or the path of an
            existing text file that already holds such a list.
        b2_list: Same as ``b1_list`` for group 2.
        gtf: Annotation GTF passed as ``--gtf``.
        outdir: Output directory (created if missing); a ``tmp``
            sub-directory is created inside it and passed as ``--tmp``.
        readlen: Value for ``--readLength``.
        threads: Value for ``--nthread``.
        rmats_cmd: Name or path of the rMATS executable.

    Returns:
        ``outdir``.

    Raises:
        subprocess.CalledProcessError: rMATS exited with a non-zero status.
    """
    ensure_dir(outdir)
    tmpdir = ensure_dir(os.path.join(outdir, "tmp"))
    b1_file = _rmats_bam_list_file(b1_list, os.path.join(outdir, "b1_bams.txt"))
    b2_file = _rmats_bam_list_file(b2_list, os.path.join(outdir, "b2_bams.txt"))
    cmd = [
        rmats_cmd,
        "--b1", b1_file,
        "--b2", b2_file,
        "--gtf", gtf,
        "--od", outdir,
        "--tmp", tmpdir,
        "--readLength", str(readlen),
        "--nthread", str(threads),
    ]
    run_cmd(cmd)
    return outdir
|
|
76
|
+
|
|
77
|
+
def parse_rmats_event_table(event_file, out_tsv=None):
    """Parse an rMATS event table and add per-group PSI means and dPSI.

    Args:
        event_file: Tab-separated rMATS event file.
        out_tsv: Optional path; when given, the augmented table is written
            there as TSV without the index.

    Returns:
        The DataFrame read from ``event_file`` with these additions when the
        source columns exist: ``PSI_group1`` / ``PSI_group2`` (mean of the
        comma-separated values in ``IncLevel1`` / ``IncLevel2``), ``dPSI``
        (group1 minus group2), and ``PValue`` / ``FDR`` coerced to numeric.
    """
    df = pd.read_csv(event_file, sep="\t", low_memory=False)

    missing_tokens = {".", "", "NA"}

    def mean_psi(cell):
        # Strip each token so "0.3, NA" is handled like "0.3,NA".
        tokens = (tok.strip() for tok in str(cell).split(","))
        try:
            vals = [float(tok) for tok in tokens if tok not in missing_tokens]
        except ValueError:
            # A token that is not a number invalidates the whole cell.
            return np.nan
        # Drop NaNs up front (a fully missing cell arrives as the string
        # "nan"); averaging an all-NaN list would emit a RuntimeWarning.
        vals = [v for v in vals if not np.isnan(v)]
        return np.mean(vals) if vals else np.nan

    if 'IncLevel1' in df.columns:
        df['PSI_group1'] = df['IncLevel1'].apply(mean_psi)
    if 'IncLevel2' in df.columns:
        df['PSI_group2'] = df['IncLevel2'].apply(mean_psi)
    if 'PSI_group1' in df.columns and 'PSI_group2' in df.columns:
        df['dPSI'] = df['PSI_group1'] - df['PSI_group2']
    for col in ['PValue', 'FDR']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    if out_tsv:
        df.to_csv(out_tsv, sep="\t", index=False)
    return df
|
|
101
|
+
|
|
102
|
+
def plot_rmats_volcano(df, out_png=None, fdr_thresh=0.05, dpsi_thresh=0.1):
    """Draw a volcano plot (dPSI vs. -log10 FDR) for parsed rMATS events.

    Args:
        df: DataFrame with ``FDR`` and ``dPSI`` columns.
        out_png: Optional path; when given the figure is saved there at 150 dpi.
        fdr_thresh: Events with FDR at or below this value can be highlighted.
        dpsi_thresh: Minimum absolute dPSI for an event to be highlighted.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    fdr = df['FDR']
    # Zero FDR values are replaced by a tiny number so the log stays finite.
    neglog = -np.log10(fdr.replace(0, 1e-300))
    delta = df['dPSI']
    sig = (fdr <= fdr_thresh) & (delta.abs() >= dpsi_thresh)

    plt.figure(figsize=(7, 6))
    plt.scatter(delta, neglog, s=8, c='gray', alpha=0.7)
    if sig.any():
        # Overplot the significant events in red.
        plt.scatter(delta.loc[sig], neglog.loc[sig], s=10, c='red', alpha=0.8)

    for x_cut in (dpsi_thresh, -dpsi_thresh):
        plt.axvline(x_cut, color='blue', linestyle='--')
    plt.axhline(-np.log10(fdr_thresh), color='green', linestyle='--')

    plt.xlabel('dPSI')
    plt.ylabel('-log10(FDR)')
    plt.title('rMATS volcano')

    if out_png:
        plt.savefig(out_png, dpi=150)
    return plt
|
|
116
|
+
|
|
117
|
+
# -------------------------
|
|
118
|
+
# STAR-Fusion wrapper & parser
|
|
119
|
+
# -------------------------
|
|
120
|
+
def run_star_fusion_cli(left_fq, right_fq, genome_lib_dir, outdir, threads=8):
    """Invoke ``STAR-Fusion`` on a pair of FASTQ files.

    The executable is looked up by name, so it has to be reachable on PATH
    (or the call has to happen inside a container that provides it). The
    output directory is created first and returned afterwards.
    """
    ensure_dir(outdir)
    options = (
        ("--left_fq", left_fq),
        ("--right_fq", right_fq),
        ("--genome_lib_dir", genome_lib_dir),
        ("--output_dir", outdir),
        ("--CPU", str(threads)),
    )
    argv = ["STAR-Fusion"]
    for flag, value in options:
        argv.extend((flag, value))
    run_cmd(argv)
    return outdir
|
|
135
|
+
|
|
136
|
+
def parse_star_fusion_tsv(fusion_tsv, out_tsv=None):
    """Read a STAR-Fusion predictions table into a DataFrame.

    STAR-Fusion writes its column header as a line starting with ``#``
    (``#FusionName<TAB>...``). Treating ``#`` as a comment marker would drop
    that header and promote the first prediction to column names, so the
    header is located explicitly and its leading ``#`` is removed.

    Args:
        fusion_tsv: Path to an uncompressed TSV file, or a file-like object
            with a ``read()`` method.
        out_tsv: Optional path; when given the table is written there as TSV
            without the index.

    Returns:
        DataFrame with one row per prediction. A file holding only the
        header yields an empty DataFrame with the header's columns.
    """
    import io  # local import: only needed to feed the cleaned text to pandas

    if hasattr(fusion_tsv, "read"):
        text = fusion_tsv.read()
    else:
        with open(fusion_tsv, "r") as handle:
            text = handle.read()
    if isinstance(text, bytes):
        text = text.decode()

    lines = text.splitlines()
    n_hash = 0
    for line in lines:
        if not line.startswith("#"):
            break
        n_hash += 1

    # Default: the first line without '#' is the header and every leading
    # '#' line is a plain comment.
    header_idx = n_hash
    if n_hash:
        candidate = lines[n_hash - 1]
        first_plain = lines[n_hash] if n_hash < len(lines) else None
        # The last leading '#' line is the header when nothing follows it or
        # when it has as many tab-separated fields as the line after it.
        if first_plain is None or candidate.count("\t") == first_plain.count("\t"):
            header_idx = n_hash - 1

    body = lines[header_idx:]
    if body:
        header = body[0].lstrip("#")
        # '#' lines after the header are still treated as comments.
        body = [header] + [ln for ln in body[1:] if not ln.startswith("#")]

    df = pd.read_csv(io.StringIO("\n".join(body)), sep="\t", low_memory=False)
    if out_tsv:
        df.to_csv(out_tsv, sep="\t", index=False)
    return df
|
|
141
|
+
|
|
142
|
+
# -------------------------
|
|
143
|
+
# WGCNA-like coexpression (lightweight)
|
|
144
|
+
# -------------------------
|
|
145
|
+
def compute_wgcna_modules_cli(counts_tsv, out_tsv=None, soft_power=6, distance_threshold=0.2):
    """Group genes into co-expression modules (lightweight WGCNA-like approach).

    Args:
        counts_tsv: Tab-separated expression table with genes as rows (first
            column is the gene identifier) and samples as columns.
        out_tsv: Optional path; when given the gene-to-module table is
            written there as TSV without the index.
        soft_power: Exponent applied to the absolute gene-gene correlation
            to build the adjacency matrix.
        distance_threshold: Linkage distance above which clusters are not
            merged by the agglomerative clustering.

    Returns:
        DataFrame with columns ``gene`` and ``module`` (integer cluster label).
    """
    expr = pd.read_csv(counts_tsv, sep="\t", index_col=0)

    # DataFrame.corr() correlates columns, so transpose to get gene x gene.
    corr = expr.T.corr()
    abs_corr = np.abs(corr)
    adj = abs_corr ** soft_power
    # Simple stand-in for the topological overlap matrix.
    tom = adj * abs_corr
    # Undefined correlations (e.g. constant genes) count as maximally distant.
    dist = (1 - tom).fillna(1.0)

    params = dict(n_clusters=None, distance_threshold=distance_threshold, linkage='average')
    try:
        # Current scikit-learn names the distance option ``metric``.
        clustering = AgglomerativeClustering(metric='precomputed', **params)
    except TypeError:
        # Older scikit-learn releases only know the former name ``affinity``.
        clustering = AgglomerativeClustering(affinity='precomputed', **params)
    labels = clustering.fit_predict(dist.values)

    modules = pd.DataFrame({'gene': corr.index, 'module': labels})
    if out_tsv:
        modules.to_csv(out_tsv, sep="\t", index=False)
    return modules
|
|
174
|
+
|
|
175
|
+
# -------------------------
|
|
176
|
+
# BIOMARKER SELECTION + ML
|
|
177
|
+
# -------------------------
|
|
178
|
+
def _load_expression_and_labels(counts_tsv, labels_tsv):
    """Load counts and labels; return ``(X, y)`` with X as samples x genes."""
    counts = pd.read_csv(counts_tsv, sep="\t", index_col=0)
    ss = pd.read_csv(labels_tsv, sep="\t")
    if 'sampleID' in ss.columns:
        ss = ss.set_index('sampleID')

    # Decide the orientation from where the labelled sample IDs actually
    # occur; guessing from the spelling of the first row name misfires for
    # gene symbols containing digits and for purely alphabetic sample names.
    known = {str(s) for s in ss.index}
    col_hits = sum(str(c) in known for c in counts.columns)
    row_hits = sum(str(r) in known for r in counts.index)
    if col_hits != row_hits:
        genes_in_rows = col_hits > row_hits
    else:
        # No evidence either way: fall back to the name-based guess.
        first = str(counts.index[0])
        genes_in_rows = first.startswith('ENS') or first.isalpha()
    X = counts.T if genes_in_rows else counts

    missing = [s for s in X.index if s not in ss.index]
    if missing:
        raise KeyError(
            "samples missing from labels table: " + ", ".join(map(str, missing[:10]))
        )
    y = ss.loc[X.index, ss.columns[-1]]  # last column is the label
    return X, y


def select_features_rf_cli(counts_tsv, labels_tsv, out_tsv=None, topk=50):
    """Rank genes by random-forest importance and keep the top ``topk``.

    Args:
        counts_tsv: Tab-separated expression table, either genes x samples
            or samples x genes; the first column is used as the index.
        labels_tsv: Tab-separated sample sheet. A ``sampleID`` column, when
            present, identifies the samples; the last column is the label.
        out_tsv: Optional path; when given the ranking is written there as
            TSV with an ``importance`` column.
        topk: Number of top-ranked features to return.

    Returns:
        Series of feature importances indexed by gene, sorted descending.

    Raises:
        KeyError: A sample in the counts table has no row in the labels table.
    """
    X, labels = _load_expression_and_labels(counts_tsv, labels_tsv)
    rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=1)
    rf.fit(X, labels)
    imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
    top = imp.head(topk)
    if out_tsv:
        top.to_csv(out_tsv, sep="\t", header=['importance'])
    return top


def train_classifiers_cli(counts_tsv, labels_tsv, out_tsv=None, test_size=0.25):
    """Train a random forest on a train/test split and report its metrics.

    Args:
        counts_tsv: Tab-separated expression table, either genes x samples
            or samples x genes; the first column is used as the index.
        labels_tsv: Tab-separated sample sheet. A ``sampleID`` column, when
            present, identifies the samples; the last column is the label.
        out_tsv: Optional path; when given the report is written there as TSV.
        test_size: Share of samples held out for evaluation.

    Returns:
        The ``classification_report`` for the held-out samples as a dict.

    Raises:
        KeyError: A sample in the counts table has no row in the labels table.
    """
    X, y = _load_expression_and_labels(counts_tsv, labels_tsv)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    rf = RandomForestClassifier(n_estimators=400, random_state=42)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    rep = classification_report(y_test, pred, output_dict=True)
    if out_tsv:
        pd.DataFrame(rep).to_csv(out_tsv, sep="\t")
    return rep
|
|
223
|
+
|
|
224
|
+
# -------------------------
|
|
225
|
+
# CLI: argparse dispatcher
|
|
226
|
+
# -------------------------
|
|
227
|
+
def build_parser():
    """Build the argparse parser with one sub-command per toolkit function.

    The chosen sub-command name is stored in ``cmd`` on the parsed namespace.
    """
    parser = argparse.ArgumentParser(prog="rna_master_tools.py")
    commands = parser.add_subparsers(dest="cmd")

    def add_in_out(name):
        # Sub-commands that read one file and optionally write another.
        sp = commands.add_parser(name)
        sp.add_argument("--in", dest="infile", required=True)
        sp.add_argument("--out", dest="out", default=None)
        return sp

    # rMATS run
    rmats = commands.add_parser("run-rmats")
    bam_help = (
        ("--b1", "Comma-separated BAMs for group1"),
        ("--b2", "Comma-separated BAMs for group2"),
    )
    for flag, text in bam_help:
        rmats.add_argument(flag, required=True, help=text)
    for flag in ("--gtf", "--outdir"):
        rmats.add_argument(flag, required=True)
    rmats.add_argument("--readlen", type=int, default=100)
    rmats.add_argument("--threads", type=int, default=8)
    rmats.add_argument("--rmats-cmd", default="rmats.py")

    # parse rMATS / plot rMATS volcano
    add_in_out("parse-rmats")
    add_in_out("plot-rmats")

    # STAR-Fusion run
    fusion = commands.add_parser("run-starfusion")
    for flag in ("--left", "--right", "--genome_lib_dir", "--outdir"):
        fusion.add_argument(flag, required=True)
    fusion.add_argument("--threads", type=int, default=8)

    # parse star-fusion
    add_in_out("parse-starfusion")

    # WGCNA
    wgcna = commands.add_parser("compute-wgcna")
    wgcna.add_argument("--counts", required=True)
    wgcna.add_argument("--out", dest="out", default=None)
    wgcna.add_argument("--power", type=int, default=6)

    # biomarkers select
    markers = commands.add_parser("biomarkers")
    markers.add_argument("--counts", required=True)
    markers.add_argument("--labels", required=True)
    markers.add_argument("--out", dest="out", default=None)
    markers.add_argument("--topk", type=int, default=50)

    # classifiers
    clf = commands.add_parser("train-classifiers")
    clf.add_argument("--counts", required=True)
    clf.add_argument("--labels", required=True)
    clf.add_argument("--out", dest="out", default=None)

    return parser
|
|
284
|
+
|
|
285
|
+
def main(argv=None):
    """Parse ``argv`` and run the selected sub-command.

    Without a sub-command the help text is printed and the process exits
    with status 1.
    """
    parser = build_parser()
    ns = parser.parse_args(argv)

    def plot_rmats():
        events = parse_rmats_event_table(ns.infile)
        plot_rmats_volcano(events, out_png=ns.out)

    # Each entry defers attribute access until the command is actually chosen.
    handlers = {
        "run-rmats": lambda: run_rmats_cli(
            ns.b1, ns.b2, ns.gtf, ns.outdir,
            readlen=ns.readlen, threads=ns.threads, rmats_cmd=ns.rmats_cmd,
        ),
        "parse-rmats": lambda: parse_rmats_event_table(ns.infile, out_tsv=ns.out),
        "plot-rmats": plot_rmats,
        "run-starfusion": lambda: run_star_fusion_cli(
            ns.left, ns.right, ns.genome_lib_dir, ns.outdir, threads=ns.threads,
        ),
        "parse-starfusion": lambda: parse_star_fusion_tsv(ns.infile, out_tsv=ns.out),
        "compute-wgcna": lambda: compute_wgcna_modules_cli(
            ns.counts, out_tsv=ns.out, soft_power=ns.power,
        ),
        "biomarkers": lambda: select_features_rf_cli(
            ns.counts, ns.labels, out_tsv=ns.out, topk=ns.topk,
        ),
        "train-classifiers": lambda: train_classifiers_cli(
            ns.counts, ns.labels, out_tsv=ns.out,
        ),
    }

    handler = handlers.get(ns.cmd)
    if handler is None:
        parser.print_help()
        sys.exit(1)
    handler()
|
|
309
|
+
|
|
310
|
+
# Run the command-line interface only when the file is executed as a script.
if __name__ == "__main__":
    main()
|