py2ls 0.2.4.2__py3-none-any.whl → 0.2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/bio.py
CHANGED
@@ -4,8 +4,13 @@ import pandas as pd
 import os
 import logging
 from . import ips
+from . import plot
+import matplotlib.pyplot as plt
+
 def load_geo(
-    datasets: Union[list, str] = ["GSE00000", "GSE00001"],
+    datasets: Union[list, str] = ["GSE00000", "GSE00001"],
+    dir_save: str = "./datasets",
+    verbose=False,
 ) -> dict:
     """
     Check if GEO datasets are already in the directory, and download them if not.
@@ -17,7 +22,7 @@ def load_geo(
     Returns:
         dict: A dictionary containing the GEO objects for each dataset.
     """
-    use_str="""
+    use_str = """
     get_meta(geo: dict, dataset: str = "GSE25097")
     get_expression_data(geo: dict, dataset: str = "GSE25097")
     get_probe(geo: dict, dataset: str = "GSE25097", platform_id: str = "GPL10687")
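For context, the new dir_save and verbose parameters let callers choose where series are cached and quiet the logging. A minimal usage sketch, assuming the accessions from the docstring (load_geo returns a dict keyed by accession):

    from py2ls import bio

    geo = bio.load_geo(
        datasets=["GSE25097", "GSE62232"],  # downloaded once, reused from dir_save afterwards
        dir_save="./datasets",
        verbose=False,
    )
    # geo["GSE25097"] is the GEO object for that series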
@@ -51,7 +56,7 @@ def load_geo(
     return geo_data
 
 
-def get_meta(geo: dict, dataset: str = "GSE25097",verbose=True) -> pd.DataFrame:
+def get_meta(geo: dict, dataset: str = "GSE25097", verbose=True) -> pd.DataFrame:
     """
     df_meta = get_meta(geo, dataset="GSE25097")
     Extracts metadata from a specific GEO dataset and returns it as a DataFrame.
@@ -122,23 +127,29 @@ def get_meta(geo: dict, dataset: str = "GSE25097",verbose=True) -> pd.DataFrame:
         print(
             f"Meta info columns for dataset '{dataset}': \n{sorted(meta_df.columns.tolist())}"
         )
-
+        display(meta_df[:3].T)
+    return meta_df
+
 
-def get_probe(
+def get_probe(
+    geo: dict, dataset: str = "GSE25097", platform_id: str = None, verbose=True
+):
     """
     df_probe = get_probe(geo, dataset="GSE25097", platform_id: str = "GPL10687")
     """
     # try to find the platform_id from meta
     if platform_id is None:
-        df_meta=get_meta(geo=geo, dataset=dataset,verbose=False)
-        platform_id=df_meta["platform_id"].unique().tolist()
-        platform_id = platform_id[0] if len(platform_id)==1 else platform_id
+        df_meta = get_meta(geo=geo, dataset=dataset, verbose=False)
+        platform_id = df_meta["platform_id"].unique().tolist()
+        platform_id = platform_id[0] if len(platform_id) == 1 else platform_id
     print(platform_id)
     df_probe = geo[dataset].gpls[platform_id].table
     if df_probe.empty:
-        print(
+        print(
+            f"above is meta info, failed to find the probe info. 看一下是不是在单独的文件中包含了probe信息"
+        )
         return get_meta(geo, dataset, verbose=True)
-    if verbose:
+    if verbose:
         print(f"columns in the probe table: \n{sorted(df_probe.columns.tolist())}")
     return df_probe
 
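get_probe() can now infer the platform when platform_id is omitted, and get_meta() finally returns its DataFrame instead of only printing. A short sketch, continuing from the load_geo example above (accession is the default from the signatures, not re-verified against GEO):

    df_meta = bio.get_meta(geo, dataset="GSE25097", verbose=False)
    df_probe = bio.get_probe(geo, dataset="GSE25097")  # platform_id taken from df_meta["platform_id"]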
@@ -170,17 +181,18 @@ def get_expression_data(geo: dict, dataset: str = "GSE25097") -> pd.DataFrame:
     return expression_values
 
 
-
-def get_data(geo: dict, dataset: str = "GSE25097",verbose=True):
+def get_data(geo: dict, dataset: str = "GSE25097", verbose=True):
     # get probe info
-    df_probe = get_probe(geo,dataset=dataset,verbose=False)
+    df_probe = get_probe(geo, dataset=dataset, verbose=False)
     # get expression values
-    df_expression = get_expression_data(geo, dataset=dataset
+    df_expression = get_expression_data(geo, dataset=dataset)
     print(
         f"df_expression.shape: {df_expression.shape} \ndf_probe.shape: {df_probe.shape}"
     )
     if any([df_probe.empty, df_expression.empty]):
-        print(
+        print(
+            f"above is meta info, failed to find the probe info. 看一下是不是在单独的文件中包含了probe信息"
+        )
         return get_meta(geo, dataset, verbose=True)
     df_exp = pd.merge(
         df_probe,
@@ -191,27 +203,48 @@ def get_data(geo: dict, dataset: str = "GSE25097",verbose=True):
     )
 
     # get meta info
-    df_meta=get_meta(geo, dataset=dataset,verbose=False)
-    col_rm=[
+    df_meta = get_meta(geo, dataset=dataset, verbose=False)
+    col_rm = [
+        "channel_count",
+        "contact_web_link",
+        "contact_address",
+        "contact_city",
+        "contact_country",
+        "contact_department",
+        "contact_email",
+        "contact_institute",
+        "contact_laboratory",
+        "contact_name",
+        "contact_phone",
+        "contact_state",
+        "contact_zip/postal_code",
+        "contributor",
+        "manufacture_protocol",
+        "taxid",
+        "web_link",
+    ]
     # rm unrelavent columns
     df_meta = df_meta.drop(columns=[col for col in col_rm if col in df_meta.columns])
     # sorte columns
-    df_meta = df_meta.reindex(sorted(df_meta.columns),axis=1)
+    df_meta = df_meta.reindex(sorted(df_meta.columns), axis=1)
     # find a proper column
-    col_sample_id = ips.strcmp("sample_id",df_meta.columns.tolist())[0]
-    df_meta.set_index(col_sample_id, inplace=True)
-
-    col_gene_symbol = ips.strcmp("GeneSymbol",df_exp.columns.tolist())[0]
+    col_sample_id = ips.strcmp("sample_id", df_meta.columns.tolist())[0]
+    df_meta.set_index(col_sample_id, inplace=True)  # set gene symbol as index
+
+    col_gene_symbol = ips.strcmp("GeneSymbol", df_exp.columns.tolist())[0]
     # select the 'GSM' columns
     col_gsm = df_exp.columns[df_exp.columns.str.startswith("GSM")].tolist()
     df_exp.set_index(col_gene_symbol, inplace=True)
-    df_exp=df_exp[col_gsm].T
-
-    df_merged=ips.df_merge(df_meta,df_exp)
+    df_exp = df_exp[col_gsm].T  # transpose, so that could add meta info
+
+    df_merged = ips.df_merge(df_meta, df_exp)
     if verbose:
-        print(
-
-
+        print(
+            f"\ndataset:'{dataset}' n_sample = {df_merged.shape[0]}, n_gene={df_exp.shape[1]}"
+        )
+        display(df_merged.sample(5))
+    return df_merged
+
 
 def split_at_lower_upper(lst):
     """
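get_data() now returns a single merged frame: cleaned metadata (the contact_* and similar columns dropped) joined with the transposed expression matrix, one row per GSM sample. A sketch of the intended call:

    df = bio.get_data(geo, dataset="GSE25097", verbose=True)
    # df: rows are GSM samples; columns are remaining meta fields followed by genes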
@@ -228,16 +261,17 @@ def split_at_lower_upper(lst):
         return lst[: i + 1], lst[i + 1 :]
     return lst, []
 
-
+
+def add_condition(
     data: pd.DataFrame,
-    column:str="characteristics_ch1"
-    column_new:str="condition"
-    by:str="tissue: tumor liver"
-    by_not:str=": tumor", # 健康的选择条件
-    by_name:str="non-tumor", # 健康的命名
-    by_not_name:str="tumor", # 不健康的命名
-    inplace: bool = True,
-    verbose:bool = True
+    column: str = "characteristics_ch1",  # 在哪一行进行分类
+    column_new: str = "condition",  # 新col的命名
+    by: str = "tissue: tumor liver",  # 通过by来命名
+    by_not: str = ": tumor",  # 健康的选择条件
+    by_name: str = "non-tumor",  # 健康的命名
+    by_not_name: str = "tumor",  # 不健康的命名
+    inplace: bool = True,  # replace the data
+    verbose: bool = True,
 ):
     """
     Add a new column to the DataFrame based on the presence of a specific substring in another column.
@@ -255,18 +289,225 @@ def get_condition(
 
     """
     # first check the content in column
-    content=data[column].unique().tolist()
+    content = data[column].unique().tolist()
     if verbose:
-        if len(content)>10:
+        if len(content) > 10:
             display(content[:10])
         else:
             display(content)
     # 优先by
     if by:
-        data[column_new] = data[column].apply(
+        data[column_new] = data[column].apply(
+            lambda x: by_name if by in x else by_not_name
+        )
     elif by_not:
-        data[column_new] = data[column].apply(
+        data[column_new] = data[column].apply(
+            lambda x: by_not_name if not by_not in x else by_name
+        )
+    if verbose:
+        display(data)
+    if not inplace:
+        return data
+
+
+def add_condition_multi(
+    data: pd.DataFrame,
+    column: str = "characteristics_ch1",  # Column to classify
+    column_new: str = "condition",  # New column name
+    conditions: dict = {
+        "low": "low",
+        "high": "high",
+        "intermediate": "intermediate",
+    },  # A dictionary where keys are substrings and values are condition names
+    default_name: str = "unknown",  # Default name if no condition matches
+    inplace: bool = True,  # Whether to replace the data
+    verbose: bool = True,
+):
+    """
+    Add a new column to the DataFrame based on the presence of specific substrings in another column.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        The input DataFrame containing the data.
+    column : str, optional
+        The name of the column in which to search for the substrings (default is 'characteristics_ch1').
+    column_new : str, optional
+        The name of the new column to be created (default is 'condition').
+    conditions : dict, optional
+        A dictionary where keys are substrings to search for and values are the corresponding labels.
+    default_name : str, optional
+        The name to assign if no condition matches (default is 'unknown').
+    inplace : bool, optional
+        Whether to modify the original DataFrame (default is True).
+    verbose : bool, optional
+        Whether to display the unique values and final DataFrame (default is True).
+    """
+
+    # Display the unique values in the column
+    content = data[column].unique().tolist()
+    if verbose:
+        if len(content) > 10:
+            display(content[:10])
+        else:
+            display(content)
+
+    # Check if conditions are provided
+    if conditions is None:
+        raise ValueError(
+            "Conditions must be provided as a dictionary with substrings and corresponding labels."
+        )
+
+    # Define a helper function to map the conditions
+    def map_condition(value):
+        for substring, label in conditions.items():
+            if substring in value:
+                return label
+        return default_name  # If no condition matches, return the default name
+
+    # Apply the mapping function to create the new column
+    data[column_new] = data[column].apply(map_condition)
+
+    # Display the updated DataFrame if verbose is True
     if verbose:
         display(data)
+
     if not inplace:
-        return data
+        return data
+
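add_condition_multi() generalizes add_condition() from a single substring to a substring-to-label mapping. A self-contained sketch on made-up metadata (the frame and labels here are illustrative only):

    import pandas as pd
    from py2ls import bio

    meta = pd.DataFrame({"characteristics_ch1": ["grade: low", "grade: high", "grade: mid"]})
    bio.add_condition_multi(
        meta,
        conditions={"low": "low", "high": "high", "intermediate": "intermediate"},
        default_name="unknown",  # "grade: mid" matches no key, so it becomes "unknown"
        verbose=False,
    )
    print(meta["condition"].tolist())  # ['low', 'high', 'unknown']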
+def clean_dataset(
+    data: pd.DataFrame, dataset: str = "GSE25097", condition: str = "condition",sep="///"
+):
+    """
+    #* it has been involved in bio.batch_effects(), but default: False
+    1. clean data set and prepare super_datasets
+    2. if "///" in index, then extend it, or others.
+    3. drop duplicates and dropna()
+    4. add the 'condition' and 'dataset info' to the columns
+    5. set genes as index
+    """
+    #! (4.1) clean data set and prepare super_datasets
+    # df_data_2, 左边的列是meta,右边的列是gene_symbol
+    col_gene = split_at_lower_upper(data.columns.tolist())[1][0]
+    idx = ips.strcmp(col_gene, data.columns.tolist())[1]
+    df_gene = data.iloc[:, idx:].T  # keep the last 'condition'
+
+    #! if "///" in index, then extend it, or others.
+    print(f"before extend shape: {df_gene.shape}")
+    df = df_gene.reset_index()
+    df_gene = ips.df_extend(df, column="index", sep=sep)
+    # reset 'index' column as index
+    # df_gene = df_gene.set_index("index")
+    print(f"after extended by '{sep}' shape: {df_gene.shape}")
+
+    # *alternative:
+    # df_unique = df.reset_index().drop_duplicates(subset="index").set_index("index")
+    #! 4.2 drop duplicates and dropna()
+    df_gene = df_gene.drop_duplicates(subset=["index"]).dropna()
+    print(f"drop duplicates and dropna: shape: {df_gene.shape}")
+
+    #! add the 'condition' and 'dataset info' to the columns
+    ds = [data["dataset"][0]] * len(df_gene.columns[1:])
+    samp = df_gene.columns.tolist()[1:]
+    cond = df_gene[df_gene["index"] == condition].values.tolist()[0][1:]
+    df_gene.columns = ["index"] + [
+        f"{ds}_{sam}_{cond}" for (ds, sam, cond) in zip(ds, samp, cond)
+    ]
+    df_gene.drop(df_gene[df_gene["index"] == condition].index, inplace=True)
+    #! set genes as index
+    df_gene.set_index("index",inplace=True)
+    display(df_gene.head())
+    return df_gene
+
+def batch_effect(
+    data: list = "[df_gene_1, df_gene_2, df_gene_3]",
+    datasets: list = ["GSE25097", "GSE62232", "GSE65372"],
+    clean_data:bool=False, # default, not do data cleaning
+    top_genes:int=10,# only for plotting
+    plot_=True,
+    dir_save="./res/",
+    kws_clean_dataset:dict={},
+    **kwargs
+):
+    """
+    usage 1:
+        bio.batch_effect(
+            data=[df_gene_1, df_gene_2, df_gene_3],
+            datasets=["GSE25097", "GSE62232", "GSE65372"],
+            clean_data=False,
+            dir_save="./res/")
+
+    #! # or conbine clean_dataset and batch_effect together
+    # # data = [bio.clean_dataset(data=dt, dataset=ds) for (dt, ds) in zip(data, datasets)]
+    data_common = bio.batch_effect(
+        data=[df_data_1, df_data_2, df_data_3],
+        datasets=["GSE25097", "GSE62232", "GSE65372"], clean_data=True
+    )
+    """
+    # data = [df_gene_1, df_gene_2, df_gene_3]
+    # datasets = ["GSE25097", "GSE62232", "GSE65372"]
+    # top_genes = 10  # show top 10 genes
+    # plot_ = True
+    from combat.pycombat import pycombat
+    if clean_data:
+        data=[clean_dataset(data=dt,dataset=ds,**kws_clean_dataset) for (dt,ds) in zip(data,datasets)]
+    #! prepare data
+    # the datasets are dataframes where:
+    # the indexes correspond to the gene names
+    # the column names correspond to the sample names
+    #! merge batchs
+    # https://epigenelabs.github.io/pyComBat/
+    # we merge all the datasets into one, by keeping the common genes only
+    df_expression_common_genes = pd.concat(data, join="inner", axis=1)
+    #! convert to float
+    ips.df_astype(df_expression_common_genes, astype="float", inplace=True)
+
+    #!to visualise results, use Mini datasets, only take the first 10 samples of each batch(dataset)
+    if plot_:
+        col2plot = []
+        for ds in datasets:
+            # select the first 10 samples to plot, to see the diff
+            dat_tmp = df_expression_common_genes.columns[
+                df_expression_common_genes.columns.str.startswith(ds)
+            ][:top_genes].tolist()
+            col2plot.extend(dat_tmp)
+        # visualise results
+        _, axs = plt.subplots(2, 1, figsize=(15, 10))
+        plot.plotxy(
+            ax=axs[0],
+            data=df_expression_common_genes.loc[:, col2plot],
+            kind="bar",
+            figsets=dict(
+                title="Samples expression distribution (non-correction)",
+                ylabel="Observations",
+                xangle=90,
+            ),
+        )
+    # prepare batch list
+    batch = [
+        ips.ssplit(i, by="_")[0] for i in df_expression_common_genes.columns.tolist()
+    ]
+    # run pyComBat
+    df_corrected = pycombat(df_expression_common_genes, batch, **kwargs)
+    print(f"df_corrected.shape: {df_corrected.shape}")
+    display(df_corrected.head())
+    # visualise results again
+    if plot_:
+
+        plot.plotxy(
+            ax=axs[1],
+            data=df_corrected.loc[:, col2plot],
+            kind="bar",
+            figsets=dict(
+                title="Samples expression distribution (corrected)",
+                ylabel="Observations",
+                xangle=90,
+            ),
+        )
+        if dir_save is not None:
+            ips.figsave(dir_save + "batch_sample_exp_distri.pdf")
+    return df_corrected
+
+def get_common_genes(elment1, elment2):
+    common_genes=ips.shared(elment1, elment2)
+    return common_genes
py2ls/ips.py
CHANGED
@@ -51,8 +51,6 @@ from bs4 import BeautifulSoup
 
 from . import netfinder
 
-# from .plot import get_color
-
 try:
     get_ipython().run_line_magic("load_ext", "autoreload")
     get_ipython().run_line_magic("autoreload", "2")
@@ -518,6 +516,59 @@ def is_text(s):
     return has_alpha and has_non_alpha
 
 
+from typing import Any, Union
+
+def shared(lst1:Any, lst2:Any,*args, verbose=True):
+    """
+    check the shared elelements in two list.
+    usage:
+        list1 = [1, 2, 3, 4, 5]
+        list2 = [4, 5, 6, 7, 8]
+        list3 = [5, 6, 9, 10]
+        a = shared(list1, list2,list3)
+    """
+    if verbose:
+        print("\n********* checking shared elements *********")
+    if any([not isinstance(lst1,list),not isinstance(lst1,list)]):
+        print(f"{' '*2}type(list1):\t{type(lst1)},\n{' '*2}type(list2):\t{type(lst2)}>")
+    shared_elements=set(flatten(lst1,verbose=verbose)).intersection(flatten(lst2,verbose=verbose))
+    # support more lists
+    if args:
+        for arg in args:
+            shared_elements=shared_elements.intersection(set(flatten(arg,verbose=verbose)))
+    shared_elements = list(shared_elements)
+    if verbose:
+        elements2show = shared_elements if len(shared_elements)<10 else shared_elements[:5]
+        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
+        print("********* checking shared elements *********")
+    return shared_elements
+
+def flatten(nested: Any, unique_list=True,verbose=True):
+    """
+    Recursively flattens a nested structure (lists, tuples, dictionaries, sets) into a single list.
+    Parameters:
+        nested : Any, Can be a list, tuple, dictionary, or set.
+    Returns: list, A flattened list.
+    """
+    flattened_list = []
+    stack = [nested]
+    while stack:
+        current = stack.pop()
+        if isinstance(current, dict):
+            stack.extend(current.values())
+        elif isinstance(current, (list, tuple, set)):
+            stack.extend(current)
+        elif isinstance(current, pd.Series):
+            stack.extend(current)
+        else:
+            flattened_list.append(current)
+    if verbose:
+        print(f"{' '*2}<in info: {len(unique(flattened_list))} elements after flattened>")
+    if unique_list:
+        return unique(flattened_list)
+    else:
+        return flattened_list
+
 def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
     """
     Compares a search term with a list of candidate strings and finds the best match based on similarity score.
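A quick sketch of the two new helpers (shared() round-trips through a set, so the order of the result is not guaranteed):

    from py2ls import ips

    ips.shared([1, 2, 3, 4, 5], [4, 5, 6, 7, 8], [5, 6, 9, 10], verbose=False)
    # -> [5], the only element common to all three lists

    ips.flatten({"a": [1, 2], "b": (3, {4})}, verbose=False)
    # -> [1, 2, 3, 4] in some order; duplicates removed because unique_list=True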
@@ -548,7 +599,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
         similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
     elif "W" in scorer.lower():
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
-    elif "ratio" in scorer.lower():#Ratio (Strictest)
+    elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
         similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
     else:
         similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
@@ -1721,7 +1772,7 @@ def fload(fpath, kind=None, **kwargs):
         fmt=kwargs.pop("fmt",False)
         verbose=kwargs.pop("verbose",False)
         if verbose:
-
+            use_pd("read_csv", verbose=verbose)
             return
 
         if comment is None:
|
|
1853
1904
|
engine = kwargs.get("engine", "openpyxl")
|
1854
1905
|
verbose=kwargs.pop("verbose",False)
|
1855
1906
|
if verbose:
|
1856
|
-
|
1907
|
+
use_pd("read_excel", verbose=verbose)
|
1857
1908
|
df = pd.read_excel(fpath, engine=engine, **kwargs)
|
1858
1909
|
try:
|
1859
1910
|
meata=pd.ExcelFile(fpath)
|
@@ -2263,7 +2314,7 @@ def fsave(
|
|
2263
2314
|
|
2264
2315
|
verbose=kwargs.pop("verbose",False)
|
2265
2316
|
if verbose:
|
2266
|
-
|
2317
|
+
use_pd("to_csv", verbose=verbose)
|
2267
2318
|
kwargs_csv = dict(
|
2268
2319
|
path_or_buf=None,
|
2269
2320
|
sep=",",
|
@@ -2295,7 +2346,7 @@ def fsave(
|
|
2295
2346
|
verbose=kwargs.pop("verbose",False)
|
2296
2347
|
sheet_name = kwargs.pop("sheet_name", "Sheet1")
|
2297
2348
|
if verbose:
|
2298
|
-
|
2349
|
+
use_pd("to_excel", verbose=verbose)
|
2299
2350
|
if any(kwargs):
|
2300
2351
|
format_excel(df=data, filename=fpath, **kwargs)
|
2301
2352
|
else:
|
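All four hunks route verbose=True through the new use_pd() helper, which prints usage notes for the matching pandas function from the bundled usages_pd.json. In the read_csv branch the call returns right after printing, so verbose=True acts as a dry run:

    from py2ls import ips

    ips.fload("data.csv", verbose=True)  # print pd.read_csv options, then return without reading
    ips.use_pd("to_excel")               # or query the cached notes directly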
@@ -4444,7 +4495,42 @@ def preview(var):
 # preview("# This is a Markdown header")
 # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
 # preview({"key": "value", "numbers": [1, 2, 3]})
-
+def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
+    """
+    Extend a DataFrame by the list elecments in the column.
+
+    Parameters:
+    ----------
+    data : pd.DataFrame
+        The input DataFrame to be extended.
+
+    column : str
+        The name of the column to be split.
+
+    axis : int, optional
+        The axis along which to expand the DataFrame.
+        - 0 (default): Expand the specified column into multiple rows.
+        - 1: Expand the specified column into multiple columns.
+
+    sep : str, optional
+        The separator used to split the values in the specified column.
+        Must be provided for the function to work correctly.
+    """
+
+    data = data.copy()
+    mask = data[column].str.contains(sep, na=False)
+    data = data.copy()
+    if mask.any():
+        data[column] = (
+            data[column]
+            .apply(lambda x: x.split(sep) if isinstance(x, str) else x)  # Only split if x is a string
+        )
+
+        # Strip spaces from each item in the lists
+        data[column] = data[column].apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)
+
+        data = data.explode(column, ignore_index=True)
+    return data
 # ! DataFrame
 def df_astype(
     data: pd.DataFrame,
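bio.clean_dataset() relies on this helper to expand '///'-joined gene symbols into separate rows. A small worked example:

    import pandas as pd
    from py2ls import ips

    df = pd.DataFrame({"index": ["TP53", "BRCA1 /// BRCA2"], "GSM1": [1.0, 2.0]})
    out = ips.df_extend(df, column="index", sep="///")
    print(out["index"].tolist())  # ['TP53', 'BRCA1', 'BRCA2']; other columns repeat on explode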
@@ -4731,7 +4817,7 @@ def df_merge(
     """
 
     # 1. Check if indices are comparable (same length and types)
-    if use_index
+    if use_index:
         print(f"Merging based on index using '{how}' join...")
         df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
         return df_merged
@@ -4984,7 +5070,7 @@ def df_cluster(
         X = scaler.fit_transform(X)
 
     for n_cluster in range_n_clusters:
-        kmeans = KMeans(n_clusters=n_cluster, random_state=
+        kmeans = KMeans(n_clusters=n_cluster, random_state=1)
         cluster_labels = kmeans.fit_predict(X)
 
         silhouette_avg = silhouette_score(X, cluster_labels)
|
|
5000
5086
|
print(f"n_clusters = {n_clusters}")
|
5001
5087
|
|
5002
5088
|
# Apply K-Means Clustering with Optimal Number of Clusters
|
5003
|
-
kmeans = KMeans(n_clusters=n_clusters, random_state=
|
5089
|
+
kmeans = KMeans(n_clusters=n_clusters, random_state=1)
|
5004
5090
|
cluster_labels = kmeans.fit_predict(X)
|
5005
5091
|
|
5006
5092
|
if plot:
|
@@ -5101,7 +5187,7 @@ def df_cluster(
|
|
5101
5187
|
# n_clusters = (
|
5102
5188
|
# np.argmax(silhouette_avg_scores) + 2
|
5103
5189
|
# ) # Optimal clusters based on max silhouette score
|
5104
|
-
# kmeans = KMeans(n_clusters=n_clusters, random_state=
|
5190
|
+
# kmeans = KMeans(n_clusters=n_clusters, random_state=1)
|
5105
5191
|
# cluster_labels = kmeans.fit_predict(X)
|
5106
5192
|
silhouette_vals = silhouette_samples(X, cluster_labels)
|
5107
5193
|
|
@@ -5252,12 +5338,14 @@ def df_reducer(
     columns: Optional[List[str]] = None,
     method: str = "umap",  # 'pca', 'umap'
     n_components: int = 2,  # Default for umap, but 50 for PCA
-    umap_neighbors: int = 15,  #
-    umap_min_dist: float = 0.1,  #
+    umap_neighbors: int = 15,  # UMAP-specific
+    umap_min_dist: float = 0.1,  # UMAP-specific
+    tsne_perplexity: int = 30,  # t-SNE-specific
     scale: bool = True,
     fill_missing: bool = True,
     debug: bool = False,
     inplace: bool = True,  # replace the oringinal data
+    plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
 ) -> pd.DataFrame:
     """
     Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
@@ -5293,9 +5381,35 @@ def df_reducer(
     reduced_df : pd.DataFrame
         DataFrame with the reduced dimensions.
     """
-
+
+    """
+    PCA: explained_variance:
+        indicates the proportion of the dataset's total variance that each principal
+        component (PC) explains. It gives you a sense of how much information
+        (or variance) is captured by each PC
+    Interpretation:
+        - Higher values indicate that the corresponding PC captures more variance.
+        - The sum of the explained variances for all PCs equals 1 (or 100%).
+        - If the first few components explain a high percentage (e.g., 90%),
+          it means you can reduce the dimensionality of the data significantly without losing much information.
+    Use case:
+        You may plot a scree plot, which shows the explained variance for each PC, to help decide
+        how many components to keep for analysis.
+
+    PCA: Singular values:
+        represent the magnitude of variance along each principal component. Mathematically,
+        they are the square roots of the eigenvalues of the covariance matrix.
+    Interpretation:
+        Larger singular values indicate that the associated PC captures more variance.
+        Singular values are related to the scale of the data. If the data are scaled
+        before PCA (e.g., standardized), then the singular values will provide a measure
+        of the spread of data along each PC.
+    Use case:
+        Singular values help quantify the contribution of each principal component in a
+        similar way to the explained variance. They are useful in understanding the overall
+        structure of the data.
+    """
     from sklearn.preprocessing import StandardScaler
-    import umap
     from sklearn.impute import SimpleImputer
 
     # Select columns if specified, else use all columns
@@ -5312,76 +5426,211 @@ def df_reducer(
         X = scaler.fit_transform(X)
 
     # Check valid method input
-
-
-
+    methods=["pca", "umap","tsne","factor","isolation_forest"]
+    method=strcmp(method, methods)[0]
     # Apply PCA if selected
-    if method == "pca":
-
-        # to get the n_components with threshold method:
-        pca = PCA()
-        pca_result = pca.fit_transform(X)
-
-        # Calculate explained variance
-        explained_variance = pca.explained_variance_ratio_
-        # Cumulative explained variance
-        cumulative_variance = np.cumsum(explained_variance)
-        # Set a threshold for cumulative variance
-        threshold = 0.95  # Example threshold
-        n_components = (
-            np.argmax(cumulative_variance >= threshold) + 1
-        )  # Number of components to retain
-        if debug:
-            # debug:
-            # Plot the cumulative explained variance
-            plt.figure(figsize=(8, 5))
-            plt.plot(
-                range(1, len(cumulative_variance) + 1),
-                cumulative_variance,
-                marker="o",
-                linestyle="-",
-            )
-            plt.title("Cumulative Explained Variance by Principal Components")
-            plt.xlabel("Number of Principal Components")
-            plt.ylabel("Cumulative Explained Variance")
-            plt.xticks(range(1, len(cumulative_variance) + 1))
-            # Add horizontal line for the threshold
-            plt.axhline(
-                y=threshold, color="r", linestyle="--", label="Threshold (95%)"
-            )
-            # Add vertical line for n_components
-            plt.axvline(
-                x=n_components,
-                color="g",
-                linestyle="--",
-                label=f"n_components = {n_components}",
-            )
-            plt.legend()
-            plt.grid()
+    if method == "pca":
+        from sklearn.decomposition import PCA
         pca = PCA(n_components=n_components)
         X_reduced = pca.fit_transform(X)
-
+
+        # Additional PCA information
+        explained_variance = pca.explained_variance_ratio_
+        singular_values = pca.singular_values_
+        loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
+
+        if debug:
+            print(f"PCA completed: Reduced to {n_components} components.")
+            print(f"Explained Variance: {explained_variance}")
+            print(f"Singular Values: {singular_values}")
+
+        # Plot explained variance if debug=True
+        if debug:
+            # Plot explained variance
+            cumulative_variance = np.cumsum(explained_variance)
+            plt.figure(figsize=(8, 5))
+            plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
+            plt.title("Cumulative Explained Variance by Principal Components")
+            plt.xlabel("Number of Principal Components")
+            plt.ylabel("Cumulative Explained Variance")
+            plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
+            plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
+            plt.legend()
+            plt.grid()
+            plt.show()
+
+        # Prepare reduced DataFrame with additional PCA info
+        pca_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)]
+        )
+        # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
+        # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
+        # Expand explained variance to multiple columns if needed
+        for i in range(n_components):
+            pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
+        for i in range(n_components):
+            pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
 
     # Apply UMAP if selected
     elif method == "umap":
+        import umap
         umap_reducer = umap.UMAP(
             n_neighbors=umap_neighbors,
             min_dist=umap_min_dist,
-            n_components=n_components
+            n_components=n_components
         )
         X_reduced = umap_reducer.fit_transform(X)
-        print(f"UMAP completed: Reduced to {n_components} components.")
 
-
-
+        # Additional UMAP information
+        embedding = umap_reducer.embedding_
+        trustworthiness = umap_reducer._raw_data[:, :n_components]
+
+        if debug:
+            print(f"UMAP completed: Reduced to {n_components} components.")
+            print(f"Embedding Shape: {embedding.shape}")
+            print(f"Trustworthiness: {trustworthiness}")
+
+        # Prepare reduced DataFrame with additional UMAP info
+        umap_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"UMAP_{i+1}" for i in range(n_components)]
+        )
+        umap_df["Embedding"] = embedding[:, 0]  # Example of embedding data
+        umap_df["Trustworthiness"] = trustworthiness[:, 0]  # Trustworthiness metric
+    elif method == "tsne":
+        from sklearn.manifold import TSNE
+        tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
+        X_reduced = tsne.fit_transform(X)
+
+        # Prepare reduced DataFrame with additional t-SNE info
+        tsne_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"tSNE_{i+1}" for i in range(n_components)]
+        )
+        tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
+
+    # Apply Factor Analysis if selected
+    elif method == "factor":
+        from sklearn.decomposition import FactorAnalysis
+        factor = FactorAnalysis(n_components=n_components, random_state=1)
+        X_reduced = factor.fit_transform(X)
+        # Factor Analysis does not directly provide explained variance, but we can approximate it
+        fa_variance = factor.noise_variance_
+        # Prepare reduced DataFrame with additional Factor Analysis info
+        factor_df = pd.DataFrame(
+            X_reduced, index=data.index,
+            columns=[f"Factor_{i+1}" for i in range(n_components)]
+        )
+        factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
+
+    # Apply Isolation Forest for outlier detection if selected
+    elif method == "isolation_forest":
+        from sklearn.decomposition import PCA
+        from sklearn.ensemble import IsolationForest
+        # Step 1: Apply PCA for dimensionality reduction to 2 components
+        pca = PCA(n_components=n_components)
+        X_pca = pca.fit_transform(X)
+
+        explained_variance = pca.explained_variance_ratio_
+        singular_values = pca.singular_values_
+
+        # Prepare reduced DataFrame with additional PCA info
+        iso_forest_df = pd.DataFrame(
+            X_pca, index=data.index,
+            columns=[f"PC_{i+1}" for i in range(n_components)]
+        )
+
+        isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
+        isolation_forest.fit(X)
+        anomaly_scores = isolation_forest.decision_function(X)  # Anomaly score: larger is less anomalous
+        # Predict labels: 1 (normal), -1 (anomaly)
+        anomaly_labels = isolation_forest.fit_predict(X)
+        # Add anomaly scores and labels to the DataFrame
+        iso_forest_df["Anomaly Score"] = anomaly_scores
+        iso_forest_df["Anomaly Label"] = anomaly_labels
+        # add info from pca
+        for i in range(n_components):
+            iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
+        for i in range(n_components):
+            iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
+
+    # Return reduced data and info as a new DataFrame with the same index
+    if method == "pca":
+        reduced_df = pca_df
+        colname_met = "PC_"
+        if plot_:
+            sns.scatterplot(
+                data=pca_df,
+                x="PC_1",
+                y="PC_2",
+                # hue="condition",
+            )
+    elif method == "umap":
+        reduced_df = umap_df
+        colname_met = "UMAP_"
+        if plot_:
+            sns.scatterplot(
+                data=umap_df,
+                x="UMAP_1",
+                y="UMAP_2",
+                # hue="condition",
+            )
+    elif method == "tsne":
+        reduced_df = tsne_df
+        colname_met = "t-SNE_"
+        if plot_:
+            sns.scatterplot(
+                data=tsne_df,
+                x="tSNE_1",
+                y="tSNE_2",
+                # hue="batch",
+            )
+    elif method == "factor":
+        reduced_df = factor_df
+        colname_met = "Factor_"
+        if plot_:
+            sns.scatterplot(
+                data=factor_df,
+                x="Factor_1",
+                y="Factor_2",
+                # hue="batch",
+            )
+    elif method == "isolation_forest":
+        reduced_df = iso_forest_df  # Already a DataFrame for outliers
+        colname_met = "PC_"
+        if plot_:
+            ax = sns.scatterplot(
+                data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
+                x="PC_1",
+                y="PC_2",
+                label="normal", c="b",
+            )
+            ax = sns.scatterplot(
+                ax=ax,
+                data=iso_forest_df[iso_forest_df["Anomaly Label"] == -1],
+                x="PC_1",
+                y="PC_2",
+                c="r",
+                label="outlier", marker="+", s=30,
+            )
 
     if inplace:
-        #
+        # If inplace=True, add components back into the original data
         for col_idx in range(n_components):
-            data[f"
+            data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
+
+        # Add extra info for PCA/UMAP
+        if method == "pca":
+            data["Explained Variance"] = reduced_df["Explained Variance"]
+            data["Singular Values"] = reduced_df["Singular Values"]
+        elif method == "umap":
+            data["Embedding"] = reduced_df["Embedding"]
+            data["Trustworthiness"] = reduced_df["Trustworthiness"]
         return None  # No return when inplace=True
 
-    return reduced_df
+    return reduced_df
 
 
 # example:
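df_reducer() now fuzzy-matches the method name against ['pca', 'umap', 'tsne', 'factor', 'isolation_forest'] and attaches method-specific columns to the result. A sketch on random data (inplace=False so the reduced frame is returned instead of being written back):

    import numpy as np
    import pandas as pd
    from py2ls import ips

    df = pd.DataFrame(np.random.rand(100, 20), columns=[f"g{i}" for i in range(20)])
    red = ips.df_reducer(df, method="tsne", n_components=2, inplace=False)
    print(red.columns.tolist())  # ['tSNE_1', 'tSNE_2', 'Perplexity']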
@@ -5636,7 +5885,7 @@ def evaluate_cluster(
     return metrics
 
 
-def
+def use_pd(
     func_name="excel",
     verbose=True,
     dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
py2ls/plot.py
CHANGED
@@ -16,7 +16,9 @@ from .stats import *
 from .netfinder import get_soup, fetch
 
 # Suppress INFO messages from fontTools
-logging.getLogger("fontTools").setLevel(logging.
+logging.getLogger("fontTools").setLevel(logging.ERROR)
+logging.getLogger('matplotlib').setLevel(logging.ERROR)
+
 
 
 def add_text(ax=None, height_offset=0.5, fmt=".1f", **kwargs):
@@ -3149,7 +3151,7 @@ def volcano(
     x:str,
     y:str,
     gene_col=None,
-    top_genes=5,
+    top_genes=[5, 5],  # [down-regulated, up-regulated]
     thr_x=np.log2(1.5),
     thr_y=-np.log10(0.05),
     colors=("#00BFFF", "#9d9a9a", "#FF3030"),
@@ -3163,7 +3165,11 @@ def volcano(
     ax=None,
     verbose=False,
     kws_text=dict(fontsize=10, color="k"),
-
+    kws_bbox=dict(facecolor='none',
+                  alpha=0.5,
+                  edgecolor='black',
+                  boxstyle='round,pad=0.3'),# '{}' to hide
+    kws_arrow={},
     **kwargs,
 ):
     """
@@ -3179,7 +3185,7 @@ def volcano(
         Column name for y-axis values (e.g., -log10(FDR)).
     gene_col : str, optional
         Column name for gene names. If provided, gene names will be displayed. Default is None.
-    top_genes : int, optional
+    top_genes : int, list, optional
         Number of top genes to label based on y-axis values. Default is 5.
     thr_x : float, optional
         Threshold for x-axis values. Default is 0.585.
@@ -3239,14 +3245,22 @@ def volcano(
         colors[2],
         np.where((data[x] < -thr_x) & (data[y] > thr_y), colors[0], colors[1]),
     )
+    top_genes=[top_genes, top_genes] if isinstance(top_genes,int) else top_genes
+
+    down_reg_genes = data[
+        (data["color"] == colors[0]) &
+        (data[x].abs() > thr_x) &
+        (data[y] > thr_y)
+    ].sort_values(by=[y, x], ascending=[False, True]).head(top_genes[0])
+
+    # Selecting top upregulated genes based on both p-value and fold change
+    up_reg_genes = data[
+        (data["color"] == colors[2]) &
+        (data[x].abs() > thr_x) &
+        (data[y] > thr_y)
+    ].sort_values(by=[y, x], ascending=[False, False]).head(top_genes[1])
+    sele_gene = pd.concat([down_reg_genes, up_reg_genes])
 
-    # Selecting top significant points for labeling
-    sele_gene = (
-        data.query("color != @colors[2]")  # Exclude gray points
-        .groupby("color", axis=0)
-        .apply(lambda x: x.sort_values(y, ascending=False).head(top_genes))
-        .droplevel(level=0)
-    )
     palette = {colors[0]: colors[0], colors[1]: colors[1], colors[2]: colors[2]}
     # Plot setup
     if ax is None:
@@ -3277,9 +3291,9 @@ def volcano(
     )
 
     # Add threshold lines for x and y axes
-
-
-
+    ax.axhline(y=thr_y, color="black", linestyle="--",lw=1)
+    ax.axvline(x=-thr_x, color="black", linestyle="--",lw=1)
+    ax.axvline(x=thr_x, color="black", linestyle="--",lw=1)
 
     # Add gene labels for selected significant points
     if gene_col:
@@ -3288,14 +3302,29 @@ def volcano(
         fontname = kws_text.pop("fontname", "Arial")
         textcolor = kws_text.pop("color", "k")
         fontsize = kws_text.pop("fontsize", 10)
+        arrowstyles = [
+            "->","<-","<->","<|-","-|>","<|-|>",
+            "-","-[","-[",
+            "fancy","simple","wedge",
+        ]
+        arrowstyle = kws_arrow.pop("style", "<|-")
+        arrowstyle = strcmp(arrowstyle, arrowstyles,scorer='strict')[0]
+        expand=kws_arrow.pop("expand",(1.05,1.1))
+        arrowcolor = kws_arrow.pop("color", "0.4")
+        arrowlinewidth = kws_arrow.pop("lw", 0.75)
+        shrinkA = kws_arrow.pop("shrinkA", 0)
+        shrinkB = kws_arrow.pop("shrinkB", 0)
+        mutation_scale = kws_arrow.pop("head", 10)
+        arrow_fill=kws_arrow.pop("fill", False)
         for i in range(sele_gene.shape[0]):
             if isinstance(textcolor, list):  # be consistant with dots's color
                 textcolor = colors[0] if sele_gene[x].iloc[i] > 0 else colors[1]
             texts.append(
-
+                ax.text(
                     x=sele_gene[x].iloc[i],
                     y=sele_gene[y].iloc[i],
                     s=sele_gene[gene_col].iloc[i],
+                    bbox=kws_bbox if kws_bbox else None,
                     fontdict={
                         "fontsize": fontsize,
                         "color": textcolor,
@@ -3303,40 +3332,31 @@ def volcano(
                     },
                 )
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            texts,
-            expand_text=(1.05, 1.2),
-            arrowprops=dict(
-                arrowstyle=arrowstyle,
-                color=arrowcolor,
-                lw=arrowlinewidth,
-                shrinkA=shrinkA,
-                shrinkB=shrinkB,
-                **kws_arrow,
-            ),
+        print(arrowstyle)
+        adjust_text(
+            texts,
+            expand=expand,
+            min_arrow_len=5,
+            # force_explode=(0.1, 0.5),
+            # force_text=(0.1, 0.5),
+            # force_points=(0.1, 0.5),
+            # explode_radius=10,
+            # expand_text=(1, 1),
+            # expand_points=(1, 1),
+            # ha='center',
+            # va='top',
+            ax=ax,
+            arrowprops=dict(
+                arrowstyle=arrowstyle,
+                fill=arrow_fill,
+                color=arrowcolor,
+                lw=arrowlinewidth,
+                shrinkA=shrinkA,
+                shrinkB=shrinkB,
+                mutation_scale=mutation_scale,
+                **kws_arrow,
             )
+        )
 
     figsets(**kws_figsets)
 
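With top_genes now accepting a [down, up] pair and the bbox/arrow keywords exposed, a volcano() call might look like the sketch below (df_de and its column names are placeholders for your own differential-expression table):

    from py2ls import plot

    plot.volcano(
        data=df_de,
        x="log2FoldChange",
        y="neg_log10_fdr",
        gene_col="gene_symbol",
        top_genes=[5, 10],                      # label 5 down- and 10 up-regulated genes
        kws_arrow=dict(style="->", color="0.4"),
        kws_bbox={},                            # '{}' hides the label boxes
    )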
py2ls-0.2.4.2.dist-info/RECORD → py2ls-0.2.4.3.dist-info/RECORD
CHANGED
@@ -173,7 +173,7 @@ py2ls/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
 py2ls/README.md,sha256=CwvJWAnSXnCnrVHlnEbrxxi6MbjbE_MT6DH2D53S818,11572
 py2ls/__init__.py,sha256=Nn8jTIvySX7t7DMJ8VNRVctTStgXGjHldOIdZ35PdW8,165
 py2ls/batman.py,sha256=E7gYofbDzN7S5oCmO_dd5Z1bxxhoYMJSD6s-VaF388E,11398
-py2ls/bio.py,sha256=
+py2ls/bio.py,sha256=FnEf4RV4LBUQfLefWIpIFszVRYeXjnRlc5261DINIdg,18835
 py2ls/brain_atlas.py,sha256=w1o5EelRjq89zuFJUNSz4Da8HnTCwAwDAZ4NU4a-bAY,5486
 py2ls/chat.py,sha256=Yr22GoIvoWhpV3m4fdwV_I0Mn77La346_ymSinR-ORA,3793
 py2ls/correlators.py,sha256=RbOaJIPLCHJtUm5SFi_4dCJ7VFUPWR0PErfK3K26ad4,18243
@@ -213,15 +213,15 @@ py2ls/export_requirements.py,sha256=x2WgUF0jYKz9GfA1MVKN-MdsM-oQ8yUeC6Ua8oCymio,
 py2ls/fetch_update.py,sha256=9LXj661GpCEFII2wx_99aINYctDiHni6DOruDs_fdt8,4752
 py2ls/freqanalysis.py,sha256=F4218VSPbgL5tnngh6xNCYuNnfR-F_QjECUUxrPYZss,32594
 py2ls/ich2ls.py,sha256=3E9R8oVpyYZXH5PiIQgT3CN5NxLe4Dwtm2LwaeacE6I,21381
-py2ls/ips.py,sha256=
+py2ls/ips.py,sha256=yYSpbHIGDfLK2SXtTX4f--H5oa885pggXePEbhiNRsw,220887
 py2ls/netfinder.py,sha256=LwBkGITB_4BTNtY6RlKdEZVFW6epzMWlnqy2g03KtyU,56117
 py2ls/ocr.py,sha256=5lhUbJufIKRSOL6wAWVLEo8TqMYSjoI_Q-IO-_4u3DE,31419
-py2ls/plot.py,sha256=
+py2ls/plot.py,sha256=B_npRfO2rZJJjcYSQ7YMZt2LZTG0mU08JCDnM6zAVx4,136956
 py2ls/setuptools-70.1.0-py3-none-any.whl,sha256=2bi3cUVal8ip86s0SOvgspteEF8SKLukECi-EWmFomc,882588
 py2ls/sleep_events_detectors.py,sha256=bQA3HJqv5qnYKJJEIhCyhlDtkXQfIzqksnD0YRXso68,52145
 py2ls/stats.py,sha256=DMoJd8Z5YV9T1wB-4P52F5K5scfVK55DT8UP4Twcebo,38627
 py2ls/translator.py,sha256=zBeq4pYZeroqw3DT-5g7uHfVqKd-EQptT6LJ-Adi8JY,34244
 py2ls/wb_detector.py,sha256=7y6TmBUj9exCZeIgBAJ_9hwuhkDh1x_-yg4dvNY1_GQ,6284
-py2ls-0.2.4.
-py2ls-0.2.4.
-py2ls-0.2.4.
+py2ls-0.2.4.3.dist-info/METADATA,sha256=S4Il5phQ0Vx8U7VrlEUopkX-hfwcKKQi-qkfD2EYI1g,20038
+py2ls-0.2.4.3.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+py2ls-0.2.4.3.dist-info/RECORD,,
File without changes