py2ls 0.2.4.2__py3-none-any.whl → 0.2.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/bio.py CHANGED
@@ -4,8 +4,13 @@ import pandas as pd
4
4
  import os
5
5
  import logging
6
6
  from . import ips
7
+ from . import plot
8
+ import matplotlib.pyplot as plt
9
+
7
10
  def load_geo(
8
- datasets: Union[list, str] = ["GSE00000", "GSE00001"], dir_save: str = "./datasets", verbose=False
11
+ datasets: Union[list, str] = ["GSE00000", "GSE00001"],
12
+ dir_save: str = "./datasets",
13
+ verbose=False,
9
14
  ) -> dict:
10
15
  """
11
16
  Check if GEO datasets are already in the directory, and download them if not.
@@ -17,7 +22,7 @@ def load_geo(
17
22
  Returns:
18
23
  dict: A dictionary containing the GEO objects for each dataset.
19
24
  """
20
- use_str="""
25
+ use_str = """
21
26
  get_meta(geo: dict, dataset: str = "GSE25097")
22
27
  get_expression_data(geo: dict, dataset: str = "GSE25097")
23
28
  get_probe(geo: dict, dataset: str = "GSE25097", platform_id: str = "GPL10687")
@@ -51,7 +56,7 @@ def load_geo(
51
56
  return geo_data
52
57
 
53
58
 
54
- def get_meta(geo: dict, dataset: str = "GSE25097",verbose=True) -> pd.DataFrame:
59
+ def get_meta(geo: dict, dataset: str = "GSE25097", verbose=True) -> pd.DataFrame:
55
60
  """
56
61
  df_meta = get_meta(geo, dataset="GSE25097")
57
62
  Extracts metadata from a specific GEO dataset and returns it as a DataFrame.
@@ -122,23 +127,29 @@ def get_meta(geo: dict, dataset: str = "GSE25097",verbose=True) -> pd.DataFrame:
122
127
  print(
123
128
  f"Meta info columns for dataset '{dataset}': \n{sorted(meta_df.columns.tolist())}"
124
129
  )
125
- return meta_df
130
+ display(meta_df[:3].T)
131
+ return meta_df
132
+
126
133
 
127
- def get_probe(geo: dict, dataset: str = "GSE25097", platform_id: str = None, verbose=True):
134
+ def get_probe(
135
+ geo: dict, dataset: str = "GSE25097", platform_id: str = None, verbose=True
136
+ ):
128
137
  """
129
138
  df_probe = get_probe(geo, dataset="GSE25097", platform_id: str = "GPL10687")
130
139
  """
131
140
  # try to find the platform_id from meta
132
141
  if platform_id is None:
133
- df_meta=get_meta(geo=geo, dataset=dataset,verbose=False)
134
- platform_id=df_meta["platform_id"].unique().tolist()
135
- platform_id = platform_id[0] if len(platform_id)==1 else platform_id
142
+ df_meta = get_meta(geo=geo, dataset=dataset, verbose=False)
143
+ platform_id = df_meta["platform_id"].unique().tolist()
144
+ platform_id = platform_id[0] if len(platform_id) == 1 else platform_id
136
145
  print(platform_id)
137
146
  df_probe = geo[dataset].gpls[platform_id].table
138
147
  if df_probe.empty:
139
- print(f"above is meta info, failed to find the probe info. 看一下是不是在单独的文件中包含了probe信息")
148
+ print(
149
+ f"above is meta info, failed to find the probe info. 看一下是不是在单独的文件中包含了probe信息"
150
+ )
140
151
  return get_meta(geo, dataset, verbose=True)
141
- if verbose:
152
+ if verbose:
142
153
  print(f"columns in the probe table: \n{sorted(df_probe.columns.tolist())}")
143
154
  return df_probe
144
155
 
@@ -170,17 +181,18 @@ def get_expression_data(geo: dict, dataset: str = "GSE25097") -> pd.DataFrame:
170
181
  return expression_values
171
182
 
172
183
 
173
-
174
- def get_data(geo: dict, dataset: str = "GSE25097",verbose=True):
184
+ def get_data(geo: dict, dataset: str = "GSE25097", verbose=True):
175
185
  # get probe info
176
- df_probe = get_probe(geo,dataset=dataset,verbose=False)
186
+ df_probe = get_probe(geo, dataset=dataset, verbose=False)
177
187
  # get expression values
178
- df_expression = get_expression_data(geo, dataset=dataset )
188
+ df_expression = get_expression_data(geo, dataset=dataset)
179
189
  print(
180
190
  f"df_expression.shape: {df_expression.shape} \ndf_probe.shape: {df_probe.shape}"
181
191
  )
182
192
  if any([df_probe.empty, df_expression.empty]):
183
- print(f"above is meta info, failed to find the probe info. 看一下是不是在单独的文件中包含了probe信息")
193
+ print(
194
+ f"above is meta info, failed to find the probe info. 看一下是不是在单独的文件中包含了probe信息"
195
+ )
184
196
  return get_meta(geo, dataset, verbose=True)
185
197
  df_exp = pd.merge(
186
198
  df_probe,
@@ -191,27 +203,48 @@ def get_data(geo: dict, dataset: str = "GSE25097",verbose=True):
191
203
  )
192
204
 
193
205
  # get meta info
194
- df_meta=get_meta(geo, dataset=dataset,verbose=False)
195
- col_rm=['channel_count','contact_web_link','contact_address', 'contact_city', 'contact_country', 'contact_department', 'contact_email', 'contact_institute', 'contact_laboratory', 'contact_name', 'contact_phone', 'contact_state', 'contact_zip/postal_code', 'contributor', 'manufacture_protocol', 'taxid','web_link']
206
+ df_meta = get_meta(geo, dataset=dataset, verbose=False)
207
+ col_rm = [
208
+ "channel_count",
209
+ "contact_web_link",
210
+ "contact_address",
211
+ "contact_city",
212
+ "contact_country",
213
+ "contact_department",
214
+ "contact_email",
215
+ "contact_institute",
216
+ "contact_laboratory",
217
+ "contact_name",
218
+ "contact_phone",
219
+ "contact_state",
220
+ "contact_zip/postal_code",
221
+ "contributor",
222
+ "manufacture_protocol",
223
+ "taxid",
224
+ "web_link",
225
+ ]
196
226
  # rm unrelavent columns
197
227
  df_meta = df_meta.drop(columns=[col for col in col_rm if col in df_meta.columns])
198
228
  # sorte columns
199
- df_meta = df_meta.reindex(sorted(df_meta.columns),axis=1)
229
+ df_meta = df_meta.reindex(sorted(df_meta.columns), axis=1)
200
230
  # find a proper column
201
- col_sample_id = ips.strcmp("sample_id",df_meta.columns.tolist())[0]
202
- df_meta.set_index(col_sample_id, inplace=True) # set gene symbol as index
203
-
204
- col_gene_symbol = ips.strcmp("GeneSymbol",df_exp.columns.tolist())[0]
231
+ col_sample_id = ips.strcmp("sample_id", df_meta.columns.tolist())[0]
232
+ df_meta.set_index(col_sample_id, inplace=True) # set gene symbol as index
233
+
234
+ col_gene_symbol = ips.strcmp("GeneSymbol", df_exp.columns.tolist())[0]
205
235
  # select the 'GSM' columns
206
236
  col_gsm = df_exp.columns[df_exp.columns.str.startswith("GSM")].tolist()
207
237
  df_exp.set_index(col_gene_symbol, inplace=True)
208
- df_exp=df_exp[col_gsm].T # transpose, so that could add meta info
209
-
210
- df_merged=ips.df_merge(df_meta,df_exp)
238
+ df_exp = df_exp[col_gsm].T # transpose, so that could add meta info
239
+
240
+ df_merged = ips.df_merge(df_meta, df_exp)
211
241
  if verbose:
212
- print(f"\ndataset:'{dataset}' n_sample = {df_merged.shape[0]}, n_gene={df_exp.shape[1]}")
213
- display(df_merged.sample(10))
214
- return df_merged
242
+ print(
243
+ f"\ndataset:'{dataset}' n_sample = {df_merged.shape[0]}, n_gene={df_exp.shape[1]}"
244
+ )
245
+ display(df_merged.sample(5))
246
+ return df_merged
247
+
215
248
 
216
249
  def split_at_lower_upper(lst):
217
250
  """
@@ -228,16 +261,17 @@ def split_at_lower_upper(lst):
228
261
  return lst[: i + 1], lst[i + 1 :]
229
262
  return lst, []
230
263
 
231
- def get_condition(
264
+
265
+ def add_condition(
232
266
  data: pd.DataFrame,
233
- column:str="characteristics_ch1",#在哪一行进行分类
234
- column_new:str="condition",# 新col的命名
235
- by:str="tissue: tumor liver",# 通过by来命名
236
- by_not:str=": tumor", # 健康的选择条件
237
- by_name:str="non-tumor", # 健康的命名
238
- by_not_name:str="tumor", # 不健康的命名
239
- inplace: bool = True, #replace the data
240
- verbose:bool = True
267
+ column: str = "characteristics_ch1", # 在哪一行进行分类
268
+ column_new: str = "condition", # 新col的命名
269
+ by: str = "tissue: tumor liver", # 通过by来命名
270
+ by_not: str = ": tumor", # 健康的选择条件
271
+ by_name: str = "non-tumor", # 健康的命名
272
+ by_not_name: str = "tumor", # 不健康的命名
273
+ inplace: bool = True, # replace the data
274
+ verbose: bool = True,
241
275
  ):
242
276
  """
243
277
  Add a new column to the DataFrame based on the presence of a specific substring in another column.
@@ -255,18 +289,225 @@ def get_condition(
255
289
 
256
290
  """
257
291
  # first check the content in column
258
- content=data[column].unique().tolist()
292
+ content = data[column].unique().tolist()
259
293
  if verbose:
260
- if len(content)>10:
294
+ if len(content) > 10:
261
295
  display(content[:10])
262
296
  else:
263
297
  display(content)
264
298
  # 优先by
265
299
  if by:
266
- data[column_new] = data[column].apply(lambda x: by_name if by in x else by_not_name)
300
+ data[column_new] = data[column].apply(
301
+ lambda x: by_name if by in x else by_not_name
302
+ )
267
303
  elif by_not:
268
- data[column_new] = data[column].apply(lambda x: by_not_name if not by_not in x else by_name)
304
+ data[column_new] = data[column].apply(
305
+ lambda x: by_not_name if not by_not in x else by_name
306
+ )
307
+ if verbose:
308
+ display(data)
309
+ if not inplace:
310
+ return data
311
+
312
+
313
def add_condition_multi(
    data: pd.DataFrame,
    column: str = "characteristics_ch1",  # Column to classify
    column_new: str = "condition",  # New column name
    conditions: dict = {
        "low": "low",
        "high": "high",
        "intermediate": "intermediate",
    },  # substring -> label; only read here, never mutated
    default_name: str = "unknown",  # Label used when no substring matches
    inplace: bool = True,  # Whether to modify the passed DataFrame
    verbose: bool = True,
):
    """
    Add a label column based on which substring occurs in another column.

    The substrings in ``conditions`` are tested in dictionary order and the
    first one contained in a cell decides that cell's label.

    Parameters
    ----------
    data : pd.DataFrame
        The input DataFrame containing the data.
    column : str, optional
        The column in which to search for the substrings
        (default is 'characteristics_ch1').
    column_new : str, optional
        The name of the column to create (default is 'condition').
    conditions : dict, optional
        Mapping of substring -> label.
    default_name : str, optional
        Label assigned when no substring matches, or when the cell does not
        support a containment test (e.g. NaN/None) (default is 'unknown').
    inplace : bool, optional
        If True (default), ``data`` itself is modified and None is returned.
        If False, ``data`` is left untouched and a modified copy is returned.
    verbose : bool, optional
        Whether to display the unique values of ``column`` (at most the first
        10) and the final DataFrame (default is True).

    Returns
    -------
    pd.DataFrame or None
        The labelled copy when ``inplace`` is False, otherwise None.

    Raises
    ------
    ValueError
        If ``conditions`` is None.
    """
    if conditions is None:
        raise ValueError(
            "Conditions must be provided as a dictionary with substrings and corresponding labels."
        )

    # Work on a copy so that inplace=False really leaves the caller's frame alone.
    if not inplace:
        data = data.copy()

    # Show what the column contains so the user can sanity-check the substrings.
    content = data[column].unique().tolist()
    if verbose:
        display(content[:10] if len(content) > 10 else content)

    def map_condition(value):
        """Return the label of the first matching substring, else the default."""
        for substring, label in conditions.items():
            try:
                if substring in value:
                    return label
            except TypeError:
                # Cell is not a container/string (e.g. NaN, None): no match possible.
                return default_name
        return default_name

    data[column_new] = data[column].apply(map_condition)

    if verbose:
        display(data)

    if not inplace:
        return data
377
+
378
def clean_dataset(
    data: pd.DataFrame, dataset: str = "GSE25097", condition: str = "condition", sep="///"
):
    """
    Turn a merged meta+expression table into a genes x samples table.

    It can also be run from ``batch_effect`` (``clean_data=True``; off by default).

    Steps:
    1. Locate the first gene column (via ``split_at_lower_upper`` and
       ``ips.strcmp``) and keep every column from there on, transposed so
       that rows are genes and columns are samples.
    2. Split row names containing ``sep`` into one row per name
       (``ips.df_extend``).
    3. Drop duplicated row names and rows containing NaN.
    4. Rename each sample column to ``"<dataset>_<sample>_<condition>"``.
       The dataset label is read from the first value of ``data["dataset"]``,
       and the per-sample condition from the row whose name equals ``condition``.
    5. Remove that condition row and set the gene names as index.

    Parameters
    ----------
    data : pd.DataFrame
        Merged table; presumably meta columns on the left and gene columns
        (followed by the condition column) on the right -- verify against caller.
    dataset : str
        Currently unused; the label comes from ``data["dataset"]``.
    condition : str
        Name of the row (after transposition) holding each sample's condition.
    sep : str
        Separator used to split multi-gene row names.

    Returns
    -------
    pd.DataFrame
        Genes as index, renamed samples as columns.
    """
    # (1) gene part of the table, transposed (the trailing condition column is kept)
    all_columns = data.columns.tolist()
    first_gene_col = split_at_lower_upper(all_columns)[1][0]
    idx = ips.strcmp(first_gene_col, all_columns)[1]
    df_gene = data.iloc[:, idx:].T

    # (2) one row per gene name when several names are joined by `sep`
    print(f"before extend shape: {df_gene.shape}")
    df_gene = ips.df_extend(df_gene.reset_index(), column="index", sep=sep)
    print(f"after extended by '{sep}' shape: {df_gene.shape}")

    # (3) drop duplicates and dropna()
    df_gene = df_gene.drop_duplicates(subset=["index"]).dropna()
    print(f"drop duplicates and dropna: shape: {df_gene.shape}")

    # (4) add the dataset label and the condition to the sample column names
    # .iloc[0] takes the first row by position, whatever the index labels are.
    dataset_label = data["dataset"].iloc[0]
    sample_names = df_gene.columns.tolist()[1:]
    is_condition_row = df_gene["index"] == condition
    sample_conditions = df_gene[is_condition_row].values.tolist()[0][1:]
    df_gene.columns = ["index"] + [
        f"{dataset_label}_{sample}_{cond}"
        for sample, cond in zip(sample_names, sample_conditions)
    ]
    df_gene.drop(df_gene[is_condition_row].index, inplace=True)

    # (5) set genes as index
    df_gene.set_index("index", inplace=True)
    display(df_gene.head())
    return df_gene
421
+
422
def batch_effect(
    data: list = "[df_gene_1, df_gene_2, df_gene_3]",
    datasets: list = ["GSE25097", "GSE62232", "GSE65372"],
    clean_data: bool = False,  # default: no data cleaning
    top_genes: int = 10,  # only for plotting
    plot_=True,
    dir_save="./res/",
    kws_clean_dataset: dict = {},
    **kwargs
):
    """
    Merge several expression tables on their common genes and correct batch
    effects with pyComBat.

    usage 1:
        bio.batch_effect(
            data=[df_gene_1, df_gene_2, df_gene_3],
            datasets=["GSE25097", "GSE62232", "GSE65372"],
            clean_data=False,
            dir_save="./res/")

    usage 2 (clean_dataset and batch_effect combined):
        data_common = bio.batch_effect(
            data=[df_data_1, df_data_2, df_data_3],
            datasets=["GSE25097", "GSE62232", "GSE65372"], clean_data=True
        )

    Parameters
    ----------
    data : list of pd.DataFrame
        One table per dataset, genes as index and samples as columns. The
        string default is only a placeholder and must be replaced by a real list.
        Column names are expected to start with the dataset id followed by
        "_", since the batch of each column is taken from the text before the
        first "_".
    datasets : list of str
        Dataset ids, in the same order as ``data``.
    clean_data : bool
        If True, every table is first passed through ``clean_dataset``.
    top_genes : int
        Number of *samples* (columns) per dataset shown in the plots.
    plot_ : bool
        Draw bar plots before and after correction.
    dir_save : str or None
        Directory in which the figure is saved; None disables saving.
    kws_clean_dataset : dict
        Extra keyword arguments for ``clean_dataset`` (only read, not mutated).
    **kwargs
        Forwarded to ``pycombat``.

    Returns
    -------
    pd.DataFrame
        The batch-corrected expression table.
    """
    from combat.pycombat import pycombat

    if clean_data:
        data = [
            clean_dataset(data=dt, dataset=ds, **kws_clean_dataset)
            for dt, ds in zip(data, datasets)
        ]

    # Merge batches, keeping the common genes only
    # (https://epigenelabs.github.io/pyComBat/).
    df_expression_common_genes = pd.concat(data, join="inner", axis=1)
    # convert to float
    ips.df_astype(df_expression_common_genes, astype="float", inplace=True)
    sample_columns = df_expression_common_genes.columns

    # To visualise results, use a mini dataset: the first `top_genes`
    # samples of each batch (dataset).
    if plot_:
        col2plot = []
        for ds in datasets:
            col2plot.extend(
                sample_columns[sample_columns.str.startswith(ds)][:top_genes].tolist()
            )
        _, axs = plt.subplots(2, 1, figsize=(15, 10))
        plot.plotxy(
            ax=axs[0],
            data=df_expression_common_genes.loc[:, col2plot],
            kind="bar",
            figsets=dict(
                title="Samples expression distribution (non-correction)",
                ylabel="Observations",
                xangle=90,
            ),
        )

    # Batch label of each sample = text before the first "_" of its column name.
    batch = [ips.ssplit(name, by="_")[0] for name in sample_columns.tolist()]

    # run pyComBat
    df_corrected = pycombat(df_expression_common_genes, batch, **kwargs)
    print(f"df_corrected.shape: {df_corrected.shape}")
    display(df_corrected.head())

    # visualise results again
    if plot_:
        plot.plotxy(
            ax=axs[1],
            data=df_corrected.loc[:, col2plot],
            kind="bar",
            figsets=dict(
                title="Samples expression distribution (corrected)",
                ylabel="Observations",
                xangle=90,
            ),
        )
        # Only save when a figure was actually drawn; join the path properly so
        # that a dir_save without trailing slash still points into the folder.
        if dir_save is not None:
            ips.figsave(os.path.join(dir_save, "batch_sample_exp_distri.pdf"))
    return df_corrected
510
+
511
def get_common_genes(elment1, elment2):
    """Return the elements that ``ips.shared`` finds in both inputs."""
    return ips.shared(elment1, elment2)
py2ls/ips.py CHANGED
@@ -51,8 +51,6 @@ from bs4 import BeautifulSoup
51
51
 
52
52
  from . import netfinder
53
53
 
54
- # from .plot import get_color
55
-
56
54
  try:
57
55
  get_ipython().run_line_magic("load_ext", "autoreload")
58
56
  get_ipython().run_line_magic("autoreload", "2")
@@ -518,6 +516,59 @@ def is_text(s):
518
516
  return has_alpha and has_non_alpha
519
517
 
520
518
 
519
+ from typing import Any, Union
520
+
521
def shared(lst1: Any, lst2: Any, *args, verbose=True):
    """
    Return the elements shared by two or more (possibly nested) collections.

    Every input is flattened with ``flatten`` before the intersection is
    taken, so nested lists/tuples/sets/dicts are accepted. The order of the
    returned list is not defined, because it is built from a set.

    usage:
        list1 = [1, 2, 3, 4, 5]
        list2 = [4, 5, 6, 7, 8]
        list3 = [5, 6, 9, 10]
        a = shared(list1, list2, list3)

    Parameters:
        lst1, lst2 : the first two collections to compare.
        *args      : any number of additional collections.
        verbose    : print progress information and a preview of the result.
    Returns: list, the elements present in every input.
    """
    if verbose:
        print("\n********* checking shared elements *********")
        # Tell the user when either input is not a plain list.
        if not (isinstance(lst1, list) and isinstance(lst2, list)):
            print(f"{' '*2}type(list1):\t{type(lst1)},\n{' '*2}type(list2):\t{type(lst2)}>")

    shared_elements = set(flatten(lst1, verbose=verbose))
    # Intersect with the second input and with every extra one.
    for other in (lst2, *args):
        shared_elements = shared_elements.intersection(flatten(other, verbose=verbose))
    shared_elements = list(shared_elements)

    if verbose:
        # Preview: everything when short, otherwise the first five only.
        elements2show = shared_elements if len(shared_elements) < 10 else shared_elements[:5]
        print(f"{' '*2}{len(shared_elements)} elements shared: {' '*2}{elements2show}")
        print("********* checking shared elements *********")
    return shared_elements
545
+
546
def flatten(nested: Any, unique_list=True, verbose=True):
    """
    Flatten a nested structure into a single list, keeping the original order.

    Lists, tuples, sets, pandas Series and pandas Index objects are expanded
    element by element; for dictionaries only the values are expanded. Any
    other object (including strings) is treated as a leaf.

    Parameters:
        nested      : Any, the structure to flatten.
        unique_list : bool, if True the result is passed through ``unique``
                      before being returned.
        verbose     : bool, print how many unique elements were found.
    Returns: list, the flattened (and optionally de-duplicated) elements.
    """
    flattened_list = []
    stack = [nested]
    while stack:
        current = stack.pop()
        if isinstance(current, dict):
            children = list(current.values())
        elif isinstance(current, (list, tuple, set, pd.Series, pd.Index)):
            children = list(current)
        else:
            flattened_list.append(current)
            continue
        # The stack is LIFO: push children reversed so they come out in order.
        stack.extend(reversed(children))

    # Compute the de-duplicated list once and only when it is needed.
    if verbose or unique_list:
        unique_elements = unique(flattened_list)
    if verbose:
        print(f"{' '*2}<in info: {len(unique_elements)} elements after flattened>")
    if unique_list:
        return unique_elements
    else:
        return flattened_list
571
+
521
572
  def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"):
522
573
  """
523
574
  Compares a search term with a list of candidate strings and finds the best match based on similarity score.
@@ -548,7 +599,7 @@ def strcmp(search_term, candidates, ignore_case=True, verbose=False, scorer="WR"
548
599
  similarity_scores = [fuzz.partial_ratio(str1_, word) for word in str2_]
549
600
  elif "W" in scorer.lower():
550
601
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
551
- elif "ratio" in scorer.lower():#Ratio (Strictest)
602
+ elif "ratio" in scorer.lower() or "stri" in scorer.lower():#Ratio (Strictest)
552
603
  similarity_scores = [fuzz.ratio(str1_, word) for word in str2_]
553
604
  else:
554
605
  similarity_scores = [fuzz.WRatio(str1_, word) for word in str2_]
@@ -1721,7 +1772,7 @@ def fload(fpath, kind=None, **kwargs):
1721
1772
  fmt=kwargs.pop("fmt",False)
1722
1773
  verbose=kwargs.pop("verbose",False)
1723
1774
  if verbose:
1724
- print_pd_usage("read_csv", verbose=verbose)
1775
+ use_pd("read_csv", verbose=verbose)
1725
1776
  return
1726
1777
 
1727
1778
  if comment is None:
@@ -1853,7 +1904,7 @@ def fload(fpath, kind=None, **kwargs):
1853
1904
  engine = kwargs.get("engine", "openpyxl")
1854
1905
  verbose=kwargs.pop("verbose",False)
1855
1906
  if verbose:
1856
- print_pd_usage("read_excel", verbose=verbose)
1907
+ use_pd("read_excel", verbose=verbose)
1857
1908
  df = pd.read_excel(fpath, engine=engine, **kwargs)
1858
1909
  try:
1859
1910
  meata=pd.ExcelFile(fpath)
@@ -2263,7 +2314,7 @@ def fsave(
2263
2314
 
2264
2315
  verbose=kwargs.pop("verbose",False)
2265
2316
  if verbose:
2266
- print_pd_usage("to_csv", verbose=verbose)
2317
+ use_pd("to_csv", verbose=verbose)
2267
2318
  kwargs_csv = dict(
2268
2319
  path_or_buf=None,
2269
2320
  sep=",",
@@ -2295,7 +2346,7 @@ def fsave(
2295
2346
  verbose=kwargs.pop("verbose",False)
2296
2347
  sheet_name = kwargs.pop("sheet_name", "Sheet1")
2297
2348
  if verbose:
2298
- print_pd_usage("to_excel", verbose=verbose)
2349
+ use_pd("to_excel", verbose=verbose)
2299
2350
  if any(kwargs):
2300
2351
  format_excel(df=data, filename=fpath, **kwargs)
2301
2352
  else:
@@ -4444,7 +4495,42 @@ def preview(var):
4444
4495
  # preview("# This is a Markdown header")
4445
4496
  # preview(pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]}))
4446
4497
  # preview({"key": "value", "numbers": [1, 2, 3]})
4447
-
4498
def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
    """
    Extend a DataFrame by the list elements in a column (one row per element).

    String cells of ``column`` are split on ``sep`` (a literal separator, not
    a regular expression), every resulting item is stripped of surrounding
    whitespace, and the column is then exploded so that each item gets its
    own row. The other columns are repeated for the new rows and the index is
    renumbered. The input DataFrame is not modified.

    Parameters:
    ----------
    data : pd.DataFrame
        The input DataFrame to be extended.

    column : str
        The name of the column to be split.

    axis : int, optional
        Accepted for compatibility but currently ignored: the column is
        always expanded into multiple rows.

    sep : str, optional
        The separator used to split the string values of ``column``. If None,
        no splitting is done and only cells that already hold lists are
        expanded.

    prefix : str, optional
        Accepted for compatibility but currently ignored.

    Returns:
    -------
    pd.DataFrame
        A new DataFrame with one row per item.
    """
    data = data.copy()

    if sep is not None:
        # Literal containment test: separators such as "+" or "(" must not be
        # interpreted as regular expressions.
        has_sep = data[column].apply(lambda x: isinstance(x, str) and sep in x)
        if has_sep.any():
            # Only split strings; leave NaN and other values untouched.
            data[column] = data[column].apply(
                lambda x: x.split(sep) if isinstance(x, str) else x
            )

    # Strip spaces from each item in the lists
    data[column] = data[column].apply(
        lambda x: [item.strip() for item in x] if isinstance(x, list) else x
    )

    return data.explode(column, ignore_index=True)
4448
4534
  # ! DataFrame
4449
4535
  def df_astype(
4450
4536
  data: pd.DataFrame,
@@ -4731,7 +4817,7 @@ def df_merge(
4731
4817
  """
4732
4818
 
4733
4819
  # 1. Check if indices are comparable (same length and types)
4734
- if use_index or df1.index.equals(df2.index):
4820
+ if use_index:
4735
4821
  print(f"Merging based on index using '{how}' join...")
4736
4822
  df_merged = pd.merge(df1, df2, left_index=True, right_index=True, how=how)
4737
4823
  return df_merged
@@ -4984,7 +5070,7 @@ def df_cluster(
4984
5070
  X = scaler.fit_transform(X)
4985
5071
 
4986
5072
  for n_cluster in range_n_clusters:
4987
- kmeans = KMeans(n_clusters=n_cluster, random_state=42)
5073
+ kmeans = KMeans(n_clusters=n_cluster, random_state=1)
4988
5074
  cluster_labels = kmeans.fit_predict(X)
4989
5075
 
4990
5076
  silhouette_avg = silhouette_score(X, cluster_labels)
@@ -5000,7 +5086,7 @@ def df_cluster(
5000
5086
  print(f"n_clusters = {n_clusters}")
5001
5087
 
5002
5088
  # Apply K-Means Clustering with Optimal Number of Clusters
5003
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
5089
+ kmeans = KMeans(n_clusters=n_clusters, random_state=1)
5004
5090
  cluster_labels = kmeans.fit_predict(X)
5005
5091
 
5006
5092
  if plot:
@@ -5101,7 +5187,7 @@ def df_cluster(
5101
5187
  # n_clusters = (
5102
5188
  # np.argmax(silhouette_avg_scores) + 2
5103
5189
  # ) # Optimal clusters based on max silhouette score
5104
- # kmeans = KMeans(n_clusters=n_clusters, random_state=42)
5190
+ # kmeans = KMeans(n_clusters=n_clusters, random_state=1)
5105
5191
  # cluster_labels = kmeans.fit_predict(X)
5106
5192
  silhouette_vals = silhouette_samples(X, cluster_labels)
5107
5193
 
@@ -5252,12 +5338,14 @@ def df_reducer(
5252
5338
  columns: Optional[List[str]] = None,
5253
5339
  method: str = "umap", # 'pca', 'umap'
5254
5340
  n_components: int = 2, # Default for umap, but 50 for PCA
5255
- umap_neighbors: int = 15, # Default
5256
- umap_min_dist: float = 0.1, # Default
5341
+ umap_neighbors: int = 15, # UMAP-specific
5342
+ umap_min_dist: float = 0.1, # UMAP-specific
5343
+ tsne_perplexity: int = 30, # t-SNE-specific
5257
5344
  scale: bool = True,
5258
5345
  fill_missing: bool = True,
5259
5346
  debug: bool = False,
5260
5347
  inplace: bool = True, # replace the oringinal data
5348
+ plot_:bool = False,# plot scatterplot, but no 'hue',so it is meaningless
5261
5349
  ) -> pd.DataFrame:
5262
5350
  """
5263
5351
  Reduces the dimensionality of the selected DataFrame using PCA or UMAP.
@@ -5293,9 +5381,35 @@ def df_reducer(
5293
5381
  reduced_df : pd.DataFrame
5294
5382
  DataFrame with the reduced dimensions.
5295
5383
  """
5296
- from sklearn.decomposition import PCA
5384
+
5385
+ """
5386
+ PCA: explained_variance:
5387
+ indicates the proportion of the dataset's total variance that each principal
5388
+ component (PC) explains. It gives you a sense of how much information
5389
+ (or variance) is captured by each PC
5390
+ Interpretation:
5391
+ - Higher values indicate that the corresponding PC captures more variance.
5392
+ - The sum of the explained variances for all PCs equals 1 (or 100%).
5393
+ - If the first few components explain a high percentage (e.g., 90%),
5394
+ it means you can reduce the dimensionality of the data significantly without losing much information.
5395
+ Use case:
5396
+ You may plot a scree plot, which shows the explained variance for each PC, to help decide
5397
+ how many components to keep for analysis.
5398
+
5399
+ PCA: Singular values:
5400
+ represent the magnitude of variance along each principal component. Mathematically,
5401
+ they are the square roots of the eigenvalues of the covariance matrix.
5402
+ Interpretation:
5403
+ Larger singular values indicate that the associated PC captures more variance.
5404
+ Singular values are related to the scale of the data. If the data are scaled
5405
+ before PCA (e.g., standardized), then the singular values will provide a measure
5406
+ of the spread of data along each PC.
5407
+ Use case:
5408
+ Singular values help quantify the contribution of each principal component in a
5409
+ similar way to the explained variance. They are useful in understanding the overall
5410
+ structure of the data.
5411
+ """
5297
5412
  from sklearn.preprocessing import StandardScaler
5298
- import umap
5299
5413
  from sklearn.impute import SimpleImputer
5300
5414
 
5301
5415
  # Select columns if specified, else use all columns
@@ -5312,76 +5426,211 @@ def df_reducer(
5312
5426
  X = scaler.fit_transform(X)
5313
5427
 
5314
5428
  # Check valid method input
5315
- if method not in ["pca", "umap"]:
5316
- raise ValueError(f"Invalid method '{method}'. Choose 'pca' or 'umap'.")
5317
-
5429
+ methods=["pca", "umap","tsne","factor","isolation_forest"]
5430
+ method=strcmp(method, methods)[0]
5318
5431
  # Apply PCA if selected
5319
- if method == "pca":
5320
- if n_components is None:
5321
- # to get the n_components with threshold method:
5322
- pca = PCA()
5323
- pca_result = pca.fit_transform(X)
5324
-
5325
- # Calculate explained variance
5326
- explained_variance = pca.explained_variance_ratio_
5327
- # Cumulative explained variance
5328
- cumulative_variance = np.cumsum(explained_variance)
5329
- # Set a threshold for cumulative variance
5330
- threshold = 0.95 # Example threshold
5331
- n_components = (
5332
- np.argmax(cumulative_variance >= threshold) + 1
5333
- ) # Number of components to retain
5334
- if debug:
5335
- # debug:
5336
- # Plot the cumulative explained variance
5337
- plt.figure(figsize=(8, 5))
5338
- plt.plot(
5339
- range(1, len(cumulative_variance) + 1),
5340
- cumulative_variance,
5341
- marker="o",
5342
- linestyle="-",
5343
- )
5344
- plt.title("Cumulative Explained Variance by Principal Components")
5345
- plt.xlabel("Number of Principal Components")
5346
- plt.ylabel("Cumulative Explained Variance")
5347
- plt.xticks(range(1, len(cumulative_variance) + 1))
5348
- # Add horizontal line for the threshold
5349
- plt.axhline(
5350
- y=threshold, color="r", linestyle="--", label="Threshold (95%)"
5351
- )
5352
- # Add vertical line for n_components
5353
- plt.axvline(
5354
- x=n_components,
5355
- color="g",
5356
- linestyle="--",
5357
- label=f"n_components = {n_components}",
5358
- )
5359
- plt.legend()
5360
- plt.grid()
5432
+ if method == "pca":
5433
+ from sklearn.decomposition import PCA
5361
5434
  pca = PCA(n_components=n_components)
5362
5435
  X_reduced = pca.fit_transform(X)
5363
- print(f"PCA completed: Reduced to {n_components} components.")
5436
+
5437
+ # Additional PCA information
5438
+ explained_variance = pca.explained_variance_ratio_
5439
+ singular_values = pca.singular_values_
5440
+ loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
5441
+
5442
+ if debug:
5443
+ print(f"PCA completed: Reduced to {n_components} components.")
5444
+ print(f"Explained Variance: {explained_variance}")
5445
+ print(f"Singular Values: {singular_values}")
5446
+
5447
+ # Plot explained variance if debug=True
5448
+ if debug:
5449
+ # Plot explained variance
5450
+ cumulative_variance = np.cumsum(explained_variance)
5451
+ plt.figure(figsize=(8, 5))
5452
+ plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker="o")
5453
+ plt.title("Cumulative Explained Variance by Principal Components")
5454
+ plt.xlabel("Number of Principal Components")
5455
+ plt.ylabel("Cumulative Explained Variance")
5456
+ plt.axhline(y=0.95, color="r", linestyle="--", label="Threshold (95%)")
5457
+ plt.axvline(x=n_components, color="g", linestyle="--", label=f"n_components = {n_components}")
5458
+ plt.legend()
5459
+ plt.grid()
5460
+ plt.show()
5461
+
5462
+ # Prepare reduced DataFrame with additional PCA info
5463
+ pca_df = pd.DataFrame(
5464
+ X_reduced, index=data.index,
5465
+ columns=[f"PC_{i+1}" for i in range(n_components)]
5466
+ )
5467
+ # pca_df["Explained Variance"] = np.tile(explained_variance[:n_components], (pca_df.shape[0], 1))
5468
+ # pca_df["Singular Values"] = np.tile(singular_values[:n_components], (pca_df.shape[0], 1))
5469
+ # Expand explained variance to multiple columns if needed
5470
+ for i in range(n_components):
5471
+ pca_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (pca_df.shape[0], 1))
5472
+ for i in range(n_components):
5473
+ pca_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (pca_df.shape[0], 1))
5364
5474
 
5365
5475
  # Apply UMAP if selected
5366
5476
  elif method == "umap":
5477
+ import umap
5367
5478
  umap_reducer = umap.UMAP(
5368
5479
  n_neighbors=umap_neighbors,
5369
5480
  min_dist=umap_min_dist,
5370
- n_components=n_components,
5481
+ n_components=n_components
5371
5482
  )
5372
5483
  X_reduced = umap_reducer.fit_transform(X)
5373
- print(f"UMAP completed: Reduced to {n_components} components.")
5374
5484
 
5375
- # Return reduced data as a new DataFrame with the same index
5376
- reduced_df = pd.DataFrame(X_reduced, index=data.index)
5485
+ # Additional UMAP information
5486
+ embedding = umap_reducer.embedding_
5487
+ trustworthiness = umap_reducer._raw_data[:, :n_components]
5488
+
5489
+ if debug:
5490
+ print(f"UMAP completed: Reduced to {n_components} components.")
5491
+ print(f"Embedding Shape: {embedding.shape}")
5492
+ print(f"Trustworthiness: {trustworthiness}")
5493
+
5494
+ # Prepare reduced DataFrame with additional UMAP info
5495
+ umap_df = pd.DataFrame(
5496
+ X_reduced, index=data.index,
5497
+ columns=[f"UMAP_{i+1}" for i in range(n_components)]
5498
+ )
5499
+ umap_df["Embedding"] = embedding[:, 0] # Example of embedding data
5500
+ umap_df["Trustworthiness"] = trustworthiness[:, 0] # Trustworthiness metric
5501
+ elif method == "tsne":
5502
+ from sklearn.manifold import TSNE
5503
+ tsne = TSNE(n_components=n_components, perplexity=tsne_perplexity, random_state=1)
5504
+ X_reduced = tsne.fit_transform(X)
5505
+
5506
+ # Prepare reduced DataFrame with additional t-SNE info
5507
+ tsne_df = pd.DataFrame(
5508
+ X_reduced, index=data.index,
5509
+ columns=[f"tSNE_{i+1}" for i in range(n_components)]
5510
+ )
5511
+ tsne_df["Perplexity"] = np.tile(f"Perplexity: {tsne_perplexity}", (tsne_df.shape[0], 1))
5512
+
5513
+ # Apply Factor Analysis if selected
5514
+ elif method == "factor":
5515
+ from sklearn.decomposition import FactorAnalysis
5516
+ factor = FactorAnalysis(n_components=n_components, random_state=1)
5517
+ X_reduced = factor.fit_transform(X)
5518
+ # Factor Analysis does not directly provide explained variance, but we can approximate it
5519
+ fa_variance = factor.noise_variance_
5520
+ # Prepare reduced DataFrame with additional Factor Analysis info
5521
+ factor_df = pd.DataFrame(
5522
+ X_reduced, index=data.index,
5523
+ columns=[f"Factor_{i+1}" for i in range(n_components)]
5524
+ )
5525
+ factor_df["Noise Variance"] = np.tile(format(np.mean(fa_variance) * 100, ".3f") + "%", (factor_df.shape[0], 1))
5526
+
5527
+ # Apply Isolation Forest for outlier detection if selected
5528
+ elif method == "isolation_forest":
5529
+ from sklearn.decomposition import PCA
5530
+ from sklearn.ensemble import IsolationForest
5531
+ # Step 1: Apply PCA for dimensionality reduction to n_components components
5532
+ pca = PCA(n_components=n_components)
5533
+ X_pca = pca.fit_transform(X)
5534
+
5535
+ explained_variance = pca.explained_variance_ratio_
5536
+ singular_values = pca.singular_values_
5537
+
5538
+ # Prepare reduced DataFrame with additional PCA info
5539
+ iso_forest_df = pd.DataFrame(
5540
+ X_pca, index=data.index,
5541
+ columns=[f"PC_{i+1}" for i in range(n_components)]
5542
+ )
5543
+
5544
+ isolation_forest = IsolationForest(n_estimators=100, contamination='auto',random_state=1)
5545
+ isolation_forest.fit(X)
5546
+ anomaly_scores = isolation_forest.decision_function(X) # Anomaly score: larger is less anomalous
5547
+ # Predict labels: 1 (normal), -1 (anomaly)
5548
+ anomaly_labels = isolation_forest.fit_predict(X)
5549
+ # Add anomaly scores and labels to the DataFrame
5550
+ iso_forest_df["Anomaly Score"] = anomaly_scores
5551
+ iso_forest_df["Anomaly Label"] = anomaly_labels
5552
+ # add info from pca
5553
+ for i in range(n_components):
5554
+ iso_forest_df[f"Explained Variance PC_{i+1}"] = np.tile(format(explained_variance[i]*100,".3f")+"%", (iso_forest_df.shape[0], 1))
5555
+ for i in range(n_components):
5556
+ iso_forest_df[f"Singular Values PC_{i+1}"] = np.tile(singular_values[i], (iso_forest_df.shape[0], 1))
5557
+
5558
+ # Return reduced data and info as a new DataFrame with the same index
5559
+ if method == "pca":
5560
+ reduced_df = pca_df
5561
+ colname_met = "PC_"
5562
+ if plot_:
5563
+ sns.scatterplot(
5564
+ data=pca_df,
5565
+ x="PC_1",
5566
+ y="PC_2",
5567
+ # hue="condition",
5568
+ )
5569
+ elif method == "umap":
5570
+ reduced_df = umap_df
5571
+ colname_met = "UMAP_"
5572
+ if plot_:
5573
+ sns.scatterplot(
5574
+ data=umap_df,
5575
+ x="UMAP_1",
5576
+ y="UMAP_2",
5577
+ # hue="condition",
5578
+ )
5579
+ elif method == "tsne":
5580
+ reduced_df = tsne_df
5581
+ colname_met = "t-SNE_"
5582
+ if plot_:
5583
+ sns.scatterplot(
5584
+ data=tsne_df,
5585
+ x="tSNE_1",
5586
+ y="tSNE_2",
5587
+ # hue="batch",
5588
+ )
5589
+ elif method == "factor":
5590
+ reduced_df = factor_df
5591
+ colname_met = "Factor_"
5592
+ if plot_:
5593
+ sns.scatterplot(
5594
+ data=factor_df,
5595
+ x="Factor_1",
5596
+ y="Factor_2",
5597
+ # hue="batch",
5598
+ )
5599
+ elif method == "isolation_forest":
5600
+ reduced_df = iso_forest_df # Already a DataFrame for outliers
5601
+ colname_met = "PC_"
5602
+ if plot_:
5603
+ ax = sns.scatterplot(
5604
+ data=iso_forest_df[iso_forest_df["Anomaly Label"] == 1],
5605
+ x="PC_1",
5606
+ y="PC_2",
5607
+ label="normal", c="b",
5608
+ )
5609
+ ax = sns.scatterplot(
5610
+ ax=ax,
5611
+ data=iso_forest_df[iso_forest_df["Anomaly Label"] == -1],
5612
+ x="PC_1",
5613
+ y="PC_2",
5614
+ c="r",
5615
+ label="outlier", marker="+", s=30,
5616
+ )
5617
+
5377
5618
 
5378
5619
  if inplace:
5379
- # Replace or add new columns based on n_components
5620
+ # If inplace=True, add components back into the original data
5380
5621
  for col_idx in range(n_components):
5381
- data[f"Component_{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5622
+ data[f"{colname_met}{col_idx+1}"] = reduced_df.iloc[:, col_idx]
5623
+
5624
+ # Add extra info for PCA/UMAP
5625
+ if method == "pca":
5626
+ data["Explained Variance"] = reduced_df["Explained Variance"]
5627
+ data["Singular Values"] = reduced_df["Singular Values"]
5628
+ elif method == "umap":
5629
+ data["Embedding"] = reduced_df["Embedding"]
5630
+ data["Trustworthiness"] = reduced_df["Trustworthiness"]
5382
5631
  return None # No return when inplace=True
5383
5632
 
5384
- return reduced_df
5633
+ return reduced_df
5385
5634
 
5386
5635
 
5387
5636
  # example:
@@ -5636,7 +5885,7 @@ def evaluate_cluster(
5636
5885
  return metrics
5637
5886
 
5638
5887
 
5639
- def print_pd_usage(
5888
+ def use_pd(
5640
5889
  func_name="excel",
5641
5890
  verbose=True,
5642
5891
  dir_json="/Users/macjianfeng/Dropbox/github/python/py2ls/py2ls/data/usages_pd.json",
py2ls/plot.py CHANGED
@@ -16,7 +16,9 @@ from .stats import *
16
16
  from .netfinder import get_soup, fetch
17
17
 
18
18
  # Suppress INFO messages from fontTools
19
- logging.getLogger("fontTools").setLevel(logging.WARNING)
19
+ logging.getLogger("fontTools").setLevel(logging.ERROR)
20
+ logging.getLogger('matplotlib').setLevel(logging.ERROR)
21
+
20
22
 
21
23
 
22
24
  def add_text(ax=None, height_offset=0.5, fmt=".1f", **kwargs):
@@ -3149,7 +3151,7 @@ def volcano(
3149
3151
  x:str,
3150
3152
  y:str,
3151
3153
  gene_col=None,
3152
- top_genes=5,
3154
+ top_genes=[5, 5], # [down-regulated, up-regulated]
3153
3155
  thr_x=np.log2(1.5),
3154
3156
  thr_y=-np.log10(0.05),
3155
3157
  colors=("#00BFFF", "#9d9a9a", "#FF3030"),
@@ -3163,7 +3165,11 @@ def volcano(
3163
3165
  ax=None,
3164
3166
  verbose=False,
3165
3167
  kws_text=dict(fontsize=10, color="k"),
3166
- kws_arrow=dict(style="-", color="k", lw=0.5),
3168
+ kws_bbox=dict(facecolor='none',
3169
+ alpha=0.5,
3170
+ edgecolor='black',
3171
+ boxstyle='round,pad=0.3'),# '{}' to hide
3172
+ kws_arrow={},
3167
3173
  **kwargs,
3168
3174
  ):
3169
3175
  """
@@ -3179,7 +3185,7 @@ def volcano(
3179
3185
  Column name for y-axis values (e.g., -log10(FDR)).
3180
3186
  gene_col : str, optional
3181
3187
  Column name for gene names. If provided, gene names will be displayed. Default is None.
3182
- top_genes : int, optional
3188
+ top_genes : int, list, optional
3183
3189
  Number of top genes to label based on y-axis values. An int applies to both sides; a list gives [down-regulated, up-regulated] counts. Default is [5, 5].
3184
3190
  thr_x : float, optional
3185
3191
  Threshold for x-axis values. Default is 0.585.
@@ -3239,14 +3245,22 @@ def volcano(
3239
3245
  colors[2],
3240
3246
  np.where((data[x] < -thr_x) & (data[y] > thr_y), colors[0], colors[1]),
3241
3247
  )
3248
+ top_genes=[top_genes, top_genes] if isinstance(top_genes,int) else top_genes
3249
+
3250
+ down_reg_genes = data[
3251
+ (data["color"] == colors[0]) &
3252
+ (data[x].abs() > thr_x) &
3253
+ (data[y] > thr_y)
3254
+ ].sort_values(by=[y, x], ascending=[False, True]).head(top_genes[0])
3255
+
3256
+ # Selecting top upregulated genes based on both p-value and fold change
3257
+ up_reg_genes = data[
3258
+ (data["color"] == colors[2]) &
3259
+ (data[x].abs() > thr_x) &
3260
+ (data[y] > thr_y)
3261
+ ].sort_values(by=[y, x], ascending=[False, False]).head(top_genes[1])
3262
+ sele_gene = pd.concat([down_reg_genes, up_reg_genes])
3242
3263
 
3243
- # Selecting top significant points for labeling
3244
- sele_gene = (
3245
- data.query("color != @colors[2]") # Exclude gray points
3246
- .groupby("color", axis=0)
3247
- .apply(lambda x: x.sort_values(y, ascending=False).head(top_genes))
3248
- .droplevel(level=0)
3249
- )
3250
3264
  palette = {colors[0]: colors[0], colors[1]: colors[1], colors[2]: colors[2]}
3251
3265
  # Plot setup
3252
3266
  if ax is None:
@@ -3277,9 +3291,9 @@ def volcano(
3277
3291
  )
3278
3292
 
3279
3293
  # Add threshold lines for x and y axes
3280
- plt.axhline(y=thr_y, color="black", linestyle="--")
3281
- plt.axvline(x=-thr_x, color="black", linestyle="--")
3282
- plt.axvline(x=thr_x, color="black", linestyle="--")
3294
+ ax.axhline(y=thr_y, color="black", linestyle="--",lw=1)
3295
+ ax.axvline(x=-thr_x, color="black", linestyle="--",lw=1)
3296
+ ax.axvline(x=thr_x, color="black", linestyle="--",lw=1)
3283
3297
 
3284
3298
  # Add gene labels for selected significant points
3285
3299
  if gene_col:
@@ -3288,14 +3302,29 @@ def volcano(
3288
3302
  fontname = kws_text.pop("fontname", "Arial")
3289
3303
  textcolor = kws_text.pop("color", "k")
3290
3304
  fontsize = kws_text.pop("fontsize", 10)
3305
+ arrowstyles = [
3306
+ "->","<-","<->","<|-","-|>","<|-|>",
3307
+ "-","-[","-[",
3308
+ "fancy","simple","wedge",
3309
+ ]
3310
+ arrowstyle = kws_arrow.pop("style", "<|-")
3311
+ arrowstyle = strcmp(arrowstyle, arrowstyles,scorer='strict')[0]
3312
+ expand=kws_arrow.pop("expand",(1.05,1.1))
3313
+ arrowcolor = kws_arrow.pop("color", "0.4")
3314
+ arrowlinewidth = kws_arrow.pop("lw", 0.75)
3315
+ shrinkA = kws_arrow.pop("shrinkA", 0)
3316
+ shrinkB = kws_arrow.pop("shrinkB", 0)
3317
+ mutation_scale = kws_arrow.pop("head", 10)
3318
+ arrow_fill=kws_arrow.pop("fill", False)
3291
3319
  for i in range(sele_gene.shape[0]):
3292
3320
  if isinstance(textcolor, list): # be consistant with dots's color
3293
3321
  textcolor = colors[0] if sele_gene[x].iloc[i] > 0 else colors[1]
3294
3322
  texts.append(
3295
- plt.text(
3323
+ ax.text(
3296
3324
  x=sele_gene[x].iloc[i],
3297
3325
  y=sele_gene[y].iloc[i],
3298
3326
  s=sele_gene[gene_col].iloc[i],
3327
+ bbox=kws_bbox if kws_bbox else None,
3299
3328
  fontdict={
3300
3329
  "fontsize": fontsize,
3301
3330
  "color": textcolor,
@@ -3303,40 +3332,31 @@ def volcano(
3303
3332
  },
3304
3333
  )
3305
3334
  )
3306
-
3307
- arrowstyles = [
3308
- "-",
3309
- "->",
3310
- "-[",
3311
- "|->",
3312
- "<-",
3313
- "<->",
3314
- "<|-",
3315
- "<|-|>",
3316
- "-|>",
3317
- "-[ ",
3318
- "fancy",
3319
- "simple",
3320
- "wedge",
3321
- ]
3322
- arrowstyle = kws_arrow.pop("style", "-")
3323
- arrowcolor = kws_arrow.pop("color", "0.5")
3324
- arrowlinewidth = kws_arrow.pop("lw", 0.5)
3325
- shrinkA = kws_arrow.pop("shrinkA", 5)
3326
- shrinkB = kws_arrow.pop("shrinkB", 5)
3327
- arrowstyle = strcmp(arrowstyle, arrowstyles)[0]
3328
- adjust_text(
3329
- texts,
3330
- expand_text=(1.05, 1.2),
3331
- arrowprops=dict(
3332
- arrowstyle=arrowstyle,
3333
- color=arrowcolor,
3334
- lw=arrowlinewidth,
3335
- shrinkA=shrinkA,
3336
- shrinkB=shrinkB,
3337
- **kws_arrow,
3338
- ),
3335
+ print(arrowstyle)
3336
+ adjust_text(
3337
+ texts,
3338
+ expand=expand,
3339
+ min_arrow_len=5,
3340
+ # force_explode=(0.1, 0.5),
3341
+ # force_text=(0.1, 0.5),
3342
+ # force_points=(0.1, 0.5),
3343
+ # explode_radius=10,
3344
+ # expand_text=(1, 1),
3345
+ # expand_points=(1, 1),
3346
+ # ha='center',
3347
+ # va='top',
3348
+ ax=ax,
3349
+ arrowprops=dict(
3350
+ arrowstyle=arrowstyle,
3351
+ fill=arrow_fill,
3352
+ color=arrowcolor,
3353
+ lw=arrowlinewidth,
3354
+ shrinkA=shrinkA,
3355
+ shrinkB=shrinkB,
3356
+ mutation_scale=mutation_scale,
3357
+ **kws_arrow,
3339
3358
  )
3359
+ )
3340
3360
 
3341
3361
  figsets(**kws_figsets)
3342
3362
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: py2ls
3
- Version: 0.2.4.2
3
+ Version: 0.2.4.3
4
4
  Summary: py(thon)2(too)ls
5
5
  Author: Jianfeng
6
6
  Author-email: Jianfeng.Liu0413@gmail.com
@@ -173,7 +173,7 @@ py2ls/LICENSE,sha256=UOZ1F5fFDe3XXvG4oNnkL1-Ecun7zpHzRxjp-XsMeAo,11324
173
173
  py2ls/README.md,sha256=CwvJWAnSXnCnrVHlnEbrxxi6MbjbE_MT6DH2D53S818,11572
174
174
  py2ls/__init__.py,sha256=Nn8jTIvySX7t7DMJ8VNRVctTStgXGjHldOIdZ35PdW8,165
175
175
  py2ls/batman.py,sha256=E7gYofbDzN7S5oCmO_dd5Z1bxxhoYMJSD6s-VaF388E,11398
176
- py2ls/bio.py,sha256=5q7T_LXmDg0MJoKXwO0kWnfbpshXNvUR5kCnYyLqm2w,10711
176
+ py2ls/bio.py,sha256=FnEf4RV4LBUQfLefWIpIFszVRYeXjnRlc5261DINIdg,18835
177
177
  py2ls/brain_atlas.py,sha256=w1o5EelRjq89zuFJUNSz4Da8HnTCwAwDAZ4NU4a-bAY,5486
178
178
  py2ls/chat.py,sha256=Yr22GoIvoWhpV3m4fdwV_I0Mn77La346_ymSinR-ORA,3793
179
179
  py2ls/correlators.py,sha256=RbOaJIPLCHJtUm5SFi_4dCJ7VFUPWR0PErfK3K26ad4,18243
@@ -213,15 +213,15 @@ py2ls/export_requirements.py,sha256=x2WgUF0jYKz9GfA1MVKN-MdsM-oQ8yUeC6Ua8oCymio,
213
213
  py2ls/fetch_update.py,sha256=9LXj661GpCEFII2wx_99aINYctDiHni6DOruDs_fdt8,4752
214
214
  py2ls/freqanalysis.py,sha256=F4218VSPbgL5tnngh6xNCYuNnfR-F_QjECUUxrPYZss,32594
215
215
  py2ls/ich2ls.py,sha256=3E9R8oVpyYZXH5PiIQgT3CN5NxLe4Dwtm2LwaeacE6I,21381
216
- py2ls/ips.py,sha256=46nrt6RRl8Lc-tMh03dRqxF4nUlLfMElnETE1ipu-DM,210309
216
+ py2ls/ips.py,sha256=yYSpbHIGDfLK2SXtTX4f--H5oa885pggXePEbhiNRsw,220887
217
217
  py2ls/netfinder.py,sha256=LwBkGITB_4BTNtY6RlKdEZVFW6epzMWlnqy2g03KtyU,56117
218
218
  py2ls/ocr.py,sha256=5lhUbJufIKRSOL6wAWVLEo8TqMYSjoI_Q-IO-_4u3DE,31419
219
- py2ls/plot.py,sha256=A4NiRDItVyrc80qPtLgT1mpzvebU_iMVVownjsu_YFc,135976
219
+ py2ls/plot.py,sha256=B_npRfO2rZJJjcYSQ7YMZt2LZTG0mU08JCDnM6zAVx4,136956
220
220
  py2ls/setuptools-70.1.0-py3-none-any.whl,sha256=2bi3cUVal8ip86s0SOvgspteEF8SKLukECi-EWmFomc,882588
221
221
  py2ls/sleep_events_detectors.py,sha256=bQA3HJqv5qnYKJJEIhCyhlDtkXQfIzqksnD0YRXso68,52145
222
222
  py2ls/stats.py,sha256=DMoJd8Z5YV9T1wB-4P52F5K5scfVK55DT8UP4Twcebo,38627
223
223
  py2ls/translator.py,sha256=zBeq4pYZeroqw3DT-5g7uHfVqKd-EQptT6LJ-Adi8JY,34244
224
224
  py2ls/wb_detector.py,sha256=7y6TmBUj9exCZeIgBAJ_9hwuhkDh1x_-yg4dvNY1_GQ,6284
225
- py2ls-0.2.4.2.dist-info/METADATA,sha256=_YQg86nAdjPWqkaIrH6p9nSPhjNHbY1AU0BGV6o3wU0,20038
226
- py2ls-0.2.4.2.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
227
- py2ls-0.2.4.2.dist-info/RECORD,,
225
+ py2ls-0.2.4.3.dist-info/METADATA,sha256=S4Il5phQ0Vx8U7VrlEUopkX-hfwcKKQi-qkfD2EYI1g,20038
226
+ py2ls-0.2.4.3.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
227
+ py2ls-0.2.4.3.dist-info/RECORD,,