py2ls 0.2.4__py3-none-any.whl → 0.2.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/bio.py ADDED
@@ -0,0 +1,272 @@
1
+ import GEOparse
2
+ from typing import Union
3
+ import pandas as pd
4
+ import os
5
+ import logging
6
+ from . import ips
7
def load_geo(
    datasets: Union[list, str] = ["GSE00000", "GSE00001"],
    dir_save: str = "./datasets",
    verbose=False,
) -> dict:
    """
    Load GEO datasets, downloading any that are not already cached locally.

    Checks ``dir_save`` for each dataset's ``<ID>_family.soft.gz`` file;
    missing datasets are fetched via ``GEOparse.get_GEO``, cached ones are
    parsed straight from disk.

    Parameters:
        datasets (list | str): GEO dataset ID(s) to load. A single ID may be
            passed as a plain string.
        dir_save (str): Directory where datasets are cached/stored.
        verbose (bool): If True, let GEOparse log at DEBUG level; otherwise
            only warnings are shown.

    Returns:
        dict: Mapping of dataset ID -> parsed GEOparse object.
    """
    use_str="""
    get_meta(geo: dict, dataset: str = "GSE25097")
    get_expression_data(geo: dict, dataset: str = "GSE25097")
    get_probe(geo: dict, dataset: str = "GSE25097", platform_id: str = "GPL10687")
    get_data(geo: dict, dataset: str = "GSE25097")
    """
    print(f"you could do further: \n{use_str}")
    # Quiet GEOparse's logger unless the caller explicitly asked for detail.
    if not verbose:
        logging.getLogger("GEOparse").setLevel(logging.WARNING)
    else:
        logging.getLogger("GEOparse").setLevel(logging.DEBUG)
    # Create the cache directory if needed; exist_ok avoids the race between
    # the existence check and the mkdir call.
    if not os.path.exists(dir_save):
        os.makedirs(dir_save, exist_ok=True)
        print(f"Created directory: {dir_save}")
    # Accept a single dataset ID given as a bare string.
    if isinstance(datasets, str):
        datasets = [datasets]
    geo_data = {}
    for dataset in datasets:
        # Check if the dataset file already exists in the cache directory.
        dataset_file = os.path.join(dir_save, f"{dataset}_family.soft.gz")

        if not os.path.isfile(dataset_file):
            print(f"\n\nDataset {dataset} not found locally. Downloading...")
            geo = GEOparse.get_GEO(geo=dataset, destdir=dir_save)
        else:
            print(f"\n\nDataset {dataset} already exists locally. Loading...")
            geo = GEOparse.get_GEO(filepath=dataset_file)

        geo_data[dataset] = geo

    return geo_data
52
+
53
+
54
def get_meta(geo: dict, dataset: str = "GSE25097",verbose=True) -> pd.DataFrame:
    """
    df_meta = get_meta(geo, dataset="GSE25097")
    Extracts metadata from a specific GEO dataset and returns it as a DataFrame.
    The function dynamically extracts all available metadata fields from the
    given dataset, producing one row per (platform, sample) combination.

    Parameters:
        geo (dict): A dictionary containing the GEO objects for different datasets.
        dataset (str): The name of the dataset to extract metadata from (default is "GSE25097").
        verbose (bool): If True, print the resulting DataFrame's column names.

    Returns:
        pd.DataFrame: A DataFrame containing structured metadata from the specified GEO dataset.

    Raises:
        ValueError: If ``dataset`` is not a key of ``geo``.
    """
    # Check if the dataset is available in the provided GEO dictionary
    if dataset not in geo:
        raise ValueError(f"Dataset '{dataset}' not found in the provided GEO data.")

    # List to store metadata dictionaries (one dict per platform/sample pair)
    meta_list = []

    # Extract the GEO object for the specified dataset
    geo_obj = geo[dataset]

    # Overall Study Metadata (shallow copy of the study-level metadata dict)
    study_meta = geo_obj.metadata
    study_metadata = {key: study_meta[key] for key in study_meta.keys()}

    # Platform Metadata
    for platform_id, platform in geo_obj.gpls.items():
        platform_metadata = {
            key: platform.metadata[key] for key in platform.metadata.keys()
        }
        platform_metadata["platform_id"] = platform_id  # Include platform ID

        # Sample Metadata -- nested so each sample row carries the current
        # platform's fields as well.
        for sample_id, sample in geo_obj.gsms.items():
            sample_metadata = {
                key: sample.metadata[key] for key in sample.metadata.keys()
            }
            sample_metadata["sample_id"] = sample_id  # Include sample ID
            # Combine all metadata into a single dictionary; later keys
            # (platform, then sample) overwrite earlier ones on collision.
            combined_meta = {
                "dataset": dataset,
                **{
                    k: (
                        v[0]
                        if isinstance(v, list) and len(v) == 1
                        else ", ".join(map(str, v))
                    )
                    for k, v in study_metadata.items()
                },  # Flatten study metadata (multi-valued fields joined with ", ")
                **platform_metadata,  # Unpack platform metadata
                **{
                    k: (
                        v[0]
                        if isinstance(v, list) and len(v) == 1
                        # NOTE(review): sample fields are joined WITHOUT a
                        # separator, unlike study fields above (", ") --
                        # confirm this asymmetry is intentional.
                        else "".join(map(str, v))
                    )
                    for k, v in sample_metadata.items()
                },  # Flatten sample metadata
            }

            # Append the combined metadata to the list
            meta_list.append(combined_meta)

    # Convert the list of dictionaries to a DataFrame
    meta_df = pd.DataFrame(meta_list)
    if verbose:
        print(
            f"Meta info columns for dataset '{dataset}': \n{sorted(meta_df.columns.tolist())}"
        )
    return meta_df
126
+
127
def get_probe(geo: dict, dataset: str = "GSE25097", platform_id: str = None, verbose=True):
    """
    df_probe = get_probe(geo, dataset="GSE25097", platform_id: str = "GPL10687")

    Return the probe annotation table for one platform of a GEO dataset.

    Parameters:
        geo (dict): Mapping of dataset ID -> parsed GEO object.
        dataset (str): Dataset ID to look up.
        platform_id (str, optional): Platform (GPL) ID. When None it is
            resolved from the dataset's metadata.
        verbose (bool): If True, print the resolved platform ID and the
            probe table's column names.

    Returns:
        pd.DataFrame: The probe annotation table; falls back to the meta
        table (via get_meta) when the probe table is empty.
    """
    # Try to resolve the platform_id from the metadata when not given.
    if platform_id is None:
        df_meta=get_meta(geo=geo, dataset=dataset,verbose=False)
        platform_id=df_meta["platform_id"].unique().tolist()
        # NOTE(review): with multiple platforms this stays a list and the
        # gpls lookup below will fail -- confirm single-platform assumption.
        platform_id = platform_id[0] if len(platform_id)==1 else platform_id
    # Fixed: this print was an unconditional debug leftover; honor verbose.
    if verbose:
        print(platform_id)
    df_probe = geo[dataset].gpls[platform_id].table
    if df_probe.empty:
        print(f"above is meta info, failed to find the probe info. 看一下是不是在单独的文件中包含了probe信息")
        return get_meta(geo, dataset, verbose=True)
    if verbose:
        print(f"columns in the probe table: \n{sorted(df_probe.columns.tolist())}")
    return df_probe
144
+
145
+
146
def get_expression_data(geo: dict, dataset: str = "GSE25097") -> pd.DataFrame:
    """
    df_expression = get_expression_data(geo, dataset="GSE25097")

    Extract raw expression values only (no probe or meta annotation) from a
    GEO dataset.

    Tries the fast path ``pivot_samples("VALUE")`` first; if that raises, it
    falls back to scanning the per-sample tables.

    Parameters:
        geo (dict): A dictionary containing GEO objects for each dataset.
        dataset (str): Dataset ID to extract from.

    Returns:
        pd.DataFrame: A DataFrame containing expression data, or an empty
        DataFrame when nothing could be extracted.
    """
    # Fallback value so the function never raises NameError when neither
    # path yields data (callers test .empty on the result).
    expression_values = pd.DataFrame()
    try:
        expression_values = geo[dataset].pivot_samples("VALUE")
    except Exception:  # narrowed from bare `except:` -- don't swallow SystemExit/KeyboardInterrupt
        # NOTE(review): this loop keeps only the LAST sample that has a
        # table -- confirm whether concatenation was intended instead.
        for sample_id, sample in geo[dataset].gsms.items():
            if hasattr(sample, "table"):
                expression_values = (
                    sample.table.T
                )  # Transpose for easier DataFrame creation
                expression_values["dataset"] = dataset
                expression_values["sample_id"] = sample_id
    return expression_values
171
+
172
+
173
+
174
def get_data(geo: dict, dataset: str = "GSE25097",verbose=True):
    """
    Assemble one GEO dataset into a single samples-by-(meta + genes) table.

    Pipeline: probe annotation (get_probe) + expression matrix
    (get_expression_data) are merged, administrative meta columns are
    dropped, and the result is joined with the sample metadata (get_meta).

    Parameters:
        geo (dict): Mapping of dataset ID -> parsed GEO object.
        dataset (str): Dataset ID to assemble.
        verbose (bool): If True, print shape info and display a 10-row sample.

    Returns:
        pd.DataFrame: Merged meta + expression table (samples as rows), or
        the meta table alone when probe/expression data is missing.
    """
    # get probe info
    df_probe = get_probe(geo,dataset=dataset,verbose=False)
    # get expression values
    df_expression = get_expression_data(geo, dataset=dataset )
    print(
        f"df_expression.shape: {df_expression.shape} \ndf_probe.shape: {df_probe.shape}"
    )
    # Bail out to meta info if either piece is missing.
    if any([df_probe.empty, df_expression.empty]):
        print(f"above is meta info, failed to find the probe info. 看一下是不是在单独的文件中包含了probe信息")
        return get_meta(geo, dataset, verbose=True)
    # Join probe annotation onto the expression matrix: first probe column
    # (assumed to be the probe ID) against the expression row index.
    df_exp = pd.merge(
        df_probe,
        df_expression,
        left_on=df_probe.columns.tolist()[0],
        right_index=True,
        how="outer",
    )

    # get meta info
    df_meta=get_meta(geo, dataset=dataset,verbose=False)
    # Administrative/contact columns that are irrelevant to analysis.
    col_rm=['channel_count','contact_web_link','contact_address', 'contact_city', 'contact_country', 'contact_department', 'contact_email', 'contact_institute', 'contact_laboratory', 'contact_name', 'contact_phone', 'contact_state', 'contact_zip/postal_code', 'contributor', 'manufacture_protocol', 'taxid','web_link']
    # remove irrelevant columns
    df_meta = df_meta.drop(columns=[col for col in col_rm if col in df_meta.columns])
    # sort columns alphabetically
    df_meta = df_meta.reindex(sorted(df_meta.columns),axis=1)
    # Fuzzy-match the sample-id column name; ips.strcmp presumably returns
    # (best_match, ...) -- verify against the ips module.
    col_sample_id = ips.strcmp("sample_id",df_meta.columns.tolist())[0]
    df_meta.set_index(col_sample_id, inplace=True)  # set sample id as index

    col_gene_symbol = ips.strcmp("GeneSymbol",df_exp.columns.tolist())[0]
    # select the 'GSM' columns
    col_gsm = df_exp.columns[df_exp.columns.str.startswith("GSM")].tolist()
    df_exp.set_index(col_gene_symbol, inplace=True)
    df_exp=df_exp[col_gsm].T  # transpose, so that meta info can be added per sample

    df_merged=ips.df_merge(df_meta,df_exp)
    if verbose:
        print(f"\ndataset:'{dataset}' n_sample = {df_merged.shape[0]}, n_gene={df_exp.shape[1]}")
        # NOTE(review): display() needs an IPython/Jupyter environment, and
        # .sample(10) raises on fewer than 10 rows -- confirm usage context.
        display(df_merged.sample(10))
    return df_merged
215
+
216
def split_at_lower_upper(lst):
    """
    Split a list at the first boundary where a lowercase string is
    immediately followed by an uppercase string or a NaN.

    Returns a 2-tuple ``(head, tail)``; when no such boundary exists,
    ``tail`` is an empty list.
    """
    for idx in range(len(lst) - 1):
        current, following = lst[idx], lst[idx + 1]
        if not (isinstance(current, str) and current.islower()):
            continue
        # Boundary: lowercase item followed by NaN or an uppercase string.
        is_upper_next = isinstance(following, str) and following.isupper()
        if is_upper_next or pd.isna(following):
            return lst[: idx + 1], lst[idx + 1 :]
    return lst, []
230
+
231
def get_condition(
    data: pd.DataFrame,
    column:str="characteristics_ch1",  # column whose text drives the classification
    column_new:str="condition",  # name of the new column to create
    by:str="tissue: tumor liver",  # substring: rows containing it get by_name
    by_not:str=": tumor",  # fallback substring used only when `by` is falsy
    by_name:str="non-tumor",  # label for the "match" side
    by_not_name:str="tumor",  # label for the "no-match" side
    inplace: bool = True,  # if True, mutate `data` and return None
    verbose:bool = True
    ):
    """
    Add a new column to the DataFrame based on the presence of a specific substring in another column.

    Parameters
    ----------
    data : pd.DataFrame
        The input DataFrame containing the data.
    column : str, optional
        The name of the column in which to search for the substring (default is 'characteristics_ch1').
    column_new : str, optional
        The name of the new column to be created (default is 'condition').
    by : str, optional
        The substring to search for in the specified column. Takes precedence
        over ``by_not``; with the defaults, the ``by_not`` branch never runs.

    Notes
    -----
    `data` is mutated in place in BOTH cases; ``inplace=False`` only adds a
    return value, it does not leave `data` untouched.
    NOTE(review): the `by_not` branch labels rows that do NOT contain
    ``by_not`` with ``by_not_name`` -- this looks inverted; confirm intent.
    NOTE(review): display() requires an IPython/Jupyter environment.
    """
    # first check the content in column
    content=data[column].unique().tolist()
    if verbose:
        # Show at most 10 distinct values as a preview.
        if len(content)>10:
            display(content[:10])
        else:
            display(content)
    # `by` takes priority over `by_not`
    if by:
        data[column_new] = data[column].apply(lambda x: by_name if by in x else by_not_name)
    elif by_not:
        data[column_new] = data[column].apply(lambda x: by_not_name if not by_not in x else by_name)
    if verbose:
        display(data)
    if not inplace:
        return data