HiVis 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hivis-0.1.0/HiVis/Aggregation.py +594 -0
- hivis-0.1.0/HiVis/Aggregation_utils.py +202 -0
- hivis-0.1.0/HiVis/HiVis.py +944 -0
- hivis-0.1.0/HiVis/HiVis_plot.py +1329 -0
- hivis-0.1.0/HiVis/HiVis_utils.py +1008 -0
- hivis-0.1.0/HiVis/__init__.py +3 -0
- hivis-0.1.0/HiVis.egg-info/PKG-INFO +135 -0
- hivis-0.1.0/HiVis.egg-info/SOURCES.txt +15 -0
- hivis-0.1.0/HiVis.egg-info/dependency_links.txt +1 -0
- hivis-0.1.0/HiVis.egg-info/requires.txt +16 -0
- hivis-0.1.0/HiVis.egg-info/top_level.txt +1 -0
- hivis-0.1.0/LICENSE +21 -0
- hivis-0.1.0/PKG-INFO +135 -0
- hivis-0.1.0/README.md +75 -0
- hivis-0.1.0/pyproject.toml +43 -0
- hivis-0.1.0/setup.cfg +4 -0
- hivis-0.1.0/setup.py +38 -0
|
@@ -0,0 +1,594 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Aggregation of spots from HiVis
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from copy import deepcopy
|
|
7
|
+
import warnings
|
|
8
|
+
import re
|
|
9
|
+
import gc
|
|
10
|
+
import os
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import anndata as ad
|
|
14
|
+
from shapely.affinity import scale
|
|
15
|
+
from scipy.stats import mode
|
|
16
|
+
import scipy.io
|
|
17
|
+
from scipy.spatial import cKDTree
|
|
18
|
+
from tqdm import tqdm
|
|
19
|
+
import geopandas as gpd
|
|
20
|
+
|
|
21
|
+
from . import HiVis_plot
|
|
22
|
+
from . import HiVis_utils
|
|
23
|
+
|
|
24
|
+
class Aggregation:
|
|
25
|
+
'''
|
|
26
|
+
Stores data of HiVis that have been aggregated (for example to single-cells). \
|
|
27
|
+
Enables plotting via Aggregation.plot. Each instance is linked to a HiViz object.
|
|
28
|
+
'''
|
|
29
|
+
|
|
30
|
+
def __init__(self, hiviz_instance, adata_agg, name, geojson_agg_path=None):
|
|
31
|
+
'''
|
|
32
|
+
Creates a new instance that is linked to a HiViz object.
|
|
33
|
+
|
|
34
|
+
Parameters:
|
|
35
|
+
* hiviz_instance (HiViz) - HiViz object
|
|
36
|
+
* adata_agg (ad.AnnData) - anndata of aggregations
|
|
37
|
+
* name (str) - name of object
|
|
38
|
+
* geojson_path (str) - path of geojson, exported annotations
|
|
39
|
+
'''
|
|
40
|
+
|
|
41
|
+
if not isinstance(adata_agg, ad._core.anndata.AnnData):
|
|
42
|
+
raise ValueError("Adata must be Anndata object")
|
|
43
|
+
if not "pxl_col_in_fullres" in adata_agg.obs.columns or not "pxl_row_in_fullres" in adata_agg.obs.columns:
|
|
44
|
+
raise ValueError("Anndata.obs must include [pxl_col_in_fullres, pxl_row_in_fullres ]")
|
|
45
|
+
adata_agg = adata_agg[adata_agg.obs["pxl_col_in_fullres"].notna(),:].copy()
|
|
46
|
+
if adata_agg.shape[0] == 0:
|
|
47
|
+
raise ValueError("Filtered AnnData object is empty. No valid rows remain.")
|
|
48
|
+
|
|
49
|
+
scalefactor_json = hiviz_instance.json
|
|
50
|
+
adata_agg.obs["pxl_col_in_lowres"] = adata_agg.obs["pxl_col_in_fullres"] * scalefactor_json["tissue_lowres_scalef"]
|
|
51
|
+
adata_agg.obs["pxl_row_in_lowres"] = adata_agg.obs["pxl_row_in_fullres"] * scalefactor_json["tissue_lowres_scalef"]
|
|
52
|
+
adata_agg.obs["pxl_col_in_highres"] = adata_agg.obs["pxl_col_in_fullres"] * scalefactor_json["tissue_hires_scalef"]
|
|
53
|
+
adata_agg.obs["pxl_row_in_highres"] = adata_agg.obs["pxl_row_in_fullres"] * scalefactor_json["tissue_hires_scalef"]
|
|
54
|
+
adata_agg.obs["um_x"] = adata_agg.obs["pxl_col_in_fullres"] * scalefactor_json["microns_per_pixel"]
|
|
55
|
+
adata_agg.obs["um_y"] = adata_agg.obs["pxl_row_in_fullres"] * scalefactor_json["microns_per_pixel"]
|
|
56
|
+
|
|
57
|
+
self.adata = adata_agg
|
|
58
|
+
self.viz = hiviz_instance
|
|
59
|
+
self.name = name
|
|
60
|
+
self.path_output = self.viz.path_output + f"/{self.name}"
|
|
61
|
+
if not os.path.exists(self.path_output):
|
|
62
|
+
os.makedirs(self.path_output)
|
|
63
|
+
self.plot = HiVis_plot.PlotAgg(self)
|
|
64
|
+
self.adata_cropped = None
|
|
65
|
+
self.tree = None
|
|
66
|
+
|
|
67
|
+
if geojson_agg_path:
|
|
68
|
+
self.import_geometry(geojson_agg_path,object_type="cell")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def import_geometry(self, geojson_path, object_type="cell"):
|
|
72
|
+
'''
|
|
73
|
+
Adds "geometry" column to self.adata.obs, based on Geojson exported from Qupath.
|
|
74
|
+
|
|
75
|
+
Parameters:
|
|
76
|
+
* geojson_path (str) - path to geojson file
|
|
77
|
+
* object_type (str) - which "objectType" to merge from the geojson
|
|
78
|
+
'''
|
|
79
|
+
|
|
80
|
+
if isinstance(geojson_path,str):
|
|
81
|
+
gdf = gpd.read_file(geojson_path)
|
|
82
|
+
elif isinstance(geojson_path,gpd.GeoDataFrame):
|
|
83
|
+
gdf = geojson_path
|
|
84
|
+
gdf = gdf[gdf["objectType"] == object_type]
|
|
85
|
+
gdf = gdf.loc[:,["id","geometry"]]
|
|
86
|
+
gdf.rename(columns={"id":self.adata.obs.index.name},inplace=True)
|
|
87
|
+
|
|
88
|
+
microns_per_pixel = self.viz.json["microns_per_pixel"]
|
|
89
|
+
gdf["geometry"] = gdf["geometry"].apply(lambda geom: scale(geom, xfact=microns_per_pixel, yfact=microns_per_pixel, origin=(0, 0)))
|
|
90
|
+
with warnings.catch_warnings():
|
|
91
|
+
warnings.simplefilter("ignore", category=UserWarning)
|
|
92
|
+
gdf["geometry"] = gdf["geometry"].apply(lambda geom: geom.wkt)
|
|
93
|
+
gdf = gdf.set_index(self.adata.obs.index.name)
|
|
94
|
+
|
|
95
|
+
if "geometry" in self.adata.obs.columns:
|
|
96
|
+
print("Geometry column already exists, overwriting...")
|
|
97
|
+
del self.adata.obs["geometry"]
|
|
98
|
+
with warnings.catch_warnings():
|
|
99
|
+
warnings.filterwarnings("ignore", message="Geometry column does not contain geometry")
|
|
100
|
+
self.adata.obs = self.adata.obs.join(gdf,how="left")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def merge(self, adata, obs=None, var=None, umap=True, pca=True, hvg=True):
|
|
104
|
+
'''
|
|
105
|
+
Merge info from an anndata to self.adata, in case genes have been filtered.
|
|
106
|
+
|
|
107
|
+
Parameters:
|
|
108
|
+
* adata (ad.AnnData) - anndata where to get the values from
|
|
109
|
+
* obs - single string or list of obs to merge
|
|
110
|
+
* var - single string or list of var to merge
|
|
111
|
+
* umap (bool) - add umap to OBSM, and UMAP coordinates to obs
|
|
112
|
+
* pca (bool) - add PCA to OBSM
|
|
113
|
+
* hvg (bool) - add highly variable genes to vars
|
|
114
|
+
'''
|
|
115
|
+
|
|
116
|
+
if not obs:
|
|
117
|
+
obs = []
|
|
118
|
+
elif isinstance(obs, str):
|
|
119
|
+
obs = [obs]
|
|
120
|
+
if umap and "X_umap" in adata.obsm:
|
|
121
|
+
if self.adata.shape[0] == adata.shape[0]:
|
|
122
|
+
self.adata.obsm['X_umap'] = adata.obsm['X_umap'].copy()
|
|
123
|
+
else:
|
|
124
|
+
print("Cant add UMAP to obsm, size of adatas don't match")
|
|
125
|
+
umap_coords = adata.obsm['X_umap']
|
|
126
|
+
adata.obs['UMAP_1'] = umap_coords[:, 0]
|
|
127
|
+
adata.obs['UMAP_2'] = umap_coords[:, 1]
|
|
128
|
+
|
|
129
|
+
obs += ['UMAP_1','UMAP_2']
|
|
130
|
+
if pca and "X_pca" in adata.obsm:
|
|
131
|
+
if self.adata.shape[0] == adata.shape[0]:
|
|
132
|
+
self.adata.obsm['X_pca'] = adata.obsm['X_pca'].copy()
|
|
133
|
+
if hvg and 'highly_variable' in adata.var.columns:
|
|
134
|
+
if not var:
|
|
135
|
+
var = 'highly_variable'
|
|
136
|
+
else:
|
|
137
|
+
if 'highly_variable' not in var:
|
|
138
|
+
var += ['highly_variable']
|
|
139
|
+
if obs:
|
|
140
|
+
existing_columns = [col for col in obs if col in self.adata.obs.columns]
|
|
141
|
+
if existing_columns:
|
|
142
|
+
self.adata.obs.drop(columns=existing_columns, inplace=True)
|
|
143
|
+
self.adata.obs = self.adata.obs.join(adata.obs[obs], how="left")
|
|
144
|
+
if var:
|
|
145
|
+
if isinstance(var, str):
|
|
146
|
+
var = [var]
|
|
147
|
+
existing_columns = [col for col in var if col in self.adata.var.columns]
|
|
148
|
+
if existing_columns:
|
|
149
|
+
self.adata.var.drop(columns=existing_columns, inplace=True)
|
|
150
|
+
self.adata.var = self.adata.var.join(adata.var[var], how="left")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
    def get(self, what, cropped=False, geometry=False):
        '''
        get a vector from data (a gene) or metadata (from obs or var). or subset the object.

        Lookup order for a string key: obs column -> gene (var index) ->
        var column -> case-insensitive obs column -> organism-cased gene
        (mouse: Capitalized, human: UPPER) -> case-insensitive var column.
        If nothing matches, implicitly returns None (callers such as
        __getitem__ rely on this to raise KeyError).

        Parameters:
            * what - if string, will get data or metadata. \
            else, will return a new Aggregation object that is spliced. \
            the splicing is passed to the self.adata.
            * cropped (bool) - get the data from the adata_cropped after plotting spatial
            * geometry (bool) - include only objects which have geometry

        **Returns**: either np.array of data or, if subsetting, a new Aggregation instance
        '''
        # Choose which AnnData to read from: the cropped view (set by spatial
        # plotting) or the full object.
        adata = self.adata_cropped if cropped else self.adata
        if geometry and self.plot.geometry is not None:
            # Restrict to objects that have an imported geometry.
            adata = adata[adata.obs.index.isin(self.plot.geometry.index)]
        if isinstance(what, str): # Easy access to data or metadata arrays
            if what in adata.obs.columns: # Metadata
                column_data = adata.obs[what]
                # Categoricals are returned as plain strings for convenience.
                if column_data.dtype.name == 'category':
                    return column_data.astype(str).values
                return column_data.values
            elif what in adata.var.index: # A gene
                # Densify the single-gene column into a flat 1-D array.
                return np.array(adata[:, what].X.todense().ravel()).flatten()
            elif what in adata.var.columns: # Gene metadata
                column_data = adata.var[what]
                if column_data.dtype.name == 'category':
                    return column_data.astype(str).values
                return column_data.values
            # Fallbacks: case-insensitive obs column match.
            obs_cols_lower = adata.obs.columns.str.lower()
            if what.lower() in obs_cols_lower:
                col_name = adata.obs.columns[obs_cols_lower.get_loc(what.lower())]
                column_data = adata.obs[col_name]
                if column_data.dtype.name == 'category':
                    return column_data.astype(str).values
                return column_data.values
            # Gene-name casing conventions per organism (e.g. "sox2" -> "Sox2" / "SOX2").
            elif self.viz.organism == "mouse" and (what.lower().capitalize() in adata.var.index):
                return np.array(adata[:, what.lower().capitalize()].X.todense()).flatten()
            elif self.viz.organism == "human" and (what.upper() in adata.var.index):
                return np.array(adata[:, what.upper()].X.todense()).flatten()
            # Last resort: case-insensitive var column match.
            var_cols_lower = adata.var.columns.str.lower()
            if what.lower() in var_cols_lower:
                col_name = adata.var.columns[var_cols_lower.get_loc(what.lower())]
                column_data = adata.var[col_name]
                if column_data.dtype.name == 'category':
                    return column_data.astype(str).values
                return column_data.values
            # NOTE: unmatched strings fall through and return None.
        else:
            # Create a new Aggregation object based on adata subsetting
            return self.subset(what)
|
|
205
|
+
    def subset(self, what=(slice(None), slice(None))):
        '''
        Create a new Aggregation object based on adata subsetting.

        Parameters:
            * what - a 2-tuple of (obs indexer, var indexer), as accepted by
              AnnData slicing; defaults to the full object.

        **Returns** new Aggregation instance
        '''
        # Convert pandas indexers (e.g. boolean Series) to numpy arrays so
        # AnnData slicing behaves predictably.
        what = tuple(idx.to_numpy() if hasattr(idx, "to_numpy") else idx for idx in what)
        adata = self.adata[what].copy()
        # Drop per-gene correlation/expression columns computed on the full
        # object; they are no longer valid for the subset.
        adata.var = adata.var.loc[:,~adata.var.columns.str.startswith(("cor_","exp_"))]
        # Re-slice layers from the parent to get independent copies.
        for layer in self.adata.layers.keys():
            adata.layers[layer] = self.adata.layers[layer][what].copy()
        return Aggregation(self.viz, adata, name=self.name)
|
|
217
|
+
def __getitem__(self, what):
|
|
218
|
+
'''get a vector from data (a gene) or metadata (from obs or var). or subset the object.'''
|
|
219
|
+
item = self.get(what, cropped=False)
|
|
220
|
+
if item is None:
|
|
221
|
+
raise KeyError(f"[{what}] isn't in data or metadatas")
|
|
222
|
+
return item
|
|
223
|
+
|
|
224
|
+
def pseudobulk(self, by=None,layer=None):
|
|
225
|
+
'''
|
|
226
|
+
Sums the gene expression for each group in a single obs.
|
|
227
|
+
|
|
228
|
+
Parameters:
|
|
229
|
+
* by (str) - return a dataframe, each column is a value in "by" (for example cluster), rows are genes. \
|
|
230
|
+
If None, will return the mean expression of every gene.
|
|
231
|
+
* layer (str) - which layer in adata to use.
|
|
232
|
+
|
|
233
|
+
**Returns** the gene expression for each group (pd.DataFrame)
|
|
234
|
+
'''
|
|
235
|
+
if layer is None:
|
|
236
|
+
x = self.adata.X
|
|
237
|
+
else:
|
|
238
|
+
if layer not in self.adata.layers:
|
|
239
|
+
raise KeyError(f"Layer '{layer}' not found in self.adata.layers. Available layers: {list(self.adata.layers.keys())}")
|
|
240
|
+
x = self.adata.layers[layer]
|
|
241
|
+
|
|
242
|
+
if by is None:
|
|
243
|
+
pb = x.mean(axis=0).A1
|
|
244
|
+
return pd.Series(pb, index=self.adata.var_names)
|
|
245
|
+
|
|
246
|
+
expr_df = pd.DataFrame(x.toarray(),
|
|
247
|
+
index=self.adata.obs_names,
|
|
248
|
+
columns=self.adata.var_names)
|
|
249
|
+
|
|
250
|
+
group_key = self.adata.obs[by]
|
|
251
|
+
return expr_df.groupby(group_key, observed=True).mean().T
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def smooth(self, what, radius, method="median", new_col_name=None, **kwargs):
|
|
255
|
+
'''
|
|
256
|
+
Applies median smoothing to the specified column in adata.obs using spatial neighbors.
|
|
257
|
+
|
|
258
|
+
Parameters:
|
|
259
|
+
* what (str) - what to smooth. either a gene name or column name from self.adata.obs
|
|
260
|
+
* radius (float) - in microns
|
|
261
|
+
* method - ["mode", "median", "mean", "gaussian", "log"]
|
|
262
|
+
* new_col_name (str) - Optional custom name for the output column.
|
|
263
|
+
* \**kwargs - Additional Parameters for specific methods (e.g., sigma for gaussian, offset for log).
|
|
264
|
+
'''
|
|
265
|
+
coords = self.adata.obs[['um_x', 'um_y']].values
|
|
266
|
+
|
|
267
|
+
if self.tree is None:
|
|
268
|
+
# Build a KDTree for fast neighbor look-up.
|
|
269
|
+
print("Building coordinate tree")
|
|
270
|
+
self.tree = cKDTree(coords)
|
|
271
|
+
|
|
272
|
+
values = self[what]
|
|
273
|
+
if len(values) != self.adata.shape[0]:
|
|
274
|
+
raise ValueError(f"{what} not in adata.obs or a gene name")
|
|
275
|
+
|
|
276
|
+
if isinstance(values[0], str):
|
|
277
|
+
if method != "mode":
|
|
278
|
+
raise ValueError("Smoothing on string columns is only supported using the 'mode' method.")
|
|
279
|
+
|
|
280
|
+
smoothed_values = []
|
|
281
|
+
|
|
282
|
+
if method == "log":
|
|
283
|
+
offset = kwargs.get("offset", 1.0)
|
|
284
|
+
if np.min(values) < -offset:
|
|
285
|
+
raise ValueError(f"Negative values detected in '{what}'. Log smoothing requires all values >= {-offset}.")
|
|
286
|
+
elif method == "gaussian":
|
|
287
|
+
sigma = kwargs.get("sigma", radius / 2)
|
|
288
|
+
|
|
289
|
+
# Iterate through each object's coordinates, find neighbors, and compute the median.
|
|
290
|
+
for i, point in enumerate(tqdm(coords, desc=f"{method} filtering '{what}' in radius {radius}")):
|
|
291
|
+
# Find all neighbors within the given radius.
|
|
292
|
+
indices = self.tree.query_ball_point(point, radius)
|
|
293
|
+
if not indices:
|
|
294
|
+
# Assign the original value or np.nan if no neighbor is found.
|
|
295
|
+
new_val = values[i]
|
|
296
|
+
neighbor_values = values[indices]
|
|
297
|
+
|
|
298
|
+
if method == "median":
|
|
299
|
+
new_val = np.median(neighbor_values)
|
|
300
|
+
elif method == "mean":
|
|
301
|
+
new_val = np.mean(neighbor_values)
|
|
302
|
+
elif method == "mode":
|
|
303
|
+
if isinstance(neighbor_values[0], str):
|
|
304
|
+
unique_vals, counts = np.unique(neighbor_values, return_counts=True)
|
|
305
|
+
new_val = unique_vals[np.argmax(counts)]
|
|
306
|
+
else:
|
|
307
|
+
new_val = mode(neighbor_values).mode
|
|
308
|
+
elif method == "gaussian":
|
|
309
|
+
# Calculate distances to neighbors.
|
|
310
|
+
distances = np.linalg.norm(coords[indices] - point, axis=1)
|
|
311
|
+
|
|
312
|
+
# Compute Gaussian weights.
|
|
313
|
+
weights = np.exp(- (distances**2) / (2 * sigma**2))
|
|
314
|
+
new_val = np.sum(neighbor_values * weights) / np.sum(weights)
|
|
315
|
+
elif method == "log":
|
|
316
|
+
# Apply a log1p transform to handle zero values; add an offset if necessary.
|
|
317
|
+
offset = kwargs.get("offset", 1.0)
|
|
318
|
+
# It is assumed that neighbor_values + offset > 0.
|
|
319
|
+
new_val = np.expm1(np.median(np.log1p(neighbor_values + offset))) - offset
|
|
320
|
+
else:
|
|
321
|
+
raise ValueError(f"Unknown smoothing method: {method}")
|
|
322
|
+
|
|
323
|
+
smoothed_values.append(new_val)
|
|
324
|
+
|
|
325
|
+
if not new_col_name:
|
|
326
|
+
new_col_name = f'{what}_smooth_r{radius}'
|
|
327
|
+
self.adata.obs[new_col_name] = smoothed_values
|
|
328
|
+
|
|
329
|
+
def noise_mean_curve(self, plot=False, layer=None, signif_thresh=0.95, inplace=False, **kwargs):
|
|
330
|
+
'''
|
|
331
|
+
Generates a noise-mean curve of the data.
|
|
332
|
+
|
|
333
|
+
Parameters:
|
|
334
|
+
* plot (bool) - plot the curve
|
|
335
|
+
* layer (str) - which layer in the AnnData to use
|
|
336
|
+
* signif_thresh (float) - for plotting, add text for genes in this residual percentile
|
|
337
|
+
* inplace (bool) - add the mean_expression, cv and residuals to VAR
|
|
338
|
+
|
|
339
|
+
**Returns** dataframe with expression, CV and residuals of each gene (pd.DataFrame). \
|
|
340
|
+
If plot=true, will also return ax.
|
|
341
|
+
'''
|
|
342
|
+
return HiVis_utils.noise_mean_curve(self.adata, plot=plot,layer=layer,
|
|
343
|
+
signif_thresh=signif_thresh,inplace=inplace, **kwargs)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def cor(self, what, self_corr_value=None, normilize=True, layer: str = None, inplace=False):
|
|
347
|
+
'''
|
|
348
|
+
Calculates gene(s) correlation.
|
|
349
|
+
|
|
350
|
+
Parameters:
|
|
351
|
+
* what (str or list) - if str, computes Spearman correlation of a given gene with all genes. \
|
|
352
|
+
if list, will compute correlation between all genes in the list
|
|
353
|
+
* self_corr_value - replace the correlation of the gene with itself by this value
|
|
354
|
+
* normalize (bool) - normilize expression before computing correlation
|
|
355
|
+
* layer (str) - which layer in the AnnData to use
|
|
356
|
+
* inplace (bool) - add the correlation to VAR
|
|
357
|
+
|
|
358
|
+
**Returns** dataframe of spearman correlation between genes (pd.DataFrame)
|
|
359
|
+
'''
|
|
360
|
+
if isinstance(what, str):
|
|
361
|
+
x = self[what]
|
|
362
|
+
return HiVis_utils.cor_gene(self.adata, x, what, self_corr_value, normilize, layer, inplace)
|
|
363
|
+
return HiVis_utils.cor_genes(self.adata, what, self_corr_value, normilize, layer)
|
|
364
|
+
|
|
365
|
+
def sync(self, what: str):
|
|
366
|
+
'''
|
|
367
|
+
Transfers metadata assignment from the Aggregation to the spots.
|
|
368
|
+
|
|
369
|
+
Parameters:
|
|
370
|
+
* what (str) - obs column name to pass to HiViz object
|
|
371
|
+
'''
|
|
372
|
+
if what not in self.adata.obs:
|
|
373
|
+
raise KeyError(f"'{what}' does not exist in agg.adata.obs.")
|
|
374
|
+
agg_id_col = self.adata.obs.index.name
|
|
375
|
+
if agg_id_col not in self.viz.adata.obs.columns:
|
|
376
|
+
raise KeyError(f"'{agg_id_col}' does not exist in HiViz.adata.obs.")
|
|
377
|
+
mapping = self.adata.obs[what]
|
|
378
|
+
self.viz.adata.obs[what] = self.viz.adata.obs[agg_id_col].map(mapping)
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def export_h5(self, path=None):
|
|
382
|
+
'''
|
|
383
|
+
Exports the adata.
|
|
384
|
+
|
|
385
|
+
Parameters:
|
|
386
|
+
* path (str) - path to save the h5 file. If None, will save to path_output
|
|
387
|
+
|
|
388
|
+
**Returns** path where the file was saved (str)
|
|
389
|
+
'''
|
|
390
|
+
print(f"SAVING [{self.name}]")
|
|
391
|
+
if not path:
|
|
392
|
+
path = f"{self.path_output}/{self.name}.h5ad"
|
|
393
|
+
self.adata.write(path)
|
|
394
|
+
return path
|
|
395
|
+
|
|
396
|
+
    def dge(self, column, group1, group2=None, method="wilcox", two_sided=False,
            umi_thresh=0, inplace=False, layer=None):
        '''
        Runs differential gene expression analysis between two groups.
        Values will be saved in self.var: expression_mean, log2fc, pval

        Parameters:
            * column - which column in obs has the groups classification
            * group1 - specific value in the "column"
            * group2 - specific value in the "column". \
            if None, will run against all other values, and will be called "rest"
            * method - either "wilcox" or "t_test"
            * two_sided - if one sided, will give the pval for each group, \
            and the minimal of both groups (which will also be FDR adjusted)
            * umi_thresh - use only spots with more UMIs than this number
            * expression - function F {mean, min, max} F(mean(group1),mean(group2))
            * inplace - modify the adata.var with log2fc, pval and expression columns
            * layer (str) - which layer in the AnnData to use

        **Returns** the DGE results (pd.DataFrame)
        '''
        # One-sided tests use "greater"; HiVis_utils.dge is assumed to return
        # per-gene pval_{column}, log2fc_{column} and per-group mean columns.
        alternative = "two-sided" if two_sided else "greater"
        df = HiVis_utils.dge(self.adata, column, group1, group2, umi_thresh,layer=layer,
                             method=method, alternative=alternative, inplace=inplace)
        if group2 is None:
            group2 = "rest"
        # Keep only the p-value, fold-change and the two group-mean columns.
        df = df[[f"pval_{column}",f"log2fc_{column}",group1,group2]]
        df.rename(columns={f"log2fc_{column}":"log2fc"},inplace=True)
        if not two_sided:
            # One-sided: derive a p-value per direction, report the smaller.
            df[f"pval_{group1}"] = 1 - df[f"pval_{column}"]
            df[f"pval_{group2}"] = df[f"pval_{column}"]
            df["pval"] = df[[f"pval_{group1}",f"pval_{group2}"]].min(axis=1)
        else:
            df["pval"] = df[f"pval_{column}"]
        del df[f"pval_{column}"]
        # FDR adjustment of the reported p-values.
        df["qval"] = HiVis_utils.p_adjust(df["pval"])
        # Summary expression statistics across the two groups.
        df["expression_mean"] = df[[group1, group2]].mean(axis=1)
        df["expression_min"] = df[[group1, group2]].min(axis=1)
        df["expression_max"] = df[[group1, group2]].max(axis=1)
        df["gene"] = df.index
        if inplace:
            # Mirror the results into adata.var with column-suffixed names.
            var = df.copy()
            var.rename(columns={
                "qval":f"qval_{column}",
                "pval":f"pval_{column}",
                "log2fc":f"log2fc_{column}",
                "expression_mean":f"expression_mean_{column}",
                "expression_min":f"expression_min_{column}",
                "expression_max":f"expression_max_{column}",
                },inplace=True)
            del var["gene"]
            self.adata.var = self.adata.var.join(var, how="left")
        return df
+
|
|
450
|
+
    @property
    def shape(self):
        '''**Returns** Aggregation.adata.shape — (n_obs, n_vars)'''
        return self.adata.shape
|
+
|
|
455
|
+
def __str__(self):
|
|
456
|
+
s = f"# Aggregation # {self.name} #\n\n"
|
|
457
|
+
s += f"# Parent: {self.viz.name} #\n"
|
|
458
|
+
s += f"\tSize: {self.adata.shape[0]} x {self.adata.shape[1]}\n"
|
|
459
|
+
s += '\nobs: '
|
|
460
|
+
s += ', '.join(list(self.adata.obs.columns))
|
|
461
|
+
if not self.adata.var.columns.empty:
|
|
462
|
+
s += '\n\nvar: '
|
|
463
|
+
s += ', '.join(list(self.adata.var.columns))
|
|
464
|
+
layers = list(self.adata.layers.keys())
|
|
465
|
+
if layers:
|
|
466
|
+
s += '\n\nlayers: '
|
|
467
|
+
s += ', '.join(layers)
|
|
468
|
+
return s
|
|
469
|
+
|
|
470
|
+
def __repr__(self):
|
|
471
|
+
# s = f"Aggregation[{self.name}]"
|
|
472
|
+
s = self.__str__()
|
|
473
|
+
return s
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def combine(self, other):
|
|
477
|
+
'''
|
|
478
|
+
Combines two Aggregation objects into a single adata.
|
|
479
|
+
'''
|
|
480
|
+
return self + other
|
|
481
|
+
|
|
482
|
+
def __add__(self, other):
|
|
483
|
+
'''Combines two Aggregation objects into a single adata'''
|
|
484
|
+
if not isinstance(other, (Aggregation)):
|
|
485
|
+
raise ValueError("Addition supported only for Aggregation class")
|
|
486
|
+
self.adata.obs["source_"] = self.name
|
|
487
|
+
other.adata.obs["source_"] = other.name if other.name != self.name else f"{self.name}_1"
|
|
488
|
+
adata = ad.concat([self.adata, other.adata], join='outer')
|
|
489
|
+
del self.adata.obs["source_"]
|
|
490
|
+
del other.adata.obs["source_"]
|
|
491
|
+
return adata
|
|
492
|
+
|
|
493
|
+
def __delitem__(self, key):
|
|
494
|
+
'''Deletes metadata'''
|
|
495
|
+
if isinstance(key, str):
|
|
496
|
+
if key in self.adata.obs:
|
|
497
|
+
del self.adata.obs[key]
|
|
498
|
+
elif key in self.adata.var:
|
|
499
|
+
del self.adata.var[key]
|
|
500
|
+
else:
|
|
501
|
+
raise KeyError(f"'{key}' not found in adata.obs")
|
|
502
|
+
else:
|
|
503
|
+
raise TypeError(f"Key must be a string, not {type(key).__name__}")
|
|
504
|
+
|
|
505
|
+
def update(self):
|
|
506
|
+
'''
|
|
507
|
+
Updates the methods in the instance.
|
|
508
|
+
Should be used after modifying the source code in the class
|
|
509
|
+
'''
|
|
510
|
+
HiVis_utils.update_instance_methods(self)
|
|
511
|
+
HiVis_utils.update_instance_methods(self.plot)
|
|
512
|
+
_ = gc.collect()
|
|
513
|
+
|
|
514
|
+
def head(self, n=5):
|
|
515
|
+
'''**Returns** Aggregation.adata.obs.head(n), where n is number of rows'''
|
|
516
|
+
return self.adata.obs.head(n)
|
|
517
|
+
|
|
518
|
+
    @property
    def columns(self):
        '''**Returns** a copy of Aggregation.adata.obs.columns (pd.Index)'''
        return self.adata.obs.columns.copy()
+
|
|
523
|
+
def copy(self, new_name=None, new_out_path=False, full=False):
|
|
524
|
+
'''
|
|
525
|
+
Creates a deep copy of the instance
|
|
526
|
+
if new_name is specified, renames the object and changes the path_output.
|
|
527
|
+
If full is False, the name will be added to the current (previous) name.
|
|
528
|
+
|
|
529
|
+
**Returns** new Aggregation instance
|
|
530
|
+
'''
|
|
531
|
+
new = deepcopy(self)
|
|
532
|
+
new.viz = self.viz
|
|
533
|
+
gc.collect()
|
|
534
|
+
new = deepcopy(self)
|
|
535
|
+
if new_name:
|
|
536
|
+
new.rename(new_name, new_out_path=new_out_path, full=full)
|
|
537
|
+
return new
|
|
538
|
+
return new
|
|
539
|
+
|
|
540
|
+
def rename(self, new_name: str, new_out_path=True, full=False):
|
|
541
|
+
'''
|
|
542
|
+
Renames the object and changes the path_output.
|
|
543
|
+
If full is False, the name will be added to the current (previous) name
|
|
544
|
+
'''
|
|
545
|
+
if full:
|
|
546
|
+
self.name = new_name
|
|
547
|
+
else:
|
|
548
|
+
self.name = f"{self.viz.name}_{new_name}"
|
|
549
|
+
if new_out_path:
|
|
550
|
+
self.path_output = self.viz.path_output + f"/{new_name}"
|
|
551
|
+
|
|
552
|
+
    def export_to_matlab(self, path=None):
        '''
        Exports gene names, data (sparse matrix) and metadata to a .mat file.

        Parameters:
            * path (str) - directory to write into; defaults to
              "{path_output}/matlab". The .mat file is named "{name}.mat".

        Side effects: adds UMAP_1/UMAP_2 columns to self.adata.obs when a
        UMAP embedding exists, and writes a companion CSV of obs next to
        the .mat file.
        '''
        var_names = self.adata.var_names.to_numpy()
        # NOTE: this mutates self.adata.obs (UMAP columns persist after export).
        if 'X_umap' in self.adata.obsm:
            self.adata.obs['UMAP_1'] = self.adata.obsm['X_umap'][:, 0]
            self.adata.obs['UMAP_2'] = self.adata.obsm['X_umap'][:, 1]

        obs = self.adata.obs.copy()
        obs["Cell_ID"] = obs.index.tolist()

        # Shorten long column names in obs
        # (presumably to satisfy MATLAB's struct field-name length limit — TODO confirm)
        def shorten_col_names(columns, max_len=28):
            seen_names = {}   # base_name -> count, to disambiguate collisions
            rename_dict = {}  # original -> shortened
            for col in columns:
                if len(col) > max_len:
                    base_name = col[:max_len]
                    count = seen_names.get(base_name, 0)
                    new_name = f"{base_name}_{count}"
                    seen_names[base_name] = count + 1
                    rename_dict[col] = new_name
            return rename_dict

        rename_dict = shorten_col_names(obs.columns)
        obs = obs.rename(columns=rename_dict)

        # savemat struct keys must be valid identifiers; replace non-ASCII runs.
        def remove_non_ascii(d):
            return {re.sub(r'[^\x00-\x7F]+', '_', k): v for k, v in d.items()}

        obs = obs.to_dict(orient='list')
        obs = remove_non_ascii(obs)

        if not path:
            path = f"{self.path_output}/matlab"
        if not os.path.exists(path):
            os.makedirs(path)
        path = f"{path}/{self.name}.mat"
        print("[Saving mat file]")
        scipy.io.savemat(path, {"genes": var_names, "mat": self.adata.X,"metadata":obs})
        # NOTE(review): replacing ".mat" yields "<name>metadata.csv" (no
        # separator) — looks unintended but is preserved here.
        self.adata.obs.to_csv(path.replace(".mat","metadata.csv"))