HiVis 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,594 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Aggregation of spots from HiVis
4
+ """
5
+
6
+ from copy import deepcopy
7
+ import warnings
8
+ import re
9
+ import gc
10
+ import os
11
+ import numpy as np
12
+ import pandas as pd
13
+ import anndata as ad
14
+ from shapely.affinity import scale
15
+ from scipy.stats import mode
16
+ import scipy.io
17
+ from scipy.spatial import cKDTree
18
+ from tqdm import tqdm
19
+ import geopandas as gpd
20
+
21
+ from . import HiVis_plot
22
+ from . import HiVis_utils
23
+
24
+ class Aggregation:
25
+ '''
26
+ Stores data of HiVis that have been aggregated (for example to single-cells). \
27
+ Enables plotting via Aggregation.plot. Each instance is linked to a HiViz object.
28
+ '''
29
+
30
def __init__(self, hiviz_instance, adata_agg, name, geojson_agg_path=None):
    '''
    Creates a new instance that is linked to a HiViz object.

    Parameters:
        * hiviz_instance (HiViz) - HiViz object. Its "json" scale factors and
          "path_output" are read here
        * adata_agg (ad.AnnData) - anndata of aggregations. obs must include
          "pxl_col_in_fullres" and "pxl_row_in_fullres"
        * name (str) - name of object, also used as the output sub-directory
        * geojson_agg_path (str) - path of geojson, exported annotations.
          If given, geometry is imported with object_type="cell"

    Raises ValueError if adata_agg is not an AnnData, lacks the pixel columns,
    or has no row with a non-missing "pxl_col_in_fullres".
    '''
    if not isinstance(adata_agg, ad.AnnData):
        raise ValueError("Adata must be Anndata object")
    required_cols = ("pxl_col_in_fullres", "pxl_row_in_fullres")
    if any(col not in adata_agg.obs.columns for col in required_cols):
        raise ValueError("Anndata.obs must include [pxl_col_in_fullres, pxl_row_in_fullres ]")

    # Keep only aggregations that have a position; copy so the caller's
    # AnnData is not modified by the columns added below.
    adata_agg = adata_agg[adata_agg.obs["pxl_col_in_fullres"].notna(), :].copy()
    if adata_agg.shape[0] == 0:
        raise ValueError("Filtered AnnData object is empty. No valid rows remain.")

    # Derive low/high resolution pixel coordinates and micron coordinates
    # from the full-resolution pixel coordinates.
    scalefactor_json = hiviz_instance.json
    for resolution, factor_key in (("lowres", "tissue_lowres_scalef"),
                                   ("highres", "tissue_hires_scalef")):
        factor = scalefactor_json[factor_key]
        adata_agg.obs[f"pxl_col_in_{resolution}"] = adata_agg.obs["pxl_col_in_fullres"] * factor
        adata_agg.obs[f"pxl_row_in_{resolution}"] = adata_agg.obs["pxl_row_in_fullres"] * factor
    adata_agg.obs["um_x"] = adata_agg.obs["pxl_col_in_fullres"] * scalefactor_json["microns_per_pixel"]
    adata_agg.obs["um_y"] = adata_agg.obs["pxl_row_in_fullres"] * scalefactor_json["microns_per_pixel"]

    self.adata = adata_agg
    self.viz = hiviz_instance
    self.name = name
    self.path_output = self.viz.path_output + f"/{self.name}"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(self.path_output, exist_ok=True)
    self.plot = HiVis_plot.PlotAgg(self)
    self.adata_cropped = None
    self.tree = None  # spatial KD-tree, built lazily by smooth()

    if geojson_agg_path:
        self.import_geometry(geojson_agg_path, object_type="cell")
69
+
70
+
71
def import_geometry(self, geojson_path, object_type="cell"):
    '''
    Adds "geometry" column to self.adata.obs, based on Geojson exported from Qupath.
    Geometries are scaled by "microns_per_pixel" and stored as WKT strings.

    Parameters:
        * geojson_path (str, os.PathLike or gpd.GeoDataFrame) - path to geojson file,
          or an already loaded GeoDataFrame
        * object_type (str) - which "objectType" to merge from the geojson

    Raises TypeError if geojson_path is neither a path nor a GeoDataFrame.
    '''
    if isinstance(geojson_path, gpd.GeoDataFrame):
        gdf = geojson_path
    elif isinstance(geojson_path, (str, os.PathLike)):
        gdf = gpd.read_file(geojson_path)
    else:
        # Previously this fell through and failed later on an unbound variable.
        raise TypeError(
            f"geojson_path must be a path or a GeoDataFrame, not {type(geojson_path).__name__}")

    index_name = self.adata.obs.index.name
    # Filter rows and columns in one step; copy so the caller's frame is untouched.
    gdf = gdf.loc[gdf["objectType"] == object_type, ["id", "geometry"]].copy()
    gdf = gdf.rename(columns={"id": index_name})

    microns_per_pixel = self.viz.json["microns_per_pixel"]
    gdf["geometry"] = gdf["geometry"].apply(
        lambda geom: scale(geom, xfact=microns_per_pixel, yfact=microns_per_pixel, origin=(0, 0)))
    with warnings.catch_warnings():
        # Replacing the geometry column with strings triggers a UserWarning.
        warnings.simplefilter("ignore", category=UserWarning)
        gdf["geometry"] = gdf["geometry"].apply(lambda geom: geom.wkt)
    gdf = gdf.set_index(index_name)

    if "geometry" in self.adata.obs.columns:
        print("Geometry column already exists, overwriting...")
        del self.adata.obs["geometry"]
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Geometry column does not contain geometry")
        self.adata.obs = self.adata.obs.join(gdf, how="left")
101
+
102
+
103
def merge(self, adata, obs=None, var=None, umap=True, pca=True, hvg=True):
    '''
    Merge info from an anndata to self.adata, in case genes have been filtered.
    Neither the "obs"/"var" arguments nor the source anndata are modified.

    Parameters:
        * adata (ad.AnnData) - anndata where to get the values from
        * obs - single string or list of obs to merge
        * var - single string or list of var to merge
        * umap (bool) - add umap to OBSM, and UMAP coordinates to obs
        * pca (bool) - add PCA to OBSM
        * hvg (bool) - add highly variable genes to vars
    '''
    def _as_list(names):
        # Normalise None / "" / str / iterable into a fresh list.
        if names is None:
            return []
        if isinstance(names, str):
            return [names] if names else []
        return list(names)

    umap_cols = ["UMAP_1", "UMAP_2"]
    obs_cols = _as_list(obs)
    var_cols = _as_list(var)
    same_n_obs = self.adata.shape[0] == adata.shape[0]
    obs_source = adata.obs

    if umap and "X_umap" in adata.obsm:
        umap_coords = adata.obsm['X_umap']
        if same_n_obs:
            self.adata.obsm['X_umap'] = umap_coords.copy()
        else:
            print("Cant add UMAP to obsm, size of adatas don't match")
        # Build the UMAP columns in a separate frame instead of writing
        # them into the source anndata.
        umap_frame = pd.DataFrame(
            {"UMAP_1": umap_coords[:, 0], "UMAP_2": umap_coords[:, 1]},
            index=adata.obs.index)
        obs_cols = [col for col in obs_cols if col not in umap_cols]
        obs_source = adata.obs[obs_cols].join(umap_frame)
        obs_cols = obs_cols + umap_cols

    if pca and "X_pca" in adata.obsm and same_n_obs:
        self.adata.obsm['X_pca'] = adata.obsm['X_pca'].copy()

    if hvg and 'highly_variable' in adata.var.columns and 'highly_variable' not in var_cols:
        var_cols.append('highly_variable')

    if obs_cols:
        # Drop stale copies first so the join cannot hit overlapping columns.
        stale = [col for col in obs_cols if col in self.adata.obs.columns]
        self.adata.obs = self.adata.obs.drop(columns=stale).join(obs_source[obs_cols], how="left")
    if var_cols:
        stale = [col for col in var_cols if col in self.adata.var.columns]
        self.adata.var = self.adata.var.drop(columns=stale).join(adata.var[var_cols], how="left")
151
+
152
+
153
def get(self, what, cropped=False, geometry=False):
    '''
    get a vector from data (a gene) or metadata (from obs or var). or subset the object.

    Lookup order for a string: exact obs column, exact gene name, exact var column,
    then case-insensitive obs column, organism-specific gene capitalisation
    (mouse: "Gene", human: "GENE"), and case-insensitive var column.

    Parameters:
        * what - if string, will get data or metadata.
          else, will return a new Aggregation object that is spliced.
          the splicing is passed to the self.adata.
        * cropped (bool) - get the data from the adata_cropped after plotting spatial
        * geometry (bool) - include only objects which have geometry

    **Returns**: either np.array of data or, if subsetting, a new Aggregation instance.
    None if a string is given and nothing matches.
    '''
    adata = self.adata_cropped if cropped else self.adata
    if adata is None:
        raise ValueError("No cropped data available (adata_cropped is None)")
    if geometry and self.plot.geometry is not None:
        adata = adata[adata.obs.index.isin(self.plot.geometry.index)]

    if not isinstance(what, str):
        # Create a new Aggregation object based on adata subsetting
        return self.subset(what)

    def _column_values(column):
        # Categorical columns are returned as plain strings.
        if column.dtype.name == 'category':
            return column.astype(str).values
        return column.values

    def _gene_values(gene):
        # X may be sparse or dense; always return a flat 1-D array.
        x = adata[:, gene].X
        if hasattr(x, "todense"):
            x = x.todense()
        return np.asarray(x).ravel()

    def _match_ignoring_case(frame, key):
        # First column whose lower-cased name equals key, else None.
        for col in frame.columns:
            if str(col).lower() == key:
                return col
        return None

    # Exact matches first.
    if what in adata.obs.columns:  # Metadata
        return _column_values(adata.obs[what])
    if what in adata.var.index:  # A gene
        return _gene_values(what)
    if what in adata.var.columns:  # Gene metadata
        return _column_values(adata.var[what])

    # Case-insensitive fallbacks.
    what_lower = what.lower()
    obs_col = _match_ignoring_case(adata.obs, what_lower)
    if obs_col is not None:
        return _column_values(adata.obs[obs_col])
    if self.viz.organism == "mouse" and (what_lower.capitalize() in adata.var.index):
        return _gene_values(what_lower.capitalize())
    if self.viz.organism == "human" and (what.upper() in adata.var.index):
        return _gene_values(what.upper())
    var_col = _match_ignoring_case(adata.var, what_lower)
    if var_col is not None:
        return _column_values(adata.var[var_col])
    return None
204
+
205
def subset(self, what=(slice(None), slice(None))):
    '''
    Create a new Aggregation object based on adata subsetting.
    Columns in var starting with "cor_" or "exp_" are dropped from the new object.

    Parameters:
        * what - index passed to self.adata[...]. Either an (obs, var) pair,
          or a single slice / array / pandas mask, which selects observations only

    **Returns** new Aggregation instance
    '''
    # A lone slice/array/Series is an obs selector; iterating over it (as the
    # tuple() below would) would turn each element into its own index.
    if isinstance(what, (slice, np.ndarray)) or hasattr(what, "to_numpy"):
        what = (what, slice(None))
    what = tuple(idx.to_numpy() if hasattr(idx, "to_numpy") else idx for idx in what)
    # Indexing an AnnData subsets X, obs, var and every layer consistently,
    # so layers need no separate slicing.
    adata = self.adata[what].copy()
    adata.var = adata.var.loc[:, ~adata.var.columns.str.startswith(("cor_", "exp_"))]
    return Aggregation(self.viz, adata, name=self.name)
216
+
217
+ def __getitem__(self, what):
218
+ '''get a vector from data (a gene) or metadata (from obs or var). or subset the object.'''
219
+ item = self.get(what, cropped=False)
220
+ if item is None:
221
+ raise KeyError(f"[{what}] isn't in data or metadatas")
222
+ return item
223
+
224
def pseudobulk(self, by=None, layer=None):
    '''
    Averages the gene expression over all observations, or within each group of an obs column.
    Works for both sparse and dense expression matrices.

    Parameters:
        * by (str) - return a dataframe, each column is a value in "by" (for example cluster), rows are genes.
          If None, will return the mean expression of every gene.
        * layer (str) - which layer in adata to use. If None, adata.X is used

    **Returns** mean expression per gene (pd.Series) if "by" is None,
    otherwise the mean expression for each group (pd.DataFrame, genes x groups)
    '''
    if layer is None:
        x = self.adata.X
    else:
        if layer not in self.adata.layers:
            raise KeyError(f"Layer '{layer}' not found in self.adata.layers. Available layers: {list(self.adata.layers.keys())}")
        x = self.adata.layers[layer]

    if by is None:
        # mean() gives np.matrix for sparse matrices and ndarray otherwise;
        # asarray + ravel flattens both.
        gene_means = np.asarray(x.mean(axis=0)).ravel()
        return pd.Series(gene_means, index=self.adata.var_names)

    dense = x.toarray() if hasattr(x, "toarray") else np.asarray(x)
    expr_df = pd.DataFrame(dense,
                           index=self.adata.obs_names,
                           columns=self.adata.var_names)
    group_key = self.adata.obs[by]
    return expr_df.groupby(group_key, observed=True).mean().T
252
+
253
+
254
def smooth(self, what, radius, method="median", new_col_name=None, **kwargs):
    '''
    Smooths a gene or an obs column using spatial neighbors within a radius.
    The result is written to a new column in self.adata.obs.

    Parameters:
        * what (str) - what to smooth. either a gene name or column name from self.adata.obs
        * radius (float) - in microns
        * method - ["mode", "median", "mean", "gaussian", "log"]
        * new_col_name (str) - Optional custom name for the output column.
          Defaults to "{what}_smooth_r{radius}"
        * \\**kwargs - Additional Parameters for specific methods
          (sigma for gaussian, default radius / 2; offset for log, default 1.0).

    Raises ValueError for an unknown method, for values that do not align with the
    observations, for string values with a method other than "mode", and for
    "log" when values fall below -offset.
    '''
    supported_methods = ("mode", "median", "mean", "gaussian", "log")
    if method not in supported_methods:
        # Fail before building the tree or iterating over all observations.
        raise ValueError(f"Unknown smoothing method: {method}")

    coords = self.adata.obs[['um_x', 'um_y']].values

    if self.tree is None or self.tree.n != coords.shape[0]:
        # Build a KDTree for fast neighbor look-up (rebuild if it no longer
        # matches the number of observations).
        print("Building coordinate tree")
        self.tree = cKDTree(coords)

    values = self[what]
    if len(values) != self.adata.shape[0]:
        raise ValueError(f"{what} not in adata.obs or a gene name")

    if isinstance(values[0], str) and method != "mode":
        raise ValueError("Smoothing on string columns is only supported using the 'mode' method.")

    # Method-specific parameters are resolved once, outside the loop.
    offset = kwargs.get("offset", 1.0)
    sigma = kwargs.get("sigma", radius / 2)
    if method == "log" and np.min(values) < -offset:
        raise ValueError(f"Negative values detected in '{what}'. Log smoothing requires all values >= {-offset}.")

    smoothed_values = []
    # Iterate through each object's coordinates, find neighbors, and aggregate.
    for i, point in enumerate(tqdm(coords, desc=f"{method} filtering '{what}' in radius {radius}")):
        # Find all neighbors within the given radius.
        indices = self.tree.query_ball_point(point, radius)
        if not indices:
            # No neighbor found: keep the original value.
            smoothed_values.append(values[i])
            continue
        neighbor_values = values[indices]

        if method == "median":
            new_val = np.median(neighbor_values)
        elif method == "mean":
            new_val = np.mean(neighbor_values)
        elif method == "mode":
            if isinstance(neighbor_values[0], str):
                unique_vals, counts = np.unique(neighbor_values, return_counts=True)
                new_val = unique_vals[np.argmax(counts)]
            else:
                # .mode is a scalar or a length-1 array depending on the SciPy
                # version; normalise to a scalar.
                new_val = np.atleast_1d(mode(neighbor_values).mode)[0]
        elif method == "gaussian":
            # Weight neighbors by a Gaussian of their distance to the point.
            distances = np.linalg.norm(coords[indices] - point, axis=1)
            weights = np.exp(- (distances**2) / (2 * sigma**2))
            new_val = np.sum(neighbor_values * weights) / np.sum(weights)
        else:  # method == "log"
            # Median in log1p space, transformed back. Assumes neighbor_values + offset > -1.
            new_val = np.expm1(np.median(np.log1p(neighbor_values + offset))) - offset

        smoothed_values.append(new_val)

    if not new_col_name:
        new_col_name = f'{what}_smooth_r{radius}'
    self.adata.obs[new_col_name] = smoothed_values
328
+
329
def noise_mean_curve(self, plot=False, layer=None, signif_thresh=0.95, inplace=False, **kwargs):
    '''
    Generates a noise-mean curve of the data.
    Thin wrapper that forwards self.adata and all arguments to HiVis_utils.noise_mean_curve.

    Parameters:
        * plot (bool) - plot the curve
        * layer (str) - which layer in the AnnData to use
        * signif_thresh (float) - for plotting, add text for genes in this residual percentile
        * inplace (bool) - add the mean_expression, cv and residuals to VAR
        * \\**kwargs - forwarded unchanged to HiVis_utils.noise_mean_curve

    **Returns** dataframe with expression, CV and residuals of each gene (pd.DataFrame).
    If plot=true, will also return ax.
    '''
    options = dict(plot=plot, layer=layer, signif_thresh=signif_thresh, inplace=inplace)
    options.update(kwargs)
    return HiVis_utils.noise_mean_curve(self.adata, **options)
344
+
345
+
346
def cor(self, what, self_corr_value=None, normilize=True, layer: str = None, inplace=False):
    '''
    Calculates gene(s) correlation.

    Parameters:
        * what (str or list) - if str, computes Spearman correlation of a given gene with all genes.
          if list, will compute correlation between all genes in the list
        * self_corr_value - replace the correlation of the gene with itself by this value
        * normilize (bool) - normalize expression before computing correlation
        * layer (str) - which layer in the AnnData to use
        * inplace (bool) - add the correlation to VAR (only forwarded when "what" is a str)

    **Returns** dataframe of spearman correlation between genes (pd.DataFrame)
    '''
    if not isinstance(what, str):
        # Several genes: pairwise correlation between them.
        return HiVis_utils.cor_genes(self.adata, what, self_corr_value, normilize, layer)
    # Single gene: correlate its vector against all genes.
    gene_vector = self[what]
    return HiVis_utils.cor_gene(self.adata, gene_vector, what, self_corr_value, normilize, layer, inplace)
364
+
365
def sync(self, what: str):
    '''
    Transfers metadata assignment from the Aggregation to the spots.
    Each spot receives the value of the aggregation whose ID is stored in the
    spot's column named after this object's obs index.

    Parameters:
        * what (str) - obs column name to pass to HiViz object
    '''
    agg_obs = self.adata.obs
    spot_obs = self.viz.adata.obs
    if what not in agg_obs:
        raise KeyError(f"'{what}' does not exist in agg.adata.obs.")
    agg_id_col = agg_obs.index.name
    if agg_id_col not in spot_obs.columns:
        raise KeyError(f"'{agg_id_col}' does not exist in HiViz.adata.obs.")
    # Aggregation ID -> value lookup, applied to every spot.
    self.viz.adata.obs[what] = spot_obs[agg_id_col].map(agg_obs[what])
379
+
380
+
381
def export_h5(self, path=None):
    '''
    Exports the adata by calling self.adata.write.

    Parameters:
        * path (str) - path to save the h5 file. If None, will save to
          "{path_output}/{name}.h5ad"

    **Returns** path where the file was saved (str)
    '''
    print(f"SAVING [{self.name}]")
    target = path if path else f"{self.path_output}/{self.name}.h5ad"
    self.adata.write(target)
    return target
395
+
396
def dge(self, column, group1, group2=None, method="wilcox", two_sided=False,
        umi_thresh=0, inplace=False, layer=None):
    '''
    Runs differential gene expression analysis between two groups.
    The computation is delegated to HiVis_utils.dge; this method reshapes the result.

    Parameters:
        * column - which column in obs has the groups classification
        * group1 - specific value in the "column"
        * group2 - specific value in the "column".
          if None, will run against all other values, and will be called "rest"
        * method - either "wilcox" or "t_test"
        * two_sided - if one sided, will give the pval for each group,
          and the minimal of both groups (which will also be FDR adjusted)
        * umi_thresh - use only spots with more UMIs than this number
        * inplace - modify the adata.var with the result columns
          (qval, pval, log2fc and expression_mean/min/max get a "_{column}" suffix)
        * layer (str) - which layer in the AnnData to use

    **Returns** the DGE results (pd.DataFrame) with columns log2fc, group1, group2,
    pval, qval, expression_mean, expression_min, expression_max, gene
    (plus pval_{group1} and pval_{group2} when one sided)
    '''
    alternative = "two-sided" if two_sided else "greater"
    df = HiVis_utils.dge(self.adata, column, group1, group2, umi_thresh, layer=layer,
                         method=method, alternative=alternative, inplace=inplace)
    if group2 is None:
        group2 = "rest"
    # Explicit copy: the columns below are added to our own frame, not to a
    # slice of the frame returned by HiVis_utils.dge.
    df = df[[f"pval_{column}", f"log2fc_{column}", group1, group2]].copy()
    df = df.rename(columns={f"log2fc_{column}": "log2fc"})
    if not two_sided:
        df[f"pval_{group1}"] = 1 - df[f"pval_{column}"]
        df[f"pval_{group2}"] = df[f"pval_{column}"]
        df["pval"] = df[[f"pval_{group1}", f"pval_{group2}"]].min(axis=1)
    else:
        df["pval"] = df[f"pval_{column}"]
    del df[f"pval_{column}"]
    df["qval"] = HiVis_utils.p_adjust(df["pval"])
    df["expression_mean"] = df[[group1, group2]].mean(axis=1)
    df["expression_min"] = df[[group1, group2]].min(axis=1)
    df["expression_max"] = df[[group1, group2]].max(axis=1)
    df["gene"] = df.index
    if inplace:
        var = df.drop(columns="gene").rename(columns={
            "qval": f"qval_{column}",
            "pval": f"pval_{column}",
            "log2fc": f"log2fc_{column}",
            "expression_mean": f"expression_mean_{column}",
            "expression_min": f"expression_min_{column}",
            "expression_max": f"expression_max_{column}",
        })
        # join() raises on overlapping column names, so drop results of any
        # earlier run (or columns already present in var) before joining.
        overlap = [col for col in var.columns if col in self.adata.var.columns]
        self.adata.var = self.adata.var.drop(columns=overlap).join(var, how="left")
    return df
449
+
450
+ @property
451
+ def shape(self):
452
+ '''**Returns** Aggregation.adata.shape'''
453
+ return self.adata.shape
454
+
455
+ def __str__(self):
456
+ s = f"# Aggregation # {self.name} #\n\n"
457
+ s += f"# Parent: {self.viz.name} #\n"
458
+ s += f"\tSize: {self.adata.shape[0]} x {self.adata.shape[1]}\n"
459
+ s += '\nobs: '
460
+ s += ', '.join(list(self.adata.obs.columns))
461
+ if not self.adata.var.columns.empty:
462
+ s += '\n\nvar: '
463
+ s += ', '.join(list(self.adata.var.columns))
464
+ layers = list(self.adata.layers.keys())
465
+ if layers:
466
+ s += '\n\nlayers: '
467
+ s += ', '.join(layers)
468
+ return s
469
+
470
+ def __repr__(self):
471
+ # s = f"Aggregation[{self.name}]"
472
+ s = self.__str__()
473
+ return s
474
+
475
+
476
def combine(self, other):
    '''
    Combines two Aggregation objects into a single adata.
    Equivalent to "self + other".
    '''
    combined = self + other
    return combined
481
+
482
def __add__(self, other):
    '''
    Combines two Aggregation objects into a single adata (outer join on genes).
    The result has a "source_" obs column holding the name of the object each
    row came from; if both names are equal, rows of "other" get "{name}_1".

    **Returns** the concatenated anndata (ad.AnnData), not an Aggregation.
    Raises ValueError if "other" is not an Aggregation.
    '''
    if not isinstance(other, Aggregation):
        raise ValueError("Addition supported only for Aggregation class")
    other_label = other.name if other.name != self.name else f"{self.name}_1"
    # Tag the rows temporarily; the tag is removed again in "finally" so a
    # failing concat cannot leave the column behind in either object.
    self.adata.obs["source_"] = self.name
    other.adata.obs["source_"] = other_label
    try:
        return ad.concat([self.adata, other.adata], join='outer')
    finally:
        for agg in (self, other):
            # Membership check: self and other may be the same object.
            if "source_" in agg.adata.obs.columns:
                del agg.adata.obs["source_"]
492
+
493
+ def __delitem__(self, key):
494
+ '''Deletes metadata'''
495
+ if isinstance(key, str):
496
+ if key in self.adata.obs:
497
+ del self.adata.obs[key]
498
+ elif key in self.adata.var:
499
+ del self.adata.var[key]
500
+ else:
501
+ raise KeyError(f"'{key}' not found in adata.obs")
502
+ else:
503
+ raise TypeError(f"Key must be a string, not {type(key).__name__}")
504
+
505
def update(self):
    '''
    Updates the methods in the instance and in its plot object.
    Should be used after modifying the source code in the class
    '''
    for target in (self, self.plot):
        HiVis_utils.update_instance_methods(target)
    gc.collect()
513
+
514
def head(self, n=5):
    '''**Returns** the first n rows of Aggregation.adata.obs'''
    obs = self.adata.obs
    return obs.head(n)
517
+
518
+ @property
519
+ def columns(self):
520
+ '''**Returns** Aggregation.adata.obs.columns'''
521
+ return self.adata.obs.columns.copy()
522
+
523
def copy(self, new_name=None, new_out_path=False, full=False):
    '''
    Creates a deep copy of the instance. The copy stays linked to the same
    parent HiViz object (the parent itself is not copied).
    if new_name is specified, renames the object and changes the path_output.
    If full is False, the name will be added to the current (previous) name.

    Parameters:
        * new_name (str) - if given, passed to rename() on the copy
        * new_out_path (bool) - passed to rename()
        * full (bool) - passed to rename()

    **Returns** new Aggregation instance
    '''
    # Pre-seeding the memo makes deepcopy reuse the parent instead of
    # duplicating it, and a single deepcopy is enough.
    memo = {id(self.viz): self.viz}
    new = deepcopy(self, memo)
    gc.collect()
    if new_name:
        new.rename(new_name, new_out_path=new_out_path, full=full)
    return new
539
+
540
def rename(self, new_name: str, new_out_path=True, full=False):
    '''
    Renames the object and optionally changes the path_output.

    Parameters:
        * new_name (str) - the new name
        * new_out_path (bool) - if True, path_output becomes "{parent path_output}/{new_name}"
        * full (bool) - if True, the name is exactly new_name.
          Otherwise it is "{parent name}_{new_name}"
    '''
    self.name = new_name if full else f"{self.viz.name}_{new_name}"
    if not new_out_path:
        return
    self.path_output = self.viz.path_output + f"/{new_name}"
551
+
552
def export_to_matlab(self, path=None):
    '''
    Exports gene names, data (sparse matrix) and metadata to a .mat file,
    and the unmodified obs table to a csv next to it.
    If "X_umap" exists in obsm, UMAP_1 and UMAP_2 columns are added to self.adata.obs.

    Parameters:
        * path (str) - output directory. If None, "{path_output}/matlab" is used.
          Files are written as "{path}/{name}.mat" and "{path}/{name}metadata.csv"
    '''
    var_names = self.adata.var_names.to_numpy()
    if 'X_umap' in self.adata.obsm:
        self.adata.obs['UMAP_1'] = self.adata.obsm['X_umap'][:, 0]
        self.adata.obs['UMAP_2'] = self.adata.obsm['X_umap'][:, 1]

    obs = self.adata.obs.copy()
    obs["Cell_ID"] = obs.index.tolist()

    # Shorten long column names in obs
    def shorten_col_names(columns, max_len=28):
        # Truncate to max_len and append a running counter per truncated
        # prefix, so distinct long names stay distinct.
        seen_names = {}
        rename_dict = {}
        for col in columns:
            if len(col) > max_len:
                base_name = col[:max_len]
                count = seen_names.get(base_name, 0)
                rename_dict[col] = f"{base_name}_{count}"
                seen_names[base_name] = count + 1
        return rename_dict

    obs = obs.rename(columns=shorten_col_names(obs.columns))

    # Replace runs of non-ASCII characters in the field names with "_".
    metadata = {re.sub(r'[^\x00-\x7F]+', '_', key): column
                for key, column in obs.to_dict(orient='list').items()}

    if not path:
        path = f"{self.path_output}/matlab"
    os.makedirs(path, exist_ok=True)
    mat_path = f"{path}/{self.name}.mat"
    # Build the csv name directly: str.replace(".mat", ...) would also rewrite
    # any ".mat" occurring in a directory name.
    csv_path = f"{path}/{self.name}metadata.csv"
    print("[Saving mat file]")
    scipy.io.savemat(mat_path, {"genes": var_names, "mat": self.adata.X, "metadata": metadata})
    self.adata.obs.to_csv(csv_path)
594
+