hormone2cell 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ """ calculate the hormone production and receiving strength in the datasets """
2
# Package metadata.
# NOTE(review): the wheel this file ships in is labelled 0.1.0, while the
# version string below reports 1.0.0 — confirm which one is intended.
__version__ = '1.0.0'
__author__ = 'Lijiang Fei'
4
+
5
+ from .utils import *
6
+ from .aveExp import compute_aveExp_by_category
7
+ from .combine_assay import combine_assay
8
+ from .hormone_strength import hormone_strength
9
+ from . import data
hormone2cell/aveExp.py ADDED
@@ -0,0 +1,325 @@
1
+ from __future__ import annotations
2
+ from .data import load_hormone_file
3
+ import os
4
+ import gc
5
+ from typing import List
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import scanpy as sc
10
+ from anndata import AnnData
11
+ from scipy.sparse import csr_matrix
12
+
13
+
14
def get_exp_percentage(
    sub: AnnData,
    gene_ids: List[str],
    clusteruse: str
) -> pd.DataFrame:
    """
    Summarize gene expression per cluster in long format.

    For every (cluster, gene) pair this computes the mean expression, the
    number of cells with expression > 0, the cluster size, and the percentage
    of expressing cells.

    Parameters
    ----------
    sub : AnnData
        Object exposing the expression matrix as `X` (cells x genes; dense or
        any scipy sparse format) and cell metadata as `obs`.
    gene_ids : List[str]
        One label per column of `sub.X`, in column order.
    clusteruse : str
        Column of `sub.obs` used to group cells into clusters.

    Returns
    -------
    result_df : pd.DataFrame
        Columns, in order: `clusteruse`, 'Gene', 'Expression',
        'ExpressedCellNumber', 'TotalCellNumber', 'Percentage'.
        Rows are ordered gene by gene, with clusters sorted inside each gene.

    Raises
    ------
    ValueError
        If `gene_ids` does not have one entry per column of `sub.X`.
    """
    # csr_matrix(...) accepts dense arrays as well as other sparse formats,
    # so non-CSR input no longer depends on the object having `.tocsr()`.
    obs_matrix = sub.X if isinstance(sub.X, csr_matrix) else csr_matrix(sub.X)

    n_genes = obs_matrix.shape[1]
    if len(gene_ids) != n_genes:
        raise ValueError(
            f"gene_ids has {len(gene_ids)} entries but the expression matrix "
            f"has {n_genes} columns."
        )

    # Plain NumPy labels keep the comparison below a NumPy boolean mask even
    # when the obs column is categorical or another pandas extension dtype.
    cluster_labels = np.asarray(sub.obs[clusteruse])
    unique_clusters = np.unique(cluster_labels)
    n_clusters = len(unique_clusters)

    mean_expression = np.zeros((n_clusters, n_genes), dtype=float)
    expressed_cells = np.zeros((n_clusters, n_genes), dtype=np.int64)
    total_cells = np.zeros(n_clusters, dtype=np.int64)

    for row, cluster in enumerate(unique_clusters):
        cluster_matrix = obs_matrix[cluster_labels == cluster]
        mean_expression[row] = np.asarray(cluster_matrix.mean(axis=0)).ravel()
        expressed_cells[row] = np.asarray((cluster_matrix > 0).sum(axis=0)).ravel()
        total_cells[row] = cluster_matrix.shape[0]

    # Column-major flattening walks gene by gene, clusters varying fastest.
    result_df = pd.DataFrame({
        clusteruse: np.tile(unique_clusters, n_genes),
        'Gene': np.repeat(np.asarray(gene_ids, dtype=object), n_clusters),
        'Expression': mean_expression.ravel(order='F'),
        'ExpressedCellNumber': expressed_cells.ravel(order='F'),
        'TotalCellNumber': np.tile(total_cells, n_genes),
    })

    # Every cluster comes from an observed label, so TotalCellNumber >= 1.
    result_df['Percentage'] = (
        result_df['ExpressedCellNumber'] / result_df['TotalCellNumber']
    ) * 100

    return result_df
88
+
89
+
90
+
91
+ # Define a function to scale values in column A to the range 0-1, similar with scanpy function
92
+ # def scale_within_group(x):
93
+ # """Scale values between 0 and 1 within a group."""
94
+ # return (x - x.min()) / (x.max() - x.min()) if x.max() != x.min() else x
95
+
96
+ # calculate the mean expression and pct of each gene in each cell type.
97
+ import numpy as np
98
+ import pandas as pd
99
+ import scanpy as sc
100
+ from anndata import AnnData
101
+
102
+
103
+ # calculate the mean expression and pct of each gene in each cell type.
104
def get_result_df(sub: AnnData, celltype_col: str, tissue_col: str) -> pd.DataFrame:
    """
    Compute per-cell-type expression statistics and attach a tissue label.

    Parameters
    ----------
    sub : AnnData
        AnnData object with expression matrix `X`, `var` (genes) and `obs`
        (cell metadata). It is modified in place: genes detected in fewer
        than 3 cells are removed and `obs[celltype_col]` is cast to
        categorical.
    celltype_col : str
        Column in `obs` that defines the cell types.
    tissue_col : str
        Column in `obs` that holds the tissue of each cell.

    Returns
    -------
    pd.DataFrame
        One row per (cell type, gene) with the columns `tissue_col`,
        `celltype_col`, 'Gene', 'ExpressedCellNumber', 'TotalCellNumber',
        'Percentage', 'Expression' and 'logExpression' (log1p of
        'Expression').
    """
    # 1) Drop genes expressed in fewer than 3 cells (in place on `sub`).
    sc.pp.filter_genes(sub, min_cells=3)

    # 2) Gene labels in matrix column order; cell types as categories.
    gene_ids = list(sub.var.index.values)
    sub.obs[celltype_col] = sub.obs[celltype_col].astype("category")

    # 3) Core statistics per (cell type, gene).
    result_df = get_exp_percentage(sub, gene_ids, clusteruse=celltype_col)

    # 4) One tissue per cell type. drop_duplicates keeps the first tissue
    #    seen, so a cell type spanning several tissues gets only that one.
    tissue_map = sub.obs[[celltype_col, tissue_col]].drop_duplicates(celltype_col)
    result_df = result_df.merge(tissue_map, on=celltype_col, how="left")

    # The former blanket "percentage" -> "Percentage" substring replacement on
    # all column names was dropped: get_exp_percentage already emits
    # 'Percentage', and the replacement could rename a user-supplied
    # tissue/cell-type column containing that substring.

    # 5) Reorder/select columns (keep only those that exist).
    keep_cols = [
        tissue_col,       # dynamic tissue column
        celltype_col,     # dynamic celltype column
        "Gene",
        "ExpressedCellNumber",
        "TotalCellNumber",
        "Percentage",
        "Expression",
    ]
    # .copy() makes the selection an independent frame, so adding a column
    # below does not trigger pandas' SettingWithCopyWarning.
    result_df = result_df[[c for c in keep_cols if c in result_df.columns]].copy()

    # 6) Log-transform the average expression.
    result_df["logExpression"] = np.log1p(result_df["Expression"])

    # Concise status message.
    print(f"Tissues in input: {sub.obs[tissue_col].unique().tolist()} — average calculation done.")

    return result_df
159
+
160
+
161
+ # ## list the files and sort by file size
162
+ # def get_sorted_file_names(folder_path):
163
+ # files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
164
+ # files_sorted = sorted(files, key=lambda f: os.path.getsize(os.path.join(folder_path, f)), reverse=False)
165
+ # return files_sorted
166
+
167
+
168
def _ensure_csr_x(adata: AnnData) -> AnnData:
    """
    Return an object whose `X` is a CSR sparse matrix.

    If `adata.X` is already a `csr_matrix`, `adata` itself is returned
    unchanged. Otherwise a copy of `adata` is made and its `X` is replaced by
    a CSR conversion, leaving the caller's object untouched.
    """
    if isinstance(adata.X, csr_matrix):
        return adata
    adata = adata.copy()
    # csr_matrix(...) handles dense arrays too, which have no `.tocsr()`.
    adata.X = csr_matrix(adata.X)
    return adata
175
+
176
+
177
def _map_celltype_tissue_to_cluster(sub: AnnData, celltype_tissue_col: str) -> AnnData:
    """
    Return a copy of `sub` with an integer `Cluster` column added to `obs`.

    The values are the pandas.Categorical codes of `obs[celltype_tissue_col]`,
    stored as int64. The input object is not modified.
    """
    annotated = sub.copy()
    codes = pd.Categorical(annotated.obs[celltype_tissue_col]).codes
    annotated.obs["Cluster"] = np.asarray(codes, dtype="int64")
    return annotated
187
+
188
+
189
def check_count_data(adata: AnnData) -> AnnData:
    """
    Pick the layer of `adata` that looks like raw counts.

    A matrix is treated as raw counts when its largest value exceeds 100.
    Both sparse matrices and dense NumPy arrays are accepted.

    Rules:
    - If max(adata.X) > 100 -> return `adata` itself (no copy is made)
    - Else, if adata.raw exists and max(adata.raw.X) > 100
      -> return adata.raw.to_adata()
    - Else -> print a message and raise ValueError

    Raises
    ------
    ValueError
        If neither `adata.X` nor `adata.raw.X` passes the threshold.
    """
    def _max_value(matrix) -> float:
        # Sparse matrices keep their stored entries in `.data`; a dense
        # ndarray is scanned directly (its `.data` is only a raw buffer).
        values = matrix if isinstance(matrix, np.ndarray) else matrix.data
        return values.max() if values.size > 0 else 0.0

    # 1. Check adata.X
    if _max_value(adata.X) > 100:
        return adata

    # 2. Check adata.raw if available
    if adata.raw is not None and _max_value(adata.raw.X) > 100:
        return adata.raw.to_adata()

    # 3. Neither X nor raw look like raw counts
    msg = (
        "Neither adata.X nor adata.raw.X seems to contain raw counts. "
        "Hormone2cell requires raw counts as input to compute celltype-level average expression."
    )
    print(msg)
    raise ValueError(msg)
215
+
216
+
217
def compute_aveExp_by_category(
    adata: AnnData,
    sc_sn_col: str = "suspension_type",
    celltype_col: str = "Celltype",
    tissue_col: str = 'Tissue',
) -> pd.DataFrame:
    """
    Average hormone-gene expression per cell type, computed separately for
    each assay category (e.g. 'cell' / 'nucleus').

    Steps
    -----
    - Check count data (raw counts required) and make sure `X` is CSR
    - Subset to the hormone-related genes from `load_hormone_file()`
    - Build a tissue-aware cell type label `Celltype_tissue` and encode it as
      an integer `Cluster` column
    - Normalize counts with `normalize_total(target_sum=10000)`
    - For each category in `sc_sn_col`, run `get_result_df()` on those cells
    - Concatenate the per-category tables and keep rows with logExpression > 0

    Parameters
    ----------
    adata : AnnData
        Input dataset; the caller's object is not modified.
    sc_sn_col : str
        Column in `obs` holding the assay category.
    celltype_col : str
        Column in `obs` holding the cell type.
    tissue_col : str
        Column in `obs` holding the tissue.

    Returns
    -------
    pd.DataFrame
        A single DataFrame (all categories concatenated) with the columns
        `tissue_col`, `celltype_col`, 'Cluster', 'Celltype_tissue', 'Gene',
        'ExpressedCellNumber', 'TotalCellNumber', 'Percentage',
        'logExpression' and `sc_sn_col`.

    Raises
    ------
    ValueError
        If the data do not look like raw counts, or if none of the
        hormone-related genes is present in `adata.var.index`.
    """
    # Check input data type (must be raw counts)
    adata = check_count_data(adata)
    adata = _ensure_csr_x(adata)

    # Subset to hormone-related genes. A boolean mask over var.index keeps the
    # genes in the dataset's own order, so the output order is reproducible.
    hormone_genes = set(load_hormone_file()['Gene'].unique())
    gene_mask = adata.var.index.isin(hormone_genes)
    n_genes = int(gene_mask.sum())
    if n_genes == 0:
        raise ValueError("No hormone-related genes were found in the current AnnData object.")
    print(f'{n_genes} hormone-related genes are found.')

    # Work on an independent copy of the (small) subset so that none of the
    # columns added below leak into the caller's object.
    adata = adata[:, gene_mask].copy()

    # Tissue-level unique cell type label, then integer-encoded clusters.
    adata.obs['Celltype_tissue'] = (
        adata.obs[tissue_col].astype('str') + '____' + adata.obs[celltype_col].astype('str')
    )
    adata = _map_celltype_tissue_to_cluster(adata, 'Celltype_tissue')

    # NOTE(review): normalization runs after the gene subset, so each cell is
    # scaled by its total over hormone-related genes only — confirm this is
    # intended and matches how the reference values were produced.
    sc.pp.normalize_total(adata, target_sum=10000)

    # Cluster -> (Celltype_tissue, cell type) lookup taken directly from obs,
    # which stays correct even if a tissue name contains the '____' separator.
    cluster_info = adata.obs[['Celltype_tissue', 'Cluster']].copy()
    cluster_info[celltype_col] = adata.obs[celltype_col].astype('str')
    cluster_info = cluster_info.drop_duplicates('Cluster')

    keep_cols = [
        tissue_col, celltype_col, "Cluster", 'Celltype_tissue', "Gene",
        "ExpressedCellNumber", "TotalCellNumber", "Percentage", "logExpression"
    ]

    # One pass per assay category; a single category is just a one-item loop.
    assay_labels = adata.obs[sc_sn_col].astype(str)
    results = []
    for cat in pd.unique(assay_labels):
        print(f'Computing average gene expression per cell type in the {cat} dataset.')
        # get_result_df filters genes in place, hence the per-category copy.
        sub = adata[assay_labels == cat].copy()
        result_df = get_result_df(sub=sub, tissue_col=tissue_col, celltype_col="Cluster")

        result_df = result_df.merge(cluster_info, on="Cluster", how="left")
        result_df = result_df[[c for c in keep_cols if c in result_df.columns]].copy()

        # Add category column
        result_df[sc_sn_col] = cat
        results.append(result_df)

    res_df = pd.concat(results, ignore_index=True)
    return res_df.loc[res_df['logExpression'] > 0]
323
+
324
+
325
+
@@ -0,0 +1,64 @@
1
+
2
+ from typing import Literal
3
+ import pandas as pd
4
+
5
+
6
def combine_assay(cell: pd.DataFrame, nucleus: pd.DataFrame,celltype_column: str) -> pd.DataFrame:
    """
    Merge the cell and nucleus assay tables into one.

    - Each row gets a key `Hormone_CT` = Hormone + '___' + cell type
    - Keys found in only one table are tagged 'cell_only' / 'nucleus_only'
    - Keys found in both tables are tagged 'both'; only the row with the
      highest `Strength` is kept for each such key
    - The optional columns 'Type' and 'tmp' are removed if present
    - Rows without a `Strength` value are dropped

    Raises
    ------
    ValueError
        If either table lacks 'Hormone', `celltype_column` or 'Strength'.
    """
    needed = {"Hormone", celltype_column, "Strength"}
    absent_cell = needed.difference(cell.columns)
    absent_nucleus = needed.difference(nucleus.columns)
    if absent_cell or absent_nucleus:
        raise ValueError(f"Missing columns: cell {absent_cell}, nucleus {absent_nucleus}")

    def _with_key(frame: pd.DataFrame) -> pd.DataFrame:
        # Copy first so the caller's frame is untouched; na_rep=None keeps a
        # missing part as a missing key instead of the literal text "nan".
        keyed = frame.copy()
        hormone = keyed["Hormone"].astype("string")
        celltype = keyed[celltype_column].astype("string")
        keyed["Hormone_CT"] = hormone.str.cat(celltype, sep="___", na_rep=None)
        return keyed

    def _pick(frame: pd.DataFrame, keys: pd.Index, label: str) -> pd.DataFrame:
        # Rows whose key belongs to `keys`, tagged with the given assay label.
        return frame.loc[frame["Hormone_CT"].isin(keys)].assign(assay=label)

    cell_keyed = _with_key(cell)
    nucleus_keyed = _with_key(nucleus)

    # Key membership via Index set operations
    cell_keys = pd.Index(cell_keyed["Hormone_CT"])
    nucleus_keys = pd.Index(nucleus_keyed["Hormone_CT"])
    cell_only_keys = cell_keys.difference(nucleus_keys)
    nucleus_only_keys = nucleus_keys.difference(cell_keys)
    shared_keys = cell_keys.intersection(nucleus_keys)

    cell_only = _pick(cell_keyed, cell_only_keys, "cell_only")
    nucleus_only = _pick(nucleus_keyed, nucleus_only_keys, "nucleus_only")

    # Shared keys: stack both assays, then keep the strongest row per key
    shared = pd.concat(
        [
            _pick(cell_keyed, shared_keys, "both"),
            _pick(nucleus_keyed, shared_keys, "both"),
        ],
        ignore_index=True,
    )
    if not shared.empty:
        shared = shared.dropna(subset=["Strength"])
        shared = shared.sort_values(["Hormone_CT", "Strength"], ascending=[True, False])
        shared = shared.drop_duplicates("Hormone_CT", keep="first").reset_index(drop=True)

    combined = pd.concat([cell_only, nucleus_only, shared], ignore_index=True)

    # Remove helper columns and rows that carry no Strength
    leftovers = [name for name in ("Type", "tmp") if name in combined.columns]
    combined = combined.drop(columns=leftovers, errors="ignore")
    return combined.loc[combined["Strength"].notna()]
hormone2cell/data.py ADDED
@@ -0,0 +1,71 @@
1
+ import importlib.resources
2
+ import pandas as pd
3
+ import scanpy as sc
4
+
5
+
6
def load_hormone_producing_file():
    """
    Load the packaged hormone table (pickle format) shipped inside this package.

    Returns the object stored in
    'HCA_Sub2_Table2C_hormones_v1.0.6_20260106.pkl', as read by pandas.
    """
    # Locate the resource inside the installed package, then unpickle it
    resource = importlib.resources.files(__package__).joinpath(
        'HCA_Sub2_Table2C_hormones_v1.0.6_20260106.pkl'
    )
    with resource.open("rb") as handle:
        return pd.read_pickle(handle)
14
+
15
+
16
def load_hormone_receptor_file():
    """
    Load the packaged hormone receptor table (pickle format) shipped inside
    this package.

    Returns the object stored in
    'HCA_Sub2_Table2D_receptors_v1.0.7_20260107.pkl', as read by pandas.
    """
    # Locate the resource inside the installed package, then unpickle it
    resource = importlib.resources.files(__package__).joinpath(
        'HCA_Sub2_Table2D_receptors_v1.0.7_20260107.pkl'
    )
    with resource.open("rb") as handle:
        return pd.read_pickle(handle)
24
+
25
def load_hormone_file():
    """
    Load the packaged pickle file that lists all hormone genes.

    Returns the object stored in 'Hormone_info_list_v1.0.7.pkl', as read by
    pandas.
    """
    # Locate the resource inside the installed package, then unpickle it
    resource = importlib.resources.files(__package__).joinpath(
        'Hormone_info_list_v1.0.7.pkl'
    )
    with resource.open("rb") as handle:
        return pd.read_pickle(handle)
33
+
34
def load_precomputed_maxvalue(assay: str) -> pd.DataFrame:
    """
    Load the packaged table of precomputed max average expression values.

    Parameters
    ----------
    assay : str
        Either "cell" or "nucleus"; selects which packaged pickle is read.

    Returns
    -------
    pd.DataFrame
        Object loaded from the pickle file that matches `assay`.

    Raises
    ------
    ValueError
        If `assay` is neither "cell" nor "nucleus".
    """
    # Reject unknown assays before touching any package resource
    if assay not in ('cell', 'nucleus'):
        raise ValueError("assay must be either 'cell' or 'nucleus'.")

    # The two assays use symmetric file names
    file = (
        'HormoneCellAtlas_v11_finegrained_max_value_cell.pkl'
        if assay == 'cell'
        else 'HormoneCellAtlas_v11_finegrained_max_value_nucleus.pkl'
    )

    resource = importlib.resources.files(__package__).joinpath(file)
    with resource.open("rb") as handle:
        return pd.read_pickle(handle)
61
+
62
+
63
+
64
def load_pancreas_data():
    """
    Load the packaged, down-sampled pancreas dataset used as an example query.

    According to the original description it includes both single-cell and
    single-nucleus data. The file 'pancreas_downsample200CT.h5ad' is read
    with scanpy.
    """
    # Resolve the file inside the installed package and hand the path to scanpy
    package_root = importlib.resources.files(__package__)
    return sc.read(package_root.joinpath('pancreas_downsample200CT.h5ad'))