napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
@@ -0,0 +1,462 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import numpy as np
|
5
|
+
import pandas as pd
|
6
|
+
from typing import Optional, Union, NamedTuple
|
7
|
+
from scipy.stats import gaussian_kde
|
8
|
+
from scipy.signal import find_peaks
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
def discretize_expression_data(
    expression_data: pd.DataFrame,
    metadata_attributes: Optional[list[str]] = None,
    min_row_sum: int = 50,
    zfpm_threshold: float = -3,
    min_peakheight: float = 0.02,
    min_peakdistance: int = 1,
    prominence: float = 0.05,
    verbose: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Discretize expression data into expressed / unexpressed calls using zFPKM.

    Parameters
    ----------
    expression_data: pandas DataFrame
        The expression data to discretize. Rows are features (e.g., genes) and
        numeric columns are samples/conditions.
    metadata_attributes: list[str], optional
        Non-numeric and other metadata attributes which should be included in the
        output but ignored when discretizing expression data. If None, all columns
        with an object or string dtype are treated as metadata.
    min_row_sum: int, optional
        Rows are kept only if the sum of their numeric values is strictly greater
        than this value (filters constitutively unexpressed genes).
    zfpm_threshold: float, optional
        The zFPKM threshold to use for discretization. Values strictly above the
        threshold are called expressed (1); all other values are unexpressed (0).
    min_peakheight: float, optional
        The minimum peak height to use for peak detection
    min_peakdistance: int, optional
        The minimum peak distance to use for peak detection
    prominence: float, optional
        The prominence to use for peak detection
    verbose: bool, optional
        Whether to log verbose output

    Returns
    -------
    tuple of pandas DataFrames
        A tuple of two pandas DataFrames. The first contains the metadata
        attributes joined (on the index) to the zFPKM-transformed expression
        data. The second contains the metadata attributes joined to the binary
        calls (0 for unexpressed, 1 for expressed). Rows removed by filtering
        are absent from both outputs.

    Raises
    ------
    ValueError
        If any non-metadata column is not an integer or floating point column.
    """

    if metadata_attributes is None:
        # Treat text-like columns as metadata; this covers both the classic
        # "object" dtype and pandas' dedicated string dtypes.
        metadata_attributes = [
            col
            for col, dtype in expression_data.dtypes.items()
            if pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        ]

    expression_numpy_df = expression_data.drop(columns=metadata_attributes)

    # ensure that all remaining variables are numeric (any integer/float width)
    invalid_variables = [
        col
        for col, dtype in expression_numpy_df.dtypes.items()
        if not (
            pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype)
        )
    ]

    if len(invalid_variables) > 0:
        raise ValueError(
            f"The following variables are not numeric: {invalid_variables}. Either include these in metadata_attributes, convert them to numeric, or remove them from the expression data."
        )

    # drop rows whose total expression does not exceed min_row_sum
    expression_numpy_df = expression_numpy_df.loc[
        expression_numpy_df.sum(axis=1) > min_row_sum, :
    ]

    n_unexpressed = expression_data.shape[0] - expression_numpy_df.shape[0]
    if n_unexpressed > 0:
        logger.info(
            "Removed %d genes whose expression across all samples was below %s.",
            n_unexpressed,
            min_row_sum,
        )

    logger.info("Discretizing expression data...")
    zfpkm_df = zfpkm(
        expression_numpy_df,
        min_peakheight=min_peakheight,
        min_peakdistance=min_peakdistance,
        prominence=prominence,
        verbose=verbose,
    )

    is_expressed = (zfpkm_df > zfpm_threshold).astype(int)

    # vectorised count; guard against an empty result so we never divide by zero
    n_expressed = int(is_expressed.to_numpy().sum())
    n_values = zfpkm_df.size
    expression_fraction = round(n_expressed / n_values, 3) if n_values > 0 else 0.0
    logger.info("Expression fraction: %s", expression_fraction)

    if expression_fraction < 0.01:
        logger.warning(
            "Less than 1% of the data was expressed. This is likely due to the zFPKM threshold being too high."
        )

    # inner join on the index: rows filtered out above are dropped from the metadata too
    metadata_df = pd.DataFrame(expression_data[metadata_attributes])

    return (
        metadata_df.merge(zfpkm_df, left_index=True, right_index=True),
        metadata_df.merge(is_expressed, left_index=True, right_index=True),
    )
|
111
|
+
|
112
|
+
|
113
|
+
def zfpkm(
    fpkm_df: pd.DataFrame,
    min_peakheight: float = 0.02,
    min_peakdistance: int = 1,
    prominence: float = 0.05,
    verbose: bool = False,
) -> pd.DataFrame:
    """Transform entire DataFrame using zFPKM.

    Each column (sample) is transformed independently with ``_zfpkm_calc``.

    Parameters
    ----------
    fpkm_df : pd.DataFrame
        DataFrame containing raw FPKM values.
        Rows = genes/transcripts, Columns = samples
    min_peakheight : float, optional
        Minimum height for peak detection, by default 0.02
    min_peakdistance : int, optional
        Minimum distance between peaks, by default 1
    prominence : float, optional
        Minimum prominence for peak detection, by default 0.05
    verbose : bool, optional
        Whether to log detailed information, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with zFPKM transformed values. It has the same columns as the
        input and the index of the rows that survive ``_remove_nan_inf_rows``.
        Entries that were NaN or infinite in the input are NaN in the output.

    Raises
    ------
    ValueError
        Propagated from ``_zfpkm_calc`` when a column has no finite values.
    """
    # Remove problematic rows
    fpkm_df = _remove_nan_inf_rows(fpkm_df)

    # Fill a preallocated array instead of inserting columns one at a time;
    # positional access also keeps duplicated column labels working.
    transformed = np.full(fpkm_df.shape, np.nan, dtype=float)

    for col_pos in range(fpkm_df.shape[1]):
        values = fpkm_df.iloc[:, col_pos].to_numpy(dtype=float)

        # _zfpkm_calc discards non-finite entries, so its result can be shorter
        # than the column. Transform only the finite entries and write them
        # back to their original positions so the rows stay aligned.
        finite = np.isfinite(values)
        n_non_finite = values.size - int(finite.sum())
        if n_non_finite > 0:
            logger.warning(
                "Column %s contains %d NaN/infinite values; these are left as NaN",
                fpkm_df.columns[col_pos],
                n_non_finite,
            )

        transformed[finite, col_pos] = _zfpkm_calc(
            values[finite], min_peakheight, min_peakdistance, prominence, verbose
        )

    return pd.DataFrame(transformed, index=fpkm_df.index, columns=fpkm_df.columns)
|
153
|
+
|
154
|
+
|
155
|
+
class PeakIndices(NamedTuple):
    """Peak positions (x-values) grouped by role.

    Parameters
    ----------
    major : float
        Position of the primary peak.
    minor : Optional[float]
        Position of the secondary peak, or None when there is no second peak.
    other : Optional[np.ndarray]
        Positions of any further peaks, or None when there are none.
    """

    # position of the primary peak
    major: float
    # position of the secondary peak; None if absent
    minor: Optional[float]
    # positions of all remaining peaks; None if absent
    other: Optional[np.ndarray]
|
171
|
+
|
172
|
+
|
173
|
+
class PeakSelector:
    """Detect peaks in a density curve and classify them by position.

    Parameters
    ----------
    min_peakheight : float, optional
        Minimum height for peak detection, by default 0.02
    min_peakdistance : int, optional
        Minimum distance between peaks, by default 1
    prominence : float, optional
        Minimum prominence for peak detection, by default 0.05
    verbose : bool, optional
        Whether to log the selected peaks at INFO level, by default False
    """

    def __init__(
        self,
        min_peakheight: float = 0.02,
        min_peakdistance: int = 1,
        prominence: float = 0.05,
        verbose: bool = False,
    ):
        self.verbose = verbose
        self.prominence = prominence
        self.min_peakdistance = min_peakdistance
        self.min_peakheight = min_peakheight

    def find_peaks(self, density_y: np.ndarray, x_eval: np.ndarray) -> PeakIndices:
        """Locate peaks in ``density_y`` and report their x-positions.

        Parameters
        ----------
        density_y : np.ndarray
            Y-values of the density estimation
        x_eval : np.ndarray
            X-values corresponding to density_y

        Returns
        -------
        PeakIndices
            ``major`` is the rightmost detected peak, ``minor`` the second
            rightmost (or None), and ``other`` any remaining peaks in ascending
            x order (or None). If no peak is detected, ``major`` is the
            x-position of the maximum density and the other fields are None.
        """
        # scipy.signal.find_peaks returns indices into density_y
        peak_idx, _properties = find_peaks(
            density_y,
            height=self.min_peakheight,
            distance=self.min_peakdistance,
            prominence=self.prominence,
        )
        n_peaks = len(peak_idx)

        logger.debug("Found %d peaks", n_peaks)

        if n_peaks == 0:
            # Nothing passed the filters: fall back to the global maximum
            return PeakIndices(
                major=x_eval[np.argmax(density_y)], minor=None, other=None
            )

        # Peak x-positions in ascending order, so the rightmost peak is last
        ordered = np.sort(x_eval[peak_idx])

        major = ordered[-1]
        minor = None
        other = None
        if n_peaks > 1:
            minor = ordered[-2]
        if n_peaks > 2:
            other = ordered[:-2]

        if self.verbose:
            minor_for_log = float("nan") if minor is None else minor
            logger.info(
                "Major peak at %.3f, minor peak at %.3f",
                major,
                minor_for_log,
            )
            if other is not None:
                logger.debug("Additional peaks at: %s", other)

        return PeakIndices(major=major, minor=minor, other=other)
|
251
|
+
|
252
|
+
|
253
|
+
def generate_simple_test_data(
    n_genes: int = 200, n_samples: int = 100, seed: int = 42
) -> pd.DataFrame:
    """Generate a simple test dataset for basic validation.

    Each gene gets a baseline log2 expression with gene-specific noise, and a
    random subset of samples is shifted upward to mimic "active" samples.

    Parameters
    ----------
    n_genes : int, optional
        Number of genes to generate, by default 200
    n_samples : int, optional
        Number of samples to generate, by default 100
    seed : int, optional
        Seed for the random number generator, by default 42

    Returns
    -------
    pd.DataFrame
        DataFrame with simulated FPKM values.
        Rows = genes, Columns = samples
    """
    # A private RandomState yields the same draws as seeding the global legacy
    # generator, but leaves the caller's global NumPy random state untouched.
    rng = np.random.RandomState(seed)

    fpkm_data = np.zeros((n_genes, n_samples))

    for gene_idx in range(n_genes):
        # NOTE: the order of the draws below determines the generated values
        active_fraction = rng.uniform(0.2, 0.8)  # 20-80% of samples active
        expression_delta = rng.gamma(shape=2, scale=0.5)
        noise_level = rng.gamma(shape=2, scale=0.5)

        n_active_samples = int(active_fraction * n_samples)
        base_log_expr = rng.normal(-1, 0.5)  # Slightly lower baseline

        log_expr = rng.normal(base_log_expr, noise_level, n_samples)

        if n_active_samples > 0:
            active_samples = rng.choice(n_samples, n_active_samples, replace=False)
            log_expr[active_samples] += expression_delta

        # simulated values are log2-scale; convert back to FPKM scale
        fpkm_data[gene_idx, :] = np.power(2, log_expr)

    gene_names = [f"ENSG{i:05d}" for i in range(n_genes)]
    sample_names = [f"Sample_{i+1:02d}" for i in range(n_samples)]

    return pd.DataFrame(fpkm_data, index=gene_names, columns=sample_names)
|
295
|
+
|
296
|
+
|
297
|
+
def _remove_nan_inf_rows(fpkm_df: pd.DataFrame) -> pd.DataFrame:
|
298
|
+
"""Remove rows containing all NaN or infinite values.
|
299
|
+
|
300
|
+
Parameters
|
301
|
+
----------
|
302
|
+
fpkm_df : pd.DataFrame
|
303
|
+
Input DataFrame with FPKM values
|
304
|
+
|
305
|
+
Returns
|
306
|
+
-------
|
307
|
+
pd.DataFrame
|
308
|
+
DataFrame with rows containing all NaN or infinite values removed
|
309
|
+
|
310
|
+
Notes
|
311
|
+
-----
|
312
|
+
Logs a warning if any rows are filtered out.
|
313
|
+
"""
|
314
|
+
initial_rows = len(fpkm_df)
|
315
|
+
clean_df = fpkm_df[
|
316
|
+
~fpkm_df.apply(lambda row: row.isna().all() or np.isinf(row).all(), axis=1)
|
317
|
+
]
|
318
|
+
filtered_rows = initial_rows - len(clean_df)
|
319
|
+
|
320
|
+
if filtered_rows > 0:
|
321
|
+
logger.warning(
|
322
|
+
"Filtered out %d rows containing all NaN or infinite values (from %d total rows)",
|
323
|
+
filtered_rows,
|
324
|
+
initial_rows,
|
325
|
+
)
|
326
|
+
|
327
|
+
return clean_df
|
328
|
+
|
329
|
+
|
330
|
+
def _zfpkm_calc(
    fpkm: Union[np.ndarray, pd.Series],
    min_peakheight: float = 0.02,
    min_peakdistance: int = 1,
    prominence: float = 0.05,
    verbose: bool = False,
) -> np.ndarray:
    """Perform zFPKM transform on a single sample of FPKM data.

    The zFPKM algorithm fits a kernel density estimate to the log2(FPKM)
    distribution of ALL GENES within a single sample. This requires:
    - Input: A vector of FPKM values for all genes in ONE sample
    - Many genes (typically 1000+) for meaningful density estimation
    - The algorithm identifies the rightmost peak as "active" gene expression

    Parameters
    ----------
    fpkm : Union[np.ndarray, pd.Series]
        Raw FPKM values for all genes in ONE sample (NOT log2 transformed)
    min_peakheight : float, optional
        Minimum height for peak detection, by default 0.02
    min_peakdistance : int, optional
        Minimum distance between peaks, by default 1
    prominence : float, optional
        Minimum prominence for peak detection, by default 0.05
    verbose : bool, optional
        Whether to log detailed information, by default False

    Returns
    -------
    np.ndarray
        Array of zFPKM values, one per finite input value. NaN and infinite
        inputs are dropped, so the result is shorter than the input when any
        are present.

    Raises
    ------
    ValueError
        If no valid FPKM values are found after filtering
    """

    # Convert to a float array and drop NaN / infinite entries
    fpkm = np.array(fpkm, dtype=float)
    initial_len = len(fpkm)
    fpkm = fpkm[np.isfinite(fpkm)]

    if len(fpkm) < initial_len:
        logger.warning(
            "Filtered out %d NaN/infinite values from input vector of length %d",
            initial_len - len(fpkm),
            initial_len,
        )

    if len(fpkm) == 0:
        raise ValueError("No valid FPKM values found")

    # Log2 transform. Values are floored at 1e-10 first, so exact zeros map to
    # log2(1e-10) (about -33.2) rather than -inf.
    fpkm_log2 = np.log2(np.maximum(fpkm, 1e-10))

    # Kernel density estimate with scipy's default bandwidth selection
    kde = gaussian_kde(fpkm_log2)

    # Evaluate the density on 512 points spanning the data range padded by 10%
    x_min, x_max = fpkm_log2.min(), fpkm_log2.max()
    x_range = x_max - x_min
    x_eval = np.linspace(x_min - 0.1 * x_range, x_max + 0.1 * x_range, 512)
    density_y = kde(x_eval)

    # Find peaks using PeakSelector; the major peak is the candidate centre
    peak_selector = PeakSelector(
        min_peakheight=min_peakheight,
        min_peakdistance=min_peakdistance,
        prominence=prominence,
        verbose=verbose,
    )
    peaks = peak_selector.find_peaks(density_y, x_eval)
    mu = peaks.major

    # Values to the right of the major peak are used to estimate the spread
    active_samples = fpkm_log2[fpkm_log2 > mu]
    n_active = len(active_samples)
    n_total = len(fpkm_log2)
    active_fraction = n_active / n_total

    if verbose:
        logger.info(
            "Active samples: %d/%d (%.2f%%)", n_active, n_total, 100 * active_fraction
        )

    # Use Hart et al. method only if we have enough active samples
    min_active_samples = max(5, int(0.1 * n_total))  # At least 5 or 10% of samples

    if n_active >= min_active_samples and active_fraction >= 0.05:
        U = np.mean(active_samples)
        center = mu
        stdev = (U - mu) * np.sqrt(np.pi / 2)
        # Must be assigned in every branch: it is read by the final verbose log
        method_used = "hart_et_al"
        if verbose:
            logger.info(
                "Using Hart et al. method: mu=%.3f, U=%.3f, stdev=%.3f", mu, U, stdev
            )
    else:
        # Fall back to sample standard deviation
        stdev = np.std(fpkm_log2, ddof=1)  # Use sample std dev (N-1)
        # Centre on the minor peak if it exists, otherwise on the major peak
        center = peaks.minor if peaks.minor is not None else peaks.major
        method_used = "sample_std"
        if verbose:
            logger.info(
                "Falling back to sample std dev: stdev=%.3f (too few active: %d)",
                stdev,
                n_active,
            )

    # Handle edge case where stdev might still be 0 or negative
    if stdev <= 0:
        stdev = 0.1  # Minimal fallback
        method_used = "minimal_fallback"
        center = np.mean(fpkm_log2)
        logger.warning(
            "Standard deviation calculation resulted in non-positive value. Using minimal fallback: %.3f",
            stdev,
        )

    # Compute zFPKM transform
    z_fpkm = (fpkm_log2 - center) / stdev

    if verbose:
        logger.info(
            "Method: %s, zFPKM range: %.3f to %.3f",
            method_used,
            z_fpkm.min(),
            z_fpkm.max(),
        )

    return z_fpkm
|