napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -153
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +49 -67
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +356 -0
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev6.dist-info/RECORD +0 -97
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
@@ -0,0 +1,462 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import numpy as np
5
+ import pandas as pd
6
+ from typing import Optional, Union, NamedTuple
7
+ from scipy.stats import gaussian_kde
8
+ from scipy.signal import find_peaks
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def discretize_expression_data(
    expression_data: pd.DataFrame,
    metadata_attributes: Optional[list[str]] = None,
    min_row_sum: int = 50,
    zfpm_threshold: float = -3,
    min_peakheight: float = 0.02,
    min_peakdistance: int = 1,
    prominence: float = 0.05,
    verbose: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Discretize expression data into expressed / unexpressed calls using zFPKM.

    Parameters
    ----------
    expression_data: pandas DataFrame
        The expression data to discretize (rows are genes, columns are
        samples plus any metadata attributes).
    metadata_attributes: list[str], optional
        Non-numeric and other metadata attributes which should be included in
        the output but ignored when discretizing expression data. If None,
        every column with ``object`` dtype is treated as metadata.
    min_row_sum: int, optional
        Rows are kept only if their sum across the expression columns is
        strictly greater than this value; this removes constitutively
        un-expressed genes.
    zfpm_threshold: float, optional
        The zFPKM threshold to use for discretization. Values at or below
        this threshold are considered unexpressed (0) in the sample/condition.
    min_peakheight: float, optional
        The minimum peak height to use for peak detection
    min_peakdistance: int, optional
        The minimum peak distance to use for peak detection
    prominence: float, optional
        The prominence to use for peak detection
    verbose: bool, optional
        Forwarded to ``zfpkm`` to enable detailed logging

    Returns
    -------
    tuple of pandas DataFrames
        A tuple of two pandas DataFrames. The first contains the
        zFPKM-transformed expression data with the metadata attributes merged
        on the left. The second contains binary values (0 for unexpressed,
        1 for expressed) with the metadata attributes merged on the left.
        Both are joined on the index, so only rows that survived filtering
        are present.

    Raises
    ------
    ValueError
        If any non-metadata column is not a NumPy integer or floating dtype.
    """

    if metadata_attributes is None:
        metadata_attributes = [
            col for col, dtype in expression_data.dtypes.items() if dtype == "object"
        ]

    expression_numpy_df = expression_data.drop(columns=metadata_attributes)

    # ensure that all variables are numeric; any NumPy signed/unsigned integer
    # or float width is accepted (not only int64/float64)
    invalid_variables = [
        col
        for col, dtype in zip(expression_numpy_df.columns, expression_numpy_df.dtypes)
        if not (isinstance(dtype, np.dtype) and dtype.kind in "iuf")
    ]

    if len(invalid_variables) > 0:
        raise ValueError(
            f"The following variables are not numeric: {invalid_variables}. Either include these in metadata_attributes, convert them to numeric, or remove them from the expression data."
        )

    # drop rows whose total expression does not exceed min_row_sum
    sufficiently_expressed = expression_numpy_df.sum(axis=1) > min_row_sum
    expression_numpy_df = expression_numpy_df.loc[sufficiently_expressed, :]

    n_unexpressed = expression_data.shape[0] - expression_numpy_df.shape[0]
    if n_unexpressed > 0:
        logger.info(
            "Removed %s genes whose expression across all samples was below %s.",
            n_unexpressed,
            min_row_sum,
        )

    logger.info("Discretizing expression data...")
    zfpkm_df = zfpkm(
        expression_numpy_df,
        min_peakheight=min_peakheight,
        min_peakdistance=min_peakdistance,
        prominence=prominence,
        verbose=verbose,
    )

    # NaN entries compare as False and are therefore reported as unexpressed
    is_expressed = (zfpkm_df > zfpm_threshold).astype(int)
    n_expressed = int(is_expressed.to_numpy().sum())
    # guard against an empty result (e.g. no expression columns) so that the
    # summary statistic cannot raise ZeroDivisionError
    if zfpkm_df.size > 0:
        expression_fraction = round(n_expressed / zfpkm_df.size, 3)
    else:
        expression_fraction = 0.0
    logger.info("Expression fraction: %s", expression_fraction)

    if expression_fraction < 0.01:
        logger.warning(
            "Less than 1% of the data was expressed. This is likely due to the zFPKM threshold being too high."
        )

    metadata_df = pd.DataFrame(expression_data[metadata_attributes])

    return (
        metadata_df.merge(zfpkm_df, left_index=True, right_index=True),
        metadata_df.merge(is_expressed, left_index=True, right_index=True),
    )
111
+
112
+
113
def zfpkm(
    fpkm_df: pd.DataFrame,
    min_peakheight: float = 0.02,
    min_peakdistance: int = 1,
    prominence: float = 0.05,
    verbose: bool = False,
) -> pd.DataFrame:
    """Transform entire DataFrame using zFPKM.

    Each column (sample) is transformed independently with ``_zfpkm_calc``.
    Rows without usable values are removed first by ``_remove_nan_inf_rows``.

    Parameters
    ----------
    fpkm_df : pd.DataFrame
        DataFrame containing raw FPKM values.
        Rows = genes/transcripts, Columns = samples
    min_peakheight : float, optional
        Minimum height for peak detection, by default 0.02
    min_peakdistance : int, optional
        Minimum distance between peaks, by default 1
    prominence : float, optional
        Minimum prominence for peak detection, by default 0.05
    verbose : bool, optional
        Whether to log detailed information, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with zFPKM transformed values, sharing the index and
        columns of the row-filtered input. Entries that were NaN or infinite
        in the input are NaN in the output.

    Raises
    ------
    ValueError
        Propagated from ``_zfpkm_calc`` when a column has no finite values.
    """
    # Remove problematic rows
    fpkm_df = _remove_nan_inf_rows(fpkm_df)

    # Pre-allocate the result so every column keeps the full row count;
    # positions that cannot be transformed stay NaN.
    transformed = np.full(fpkm_df.shape, np.nan, dtype=float)

    for position, col in enumerate(fpkm_df.columns):
        values = fpkm_df.iloc[:, position].to_numpy(dtype=float)
        finite = np.isfinite(values)

        n_non_finite = int((~finite).sum())
        if n_non_finite > 0:
            logger.warning(
                "Column %s contains %d NaN/infinite values; these are reported as NaN",
                col,
                n_non_finite,
            )

        # _zfpkm_calc discards non-finite entries and returns one value per
        # finite input, so only finite entries are passed in and the result
        # is scattered back to the matching rows.
        transformed[finite, position] = _zfpkm_calc(
            values[finite], min_peakheight, min_peakdistance, prominence, verbose
        )

    return pd.DataFrame(transformed, index=fpkm_df.index, columns=fpkm_df.columns)
153
+
154
+
155
class PeakIndices(NamedTuple):
    """Density-peak locations grouped by their role in the zFPKM fit.

    Attributes
    ----------
    major : float
        x-position of the primary peak. ``PeakSelector.find_peaks`` fills
        this with the rightmost detected peak, or with the location of the
        maximum density when no peak is detected.
    minor : Optional[float]
        x-position of the second-rightmost detected peak, or None when
        fewer than two peaks were detected.
    other : Optional[np.ndarray]
        x-positions of all remaining detected peaks, or None when fewer
        than three peaks were detected.
    """

    major: float
    minor: Optional[float]
    other: Optional[np.ndarray]
171
+
172
+
173
class PeakSelector:
    """Detect peaks in a density curve and rank them by x-position.

    Parameters
    ----------
    min_peakheight : float, optional
        Minimum height for peak detection, by default 0.02
    min_peakdistance : int, optional
        Minimum distance between peaks, by default 1
    prominence : float, optional
        Minimum prominence for peak detection, by default 0.05
    verbose : bool, optional
        Whether to log detailed information, by default False
    """

    def __init__(
        self,
        min_peakheight: float = 0.02,
        min_peakdistance: int = 1,
        prominence: float = 0.05,
        verbose: bool = False,
    ):
        self.min_peakheight = min_peakheight
        self.min_peakdistance = min_peakdistance
        self.prominence = prominence
        self.verbose = verbose

    def find_peaks(self, density_y: np.ndarray, x_eval: np.ndarray) -> PeakIndices:
        """Locate peaks in ``density_y`` and classify them as major/minor/other.

        Parameters
        ----------
        density_y : np.ndarray
            Y-values of the density estimation
        x_eval : np.ndarray
            X-values corresponding to density_y

        Returns
        -------
        PeakIndices
            The rightmost peak as ``major``, the next one to its left as
            ``minor`` and any further peaks (ascending x) as ``other``.
            When no peak is detected, ``major`` is the x-value at the
            maximum density and the other fields are None.
        """
        # The bare name resolves to the module-level scipy.signal.find_peaks,
        # not to this method.
        detected, _properties = find_peaks(
            density_y,
            height=self.min_peakheight,
            distance=self.min_peakdistance,
            prominence=self.prominence,
        )
        n_detected = len(detected)

        logger.debug("Found %d peaks", n_detected)

        if n_detected == 0:
            # No peak satisfied the thresholds: fall back to the density maximum
            return PeakIndices(
                major=x_eval[np.argmax(density_y)], minor=None, other=None
            )

        # Peak x-positions in ascending order; the last one is the rightmost
        ordered = np.sort(x_eval[detected])

        major = ordered[-1]
        minor = None
        other = None
        if n_detected > 1:
            minor = ordered[-2]
        if n_detected > 2:
            other = ordered[:-2]

        if self.verbose:
            minor_for_log = float("nan") if minor is None else minor
            logger.info(
                "Major peak at %.3f, minor peak at %.3f",
                major,
                minor_for_log,
            )
            if other is not None:
                logger.debug("Additional peaks at: %s", other)

        return PeakIndices(major=major, minor=minor, other=other)
251
+
252
+
253
def generate_simple_test_data(n_genes: int = 200, n_samples: int = 100) -> pd.DataFrame:
    """Simulate a small FPKM matrix for basic validation.

    For every gene a random subset of samples is marked "active" and
    receives a positive shift on the log2 scale; values are then
    exponentiated back to FPKM.

    Parameters
    ----------
    n_genes : int, optional
        Number of genes to generate, by default 200
    n_samples : int, optional
        Number of samples to generate, by default 100

    Returns
    -------
    pd.DataFrame
        DataFrame with simulated FPKM values.
        Rows = genes, Columns = samples

    Notes
    -----
    Reseeds NumPy's global random state with 42 on every call, so the
    output is reproducible and the global RNG is affected.
    """
    np.random.seed(42)

    simulated = np.empty((n_genes, n_samples))

    # NOTE: the order of the random draws below determines the output;
    # do not reorder them.
    for row in range(n_genes):
        fraction_active = np.random.uniform(0.2, 0.8)  # 20-80% of samples active
        shift = np.random.gamma(shape=2, scale=0.5)
        noise_sd = np.random.gamma(shape=2, scale=0.5)
        baseline = np.random.normal(-1, 0.5)  # Slightly lower baseline

        log2_values = np.random.normal(baseline, noise_sd, n_samples)

        active_count = int(fraction_active * n_samples)
        if active_count > 0:
            chosen = np.random.choice(n_samples, active_count, replace=False)
            log2_values[chosen] = log2_values[chosen] + shift

        simulated[row, :] = np.power(2, log2_values)

    return pd.DataFrame(
        simulated,
        index=[f"ENSG{i:05d}" for i in range(n_genes)],
        columns=[f"Sample_{i+1:02d}" for i in range(n_samples)],
    )
295
+
296
+
297
+ def _remove_nan_inf_rows(fpkm_df: pd.DataFrame) -> pd.DataFrame:
298
+ """Remove rows containing all NaN or infinite values.
299
+
300
+ Parameters
301
+ ----------
302
+ fpkm_df : pd.DataFrame
303
+ Input DataFrame with FPKM values
304
+
305
+ Returns
306
+ -------
307
+ pd.DataFrame
308
+ DataFrame with rows containing all NaN or infinite values removed
309
+
310
+ Notes
311
+ -----
312
+ Logs a warning if any rows are filtered out.
313
+ """
314
+ initial_rows = len(fpkm_df)
315
+ clean_df = fpkm_df[
316
+ ~fpkm_df.apply(lambda row: row.isna().all() or np.isinf(row).all(), axis=1)
317
+ ]
318
+ filtered_rows = initial_rows - len(clean_df)
319
+
320
+ if filtered_rows > 0:
321
+ logger.warning(
322
+ "Filtered out %d rows containing all NaN or infinite values (from %d total rows)",
323
+ filtered_rows,
324
+ initial_rows,
325
+ )
326
+
327
+ return clean_df
328
+
329
+
330
def _zfpkm_calc(
    fpkm: Union[np.ndarray, pd.Series],
    min_peakheight: float = 0.02,
    min_peakdistance: int = 1,
    prominence: float = 0.05,
    verbose: bool = False,
) -> np.ndarray:
    """Perform zFPKM transform on a single sample of FPKM data.

    The zFPKM algorithm fits a kernel density estimate to the log2(FPKM)
    distribution of ALL GENES within a single sample. This requires:
    - Input: A vector of FPKM values for all genes in ONE sample
    - Many genes (typically 1000+) for meaningful density estimation
    - The algorithm identifies the rightmost peak as "active" gene expression

    Parameters
    ----------
    fpkm : Union[np.ndarray, pd.Series]
        Raw FPKM values for all genes in ONE sample (NOT log2 transformed)
    min_peakheight : float, optional
        Minimum height for peak detection, by default 0.02
    min_peakdistance : int, optional
        Minimum distance between peaks, by default 1
    prominence : float, optional
        Minimum prominence for peak detection, by default 0.05
    verbose : bool, optional
        Whether to log detailed information, by default False

    Returns
    -------
    np.ndarray
        Array of zFPKM values, one per finite input value and in input
        order. NaN and infinite inputs are dropped, so the result is
        shorter than the input when such values are present.

    Raises
    ------
    ValueError
        If no valid FPKM values are found after filtering

    Notes
    -----
    NOTE(review): scipy's ``gaussian_kde`` is expected to reject inputs with
    fewer than two values or with zero variance; confirm against the scipy
    version in use if such inputs are possible.
    """

    # Convert to a float array and drop NaN / infinite entries
    fpkm = np.array(fpkm, dtype=float)
    initial_len = len(fpkm)
    fpkm = fpkm[np.isfinite(fpkm)]

    if len(fpkm) < initial_len:
        logger.warning(
            "Filtered out %d NaN/infinite values from input vector of length %d",
            initial_len - len(fpkm),
            initial_len,
        )

    if len(fpkm) == 0:
        raise ValueError("No valid FPKM values found")

    # Log2 transform. No pseudocount is added; values below 1e-10 (including
    # exact zeros) are clamped to 1e-10 so the logarithm stays finite.
    fpkm_log2 = np.log2(np.maximum(fpkm, 1e-10))

    # Kernel density estimate with gaussian_kde's default bandwidth
    kde = gaussian_kde(fpkm_log2)

    # Evaluate the density on 512 points spanning the data range padded by
    # 10% on each side
    x_min, x_max = fpkm_log2.min(), fpkm_log2.max()
    x_range = x_max - x_min
    x_eval = np.linspace(x_min - 0.1 * x_range, x_max + 0.1 * x_range, 512)
    density_y = kde(x_eval)

    # Find peaks using PeakSelector
    peak_selector = PeakSelector(
        min_peakheight=min_peakheight,
        min_peakdistance=min_peakdistance,
        prominence=prominence,
        verbose=verbose,
    )
    peaks = peak_selector.find_peaks(density_y, x_eval)
    mu = peaks.major

    # Values to the right of the major peak are used to estimate the spread
    # (method from Hart et al.)
    active_samples = fpkm_log2[fpkm_log2 > mu]
    n_active = len(active_samples)
    n_total = len(fpkm_log2)
    active_fraction = n_active / n_total

    if verbose:
        logger.info(
            "Active samples: %d/%d (%.2f%%)", n_active, n_total, 100 * active_fraction
        )

    # Use Hart et al. method only if we have enough active samples
    min_active_samples = max(5, int(0.1 * n_total))  # At least 5 or 10% of samples

    if n_active >= min_active_samples and active_fraction >= 0.05:  # At least 5% active
        U = np.mean(active_samples)
        center = mu
        stdev = (U - mu) * np.sqrt(np.pi / 2)
        # Must be set in this branch too; it is read by the final log call
        method_used = "hart_et_al"
        if verbose:
            logger.info(
                "Using Hart et al. method: mu=%.3f, U=%.3f, stdev=%.3f", mu, U, stdev
            )
    else:
        # Fall back to sample standard deviation
        stdev = np.std(fpkm_log2, ddof=1)  # Use sample std dev (N-1)
        # Center on the minor peak if it exists, otherwise on the major peak
        center = peaks.minor if peaks.minor is not None else peaks.major
        method_used = "sample_std"
        if verbose:
            logger.info(
                "Falling back to sample std dev: stdev=%.3f (too few active: %d)",
                stdev,
                n_active,
            )

    # Handle edge case where stdev might still be 0 or negative
    if stdev <= 0:
        stdev = 0.1  # Minimal fallback
        method_used = "minimal_fallback"
        center = np.mean(fpkm_log2)
        logger.warning(
            "Standard deviation calculation resulted in non-positive value. Using minimal fallback: %.3f",
            stdev,
        )

    # Compute zFPKM transform
    z_fpkm = (fpkm_log2 - center) / stdev

    if verbose:
        logger.info(
            "Method: %s, zFPKM range: %.3f to %.3f",
            method_used,
            z_fpkm.min(),
            z_fpkm.max(),
        )

    return z_fpkm