pylocuszoom 0.8.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/qq.py ADDED
@@ -0,0 +1,123 @@
+"""QQ plot data preparation and statistics."""
+
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+
+def calculate_lambda_gc(p_values: np.ndarray) -> float:
+    """Calculate genomic inflation factor (lambda GC).
+
+    Lambda is the ratio of the median observed chi-squared statistic
+    to the expected median under the null hypothesis.
+
+    Args:
+        p_values: Array of p-values.
+
+    Returns:
+        Genomic inflation factor (lambda). Returns NaN if no valid p-values.
+    """
+    # Remove NaN and zero/negative values
+    p_clean = p_values[~np.isnan(p_values) & (p_values > 0)]
+    if len(p_clean) == 0:
+        return np.nan
+
+    # Convert to chi-squared statistics (1 df)
+    chi2 = stats.chi2.ppf(1 - p_clean, df=1)
+
+    # Expected median for chi-squared with 1 df
+    expected_median = stats.chi2.ppf(0.5, df=1)
+
+    # Lambda = observed median / expected median
+    return np.median(chi2) / expected_median
+
+
+def calculate_confidence_band(
+    n_points: int, confidence: float = 0.95
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Calculate confidence band for QQ plot.
+
+    Uses order statistics to compute expected distribution of p-values
+    under the null hypothesis.
+
+    Args:
+        n_points: Number of p-values.
+        confidence: Confidence level (default 0.95 for 95% CI).
+
+    Returns:
+        Tuple of (expected, lower_bound, upper_bound) arrays in -log10 scale.
+    """
+    # Expected quantiles
+    expected = -np.log10(np.arange(1, n_points + 1) / (n_points + 1))
+
+    # Confidence interval using beta distribution
+    alpha = 1 - confidence
+    ranks = np.arange(1, n_points + 1)
+    n_minus_rank = n_points - ranks + 1
+
+    lower_p = stats.beta.ppf(alpha / 2, ranks, n_minus_rank)
+    upper_p = stats.beta.ppf(1 - alpha / 2, ranks, n_minus_rank)
+
+    # Convert to -log10 scale (swap because -log10 reverses order)
+    lower_bound = -np.log10(upper_p)
+    upper_bound = -np.log10(lower_p)
+
+    return expected, lower_bound, upper_bound
+
+
+def prepare_qq_data(
+    df: pd.DataFrame,
+    p_col: str = "p",
+) -> pd.DataFrame:
+    """Prepare DataFrame for QQ plot rendering.
+
+    Args:
+        df: DataFrame with p-values.
+        p_col: Column name for p-value.
+
+    Returns:
+        DataFrame with columns for QQ plotting:
+        - _expected: Expected -log10(p) under null
+        - _observed: Observed -log10(p)
+        - _ci_lower: Lower confidence bound
+        - _ci_upper: Upper confidence bound
+
+        Attributes stored in DataFrame.attrs:
+        - lambda_gc: Genomic inflation factor
+        - n_variants: Number of valid p-values
+    """
+    if p_col not in df.columns:
+        raise ValueError(f"Column '{p_col}' not found in DataFrame")
+
+    # Get p-values and filter invalid
+    p_values = df[p_col].values
+    valid_mask = ~np.isnan(p_values) & (p_values > 0) & (p_values <= 1)
+    p_valid = p_values[valid_mask]
+
+    if len(p_valid) == 0:
+        raise ValueError("No valid p-values found (must be > 0 and <= 1)")
+
+    # Sort p-values ascending (smallest p first -> largest -log10(p) first)
+    p_sorted = np.sort(p_valid)
+
+    # Calculate observed -log10(p)
+    observed = -np.log10(p_sorted)
+
+    # Calculate expected and confidence bands
+    expected, ci_lower, ci_upper = calculate_confidence_band(len(p_sorted))
+
+    # Create result DataFrame
+    result = pd.DataFrame(
+        {
+            "_expected": expected,
+            "_observed": observed,
+            "_ci_lower": ci_lower,
+            "_ci_upper": ci_upper,
+        }
+    )
+
+    # Store statistics in attrs
+    result.attrs["lambda_gc"] = calculate_lambda_gc(p_valid)
+    result.attrs["n_variants"] = len(p_valid)
+
+    return result
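The new `pylocuszoom/qq.py` module computes the genomic inflation factor lambda_GC = median(observed chi-squared) / median(chi-squared with 1 df), where the expected median is roughly 0.4549, and derives the confidence band from order statistics: under the null, the i-th smallest of n uniform p-values follows Beta(i, n - i + 1). A minimal usage sketch; the simulated data and printed values are illustrative, not taken from the package's docs:

```python
import numpy as np
import pandas as pd

from pylocuszoom.qq import calculate_lambda_gc, prepare_qq_data

# Simulated null GWAS: uniform p-values, so lambda_GC should be close to 1.0
rng = np.random.default_rng(42)
gwas = pd.DataFrame({"p": rng.uniform(size=10_000)})

qq = prepare_qq_data(gwas, p_col="p")
print(qq.columns.tolist())    # ['_expected', '_observed', '_ci_lower', '_ci_upper']
print(qq.attrs["lambda_gc"])  # ~1.0 for well-calibrated test statistics
print(qq.attrs["n_variants"])  # 10000
```

Values of lambda_GC noticeably above 1 (conventionally > ~1.05) suggest inflation from confounding such as population stratification.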
@@ -432,8 +432,8 @@ def add_recombination_overlay(
         region_recomb["pos"],
         region_recomb["rate"],
         color=RECOMB_COLOR,
-        linewidth=1.5,
-        alpha=0.7,
+        linewidth=2.5,
+        alpha=0.8,
         zorder=0,  # Behind scatter points
     )

@@ -447,14 +447,14 @@ def add_recombination_overlay(
         zorder=0,
     )

-    # Format secondary axis
-    recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color=RECOMB_COLOR, fontsize=9)
-    recomb_ax.tick_params(axis="y", labelcolor=RECOMB_COLOR, labelsize=8)
+    # Format secondary axis - use black for label text (more readable)
+    recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color="black", fontsize=9)
+    recomb_ax.tick_params(axis="y", labelcolor="black", labelsize=8)
     recomb_ax.set_ylim(bottom=0)

-    # Don't let recomb rate overwhelm the plot
+    # Scale to fit data with headroom
     max_rate = region_recomb["rate"].max()
-    recomb_ax.set_ylim(0, max(max_rate * 1.2, 20))
+    recomb_ax.set_ylim(0, max(max_rate * 1.3, 10))

     # Remove top spine for cleaner look
     recomb_ax.spines["top"].set_visible(False)
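These hunks make the recombination overlay line heavier (linewidth 2.5, alpha 0.8), switch the secondary-axis label and tick color from RECOMB_COLOR to black, and change the y-limit from `max(max_rate * 1.2, 20)` to `max(max_rate * 1.3, 10)`: more headroom above the data, and a lower floor so regions with little recombination are not flattened against a fixed 20 cM/Mb scale. A standalone sketch of the twin-axis pattern the function appears to use (matplotlib backend; the positions, rates, and RECOMB_COLOR value here are made up):

```python
import matplotlib.pyplot as plt
import numpy as np

RECOMB_COLOR = "#5CB8E6"  # assumed; the actual constant is not shown in this diff

fig, ax = plt.subplots()
pos = np.linspace(55_000_000, 56_000_000, 200)
rate = np.abs(np.sin(pos / 2e5)) * 40  # hypothetical recombination rates (cM/Mb)

recomb_ax = ax.twinx()  # secondary y-axis sharing the genomic x-axis
recomb_ax.plot(pos, rate, color=RECOMB_COLOR, linewidth=2.5, alpha=0.8, zorder=0)
recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color="black", fontsize=9)
recomb_ax.tick_params(axis="y", labelcolor="black", labelsize=8)

# New scaling: 30% headroom, with a 10 cM/Mb floor so flat regions stay readable
max_rate = rate.max()
recomb_ax.set_ylim(0, max(max_rate * 1.3, 10))
recomb_ax.spines["top"].set_visible(False)
```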
pylocuszoom/schemas.py CHANGED
@@ -10,12 +10,7 @@ from typing import Optional, Union
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, field_validator, model_validator

-
-class LoaderValidationError(Exception):
-    """Raised when loaded data fails validation."""
-
-    pass
-
+from .exceptions import LoaderValidationError

 # =============================================================================
 # GWAS Validation
@@ -0,0 +1,319 @@
+"""Statistical visualization plotter for PheWAS and forest plots.
+
+Provides variant-centric visualizations:
+- PheWAS plots showing associations across phenotypes
+- Forest plots showing effect sizes with confidence intervals
+"""
+
+from typing import Any, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+from ._plotter_utils import DEFAULT_GENOMEWIDE_THRESHOLD, transform_pvalues
+from .backends import BackendType, get_backend
+from .colors import get_phewas_category_palette
+from .forest import validate_forest_df
+from .phewas import validate_phewas_df
+
+
+class StatsPlotter:
+    """Statistical visualization plotter for PheWAS and forest plots.
+
+    Creates variant-centric visualizations for phenome-wide associations
+    and meta-analysis forest plots.
+
+    Args:
+        backend: Plotting backend ('matplotlib', 'plotly', or 'bokeh').
+        genomewide_threshold: P-value threshold for significance line.
+
+    Example:
+        >>> plotter = StatsPlotter()
+        >>> fig = plotter.plot_phewas(phewas_df, variant_id="rs12345")
+        >>> fig.savefig("phewas.png", dpi=150)
+    """
+
+    def __init__(
+        self,
+        backend: BackendType = "matplotlib",
+        genomewide_threshold: float = DEFAULT_GENOMEWIDE_THRESHOLD,
+    ):
+        """Initialize the stats plotter."""
+        self._backend = get_backend(backend)
+        self.genomewide_threshold = genomewide_threshold
+
+    def plot_phewas(
+        self,
+        phewas_df: pd.DataFrame,
+        variant_id: str,
+        phenotype_col: str = "phenotype",
+        p_col: str = "p_value",
+        category_col: str = "category",
+        effect_col: Optional[str] = None,
+        significance_threshold: float = DEFAULT_GENOMEWIDE_THRESHOLD,
+        figsize: Tuple[float, float] = (10, 8),
+    ) -> Any:
+        """Create a PheWAS (Phenome-Wide Association Study) plot.
+
+        Shows associations of a single variant across multiple phenotypes,
+        with phenotypes grouped by category and colored accordingly.
+
+        Args:
+            phewas_df: DataFrame with phenotype associations.
+            variant_id: Variant identifier (e.g., "rs12345") for plot title.
+            phenotype_col: Column name for phenotype names.
+            p_col: Column name for p-values.
+            category_col: Column name for phenotype categories.
+            effect_col: Optional column name for effect direction (beta/OR).
+            significance_threshold: P-value threshold for significance line.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_phewas(
+            ...     phewas_df,
+            ...     variant_id="rs12345",
+            ...     category_col="category",
+            ... )
+        """
+        validate_phewas_df(phewas_df, phenotype_col, p_col, category_col)
+
+        df = phewas_df.copy()
+        df = transform_pvalues(df, p_col)
+
+        # Sort by category then by p-value for consistent ordering
+        if category_col in df.columns:
+            df = df.sort_values([category_col, p_col])
+            categories = df[category_col].unique().tolist()
+            palette = get_phewas_category_palette(categories)
+        else:
+            df = df.sort_values(p_col)
+            categories = []
+            palette = {}
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (one per phenotype)
+        df["y_pos"] = range(len(df))
+
+        # Plot points by category
+        if categories:
+            for cat in categories:
+                # Handle NaN category: NaN == NaN is False in pandas
+                if pd.isna(cat):
+                    cat_data = df[df[category_col].isna()]
+                else:
+                    cat_data = df[df[category_col] == cat]
+                # Triangles encode effect direction when available, circles otherwise
+                if effect_col and effect_col in cat_data.columns:
+                    # Vectorized: split by effect sign, 2 scatter calls per category
+                    pos_data = cat_data[cat_data[effect_col] >= 0]
+                    neg_data = cat_data[cat_data[effect_col] < 0]
+
+                    if not pos_data.empty:
+                        self._backend.scatter(
+                            ax,
+                            pos_data["neglog10p"],
+                            pos_data["y_pos"],
+                            colors=palette[cat],
+                            sizes=60,
+                            marker="^",
+                            edgecolor="black",
+                            linewidth=0.5,
+                            zorder=2,
+                        )
+                    if not neg_data.empty:
+                        self._backend.scatter(
+                            ax,
+                            neg_data["neglog10p"],
+                            neg_data["y_pos"],
+                            colors=palette[cat],
+                            sizes=60,
+                            marker="v",
+                            edgecolor="black",
+                            linewidth=0.5,
+                            zorder=2,
+                        )
+                else:
+                    self._backend.scatter(
+                        ax,
+                        cat_data["neglog10p"],
+                        cat_data["y_pos"],
+                        colors=palette[cat],
+                        sizes=60,
+                        marker="o",
+                        edgecolor="black",
+                        linewidth=0.5,
+                        zorder=2,
+                    )
+        else:
+            self._backend.scatter(
+                ax,
+                df["neglog10p"],
+                df["y_pos"],
+                colors="#4169E1",
+                sizes=60,
+                edgecolor="black",
+                linewidth=0.5,
+                zorder=2,
+            )
+
+        # Add significance threshold line
+        sig_line = -np.log10(significance_threshold)
+        self._backend.axvline(
+            ax, x=sig_line, color="red", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, r"$-\log_{10}$ P")
+        self._backend.set_ylabel(ax, "Phenotype")
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Set y-tick labels to phenotype names
+        self._backend.set_yticks(
+            ax,
+            positions=df["y_pos"].tolist(),
+            labels=df[phenotype_col].tolist(),
+            fontsize=8,
+        )
+
+        self._backend.set_title(ax, f"PheWAS: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
+
+    def plot_forest(
+        self,
+        forest_df: pd.DataFrame,
+        variant_id: str,
+        study_col: str = "study",
+        effect_col: str = "effect",
+        ci_lower_col: str = "ci_lower",
+        ci_upper_col: str = "ci_upper",
+        weight_col: Optional[str] = None,
+        null_value: float = 0.0,
+        effect_label: str = "Effect Size",
+        figsize: Tuple[float, float] = (8, 6),
+    ) -> Any:
+        """Create a forest plot showing effect sizes with confidence intervals.
+
+        Args:
+            forest_df: DataFrame with effect sizes and confidence intervals.
+            variant_id: Variant identifier for plot title.
+            study_col: Column name for study/phenotype names.
+            effect_col: Column name for effect sizes.
+            ci_lower_col: Column name for lower confidence interval.
+            ci_upper_col: Column name for upper confidence interval.
+            weight_col: Optional column for study weights (affects marker size).
+            null_value: Reference value for null effect (0 for beta, 1 for OR).
+            effect_label: X-axis label.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_forest(
+            ...     forest_df,
+            ...     variant_id="rs12345",
+            ...     effect_label="Odds Ratio",
+            ...     null_value=1.0,
+            ... )
+        """
+        validate_forest_df(forest_df, study_col, effect_col, ci_lower_col, ci_upper_col)
+
+        df = forest_df.copy()
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (reverse so first study is at top)
+        df["y_pos"] = range(len(df) - 1, -1, -1)
+
+        # Calculate marker sizes from weights
+        if weight_col and weight_col in df.columns:
+            # Scale weights to marker sizes (min 40, max 200)
+            weights = df[weight_col]
+            min_size, max_size = 40, 200
+            weight_range = weights.max() - weights.min()
+            if weight_range > 0:
+                sizes = min_size + (weights - weights.min()) / weight_range * (
+                    max_size - min_size
+                )
+            else:
+                sizes = (min_size + max_size) / 2
+        else:
+            sizes = 80
+
+        # Calculate error bar extents
+        xerr_lower = df[effect_col] - df[ci_lower_col]
+        xerr_upper = df[ci_upper_col] - df[effect_col]
+
+        # Plot error bars (confidence intervals)
+        self._backend.errorbar_h(
+            ax,
+            x=df[effect_col],
+            y=df["y_pos"],
+            xerr_lower=xerr_lower,
+            xerr_upper=xerr_upper,
+            color="black",
+            linewidth=1.5,
+            capsize=3,
+            zorder=2,
+        )
+
+        # Plot effect size markers
+        self._backend.scatter(
+            ax,
+            df[effect_col],
+            df["y_pos"],
+            colors="#4169E1",
+            sizes=sizes,
+            marker="s",  # square markers typical for forest plots
+            edgecolor="black",
+            linewidth=0.5,
+            zorder=3,
+        )
+
+        # Add null effect line
+        self._backend.axvline(
+            ax, x=null_value, color="grey", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, effect_label)
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Ensure x-axis includes the null value with some padding
+        x_min = min(df[ci_lower_col].min(), null_value)
+        x_max = max(df[ci_upper_col].max(), null_value)
+        x_padding = (x_max - x_min) * 0.1
+        self._backend.set_xlim(ax, x_min - x_padding, x_max + x_padding)
+
+        # Set y-tick labels to study names
+        self._backend.set_yticks(
+            ax,
+            positions=df["y_pos"].tolist(),
+            labels=df[study_col].tolist(),
+            fontsize=10,
+        )
+
+        self._backend.set_title(ax, f"Forest Plot: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
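A usage sketch for the new plotter. The column names match the documented defaults; the DataFrames and the import path are illustrative assumptions, since this diff shows neither the new file's path nor what the package exports:

```python
import pandas as pd

# Import path is an assumption; the diff does not show where StatsPlotter lives.
from pylocuszoom import StatsPlotter

phewas_df = pd.DataFrame({
    "phenotype": ["Height", "BMI", "LDL", "HDL"],
    "p_value": [1e-12, 3e-4, 2e-9, 0.2],
    "category": ["anthropometric", "anthropometric", "lipids", "lipids"],
    "beta": [0.1, -0.05, 0.2, -0.01],  # sign selects ^ vs v markers
})

forest_df = pd.DataFrame({
    "study": ["Cohort A", "Cohort B", "Meta"],
    "effect": [1.25, 1.10, 1.18],
    "ci_lower": [1.05, 0.95, 1.08],
    "ci_upper": [1.48, 1.27, 1.29],
    "weight": [40.0, 35.0, 100.0],  # scales marker size between 40 and 200
})

plotter = StatsPlotter(backend="matplotlib")
fig1 = plotter.plot_phewas(phewas_df, variant_id="rs12345", effect_col="beta")
fig2 = plotter.plot_forest(
    forest_df,
    variant_id="rs12345",
    effect_label="Odds Ratio",
    null_value=1.0,  # OR scale, so the null line sits at 1
    weight_col="weight",
)
fig1.savefig("phewas.png", dpi=150)
fig2.savefig("forest.png", dpi=150)
```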
pylocuszoom/utils.py CHANGED
@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import pandas as pd
 
+from .exceptions import ValidationError
+
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame as SparkDataFrame
 
@@ -15,10 +17,6 @@ if TYPE_CHECKING:
 DataFrameLike = Union[pd.DataFrame, "SparkDataFrame", Any]
 
 
-class ValidationError(ValueError):
-    """Raised when input validation fails."""
-
-
 def is_spark_dataframe(df: Any) -> bool:
     """Check if object is a PySpark DataFrame.
 
pylocuszoom/validation.py CHANGED
@@ -159,6 +159,57 @@ class DataFrameValidator:
 
         return self
 
+    def require_ci_ordering(
+        self,
+        ci_lower_col: str,
+        effect_col: str,
+        ci_upper_col: str,
+    ) -> "DataFrameValidator":
+        """Check that confidence intervals are properly ordered.
+
+        Validates that ci_lower <= effect <= ci_upper for all rows.
+        Invalid ordering would produce negative error bar lengths.
+
+        Args:
+            ci_lower_col: Column name for lower CI bound.
+            effect_col: Column name for effect size (point estimate).
+            ci_upper_col: Column name for upper CI bound.
+
+        Returns:
+            Self for method chaining.
+        """
+        # Skip if any column is missing
+        for col in [ci_lower_col, effect_col, ci_upper_col]:
+            if col not in self._df.columns:
+                return self
+
+        lower = self._df[ci_lower_col]
+        effect = self._df[effect_col]
+        upper = self._df[ci_upper_col]
+
+        # Check ci_lower <= effect
+        lower_gt_effect = (lower > effect).sum()
+        if lower_gt_effect > 0:
+            self._errors.append(
+                f"{lower_gt_effect} rows have {ci_lower_col} > {effect_col}"
+            )
+
+        # Check effect <= ci_upper
+        effect_gt_upper = (effect > upper).sum()
+        if effect_gt_upper > 0:
+            self._errors.append(
+                f"{effect_gt_upper} rows have {effect_col} > {ci_upper_col}"
+            )
+
+        # Check ci_lower <= ci_upper (implicit from above, but explicit is clearer)
+        lower_gt_upper = (lower > upper).sum()
+        if lower_gt_upper > 0:
+            self._errors.append(
+                f"{lower_gt_upper} rows have {ci_lower_col} > {ci_upper_col}"
+            )
+
+        return self
+
     def validate(self) -> None:
         """Raise ValidationError if any validation rules failed.
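A sketch of how `require_ci_ordering` slots into the fluent validator API. The `DataFrameValidator(df)` constructor is assumed from the method's use of `self._df` and `self._errors`; only `require_ci_ordering` and the start of `validate()` appear in this diff:

```python
import pandas as pd

# Constructor signature is an assumption based on self._df / self._errors usage.
from pylocuszoom.validation import DataFrameValidator

df = pd.DataFrame({
    "study": ["A", "B"],
    "effect": [0.5, -0.2],
    "ci_lower": [0.3, -0.4],
    "ci_upper": [0.2, 0.1],  # row 0 is mis-ordered: ci_upper < effect
})

validator = DataFrameValidator(df).require_ci_ordering(
    "ci_lower", "effect", "ci_upper"
)
validator.validate()  # raises ValidationError listing the mis-ordered rows
```

Accumulating row counts into `self._errors` and deferring the raise to `validate()` lets one exception report every CI-ordering problem at once instead of failing on the first.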