pylocuszoom 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +23 -2
- pylocuszoom/backends/base.py +86 -0
- pylocuszoom/backends/bokeh_backend.py +116 -20
- pylocuszoom/backends/matplotlib_backend.py +69 -0
- pylocuszoom/backends/plotly_backend.py +115 -23
- pylocuszoom/colors.py +41 -0
- pylocuszoom/forest.py +37 -0
- pylocuszoom/loaders.py +35 -17
- pylocuszoom/phewas.py +35 -0
- pylocuszoom/plotter.py +258 -4
- pylocuszoom/recombination.py +45 -31
- pylocuszoom/schemas.py +37 -26
- {pylocuszoom-0.5.0.dist-info → pylocuszoom-0.6.0.dist-info}/METADATA +53 -5
- pylocuszoom-0.6.0.dist-info/RECORD +26 -0
- pylocuszoom-0.5.0.dist-info/RECORD +0 -24
- {pylocuszoom-0.5.0.dist-info → pylocuszoom-0.6.0.dist-info}/WHEEL +0 -0
- {pylocuszoom-0.5.0.dist-info → pylocuszoom-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/loaders.py
CHANGED
|
@@ -260,10 +260,14 @@ def load_saige(
|
|
|
260
260
|
"POS": pos_col,
|
|
261
261
|
"MarkerID": rs_col,
|
|
262
262
|
"CHR": "chr",
|
|
263
|
-
"p.value": p_col,
|
|
264
|
-
"p.value.NA": p_col, # SPA-adjusted
|
|
265
263
|
}
|
|
266
264
|
|
|
265
|
+
# Prefer SPA-adjusted p-value (p.value.NA) over raw p.value when both present
|
|
266
|
+
if "p.value.NA" in df.columns:
|
|
267
|
+
col_map["p.value.NA"] = p_col
|
|
268
|
+
elif "p.value" in df.columns:
|
|
269
|
+
col_map["p.value"] = p_col
|
|
270
|
+
|
|
267
271
|
df = df.rename(columns=col_map)
|
|
268
272
|
logger.debug(f"Loaded SAIGE file with {len(df)} variants")
|
|
269
273
|
validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
|
|
@@ -318,7 +322,7 @@ def load_gtex_eqtl(
|
|
|
318
322
|
gene: Optional gene to filter to (ENSG ID or gene symbol).
|
|
319
323
|
|
|
320
324
|
Returns:
|
|
321
|
-
DataFrame with columns: pos, p_value, gene,
|
|
325
|
+
DataFrame with columns: pos, p_value, gene, effect_size.
|
|
322
326
|
|
|
323
327
|
Example:
|
|
324
328
|
>>> eqtl_df = load_gtex_eqtl("GTEx_Analysis.signif_pairs.txt.gz", gene="BRCA1")
|
|
@@ -351,10 +355,10 @@ def load_gtex_eqtl(
|
|
|
351
355
|
col_map[col] = "gene"
|
|
352
356
|
break
|
|
353
357
|
|
|
354
|
-
# Effect size (slope)
|
|
358
|
+
# Effect size (slope) - standardize to effect_size for plotting compatibility
|
|
355
359
|
for col in ["slope", "beta", "effect_size"]:
|
|
356
360
|
if col in df.columns:
|
|
357
|
-
col_map[col] = "
|
|
361
|
+
col_map[col] = "effect_size"
|
|
358
362
|
break
|
|
359
363
|
|
|
360
364
|
df = df.rename(columns=col_map)
|
|
@@ -385,7 +389,7 @@ def load_eqtl_catalogue(
|
|
|
385
389
|
gene: Optional gene to filter to.
|
|
386
390
|
|
|
387
391
|
Returns:
|
|
388
|
-
DataFrame with columns: pos, p_value, gene,
|
|
392
|
+
DataFrame with columns: pos, p_value, gene, effect_size.
|
|
389
393
|
"""
|
|
390
394
|
df = pd.read_csv(filepath, sep="\t")
|
|
391
395
|
|
|
@@ -393,7 +397,7 @@ def load_eqtl_catalogue(
|
|
|
393
397
|
"position": "pos",
|
|
394
398
|
"pvalue": "p_value",
|
|
395
399
|
"gene_id": "gene",
|
|
396
|
-
"beta": "
|
|
400
|
+
"beta": "effect_size", # Standardize to effect_size for plotter
|
|
397
401
|
"chromosome": "chr",
|
|
398
402
|
}
|
|
399
403
|
|
|
@@ -422,7 +426,7 @@ def load_matrixeqtl(
|
|
|
422
426
|
gene: Optional gene to filter to.
|
|
423
427
|
|
|
424
428
|
Returns:
|
|
425
|
-
DataFrame with columns: pos, p_value, gene,
|
|
429
|
+
DataFrame with columns: pos, p_value, gene, effect_size.
|
|
426
430
|
|
|
427
431
|
Note:
|
|
428
432
|
MatrixEQTL output doesn't include position by default.
|
|
@@ -435,7 +439,7 @@ def load_matrixeqtl(
|
|
|
435
439
|
"gene": "gene",
|
|
436
440
|
"p-value": "p_value",
|
|
437
441
|
"pvalue": "p_value",
|
|
438
|
-
"beta": "
|
|
442
|
+
"beta": "effect_size", # Standardize to effect_size for plotter
|
|
439
443
|
"t-stat": "t_stat",
|
|
440
444
|
}
|
|
441
445
|
|
|
@@ -725,14 +729,28 @@ def load_bed(
|
|
|
725
729
|
# Assign column names if no header
|
|
726
730
|
if not has_header:
|
|
727
731
|
n_cols = len(df.columns)
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
732
|
+
# Standard BED column names (up to BED12)
|
|
733
|
+
bed_col_names = [
|
|
734
|
+
"chr",
|
|
735
|
+
"start",
|
|
736
|
+
"end",
|
|
737
|
+
"gene_name",
|
|
738
|
+
"score",
|
|
739
|
+
"strand",
|
|
740
|
+
"thickStart",
|
|
741
|
+
"thickEnd",
|
|
742
|
+
"itemRgb",
|
|
743
|
+
"blockCount",
|
|
744
|
+
"blockSizes",
|
|
745
|
+
"blockStarts",
|
|
746
|
+
]
|
|
747
|
+
# Use standard names for known columns, generic for extras
|
|
748
|
+
if n_cols <= len(bed_col_names):
|
|
749
|
+
df.columns = bed_col_names[:n_cols]
|
|
750
|
+
else:
|
|
751
|
+
# More columns than BED12 - use known names + generic
|
|
752
|
+
extra_cols = [f"col{i}" for i in range(len(bed_col_names), n_cols)]
|
|
753
|
+
df.columns = bed_col_names + extra_cols
|
|
736
754
|
|
|
737
755
|
# Standardize column names if header was present
|
|
738
756
|
col_map = {
|
pylocuszoom/phewas.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""PheWAS data validation and preparation.
|
|
2
|
+
|
|
3
|
+
Validates and prepares phenome-wide association study data for plotting.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from .utils import ValidationError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def validate_phewas_df(
    df: pd.DataFrame,
    phenotype_col: str = "phenotype",
    p_col: str = "p_value",
    category_col: str = "category",
) -> None:
    """Check that a PheWAS DataFrame carries the columns needed for plotting.

    Only the phenotype and p-value columns are mandatory. ``category_col``
    is accepted for signature symmetry with the plotting call but is not
    checked here, so a frame without a category column passes validation.

    Args:
        df: PheWAS results DataFrame.
        phenotype_col: Name of the column holding phenotype names.
        p_col: Name of the column holding p-values.
        category_col: Name of the (optional) phenotype category column.

    Raises:
        ValidationError: If the phenotype or p-value column is absent.
    """
    required = [phenotype_col, p_col]
    available = df.columns
    missing = [name for name in required if name not in available]

    # Guard clause: nothing to report when every required column is present.
    if not missing:
        return

    raise ValidationError(
        f"PheWAS DataFrame missing required columns: {missing}. "
        f"Required: {required}. Found: {list(df.columns)}"
    )
|
pylocuszoom/plotter.py
CHANGED
|
@@ -31,12 +31,14 @@ from .colors import (
|
|
|
31
31
|
get_eqtl_color,
|
|
32
32
|
get_ld_bin,
|
|
33
33
|
get_ld_color_palette,
|
|
34
|
+
get_phewas_category_palette,
|
|
34
35
|
)
|
|
35
36
|
from .eqtl import validate_eqtl_df
|
|
36
37
|
from .finemapping import (
|
|
37
38
|
get_credible_sets,
|
|
38
39
|
prepare_finemapping_for_plotting,
|
|
39
40
|
)
|
|
41
|
+
from .forest import validate_forest_df
|
|
40
42
|
from .gene_track import (
|
|
41
43
|
assign_gene_positions,
|
|
42
44
|
plot_gene_track,
|
|
@@ -45,6 +47,7 @@ from .gene_track import (
|
|
|
45
47
|
from .labels import add_snp_labels
|
|
46
48
|
from .ld import calculate_ld, find_plink
|
|
47
49
|
from .logging import enable_logging, logger
|
|
50
|
+
from .phewas import validate_phewas_df
|
|
48
51
|
from .recombination import (
|
|
49
52
|
RECOMB_COLOR,
|
|
50
53
|
add_recombination_overlay,
|
|
@@ -1030,11 +1033,17 @@ class LocusZoomPlotter:
|
|
|
1030
1033
|
if eqtl_gene and "gene" in eqtl_data.columns:
|
|
1031
1034
|
eqtl_data = eqtl_data[eqtl_data["gene"] == eqtl_gene]
|
|
1032
1035
|
|
|
1033
|
-
# Filter by region
|
|
1036
|
+
# Filter by region (position and chromosome)
|
|
1034
1037
|
if "pos" in eqtl_data.columns:
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
+
mask = (eqtl_data["pos"] >= start) & (eqtl_data["pos"] <= end)
|
|
1039
|
+
# Also filter by chromosome if column exists
|
|
1040
|
+
if "chr" in eqtl_data.columns:
|
|
1041
|
+
chrom_str = str(chrom).replace("chr", "")
|
|
1042
|
+
eqtl_chrom = (
|
|
1043
|
+
eqtl_data["chr"].astype(str).str.replace("chr", "", regex=False)
|
|
1044
|
+
)
|
|
1045
|
+
mask = mask & (eqtl_chrom == chrom_str)
|
|
1046
|
+
eqtl_data = eqtl_data[mask]
|
|
1038
1047
|
|
|
1039
1048
|
if not eqtl_data.empty:
|
|
1040
1049
|
eqtl_data["neglog10p"] = -np.log10(
|
|
@@ -1155,3 +1164,248 @@ class LocusZoomPlotter:
|
|
|
1155
1164
|
self._backend.finalize_layout(fig, hspace=0.1)
|
|
1156
1165
|
|
|
1157
1166
|
return fig
|
|
1167
|
+
|
|
1168
|
+
def plot_phewas(
    self,
    phewas_df: pd.DataFrame,
    variant_id: str,
    phenotype_col: str = "phenotype",
    p_col: str = "p_value",
    category_col: str = "category",
    effect_col: Optional[str] = None,
    significance_threshold: float = 5e-8,
    figsize: Tuple[float, float] = (10, 8),
) -> Any:
    """Create a PheWAS (Phenome-Wide Association Study) plot.

    Shows associations of a single variant across multiple phenotypes:
    one row per phenotype, with -log10(p) on the x-axis. When the category
    column is present, rows are sorted and colored by category; otherwise
    rows are sorted by p-value and drawn in a single color.

    Args:
        phewas_df: DataFrame with phenotype associations. It is copied,
            so the caller's frame is not modified.
        variant_id: Variant identifier (e.g., "rs12345") for plot title.
        phenotype_col: Column name for phenotype names.
        p_col: Column name for p-values.
        category_col: Column name for phenotype categories. The column
            itself is optional.
        effect_col: Optional column name for effect direction (beta/OR).
            When given and present, rows with a value >= 0 are drawn as
            upward triangles and all other rows (including missing
            values) as downward triangles. This applies whether or not a
            category column is available.
        significance_threshold: P-value threshold for significance line.
        figsize: Figure size as (width, height).

    Returns:
        Figure object (type depends on backend).

    Raises:
        Propagates whatever ``validate_phewas_df`` raises when required
        columns are missing.

    Example:
        >>> fig = plotter.plot_phewas(
        ...     phewas_df,
        ...     variant_id="rs12345",
        ...     category_col="category",
        ... )
    """
    validate_phewas_df(phewas_df, phenotype_col, p_col, category_col)

    df = phewas_df.copy()
    # Clip before the log so that p == 0 maps to 300 instead of infinity.
    df["neglog10p"] = -np.log10(df[p_col].clip(lower=1e-300))

    # Sort by category then by p-value for consistent ordering
    if category_col in df.columns:
        df = df.sort_values([category_col, p_col])
        categories = df[category_col].unique().tolist()
        palette = get_phewas_category_palette(categories)
    else:
        df = df.sort_values(p_col)
        categories = []
        palette = {}

    # Create figure
    fig, axes = self._backend.create_figure(
        n_panels=1,
        height_ratios=[1.0],
        figsize=figsize,
    )
    ax = axes[0]

    # Assign y-positions (one per phenotype), following the sorted order
    df["y_pos"] = range(len(df))

    def _scatter(points: pd.DataFrame, color: Any, marker: Optional[str]) -> None:
        """Draw one group of points; omit ``marker`` to use the backend default."""
        marker_kwargs = {} if marker is None else {"marker": marker}
        self._backend.scatter(
            ax,
            points["neglog10p"],
            points["y_pos"],
            colors=color,
            sizes=60,
            edgecolor="black",
            linewidth=0.5,
            zorder=2,
            **marker_kwargs,
        )

    # One (rows, color, fallback marker) entry per category, or a single
    # entry covering every row when no categories are available.
    if categories:
        groups = [
            (df[df[category_col] == cat], palette[cat], "o") for cat in categories
        ]
    else:
        groups = [(df, "#4169E1", None)]

    show_direction = bool(effect_col) and effect_col in df.columns

    for points, color, fallback_marker in groups:
        if not show_direction:
            _scatter(points, color, fallback_marker)
            continue

        # Upward triangles for effects >= 0, downward triangles otherwise.
        # Each direction is drawn with a single backend call instead of one
        # call per row.
        is_positive = points[effect_col] >= 0
        for subset, marker in (
            (points[is_positive], "^"),
            (points[~is_positive], "v"),
        ):
            if not subset.empty:
                _scatter(subset, color, marker)

    # Add significance threshold line
    sig_line = -np.log10(significance_threshold)
    self._backend.axvline(
        ax, x=sig_line, color="red", linestyle="--", linewidth=1, alpha=0.7
    )

    # Set axis labels and limits
    self._backend.set_xlabel(ax, r"$-\log_{10}$ P")
    self._backend.set_ylabel(ax, "Phenotype")
    self._backend.set_ylim(ax, -0.5, len(df) - 0.5)

    # Set y-tick labels to phenotype names (matplotlib only)
    if self.backend_name == "matplotlib":
        ax.set_yticks(df["y_pos"])
        ax.set_yticklabels(df[phenotype_col], fontsize=8)

    self._backend.set_title(ax, f"PheWAS: {variant_id}")
    self._backend.hide_spines(ax, ["top", "right"])
    self._backend.finalize_layout(fig)

    return fig
|
|
1294
|
+
|
|
1295
|
+
def plot_forest(
    self,
    forest_df: pd.DataFrame,
    variant_id: str,
    study_col: str = "study",
    effect_col: str = "effect",
    ci_lower_col: str = "ci_lower",
    ci_upper_col: str = "ci_upper",
    weight_col: Optional[str] = None,
    null_value: float = 0.0,
    effect_label: str = "Effect Size",
    figsize: Tuple[float, float] = (8, 6),
) -> Any:
    """Draw a forest plot: one row per study with its effect and interval.

    Rows keep the order of ``forest_df``, with the first row placed at the
    top. Each row shows a horizontal confidence-interval bar and a square
    marker at the effect estimate; a dashed vertical line marks the null
    effect.

    Args:
        forest_df: DataFrame with effect sizes and confidence intervals.
            It is copied, so the caller's frame is not modified.
        variant_id: Variant identifier used in the plot title.
        study_col: Column name for study/phenotype names.
        effect_col: Column name for effect sizes.
        ci_lower_col: Column name for the lower confidence bound.
        ci_upper_col: Column name for the upper confidence bound.
        weight_col: Optional column of study weights. When present, marker
            sizes are scaled linearly between 40 and 200; if all weights
            are equal a single mid-range size is used.
        null_value: Reference value for null effect (0 for beta, 1 for OR).
        effect_label: X-axis label.
        figsize: Figure size as (width, height).

    Returns:
        Figure object (type depends on backend).

    Example:
        >>> fig = plotter.plot_forest(
        ...     forest_df,
        ...     variant_id="rs12345",
        ...     effect_label="Odds Ratio",
        ...     null_value=1.0,
        ... )
    """
    validate_forest_df(forest_df, study_col, effect_col, ci_lower_col, ci_upper_col)

    data = forest_df.copy()
    n_rows = len(data)
    backend = self._backend

    # Single-panel figure
    fig, axes = backend.create_figure(
        n_panels=1,
        height_ratios=[1.0],
        figsize=figsize,
    )
    panel = axes[0]

    # Count down so that the first study ends up on the top row
    data["y_pos"] = range(n_rows - 1, -1, -1)

    # Marker sizes: fixed by default, weight-scaled when weights are given
    marker_sizes = 80
    if weight_col and weight_col in data.columns:
        smallest, largest = 40, 200
        weights = data[weight_col]
        spread = weights.max() - weights.min()
        if spread > 0:
            marker_sizes = smallest + (weights - weights.min()) / spread * (
                largest - smallest
            )
        else:
            # No variation in the weights: use the midpoint for every study
            marker_sizes = (smallest + largest) / 2

    effects = data[effect_col]
    rows = data["y_pos"]

    # Confidence intervals, expressed as distances from the estimate
    backend.errorbar_h(
        panel,
        x=effects,
        y=rows,
        xerr_lower=effects - data[ci_lower_col],
        xerr_upper=data[ci_upper_col] - effects,
        color="black",
        linewidth=1.5,
        capsize=3,
        zorder=2,
    )

    # Effect estimates, drawn above the interval bars
    backend.scatter(
        panel,
        effects,
        rows,
        colors="#4169E1",
        sizes=marker_sizes,
        marker="s",  # square markers typical for forest plots
        edgecolor="black",
        linewidth=0.5,
        zorder=3,
    )

    # Reference line at the null effect
    backend.axvline(
        panel, x=null_value, color="grey", linestyle="--", linewidth=1, alpha=0.7
    )

    backend.set_xlabel(panel, effect_label)
    backend.set_ylim(panel, -0.5, n_rows - 0.5)

    # Study names as y-tick labels (matplotlib only)
    if self.backend_name == "matplotlib":
        panel.set_yticks(rows)
        panel.set_yticklabels(data[study_col], fontsize=10)

    backend.set_title(panel, f"Forest Plot: {variant_id}")
    backend.hide_spines(panel, ["top", "right"])
    backend.finalize_layout(fig)

    return fig
|
pylocuszoom/recombination.py
CHANGED
|
@@ -9,12 +9,13 @@ Provides:
|
|
|
9
9
|
import os
|
|
10
10
|
import tarfile
|
|
11
11
|
import tempfile
|
|
12
|
-
import urllib.request
|
|
13
12
|
from pathlib import Path
|
|
14
13
|
from typing import Optional
|
|
15
14
|
|
|
16
15
|
import pandas as pd
|
|
16
|
+
import requests
|
|
17
17
|
from matplotlib.axes import Axes
|
|
18
|
+
from tqdm import tqdm
|
|
18
19
|
|
|
19
20
|
from .logging import logger
|
|
20
21
|
|
|
@@ -54,6 +55,38 @@ def get_chain_file_path() -> Path:
|
|
|
54
55
|
return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"
|
|
55
56
|
|
|
56
57
|
|
|
58
|
+
def _download_with_progress(
    url: str, dest_path: Path, desc: str = "Downloading"
) -> None:
    """Download a file with a progress bar.

    The body is streamed in 8 KiB chunks into a temporary ``.part`` file
    next to ``dest_path`` and only renamed to ``dest_path`` once the whole
    response has been written. An interrupted or failed download therefore
    never leaves a truncated file at ``dest_path``.

    Args:
        url: URL to download from.
        dest_path: Destination file path. Its parent directory must exist.
        desc: Description for the progress bar.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
        Any other exception raised while requesting or writing is
        propagated after the partial file has been removed.
    """
    dest_path = Path(dest_path)
    part_path = dest_path.with_name(dest_path.name + ".part")

    try:
        # Use the response as a context manager so the streamed connection
        # is released even when writing fails part-way through.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            with (
                open(part_path, "wb") as f,
                tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=desc,
                    disable=total_size == 0,  # Disable if size unknown
                ) as pbar,
            ):
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Publish the finished file in one step.
        part_path.replace(dest_path)
    except BaseException:
        # Clean up on any failure (including Ctrl-C), then re-raise unchanged.
        part_path.unlink(missing_ok=True)
        raise
|
|
88
|
+
|
|
89
|
+
|
|
57
90
|
def download_liftover_chain(force: bool = False) -> Path:
|
|
58
91
|
"""Download the CanFam3 to CanFam4 liftover chain file.
|
|
59
92
|
|
|
@@ -73,20 +106,11 @@ def download_liftover_chain(force: bool = False) -> Path:
|
|
|
73
106
|
logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
|
|
74
107
|
logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")
|
|
75
108
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
import requests
|
|
82
|
-
|
|
83
|
-
response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
|
|
84
|
-
response.raise_for_status()
|
|
85
|
-
chain_path.write_bytes(response.content)
|
|
86
|
-
except ImportError:
|
|
87
|
-
raise RuntimeError(
|
|
88
|
-
"Failed to download. Install requests: pip install requests"
|
|
89
|
-
)
|
|
109
|
+
_download_with_progress(
|
|
110
|
+
CANFAM3_TO_CANFAM4_CHAIN_URL,
|
|
111
|
+
chain_path,
|
|
112
|
+
desc="Liftover chain",
|
|
113
|
+
)
|
|
90
114
|
|
|
91
115
|
logger.info(f"Chain file saved to: {chain_path}")
|
|
92
116
|
return chain_path
|
|
@@ -217,24 +241,14 @@ def download_canine_recombination_maps(
|
|
|
217
241
|
logger.debug(f"Source: {CANINE_RECOMB_URL}")
|
|
218
242
|
|
|
219
243
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
220
|
-
# Download tar.gz file
|
|
244
|
+
# Download tar.gz file with progress bar
|
|
221
245
|
tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"
|
|
222
246
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
try:
|
|
229
|
-
import requests
|
|
230
|
-
|
|
231
|
-
response = requests.get(CANINE_RECOMB_URL, timeout=60)
|
|
232
|
-
response.raise_for_status()
|
|
233
|
-
tar_path.write_bytes(response.content)
|
|
234
|
-
except ImportError:
|
|
235
|
-
raise RuntimeError(
|
|
236
|
-
"Failed to download. Install requests: pip install requests"
|
|
237
|
-
)
|
|
247
|
+
_download_with_progress(
|
|
248
|
+
CANINE_RECOMB_URL,
|
|
249
|
+
tar_path,
|
|
250
|
+
desc="Recombination maps",
|
|
251
|
+
)
|
|
238
252
|
|
|
239
253
|
logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")
|
|
240
254
|
|
pylocuszoom/schemas.py
CHANGED
|
@@ -84,30 +84,36 @@ def validate_gwas_dataframe(
|
|
|
84
84
|
"GWAS validation failed:\n - " + "\n - ".join(errors)
|
|
85
85
|
)
|
|
86
86
|
|
|
87
|
-
# Check data types
|
|
88
|
-
|
|
87
|
+
# Check data types (must be numeric for range checks)
|
|
88
|
+
pos_is_numeric = pd.api.types.is_numeric_dtype(df[pos_col])
|
|
89
|
+
p_is_numeric = pd.api.types.is_numeric_dtype(df[p_col])
|
|
90
|
+
|
|
91
|
+
if not pos_is_numeric:
|
|
89
92
|
errors.append(f"Column '{pos_col}' must be numeric, got {df[pos_col].dtype}")
|
|
90
93
|
|
|
91
|
-
if not
|
|
94
|
+
if not p_is_numeric:
|
|
92
95
|
errors.append(f"Column '{p_col}' must be numeric, got {df[p_col].dtype}")
|
|
93
96
|
|
|
94
|
-
#
|
|
95
|
-
if
|
|
96
|
-
|
|
97
|
-
|
|
97
|
+
# Only check value ranges if columns are numeric (avoid confusing errors)
|
|
98
|
+
if pos_is_numeric:
|
|
99
|
+
if (df[pos_col] <= 0).any():
|
|
100
|
+
n_invalid = (df[pos_col] <= 0).sum()
|
|
101
|
+
errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
|
|
98
102
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
103
|
+
if df[pos_col].isna().any():
|
|
104
|
+
n_na = df[pos_col].isna().sum()
|
|
105
|
+
errors.append(f"Column '{pos_col}' has {n_na} missing values")
|
|
102
106
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
+
if p_is_numeric:
|
|
108
|
+
if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
|
|
109
|
+
n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
|
|
110
|
+
errors.append(
|
|
111
|
+
f"Column '{p_col}' has {n_invalid} values outside range (0, 1]"
|
|
112
|
+
)
|
|
107
113
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
114
|
+
if df[p_col].isna().any():
|
|
115
|
+
n_na = df[p_col].isna().sum()
|
|
116
|
+
errors.append(f"Column '{p_col}' has {n_na} missing values")
|
|
111
117
|
|
|
112
118
|
if errors:
|
|
113
119
|
raise LoaderValidationError(
|
|
@@ -344,20 +350,25 @@ def validate_genes_dataframe(
|
|
|
344
350
|
)
|
|
345
351
|
|
|
346
352
|
# Check data types
|
|
347
|
-
|
|
353
|
+
start_is_numeric = pd.api.types.is_numeric_dtype(df["start"])
|
|
354
|
+
end_is_numeric = pd.api.types.is_numeric_dtype(df["end"])
|
|
355
|
+
|
|
356
|
+
if not start_is_numeric:
|
|
348
357
|
errors.append(f"Column 'start' must be numeric, got {df['start'].dtype}")
|
|
349
358
|
|
|
350
|
-
if not
|
|
359
|
+
if not end_is_numeric:
|
|
351
360
|
errors.append(f"Column 'end' must be numeric, got {df['end'].dtype}")
|
|
352
361
|
|
|
353
|
-
#
|
|
354
|
-
if
|
|
355
|
-
|
|
356
|
-
|
|
362
|
+
# Only check ranges if columns are numeric (avoid confusing errors)
|
|
363
|
+
if start_is_numeric:
|
|
364
|
+
if (df["start"] < 0).any():
|
|
365
|
+
n_invalid = (df["start"] < 0).sum()
|
|
366
|
+
errors.append(f"Column 'start' has {n_invalid} negative values")
|
|
357
367
|
|
|
358
|
-
if
|
|
359
|
-
|
|
360
|
-
|
|
368
|
+
if start_is_numeric and end_is_numeric:
|
|
369
|
+
if (df["end"] < df["start"]).any():
|
|
370
|
+
n_invalid = (df["end"] < df["start"]).sum()
|
|
371
|
+
errors.append(f"Found {n_invalid} genes where end < start")
|
|
361
372
|
|
|
362
373
|
if errors:
|
|
363
374
|
raise LoaderValidationError(
|