pylocuszoom 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

pylocuszoom/loaders.py CHANGED
@@ -260,10 +260,14 @@ def load_saige(
         "POS": pos_col,
         "MarkerID": rs_col,
         "CHR": "chr",
-        "p.value": p_col,
-        "p.value.NA": p_col,  # SPA-adjusted
     }
 
+    # Prefer SPA-adjusted p-value (p.value.NA) over raw p.value when both present
+    if "p.value.NA" in df.columns:
+        col_map["p.value.NA"] = p_col
+    elif "p.value" in df.columns:
+        col_map["p.value"] = p_col
+
     df = df.rename(columns=col_map)
     logger.debug(f"Loaded SAIGE file with {len(df)} variants")
     validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
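In 0.5.0 the static mapping renamed both `p.value` and `p.value.NA` to the same target, producing duplicate column names whenever a SAIGE file contained both. A minimal sketch of the 0.6.0 selection logic on toy data (made-up values; the target name `p_value` is assumed for `p_col`):

```python
import pandas as pd

# Toy SAIGE-style output with both raw and SPA-adjusted p-values (made-up values)
df = pd.DataFrame({
    "POS": [101, 202],
    "p.value": [0.04, 0.20],
    "p.value.NA": [0.05, 0.22],  # SPA-adjusted
})

col_map = {"POS": "pos"}
# Mirrors the new logic: prefer the SPA-adjusted column when present
if "p.value.NA" in df.columns:
    col_map["p.value.NA"] = "p_value"
elif "p.value" in df.columns:
    col_map["p.value"] = "p_value"

df = df.rename(columns=col_map)
print(df.columns.tolist())  # ['pos', 'p.value', 'p_value'] - exactly one p_value column
```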
@@ -318,7 +322,7 @@ def load_gtex_eqtl(
         gene: Optional gene to filter to (ENSG ID or gene symbol).
 
     Returns:
-        DataFrame with columns: pos, p_value, gene, effect.
+        DataFrame with columns: pos, p_value, gene, effect_size.
 
     Example:
         >>> eqtl_df = load_gtex_eqtl("GTEx_Analysis.signif_pairs.txt.gz", gene="BRCA1")
@@ -351,10 +355,10 @@ def load_gtex_eqtl(
             col_map[col] = "gene"
             break
 
-    # Effect size (slope)
+    # Effect size (slope) - standardize to effect_size for plotting compatibility
    for col in ["slope", "beta", "effect_size"]:
         if col in df.columns:
-            col_map[col] = "effect"
+            col_map[col] = "effect_size"
             break
 
     df = df.rename(columns=col_map)
@@ -385,7 +389,7 @@ def load_eqtl_catalogue(
         gene: Optional gene to filter to.
 
     Returns:
-        DataFrame with columns: pos, p_value, gene, effect.
+        DataFrame with columns: pos, p_value, gene, effect_size.
     """
     df = pd.read_csv(filepath, sep="\t")
 
@@ -393,7 +397,7 @@
         "position": "pos",
         "pvalue": "p_value",
         "gene_id": "gene",
-        "beta": "effect",
+        "beta": "effect_size",  # Standardize to effect_size for plotter
         "chromosome": "chr",
     }
 
@@ -422,7 +426,7 @@ def load_matrixeqtl(
         gene: Optional gene to filter to.
 
     Returns:
-        DataFrame with columns: pos, p_value, gene, effect.
+        DataFrame with columns: pos, p_value, gene, effect_size.
 
     Note:
         MatrixEQTL output doesn't include position by default.
@@ -435,7 +439,7 @@ def load_matrixeqtl(
         "gene": "gene",
         "p-value": "p_value",
         "pvalue": "p_value",
-        "beta": "effect",
+        "beta": "effect_size",  # Standardize to effect_size for plotter
         "t-stat": "t_stat",
     }
 
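All three eQTL loaders (`load_gtex_eqtl`, `load_eqtl_catalogue`, `load_matrixeqtl`) now emit `effect_size` instead of `effect`, so downstream code reading the old `effect` column needs updating. A toy sketch of the rename for an eQTL Catalogue-style frame (made-up values):

```python
import pandas as pd

raw = pd.DataFrame({
    "position": [5_000_100],
    "pvalue": [3e-9],
    "gene_id": ["ENSG00000139618"],
    "beta": [0.42],
    "chromosome": ["13"],
})
col_map = {
    "position": "pos",
    "pvalue": "p_value",
    "gene_id": "gene",
    "beta": "effect_size",  # was "effect" in 0.5.0
    "chromosome": "chr",
}
print(raw.rename(columns=col_map).columns.tolist())
# ['pos', 'p_value', 'gene', 'effect_size', 'chr']
```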
@@ -725,14 +729,28 @@ def load_bed(
     # Assign column names if no header
     if not has_header:
         n_cols = len(df.columns)
-        col_names = ["chr", "start", "end"]
-        if n_cols >= 4:
-            col_names.append("gene_name")
-        if n_cols >= 5:
-            col_names.append("score")
-        if n_cols >= 6:
-            col_names.append("strand")
-        df.columns = col_names[:n_cols]
+        # Standard BED column names (up to BED12)
+        bed_col_names = [
+            "chr",
+            "start",
+            "end",
+            "gene_name",
+            "score",
+            "strand",
+            "thickStart",
+            "thickEnd",
+            "itemRgb",
+            "blockCount",
+            "blockSizes",
+            "blockStarts",
+        ]
+        # Use standard names for known columns, generic for extras
+        if n_cols <= len(bed_col_names):
+            df.columns = bed_col_names[:n_cols]
+        else:
+            # More columns than BED12 - use known names + generic
+            extra_cols = [f"col{i}" for i in range(len(bed_col_names), n_cols)]
+            df.columns = bed_col_names + extra_cols
 
     # Standardize column names if header was present
     col_map = {
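The new naming covers the full BED12 spec and degrades gracefully past it. A toy check of both branches (hypothetical 13-column frame, i.e. BED12 plus one extra column):

```python
import pandas as pd

bed_col_names = [
    "chr", "start", "end", "gene_name", "score", "strand",
    "thickStart", "thickEnd", "itemRgb", "blockCount",
    "blockSizes", "blockStarts",
]

df = pd.DataFrame([[None] * 13])  # 13 columns: one past BED12
n_cols = len(df.columns)
if n_cols <= len(bed_col_names):
    df.columns = bed_col_names[:n_cols]
else:
    # Columns beyond BED12 get generic names: col12, col13, ...
    extra_cols = [f"col{i}" for i in range(len(bed_col_names), n_cols)]
    df.columns = bed_col_names + extra_cols
print(df.columns.tolist()[-2:])  # ['blockStarts', 'col12']
```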
pylocuszoom/phewas.py ADDED
@@ -0,0 +1,35 @@
+"""PheWAS data validation and preparation.
+
+Validates and prepares phenome-wide association study data for plotting.
+"""
+
+import pandas as pd
+
+from .utils import ValidationError
+
+
+def validate_phewas_df(
+    df: pd.DataFrame,
+    phenotype_col: str = "phenotype",
+    p_col: str = "p_value",
+    category_col: str = "category",
+) -> None:
+    """Validate PheWAS DataFrame has required columns.
+
+    Args:
+        df: PheWAS results DataFrame.
+        phenotype_col: Column name for phenotype names.
+        p_col: Column name for p-values.
+        category_col: Column name for phenotype categories (optional).
+
+    Raises:
+        ValidationError: If required columns are missing.
+    """
+    required = [phenotype_col, p_col]
+    missing = [col for col in required if col not in df.columns]
+
+    if missing:
+        raise ValidationError(
+            f"PheWAS DataFrame missing required columns: {missing}. "
+            f"Required: {required}. Found: {list(df.columns)}"
+        )
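A quick sketch of the validator on toy frames; the import path follows the new module location, and `ValidationError` comes from `pylocuszoom.utils` per the import above:

```python
import pandas as pd
from pylocuszoom.phewas import validate_phewas_df

ok = pd.DataFrame({
    "phenotype": ["Type 2 diabetes"],
    "p_value": [1e-6],
    "category": ["metabolic"],
})
validate_phewas_df(ok)  # passes silently

bad = ok.drop(columns=["p_value"])
try:
    validate_phewas_df(bad)
except Exception as e:  # ValidationError
    print(e)  # names the missing, required, and found columns
```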
pylocuszoom/plotter.py CHANGED
@@ -31,12 +31,14 @@ from .colors import (
     get_eqtl_color,
     get_ld_bin,
     get_ld_color_palette,
+    get_phewas_category_palette,
 )
 from .eqtl import validate_eqtl_df
 from .finemapping import (
     get_credible_sets,
     prepare_finemapping_for_plotting,
 )
+from .forest import validate_forest_df
 from .gene_track import (
     assign_gene_positions,
     plot_gene_track,
@@ -45,6 +47,7 @@ from .gene_track import (
 from .labels import add_snp_labels
 from .ld import calculate_ld, find_plink
 from .logging import enable_logging, logger
+from .phewas import validate_phewas_df
 from .recombination import (
     RECOMB_COLOR,
     add_recombination_overlay,
@@ -1030,11 +1033,17 @@ class LocusZoomPlotter:
         if eqtl_gene and "gene" in eqtl_data.columns:
             eqtl_data = eqtl_data[eqtl_data["gene"] == eqtl_gene]
 
-        # Filter by region
+        # Filter by region (position and chromosome)
         if "pos" in eqtl_data.columns:
-            eqtl_data = eqtl_data[
-                (eqtl_data["pos"] >= start) & (eqtl_data["pos"] <= end)
-            ]
+            mask = (eqtl_data["pos"] >= start) & (eqtl_data["pos"] <= end)
+            # Also filter by chromosome if column exists
+            if "chr" in eqtl_data.columns:
+                chrom_str = str(chrom).replace("chr", "")
+                eqtl_chrom = (
+                    eqtl_data["chr"].astype(str).str.replace("chr", "", regex=False)
+                )
+                mask = mask & (eqtl_chrom == chrom_str)
+            eqtl_data = eqtl_data[mask]
 
         if not eqtl_data.empty:
             eqtl_data["neglog10p"] = -np.log10(
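The chromosome filter strips any `chr` prefix from both sides before comparing, so `chr13`, `13`, and an integer `chrom` argument all match. A toy sketch of the mask construction (made-up rows):

```python
import pandas as pd

eqtl_data = pd.DataFrame({"pos": [100, 200, 300], "chr": ["chr13", "13", "chr1"]})
chrom, start, end = 13, 50, 250

mask = (eqtl_data["pos"] >= start) & (eqtl_data["pos"] <= end)
# Normalize "chr13" and "13" to the same token before comparing
chrom_str = str(chrom).replace("chr", "")
eqtl_chrom = eqtl_data["chr"].astype(str).str.replace("chr", "", regex=False)
mask &= eqtl_chrom == chrom_str
print(eqtl_data[mask])  # keeps the two chromosome-13 rows inside [50, 250]
```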
@@ -1155,3 +1164,248 @@
         self._backend.finalize_layout(fig, hspace=0.1)
 
         return fig
+
+    def plot_phewas(
+        self,
+        phewas_df: pd.DataFrame,
+        variant_id: str,
+        phenotype_col: str = "phenotype",
+        p_col: str = "p_value",
+        category_col: str = "category",
+        effect_col: Optional[str] = None,
+        significance_threshold: float = 5e-8,
+        figsize: Tuple[float, float] = (10, 8),
+    ) -> Any:
+        """Create a PheWAS (Phenome-Wide Association Study) plot.
+
+        Shows associations of a single variant across multiple phenotypes,
+        with phenotypes grouped by category and colored accordingly.
+
+        Args:
+            phewas_df: DataFrame with phenotype associations.
+            variant_id: Variant identifier (e.g., "rs12345") for plot title.
+            phenotype_col: Column name for phenotype names.
+            p_col: Column name for p-values.
+            category_col: Column name for phenotype categories.
+            effect_col: Optional column name for effect direction (beta/OR).
+            significance_threshold: P-value threshold for significance line.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_phewas(
+            ...     phewas_df,
+            ...     variant_id="rs12345",
+            ...     category_col="category",
+            ... )
+        """
+        validate_phewas_df(phewas_df, phenotype_col, p_col, category_col)
+
+        df = phewas_df.copy()
+        df["neglog10p"] = -np.log10(df[p_col].clip(lower=1e-300))
+
+        # Sort by category then by p-value for consistent ordering
+        if category_col in df.columns:
+            df = df.sort_values([category_col, p_col])
+            categories = df[category_col].unique().tolist()
+            palette = get_phewas_category_palette(categories)
+        else:
+            df = df.sort_values(p_col)
+            categories = []
+            palette = {}
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (one per phenotype)
+        df["y_pos"] = range(len(df))
+
+        # Plot points by category
+        if categories:
+            for cat in categories:
+                cat_data = df[df[category_col] == cat]
+                # Triangles encode effect direction (up/down); circles if no effect column
+                if effect_col and effect_col in cat_data.columns:
+                    for _, row in cat_data.iterrows():
+                        marker = "^" if row[effect_col] >= 0 else "v"
+                        self._backend.scatter(
+                            ax,
+                            pd.Series([row["neglog10p"]]),
+                            pd.Series([row["y_pos"]]),
+                            colors=palette[cat],
+                            sizes=60,
+                            marker=marker,
+                            edgecolor="black",
+                            linewidth=0.5,
+                            zorder=2,
+                        )
+                else:
+                    self._backend.scatter(
+                        ax,
+                        cat_data["neglog10p"],
+                        cat_data["y_pos"],
+                        colors=palette[cat],
+                        sizes=60,
+                        marker="o",
+                        edgecolor="black",
+                        linewidth=0.5,
+                        zorder=2,
+                    )
+        else:
+            self._backend.scatter(
+                ax,
+                df["neglog10p"],
+                df["y_pos"],
+                colors="#4169E1",
+                sizes=60,
+                edgecolor="black",
+                linewidth=0.5,
+                zorder=2,
+            )
+
+        # Add significance threshold line
+        sig_line = -np.log10(significance_threshold)
+        self._backend.axvline(
+            ax, x=sig_line, color="red", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, r"$-\log_{10}$ P")
+        self._backend.set_ylabel(ax, "Phenotype")
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Set y-tick labels to phenotype names (matplotlib only)
+        if self.backend_name == "matplotlib":
+            ax.set_yticks(df["y_pos"])
+            ax.set_yticklabels(df[phenotype_col], fontsize=8)
+
+        self._backend.set_title(ax, f"PheWAS: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
+
+    def plot_forest(
+        self,
+        forest_df: pd.DataFrame,
+        variant_id: str,
+        study_col: str = "study",
+        effect_col: str = "effect",
+        ci_lower_col: str = "ci_lower",
+        ci_upper_col: str = "ci_upper",
+        weight_col: Optional[str] = None,
+        null_value: float = 0.0,
+        effect_label: str = "Effect Size",
+        figsize: Tuple[float, float] = (8, 6),
+    ) -> Any:
+        """Create a forest plot showing effect sizes with confidence intervals.
+
+        Args:
+            forest_df: DataFrame with effect sizes and confidence intervals.
+            variant_id: Variant identifier for plot title.
+            study_col: Column name for study/phenotype names.
+            effect_col: Column name for effect sizes.
+            ci_lower_col: Column name for lower confidence interval.
+            ci_upper_col: Column name for upper confidence interval.
+            weight_col: Optional column for study weights (affects marker size).
+            null_value: Reference value for null effect (0 for beta, 1 for OR).
+            effect_label: X-axis label.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_forest(
+            ...     forest_df,
+            ...     variant_id="rs12345",
+            ...     effect_label="Odds Ratio",
+            ...     null_value=1.0,
+            ... )
+        """
+        validate_forest_df(forest_df, study_col, effect_col, ci_lower_col, ci_upper_col)
+
+        df = forest_df.copy()
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (reverse so first study is at top)
+        df["y_pos"] = range(len(df) - 1, -1, -1)
+
+        # Calculate marker sizes from weights
+        if weight_col and weight_col in df.columns:
+            # Scale weights to marker sizes (min 40, max 200)
+            weights = df[weight_col]
+            min_size, max_size = 40, 200
+            weight_range = weights.max() - weights.min()
+            if weight_range > 0:
+                sizes = min_size + (weights - weights.min()) / weight_range * (
+                    max_size - min_size
+                )
+            else:
+                sizes = (min_size + max_size) / 2
+        else:
+            sizes = 80
+
+        # Calculate error bar extents
+        xerr_lower = df[effect_col] - df[ci_lower_col]
+        xerr_upper = df[ci_upper_col] - df[effect_col]
+
+        # Plot error bars (confidence intervals)
+        self._backend.errorbar_h(
+            ax,
+            x=df[effect_col],
+            y=df["y_pos"],
+            xerr_lower=xerr_lower,
+            xerr_upper=xerr_upper,
+            color="black",
+            linewidth=1.5,
+            capsize=3,
+            zorder=2,
+        )
+
+        # Plot effect size markers
+        self._backend.scatter(
+            ax,
+            df[effect_col],
+            df["y_pos"],
+            colors="#4169E1",
+            sizes=sizes,
+            marker="s",  # square markers typical for forest plots
+            edgecolor="black",
+            linewidth=0.5,
+            zorder=3,
+        )
+
+        # Add null effect line
+        self._backend.axvline(
+            ax, x=null_value, color="grey", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, effect_label)
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Set y-tick labels to study names (matplotlib only)
+        if self.backend_name == "matplotlib":
+            ax.set_yticks(df["y_pos"])
+            ax.set_yticklabels(df[study_col], fontsize=10)
+
+        self._backend.set_title(ax, f"Forest Plot: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
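An end-to-end sketch of the two new entry points on toy data. The bare `LocusZoomPlotter()` construction and the top-level import are assumptions, not shown in this diff:

```python
import pandas as pd
from pylocuszoom import LocusZoomPlotter  # top-level export assumed

plotter = LocusZoomPlotter()

phewas_df = pd.DataFrame({
    "phenotype": ["Type 2 diabetes", "BMI", "Asthma"],
    "p_value": [2e-10, 4e-6, 0.3],
    "category": ["metabolic", "metabolic", "respiratory"],
    "beta": [0.12, -0.05, 0.01],
})
fig = plotter.plot_phewas(phewas_df, variant_id="rs12345", effect_col="beta")

forest_df = pd.DataFrame({
    "study": ["Cohort A", "Cohort B", "Meta-analysis"],
    "effect": [1.25, 1.10, 1.18],
    "ci_lower": [1.05, 0.95, 1.08],
    "ci_upper": [1.49, 1.27, 1.29],
})
fig = plotter.plot_forest(
    forest_df, variant_id="rs12345", effect_label="Odds Ratio", null_value=1.0
)
```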
@@ -9,12 +9,13 @@ Provides:
 import os
 import tarfile
 import tempfile
-import urllib.request
 from pathlib import Path
 from typing import Optional
 
 import pandas as pd
+import requests
 from matplotlib.axes import Axes
+from tqdm import tqdm
 
 from .logging import logger
 
@@ -54,6 +55,38 @@ def get_chain_file_path() -> Path:
     return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"
 
 
+def _download_with_progress(
+    url: str, dest_path: Path, desc: str = "Downloading"
+) -> None:
+    """Download a file with a progress bar.
+
+    Args:
+        url: URL to download from.
+        dest_path: Destination file path.
+        desc: Description for the progress bar.
+    """
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+
+    total_size = int(response.headers.get("content-length", 0))
+
+    with (
+        open(dest_path, "wb") as f,
+        tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            desc=desc,
+            disable=total_size == 0,  # Disable if size unknown
+        ) as pbar,
+    ):
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+                pbar.update(len(chunk))
+
+
 def download_liftover_chain(force: bool = False) -> Path:
     """Download the CanFam3 to CanFam4 liftover chain file.
 
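The helper is the standard requests streaming pattern wrapped in a tqdm bar; note that `requests` and `tqdm` are now imported unconditionally at module level, so the 0.5.0 urllib fallback and its "pip install requests" error path are gone. A self-contained sketch against a hypothetical URL (not one of the package's real data sources):

```python
import requests
from pathlib import Path
from tqdm import tqdm

url = "https://example.org/data.tar.gz"  # hypothetical URL
dest_path = Path("/tmp/data.tar.gz")

response = requests.get(url, stream=True, timeout=60)
response.raise_for_status()
total_size = int(response.headers.get("content-length", 0))  # 0 if unknown

with open(dest_path, "wb") as f, tqdm(
    total=total_size, unit="B", unit_scale=True, unit_divisor=1024,
    desc="data.tar.gz", disable=total_size == 0,  # no bar if size unknown
) as pbar:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:  # skip keep-alive chunks
            f.write(chunk)
            pbar.update(len(chunk))
```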
@@ -73,20 +106,11 @@ def download_liftover_chain(force: bool = False) -> Path:
     logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
     logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")
 
-    try:
-        urllib.request.urlretrieve(CANFAM3_TO_CANFAM4_CHAIN_URL, chain_path)
-    except Exception as e:
-        logger.debug(f"urllib download failed: {e}")
-        try:
-            import requests
-
-            response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
-            response.raise_for_status()
-            chain_path.write_bytes(response.content)
-        except ImportError:
-            raise RuntimeError(
-                "Failed to download. Install requests: pip install requests"
-            )
+    _download_with_progress(
+        CANFAM3_TO_CANFAM4_CHAIN_URL,
+        chain_path,
+        desc="Liftover chain",
+    )
 
     logger.info(f"Chain file saved to: {chain_path}")
     return chain_path
@@ -217,24 +241,14 @@ def download_canine_recombination_maps(
     logger.debug(f"Source: {CANINE_RECOMB_URL}")
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        # Download tar.gz file
+        # Download tar.gz file with progress bar
         tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"
 
-        try:
-            urllib.request.urlretrieve(CANINE_RECOMB_URL, tar_path)
-        except Exception as e:
-            logger.debug(f"urllib download failed: {e}")
-            logger.debug("Trying alternative method with requests...")
-            try:
-                import requests
-
-                response = requests.get(CANINE_RECOMB_URL, timeout=60)
-                response.raise_for_status()
-                tar_path.write_bytes(response.content)
-            except ImportError:
-                raise RuntimeError(
-                    "Failed to download. Install requests: pip install requests"
-                )
+        _download_with_progress(
+            CANINE_RECOMB_URL,
+            tar_path,
+            desc="Recombination maps",
+        )
 
         logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")
 
pylocuszoom/schemas.py CHANGED
@@ -84,30 +84,36 @@ def validate_gwas_dataframe(
             "GWAS validation failed:\n - " + "\n - ".join(errors)
         )
 
-    # Check data types
-    if not pd.api.types.is_numeric_dtype(df[pos_col]):
+    # Check data types (must be numeric for range checks)
+    pos_is_numeric = pd.api.types.is_numeric_dtype(df[pos_col])
+    p_is_numeric = pd.api.types.is_numeric_dtype(df[p_col])
+
+    if not pos_is_numeric:
         errors.append(f"Column '{pos_col}' must be numeric, got {df[pos_col].dtype}")
 
-    if not pd.api.types.is_numeric_dtype(df[p_col]):
+    if not p_is_numeric:
         errors.append(f"Column '{p_col}' must be numeric, got {df[p_col].dtype}")
 
-    # Check value ranges
-    if (df[pos_col] <= 0).any():
-        n_invalid = (df[pos_col] <= 0).sum()
-        errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
+    # Only check value ranges if columns are numeric (avoid confusing errors)
+    if pos_is_numeric:
+        if (df[pos_col] <= 0).any():
+            n_invalid = (df[pos_col] <= 0).sum()
+            errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
 
-    if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
-        n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
-        errors.append(f"Column '{p_col}' has {n_invalid} values outside range (0, 1]")
+        if df[pos_col].isna().any():
+            n_na = df[pos_col].isna().sum()
+            errors.append(f"Column '{pos_col}' has {n_na} missing values")
 
-    # Check for NaN in required columns
-    if df[pos_col].isna().any():
-        n_na = df[pos_col].isna().sum()
-        errors.append(f"Column '{pos_col}' has {n_na} missing values")
+    if p_is_numeric:
+        if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
+            n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
+            errors.append(
+                f"Column '{p_col}' has {n_invalid} values outside range (0, 1]"
+            )
 
-    if df[p_col].isna().any():
-        n_na = df[p_col].isna().sum()
-        errors.append(f"Column '{p_col}' has {n_na} missing values")
+        if df[p_col].isna().any():
+            n_na = df[p_col].isna().sum()
+            errors.append(f"Column '{p_col}' has {n_na} missing values")
 
     if errors:
         raise LoaderValidationError(
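The practical effect of the gating: a non-numeric column now produces a single, accurate dtype error instead of a TypeError (or misleading range errors) from comparing strings with numbers. A toy sketch, assuming `validate_gwas_dataframe` is importable from `pylocuszoom.schemas` and takes the keyword arguments used in `load_saige` above:

```python
import pandas as pd
from pylocuszoom.schemas import validate_gwas_dataframe  # import path assumed

df = pd.DataFrame({
    "pos": ["1000000", "2000000"],  # strings, not integers
    "p_value": [0.5, 0.9],
    "rsid": ["rs1", "rs2"],
})
try:
    validate_gwas_dataframe(df, pos_col="pos", p_col="p_value", rs_col="rsid")
except Exception as e:
    print(e)  # only: "Column 'pos' must be numeric, got object"
```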
@@ -344,20 +350,25 @@
         )
 
     # Check data types
-    if not pd.api.types.is_numeric_dtype(df["start"]):
+    start_is_numeric = pd.api.types.is_numeric_dtype(df["start"])
+    end_is_numeric = pd.api.types.is_numeric_dtype(df["end"])
+
+    if not start_is_numeric:
         errors.append(f"Column 'start' must be numeric, got {df['start'].dtype}")
 
-    if not pd.api.types.is_numeric_dtype(df["end"]):
+    if not end_is_numeric:
         errors.append(f"Column 'end' must be numeric, got {df['end'].dtype}")
 
-    # Check ranges
-    if (df["start"] < 0).any():
-        n_invalid = (df["start"] < 0).sum()
-        errors.append(f"Column 'start' has {n_invalid} negative values")
+    # Only check ranges if columns are numeric (avoid confusing errors)
+    if start_is_numeric:
+        if (df["start"] < 0).any():
+            n_invalid = (df["start"] < 0).sum()
+            errors.append(f"Column 'start' has {n_invalid} negative values")
 
-    if (df["end"] < df["start"]).any():
-        n_invalid = (df["end"] < df["start"]).sum()
-        errors.append(f"Found {n_invalid} genes where end < start")
+    if start_is_numeric and end_is_numeric:
+        if (df["end"] < df["start"]).any():
+            n_invalid = (df["end"] < df["start"]).sum()
+            errors.append(f"Found {n_invalid} genes where end < start")
 
     if errors:
         raise LoaderValidationError(