pylocuszoom 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +74 -2
- pylocuszoom/backends/base.py +131 -0
- pylocuszoom/backends/bokeh_backend.py +254 -68
- pylocuszoom/backends/matplotlib_backend.py +173 -0
- pylocuszoom/backends/plotly_backend.py +327 -87
- pylocuszoom/colors.py +44 -1
- pylocuszoom/forest.py +37 -0
- pylocuszoom/gene_track.py +1 -0
- pylocuszoom/loaders.py +880 -0
- pylocuszoom/phewas.py +35 -0
- pylocuszoom/plotter.py +342 -117
- pylocuszoom/py.typed +0 -0
- pylocuszoom/recombination.py +49 -35
- pylocuszoom/schemas.py +406 -0
- {pylocuszoom-0.3.0.dist-info → pylocuszoom-0.6.0.dist-info}/METADATA +153 -25
- pylocuszoom-0.6.0.dist-info/RECORD +26 -0
- pylocuszoom-0.3.0.dist-info/RECORD +0 -21
- {pylocuszoom-0.3.0.dist-info → pylocuszoom-0.6.0.dist-info}/WHEEL +0 -0
- {pylocuszoom-0.3.0.dist-info → pylocuszoom-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/recombination.py
CHANGED
|
@@ -9,12 +9,13 @@ Provides:
|
|
|
9
9
|
import os
|
|
10
10
|
import tarfile
|
|
11
11
|
import tempfile
|
|
12
|
-
import urllib.request
|
|
13
12
|
from pathlib import Path
|
|
14
13
|
from typing import Optional
|
|
15
14
|
|
|
16
15
|
import pandas as pd
|
|
16
|
+
import requests
|
|
17
17
|
from matplotlib.axes import Axes
|
|
18
|
+
from tqdm import tqdm
|
|
18
19
|
|
|
19
20
|
from .logging import logger
|
|
20
21
|
|
|
@@ -42,7 +43,7 @@ def _normalize_build(build: Optional[str]) -> Optional[str]:
|
|
|
42
43
|
if build is None:
|
|
43
44
|
return None
|
|
44
45
|
build_lower = build.lower().replace(".", "").replace("_", "")
|
|
45
|
-
if
|
|
46
|
+
if any(x in build_lower for x in ("canfam4", "uucfamgsd")):
|
|
46
47
|
return "canfam4"
|
|
47
48
|
if "canfam3" in build_lower:
|
|
48
49
|
return "canfam3"
|
|
@@ -54,6 +55,38 @@ def get_chain_file_path() -> Path:
|
|
|
54
55
|
return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"
|
|
55
56
|
|
|
56
57
|
|
|
58
|
+
def _download_with_progress(
    url: str, dest_path: Path, desc: str = "Downloading"
) -> None:
    """Download a file with a progress bar.

    The body is streamed in 8 KiB chunks into a temporary ``<name>.part``
    file next to ``dest_path`` and only renamed onto ``dest_path`` once the
    whole response has been written. A failed or interrupted transfer
    therefore never leaves a truncated file at the final location.

    Args:
        url: URL to download from.
        dest_path: Destination file path.
        desc: Description for the progress bar.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    dest_path = Path(dest_path)
    partial_path = dest_path.with_name(dest_path.name + ".part")

    try:
        # Using the response as a context manager releases the connection
        # even when writing fails part-way through the stream.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            with (
                open(partial_path, "wb") as f,
                tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=desc,
                    disable=total_size == 0,  # Disable if size unknown
                ) as pbar,
            ):
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Publish the finished download in one step.
        partial_path.replace(dest_path)
    finally:
        # No-op after a successful rename; removes leftovers on failure.
        partial_path.unlink(missing_ok=True)
|
|
88
|
+
|
|
89
|
+
|
|
57
90
|
def download_liftover_chain(force: bool = False) -> Path:
|
|
58
91
|
"""Download the CanFam3 to CanFam4 liftover chain file.
|
|
59
92
|
|
|
@@ -73,20 +106,11 @@ def download_liftover_chain(force: bool = False) -> Path:
|
|
|
73
106
|
logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
|
|
74
107
|
logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")
|
|
75
108
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
import requests
|
|
82
|
-
|
|
83
|
-
response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
|
|
84
|
-
response.raise_for_status()
|
|
85
|
-
chain_path.write_bytes(response.content)
|
|
86
|
-
except ImportError:
|
|
87
|
-
raise RuntimeError(
|
|
88
|
-
"Failed to download. Install requests: pip install requests"
|
|
89
|
-
)
|
|
109
|
+
_download_with_progress(
|
|
110
|
+
CANFAM3_TO_CANFAM4_CHAIN_URL,
|
|
111
|
+
chain_path,
|
|
112
|
+
desc="Liftover chain",
|
|
113
|
+
)
|
|
90
114
|
|
|
91
115
|
logger.info(f"Chain file saved to: {chain_path}")
|
|
92
116
|
return chain_path
|
|
@@ -158,9 +182,9 @@ def get_default_data_dir() -> Path:
|
|
|
158
182
|
"""Get default directory for recombination map data.
|
|
159
183
|
|
|
160
184
|
Returns platform-appropriate cache directory:
|
|
161
|
-
- macOS:
|
|
162
|
-
- Linux: ~/.cache/snp-scope-plot
|
|
185
|
+
- macOS/Linux: ~/.cache/snp-scope-plot (or $XDG_CACHE_HOME if set)
|
|
163
186
|
- Windows: %LOCALAPPDATA%/snp-scope-plot
|
|
187
|
+
- Databricks: /dbfs/FileStore/reference_data/recombination_maps
|
|
164
188
|
"""
|
|
165
189
|
if os.name == "nt": # Windows
|
|
166
190
|
base = Path(os.environ.get("LOCALAPPDATA", Path.home()))
|
|
@@ -207,7 +231,7 @@ def download_canine_recombination_maps(
|
|
|
207
231
|
# Check if already downloaded
|
|
208
232
|
if output_path.exists() and not force:
|
|
209
233
|
existing_files = list(output_path.glob("chr*_recomb.tsv"))
|
|
210
|
-
if len(existing_files) >=
|
|
234
|
+
if len(existing_files) >= 39: # 38 autosomes + X
|
|
211
235
|
return output_path
|
|
212
236
|
|
|
213
237
|
# Create output directory
|
|
@@ -217,24 +241,14 @@ def download_canine_recombination_maps(
|
|
|
217
241
|
logger.debug(f"Source: {CANINE_RECOMB_URL}")
|
|
218
242
|
|
|
219
243
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
220
|
-
# Download tar.gz file
|
|
244
|
+
# Download tar.gz file with progress bar
|
|
221
245
|
tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"
|
|
222
246
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
try:
|
|
229
|
-
import requests
|
|
230
|
-
|
|
231
|
-
response = requests.get(CANINE_RECOMB_URL, timeout=60)
|
|
232
|
-
response.raise_for_status()
|
|
233
|
-
tar_path.write_bytes(response.content)
|
|
234
|
-
except ImportError:
|
|
235
|
-
raise RuntimeError(
|
|
236
|
-
"Failed to download. Install requests: pip install requests"
|
|
237
|
-
)
|
|
247
|
+
_download_with_progress(
|
|
248
|
+
CANINE_RECOMB_URL,
|
|
249
|
+
tar_path,
|
|
250
|
+
desc="Recombination maps",
|
|
251
|
+
)
|
|
238
252
|
|
|
239
253
|
logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")
|
|
240
254
|
|
pylocuszoom/schemas.py
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
"""Pydantic validation schemas for loaded data.
|
|
2
|
+
|
|
3
|
+
Provides validation models for GWAS, eQTL, fine-mapping, and gene annotation
|
|
4
|
+
DataFrames to ensure data quality before plotting.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional, Union
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, field_validator, model_validator
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LoaderValidationError(Exception):
    """Error raised by this module's validators when loaded data is rejected."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# =============================================================================
|
|
21
|
+
# GWAS Validation
|
|
22
|
+
# =============================================================================
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class GWASRowModel(BaseModel):
    """Pydantic schema for one row of GWAS results.

    Extra columns are accepted (``extra="allow"``); only the position and
    p-value fields carry value constraints.
    """

    model_config = ConfigDict(extra="allow")

    # Required fields.
    ps: int
    p_wald: float
    # Optional fields, default to None when absent.
    rs: Optional[str] = None
    chr: Optional[Union[str, int]] = None

    @field_validator("ps")
    @classmethod
    def position_positive(cls, v: int) -> int:
        """Accept only strictly positive positions."""
        if v > 0:
            return v
        raise ValueError(f"Position must be positive, got {v}")

    @field_validator("p_wald")
    @classmethod
    def pvalue_in_range(cls, v: float) -> float:
        """Accept only p-values in the half-open interval (0, 1]."""
        if 0 < v <= 1:
            return v
        raise ValueError(f"P-value must be in range (0, 1], got {v}")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def validate_gwas_dataframe(
    df: pd.DataFrame,
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
    strict: bool = False,
) -> pd.DataFrame:
    """Check that a GWAS DataFrame has usable position and p-value columns.

    Validation runs in two stages. First the required columns must exist;
    if any is missing an error is raised immediately. Then every remaining
    problem (non-numeric dtype, non-positive positions, p-values outside
    (0, 1], missing values) is collected and reported in a single error.

    Args:
        df: DataFrame to validate.
        pos_col: Column name for position.
        p_col: Column name for p-value.
        rs_col: Column name for SNP ID. Accepted but not inspected here.
        strict: Accepted but not inspected here; the same column-level
            checks run regardless of its value.

    Returns:
        The input DataFrame, unchanged.

    Raises:
        LoaderValidationError: If validation fails.
    """
    absent = [
        f"Missing required column: '{name}'"
        for name in (pos_col, p_col)
        if name not in df.columns
    ]
    if absent:
        raise LoaderValidationError(
            "GWAS validation failed:\n - " + "\n - ".join(absent)
        )

    positions = df[pos_col]
    pvalues = df[p_col]
    positions_numeric = pd.api.types.is_numeric_dtype(positions)
    pvalues_numeric = pd.api.types.is_numeric_dtype(pvalues)

    problems = []

    # Dtype complaints come first so the report reads top-down.
    if not positions_numeric:
        problems.append(f"Column '{pos_col}' must be numeric, got {positions.dtype}")
    if not pvalues_numeric:
        problems.append(f"Column '{p_col}' must be numeric, got {pvalues.dtype}")

    # Value checks only make sense on numeric columns.
    if positions_numeric:
        non_positive = positions <= 0
        if non_positive.any():
            problems.append(
                f"Column '{pos_col}' has {non_positive.sum()} non-positive values"
            )
        pos_missing = positions.isna()
        if pos_missing.any():
            problems.append(f"Column '{pos_col}' has {pos_missing.sum()} missing values")

    if pvalues_numeric:
        out_of_range = (pvalues <= 0) | (pvalues > 1)
        if out_of_range.any():
            problems.append(
                f"Column '{p_col}' has {out_of_range.sum()} values outside range (0, 1]"
            )
        p_missing = pvalues.isna()
        if p_missing.any():
            problems.append(f"Column '{p_col}' has {p_missing.sum()} missing values")

    if problems:
        raise LoaderValidationError(
            "GWAS validation failed:\n - " + "\n - ".join(problems)
        )

    return df
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# =============================================================================
|
|
127
|
+
# eQTL Validation
|
|
128
|
+
# =============================================================================
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class EQTLRowModel(BaseModel):
    """Pydantic schema for one row of eQTL results.

    Extra columns are accepted (``extra="allow"``); only the position and
    p-value fields carry value constraints.
    """

    model_config = ConfigDict(extra="allow")

    # Required fields.
    pos: int
    p_value: float
    gene: str
    # Optional field, defaults to None when absent.
    effect: Optional[float] = None

    @field_validator("pos")
    @classmethod
    def position_positive(cls, v: int) -> int:
        """Accept only strictly positive positions."""
        if v > 0:
            return v
        raise ValueError(f"Position must be positive, got {v}")

    @field_validator("p_value")
    @classmethod
    def pvalue_in_range(cls, v: float) -> float:
        """Accept only p-values in the half-open interval (0, 1]."""
        if 0 < v <= 1:
            return v
        raise ValueError(f"P-value must be in range (0, 1], got {v}")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def validate_eqtl_dataframe(
    df: pd.DataFrame,
    strict: bool = False,
) -> pd.DataFrame:
    """Validate an eQTL DataFrame.

    The columns ``pos``, ``p_value`` and ``gene`` must exist. ``pos`` and
    ``p_value`` must be numeric, free of missing values, and within range
    (``pos`` > 0, ``p_value`` in (0, 1]). Missing values are checked
    explicitly because comparisons against NaN are always False and would
    otherwise slip through the range checks.

    Args:
        df: DataFrame to validate.
        strict: Accepted but not inspected here; the same column-level
            checks run regardless of its value.

    Returns:
        The input DataFrame, unchanged.

    Raises:
        LoaderValidationError: If validation fails. When the required
            columns are present, all remaining problems are reported
            together.
    """
    errors = [
        f"Missing required column: '{col}'"
        for col in ("pos", "p_value", "gene")
        if col not in df.columns
    ]
    if errors:
        raise LoaderValidationError(
            "eQTL validation failed:\n - " + "\n - ".join(errors)
        )

    pos = df["pos"]
    if not pd.api.types.is_numeric_dtype(pos):
        errors.append(f"Column 'pos' must be numeric, got {pos.dtype}")
    else:
        n_invalid = int((pos <= 0).sum())
        if n_invalid:
            errors.append(f"Column 'pos' has {n_invalid} non-positive values")
        n_na = int(pos.isna().sum())
        if n_na:
            errors.append(f"Column 'pos' has {n_na} missing values")

    p_value = df["p_value"]
    if not pd.api.types.is_numeric_dtype(p_value):
        errors.append(f"Column 'p_value' must be numeric, got {p_value.dtype}")
    else:
        n_invalid = int(((p_value <= 0) | (p_value > 1)).sum())
        if n_invalid:
            errors.append(
                f"Column 'p_value' has {n_invalid} values outside range (0, 1]"
            )
        n_na = int(p_value.isna().sum())
        if n_na:
            errors.append(f"Column 'p_value' has {n_na} missing values")

    if errors:
        raise LoaderValidationError(
            "eQTL validation failed:\n - " + "\n - ".join(errors)
        )

    return df
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# =============================================================================
|
|
209
|
+
# Fine-mapping Validation
|
|
210
|
+
# =============================================================================
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class FinemappingRowModel(BaseModel):
    """Pydantic schema for one row of fine-mapping results.

    Extra columns are accepted (``extra="allow"``); only the position and
    PIP fields carry value constraints.
    """

    model_config = ConfigDict(extra="allow")

    # Required fields.
    pos: int
    pip: float
    # Optional field, defaults to None when absent.
    cs: Optional[int] = None

    @field_validator("pos")
    @classmethod
    def position_positive(cls, v: int) -> int:
        """Accept only strictly positive positions."""
        if v > 0:
            return v
        raise ValueError(f"Position must be positive, got {v}")

    @field_validator("pip")
    @classmethod
    def pip_in_range(cls, v: float) -> float:
        """Accept only PIP values in the closed interval [0, 1]."""
        if 0 <= v <= 1:
            return v
        raise ValueError(f"PIP must be in range [0, 1], got {v}")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def validate_finemapping_dataframe(
    df: pd.DataFrame,
    cs_col: str = "cs",
    strict: bool = False,
) -> pd.DataFrame:
    """Validate a fine-mapping DataFrame.

    The columns ``pos`` and ``pip`` must exist, be numeric, contain no
    missing values, and be within range (``pos`` > 0, ``pip`` in [0, 1]).
    Missing values are checked explicitly because comparisons against NaN
    are always False and would otherwise slip through the range checks.

    Args:
        df: DataFrame to validate.
        cs_col: Column name for credible set. Accepted but not inspected
            here.
        strict: Accepted but not inspected here; the same column-level
            checks run regardless of its value.

    Returns:
        The input DataFrame, unchanged.

    Raises:
        LoaderValidationError: If validation fails. When the required
            columns are present, all remaining problems are reported
            together.
    """
    errors = [
        f"Missing required column: '{col}'"
        for col in ("pos", "pip")
        if col not in df.columns
    ]
    if errors:
        raise LoaderValidationError(
            "Fine-mapping validation failed:\n - " + "\n - ".join(errors)
        )

    pos = df["pos"]
    if not pd.api.types.is_numeric_dtype(pos):
        errors.append(f"Column 'pos' must be numeric, got {pos.dtype}")
    else:
        n_invalid = int((pos <= 0).sum())
        if n_invalid:
            errors.append(f"Column 'pos' has {n_invalid} non-positive values")
        n_na = int(pos.isna().sum())
        if n_na:
            errors.append(f"Column 'pos' has {n_na} missing values")

    pip = df["pip"]
    if not pd.api.types.is_numeric_dtype(pip):
        errors.append(f"Column 'pip' must be numeric, got {pip.dtype}")
    else:
        n_invalid = int(((pip < 0) | (pip > 1)).sum())
        if n_invalid:
            errors.append(f"Column 'pip' has {n_invalid} values outside range [0, 1]")
        n_na = int(pip.isna().sum())
        if n_na:
            errors.append(f"Column 'pip' has {n_na} missing values")

    if errors:
        raise LoaderValidationError(
            "Fine-mapping validation failed:\n - " + "\n - ".join(errors)
        )

    return df
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
# =============================================================================
|
|
292
|
+
# Gene Annotation Validation
|
|
293
|
+
# =============================================================================
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
class GeneRowModel(BaseModel):
    """Pydantic schema for one gene annotation row.

    Extra columns are accepted (``extra="allow"``). Coordinates must be
    non-negative and ``start`` may not exceed ``end``.
    """

    model_config = ConfigDict(extra="allow")

    # Required fields.
    chr: Union[str, int]
    start: int
    end: int
    gene_name: str
    # Optional field, defaults to None when absent.
    strand: Optional[str] = None

    @field_validator("start", "end")
    @classmethod
    def position_positive(cls, v: int) -> int:
        """Accept only non-negative coordinates (zero is allowed)."""
        if v >= 0:
            return v
        raise ValueError(f"Position must be non-negative, got {v}")

    @model_validator(mode="after")
    def start_before_end(self):
        """Require ``start`` <= ``end`` once both fields are validated."""
        if self.start <= self.end:
            return self
        raise ValueError(f"Start ({self.start}) must be <= end ({self.end})")
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def validate_genes_dataframe(
    df: pd.DataFrame,
    strict: bool = False,
) -> pd.DataFrame:
    """Validate a genes DataFrame.

    The columns ``chr``, ``start``, ``end`` and ``gene_name`` must exist.
    ``start`` and ``end`` must be numeric and free of missing values,
    ``start`` must be non-negative, and ``end`` must not precede ``start``.
    Missing values are checked explicitly because comparisons against NaN
    are always False, so a row with a missing coordinate would otherwise
    pass both the sign check and the ordering check.

    Args:
        df: DataFrame to validate.
        strict: Accepted but not inspected here; the same column-level
            checks run regardless of its value.

    Returns:
        The input DataFrame, unchanged.

    Raises:
        LoaderValidationError: If validation fails. When the required
            columns are present, all remaining problems are reported
            together.
    """
    errors = [
        f"Missing required column: '{col}'"
        for col in ("chr", "start", "end", "gene_name")
        if col not in df.columns
    ]
    if errors:
        raise LoaderValidationError(
            "Gene annotation validation failed:\n - " + "\n - ".join(errors)
        )

    start = df["start"]
    end = df["end"]
    start_is_numeric = pd.api.types.is_numeric_dtype(start)
    end_is_numeric = pd.api.types.is_numeric_dtype(end)

    if not start_is_numeric:
        errors.append(f"Column 'start' must be numeric, got {start.dtype}")

    if not end_is_numeric:
        errors.append(f"Column 'end' must be numeric, got {end.dtype}")

    # Only check values on numeric columns (avoids confusing errors).
    if start_is_numeric:
        n_invalid = int((start < 0).sum())
        if n_invalid:
            errors.append(f"Column 'start' has {n_invalid} negative values")
        n_na = int(start.isna().sum())
        if n_na:
            errors.append(f"Column 'start' has {n_na} missing values")

    if end_is_numeric:
        n_na = int(end.isna().sum())
        if n_na:
            errors.append(f"Column 'end' has {n_na} missing values")

    if start_is_numeric and end_is_numeric:
        n_invalid = int((end < start).sum())
        if n_invalid:
            errors.append(f"Found {n_invalid} genes where end < start")

    if errors:
        raise LoaderValidationError(
            "Gene annotation validation failed:\n - " + "\n - ".join(errors)
        )

    return df
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# =============================================================================
|
|
382
|
+
# File Path Validation
|
|
383
|
+
# =============================================================================
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def validate_file_path(filepath: Union[str, Path]) -> Path:
    """Validate that a file path exists, is a regular file, and is readable.

    Args:
        filepath: Path to validate.

    Returns:
        Validated Path object.

    Raises:
        LoaderValidationError: If the path doesn't exist, isn't a file, or
            isn't readable by the current process.
    """
    import os  # local import: only this function needs it

    path = Path(filepath)

    if not path.exists():
        raise LoaderValidationError(f"File not found: {path}")

    if not path.is_file():
        raise LoaderValidationError(f"Not a file: {path}")

    # Report permission problems here with a clear message rather than
    # letting them surface later as a raw OSError from the reader.
    if not os.access(path, os.R_OK):
        raise LoaderValidationError(f"File is not readable: {path}")

    return path
|