pylocuszoom 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,12 +9,13 @@ Provides:
  import os
  import tarfile
  import tempfile
- import urllib.request
  from pathlib import Path
  from typing import Optional

  import pandas as pd
+ import requests
  from matplotlib.axes import Axes
+ from tqdm import tqdm

  from .logging import logger

@@ -42,7 +43,7 @@ def _normalize_build(build: Optional[str]) -> Optional[str]:
      if build is None:
          return None
      build_lower = build.lower().replace(".", "").replace("_", "")
-     if "canfam4" in build_lower or "uucfamgsd" in build_lower:
+     if any(x in build_lower for x in ("canfam4", "uucfamgsd")):
          return "canfam4"
      if "canfam3" in build_lower:
          return "canfam3"
@@ -54,6 +55,38 @@ def get_chain_file_path() -> Path:
      return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"


+ def _download_with_progress(
+     url: str, dest_path: Path, desc: str = "Downloading"
+ ) -> None:
+     """Download a file with a progress bar.
+
+     Args:
+         url: URL to download from.
+         dest_path: Destination file path.
+         desc: Description for the progress bar.
+     """
+     response = requests.get(url, stream=True, timeout=60)
+     response.raise_for_status()
+
+     total_size = int(response.headers.get("content-length", 0))
+
+     with (
+         open(dest_path, "wb") as f,
+         tqdm(
+             total=total_size,
+             unit="B",
+             unit_scale=True,
+             unit_divisor=1024,
+             desc=desc,
+             disable=total_size == 0,  # Disable if size unknown
+         ) as pbar,
+     ):
+         for chunk in response.iter_content(chunk_size=8192):
+             if chunk:
+                 f.write(chunk)
+                 pbar.update(len(chunk))
+
+
  def download_liftover_chain(force: bool = False) -> Path:
      """Download the CanFam3 to CanFam4 liftover chain file.

@@ -73,20 +106,11 @@ def download_liftover_chain(force: bool = False) -> Path:
      logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
      logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")

-     try:
-         urllib.request.urlretrieve(CANFAM3_TO_CANFAM4_CHAIN_URL, chain_path)
-     except Exception as e:
-         logger.debug(f"urllib download failed: {e}")
-         try:
-             import requests
-
-             response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
-             response.raise_for_status()
-             chain_path.write_bytes(response.content)
-         except ImportError:
-             raise RuntimeError(
-                 "Failed to download. Install requests: pip install requests"
-             )
+     _download_with_progress(
+         CANFAM3_TO_CANFAM4_CHAIN_URL,
+         chain_path,
+         desc="Liftover chain",
+     )

      logger.info(f"Chain file saved to: {chain_path}")
      return chain_path
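
Both download helpers now funnel through _download_with_progress, which streams the response through requests with a tqdm progress bar and raises on HTTP errors instead of silently falling back to urllib. One practical consequence is that callers now see requests exceptions on network failure. A minimal caller-side sketch, assuming the helpers are importable from the package's download module (the module path is not shown in this diff):

    # Hypothetical usage; the import path below is an assumption, not shown in the diff.
    import requests

    from pylocuszoom.downloads import download_liftover_chain, get_chain_file_path

    try:
        chain_path = download_liftover_chain()  # streams with a tqdm progress bar
    except requests.RequestException as exc:  # raised by raise_for_status() or connection errors
        print(f"Chain download failed: {exc}")
        chain_path = get_chain_file_path()  # path of a previously cached copy, if one exists
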
@@ -158,9 +182,9 @@ def get_default_data_dir() -> Path:
      """Get default directory for recombination map data.

      Returns platform-appropriate cache directory:
-     - macOS: ~/Library/Caches/snp-scope-plot
-     - Linux: ~/.cache/snp-scope-plot
+     - macOS/Linux: ~/.cache/snp-scope-plot (or $XDG_CACHE_HOME if set)
      - Windows: %LOCALAPPDATA%/snp-scope-plot
+     - Databricks: /dbfs/FileStore/reference_data/recombination_maps
      """
      if os.name == "nt":  # Windows
          base = Path(os.environ.get("LOCALAPPDATA", Path.home()))
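
Per the updated docstring, Linux and macOS now share a single cache location and $XDG_CACHE_HOME is honoured when set. A quick way to check that resolution, assuming get_default_data_dir is importable from the same module (import path not shown in this diff) and that the non-Windows branch behaves as the docstring states:

    # Hypothetical check of the documented cache resolution; import path is an assumption.
    import os

    from pylocuszoom.downloads import get_default_data_dir

    os.environ["XDG_CACHE_HOME"] = "/tmp/my-cache"
    print(get_default_data_dir())  # per the docstring: /tmp/my-cache/snp-scope-plot on Linux/macOS
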
@@ -207,7 +231,7 @@ def download_canine_recombination_maps(
      # Check if already downloaded
      if output_path.exists() and not force:
          existing_files = list(output_path.glob("chr*_recomb.tsv"))
-         if len(existing_files) >= 38:  # 38 autosomes + X
+         if len(existing_files) >= 39:  # 38 autosomes + X
              return output_path

      # Create output directory
@@ -217,24 +241,14 @@ def download_canine_recombination_maps(
      logger.debug(f"Source: {CANINE_RECOMB_URL}")

      with tempfile.TemporaryDirectory() as tmpdir:
-         # Download tar.gz file
+         # Download tar.gz file with progress bar
          tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"

-         try:
-             urllib.request.urlretrieve(CANINE_RECOMB_URL, tar_path)
-         except Exception as e:
-             logger.debug(f"urllib download failed: {e}")
-             logger.debug("Trying alternative method with requests...")
-             try:
-                 import requests
-
-                 response = requests.get(CANINE_RECOMB_URL, timeout=60)
-                 response.raise_for_status()
-                 tar_path.write_bytes(response.content)
-             except ImportError:
-                 raise RuntimeError(
-                     "Failed to download. Install requests: pip install requests"
-                 )
+         _download_with_progress(
+             CANINE_RECOMB_URL,
+             tar_path,
+             desc="Recombination maps",
+         )

          logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")

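The cache-completeness check now requires 39 files (38 autosomes plus X) matching chr*_recomb.tsv before the download is skipped, fixing the off-by-one in 0.3.0. A sketch of inspecting or refreshing the cache, again assuming the function is importable from the package's download module and that its defaults are unchanged:

    # Hypothetical usage; the import path is an assumption.
    from pylocuszoom.downloads import download_canine_recombination_maps

    maps_dir = download_canine_recombination_maps()  # no-op when >= 39 chr*_recomb.tsv files are cached
    print(sorted(p.name for p in maps_dir.glob("chr*_recomb.tsv"))[:3])

    download_canine_recombination_maps(force=True)  # re-download even if the cache looks complete
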
pylocuszoom/schemas.py ADDED
@@ -0,0 +1,406 @@
+ """Pydantic validation schemas for loaded data.
+
+ Provides validation models for GWAS, eQTL, fine-mapping, and gene annotation
+ DataFrames to ensure data quality before plotting.
+ """
+
+ from pathlib import Path
+ from typing import Optional, Union
+
+ import pandas as pd
+ from pydantic import BaseModel, ConfigDict, field_validator, model_validator
+
+
+ class LoaderValidationError(Exception):
+     """Raised when loaded data fails validation."""
+
+     pass
+
+
+ # =============================================================================
+ # GWAS Validation
+ # =============================================================================
+
+
+ class GWASRowModel(BaseModel):
+     """Validation model for a single GWAS row."""
+
+     model_config = ConfigDict(extra="allow")
+
+     ps: int
+     p_wald: float
+     rs: Optional[str] = None
+     chr: Optional[Union[str, int]] = None
+
+     @field_validator("ps")
+     @classmethod
+     def position_positive(cls, v: int) -> int:
+         """Position must be positive."""
+         if v <= 0:
+             raise ValueError(f"Position must be positive, got {v}")
+         return v
+
+     @field_validator("p_wald")
+     @classmethod
+     def pvalue_in_range(cls, v: float) -> float:
+         """P-value must be between 0 and 1."""
+         if not (0 < v <= 1):
+             raise ValueError(f"P-value must be in range (0, 1], got {v}")
+         return v
+
+
+ def validate_gwas_dataframe(
+     df: pd.DataFrame,
+     pos_col: str = "ps",
+     p_col: str = "p_wald",
+     rs_col: str = "rs",
+     strict: bool = False,
+ ) -> pd.DataFrame:
+     """Validate a GWAS DataFrame.
+
+     Args:
+         df: DataFrame to validate.
+         pos_col: Column name for position.
+         p_col: Column name for p-value.
+         rs_col: Column name for SNP ID.
+         strict: If True, validate every row. If False (default), validate schema only.
+
+     Returns:
+         Validated DataFrame.
+
+     Raises:
+         LoaderValidationError: If validation fails.
+     """
+     errors = []
+
+     # Check required columns exist
+     if pos_col not in df.columns:
+         errors.append(f"Missing required column: '{pos_col}'")
+     if p_col not in df.columns:
+         errors.append(f"Missing required column: '{p_col}'")
+
+     if errors:
+         raise LoaderValidationError(
+             "GWAS validation failed:\n - " + "\n - ".join(errors)
+         )
+
+     # Check data types (must be numeric for range checks)
+     pos_is_numeric = pd.api.types.is_numeric_dtype(df[pos_col])
+     p_is_numeric = pd.api.types.is_numeric_dtype(df[p_col])
+
+     if not pos_is_numeric:
+         errors.append(f"Column '{pos_col}' must be numeric, got {df[pos_col].dtype}")
+
+     if not p_is_numeric:
+         errors.append(f"Column '{p_col}' must be numeric, got {df[p_col].dtype}")
+
+     # Only check value ranges if columns are numeric (avoid confusing errors)
+     if pos_is_numeric:
+         if (df[pos_col] <= 0).any():
+             n_invalid = (df[pos_col] <= 0).sum()
+             errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
+
+         if df[pos_col].isna().any():
+             n_na = df[pos_col].isna().sum()
+             errors.append(f"Column '{pos_col}' has {n_na} missing values")
+
+     if p_is_numeric:
+         if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
+             n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
+             errors.append(
+                 f"Column '{p_col}' has {n_invalid} values outside range (0, 1]"
+             )
+
+         if df[p_col].isna().any():
+             n_na = df[p_col].isna().sum()
+             errors.append(f"Column '{p_col}' has {n_na} missing values")
+
+     if errors:
+         raise LoaderValidationError(
+             "GWAS validation failed:\n - " + "\n - ".join(errors)
+         )
+
+     return df
+
+
+ # =============================================================================
+ # eQTL Validation
+ # =============================================================================
+
+
+ class EQTLRowModel(BaseModel):
+     """Validation model for a single eQTL row."""
+
+     model_config = ConfigDict(extra="allow")
+
+     pos: int
+     p_value: float
+     gene: str
+     effect: Optional[float] = None
+
+     @field_validator("pos")
+     @classmethod
+     def position_positive(cls, v: int) -> int:
+         """Position must be positive."""
+         if v <= 0:
+             raise ValueError(f"Position must be positive, got {v}")
+         return v
+
+     @field_validator("p_value")
+     @classmethod
+     def pvalue_in_range(cls, v: float) -> float:
+         """P-value must be between 0 and 1."""
+         if not (0 < v <= 1):
+             raise ValueError(f"P-value must be in range (0, 1], got {v}")
+         return v
+
+
+ def validate_eqtl_dataframe(
+     df: pd.DataFrame,
+     strict: bool = False,
+ ) -> pd.DataFrame:
+     """Validate an eQTL DataFrame.
+
+     Args:
+         df: DataFrame to validate.
+         strict: If True, validate every row.
+
+     Returns:
+         Validated DataFrame.
+
+     Raises:
+         LoaderValidationError: If validation fails.
+     """
+     errors = []
+
+     # Check required columns
+     required = ["pos", "p_value", "gene"]
+     for col in required:
+         if col not in df.columns:
+             errors.append(f"Missing required column: '{col}'")
+
+     if errors:
+         raise LoaderValidationError(
+             "eQTL validation failed:\n - " + "\n - ".join(errors)
+         )
+
+     # Check data types and ranges
+     if not pd.api.types.is_numeric_dtype(df["pos"]):
+         errors.append(f"Column 'pos' must be numeric, got {df['pos'].dtype}")
+     elif (df["pos"] <= 0).any():
+         n_invalid = (df["pos"] <= 0).sum()
+         errors.append(f"Column 'pos' has {n_invalid} non-positive values")
+
+     if not pd.api.types.is_numeric_dtype(df["p_value"]):
+         errors.append(f"Column 'p_value' must be numeric, got {df['p_value'].dtype}")
+     elif ((df["p_value"] <= 0) | (df["p_value"] > 1)).any():
+         n_invalid = ((df["p_value"] <= 0) | (df["p_value"] > 1)).sum()
+         errors.append(f"Column 'p_value' has {n_invalid} values outside range (0, 1]")
+
+     if errors:
+         raise LoaderValidationError(
+             "eQTL validation failed:\n - " + "\n - ".join(errors)
+         )
+
+     return df
+
+
+ # =============================================================================
+ # Fine-mapping Validation
+ # =============================================================================
+
+
+ class FinemappingRowModel(BaseModel):
+     """Validation model for a single fine-mapping row."""
+
+     model_config = ConfigDict(extra="allow")
+
+     pos: int
+     pip: float
+     cs: Optional[int] = None
+
+     @field_validator("pos")
+     @classmethod
+     def position_positive(cls, v: int) -> int:
+         """Position must be positive."""
+         if v <= 0:
+             raise ValueError(f"Position must be positive, got {v}")
+         return v
+
+     @field_validator("pip")
+     @classmethod
+     def pip_in_range(cls, v: float) -> float:
+         """PIP must be between 0 and 1."""
+         if not (0 <= v <= 1):
+             raise ValueError(f"PIP must be in range [0, 1], got {v}")
+         return v
+
+
+ def validate_finemapping_dataframe(
+     df: pd.DataFrame,
+     cs_col: str = "cs",
+     strict: bool = False,
+ ) -> pd.DataFrame:
+     """Validate a fine-mapping DataFrame.
+
+     Args:
+         df: DataFrame to validate.
+         cs_col: Column name for credible set.
+         strict: If True, validate every row.
+
+     Returns:
+         Validated DataFrame.
+
+     Raises:
+         LoaderValidationError: If validation fails.
+     """
+     errors = []
+
+     # Check required columns
+     if "pos" not in df.columns:
+         errors.append("Missing required column: 'pos'")
+     if "pip" not in df.columns:
+         errors.append("Missing required column: 'pip'")
+
+     if errors:
+         raise LoaderValidationError(
+             "Fine-mapping validation failed:\n - " + "\n - ".join(errors)
+         )
+
+     # Check data types and ranges
+     if not pd.api.types.is_numeric_dtype(df["pos"]):
+         errors.append(f"Column 'pos' must be numeric, got {df['pos'].dtype}")
+     elif (df["pos"] <= 0).any():
+         n_invalid = (df["pos"] <= 0).sum()
+         errors.append(f"Column 'pos' has {n_invalid} non-positive values")
+
+     if not pd.api.types.is_numeric_dtype(df["pip"]):
+         errors.append(f"Column 'pip' must be numeric, got {df['pip'].dtype}")
+     elif ((df["pip"] < 0) | (df["pip"] > 1)).any():
+         n_invalid = ((df["pip"] < 0) | (df["pip"] > 1)).sum()
+         errors.append(f"Column 'pip' has {n_invalid} values outside range [0, 1]")
+
+     if errors:
+         raise LoaderValidationError(
+             "Fine-mapping validation failed:\n - " + "\n - ".join(errors)
+         )
+
+     return df
+
+
+ # =============================================================================
+ # Gene Annotation Validation
+ # =============================================================================
+
+
+ class GeneRowModel(BaseModel):
+     """Validation model for a single gene annotation row."""
+
+     model_config = ConfigDict(extra="allow")
+
+     chr: Union[str, int]
+     start: int
+     end: int
+     gene_name: str
+     strand: Optional[str] = None
+
+     @field_validator("start", "end")
+     @classmethod
+     def position_positive(cls, v: int) -> int:
+         """Position must be positive."""
+         if v < 0:
+             raise ValueError(f"Position must be non-negative, got {v}")
+         return v
+
+     @model_validator(mode="after")
+     def start_before_end(self):
+         """Start must be <= end."""
+         if self.start > self.end:
+             raise ValueError(f"Start ({self.start}) must be <= end ({self.end})")
+         return self
+
+
+ def validate_genes_dataframe(
+     df: pd.DataFrame,
+     strict: bool = False,
+ ) -> pd.DataFrame:
+     """Validate a genes DataFrame.
+
+     Args:
+         df: DataFrame to validate.
+         strict: If True, validate every row.
+
+     Returns:
+         Validated DataFrame.
+
+     Raises:
+         LoaderValidationError: If validation fails.
+     """
+     errors = []
+
+     # Check required columns
+     required = ["chr", "start", "end", "gene_name"]
+     for col in required:
+         if col not in df.columns:
+             errors.append(f"Missing required column: '{col}'")
+
+     if errors:
+         raise LoaderValidationError(
+             "Gene annotation validation failed:\n - " + "\n - ".join(errors)
+         )
+
+     # Check data types
+     start_is_numeric = pd.api.types.is_numeric_dtype(df["start"])
+     end_is_numeric = pd.api.types.is_numeric_dtype(df["end"])
+
+     if not start_is_numeric:
+         errors.append(f"Column 'start' must be numeric, got {df['start'].dtype}")
+
+     if not end_is_numeric:
+         errors.append(f"Column 'end' must be numeric, got {df['end'].dtype}")
+
+     # Only check ranges if columns are numeric (avoid confusing errors)
+     if start_is_numeric:
+         if (df["start"] < 0).any():
+             n_invalid = (df["start"] < 0).sum()
+             errors.append(f"Column 'start' has {n_invalid} negative values")
+
+     if start_is_numeric and end_is_numeric:
+         if (df["end"] < df["start"]).any():
+             n_invalid = (df["end"] < df["start"]).sum()
+             errors.append(f"Found {n_invalid} genes where end < start")
+
+     if errors:
+         raise LoaderValidationError(
+             "Gene annotation validation failed:\n - " + "\n - ".join(errors)
+         )
+
+     return df
+
+
+ # =============================================================================
+ # File Path Validation
+ # =============================================================================
+
+
+ def validate_file_path(filepath: Union[str, Path]) -> Path:
+     """Validate that a file path exists and is readable.
+
+     Args:
+         filepath: Path to validate.
+
+     Returns:
+         Validated Path object.
+
+     Raises:
+         LoaderValidationError: If file doesn't exist or isn't readable.
+     """
+     path = Path(filepath)
+
+     if not path.exists():
+         raise LoaderValidationError(f"File not found: {path}")
+
+     if not path.is_file():
+         raise LoaderValidationError(f"Not a file: {path}")
+
+     return path
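
Taken together, schemas.py gives each loader a fast, vectorised schema check plus per-row Pydantic models for callers that want record-level errors (the strict flag is accepted by every validator, though the bodies shown here perform column-level checks only). A short usage sketch against the API defined above, with a made-up DataFrame:

    import pandas as pd

    from pylocuszoom.schemas import GWASRowModel, LoaderValidationError, validate_gwas_dataframe

    # Toy GWAS table with one out-of-range p-value to show the error path.
    df = pd.DataFrame({"rs": ["rs1", "rs2"], "ps": [101, 202], "p_wald": [0.03, 1.7]})

    try:
        validate_gwas_dataframe(df)  # column presence, dtype, and range checks
    except LoaderValidationError as err:
        print(err)  # lists every failed check, e.g. values outside (0, 1]

    # Row-level validation via the Pydantic model.
    row = GWASRowModel(ps=101, p_wald=0.03, rs="rs1", chr=5)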