pylocuszoom 0.5.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,14 +9,16 @@ Provides:
 import os
 import tarfile
 import tempfile
-import urllib.request
 from pathlib import Path
 from typing import Optional
 
 import pandas as pd
+import requests
 from matplotlib.axes import Axes
+from tqdm import tqdm
 
 from .logging import logger
+from .utils import filter_by_region
 
 # Recombination overlay color
 RECOMB_COLOR = "#7FCDFF"  # Light blue
@@ -54,6 +56,38 @@ def get_chain_file_path() -> Path:
     return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"
 
 
+def _download_with_progress(
+    url: str, dest_path: Path, desc: str = "Downloading"
+) -> None:
+    """Download a file with a progress bar.
+
+    Args:
+        url: URL to download from.
+        dest_path: Destination file path.
+        desc: Description for the progress bar.
+    """
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+
+    total_size = int(response.headers.get("content-length", 0))
+
+    with (
+        open(dest_path, "wb") as f,
+        tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            desc=desc,
+            disable=total_size == 0,  # Disable if size unknown
+        ) as pbar,
+    ):
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+                pbar.update(len(chunk))
+
+
 def download_liftover_chain(force: bool = False) -> Path:
     """Download the CanFam3 to CanFam4 liftover chain file.
 
@@ -73,20 +107,11 @@ def download_liftover_chain(force: bool = False) -> Path:
     logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
     logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")
 
-    try:
-        urllib.request.urlretrieve(CANFAM3_TO_CANFAM4_CHAIN_URL, chain_path)
-    except Exception as e:
-        logger.debug(f"urllib download failed: {e}")
-        try:
-            import requests
-
-            response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
-            response.raise_for_status()
-            chain_path.write_bytes(response.content)
-        except ImportError:
-            raise RuntimeError(
-                "Failed to download. Install requests: pip install requests"
-            )
+    _download_with_progress(
+        CANFAM3_TO_CANFAM4_CHAIN_URL,
+        chain_path,
+        desc="Liftover chain",
+    )
 
     logger.info(f"Chain file saved to: {chain_path}")
     return chain_path
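
The new helper streams the body in 8 KiB chunks instead of buffering it with `response.content`, so memory use stays flat for arbitrarily large files. A minimal sketch of the two patterns (URL and filename are illustrative, not from the package):

```python
import requests

url = "https://example.com/big.tar.gz"  # illustrative URL

# Buffered: the entire body is held in memory before writing.
data = requests.get(url, timeout=60).content

# Streamed: fixed-size chunks are written as they arrive.
with requests.get(url, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    with open("big.tar.gz", "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
```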
@@ -217,31 +242,31 @@ def download_canine_recombination_maps(
     logger.debug(f"Source: {CANINE_RECOMB_URL}")
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        # Download tar.gz file
+        # Download tar.gz file with progress bar
         tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"
 
-        try:
-            urllib.request.urlretrieve(CANINE_RECOMB_URL, tar_path)
-        except Exception as e:
-            logger.debug(f"urllib download failed: {e}")
-            logger.debug("Trying alternative method with requests...")
-            try:
-                import requests
-
-                response = requests.get(CANINE_RECOMB_URL, timeout=60)
-                response.raise_for_status()
-                tar_path.write_bytes(response.content)
-            except ImportError:
-                raise RuntimeError(
-                    "Failed to download. Install requests: pip install requests"
-                )
+        _download_with_progress(
+            CANINE_RECOMB_URL,
+            tar_path,
+            desc="Recombination maps",
+        )
 
         logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")
 
-        # Extract tar.gz
+        # Extract tar.gz with path traversal protection
         logger.debug("Extracting genetic maps...")
         with tarfile.open(tar_path, "r:gz") as tar:
-            tar.extractall(tmpdir)
+            # Filter to prevent path traversal attacks
+            safe_members = []
+            for member in tar.getmembers():
+                # Resolve the path and ensure it stays within tmpdir
+                member_path = Path(tmpdir) / member.name
+                try:
+                    member_path.resolve().relative_to(Path(tmpdir).resolve())
+                    safe_members.append(member)
+                except ValueError:
+                    logger.warning(f"Skipping unsafe path in archive: {member.name}")
+            tar.extractall(tmpdir, members=safe_members)
 
         # Find and process the extracted files
         extracted_dir = Path(tmpdir)
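
The manual member check mirrors what newer interpreters provide natively: PEP 706 extraction filters. Where the `filter` argument is available (Python 3.12, plus security backports to earlier series), the same protection is one line; a sketch assuming that argument exists on the running interpreter:

```python
import tarfile

# The "data" filter rejects absolute paths, parent-directory escapes, and
# other unsafe members before extraction (PEP 706).
with tarfile.open("dog_genetic_maps.tar.gz", "r:gz") as tar:
    tar.extractall("dest", filter="data")
```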
@@ -360,7 +385,12 @@ def get_recombination_rate_for_region(
     )
 
     # Filter to region
-    region_df = df[(df["pos"] >= start) & (df["pos"] <= end)].copy()
+    region_df = filter_by_region(
+        df,
+        region=(chrom, start, end),
+        chrom_col="",  # Recomb maps don't have chromosome column
+        pos_col="pos",
+    )
 
     return region_df[["pos", "rate"]]
 
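Passing `chrom_col=""` works because an empty string is never a real column name, so `filter_by_region` skips the chromosome mask and filters on position alone. A small self-contained check of that behavior (toy data, inclusive bounds):

```python
import pandas as pd

from pylocuszoom.utils import filter_by_region

# Per-chromosome recombination maps carry no chromosome column, so the
# empty chrom_col forces position-only filtering.
recomb = pd.DataFrame({"pos": [10, 150, 1000], "rate": [0.1, 0.5, 0.2]})
subset = filter_by_region(recomb, region=("1", 100, 1000), chrom_col="", pos_col="pos")
assert list(subset["pos"]) == [150, 1000]  # end bound is inclusive
```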
pylocuszoom/schemas.py CHANGED
@@ -84,30 +84,36 @@ def validate_gwas_dataframe(
             "GWAS validation failed:\n - " + "\n - ".join(errors)
         )
 
-    # Check data types
-    if not pd.api.types.is_numeric_dtype(df[pos_col]):
+    # Check data types (must be numeric for range checks)
+    pos_is_numeric = pd.api.types.is_numeric_dtype(df[pos_col])
+    p_is_numeric = pd.api.types.is_numeric_dtype(df[p_col])
+
+    if not pos_is_numeric:
         errors.append(f"Column '{pos_col}' must be numeric, got {df[pos_col].dtype}")
 
-    if not pd.api.types.is_numeric_dtype(df[p_col]):
+    if not p_is_numeric:
         errors.append(f"Column '{p_col}' must be numeric, got {df[p_col].dtype}")
 
-    # Check value ranges
-    if (df[pos_col] <= 0).any():
-        n_invalid = (df[pos_col] <= 0).sum()
-        errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
+    # Only check value ranges if columns are numeric (avoid confusing errors)
+    if pos_is_numeric:
+        if (df[pos_col] <= 0).any():
+            n_invalid = (df[pos_col] <= 0).sum()
+            errors.append(f"Column '{pos_col}' has {n_invalid} non-positive values")
 
-    if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
-        n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
-        errors.append(f"Column '{p_col}' has {n_invalid} values outside range (0, 1]")
+        if df[pos_col].isna().any():
+            n_na = df[pos_col].isna().sum()
+            errors.append(f"Column '{pos_col}' has {n_na} missing values")
 
-    # Check for NaN in required columns
-    if df[pos_col].isna().any():
-        n_na = df[pos_col].isna().sum()
-        errors.append(f"Column '{pos_col}' has {n_na} missing values")
+    if p_is_numeric:
+        if ((df[p_col] <= 0) | (df[p_col] > 1)).any():
+            n_invalid = ((df[p_col] <= 0) | (df[p_col] > 1)).sum()
+            errors.append(
+                f"Column '{p_col}' has {n_invalid} values outside range (0, 1]"
+            )
 
-    if df[p_col].isna().any():
-        n_na = df[p_col].isna().sum()
-        errors.append(f"Column '{p_col}' has {n_na} missing values")
+        if df[p_col].isna().any():
+            n_na = df[p_col].isna().sum()
+            errors.append(f"Column '{p_col}' has {n_na} missing values")
 
     if errors:
         raise LoaderValidationError(
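
Gating the range and NaN checks behind the dtype checks matters because comparing an object-dtype column against a number raises rather than returning a mask, which would abort validation with an unrelated traceback instead of the accumulated error list. A quick demonstration of the failure mode being avoided (toy data):

```python
import pandas as pd

# p-values read from a malformed file can arrive as strings.
df = pd.DataFrame({"pos": [100, 200], "p": ["0.01", "0.5"]})

try:
    (df["p"] <= 0).any()  # object dtype vs. int comparison
except TypeError as exc:
    print(f"unguarded range check raises: {exc}")
```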
@@ -344,20 +350,25 @@ def validate_genes_dataframe(
     )
 
     # Check data types
-    if not pd.api.types.is_numeric_dtype(df["start"]):
+    start_is_numeric = pd.api.types.is_numeric_dtype(df["start"])
+    end_is_numeric = pd.api.types.is_numeric_dtype(df["end"])
+
+    if not start_is_numeric:
         errors.append(f"Column 'start' must be numeric, got {df['start'].dtype}")
 
-    if not pd.api.types.is_numeric_dtype(df["end"]):
+    if not end_is_numeric:
         errors.append(f"Column 'end' must be numeric, got {df['end'].dtype}")
 
-    # Check ranges
-    if (df["start"] < 0).any():
-        n_invalid = (df["start"] < 0).sum()
-        errors.append(f"Column 'start' has {n_invalid} negative values")
+    # Only check ranges if columns are numeric (avoid confusing errors)
+    if start_is_numeric:
+        if (df["start"] < 0).any():
+            n_invalid = (df["start"] < 0).sum()
+            errors.append(f"Column 'start' has {n_invalid} negative values")
 
-    if (df["end"] < df["start"]).any():
-        n_invalid = (df["end"] < df["start"]).sum()
-        errors.append(f"Found {n_invalid} genes where end < start")
+    if start_is_numeric and end_is_numeric:
+        if (df["end"] < df["start"]).any():
+            n_invalid = (df["end"] < df["start"]).sum()
+            errors.append(f"Found {n_invalid} genes where end < start")
 
     if errors:
         raise LoaderValidationError(
pylocuszoom/utils.py CHANGED
@@ -106,6 +106,58 @@ def normalize_chrom(chrom: Union[int, str]) -> str:
     return str(chrom).replace("chr", "")
 
 
+def filter_by_region(
+    df: pd.DataFrame,
+    region: tuple,
+    chrom_col: str = "chrom",
+    pos_col: str = "pos",
+) -> pd.DataFrame:
+    """Filter DataFrame to a genomic region with inclusive bounds.
+
+    Filters rows where position is within [start, end] (inclusive).
+    If chrom_col exists in the DataFrame, also filters by chromosome.
+    Chromosome comparison normalizes types (int/str, "chr" prefix).
+
+    Args:
+        df: DataFrame to filter.
+        region: Tuple of (chrom, start, end) defining the region.
+        chrom_col: Column name for chromosome (default: "chrom").
+            If the column doesn't exist, filters by position only.
+        pos_col: Column name for position (default: "pos").
+
+    Returns:
+        Filtered DataFrame (copy, not view).
+
+    Raises:
+        KeyError: If pos_col is not found in the DataFrame.
+
+    Example:
+        >>> filtered = filter_by_region(df, region=(1, 1000000, 2000000))
+        >>> filtered = filter_by_region(df, region=("chr1", 1e6, 2e6), pos_col="position")
+    """
+    chrom, start, end = region
+
+    # Validate position column exists
+    if pos_col not in df.columns:
+        raise KeyError(
+            f"Position column '{pos_col}' not found in DataFrame. "
+            f"Available columns: {list(df.columns)}"
+        )
+
+    # Position filtering (inclusive bounds)
+    mask = (df[pos_col] >= start) & (df[pos_col] <= end)
+
+    # Chromosome filtering (if column exists)
+    if chrom_col in df.columns:
+        chrom_normalized = normalize_chrom(chrom)
+        df_chrom_normalized = (
+            df[chrom_col].astype(str).str.replace("chr", "", regex=False)
+        )
+        mask = mask & (df_chrom_normalized == chrom_normalized)
+
+    return df[mask].copy()
+
+
 def validate_dataframe(
     df: pd.DataFrame,
     required_cols: List[str],
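
The chromosome normalization means callers don't need to match the frame's representation: integers, strings, and `chr`-prefixed names all compare equal after the prefix is stripped. A short usage example with toy data:

```python
import pandas as pd

from pylocuszoom.utils import filter_by_region

# "chr1" in the frame matches the integer 1 in the region tuple.
df = pd.DataFrame({"chrom": ["chr1", "chr1", "chr2"], "pos": [5, 50, 50]})
hits = filter_by_region(df, region=(1, 10, 100))
assert hits["pos"].tolist() == [50] and hits["chrom"].tolist() == ["chr1"]
```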
@@ -0,0 +1,172 @@
+"""DataFrame validation builder for pyLocusZoom.
+
+Provides a fluent API for validating pandas DataFrames with composable
+validation rules. Accumulates all validation errors before raising.
+"""
+
+from typing import List, Optional
+
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+
+from .utils import ValidationError
+
+
+class DataFrameValidator:
+    """Builder for composable DataFrame validation.
+
+    Validates DataFrames with method chaining and accumulates all errors
+    before raising. This enables clear, readable validation code with
+    comprehensive error messages.
+
+    Example:
+        >>> (DataFrameValidator(df, name="gwas_df")
+        ...     .require_columns(["chr", "pos", "p"])
+        ...     .require_numeric(["pos", "p"])
+        ...     .require_range("p", min_val=0, max_val=1)
+        ...     .validate())
+    """
+
+    def __init__(self, df: pd.DataFrame, name: str = "DataFrame"):
+        """Initialize validator.
+
+        Args:
+            df: DataFrame to validate.
+            name: Name for error messages (e.g., "gwas_df", "genes_df").
+        """
+        self._df = df
+        self._name = name
+        self._errors: List[str] = []
+
+    def require_columns(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that required columns exist in the DataFrame.
+
+        Args:
+            columns: List of required column names.
+
+        Returns:
+            Self for method chaining.
+        """
+        if not columns:
+            return self
+
+        missing = [col for col in columns if col not in self._df.columns]
+        if missing:
+            available = list(self._df.columns)
+            self._errors.append(f"Missing columns: {missing}. Available: {available}")
+
+        return self
+
+    def require_numeric(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that columns have numeric dtype.
+
+        Skips columns that don't exist (checked separately by require_columns).
+
+        Args:
+            columns: List of column names that should be numeric.
+
+        Returns:
+            Self for method chaining.
+        """
+        for col in columns:
+            # Skip missing columns - let require_columns handle that
+            if col not in self._df.columns:
+                continue
+
+            if not is_numeric_dtype(self._df[col]):
+                actual_dtype = self._df[col].dtype
+                self._errors.append(
+                    f"Column '{col}' must be numeric, got {actual_dtype}"
+                )
+
+        return self
+
+    def require_range(
+        self,
+        column: str,
+        min_val: Optional[float] = None,
+        max_val: Optional[float] = None,
+        exclusive_min: bool = False,
+        exclusive_max: bool = False,
+    ) -> "DataFrameValidator":
+        """Check that column values are within the specified range.
+
+        Args:
+            column: Column name to check.
+            min_val: Minimum allowed value (inclusive by default).
+            max_val: Maximum allowed value (inclusive by default).
+            exclusive_min: If True, minimum is exclusive (values must be > min_val).
+            exclusive_max: If True, maximum is exclusive (values must be < max_val).
+
+        Returns:
+            Self for method chaining.
+        """
+        # Skip missing columns
+        if column not in self._df.columns:
+            return self
+
+        col_data = self._df[column]
+
+        # Check minimum bound
+        if min_val is not None:
+            if exclusive_min:
+                invalid_count = (col_data <= min_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values <= {min_val}"
+                    )
+            else:
+                invalid_count = (col_data < min_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values < {min_val}"
+                    )
+
+        # Check maximum bound
+        if max_val is not None:
+            if exclusive_max:
+                invalid_count = (col_data >= max_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values >= {max_val}"
+                    )
+            else:
+                invalid_count = (col_data > max_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values > {max_val}"
+                    )
+
+        return self
+
+    def require_not_null(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that columns have no null (NaN or None) values.
+
+        Args:
+            columns: List of column names to check for nulls.
+
+        Returns:
+            Self for method chaining.
+        """
+        for col in columns:
+            # Skip missing columns
+            if col not in self._df.columns:
+                continue
+
+            null_count = self._df[col].isna().sum()
+            if null_count > 0:
+                self._errors.append(f"Column '{col}' has {null_count} null values")
+
+        return self
+
+    def validate(self) -> None:
+        """Raise ValidationError if any validation rules failed.
+
+        Raises:
+            ValidationError: If any validation errors were accumulated.
+                The error message includes all accumulated errors.
+        """
+        if self._errors:
+            error_msg = f"{self._name} validation failed:\n"
+            error_msg += "\n".join(f" - {error}" for error in self._errors)
+            raise ValidationError(error_msg)
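
The builder lets per-schema validators be expressed declaratively. A sketch of how the GWAS checks might look on top of it (the new module's import path isn't shown in this diff, so the import below is hypothetical):

```python
import pandas as pd

from pylocuszoom.validators import DataFrameValidator  # hypothetical path

df = pd.DataFrame({"chr": [1, 1], "pos": [100, 200], "p": [0.01, 0.5]})

(
    DataFrameValidator(df, name="gwas_df")
    .require_columns(["chr", "pos", "p"])
    .require_numeric(["pos", "p"])
    .require_range("pos", min_val=0, exclusive_min=True)  # positions must be > 0
    .require_range("p", min_val=0, max_val=1, exclusive_min=True)  # p in (0, 1]
    .require_not_null(["pos", "p"])
    .validate()  # raises ValidationError listing every accumulated failure
)
```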