pywombat 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/cli.py CHANGED
@@ -40,6 +40,11 @@ import yaml
40
40
  type=click.Path(exists=True, path_type=Path),
41
41
  help="Filter configuration YAML file to apply quality and impact filters.",
42
42
  )
43
+ @click.option(
44
+ "--debug",
45
+ type=str,
46
+ help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013). Displays #CHROM, POS, VEP_SYMBOL, and columns from filter expression.",
47
+ )
43
48
  def cli(
44
49
  input_file: Path,
45
50
  output: Optional[str],
@@ -47,6 +52,7 @@ def cli(
47
52
  verbose: bool,
48
53
  pedigree: Optional[Path],
49
54
  filter_config: Optional[Path],
55
+ debug: Optional[str],
50
56
  ):
51
57
  """
52
58
  Wombat: A tool for processing bcftools tabulated TSV files.
@@ -94,6 +100,11 @@ def cli(
94
100
  click.echo(f"Reading filter config: {filter_config}", err=True)
95
101
  filter_config_data = load_filter_config(filter_config)
96
102
 
103
+ # Debug mode: show specific variant
104
+ if debug:
105
+ debug_variant(input_file, pedigree_df, filter_config_data, debug, verbose)
106
+ return
107
+
97
108
  # Determine output prefix
98
109
  if output is None:
99
110
  # Generate default output prefix from input filename
@@ -116,7 +127,20 @@ def cli(
116
127
  click.echo("Processing with streaming mode...", err=True)
117
128
 
118
129
  # Build lazy query
119
- lazy_df = pl.scan_csv(input_file, separator="\t")
130
+ # Force certain columns to string type
131
+ string_columns = [
132
+ "FID",
133
+ "sample_id",
134
+ "father_id",
135
+ "mother_id",
136
+ "FatherBarcode",
137
+ "MotherBarcode",
138
+ "sample",
139
+ ]
140
+ schema_overrides = {col: pl.Utf8 for col in string_columns}
141
+ lazy_df = pl.scan_csv(
142
+ input_file, separator="\t", schema_overrides=schema_overrides
143
+ )
120
144
 
121
145
  # Apply formatting transformations
122
146
  lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
@@ -147,6 +171,106 @@ def cli(
147
171
  raise click.Abort()
148
172
 
149
173
 
174
def debug_variant(
    input_file: Path,
    pedigree_df: Optional[pl.DataFrame],
    filter_config: Optional[dict],
    debug_pos: str,
    verbose: bool,
):
    """Debug mode: display rows matching a specific chrom:pos.

    Reads the input TSV, applies the standard formatting, then prints every
    row whose #CHROM and POS match ``debug_pos`` (format ``chrom:pos``, e.g.
    ``chr11:70486013``). Displayed columns are #CHROM, POS, VEP_SYMBOL (when
    present) and any column referenced by the filter expression.

    Args:
        input_file: bcftools tabulated TSV file to scan.
        pedigree_df: optional pedigree table forwarded to the formatter.
        filter_config: optional parsed filter YAML; its "expression" value is
            mined for column names to display.
        debug_pos: target position as "chrom:pos".
        verbose: when True, progress messages are echoed to stderr.

    Raises:
        click.Abort: when ``debug_pos`` is malformed.
    """
    chrom, pos = _parse_debug_position(debug_pos)

    if verbose:
        click.echo(f"Debug mode: searching for {chrom}:{pos}", err=True)

    # Read and format the data.
    # Force ID-like columns to string so numeric-looking barcodes are not
    # inferred as integers (mirrors the main CLI path).
    string_columns = [
        "FID",
        "sample_id",
        "father_id",
        "mother_id",
        "FatherBarcode",
        "MotherBarcode",
        "sample",
    ]
    schema_overrides = {col: pl.Utf8 for col in string_columns}
    df = pl.read_csv(input_file, separator="\t", schema_overrides=schema_overrides)
    formatted_df = format_bcftools_tsv(df, pedigree_df)

    # Filter to matching rows.
    matching_rows = formatted_df.filter(
        (pl.col("#CHROM") == chrom) & (pl.col("POS") == pos)
    )

    if matching_rows.height == 0:
        click.echo(f"No rows found matching {chrom}:{pos}", err=True)
        return

    # Determine which columns to display.
    columns_to_show = ["#CHROM", "POS"]
    if "VEP_SYMBOL" in matching_rows.columns:
        columns_to_show.append("VEP_SYMBOL")

    # Extract column names from the filter expression, if provided:
    # identifiers immediately followed by a comparison operator.
    if filter_config and "expression" in filter_config:
        expression = filter_config["expression"]
        column_pattern = r"\b([A-Za-z_][A-Za-z0-9_]*)\b\s*[=!<>]"
        for col in re.findall(column_pattern, expression):
            if col in matching_rows.columns and col not in columns_to_show:
                columns_to_show.append(col)

    # Select only the columns we want to display.
    display_df = matching_rows.select(
        [col for col in columns_to_show if col in matching_rows.columns]
    )

    # Render null/NaN explicitly for display; a single with_columns pass
    # instead of rebuilding the frame once per column.
    display_df = display_df.with_columns(
        [_display_expr(col, display_df[col].dtype) for col in display_df.columns]
    )

    # Display the results.
    click.echo(f"\nFound {matching_rows.height} row(s) matching {chrom}:{pos}:\n")
    click.echo(display_df.write_csv(separator="\t"))


def _parse_debug_position(debug_pos: str):
    """Split and validate a 'chrom:pos' string; abort the CLI on bad input."""
    if ":" not in debug_pos:
        click.echo(
            "Error: Debug position must be in format 'chrom:pos' (e.g., chr11:70486013)",
            err=True,
        )
        raise click.Abort()

    chrom, pos_str = debug_pos.split(":", 1)
    try:
        return chrom, int(pos_str)
    except ValueError:
        click.echo(f"Error: Position must be an integer, got '{pos_str}'", err=True)
        # Suppress the ValueError context: the echoed message is the real error.
        raise click.Abort() from None


def _display_expr(col: str, dtype) -> "pl.Expr":
    """Build an expression casting *col* to Utf8 with '<null>'/'<NaN>' markers."""
    expr = pl.when(pl.col(col).is_null()).then(pl.lit("<null>"))
    if dtype in (pl.Float32, pl.Float64):
        # Float columns can also carry NaN, which polars treats as distinct
        # from null; map it to its own placeholder.
        expr = expr.when(pl.col(col).is_nan()).then(pl.lit("<NaN>"))
    return expr.otherwise(pl.col(col).cast(pl.Utf8)).alias(col)
+
150
274
  def load_filter_config(config_path: Path) -> dict:
151
275
  """Load and parse filter configuration from YAML file."""
152
276
  with open(config_path, "r") as f:
@@ -366,6 +490,30 @@ def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr
366
490
  if col_name not in df.columns:
367
491
  raise ValueError(f"Column '{col_name}' not found in dataframe")
368
492
 
493
+ # Check for null value
494
+ if value.upper() == "NULL":
495
+ col_expr = pl.col(col_name)
496
+ if op == "=":
497
+ return col_expr.is_null()
498
+ elif op == "!=":
499
+ return ~col_expr.is_null()
500
+ else:
501
+ raise ValueError(
502
+ f"Operator '{op}' not supported for null comparison, use = or !="
503
+ )
504
+
505
+ # Check for NaN value
506
+ if value.upper() == "NAN":
507
+ col_expr = pl.col(col_name).cast(pl.Float64, strict=False)
508
+ if op == "=":
509
+ return col_expr.is_nan()
510
+ elif op == "!=":
511
+ return ~col_expr.is_nan()
512
+ else:
513
+ raise ValueError(
514
+ f"Operator '{op}' not supported for NaN comparison, use = or !="
515
+ )
516
+
369
517
  # Try to convert value to number, otherwise treat as string
370
518
  try:
371
519
  value_num = float(value)
@@ -647,7 +795,18 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
647
795
  DataFrame with columns: sample_id, father_id, mother_id
648
796
  """
649
797
  # Try reading with header first
650
- df = pl.read_csv(pedigree_path, separator="\t")
798
+ # Force certain columns to string type
799
+ string_columns = [
800
+ "FID",
801
+ "sample_id",
802
+ "father_id",
803
+ "mother_id",
804
+ "FatherBarcode",
805
+ "MotherBarcode",
806
+ "sample",
807
+ ]
808
+ schema_overrides = {col: pl.Utf8 for col in string_columns}
809
+ df = pl.read_csv(pedigree_path, separator="\t", schema_overrides=schema_overrides)
651
810
 
652
811
  # Check if first row has 'FID' in first column (indicates header)
653
812
  if df.columns[0] == "FID" or "sample_id" in df.columns:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pywombat
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
5
5
  Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
6
6
  Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -0,0 +1,6 @@
1
+ pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
2
+ pywombat/cli.py,sha256=0nBlwyRu1Q01a0EHcVyIYtKmgezCWA85pQtEXpnuzL4,44535
3
+ pywombat-0.5.0.dist-info/METADATA,sha256=2Py8xwNxZBD18u4r-tJI_mQezMBg4td3ruWOm61MbdA,4982
4
+ pywombat-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
+ pywombat-0.5.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
6
+ pywombat-0.5.0.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
2
- pywombat/cli.py,sha256=kv03IFXcwe9pdv-KyoT5Cu1pJ9r-O7ww-Kh0ZT2ysa4,38920
3
- pywombat-0.3.0.dist-info/METADATA,sha256=eASint-XgzgUGWshtZYr4nekDCs-VKSTilHLRupH5ic,4982
4
- pywombat-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
- pywombat-0.3.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
6
- pywombat-0.3.0.dist-info/RECORD,,