pywombat 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/cli.py CHANGED
@@ -40,6 +40,11 @@ import yaml
40
40
  type=click.Path(exists=True, path_type=Path),
41
41
  help="Filter configuration YAML file to apply quality and impact filters.",
42
42
  )
43
+ @click.option(
44
+ "--debug",
45
+ type=str,
46
+ help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013). Displays #CHROM, POS, VEP_SYMBOL, and columns from filter expression.",
47
+ )
43
48
  def cli(
44
49
  input_file: Path,
45
50
  output: Optional[str],
@@ -47,6 +52,7 @@ def cli(
47
52
  verbose: bool,
48
53
  pedigree: Optional[Path],
49
54
  filter_config: Optional[Path],
55
+ debug: Optional[str],
50
56
  ):
51
57
  """
52
58
  Wombat: A tool for processing bcftools tabulated TSV files.
@@ -80,14 +86,6 @@ def cli(
80
86
  if verbose and is_gzipped:
81
87
  click.echo("Detected gzipped file", err=True)
82
88
 
83
- # Read the TSV file (handles both plain and gzipped)
84
- df = pl.read_csv(input_file, separator="\t")
85
-
86
- if verbose:
87
- click.echo(
88
- f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns", err=True
89
- )
90
-
91
89
  # Read pedigree file if provided
92
90
  pedigree_df = None
93
91
  if pedigree:
@@ -95,22 +93,18 @@ def cli(
95
93
  click.echo(f"Reading pedigree file: {pedigree}", err=True)
96
94
  pedigree_df = read_pedigree(pedigree)
97
95
 
98
- # Process the dataframe
99
- formatted_df = format_bcftools_tsv(df, pedigree_df)
100
-
101
- if verbose:
102
- click.echo(
103
- f"Output shape: {formatted_df.shape[0]} rows, {formatted_df.shape[1]} columns",
104
- err=True,
105
- )
106
-
107
- # Apply filters if provided
96
+ # Load filter config if provided
108
97
  filter_config_data = None
109
98
  if filter_config:
110
99
  if verbose:
111
100
  click.echo(f"Reading filter config: {filter_config}", err=True)
112
101
  filter_config_data = load_filter_config(filter_config)
113
102
 
103
+ # Debug mode: show specific variant
104
+ if debug:
105
+ debug_variant(input_file, pedigree_df, filter_config_data, debug, verbose)
106
+ return
107
+
114
108
  # Determine output prefix
115
109
  if output is None:
116
110
  # Generate default output prefix from input filename
@@ -128,36 +122,131 @@ def cli(
128
122
  else:
129
123
  output = input_stem
130
124
 
131
- # Apply filters and write output
125
+ # Use streaming approach with lazy API
126
+ if verbose:
127
+ click.echo("Processing with streaming mode...", err=True)
128
+
129
+ # Build lazy query
130
+ lazy_df = pl.scan_csv(input_file, separator="\t")
131
+
132
+ # Apply formatting transformations
133
+ lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
134
+
135
+ # Apply filters if provided
132
136
  if filter_config_data:
133
- apply_filters_and_write(
134
- formatted_df,
135
- filter_config_data,
136
- output,
137
- output_format,
138
- verbose,
139
- )
140
- else:
141
- # No filters - write single output file
142
- # Construct output filename with prefix and format
143
- output_path = Path(f"{output}.{output_format}")
137
+ lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)
144
138
 
145
- if output_format == "tsv":
146
- formatted_df.write_csv(output_path, separator="\t")
147
- elif output_format == "tsv.gz":
148
- csv_content = formatted_df.write_csv(separator="\t")
149
- with gzip.open(output_path, "wt") as f:
150
- f.write(csv_content)
151
- elif output_format == "parquet":
152
- formatted_df.write_parquet(output_path)
139
+ # Write output
140
+ output_path = Path(f"{output}.{output_format}")
153
141
 
154
- click.echo(f"Formatted data written to {output_path}", err=True)
142
+ if output_format == "tsv":
143
+ lazy_df.sink_csv(output_path, separator="\t")
144
+ elif output_format == "tsv.gz":
145
+ # For gzip, we need to collect and write
146
+ df = lazy_df.collect()
147
+ csv_content = df.write_csv(separator="\t")
148
+ with gzip.open(output_path, "wt") as f:
149
+ f.write(csv_content)
150
+ elif output_format == "parquet":
151
+ lazy_df.sink_parquet(output_path)
152
+
153
+ if verbose:
154
+ click.echo(f"Data written to {output_path}", err=True)
155
155
 
156
156
  except Exception as e:
157
157
  click.echo(f"Error: {e}", err=True)
158
158
  raise click.Abort()
159
159
 
160
160
 
161
def debug_variant(
    input_file: Path,
    pedigree_df: Optional[pl.DataFrame],
    filter_config: Optional[dict],
    debug_pos: str,
    verbose: bool,
):
    """Print the formatted rows located at a single ``chrom:pos``.

    The whole input is read eagerly with ``pl.read_csv`` and run through
    ``format_bcftools_tsv`` before it is narrowed to the requested position.
    The output always contains ``#CHROM`` and ``POS``, plus ``VEP_SYMBOL``
    when that column exists, plus every existing column whose name appears
    in front of a comparison operator in the configured filter expression.

    Args:
        input_file: Tab-separated input file.
        pedigree_df: Optional pedigree table forwarded to
            ``format_bcftools_tsv``.
        filter_config: Optional parsed filter configuration; only its
            ``"expression"`` entry is consulted, and it is ignored unless it
            is a non-empty string.
        debug_pos: Position in ``chrom:pos`` form (e.g. ``chr11:70486013``).
        verbose: When true, echo the parsed position to stderr.

    Raises:
        click.Abort: If ``debug_pos`` is not ``chrom:pos`` with a non-empty
            chromosome and an integer position.
    """
    # Split on the first ':' only; an empty separator means ':' was absent.
    chrom, separator, pos_text = debug_pos.partition(":")
    if not separator or not chrom:
        click.echo(
            "Error: Debug position must be in format 'chrom:pos' (e.g., chr11:70486013)",
            err=True,
        )
        raise click.Abort()

    try:
        pos = int(pos_text)
    except ValueError:
        click.echo(f"Error: Position must be an integer, got '{pos_text}'", err=True)
        # The ValueError has already been reported; drop it from the chain.
        raise click.Abort() from None

    if verbose:
        click.echo(f"Debug mode: searching for {chrom}:{pos}", err=True)

    # Read and format the data
    df = pl.read_csv(input_file, separator="\t")
    formatted_df = format_bcftools_tsv(df, pedigree_df)

    # Compare the chromosome as text so the lookup also works when the
    # column was inferred as an integer type (purely numeric contig names).
    matching_rows = formatted_df.filter(
        (pl.col("#CHROM").cast(pl.Utf8) == chrom) & (pl.col("POS") == pos)
    )

    if matching_rows.shape[0] == 0:
        click.echo(f"No rows found matching {chrom}:{pos}", err=True)
        return

    available = matching_rows.columns
    columns_to_show = ["#CHROM", "POS"]
    if "VEP_SYMBOL" in available:
        columns_to_show.append("VEP_SYMBOL")

    # A YAML key with an empty value loads as None; only scan real strings.
    expression = filter_config.get("expression") if filter_config else None
    if isinstance(expression, str) and expression:
        # Identifiers that are immediately followed by a comparison operator.
        column_pattern = r"\b([A-Za-z_][A-Za-z0-9_]*)\b\s*[=!<>]"
        for name in re.findall(column_pattern, expression):
            if name in available and name not in columns_to_show:
                columns_to_show.append(name)

    display_df = matching_rows.select(
        [name for name in columns_to_show if name in available]
    )

    # Render missing values explicitly: "<null>" for nulls and, on float
    # columns only, "<NaN>" for NaN. All columns are converted in one pass.
    display_exprs = []
    for name, dtype in display_df.schema.items():
        column = pl.col(name)
        if dtype in [pl.Float32, pl.Float64]:
            rendered = (
                pl.when(column.is_null())
                .then(pl.lit("<null>"))
                .when(column.is_nan())
                .then(pl.lit("<NaN>"))
                .otherwise(column.cast(pl.Utf8))
            )
        else:
            rendered = (
                pl.when(column.is_null())
                .then(pl.lit("<null>"))
                .otherwise(column.cast(pl.Utf8))
            )
        display_exprs.append(rendered.alias(name))
    display_df = display_df.with_columns(display_exprs)

    # Display the results
    click.echo(f"\nFound {matching_rows.shape[0]} row(s) matching {chrom}:{pos}:\n")
    click.echo(display_df.write_csv(separator="\t"))
248
+
249
+
161
250
  def load_filter_config(config_path: Path) -> dict:
162
251
  """Load and parse filter configuration from YAML file."""
163
252
  with open(config_path, "r") as f:
@@ -377,6 +466,30 @@ def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr
377
466
  if col_name not in df.columns:
378
467
  raise ValueError(f"Column '{col_name}' not found in dataframe")
379
468
 
469
+ # Check for null value
470
+ if value.upper() == "NULL":
471
+ col_expr = pl.col(col_name)
472
+ if op == "=":
473
+ return col_expr.is_null()
474
+ elif op == "!=":
475
+ return ~col_expr.is_null()
476
+ else:
477
+ raise ValueError(
478
+ f"Operator '{op}' not supported for null comparison, use = or !="
479
+ )
480
+
481
+ # Check for NaN value
482
+ if value.upper() == "NAN":
483
+ col_expr = pl.col(col_name).cast(pl.Float64, strict=False)
484
+ if op == "=":
485
+ return col_expr.is_nan()
486
+ elif op == "!=":
487
+ return ~col_expr.is_nan()
488
+ else:
489
+ raise ValueError(
490
+ f"Operator '{op}' not supported for NaN comparison, use = or !="
491
+ )
492
+
380
493
  # Try to convert value to number, otherwise treat as string
381
494
  try:
382
495
  value_num = float(value)
@@ -957,5 +1070,142 @@ def format_bcftools_tsv(
957
1070
  return melted_df
958
1071
 
959
1072
 
1073
def format_bcftools_tsv_lazy(
    lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
) -> pl.LazyFrame:
    """
    Lazy-frame wrapper around ``format_bcftools_tsv``.

    The lazy query is collected (with ``streaming=True``), the resulting
    DataFrame is formatted by ``format_bcftools_tsv`` together with the
    optional pedigree table, and the formatted DataFrame is handed back as a
    LazyFrame so callers can keep chaining lazy operations.
    """
    # format_bcftools_tsv works on a DataFrame, so the query is collected
    # here before formatting.
    eager_df = lazy_df.collect(streaming=True)
    return format_bcftools_tsv(eager_df, pedigree_df).lazy()
1086
+
1087
+
1088
def apply_filters_lazy(
    lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
) -> pl.LazyFrame:
    """Chain the configured quality and expression filters onto a lazy frame.

    Args:
        lazy_df: Frame to filter. The quality filters read ``sample_gt``,
            ``sample_dp``, ``sample_gq`` and ``sample_vaf``, and the
            ``father_*``/``mother_*`` depth and GQ columns when
            ``apply_to_parents`` is enabled.
        filter_config: Filter configuration; its ``"quality"`` mapping and
            ``"expression"`` entry are used.
        verbose: When true, echo the expression being applied to stderr.

    Returns:
        The filtered LazyFrame. When an expression is configured the frame is
        collected once so ``parse_impact_filter_expression`` can inspect it,
        and the result is wrapped as a LazyFrame again.
    """
    quality = filter_config.get("quality", {})
    expression_text = filter_config.get("expression")

    if quality:
        genotype = pl.col("sample_gt")
        vaf = pl.col("sample_vaf")

        # Unless disabled (default: enabled), keep only genotypes whose text
        # contains a "1" or a "2".
        if quality.get("filter_no_alt_allele", True):
            mentions_one = genotype.str.contains("1")
            mentions_two = genotype.str.contains("2")
            lazy_df = lazy_df.filter(mentions_one | mentions_two)

        # Minimum depth
        if "sample_dp_min" in quality:
            depth = pl.col("sample_dp").cast(pl.Float64, strict=False)
            lazy_df = lazy_df.filter(depth >= quality["sample_dp_min"])

        # Minimum genotype quality
        if "sample_gq_min" in quality:
            genotype_quality = pl.col("sample_gq").cast(pl.Float64, strict=False)
            lazy_df = lazy_df.filter(genotype_quality >= quality["sample_gq_min"])

        # VAF window for heterozygous calls; other genotypes pass untouched.
        het_bounds = []
        if "sample_vaf_het_min" in quality:
            het_bounds.append(vaf >= quality["sample_vaf_het_min"])
        if "sample_vaf_het_max" in quality:
            het_bounds.append(vaf <= quality["sample_vaf_het_max"])
        if het_bounds:
            # Het: exactly one "1", exactly one "0" and no "2" in the genotype.
            is_het = (
                (genotype.str.count_matches("1") == 1)
                & (genotype.str.count_matches("0") == 1)
                & (~genotype.str.contains("2"))
            )
            vaf_in_window = het_bounds[0]
            for bound in het_bounds[1:]:
                vaf_in_window = vaf_in_window & bound
            lazy_df = lazy_df.filter(~is_het | vaf_in_window)

        # Minimum VAF for homozygous-alternate ("1/1") calls
        if "sample_vaf_homalt_min" in quality:
            is_homalt = genotype == "1/1"
            lazy_df = lazy_df.filter(
                ~is_homalt | (vaf >= quality["sample_vaf_homalt_min"])
            )

        # Maximum VAF for homozygous-reference ("0/0") calls
        if "sample_vaf_hom_ref_max" in quality:
            is_hom_ref = genotype == "0/0"
            lazy_df = lazy_df.filter(
                ~is_hom_ref | (vaf <= quality["sample_vaf_hom_ref_max"])
            )

        # Optionally reuse the sample depth/GQ thresholds for both parents.
        # A null parent value always passes.
        if quality.get("apply_to_parents", False):
            parent_thresholds = (
                ("father_dp", "sample_dp_min"),
                ("father_gq", "sample_gq_min"),
                ("mother_dp", "sample_dp_min"),
                ("mother_gq", "sample_gq_min"),
            )
            for column_name, threshold_key in parent_thresholds:
                if threshold_key not in quality:
                    continue
                parent_value = pl.col(column_name)
                lazy_df = lazy_df.filter(
                    (parent_value.is_null())
                    | (
                        parent_value.cast(pl.Float64, strict=False)
                        >= quality[threshold_key]
                    )
                )

    if expression_text:
        if verbose:
            click.echo(f"Applying expression filter: {expression_text}", err=True)

        # parse_impact_filter_expression takes a DataFrame, so collect first
        # and go back to lazy mode afterwards.
        collected = lazy_df.collect(streaming=True)
        predicate = parse_impact_filter_expression(expression_text, collected)
        lazy_df = collected.lazy().filter(predicate)

    return lazy_df
1208
+
1209
+
960
1210
  if __name__ == "__main__":
961
1211
  cli()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pywombat
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
5
5
  Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
6
6
  Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -0,0 +1,6 @@
1
+ pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
2
+ pywombat/cli.py,sha256=dg38E39VpdJhKQt3aGSHwSiLWn1W8JnUkcsy3ZUHD5w,43518
3
+ pywombat-0.4.0.dist-info/METADATA,sha256=ZKPTIp9ud2AIVbcujg4ciq900DX-UkGs5oafa41jxTQ,4982
4
+ pywombat-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
+ pywombat-0.4.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
6
+ pywombat-0.4.0.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
2
- pywombat/cli.py,sha256=PZKV6FoqZyGgG7_mMIO2FzyeONdBaCqnhDATYsQJqMo,33899
3
- pywombat-0.2.0.dist-info/METADATA,sha256=7Qg2XnaTM92pmIewu5fw_vrcQW5JCVkkj2q6mNC9v88,4982
4
- pywombat-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
- pywombat-0.2.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
6
- pywombat-0.2.0.dist-info/RECORD,,