pywombat 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/cli.py +161 -2
- {pywombat-0.3.0.dist-info → pywombat-0.5.0.dist-info}/METADATA +1 -1
- pywombat-0.5.0.dist-info/RECORD +6 -0
- pywombat-0.3.0.dist-info/RECORD +0 -6
- {pywombat-0.3.0.dist-info → pywombat-0.5.0.dist-info}/WHEEL +0 -0
- {pywombat-0.3.0.dist-info → pywombat-0.5.0.dist-info}/entry_points.txt +0 -0
pywombat/cli.py
CHANGED
|
@@ -40,6 +40,11 @@ import yaml
|
|
|
40
40
|
type=click.Path(exists=True, path_type=Path),
|
|
41
41
|
help="Filter configuration YAML file to apply quality and impact filters.",
|
|
42
42
|
)
|
|
43
|
+
@click.option(
|
|
44
|
+
"--debug",
|
|
45
|
+
type=str,
|
|
46
|
+
help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013). Displays #CHROM, POS, VEP_SYMBOL, and columns from filter expression.",
|
|
47
|
+
)
|
|
43
48
|
def cli(
|
|
44
49
|
input_file: Path,
|
|
45
50
|
output: Optional[str],
|
|
@@ -47,6 +52,7 @@ def cli(
|
|
|
47
52
|
verbose: bool,
|
|
48
53
|
pedigree: Optional[Path],
|
|
49
54
|
filter_config: Optional[Path],
|
|
55
|
+
debug: Optional[str],
|
|
50
56
|
):
|
|
51
57
|
"""
|
|
52
58
|
Wombat: A tool for processing bcftools tabulated TSV files.
|
|
@@ -94,6 +100,11 @@ def cli(
|
|
|
94
100
|
click.echo(f"Reading filter config: {filter_config}", err=True)
|
|
95
101
|
filter_config_data = load_filter_config(filter_config)
|
|
96
102
|
|
|
103
|
+
# Debug mode: show specific variant
|
|
104
|
+
if debug:
|
|
105
|
+
debug_variant(input_file, pedigree_df, filter_config_data, debug, verbose)
|
|
106
|
+
return
|
|
107
|
+
|
|
97
108
|
# Determine output prefix
|
|
98
109
|
if output is None:
|
|
99
110
|
# Generate default output prefix from input filename
|
|
@@ -116,7 +127,20 @@ def cli(
|
|
|
116
127
|
click.echo("Processing with streaming mode...", err=True)
|
|
117
128
|
|
|
118
129
|
# Build lazy query
|
|
119
|
-
|
|
130
|
+
# Force certain columns to string type
|
|
131
|
+
string_columns = [
|
|
132
|
+
"FID",
|
|
133
|
+
"sample_id",
|
|
134
|
+
"father_id",
|
|
135
|
+
"mother_id",
|
|
136
|
+
"FatherBarcode",
|
|
137
|
+
"MotherBarcode",
|
|
138
|
+
"sample",
|
|
139
|
+
]
|
|
140
|
+
schema_overrides = {col: pl.Utf8 for col in string_columns}
|
|
141
|
+
lazy_df = pl.scan_csv(
|
|
142
|
+
input_file, separator="\t", schema_overrides=schema_overrides
|
|
143
|
+
)
|
|
120
144
|
|
|
121
145
|
# Apply formatting transformations
|
|
122
146
|
lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
|
|
@@ -147,6 +171,106 @@ def cli(
|
|
|
147
171
|
raise click.Abort()
|
|
148
172
|
|
|
149
173
|
|
|
174
|
+
def debug_variant(
    input_file: Path,
    pedigree_df: Optional[pl.DataFrame],
    filter_config: Optional[dict],
    debug_pos: str,
    verbose: bool,
):
    """Debug mode: display rows matching a specific chrom:pos.

    Reads ``input_file`` as a tab-separated table, runs it through
    ``format_bcftools_tsv`` and prints the rows whose ``#CHROM`` and ``POS``
    match ``debug_pos`` as a tab-separated table on stdout.

    Args:
        input_file: Path to the tab-separated input file.
        pedigree_df: Optional pedigree table, passed through unchanged to
            ``format_bcftools_tsv``.
        filter_config: Optional parsed filter configuration. When it has a
            string ``"expression"`` entry, identifiers that appear before a
            comparison operator in it are shown as extra columns (only those
            that exist in the formatted table).
        debug_pos: Position to look up, written as ``chrom:pos``
            (e.g. ``chr11:70486013``).
        verbose: When true, echo the searched position to stderr.

    Raises:
        click.Abort: If ``debug_pos`` has no ``:`` separator or its position
            part is not an integer.
    """
    # Parse debug position
    if ":" not in debug_pos:
        click.echo(
            "Error: Debug position must be in format 'chrom:pos' (e.g., chr11:70486013)",
            err=True,
        )
        raise click.Abort()

    chrom, pos_text = debug_pos.split(":", 1)
    try:
        pos = int(pos_text)
    except ValueError:
        click.echo(f"Error: Position must be an integer, got '{pos_text}'", err=True)
        raise click.Abort()

    if verbose:
        click.echo(f"Debug mode: searching for {chrom}:{pos}", err=True)

    # Read and format the data.
    # Force identifier-like columns to string type so that numeric-looking
    # IDs are not parsed as integers by the CSV type inference.
    string_columns = [
        "FID",
        "sample_id",
        "father_id",
        "mother_id",
        "FatherBarcode",
        "MotherBarcode",
        "sample",
    ]
    schema_overrides = {col: pl.Utf8 for col in string_columns}
    df = pl.read_csv(input_file, separator="\t", schema_overrides=schema_overrides)
    formatted_df = format_bcftools_tsv(df, pedigree_df)

    # Filter to matching rows. The chromosome from the command line is always
    # a string, so compare against the string form of #CHROM: a file whose
    # chromosome names are purely numeric ("1", "2", ...) gets an integer
    # #CHROM column from type inference, and comparing that directly with a
    # string would fail instead of matching.
    matching_rows = formatted_df.filter(
        (pl.col("#CHROM").cast(pl.Utf8) == chrom) & (pl.col("POS") == pos)
    )

    if matching_rows.shape[0] == 0:
        click.echo(f"No rows found matching {chrom}:{pos}", err=True)
        return

    # Determine which columns to display
    available_columns = matching_rows.columns
    columns_to_show = ["#CHROM", "POS"]

    # Add VEP_SYMBOL if it exists
    if "VEP_SYMBOL" in available_columns:
        columns_to_show.append("VEP_SYMBOL")

    # Extract column names from the filter expression, if one was provided.
    # A missing or non-string expression (e.g. an empty YAML key, which loads
    # as None) is simply ignored rather than crashing the regex search.
    expression = filter_config.get("expression") if filter_config else None
    if isinstance(expression, str):
        # Match identifiers that appear right before a comparison operator
        column_pattern = r"\b([A-Za-z_][A-Za-z0-9_]*)\b\s*[=!<>]"
        for col in re.findall(column_pattern, expression):
            if col in available_columns and col not in columns_to_show:
                columns_to_show.append(col)

    # Select only the columns we want to display
    display_df = matching_rows.select(
        [col for col in columns_to_show if col in available_columns]
    )

    # Replace null and NaN values with <null> and <NaN> for display.
    # All replacement expressions are built first and applied in one pass.
    display_exprs = []
    for col in display_df.columns:
        rendered = pl.when(pl.col(col).is_null()).then(pl.lit("<null>"))
        if display_df[col].dtype in (pl.Float32, pl.Float64):
            # Only float columns can hold NaN in addition to null
            rendered = rendered.when(pl.col(col).is_nan()).then(pl.lit("<NaN>"))
        display_exprs.append(
            rendered.otherwise(pl.col(col).cast(pl.Utf8)).alias(col)
        )
    display_df = display_df.with_columns(display_exprs)

    # Display the results
    click.echo(f"\nFound {matching_rows.shape[0]} row(s) matching {chrom}:{pos}:\n")
    click.echo(display_df.write_csv(separator="\t"))
|
|
272
|
+
|
|
273
|
+
|
|
150
274
|
def load_filter_config(config_path: Path) -> dict:
|
|
151
275
|
"""Load and parse filter configuration from YAML file."""
|
|
152
276
|
with open(config_path, "r") as f:
|
|
@@ -366,6 +490,30 @@ def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr
|
|
|
366
490
|
if col_name not in df.columns:
|
|
367
491
|
raise ValueError(f"Column '{col_name}' not found in dataframe")
|
|
368
492
|
|
|
493
|
+
# Check for null value
|
|
494
|
+
if value.upper() == "NULL":
|
|
495
|
+
col_expr = pl.col(col_name)
|
|
496
|
+
if op == "=":
|
|
497
|
+
return col_expr.is_null()
|
|
498
|
+
elif op == "!=":
|
|
499
|
+
return ~col_expr.is_null()
|
|
500
|
+
else:
|
|
501
|
+
raise ValueError(
|
|
502
|
+
f"Operator '{op}' not supported for null comparison, use = or !="
|
|
503
|
+
)
|
|
504
|
+
|
|
505
|
+
# Check for NaN value
|
|
506
|
+
if value.upper() == "NAN":
|
|
507
|
+
col_expr = pl.col(col_name).cast(pl.Float64, strict=False)
|
|
508
|
+
if op == "=":
|
|
509
|
+
return col_expr.is_nan()
|
|
510
|
+
elif op == "!=":
|
|
511
|
+
return ~col_expr.is_nan()
|
|
512
|
+
else:
|
|
513
|
+
raise ValueError(
|
|
514
|
+
f"Operator '{op}' not supported for NaN comparison, use = or !="
|
|
515
|
+
)
|
|
516
|
+
|
|
369
517
|
# Try to convert value to number, otherwise treat as string
|
|
370
518
|
try:
|
|
371
519
|
value_num = float(value)
|
|
@@ -647,7 +795,18 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
|
|
|
647
795
|
DataFrame with columns: sample_id, father_id, mother_id
|
|
648
796
|
"""
|
|
649
797
|
# Try reading with header first
|
|
650
|
-
|
|
798
|
+
# Force certain columns to string type
|
|
799
|
+
string_columns = [
|
|
800
|
+
"FID",
|
|
801
|
+
"sample_id",
|
|
802
|
+
"father_id",
|
|
803
|
+
"mother_id",
|
|
804
|
+
"FatherBarcode",
|
|
805
|
+
"MotherBarcode",
|
|
806
|
+
"sample",
|
|
807
|
+
]
|
|
808
|
+
schema_overrides = {col: pl.Utf8 for col in string_columns}
|
|
809
|
+
df = pl.read_csv(pedigree_path, separator="\t", schema_overrides=schema_overrides)
|
|
651
810
|
|
|
652
811
|
# Check if first row has 'FID' in first column (indicates header)
|
|
653
812
|
if df.columns[0] == "FID" or "sample_id" in df.columns:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pywombat
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
|
|
5
5
|
Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
|
|
6
6
|
Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
+
pywombat/cli.py,sha256=0nBlwyRu1Q01a0EHcVyIYtKmgezCWA85pQtEXpnuzL4,44535
|
|
3
|
+
pywombat-0.5.0.dist-info/METADATA,sha256=2Py8xwNxZBD18u4r-tJI_mQezMBg4td3ruWOm61MbdA,4982
|
|
4
|
+
pywombat-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
+
pywombat-0.5.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
+
pywombat-0.5.0.dist-info/RECORD,,
|
pywombat-0.3.0.dist-info/RECORD
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
-
pywombat/cli.py,sha256=kv03IFXcwe9pdv-KyoT5Cu1pJ9r-O7ww-Kh0ZT2ysa4,38920
|
|
3
|
-
pywombat-0.3.0.dist-info/METADATA,sha256=eASint-XgzgUGWshtZYr4nekDCs-VKSTilHLRupH5ic,4982
|
|
4
|
-
pywombat-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
-
pywombat-0.3.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
-
pywombat-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|