pywombat 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/cli.py +198 -46
- {pywombat-0.1.0.dist-info → pywombat-0.3.0.dist-info}/METADATA +1 -1
- pywombat-0.3.0.dist-info/RECORD +6 -0
- {pywombat-0.1.0.dist-info → pywombat-0.3.0.dist-info}/WHEEL +1 -1
- pywombat-0.1.0.dist-info/RECORD +0 -6
- {pywombat-0.1.0.dist-info → pywombat-0.3.0.dist-info}/entry_points.txt +0 -0
pywombat/cli.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""CLI for wombat tool."""
|
|
2
2
|
|
|
3
|
+
import gzip
|
|
3
4
|
import re
|
|
4
5
|
import warnings
|
|
5
6
|
from pathlib import Path
|
|
@@ -22,9 +23,9 @@ import yaml
|
|
|
22
23
|
"-f",
|
|
23
24
|
"--format",
|
|
24
25
|
"output_format",
|
|
25
|
-
type=click.Choice(["tsv", "parquet"], case_sensitive=False),
|
|
26
|
+
type=click.Choice(["tsv", "tsv.gz", "parquet"], case_sensitive=False),
|
|
26
27
|
default="tsv",
|
|
27
|
-
help="Output format: tsv (default) or parquet.",
|
|
28
|
+
help="Output format: tsv (default), tsv.gz (compressed), or parquet.",
|
|
28
29
|
)
|
|
29
30
|
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
|
|
30
31
|
@click.option(
|
|
@@ -73,13 +74,11 @@ def cli(
|
|
|
73
74
|
if verbose:
|
|
74
75
|
click.echo(f"Reading input file: {input_file}", err=True)
|
|
75
76
|
|
|
76
|
-
#
|
|
77
|
-
|
|
77
|
+
# Detect if file is gzipped based on extension
|
|
78
|
+
is_gzipped = str(input_file).endswith(".gz")
|
|
78
79
|
|
|
79
|
-
if verbose:
|
|
80
|
-
click.echo(
|
|
81
|
-
f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns", err=True
|
|
82
|
-
)
|
|
80
|
+
if verbose and is_gzipped:
|
|
81
|
+
click.echo("Detected gzipped file", err=True)
|
|
83
82
|
|
|
84
83
|
# Read pedigree file if provided
|
|
85
84
|
pedigree_df = None
|
|
@@ -88,52 +87,60 @@ def cli(
|
|
|
88
87
|
click.echo(f"Reading pedigree file: {pedigree}", err=True)
|
|
89
88
|
pedigree_df = read_pedigree(pedigree)
|
|
90
89
|
|
|
91
|
-
#
|
|
92
|
-
formatted_df = format_bcftools_tsv(df, pedigree_df)
|
|
93
|
-
|
|
94
|
-
if verbose:
|
|
95
|
-
click.echo(
|
|
96
|
-
f"Output shape: {formatted_df.shape[0]} rows, {formatted_df.shape[1]} columns",
|
|
97
|
-
err=True,
|
|
98
|
-
)
|
|
99
|
-
|
|
100
|
-
# Apply filters if provided
|
|
90
|
+
# Load filter config if provided
|
|
101
91
|
filter_config_data = None
|
|
102
92
|
if filter_config:
|
|
103
93
|
if verbose:
|
|
104
94
|
click.echo(f"Reading filter config: {filter_config}", err=True)
|
|
105
95
|
filter_config_data = load_filter_config(filter_config)
|
|
106
96
|
|
|
107
|
-
#
|
|
97
|
+
# Determine output prefix
|
|
98
|
+
if output is None:
|
|
99
|
+
# Generate default output prefix from input filename
|
|
100
|
+
input_stem = input_file.name
|
|
101
|
+
# Remove .tsv.gz or .tsv extension
|
|
102
|
+
if input_stem.endswith(".tsv.gz"):
|
|
103
|
+
input_stem = input_stem[:-7] # Remove .tsv.gz
|
|
104
|
+
elif input_stem.endswith(".tsv"):
|
|
105
|
+
input_stem = input_stem[:-4] # Remove .tsv
|
|
106
|
+
|
|
107
|
+
# Add config name if filter is provided
|
|
108
|
+
if filter_config:
|
|
109
|
+
config_name = filter_config.stem # Get basename without extension
|
|
110
|
+
output = f"{input_stem}.{config_name}"
|
|
111
|
+
else:
|
|
112
|
+
output = input_stem
|
|
113
|
+
|
|
114
|
+
# Use streaming approach with lazy API
|
|
115
|
+
if verbose:
|
|
116
|
+
click.echo("Processing with streaming mode...", err=True)
|
|
117
|
+
|
|
118
|
+
# Build lazy query
|
|
119
|
+
lazy_df = pl.scan_csv(input_file, separator="\t")
|
|
120
|
+
|
|
121
|
+
# Apply formatting transformations
|
|
122
|
+
lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
|
|
123
|
+
|
|
124
|
+
# Apply filters if provided
|
|
108
125
|
if filter_config_data:
|
|
109
|
-
|
|
110
|
-
formatted_df,
|
|
111
|
-
filter_config_data,
|
|
112
|
-
output,
|
|
113
|
-
output_format,
|
|
114
|
-
verbose,
|
|
115
|
-
)
|
|
116
|
-
else:
|
|
117
|
-
# No filters - write single output file
|
|
118
|
-
if output:
|
|
119
|
-
# Construct output filename with prefix and format
|
|
120
|
-
output_path = Path(f"{output}.{output_format}")
|
|
126
|
+
lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)
|
|
121
127
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
elif output_format == "parquet":
|
|
125
|
-
formatted_df.write_parquet(output_path)
|
|
128
|
+
# Write output
|
|
129
|
+
output_path = Path(f"{output}.{output_format}")
|
|
126
130
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
131
|
+
if output_format == "tsv":
|
|
132
|
+
lazy_df.sink_csv(output_path, separator="\t")
|
|
133
|
+
elif output_format == "tsv.gz":
|
|
134
|
+
# For gzip, we need to collect and write
|
|
135
|
+
df = lazy_df.collect()
|
|
136
|
+
csv_content = df.write_csv(separator="\t")
|
|
137
|
+
with gzip.open(output_path, "wt") as f:
|
|
138
|
+
f.write(csv_content)
|
|
139
|
+
elif output_format == "parquet":
|
|
140
|
+
lazy_df.sink_parquet(output_path)
|
|
141
|
+
|
|
142
|
+
if verbose:
|
|
143
|
+
click.echo(f"Data written to {output_path}", err=True)
|
|
137
144
|
|
|
138
145
|
except Exception as e:
|
|
139
146
|
click.echo(f"Error: {e}", err=True)
|
|
@@ -555,11 +562,15 @@ def apply_impact_filters(
|
|
|
555
562
|
)
|
|
556
563
|
|
|
557
564
|
# Write to file
|
|
558
|
-
output_filename = f"{output_prefix}
|
|
565
|
+
output_filename = f"{output_prefix}.{name}.{output_format}"
|
|
559
566
|
output_path = Path(output_filename)
|
|
560
567
|
|
|
561
568
|
if output_format == "tsv":
|
|
562
569
|
filtered_df.write_csv(output_path, separator="\t")
|
|
570
|
+
elif output_format == "tsv.gz":
|
|
571
|
+
csv_content = filtered_df.write_csv(separator="\t")
|
|
572
|
+
with gzip.open(output_path, "wt") as f:
|
|
573
|
+
f.write(csv_content)
|
|
563
574
|
elif output_format == "parquet":
|
|
564
575
|
filtered_df.write_parquet(output_path)
|
|
565
576
|
|
|
@@ -599,6 +610,10 @@ def apply_filters_and_write(
|
|
|
599
610
|
|
|
600
611
|
if output_format == "tsv":
|
|
601
612
|
filtered_df.write_csv(output_path, separator="\t")
|
|
613
|
+
elif output_format == "tsv.gz":
|
|
614
|
+
csv_content = filtered_df.write_csv(separator="\t")
|
|
615
|
+
with gzip.open(output_path, "wt") as f:
|
|
616
|
+
f.write(csv_content)
|
|
602
617
|
elif output_format == "parquet":
|
|
603
618
|
filtered_df.write_parquet(output_path)
|
|
604
619
|
|
|
@@ -931,5 +946,142 @@ def format_bcftools_tsv(
|
|
|
931
946
|
return melted_df
|
|
932
947
|
|
|
933
948
|
|
|
949
|
+
def format_bcftools_tsv_lazy(
|
|
950
|
+
lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
|
|
951
|
+
) -> pl.LazyFrame:
|
|
952
|
+
"""
|
|
953
|
+
Format a bcftools tabulated TSV using lazy operations for streaming.
|
|
954
|
+
|
|
955
|
+
This is a simplified version that collects minimally for complex operations.
|
|
956
|
+
"""
|
|
957
|
+
# For complex transformations like melting, we need to collect temporarily
|
|
958
|
+
# but we do this in a streaming fashion
|
|
959
|
+
df = lazy_df.collect(streaming=True)
|
|
960
|
+
formatted_df = format_bcftools_tsv(df, pedigree_df)
|
|
961
|
+
return formatted_df.lazy()
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
def apply_filters_lazy(
|
|
965
|
+
lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
|
|
966
|
+
) -> pl.LazyFrame:
|
|
967
|
+
"""Apply quality and expression filters using lazy operations."""
|
|
968
|
+
quality_config = filter_config.get("quality", {})
|
|
969
|
+
expression = filter_config.get("expression")
|
|
970
|
+
|
|
971
|
+
# Apply quality filters
|
|
972
|
+
if quality_config:
|
|
973
|
+
# Filter: sample_gt must contain at least one '1' (default: true)
|
|
974
|
+
filter_no_alt = quality_config.get("filter_no_alt_allele", True)
|
|
975
|
+
if filter_no_alt:
|
|
976
|
+
lazy_df = lazy_df.filter(
|
|
977
|
+
pl.col("sample_gt").str.contains("1")
|
|
978
|
+
| pl.col("sample_gt").str.contains("2")
|
|
979
|
+
)
|
|
980
|
+
|
|
981
|
+
# Apply minimum depth filter
|
|
982
|
+
if "sample_dp_min" in quality_config:
|
|
983
|
+
min_dp = quality_config["sample_dp_min"]
|
|
984
|
+
lazy_df = lazy_df.filter(
|
|
985
|
+
pl.col("sample_dp").cast(pl.Float64, strict=False) >= min_dp
|
|
986
|
+
)
|
|
987
|
+
|
|
988
|
+
# Apply minimum GQ filter
|
|
989
|
+
if "sample_gq_min" in quality_config:
|
|
990
|
+
min_gq = quality_config["sample_gq_min"]
|
|
991
|
+
lazy_df = lazy_df.filter(
|
|
992
|
+
pl.col("sample_gq").cast(pl.Float64, strict=False) >= min_gq
|
|
993
|
+
)
|
|
994
|
+
|
|
995
|
+
# VAF filters for heterozygous (0/1 or 1/0)
|
|
996
|
+
if (
|
|
997
|
+
"sample_vaf_het_min" in quality_config
|
|
998
|
+
or "sample_vaf_het_max" in quality_config
|
|
999
|
+
):
|
|
1000
|
+
# Check if genotype is het (contains one '1' and one '0', no '2')
|
|
1001
|
+
is_het = (
|
|
1002
|
+
(pl.col("sample_gt").str.count_matches("1") == 1)
|
|
1003
|
+
& (pl.col("sample_gt").str.count_matches("0") == 1)
|
|
1004
|
+
& (~pl.col("sample_gt").str.contains("2"))
|
|
1005
|
+
)
|
|
1006
|
+
|
|
1007
|
+
het_conditions = []
|
|
1008
|
+
if "sample_vaf_het_min" in quality_config:
|
|
1009
|
+
het_conditions.append(
|
|
1010
|
+
pl.col("sample_vaf") >= quality_config["sample_vaf_het_min"]
|
|
1011
|
+
)
|
|
1012
|
+
if "sample_vaf_het_max" in quality_config:
|
|
1013
|
+
het_conditions.append(
|
|
1014
|
+
pl.col("sample_vaf") <= quality_config["sample_vaf_het_max"]
|
|
1015
|
+
)
|
|
1016
|
+
|
|
1017
|
+
if het_conditions:
|
|
1018
|
+
het_filter = het_conditions[0]
|
|
1019
|
+
for cond in het_conditions[1:]:
|
|
1020
|
+
het_filter = het_filter & cond
|
|
1021
|
+
|
|
1022
|
+
lazy_df = lazy_df.filter(~is_het | het_filter)
|
|
1023
|
+
|
|
1024
|
+
# VAF filter for homozygous alternate (1/1)
|
|
1025
|
+
if "sample_vaf_homalt_min" in quality_config:
|
|
1026
|
+
is_homalt = pl.col("sample_gt") == "1/1"
|
|
1027
|
+
lazy_df = lazy_df.filter(
|
|
1028
|
+
~is_homalt
|
|
1029
|
+
| (pl.col("sample_vaf") >= quality_config["sample_vaf_homalt_min"])
|
|
1030
|
+
)
|
|
1031
|
+
|
|
1032
|
+
# VAF filter for homozygous reference (0/0)
|
|
1033
|
+
if "sample_vaf_hom_ref_max" in quality_config:
|
|
1034
|
+
is_hom_ref = pl.col("sample_gt") == "0/0"
|
|
1035
|
+
lazy_df = lazy_df.filter(
|
|
1036
|
+
~is_hom_ref
|
|
1037
|
+
| (pl.col("sample_vaf") <= quality_config["sample_vaf_hom_ref_max"])
|
|
1038
|
+
)
|
|
1039
|
+
|
|
1040
|
+
# Apply same filters to parents if requested
|
|
1041
|
+
apply_to_parents = quality_config.get("apply_to_parents", False)
|
|
1042
|
+
if apply_to_parents:
|
|
1043
|
+
# Father filters
|
|
1044
|
+
if "sample_dp_min" in quality_config:
|
|
1045
|
+
min_dp = quality_config["sample_dp_min"]
|
|
1046
|
+
lazy_df = lazy_df.filter(
|
|
1047
|
+
(pl.col("father_dp").is_null())
|
|
1048
|
+
| (pl.col("father_dp").cast(pl.Float64, strict=False) >= min_dp)
|
|
1049
|
+
)
|
|
1050
|
+
|
|
1051
|
+
if "sample_gq_min" in quality_config:
|
|
1052
|
+
min_gq = quality_config["sample_gq_min"]
|
|
1053
|
+
lazy_df = lazy_df.filter(
|
|
1054
|
+
(pl.col("father_gq").is_null())
|
|
1055
|
+
| (pl.col("father_gq").cast(pl.Float64, strict=False) >= min_gq)
|
|
1056
|
+
)
|
|
1057
|
+
|
|
1058
|
+
# Mother filters
|
|
1059
|
+
if "sample_dp_min" in quality_config:
|
|
1060
|
+
min_dp = quality_config["sample_dp_min"]
|
|
1061
|
+
lazy_df = lazy_df.filter(
|
|
1062
|
+
(pl.col("mother_dp").is_null())
|
|
1063
|
+
| (pl.col("mother_dp").cast(pl.Float64, strict=False) >= min_dp)
|
|
1064
|
+
)
|
|
1065
|
+
|
|
1066
|
+
if "sample_gq_min" in quality_config:
|
|
1067
|
+
min_gq = quality_config["sample_gq_min"]
|
|
1068
|
+
lazy_df = lazy_df.filter(
|
|
1069
|
+
(pl.col("mother_gq").is_null())
|
|
1070
|
+
| (pl.col("mother_gq").cast(pl.Float64, strict=False) >= min_gq)
|
|
1071
|
+
)
|
|
1072
|
+
|
|
1073
|
+
# Apply expression filter if provided
|
|
1074
|
+
if expression:
|
|
1075
|
+
if verbose:
|
|
1076
|
+
click.echo(f"Applying expression filter: {expression}", err=True)
|
|
1077
|
+
|
|
1078
|
+
# We need to collect temporarily to use parse_impact_filter_expression
|
|
1079
|
+
df = lazy_df.collect(streaming=True)
|
|
1080
|
+
filter_expr = parse_impact_filter_expression(expression, df)
|
|
1081
|
+
lazy_df = df.lazy().filter(filter_expr)
|
|
1082
|
+
|
|
1083
|
+
return lazy_df
|
|
1084
|
+
|
|
1085
|
+
|
|
934
1086
|
if __name__ == "__main__":
|
|
935
1087
|
cli()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pywombat
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
|
|
5
5
|
Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
|
|
6
6
|
Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
+
pywombat/cli.py,sha256=kv03IFXcwe9pdv-KyoT5Cu1pJ9r-O7ww-Kh0ZT2ysa4,38920
|
|
3
|
+
pywombat-0.3.0.dist-info/METADATA,sha256=eASint-XgzgUGWshtZYr4nekDCs-VKSTilHLRupH5ic,4982
|
|
4
|
+
pywombat-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
+
pywombat-0.3.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
+
pywombat-0.3.0.dist-info/RECORD,,
|
pywombat-0.1.0.dist-info/RECORD
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
-
pywombat/cli.py,sha256=kUokXfnaSCKLXiCu7jXbYOPlGTtL5wSzocM9gFtPy30,32801
|
|
3
|
-
pywombat-0.1.0.dist-info/METADATA,sha256=3RlA_lLC7hKUxIrhQvvbBKEolYGOl_EVJgyDfDLI0sU,4982
|
|
4
|
-
pywombat-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
5
|
-
pywombat-0.1.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
-
pywombat-0.1.0.dist-info/RECORD,,
|
|
File without changes
|