pywombat 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/cli.py +165 -39
- {pywombat-0.2.0.dist-info → pywombat-0.3.0.dist-info}/METADATA +1 -1
- pywombat-0.3.0.dist-info/RECORD +6 -0
- pywombat-0.2.0.dist-info/RECORD +0 -6
- {pywombat-0.2.0.dist-info → pywombat-0.3.0.dist-info}/WHEEL +0 -0
- {pywombat-0.2.0.dist-info → pywombat-0.3.0.dist-info}/entry_points.txt +0 -0
pywombat/cli.py
CHANGED
|
@@ -80,14 +80,6 @@ def cli(
|
|
|
80
80
|
if verbose and is_gzipped:
|
|
81
81
|
click.echo("Detected gzipped file", err=True)
|
|
82
82
|
|
|
83
|
-
# Read the TSV file (handles both plain and gzipped)
|
|
84
|
-
df = pl.read_csv(input_file, separator="\t")
|
|
85
|
-
|
|
86
|
-
if verbose:
|
|
87
|
-
click.echo(
|
|
88
|
-
f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns", err=True
|
|
89
|
-
)
|
|
90
|
-
|
|
91
83
|
# Read pedigree file if provided
|
|
92
84
|
pedigree_df = None
|
|
93
85
|
if pedigree:
|
|
@@ -95,16 +87,7 @@ def cli(
|
|
|
95
87
|
click.echo(f"Reading pedigree file: {pedigree}", err=True)
|
|
96
88
|
pedigree_df = read_pedigree(pedigree)
|
|
97
89
|
|
|
98
|
-
#
|
|
99
|
-
formatted_df = format_bcftools_tsv(df, pedigree_df)
|
|
100
|
-
|
|
101
|
-
if verbose:
|
|
102
|
-
click.echo(
|
|
103
|
-
f"Output shape: {formatted_df.shape[0]} rows, {formatted_df.shape[1]} columns",
|
|
104
|
-
err=True,
|
|
105
|
-
)
|
|
106
|
-
|
|
107
|
-
# Apply filters if provided
|
|
90
|
+
# Load filter config if provided
|
|
108
91
|
filter_config_data = None
|
|
109
92
|
if filter_config:
|
|
110
93
|
if verbose:
|
|
@@ -128,30 +111,36 @@ def cli(
|
|
|
128
111
|
else:
|
|
129
112
|
output = input_stem
|
|
130
113
|
|
|
131
|
-
#
|
|
114
|
+
# Use streaming approach with lazy API
|
|
115
|
+
if verbose:
|
|
116
|
+
click.echo("Processing with streaming mode...", err=True)
|
|
117
|
+
|
|
118
|
+
# Build lazy query
|
|
119
|
+
lazy_df = pl.scan_csv(input_file, separator="\t")
|
|
120
|
+
|
|
121
|
+
# Apply formatting transformations
|
|
122
|
+
lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
|
|
123
|
+
|
|
124
|
+
# Apply filters if provided
|
|
132
125
|
if filter_config_data:
|
|
133
|
-
|
|
134
|
-
formatted_df,
|
|
135
|
-
filter_config_data,
|
|
136
|
-
output,
|
|
137
|
-
output_format,
|
|
138
|
-
verbose,
|
|
139
|
-
)
|
|
140
|
-
else:
|
|
141
|
-
# No filters - write single output file
|
|
142
|
-
# Construct output filename with prefix and format
|
|
143
|
-
output_path = Path(f"{output}.{output_format}")
|
|
126
|
+
lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)
|
|
144
127
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
elif output_format == "tsv.gz":
|
|
148
|
-
csv_content = formatted_df.write_csv(separator="\t")
|
|
149
|
-
with gzip.open(output_path, "wt") as f:
|
|
150
|
-
f.write(csv_content)
|
|
151
|
-
elif output_format == "parquet":
|
|
152
|
-
formatted_df.write_parquet(output_path)
|
|
128
|
+
# Write output
|
|
129
|
+
output_path = Path(f"{output}.{output_format}")
|
|
153
130
|
|
|
154
|
-
|
|
131
|
+
if output_format == "tsv":
|
|
132
|
+
lazy_df.sink_csv(output_path, separator="\t")
|
|
133
|
+
elif output_format == "tsv.gz":
|
|
134
|
+
# For gzip, we need to collect and write
|
|
135
|
+
df = lazy_df.collect()
|
|
136
|
+
csv_content = df.write_csv(separator="\t")
|
|
137
|
+
with gzip.open(output_path, "wt") as f:
|
|
138
|
+
f.write(csv_content)
|
|
139
|
+
elif output_format == "parquet":
|
|
140
|
+
lazy_df.sink_parquet(output_path)
|
|
141
|
+
|
|
142
|
+
if verbose:
|
|
143
|
+
click.echo(f"Data written to {output_path}", err=True)
|
|
155
144
|
|
|
156
145
|
except Exception as e:
|
|
157
146
|
click.echo(f"Error: {e}", err=True)
|
|
@@ -957,5 +946,142 @@ def format_bcftools_tsv(
|
|
|
957
946
|
return melted_df
|
|
958
947
|
|
|
959
948
|
|
|
949
|
+
def format_bcftools_tsv_lazy(
|
|
950
|
+
lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
|
|
951
|
+
) -> pl.LazyFrame:
|
|
952
|
+
"""
|
|
953
|
+
Format a bcftools tabulated TSV using lazy operations for streaming.
|
|
954
|
+
|
|
955
|
+
This is a simplified version that collects minimally for complex operations.
|
|
956
|
+
"""
|
|
957
|
+
# For complex transformations like melting, we need to collect temporarily
|
|
958
|
+
# but we do this in a streaming fashion
|
|
959
|
+
df = lazy_df.collect(streaming=True)
|
|
960
|
+
formatted_df = format_bcftools_tsv(df, pedigree_df)
|
|
961
|
+
return formatted_df.lazy()
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
def apply_filters_lazy(
|
|
965
|
+
lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
|
|
966
|
+
) -> pl.LazyFrame:
|
|
967
|
+
"""Apply quality and expression filters using lazy operations."""
|
|
968
|
+
quality_config = filter_config.get("quality", {})
|
|
969
|
+
expression = filter_config.get("expression")
|
|
970
|
+
|
|
971
|
+
# Apply quality filters
|
|
972
|
+
if quality_config:
|
|
973
|
+
# Filter: sample_gt must contain at least one '1' (default: true)
|
|
974
|
+
filter_no_alt = quality_config.get("filter_no_alt_allele", True)
|
|
975
|
+
if filter_no_alt:
|
|
976
|
+
lazy_df = lazy_df.filter(
|
|
977
|
+
pl.col("sample_gt").str.contains("1")
|
|
978
|
+
| pl.col("sample_gt").str.contains("2")
|
|
979
|
+
)
|
|
980
|
+
|
|
981
|
+
# Apply minimum depth filter
|
|
982
|
+
if "sample_dp_min" in quality_config:
|
|
983
|
+
min_dp = quality_config["sample_dp_min"]
|
|
984
|
+
lazy_df = lazy_df.filter(
|
|
985
|
+
pl.col("sample_dp").cast(pl.Float64, strict=False) >= min_dp
|
|
986
|
+
)
|
|
987
|
+
|
|
988
|
+
# Apply minimum GQ filter
|
|
989
|
+
if "sample_gq_min" in quality_config:
|
|
990
|
+
min_gq = quality_config["sample_gq_min"]
|
|
991
|
+
lazy_df = lazy_df.filter(
|
|
992
|
+
pl.col("sample_gq").cast(pl.Float64, strict=False) >= min_gq
|
|
993
|
+
)
|
|
994
|
+
|
|
995
|
+
# VAF filters for heterozygous (0/1 or 1/0)
|
|
996
|
+
if (
|
|
997
|
+
"sample_vaf_het_min" in quality_config
|
|
998
|
+
or "sample_vaf_het_max" in quality_config
|
|
999
|
+
):
|
|
1000
|
+
# Check if genotype is het (contains one '1' and one '0', no '2')
|
|
1001
|
+
is_het = (
|
|
1002
|
+
(pl.col("sample_gt").str.count_matches("1") == 1)
|
|
1003
|
+
& (pl.col("sample_gt").str.count_matches("0") == 1)
|
|
1004
|
+
& (~pl.col("sample_gt").str.contains("2"))
|
|
1005
|
+
)
|
|
1006
|
+
|
|
1007
|
+
het_conditions = []
|
|
1008
|
+
if "sample_vaf_het_min" in quality_config:
|
|
1009
|
+
het_conditions.append(
|
|
1010
|
+
pl.col("sample_vaf") >= quality_config["sample_vaf_het_min"]
|
|
1011
|
+
)
|
|
1012
|
+
if "sample_vaf_het_max" in quality_config:
|
|
1013
|
+
het_conditions.append(
|
|
1014
|
+
pl.col("sample_vaf") <= quality_config["sample_vaf_het_max"]
|
|
1015
|
+
)
|
|
1016
|
+
|
|
1017
|
+
if het_conditions:
|
|
1018
|
+
het_filter = het_conditions[0]
|
|
1019
|
+
for cond in het_conditions[1:]:
|
|
1020
|
+
het_filter = het_filter & cond
|
|
1021
|
+
|
|
1022
|
+
lazy_df = lazy_df.filter(~is_het | het_filter)
|
|
1023
|
+
|
|
1024
|
+
# VAF filter for homozygous alternate (1/1)
|
|
1025
|
+
if "sample_vaf_homalt_min" in quality_config:
|
|
1026
|
+
is_homalt = pl.col("sample_gt") == "1/1"
|
|
1027
|
+
lazy_df = lazy_df.filter(
|
|
1028
|
+
~is_homalt
|
|
1029
|
+
| (pl.col("sample_vaf") >= quality_config["sample_vaf_homalt_min"])
|
|
1030
|
+
)
|
|
1031
|
+
|
|
1032
|
+
# VAF filter for homozygous reference (0/0)
|
|
1033
|
+
if "sample_vaf_hom_ref_max" in quality_config:
|
|
1034
|
+
is_hom_ref = pl.col("sample_gt") == "0/0"
|
|
1035
|
+
lazy_df = lazy_df.filter(
|
|
1036
|
+
~is_hom_ref
|
|
1037
|
+
| (pl.col("sample_vaf") <= quality_config["sample_vaf_hom_ref_max"])
|
|
1038
|
+
)
|
|
1039
|
+
|
|
1040
|
+
# Apply same filters to parents if requested
|
|
1041
|
+
apply_to_parents = quality_config.get("apply_to_parents", False)
|
|
1042
|
+
if apply_to_parents:
|
|
1043
|
+
# Father filters
|
|
1044
|
+
if "sample_dp_min" in quality_config:
|
|
1045
|
+
min_dp = quality_config["sample_dp_min"]
|
|
1046
|
+
lazy_df = lazy_df.filter(
|
|
1047
|
+
(pl.col("father_dp").is_null())
|
|
1048
|
+
| (pl.col("father_dp").cast(pl.Float64, strict=False) >= min_dp)
|
|
1049
|
+
)
|
|
1050
|
+
|
|
1051
|
+
if "sample_gq_min" in quality_config:
|
|
1052
|
+
min_gq = quality_config["sample_gq_min"]
|
|
1053
|
+
lazy_df = lazy_df.filter(
|
|
1054
|
+
(pl.col("father_gq").is_null())
|
|
1055
|
+
| (pl.col("father_gq").cast(pl.Float64, strict=False) >= min_gq)
|
|
1056
|
+
)
|
|
1057
|
+
|
|
1058
|
+
# Mother filters
|
|
1059
|
+
if "sample_dp_min" in quality_config:
|
|
1060
|
+
min_dp = quality_config["sample_dp_min"]
|
|
1061
|
+
lazy_df = lazy_df.filter(
|
|
1062
|
+
(pl.col("mother_dp").is_null())
|
|
1063
|
+
| (pl.col("mother_dp").cast(pl.Float64, strict=False) >= min_dp)
|
|
1064
|
+
)
|
|
1065
|
+
|
|
1066
|
+
if "sample_gq_min" in quality_config:
|
|
1067
|
+
min_gq = quality_config["sample_gq_min"]
|
|
1068
|
+
lazy_df = lazy_df.filter(
|
|
1069
|
+
(pl.col("mother_gq").is_null())
|
|
1070
|
+
| (pl.col("mother_gq").cast(pl.Float64, strict=False) >= min_gq)
|
|
1071
|
+
)
|
|
1072
|
+
|
|
1073
|
+
# Apply expression filter if provided
|
|
1074
|
+
if expression:
|
|
1075
|
+
if verbose:
|
|
1076
|
+
click.echo(f"Applying expression filter: {expression}", err=True)
|
|
1077
|
+
|
|
1078
|
+
# We need to collect temporarily to use parse_impact_filter_expression
|
|
1079
|
+
df = lazy_df.collect(streaming=True)
|
|
1080
|
+
filter_expr = parse_impact_filter_expression(expression, df)
|
|
1081
|
+
lazy_df = df.lazy().filter(filter_expr)
|
|
1082
|
+
|
|
1083
|
+
return lazy_df
|
|
1084
|
+
|
|
1085
|
+
|
|
960
1086
|
if __name__ == "__main__":
|
|
961
1087
|
cli()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pywombat
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
|
|
5
5
|
Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
|
|
6
6
|
Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
+
pywombat/cli.py,sha256=kv03IFXcwe9pdv-KyoT5Cu1pJ9r-O7ww-Kh0ZT2ysa4,38920
|
|
3
|
+
pywombat-0.3.0.dist-info/METADATA,sha256=eASint-XgzgUGWshtZYr4nekDCs-VKSTilHLRupH5ic,4982
|
|
4
|
+
pywombat-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
+
pywombat-0.3.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
+
pywombat-0.3.0.dist-info/RECORD,,
|
pywombat-0.2.0.dist-info/RECORD
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
-
pywombat/cli.py,sha256=PZKV6FoqZyGgG7_mMIO2FzyeONdBaCqnhDATYsQJqMo,33899
|
|
3
|
-
pywombat-0.2.0.dist-info/METADATA,sha256=7Qg2XnaTM92pmIewu5fw_vrcQW5JCVkkj2q6mNC9v88,4982
|
|
4
|
-
pywombat-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
-
pywombat-0.2.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
-
pywombat-0.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|