pywombat 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/cli.py CHANGED
@@ -80,14 +80,6 @@ def cli(
         if verbose and is_gzipped:
             click.echo("Detected gzipped file", err=True)

-        # Read the TSV file (handles both plain and gzipped)
-        df = pl.read_csv(input_file, separator="\t")
-
-        if verbose:
-            click.echo(
-                f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns", err=True
-            )
-
         # Read pedigree file if provided
         pedigree_df = None
         if pedigree:
@@ -95,16 +87,7 @@
                 click.echo(f"Reading pedigree file: {pedigree}", err=True)
             pedigree_df = read_pedigree(pedigree)

-        # Process the dataframe
-        formatted_df = format_bcftools_tsv(df, pedigree_df)
-
-        if verbose:
-            click.echo(
-                f"Output shape: {formatted_df.shape[0]} rows, {formatted_df.shape[1]} columns",
-                err=True,
-            )
-
-        # Apply filters if provided
+        # Load filter config if provided
         filter_config_data = None
         if filter_config:
             if verbose:
@@ -128,30 +111,36 @@
         else:
             output = input_stem

-        # Apply filters and write output
+        # Use streaming approach with lazy API
+        if verbose:
+            click.echo("Processing with streaming mode...", err=True)
+
+        # Build lazy query
+        lazy_df = pl.scan_csv(input_file, separator="\t")
+
+        # Apply formatting transformations
+        lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
+
+        # Apply filters if provided
         if filter_config_data:
-            apply_filters_and_write(
-                formatted_df,
-                filter_config_data,
-                output,
-                output_format,
-                verbose,
-            )
-        else:
-            # No filters - write single output file
-            # Construct output filename with prefix and format
-            output_path = Path(f"{output}.{output_format}")
+            lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)

-            if output_format == "tsv":
-                formatted_df.write_csv(output_path, separator="\t")
-            elif output_format == "tsv.gz":
-                csv_content = formatted_df.write_csv(separator="\t")
-                with gzip.open(output_path, "wt") as f:
-                    f.write(csv_content)
-            elif output_format == "parquet":
-                formatted_df.write_parquet(output_path)
+        # Write output
+        output_path = Path(f"{output}.{output_format}")

-            click.echo(f"Formatted data written to {output_path}", err=True)
+        if output_format == "tsv":
+            lazy_df.sink_csv(output_path, separator="\t")
+        elif output_format == "tsv.gz":
+            # For gzip, we need to collect and write
+            df = lazy_df.collect()
+            csv_content = df.write_csv(separator="\t")
+            with gzip.open(output_path, "wt") as f:
+                f.write(csv_content)
+        elif output_format == "parquet":
+            lazy_df.sink_parquet(output_path)
+
+        if verbose:
+            click.echo(f"Data written to {output_path}", err=True)

     except Exception as e:
         click.echo(f"Error: {e}", err=True)
@@ -957,5 +946,142 @@ def format_bcftools_tsv(
     return melted_df


+def format_bcftools_tsv_lazy(
+    lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
+) -> pl.LazyFrame:
+    """
+    Format a bcftools tabulated TSV using lazy operations for streaming.
+
+    This is a simplified version that collects minimally for complex operations.
+    """
+    # For complex transformations like melting, we need to collect temporarily
+    # but we do this in a streaming fashion
+    df = lazy_df.collect(streaming=True)
+    formatted_df = format_bcftools_tsv(df, pedigree_df)
+    return formatted_df.lazy()
+
+
+def apply_filters_lazy(
+    lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
+) -> pl.LazyFrame:
+    """Apply quality and expression filters using lazy operations."""
+    quality_config = filter_config.get("quality", {})
+    expression = filter_config.get("expression")
+
+    # Apply quality filters
+    if quality_config:
+        # Filter: sample_gt must contain at least one '1' (default: true)
+        filter_no_alt = quality_config.get("filter_no_alt_allele", True)
+        if filter_no_alt:
+            lazy_df = lazy_df.filter(
+                pl.col("sample_gt").str.contains("1")
+                | pl.col("sample_gt").str.contains("2")
+            )
+
+        # Apply minimum depth filter
+        if "sample_dp_min" in quality_config:
+            min_dp = quality_config["sample_dp_min"]
+            lazy_df = lazy_df.filter(
+                pl.col("sample_dp").cast(pl.Float64, strict=False) >= min_dp
+            )
+
+        # Apply minimum GQ filter
+        if "sample_gq_min" in quality_config:
+            min_gq = quality_config["sample_gq_min"]
+            lazy_df = lazy_df.filter(
+                pl.col("sample_gq").cast(pl.Float64, strict=False) >= min_gq
+            )
+
+        # VAF filters for heterozygous (0/1 or 1/0)
+        if (
+            "sample_vaf_het_min" in quality_config
+            or "sample_vaf_het_max" in quality_config
+        ):
+            # Check if genotype is het (contains one '1' and one '0', no '2')
+            is_het = (
+                (pl.col("sample_gt").str.count_matches("1") == 1)
+                & (pl.col("sample_gt").str.count_matches("0") == 1)
+                & (~pl.col("sample_gt").str.contains("2"))
+            )
+
+            het_conditions = []
+            if "sample_vaf_het_min" in quality_config:
+                het_conditions.append(
+                    pl.col("sample_vaf") >= quality_config["sample_vaf_het_min"]
+                )
+            if "sample_vaf_het_max" in quality_config:
+                het_conditions.append(
+                    pl.col("sample_vaf") <= quality_config["sample_vaf_het_max"]
+                )
+
+            if het_conditions:
+                het_filter = het_conditions[0]
+                for cond in het_conditions[1:]:
+                    het_filter = het_filter & cond
+
+                lazy_df = lazy_df.filter(~is_het | het_filter)
+
+        # VAF filter for homozygous alternate (1/1)
+        if "sample_vaf_homalt_min" in quality_config:
+            is_homalt = pl.col("sample_gt") == "1/1"
+            lazy_df = lazy_df.filter(
+                ~is_homalt
+                | (pl.col("sample_vaf") >= quality_config["sample_vaf_homalt_min"])
+            )
+
+        # VAF filter for homozygous reference (0/0)
+        if "sample_vaf_hom_ref_max" in quality_config:
+            is_hom_ref = pl.col("sample_gt") == "0/0"
+            lazy_df = lazy_df.filter(
+                ~is_hom_ref
+                | (pl.col("sample_vaf") <= quality_config["sample_vaf_hom_ref_max"])
+            )
+
+        # Apply same filters to parents if requested
+        apply_to_parents = quality_config.get("apply_to_parents", False)
+        if apply_to_parents:
+            # Father filters
+            if "sample_dp_min" in quality_config:
+                min_dp = quality_config["sample_dp_min"]
+                lazy_df = lazy_df.filter(
+                    (pl.col("father_dp").is_null())
+                    | (pl.col("father_dp").cast(pl.Float64, strict=False) >= min_dp)
+                )
+
+            if "sample_gq_min" in quality_config:
+                min_gq = quality_config["sample_gq_min"]
+                lazy_df = lazy_df.filter(
+                    (pl.col("father_gq").is_null())
+                    | (pl.col("father_gq").cast(pl.Float64, strict=False) >= min_gq)
+                )
+
+            # Mother filters
+            if "sample_dp_min" in quality_config:
+                min_dp = quality_config["sample_dp_min"]
+                lazy_df = lazy_df.filter(
+                    (pl.col("mother_dp").is_null())
+                    | (pl.col("mother_dp").cast(pl.Float64, strict=False) >= min_dp)
+                )
+
+            if "sample_gq_min" in quality_config:
+                min_gq = quality_config["sample_gq_min"]
+                lazy_df = lazy_df.filter(
+                    (pl.col("mother_gq").is_null())
+                    | (pl.col("mother_gq").cast(pl.Float64, strict=False) >= min_gq)
+                )
+
+    # Apply expression filter if provided
+    if expression:
+        if verbose:
+            click.echo(f"Applying expression filter: {expression}", err=True)
+
+        # We need to collect temporarily to use parse_impact_filter_expression
+        df = lazy_df.collect(streaming=True)
+        filter_expr = parse_impact_filter_expression(expression, df)
+        lazy_df = df.lazy().filter(filter_expr)
+
+    return lazy_df
+
+
 if __name__ == "__main__":
     cli()
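
Note: a small, self-contained illustration of the ~is_het | het_filter masking idiom used in apply_filters_lazy above: rows outside the targeted genotype class pass through untouched, while matching rows must also satisfy the VAF bounds. The data and thresholds are toy values, not from the package.

    import polars as pl

    lf = pl.LazyFrame(
        {
            "sample_gt": ["0/1", "1/1", "0/0", "0/1"],
            "sample_vaf": [0.45, 0.98, 0.02, 0.10],
        }
    )

    # Heterozygous: exactly one '1' and one '0', and no '2' allele.
    is_het = (
        (pl.col("sample_gt").str.count_matches("1") == 1)
        & (pl.col("sample_gt").str.count_matches("0") == 1)
        & ~pl.col("sample_gt").str.contains("2")
    )
    # Example VAF window for het calls (illustrative bounds).
    het_ok = (pl.col("sample_vaf") >= 0.3) & (pl.col("sample_vaf") <= 0.7)

    # Non-het rows are kept as-is; het rows must pass the VAF window,
    # so only the 0/1 row with VAF 0.10 is dropped.
    print(lf.filter(~is_het | het_ok).collect())
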

pywombat-0.2.0.dist-info/METADATA → pywombat-0.3.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pywombat
-Version: 0.2.0
+Version: 0.3.0
 Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
 Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
 Project-URL: Repository, https://github.com/bourgeron-lab/pywombat

pywombat-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+pywombat/cli.py,sha256=kv03IFXcwe9pdv-KyoT5Cu1pJ9r-O7ww-Kh0ZT2ysa4,38920
+pywombat-0.3.0.dist-info/METADATA,sha256=eASint-XgzgUGWshtZYr4nekDCs-VKSTilHLRupH5ic,4982
+pywombat-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pywombat-0.3.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+pywombat-0.3.0.dist-info/RECORD,,

pywombat-0.2.0.dist-info/RECORD DELETED
@@ -1,6 +0,0 @@
-pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
-pywombat/cli.py,sha256=PZKV6FoqZyGgG7_mMIO2FzyeONdBaCqnhDATYsQJqMo,33899
-pywombat-0.2.0.dist-info/METADATA,sha256=7Qg2XnaTM92pmIewu5fw_vrcQW5JCVkkj2q6mNC9v88,4982
-pywombat-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-pywombat-0.2.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
-pywombat-0.2.0.dist-info/RECORD,,