pywombat 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ output.tsv
12
+ tests/*
13
+ test.short.tabulated.filter_simple.tsv
14
+ test.short.tabulated.filter_simple.tsv.gz
15
+ test.short.tabulated.filter.tsv
16
+ test.tabulated.filter.HIGH.tsv
17
+ test.tabulated.filter.HIGH.tsv.gz
18
+ test.tabulated.filter.MODERATE.tsv.gz
19
+ test.tabulated.filter.RARE.tsv.gz
20
+ .DS_Store
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pywombat
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
5
5
  Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
6
6
  Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "pywombat"
3
- version = "0.1.0"
3
+ version = "0.3.0"
4
4
  description = "A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "Freddy Cliquet", email = "fcliquet@pasteur.fr" }]
@@ -1,5 +1,6 @@
1
1
  """CLI for wombat tool."""
2
2
 
3
+ import gzip
3
4
  import re
4
5
  import warnings
5
6
  from pathlib import Path
@@ -22,9 +23,9 @@ import yaml
22
23
  "-f",
23
24
  "--format",
24
25
  "output_format",
25
- type=click.Choice(["tsv", "parquet"], case_sensitive=False),
26
+ type=click.Choice(["tsv", "tsv.gz", "parquet"], case_sensitive=False),
26
27
  default="tsv",
27
- help="Output format: tsv (default) or parquet.",
28
+ help="Output format: tsv (default), tsv.gz (compressed), or parquet.",
28
29
  )
29
30
  @click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
30
31
  @click.option(
@@ -73,13 +74,11 @@ def cli(
73
74
  if verbose:
74
75
  click.echo(f"Reading input file: {input_file}", err=True)
75
76
 
76
- # Read the TSV file
77
- df = pl.read_csv(input_file, separator="\t")
77
+ # Detect if file is gzipped based on extension
78
+ is_gzipped = str(input_file).endswith(".gz")
78
79
 
79
- if verbose:
80
- click.echo(
81
- f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns", err=True
82
- )
80
+ if verbose and is_gzipped:
81
+ click.echo("Detected gzipped file", err=True)
83
82
 
84
83
  # Read pedigree file if provided
85
84
  pedigree_df = None
@@ -88,52 +87,60 @@ def cli(
88
87
  click.echo(f"Reading pedigree file: {pedigree}", err=True)
89
88
  pedigree_df = read_pedigree(pedigree)
90
89
 
91
- # Process the dataframe
92
- formatted_df = format_bcftools_tsv(df, pedigree_df)
93
-
94
- if verbose:
95
- click.echo(
96
- f"Output shape: {formatted_df.shape[0]} rows, {formatted_df.shape[1]} columns",
97
- err=True,
98
- )
99
-
100
- # Apply filters if provided
90
+ # Load filter config if provided
101
91
  filter_config_data = None
102
92
  if filter_config:
103
93
  if verbose:
104
94
  click.echo(f"Reading filter config: {filter_config}", err=True)
105
95
  filter_config_data = load_filter_config(filter_config)
106
96
 
107
- # Apply filters and write output
97
+ # Determine output prefix
98
+ if output is None:
99
+ # Generate default output prefix from input filename
100
+ input_stem = input_file.name
101
+ # Remove .tsv.gz or .tsv extension
102
+ if input_stem.endswith(".tsv.gz"):
103
+ input_stem = input_stem[:-7] # Remove .tsv.gz
104
+ elif input_stem.endswith(".tsv"):
105
+ input_stem = input_stem[:-4] # Remove .tsv
106
+
107
+ # Add config name if filter is provided
108
+ if filter_config:
109
+ config_name = filter_config.stem # Get basename without extension
110
+ output = f"{input_stem}.{config_name}"
111
+ else:
112
+ output = input_stem
113
+
114
+ # Use streaming approach with lazy API
115
+ if verbose:
116
+ click.echo("Processing with streaming mode...", err=True)
117
+
118
+ # Build lazy query
119
+ lazy_df = pl.scan_csv(input_file, separator="\t")
120
+
121
+ # Apply formatting transformations
122
+ lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
123
+
124
+ # Apply filters if provided
108
125
  if filter_config_data:
109
- apply_filters_and_write(
110
- formatted_df,
111
- filter_config_data,
112
- output,
113
- output_format,
114
- verbose,
115
- )
116
- else:
117
- # No filters - write single output file
118
- if output:
119
- # Construct output filename with prefix and format
120
- output_path = Path(f"{output}.{output_format}")
126
+ lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)
121
127
 
122
- if output_format == "tsv":
123
- formatted_df.write_csv(output_path, separator="\t")
124
- elif output_format == "parquet":
125
- formatted_df.write_parquet(output_path)
128
+ # Write output
129
+ output_path = Path(f"{output}.{output_format}")
126
130
 
127
- click.echo(f"Formatted data written to {output_path}", err=True)
128
- else:
129
- # Write to stdout (only for TSV format)
130
- if output_format != "tsv":
131
- click.echo(
132
- "Error: stdout output only supported for TSV format. Use -o to specify an output prefix for parquet.",
133
- err=True,
134
- )
135
- raise click.Abort()
136
- click.echo(formatted_df.write_csv(separator="\t"), nl=False)
131
+ if output_format == "tsv":
132
+ lazy_df.sink_csv(output_path, separator="\t")
133
+ elif output_format == "tsv.gz":
134
+ # For gzip, we need to collect and write
135
+ df = lazy_df.collect()
136
+ csv_content = df.write_csv(separator="\t")
137
+ with gzip.open(output_path, "wt") as f:
138
+ f.write(csv_content)
139
+ elif output_format == "parquet":
140
+ lazy_df.sink_parquet(output_path)
141
+
142
+ if verbose:
143
+ click.echo(f"Data written to {output_path}", err=True)
137
144
 
138
145
  except Exception as e:
139
146
  click.echo(f"Error: {e}", err=True)
@@ -555,11 +562,15 @@ def apply_impact_filters(
555
562
  )
556
563
 
557
564
  # Write to file
558
- output_filename = f"{output_prefix}_{name}.{output_format}"
565
+ output_filename = f"{output_prefix}.{name}.{output_format}"
559
566
  output_path = Path(output_filename)
560
567
 
561
568
  if output_format == "tsv":
562
569
  filtered_df.write_csv(output_path, separator="\t")
570
+ elif output_format == "tsv.gz":
571
+ csv_content = filtered_df.write_csv(separator="\t")
572
+ with gzip.open(output_path, "wt") as f:
573
+ f.write(csv_content)
563
574
  elif output_format == "parquet":
564
575
  filtered_df.write_parquet(output_path)
565
576
 
@@ -599,6 +610,10 @@ def apply_filters_and_write(
599
610
 
600
611
  if output_format == "tsv":
601
612
  filtered_df.write_csv(output_path, separator="\t")
613
+ elif output_format == "tsv.gz":
614
+ csv_content = filtered_df.write_csv(separator="\t")
615
+ with gzip.open(output_path, "wt") as f:
616
+ f.write(csv_content)
602
617
  elif output_format == "parquet":
603
618
  filtered_df.write_parquet(output_path)
604
619
 
@@ -931,5 +946,142 @@ def format_bcftools_tsv(
931
946
  return melted_df
932
947
 
933
948
 
949
+ def format_bcftools_tsv_lazy(
950
+ lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
951
+ ) -> pl.LazyFrame:
952
+ """
953
+ Format a bcftools tabulated TSV using lazy operations for streaming.
954
+
955
+ This is a simplified version that collects minimally for complex operations.
956
+ """
957
+ # For complex transformations like melting, we need to collect temporarily
958
+ # but we do this in a streaming fashion
959
+ df = lazy_df.collect(streaming=True)
960
+ formatted_df = format_bcftools_tsv(df, pedigree_df)
961
+ return formatted_df.lazy()
962
+
963
+
964
+ def apply_filters_lazy(
965
+ lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
966
+ ) -> pl.LazyFrame:
967
+ """Apply quality and expression filters using lazy operations."""
968
+ quality_config = filter_config.get("quality", {})
969
+ expression = filter_config.get("expression")
970
+
971
+ # Apply quality filters
972
+ if quality_config:
973
+ # Filter: sample_gt must contain at least one '1' (default: true)
974
+ filter_no_alt = quality_config.get("filter_no_alt_allele", True)
975
+ if filter_no_alt:
976
+ lazy_df = lazy_df.filter(
977
+ pl.col("sample_gt").str.contains("1")
978
+ | pl.col("sample_gt").str.contains("2")
979
+ )
980
+
981
+ # Apply minimum depth filter
982
+ if "sample_dp_min" in quality_config:
983
+ min_dp = quality_config["sample_dp_min"]
984
+ lazy_df = lazy_df.filter(
985
+ pl.col("sample_dp").cast(pl.Float64, strict=False) >= min_dp
986
+ )
987
+
988
+ # Apply minimum GQ filter
989
+ if "sample_gq_min" in quality_config:
990
+ min_gq = quality_config["sample_gq_min"]
991
+ lazy_df = lazy_df.filter(
992
+ pl.col("sample_gq").cast(pl.Float64, strict=False) >= min_gq
993
+ )
994
+
995
+ # VAF filters for heterozygous (0/1 or 1/0)
996
+ if (
997
+ "sample_vaf_het_min" in quality_config
998
+ or "sample_vaf_het_max" in quality_config
999
+ ):
1000
+ # Check if genotype is het (contains one '1' and one '0', no '2')
1001
+ is_het = (
1002
+ (pl.col("sample_gt").str.count_matches("1") == 1)
1003
+ & (pl.col("sample_gt").str.count_matches("0") == 1)
1004
+ & (~pl.col("sample_gt").str.contains("2"))
1005
+ )
1006
+
1007
+ het_conditions = []
1008
+ if "sample_vaf_het_min" in quality_config:
1009
+ het_conditions.append(
1010
+ pl.col("sample_vaf") >= quality_config["sample_vaf_het_min"]
1011
+ )
1012
+ if "sample_vaf_het_max" in quality_config:
1013
+ het_conditions.append(
1014
+ pl.col("sample_vaf") <= quality_config["sample_vaf_het_max"]
1015
+ )
1016
+
1017
+ if het_conditions:
1018
+ het_filter = het_conditions[0]
1019
+ for cond in het_conditions[1:]:
1020
+ het_filter = het_filter & cond
1021
+
1022
+ lazy_df = lazy_df.filter(~is_het | het_filter)
1023
+
1024
+ # VAF filter for homozygous alternate (1/1)
1025
+ if "sample_vaf_homalt_min" in quality_config:
1026
+ is_homalt = pl.col("sample_gt") == "1/1"
1027
+ lazy_df = lazy_df.filter(
1028
+ ~is_homalt
1029
+ | (pl.col("sample_vaf") >= quality_config["sample_vaf_homalt_min"])
1030
+ )
1031
+
1032
+ # VAF filter for homozygous reference (0/0)
1033
+ if "sample_vaf_hom_ref_max" in quality_config:
1034
+ is_hom_ref = pl.col("sample_gt") == "0/0"
1035
+ lazy_df = lazy_df.filter(
1036
+ ~is_hom_ref
1037
+ | (pl.col("sample_vaf") <= quality_config["sample_vaf_hom_ref_max"])
1038
+ )
1039
+
1040
+ # Apply same filters to parents if requested
1041
+ apply_to_parents = quality_config.get("apply_to_parents", False)
1042
+ if apply_to_parents:
1043
+ # Father filters
1044
+ if "sample_dp_min" in quality_config:
1045
+ min_dp = quality_config["sample_dp_min"]
1046
+ lazy_df = lazy_df.filter(
1047
+ (pl.col("father_dp").is_null())
1048
+ | (pl.col("father_dp").cast(pl.Float64, strict=False) >= min_dp)
1049
+ )
1050
+
1051
+ if "sample_gq_min" in quality_config:
1052
+ min_gq = quality_config["sample_gq_min"]
1053
+ lazy_df = lazy_df.filter(
1054
+ (pl.col("father_gq").is_null())
1055
+ | (pl.col("father_gq").cast(pl.Float64, strict=False) >= min_gq)
1056
+ )
1057
+
1058
+ # Mother filters
1059
+ if "sample_dp_min" in quality_config:
1060
+ min_dp = quality_config["sample_dp_min"]
1061
+ lazy_df = lazy_df.filter(
1062
+ (pl.col("mother_dp").is_null())
1063
+ | (pl.col("mother_dp").cast(pl.Float64, strict=False) >= min_dp)
1064
+ )
1065
+
1066
+ if "sample_gq_min" in quality_config:
1067
+ min_gq = quality_config["sample_gq_min"]
1068
+ lazy_df = lazy_df.filter(
1069
+ (pl.col("mother_gq").is_null())
1070
+ | (pl.col("mother_gq").cast(pl.Float64, strict=False) >= min_gq)
1071
+ )
1072
+
1073
+ # Apply expression filter if provided
1074
+ if expression:
1075
+ if verbose:
1076
+ click.echo(f"Applying expression filter: {expression}", err=True)
1077
+
1078
+ # We need to collect temporarily to use parse_impact_filter_expression
1079
+ df = lazy_df.collect(streaming=True)
1080
+ filter_expr = parse_impact_filter_expression(expression, df)
1081
+ lazy_df = df.lazy().filter(filter_expr)
1082
+
1083
+ return lazy_df
1084
+
1085
+
934
1086
  if __name__ == "__main__":
935
1087
  cli()
@@ -51,7 +51,7 @@ wheels = [
51
51
 
52
52
  [[package]]
53
53
  name = "pywombat"
54
- version = "0.1.0"
54
+ version = "0.2.0"
55
55
  source = { editable = "." }
56
56
  dependencies = [
57
57
  { name = "click" },
pywombat-0.1.0/.gitignore DELETED
@@ -1,12 +0,0 @@
1
- # Python-generated files
2
- __pycache__/
3
- *.py[oc]
4
- build/
5
- dist/
6
- wheels/
7
- *.egg-info
8
-
9
- # Virtual environments
10
- .venv
11
- output.tsv
12
- tests/*
File without changes
File without changes
File without changes