pywombat 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of a package, as those versions appear in the public registry they were published to. It is provided for informational purposes only.
pywombat/cli.py CHANGED
@@ -1198,15 +1198,18 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
1198
1198
  pedigree_df = df.select(select_cols)
1199
1199
 
1200
1200
  # Replace 0 and -9 with null (indicating no parent)
1201
+ # Explicit cast to Utf8 ensures type is preserved even when all values become null
1201
1202
  pedigree_df = pedigree_df.with_columns(
1202
1203
  [
1203
1204
  pl.when(pl.col("father_id").cast(pl.Utf8).is_in(["0", "-9"]))
1204
1205
  .then(None)
1205
1206
  .otherwise(pl.col("father_id"))
1207
+ .cast(pl.Utf8)
1206
1208
  .alias("father_id"),
1207
1209
  pl.when(pl.col("mother_id").cast(pl.Utf8).is_in(["0", "-9"]))
1208
1210
  .then(None)
1209
1211
  .otherwise(pl.col("mother_id"))
1212
+ .cast(pl.Utf8)
1210
1213
  .alias("mother_id"),
1211
1214
  ]
1212
1215
  )
@@ -1313,6 +1316,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
1313
1316
  This is a separate step that can be applied after filtering to avoid
1314
1317
  expensive annotation expansion on variants that will be filtered out.
1315
1318
 
1319
+ Handles two types of INFO fields:
1320
+ - Key-value pairs (e.g., "DP=30") -> extracted as string values
1321
+ - Boolean flags (e.g., "PASS", "DB") -> created as True/False columns
1322
+
1316
1323
  Args:
1317
1324
  df: DataFrame with (null) column
1318
1325
 
@@ -1324,9 +1331,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
1324
1331
  # Already expanded or missing - return as-is
1325
1332
  return df
1326
1333
 
1327
- # Extract all unique field names from the (null) column
1334
+ # Extract all unique field names and flags from the (null) column
1328
1335
  null_values = df.select("(null)").to_series()
1329
1336
  all_fields = set()
1337
+ all_flags = set()
1330
1338
 
1331
1339
  for value in null_values:
1332
1340
  if value and not (isinstance(value, float)): # Skip null/NaN values
@@ -1335,8 +1343,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
1335
1343
  if "=" in pair:
1336
1344
  field_name = pair.split("=", 1)[0]
1337
1345
  all_fields.add(field_name)
1346
+ elif pair.strip(): # Boolean flag (no '=')
1347
+ all_flags.add(pair.strip())
1338
1348
 
1339
- # Create expressions to extract each field
1349
+ # Create expressions to extract each key-value field
1340
1350
  for field in sorted(all_fields):
1341
1351
  # Extract the field value from the (null) column
1342
1352
  # Pattern: extract value after "field=" and before ";" or end of string
@@ -1344,6 +1354,14 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
1344
1354
  pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
1345
1355
  )
1346
1356
 
1357
+ # Create boolean columns for flags
1358
+ for flag in sorted(all_flags):
1359
+ # Check if flag appears in the (null) column as a complete ';'-delimited entry
1360
+ # Use regex to match flag as a separate field (not part of another field name)
1361
+ df = df.with_columns(
1362
+ pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
1363
+ )
1364
+
1347
1365
  # Drop the original (null) column
1348
1366
  df = df.drop("(null)")
1349
1367
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pywombat
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
5
5
  Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
6
6
  Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -35,6 +35,7 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
35
35
  🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
36
36
  📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
37
37
  🎯 **Expression Filters**: Complex filtering with logical expressions
38
+ 🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
38
39
  ⚡ **Streaming Mode**: Memory-efficient processing of large files
39
40
 
40
41
  ---
@@ -77,7 +78,7 @@ uv run wombat input.tsv -o output
77
78
 
78
79
  PyWombat transforms bcftools tabulated TSV files into analysis-ready formats by:
79
80
 
80
- 1. **Expanding the `(null)` INFO column**: Extracts all `NAME=value` fields (e.g., `DP=30;AF=0.5;AC=2`) into separate columns
81
+ 1. **Expanding the `(null)` INFO column**: Extracts all `NAME=value` fields (e.g., `DP=30;AF=0.5;AC=2`) and boolean flags (e.g., `PASS`, `DB`) into separate columns
81
82
  2. **Melting sample columns**: Converts wide-format sample data into long format with one row per variant-sample combination
82
83
  3. **Extracting genotype data**: Parses `GT:DP:GQ:AD` format into separate columns with calculated VAF
83
84
  4. **Adding parent data**: Joins father/mother genotypes when pedigree is provided
@@ -88,20 +89,22 @@ PyWombat transforms bcftools tabulated TSV files into analysis-ready formats by:
88
89
  **Input (Wide Format):**
89
90
 
90
91
  ```tsv
91
- #CHROM POS REF ALT (null) Sample1:GT:DP:GQ:AD Sample2:GT:DP:GQ:AD
92
- chr1 100 A T DP=30;AF=0.5;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
92
+ #CHROM POS REF ALT (null) Sample1:GT:DP:GQ:AD Sample2:GT:DP:GQ:AD
93
+ chr1 100 A T DP=30;AF=0.5;PASS;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
93
94
  ```
94
95
 
95
96
  **Output (Long Format):**
96
97
 
97
98
  ```tsv
98
- #CHROM POS REF ALT AC AF DP sample sample_gt sample_dp sample_gq sample_ad sample_vaf
99
- chr1 100 A T 2 0.5 30 Sample1 0/1 15 99 10 0.6667
100
- chr1 100 A T 2 0.5 30 Sample2 1/1 18 99 18 1.0
99
+ #CHROM POS REF ALT AC AF DP PASS sample sample_gt sample_dp sample_gq sample_ad sample_vaf
100
+ chr1 100 A T 2 0.5 30 true Sample1 0/1 15 99 10 0.6667
101
+ chr1 100 A T 2 0.5 30 true Sample2 1/1 18 99 18 1.0
101
102
  ```
102
103
 
103
104
  **Generated Columns:**
104
105
 
106
+ - INFO fields with `=`: Extracted as separate columns (e.g., `DP`, `AF`, `AC`)
107
+ - INFO boolean flags: Extracted as True/False columns (e.g., `PASS`, `DB`, `SOMATIC`)
105
108
  - `sample`: Sample identifier
106
109
  - `sample_gt`: Genotype (e.g., 0/1, 1/1)
107
110
  - `sample_dp`: Read depth (total coverage)
@@ -0,0 +1,6 @@
1
+ pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
2
+ pywombat/cli.py,sha256=76pVpYYyl9rCm6TCq86j7xAqEC9pOvjiWCX9MmrBB_o,74994
3
+ pywombat-1.0.2.dist-info/METADATA,sha256=slcagFwSvA99GVzKLRZggFPy5dkLEh_09O3dIB8Hfr4,17168
4
+ pywombat-1.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
+ pywombat-1.0.2.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
6
+ pywombat-1.0.2.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
2
- pywombat/cli.py,sha256=FK1bEKtFD1Drp5LNdXaVie4zyjYbZc3wTbsjms-wISU,74176
3
- pywombat-1.0.0.dist-info/METADATA,sha256=bIm5-Az795PLluvA_6yBPcHkcq6EOZbvB_g-4jPjx_U,16828
4
- pywombat-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
- pywombat-1.0.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
6
- pywombat-1.0.0.dist-info/RECORD,,