smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- smftools/__init__.py +9 -4
- smftools/_version.py +1 -1
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +0 -2
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/fast5_to_pod5.py +4 -1
- smftools/informatics/helpers/__init__.py +3 -4
- smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
- smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
- smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +29 -3
- smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
- smftools/informatics/helpers/find_conversion_sites.py +5 -4
- smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
- smftools/informatics/helpers/split_and_index_BAM.py +1 -5
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/general_plotting.py +566 -89
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +13 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +849 -43
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/evaluation/__init__.py +0 -0
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/binarize_converted_base_identities.py

@@ -1,4 +1,4 @@
-def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu'):
+def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu', deaminase_footprinting=False, mismatch_trend_per_read={}, on_missing="nan"):
     """
     Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
 
@@ -7,38 +7,131 @@
         strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
         modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
         bam (str): The bam file path
-
+        deaminase_footprinting (bool): Whether direct deaminase footprinting chemistry was used.
+        mismatch_trend_per_read (dict): For deaminase footprinting, indicates the type of conversion relative to the top strand reference for each read. (C->T or G->A if bottom strand was converted)
+        on_missing (str): Error handling if a read is missing
+
     Returns:
         dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
+        If deaminase_footprinting, 1 represents deaminated sites, while 0 represents non-deaminated sites.
     """
     import numpy as np
 
+    if mismatch_trend_per_read is None:
+        mismatch_trend_per_read = {}
+
+    # Fast path
+    if modification_type == "unconverted" and not deaminase_footprinting:
+        return {k: np.full(len(v), np.nan, dtype=np.float32) for k, v in base_identities.items()}
+
+    out = {}
+
+    if deaminase_footprinting:
+        valid_trends = {"C->T", "G->A"}
+
+        for read_id, bases in base_identities.items():
+            trend_raw = mismatch_trend_per_read.get(read_id, None)
+            if trend_raw is None:
+                if on_missing == "error":
+                    raise KeyError(f"Missing mismatch trend for read '{read_id}'")
+                out[read_id] = np.full(len(bases), np.nan, dtype=np.float32)
+                continue
+
+            trend = trend_raw.replace(" ", "").upper()
+            if trend not in valid_trends:
+                if on_missing == "error":
+                    raise KeyError(
+                        f"Invalid mismatch trend '{trend_raw}' for read '{read_id}'. "
+                        f"Expected one of {sorted(valid_trends)}"
+                    )
+                out[read_id] = np.full(len(bases), np.nan, dtype=np.float32)
+                continue
+
+            arr = np.asarray(bases, dtype="<U1")
+            res = np.full(arr.shape, np.nan, dtype=np.float32)
+
+            if trend == "C->T":
+                # C (unconverted) -> 0, T (converted) -> 1
+                res[arr == "C"] = 0.0
+                res[arr == "T"] = 1.0
+            else:  # "G->A"
+                res[arr == "G"] = 0.0
+                res[arr == "A"] = 1.0
+
+            out[read_id] = res
+
+        return out
+
+    # Non-deaminase mapping (bisulfite-style for 5mC; 6mA mapping is protocol dependent)
+    bin_maps = {
+        ("top", "5mC"): {"C": 1.0, "T": 0.0},
+        ("bottom", "5mC"): {"G": 1.0, "A": 0.0},
+        ("top", "6mA"): {"A": 1.0, "G": 0.0},
+        ("bottom", "6mA"): {"T": 1.0, "C": 0.0},
+    }
+    key = (strand, modification_type)
+    if key not in bin_maps:
         raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
 
+    base_map = bin_maps[key]
+
+    for read_id, bases in base_identities.items():
+        arr = np.asarray(bases, dtype="<U1")
+        res = np.full(arr.shape, np.nan, dtype=np.float32)
+        # mask-assign; unknown characters (N, -, etc.) remain NaN
+        for b, v in base_map.items():
+            res[arr == b] = v
+        out[read_id] = res
+
+    return out
+
+    # if mismatch_trend_per_read is None:
+    #     mismatch_trend_per_read = {}
+
+    # # If the modification type is 'unconverted', return NaN for all positions if the deaminase_footprinting strategy is not being used.
+    # if modification_type == "unconverted" and not deaminase_footprinting:
+    #     #print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+    #     return {key: np.full(len(bases), np.nan) for key, bases in base_identities.items()}
+
+    # # Define mappings for binarization based on strand and modification type
+    # if deaminase_footprinting:
+    #     binarization_maps = {
+    #         ('C->T'): {'C': 0, 'T': 1},
+    #         ('G->A'): {'G': 0, 'A': 1},
+    #     }
+
+    #     binarized_base_identities = {}
+    #     for key, bases in base_identities.items():
+    #         arr = np.array(bases, dtype='<U1')
+    #         # Fetch the appropriate mapping
+    #         conversion_type = mismatch_trend_per_read[key]
+    #         base_map = binarization_maps.get(conversion_type, None)
+    #         binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(arr) # Apply mapping with fallback to NaN
+    #         binarized_base_identities[key] = binarized
+
+    #     return binarized_base_identities
+
+    # else:
+    #     binarization_maps = {
+    #         ('top', '5mC'): {'C': 1, 'T': 0},
+    #         ('top', '6mA'): {'A': 1, 'G': 0},
+    #         ('bottom', '5mC'): {'G': 1, 'A': 0},
+    #         ('bottom', '6mA'): {'T': 1, 'C': 0}
+    #     }
+
+    #     if (strand, modification_type) not in binarization_maps:
+    #         raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    #     # Fetch the appropriate mapping
+    #     base_map = binarization_maps[(strand, modification_type)]
 
-    binarized_base_identities = {}
-    for key, bases in base_identities.items():
+    # binarized_base_identities = {}
+    # for key, bases in base_identities.items():
+    #     arr = np.array(bases, dtype='<U1')
+    #     binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(arr) # Apply mapping with fallback to NaN
+    #     binarized_base_identities[key] = binarized
 
-    return binarized_base_identities
+    # return binarized_base_identities
 # import torch
 
 # # If the modification type is 'unconverted', return NaN for all positions
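The rewrite gives the helper two modes: the original strand/modification mapping and a per-read mismatch-trend mode for deaminase footprinting. A minimal usage sketch with the function above in scope; the reads, trends, and BAM path below are made-up examples, not data from the package:

    import numpy as np

    # Hypothetical per-read base identities at candidate sites.
    base_identities = {
        "read1": ["C", "T", "C", "N"],
        "read2": ["T", "T", "C", "C"],
    }

    # Conversion mode: top-strand 5mC, so C -> 1 (protected) and T -> 0 (converted).
    meth = binarize_converted_base_identities(
        base_identities, strand="top", modification_type="5mC", bam="example.bam"
    )
    # meth["read1"] -> array([1., 0., 1., nan], dtype=float32); "N" stays NaN.

    # Deaminase mode: the per-read mismatch trend picks which base pair to binarize.
    trends = {"read1": "C->T"}  # "read2" intentionally missing
    deam = binarize_converted_base_identities(
        base_identities, strand="top", modification_type="5mC", bam="example.bam",
        deaminase_footprinting=True, mismatch_trend_per_read=trends, on_missing="nan",
    )
    # deam["read1"] -> array([0., 1., 0., nan]); deam["read2"] is all NaN because its
    # trend is missing and on_missing="nan" (on_missing="error" would raise KeyError).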
smftools/informatics/helpers/concatenate_fastqs_to_bam.py

@@ -1,55 +1,378 @@
 # concatenate_fastqs_to_bam
 
-def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
+def concatenate_fastqs_to_bam(
+    fastq_files,
+    output_bam,
+    barcode_tag='BC',
+    gzip_suffixes=('.gz',),
+    barcode_map=None,
+    add_read_group=True,
+    rg_sample_field=None,
+    progress=True,
+    auto_pair=True,
+):
     """
-    Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
+    Concatenate FASTQ(s) into an unaligned BAM. Supports single-end and paired-end (auto-detect or explicit).
 
-    Parameters:
+    Parameters
+    ----------
+    fastq_files : list[str] or list[(str,str)]
+        If list of tuples: each tuple is (R1_path, R2_path).
+        If list of strings and auto_pair=True: the function will attempt to automatically pair files.
+    output_bam : str
+        Path to output BAM (will be overwritten).
+    barcode_tag : str
+        SAM tag used for barcode (default 'BC').
+    gzip_suffixes : tuple
+        Compressed suffixes to consider (default ('.gz',)).
+    barcode_map : dict or None
+        Optional mapping {path: barcode} to override automatic extraction.
+    add_read_group : bool
+        If True, add RG entries and set RG tag per-read (ID = barcode).
+    rg_sample_field : str or None
+        If set, includes SM field in RG header entries.
+    progress : bool
+        Show tqdm progress bar.
+    auto_pair : bool
+        If True and `fastq_files` is a list of strings, attempt to auto-pair R1/R2 by filename patterns.
 
-    Returns:
+    Returns
+    -------
+    dict
+        Summary: {'total_reads', 'per_file_counts', 'paired_count', 'unpaired_count', 'barcodes'}
     """
     import os
-    import pysam
+    import re
     import gzip
+    from itertools import zip_longest
     from Bio import SeqIO
+    import pysam
     from tqdm import tqdm
 
-        for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
+    # ---------- helpers ----------
+    def _is_gz(path):
+        pl = path.lower()
+        return any(pl.endswith(suf) for suf in gzip_suffixes)
+
+    def _strip_fastq_ext(basn):
+        # remove .fastq.gz .fq.gz .fastq .fq
+        for ext in ('.fastq.gz', '.fq.gz', '.fastq', '.fq'):
+            if basn.lower().endswith(ext):
+                return basn[:-len(ext)]
+        # fallback remove last suffix
+        return os.path.splitext(basn)[0]
+
+    def _extract_barcode_from_filename(path):
+        # heuristic: barcode is last underscore-separated token in filename (before ext)
+        stem = _strip_fastq_ext(os.path.basename(path))
+        if '_' in stem:
+            token = stem.split('_')[-1]
+            if token:
+                return token
+        # fallback to whole stem
+        return stem
+
+    # pairing heuristics: try to identify suffix that marks read number
+    def _classify_read_token(stem):
+        # returns (prefix, readnum) if matches, else (None, None)
+        patterns = [
+            r'(?i)(.*?)[._-]r?([12])$',  # prefix_R1 or prefix.r1 or prefix-1
+            r'(?i)(.*?)[._-]read[_-]?([12])$',
+            r'(?i)(.*?)[/_]([12])$',  # sometimes /1 is used (rare in filenames)
+        ]
+        for pat in patterns:
+            m = re.match(pat, stem)
+            if m:
+                prefix = m.group(1)
+                num = m.group(2)
+                return prefix, int(num)
+        return None, None
+
+    def pair_by_filename(paths):
+        # paths: list of strings
+        map_pref = {}  # prefix -> {1: path, 2: path, 'orphans': [..]}
+        unpaired = []
+        for p in paths:
+            name = os.path.basename(p)
+            stem = _strip_fastq_ext(name)
+            pref, num = _classify_read_token(stem)
+            if pref is not None:
+                entry = map_pref.setdefault(pref, {})
+                entry[num] = p
+            else:
+                # try fallback: split by last underscore or dot and check last token is 1/2 or R1/R2
+                toks = re.split(r'[_\.]', stem)
+                if toks and toks[-1] in ('1', '2', 'R1', 'R2', 'r1', 'r2'):
+                    last = toks[-1]
+                    basepref = "_".join(toks[:-1]) if len(toks) > 1 else toks[0]
+                    num = 1 if last.endswith('1') else 2
+                    entry = map_pref.setdefault(basepref, {})
+                    entry[num] = p
                 else:
+                    unpaired.append(p)
+        pairs = []
+        leftovers = []
+        for k, d in map_pref.items():
+            if 1 in d and 2 in d:
+                pairs.append((d[1], d[2]))
             else:
+                # put whoever exists into leftovers
+                leftovers.extend([v for kk, v in d.items()])
+        # append also unpaired
+        leftovers.extend(unpaired)
+        return pairs, leftovers
+
+    # ---------- normalize input ----------
+    explicit_pairs = []
+    singles = []
+    if not isinstance(fastq_files, (list, tuple)):
+        raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
+
+    # mixture: if user supplied tuples -> treat as explicit pairs
+    if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
+        explicit_pairs = [(str(a), str(b)) for a, b in fastq_files]
+    else:
+        # flatten and coerce to strings, ignore None
+        paths = [str(x) for x in fastq_files if x is not None]
+        if auto_pair:
+            explicit_pairs, leftovers = pair_by_filename(paths)
+            singles = leftovers
+        else:
+            singles = paths
+
+    # Build barcode map and ordered barcodes
+    barcode_map = barcode_map or {}
+    per_path_barcode = {}
+    barcodes_in_order = []
+
+    # pairs: assign barcode per pair from either provided barcode_map for first file or from filenames
+    for r1, r2 in explicit_pairs:
+        bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
+        per_path_barcode[r1] = bc
+        per_path_barcode[r2] = bc
+        if bc not in barcodes_in_order:
+            barcodes_in_order.append(bc)
+    for p in singles:
+        bc = barcode_map.get(p) or _extract_barcode_from_filename(p)
+        per_path_barcode[p] = bc
+        if bc not in barcodes_in_order:
+            barcodes_in_order.append(bc)
+
+    # prepare BAM header
+    header = {"HD": {"VN": "1.0"}, "SQ": []}
+    if add_read_group:
+        rg_list = []
+        for bc in barcodes_in_order:
+            rg = {"ID": bc}
+            if rg_sample_field:
+                rg["SM"] = rg_sample_field
+            rg_list.append(rg)
+        header["RG"] = rg_list
+
+    # ---------- write BAM ----------
+    per_file_counts = {}
+    total_written = 0
+    paired_count = 0
+    unpaired_count = 0
+
+    def _open_fh(path):
+        return gzip.open(path, "rt") if _is_gz(path) else open(path, "rt")
+
+    with pysam.AlignmentFile(output_bam, "wb", header=header) as bam_out:
+        # process paired files first
+        seq_iter = list(explicit_pairs)  # list of (r1,r2)
+        if progress:
+            seq_iter = tqdm(seq_iter, desc="Paired FASTQ->BAM")
+        for r1_path, r2_path in seq_iter:
+            if not (os.path.exists(r1_path) and os.path.exists(r2_path)):
+                raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
+            bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
+            # open both and iterate in parallel
+            with _open_fh(r1_path) as fh1, _open_fh(r2_path) as fh2:
+                it1 = SeqIO.parse(fh1, "fastq")
+                it2 = SeqIO.parse(fh2, "fastq")
+                # iterate in lockstep; if one shorter we still write remaining as unpaired (zip_longest)
+                for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
+                    # determine a common read name
+                    if rec1 is not None:
+                        id1 = rec1.id
+                    else:
+                        id1 = None
+                    if rec2 is not None:
+                        id2 = rec2.id
+                    else:
+                        id2 = None
+                    # try to derive a common name (strip /1 or /2 if present)
+                    def _strip_end_id(s):
+                        if s is None:
+                            return None
+                        return re.sub(r'(?:/1$|/2$|\s[12]$)', '', s)
+                    common_name = _strip_end_id(id1) or _strip_end_id(id2) or (id1 or id2)
+
+                    # create AlignedSegment for read1
+                    if rec1 is not None:
+                        a1 = pysam.AlignedSegment()
+                        a1.query_name = common_name
+                        a1.query_sequence = str(rec1.seq)
+                        a1.is_paired = True
+                        a1.is_read1 = True
+                        a1.is_read2 = False
+                        a1.is_unmapped = True
+                        a1.mate_is_unmapped = True
+                        # reference fields for unmapped
+                        a1.reference_id = -1
+                        a1.reference_start = -1
+                        a1.next_reference_id = -1
+                        a1.next_reference_start = -1
+                        a1.template_length = 0
+                        # qualities
+                        if "phred_quality" in rec1.letter_annotations:
+                            try:
+                                a1.query_qualities = [int(x) for x in rec1.letter_annotations["phred_quality"]]
+                            except Exception:
+                                a1.query_qualities = None
+                        # tags
+                        a1.set_tag(barcode_tag, str(bc), value_type='Z')
+                        if add_read_group:
+                            a1.set_tag("RG", str(bc), value_type='Z')
+                        bam_out.write(a1)
+                        per_file_counts.setdefault(r1_path, 0)
+                        per_file_counts[r1_path] += 1
+                        total_written += 1
+                    # create AlignedSegment for read2
+                    if rec2 is not None:
+                        a2 = pysam.AlignedSegment()
+                        a2.query_name = common_name
+                        a2.query_sequence = str(rec2.seq)
+                        a2.is_paired = True
+                        a2.is_read1 = False
+                        a2.is_read2 = True
+                        a2.is_unmapped = True
+                        a2.mate_is_unmapped = True
+                        a2.reference_id = -1
+                        a2.reference_start = -1
+                        a2.next_reference_id = -1
+                        a2.next_reference_start = -1
+                        a2.template_length = 0
+                        if "phred_quality" in rec2.letter_annotations:
+                            try:
+                                a2.query_qualities = [int(x) for x in rec2.letter_annotations["phred_quality"]]
+                            except Exception:
+                                a2.query_qualities = None
+                        a2.set_tag(barcode_tag, str(bc), value_type='Z')
+                        if add_read_group:
+                            a2.set_tag("RG", str(bc), value_type='Z')
+                        bam_out.write(a2)
+                        per_file_counts.setdefault(r2_path, 0)
+                        per_file_counts[r2_path] += 1
+                        total_written += 1
+                    # count paired/unpaired bookkeeping
+                    if rec1 is not None and rec2 is not None:
+                        paired_count += 1
+                    else:
+                        # one side missing -> counted as unpaired for whichever exists
+                        if rec1 is not None:
+                            unpaired_count += 1
+                        if rec2 is not None:
+                            unpaired_count += 1
+
+        # process singletons
+        single_iter = list(singles)
+        if progress:
+            single_iter = tqdm(single_iter, desc="Single FASTQ->BAM")
+        for p in single_iter:
+            if not os.path.exists(p):
+                raise FileNotFoundError(p)
+            bc = per_path_barcode.get(p, "barcode")
+            with _open_fh(p) as fh:
+                for rec in SeqIO.parse(fh, "fastq"):
+                    a = pysam.AlignedSegment()
+                    a.query_name = rec.id
+                    a.query_sequence = str(rec.seq)
+                    a.is_paired = False
+                    a.is_read1 = False
+                    a.is_read2 = False
+                    a.is_unmapped = True
+                    a.mate_is_unmapped = True
+                    a.reference_id = -1
+                    a.reference_start = -1
+                    a.next_reference_id = -1
+                    a.next_reference_start = -1
+                    a.template_length = 0
+                    if "phred_quality" in rec.letter_annotations:
+                        try:
+                            a.query_qualities = [int(x) for x in rec.letter_annotations["phred_quality"]]
+                        except Exception:
+                            a.query_qualities = None
+                    a.set_tag(barcode_tag, str(bc), value_type='Z')
+                    if add_read_group:
+                        a.set_tag("RG", str(bc), value_type='Z')
+                    bam_out.write(a)
+                    per_file_counts.setdefault(p, 0)
+                    per_file_counts[p] += 1
+                    total_written += 1
+                    unpaired_count += 1
+
+    summary = {
+        "total_reads": total_written,
+        "per_file": per_file_counts,
+        "paired_pairs_written": paired_count,
+        "singletons_written": unpaired_count,
+        "barcodes": barcodes_in_order
+    }
+    return summary
+
+
+# def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
+#     """
+#     Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
+
+#     Parameters:
+#         fastq_files (list): List of paths to demultiplexed FASTQ files.
+#         output_bam (str): Path to the output BAM file.
+#         barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
+#         gzip_suffix (str): Suffix to use for input gzip files (Defaul: '.gz')
+
+#     Returns:
+#         None
+#     """
+#     import os
+#     import pysam
+#     import gzip
+#     from Bio import SeqIO
+#     from tqdm import tqdm
+
+#     n_fastqs = len(fastq_files)
+
+#     with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
+#         for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
+#             # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
+#             base_name = os.path.basename(fastq_file)
+#             if n_fastqs > 1:
+#                 if base_name.endswith('.fastq.gz'):
+#                     barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
+#                 elif base_name.endswith('.fq.gz'):
+#                     barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
+#                 elif base_name.endswith('.fastq'):
+#                     barcode = base_name.split('_')[-1].replace('.fastq', '')
+#                 elif base_name.endswith('.fq'):
+#                     barcode = base_name.split('_')[-1].replace('.fq', '')
+#                 else:
+#                     raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
+#             else:
+#                 barcode = 'barcode0'
+
+#             # Read the FASTQ file (handle gzipped and non-gzipped files)
+#             open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
+#             with open_func(fastq_file, 'rt') as fq_in:
+#                 for record in SeqIO.parse(fq_in, 'fastq'):
+#                     # Create an unaligned BAM entry for each FASTQ record
+#                     aln = pysam.AlignedSegment()
+#                     aln.query_name = record.id
+#                     aln.query_sequence = str(record.seq)
+#                     aln.flag = 4  # Unmapped
+#                     aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
+#                     # Add the barcode to the BC tag
+#                     aln.set_tag(barcode_tag, barcode)
+#                     # Write to BAM file
+#                     bam_out.write(aln)