smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
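Taken together, the list shows the shape of the release: the monolithic smftools/load_adata.py and smftools/cli.py are removed in favor of a smftools/cli/ package plus cli_entry.py, and most informatics/helpers/ modules either move under archived/ or are consolidated into new per-topic modules (bam_functions.py, bed_functions.py, fasta_functions.py, and so on). For orientation, a minimal sketch that only assumes the listed module paths are importable in an environment with the 0.2.4 wheel installed (nothing about their contents is shown in this diff):

    # Hypothetical layout check; module paths are copied from the RECORD above,
    # and nothing about the symbols inside them is assumed.
    import importlib

    for mod in [
        "smftools.cli_entry",                           # new CLI entry module (+434)
        "smftools.informatics.bam_functions",           # consolidated BAM helpers (+811)
        "smftools.informatics.modkit_extract_to_adata", # promoted out of helpers/
    ]:
        importlib.import_module(mod)
        print(f"importable: {mod}")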
--- a/smftools/informatics/helpers/bed_to_bigwig.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# bed_to_bigwig
-
-def bed_to_bigwig(fasta, bed):
-    """
-    Takes a bed file of reads and makes a bedgraph plus a bigwig
-
-    Parameters:
-        fasta (str): File path to the reference genome to align to.
-        bed (str): File path to the input bed.
-    Returns:
-        None
-    """
-    import os
-    import subprocess
-
-    bed_basename = os.path.basename(bed)
-    parent_dir = os.path.dirname(bed)
-    bed_basename_minus_suffix = bed_basename.split('.bed')[0]
-    fasta_basename = os.path.basename(fasta)
-    fasta_dir = os.path.dirname(fasta)
-    fasta_basename_minus_suffix = fasta_basename.split('.fa')[0]
-    chrom_basename = fasta_basename_minus_suffix + '.chrom.sizes'
-    chrom_path = os.path.join(fasta_dir, chrom_basename)
-    bedgraph_basename = bed_basename_minus_suffix + '_bedgraph.bedgraph'
-    bedgraph_output = os.path.join(parent_dir, bedgraph_basename)
-    bigwig_basename = bed_basename_minus_suffix + '_bigwig.bw'
-    bigwig_output = os.path.join(parent_dir, bigwig_basename)
-
-    # Make the bedgraph
-    with open(bedgraph_output, 'w') as outfile:
-        # Command as a list
-        command = ["bedtools", "genomecov", "-i", bed, "-g", chrom_path, "-bg"]
-        print(f'Making bedgraph from {bed_basename}')
-        subprocess.run(command, stdout=outfile)
-
-    # Make the bigwig
-    command = ["bedGraphToBigWig", bedgraph_output, chrom_path, bigwig_output]
-    print(f'Making bigwig from {bedgraph_basename}')
-    subprocess.run(command)
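For context, a minimal sketch of how this removed helper was invoked (paths hypothetical): it shells out to bedtools genomecov and bedGraphToBigWig, so both must be on PATH, and it expects a pre-built <fasta_stem>.chrom.sizes file next to the reference FASTA.

    # Hypothetical invocation; requires bedtools and bedGraphToBigWig on PATH
    # and an existing /data/ref/ref.chrom.sizes built beforehand.
    from smftools.informatics.helpers.bed_to_bigwig import bed_to_bigwig  # 0.2.1 path

    bed_to_bigwig("/data/ref/ref.fa", "/data/runs/reads.bed")
    # writes /data/runs/reads_bedgraph.bedgraph and /data/runs/reads_bigwig.bw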
--- a/smftools/informatics/helpers/concatenate_fastqs_to_bam.py
+++ /dev/null
@@ -1,378 +0,0 @@
-# concatenate_fastqs_to_bam
-
-def concatenate_fastqs_to_bam(
-    fastq_files,
-    output_bam,
-    barcode_tag='BC',
-    gzip_suffixes=('.gz',),
-    barcode_map=None,
-    add_read_group=True,
-    rg_sample_field=None,
-    progress=True,
-    auto_pair=True,
-):
-    """
-    Concatenate FASTQ(s) into an unaligned BAM. Supports single-end and paired-end (auto-detect or explicit).
-
-    Parameters
-    ----------
-    fastq_files : list[str] or list[(str,str)]
-        If list of tuples: each tuple is (R1_path, R2_path).
-        If list of strings and auto_pair=True: the function will attempt to automatically pair files.
-    output_bam : str
-        Path to output BAM (will be overwritten).
-    barcode_tag : str
-        SAM tag used for barcode (default 'BC').
-    gzip_suffixes : tuple
-        Compressed suffixes to consider (default ('.gz',)).
-    barcode_map : dict or None
-        Optional mapping {path: barcode} to override automatic extraction.
-    add_read_group : bool
-        If True, add RG entries and set RG tag per-read (ID = barcode).
-    rg_sample_field : str or None
-        If set, includes SM field in RG header entries.
-    progress : bool
-        Show tqdm progress bar.
-    auto_pair : bool
-        If True and `fastq_files` is a list of strings, attempt to auto-pair R1/R2 by filename patterns.
-
-    Returns
-    -------
-    dict
-        Summary: {'total_reads', 'per_file_counts', 'paired_count', 'unpaired_count', 'barcodes'}
-    """
-    import os
-    import re
-    import gzip
-    from itertools import zip_longest
-    from Bio import SeqIO
-    import pysam
-    from tqdm import tqdm
-
-    # ---------- helpers ----------
-    def _is_gz(path):
-        pl = path.lower()
-        return any(pl.endswith(suf) for suf in gzip_suffixes)
-
-    def _strip_fastq_ext(basn):
-        # remove .fastq.gz .fq.gz .fastq .fq
-        for ext in ('.fastq.gz', '.fq.gz', '.fastq', '.fq'):
-            if basn.lower().endswith(ext):
-                return basn[:-len(ext)]
-        # fallback remove last suffix
-        return os.path.splitext(basn)[0]
-
-    def _extract_barcode_from_filename(path):
-        # heuristic: barcode is last underscore-separated token in filename (before ext)
-        stem = _strip_fastq_ext(os.path.basename(path))
-        if '_' in stem:
-            token = stem.split('_')[-1]
-            if token:
-                return token
-        # fallback to whole stem
-        return stem
-
-    # pairing heuristics: try to identify suffix that marks read number
-    def _classify_read_token(stem):
-        # returns (prefix, readnum) if matches, else (None, None)
-        patterns = [
-            r'(?i)(.*?)[._-]r?([12])$',  # prefix_R1 or prefix.r1 or prefix-1
-            r'(?i)(.*?)[._-]read[_-]?([12])$',
-            r'(?i)(.*?)[/_]([12])$',  # sometimes /1 is used (rare in filenames)
-        ]
-        for pat in patterns:
-            m = re.match(pat, stem)
-            if m:
-                prefix = m.group(1)
-                num = m.group(2)
-                return prefix, int(num)
-        return None, None
-
-    def pair_by_filename(paths):
-        # paths: list of strings
-        map_pref = {}  # prefix -> {1: path, 2: path, 'orphans': [..]}
-        unpaired = []
-        for p in paths:
-            name = os.path.basename(p)
-            stem = _strip_fastq_ext(name)
-            pref, num = _classify_read_token(stem)
-            if pref is not None:
-                entry = map_pref.setdefault(pref, {})
-                entry[num] = p
-            else:
-                # try fallback: split by last underscore or dot and check last token is 1/2 or R1/R2
-                toks = re.split(r'[_\.]', stem)
-                if toks and toks[-1] in ('1', '2', 'R1', 'R2', 'r1', 'r2'):
-                    last = toks[-1]
-                    basepref = "_".join(toks[:-1]) if len(toks) > 1 else toks[0]
-                    num = 1 if last.endswith('1') else 2
-                    entry = map_pref.setdefault(basepref, {})
-                    entry[num] = p
-                else:
-                    unpaired.append(p)
-        pairs = []
-        leftovers = []
-        for k, d in map_pref.items():
-            if 1 in d and 2 in d:
-                pairs.append((d[1], d[2]))
-            else:
-                # put whoever exists into leftovers
-                leftovers.extend([v for kk, v in d.items()])
-        # append also unpaired
-        leftovers.extend(unpaired)
-        return pairs, leftovers
-
-    # ---------- normalize input ----------
-    explicit_pairs = []
-    singles = []
-    if not isinstance(fastq_files, (list, tuple)):
-        raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
-
-    # mixture: if user supplied tuples -> treat as explicit pairs
-    if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
-        explicit_pairs = [(str(a), str(b)) for a, b in fastq_files]
-    else:
-        # flatten and coerce to strings, ignore None
-        paths = [str(x) for x in fastq_files if x is not None]
-        if auto_pair:
-            explicit_pairs, leftovers = pair_by_filename(paths)
-            singles = leftovers
-        else:
-            singles = paths
-
-    # Build barcode map and ordered barcodes
-    barcode_map = barcode_map or {}
-    per_path_barcode = {}
-    barcodes_in_order = []
-
-    # pairs: assign barcode per pair from either provided barcode_map for first file or from filenames
-    for r1, r2 in explicit_pairs:
-        bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
-        per_path_barcode[r1] = bc
-        per_path_barcode[r2] = bc
-        if bc not in barcodes_in_order:
-            barcodes_in_order.append(bc)
-    for p in singles:
-        bc = barcode_map.get(p) or _extract_barcode_from_filename(p)
-        per_path_barcode[p] = bc
-        if bc not in barcodes_in_order:
-            barcodes_in_order.append(bc)
-
-    # prepare BAM header
-    header = {"HD": {"VN": "1.0"}, "SQ": []}
-    if add_read_group:
-        rg_list = []
-        for bc in barcodes_in_order:
-            rg = {"ID": bc}
-            if rg_sample_field:
-                rg["SM"] = rg_sample_field
-            rg_list.append(rg)
-        header["RG"] = rg_list
-
-    # ---------- write BAM ----------
-    per_file_counts = {}
-    total_written = 0
-    paired_count = 0
-    unpaired_count = 0
-
-    def _open_fh(path):
-        return gzip.open(path, "rt") if _is_gz(path) else open(path, "rt")
-
-    with pysam.AlignmentFile(output_bam, "wb", header=header) as bam_out:
-        # process paired files first
-        seq_iter = list(explicit_pairs)  # list of (r1,r2)
-        if progress:
-            seq_iter = tqdm(seq_iter, desc="Paired FASTQ->BAM")
-        for r1_path, r2_path in seq_iter:
-            if not (os.path.exists(r1_path) and os.path.exists(r2_path)):
-                raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
-            bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
-            # open both and iterate in parallel
-            with _open_fh(r1_path) as fh1, _open_fh(r2_path) as fh2:
-                it1 = SeqIO.parse(fh1, "fastq")
-                it2 = SeqIO.parse(fh2, "fastq")
-                # iterate in lockstep; if one shorter we still write remaining as unpaired (zip_longest)
-                for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
-                    # determine a common read name
-                    if rec1 is not None:
-                        id1 = rec1.id
-                    else:
-                        id1 = None
-                    if rec2 is not None:
-                        id2 = rec2.id
-                    else:
-                        id2 = None
-                    # try to derive a common name (strip /1 or /2 if present)
-                    def _strip_end_id(s):
-                        if s is None:
-                            return None
-                        return re.sub(r'(?:/1$|/2$|\s[12]$)', '', s)
-                    common_name = _strip_end_id(id1) or _strip_end_id(id2) or (id1 or id2)
-
-                    # create AlignedSegment for read1
-                    if rec1 is not None:
-                        a1 = pysam.AlignedSegment()
-                        a1.query_name = common_name
-                        a1.query_sequence = str(rec1.seq)
-                        a1.is_paired = True
-                        a1.is_read1 = True
-                        a1.is_read2 = False
-                        a1.is_unmapped = True
-                        a1.mate_is_unmapped = True
-                        # reference fields for unmapped
-                        a1.reference_id = -1
-                        a1.reference_start = -1
-                        a1.next_reference_id = -1
-                        a1.next_reference_start = -1
-                        a1.template_length = 0
-                        # qualities
-                        if "phred_quality" in rec1.letter_annotations:
-                            try:
-                                a1.query_qualities = [int(x) for x in rec1.letter_annotations["phred_quality"]]
-                            except Exception:
-                                a1.query_qualities = None
-                        # tags
-                        a1.set_tag(barcode_tag, str(bc), value_type='Z')
-                        if add_read_group:
-                            a1.set_tag("RG", str(bc), value_type='Z')
-                        bam_out.write(a1)
-                        per_file_counts.setdefault(r1_path, 0)
-                        per_file_counts[r1_path] += 1
-                        total_written += 1
-                    # create AlignedSegment for read2
-                    if rec2 is not None:
-                        a2 = pysam.AlignedSegment()
-                        a2.query_name = common_name
-                        a2.query_sequence = str(rec2.seq)
-                        a2.is_paired = True
-                        a2.is_read1 = False
-                        a2.is_read2 = True
-                        a2.is_unmapped = True
-                        a2.mate_is_unmapped = True
-                        a2.reference_id = -1
-                        a2.reference_start = -1
-                        a2.next_reference_id = -1
-                        a2.next_reference_start = -1
-                        a2.template_length = 0
-                        if "phred_quality" in rec2.letter_annotations:
-                            try:
-                                a2.query_qualities = [int(x) for x in rec2.letter_annotations["phred_quality"]]
-                            except Exception:
-                                a2.query_qualities = None
-                        a2.set_tag(barcode_tag, str(bc), value_type='Z')
-                        if add_read_group:
-                            a2.set_tag("RG", str(bc), value_type='Z')
-                        bam_out.write(a2)
-                        per_file_counts.setdefault(r2_path, 0)
-                        per_file_counts[r2_path] += 1
-                        total_written += 1
-                    # count paired/unpaired bookkeeping
-                    if rec1 is not None and rec2 is not None:
-                        paired_count += 1
-                    else:
-                        # one side missing -> counted as unpaired for whichever exists
-                        if rec1 is not None:
-                            unpaired_count += 1
-                        if rec2 is not None:
-                            unpaired_count += 1
-
-        # process singletons
-        single_iter = list(singles)
-        if progress:
-            single_iter = tqdm(single_iter, desc="Single FASTQ->BAM")
-        for p in single_iter:
-            if not os.path.exists(p):
-                raise FileNotFoundError(p)
-            bc = per_path_barcode.get(p, "barcode")
-            with _open_fh(p) as fh:
-                for rec in SeqIO.parse(fh, "fastq"):
-                    a = pysam.AlignedSegment()
-                    a.query_name = rec.id
-                    a.query_sequence = str(rec.seq)
-                    a.is_paired = False
-                    a.is_read1 = False
-                    a.is_read2 = False
-                    a.is_unmapped = True
-                    a.mate_is_unmapped = True
-                    a.reference_id = -1
-                    a.reference_start = -1
-                    a.next_reference_id = -1
-                    a.next_reference_start = -1
-                    a.template_length = 0
-                    if "phred_quality" in rec.letter_annotations:
-                        try:
-                            a.query_qualities = [int(x) for x in rec.letter_annotations["phred_quality"]]
-                        except Exception:
-                            a.query_qualities = None
-                    a.set_tag(barcode_tag, str(bc), value_type='Z')
-                    if add_read_group:
-                        a.set_tag("RG", str(bc), value_type='Z')
-                    bam_out.write(a)
-                    per_file_counts.setdefault(p, 0)
-                    per_file_counts[p] += 1
-                    total_written += 1
-                    unpaired_count += 1
-
-    summary = {
-        "total_reads": total_written,
-        "per_file": per_file_counts,
-        "paired_pairs_written": paired_count,
-        "singletons_written": unpaired_count,
-        "barcodes": barcodes_in_order
-    }
-    return summary
-
-
-# def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
-#     """
-#     Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
-
-#     Parameters:
-#         fastq_files (list): List of paths to demultiplexed FASTQ files.
-#         output_bam (str): Path to the output BAM file.
-#         barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
-#         gzip_suffix (str): Suffix to use for input gzip files (Defaul: '.gz')
-
-#     Returns:
-#         None
-#     """
-#     import os
-#     import pysam
-#     import gzip
-#     from Bio import SeqIO
-#     from tqdm import tqdm
-
-#     n_fastqs = len(fastq_files)
-
-#     with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
-#         for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
-#             # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
-#             base_name = os.path.basename(fastq_file)
-#             if n_fastqs > 1:
-#                 if base_name.endswith('.fastq.gz'):
-#                     barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
-#                 elif base_name.endswith('.fq.gz'):
-#                     barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
-#                 elif base_name.endswith('.fastq'):
-#                     barcode = base_name.split('_')[-1].replace('.fastq', '')
-#                 elif base_name.endswith('.fq'):
-#                     barcode = base_name.split('_')[-1].replace('.fq', '')
-#                 else:
-#                     raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
-#             else:
-#                 barcode = 'barcode0'
-
-#             # Read the FASTQ file (handle gzipped and non-gzipped files)
-#             open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
-#             with open_func(fastq_file, 'rt') as fq_in:
-#                 for record in SeqIO.parse(fq_in, 'fastq'):
-#                     # Create an unaligned BAM entry for each FASTQ record
-#                     aln = pysam.AlignedSegment()
-#                     aln.query_name = record.id
-#                     aln.query_sequence = str(record.seq)
-#                     aln.flag = 4  # Unmapped
-#                     aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
-#                     # Add the barcode to the BC tag
-#                     aln.set_tag(barcode_tag, barcode)
-#                     # Write to BAM file
-#                     bam_out.write(aln)
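A usage sketch for the removed function (file names hypothetical). Note that the dict it actually returns uses the keys 'per_file', 'paired_pairs_written', and 'singletons_written', which differ slightly from the docstring above.

    # Explicit (R1, R2) tuples:
    summary = concatenate_fastqs_to_bam(
        [("s1_R1.fastq.gz", "s1_R2.fastq.gz")],
        output_bam="unaligned.bam",
        rg_sample_field="sample1",
    )
    # Flat list, auto-paired by _R1/_R2 filename tokens; the orphan is written
    # as a singleton with barcode 'barcode03' parsed from its filename:
    summary = concatenate_fastqs_to_bam(
        ["s2_R1.fq.gz", "s2_R2.fq.gz", "extra_barcode03.fq"],
        output_bam="unaligned2.bam",
    )
    print(summary["total_reads"], summary["barcodes"])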
--- a/smftools/informatics/helpers/discover_input_files.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from pathlib import Path
-from typing import Dict, List, Any, Tuple
-
-def discover_input_files(
-    input_data_path: str,
-    bam_suffix: str = ".bam",
-    recursive: bool = False,
-    follow_symlinks: bool = False,
-) -> Dict[str, Any]:
-    """
-    Discover input files under `input_data_path`.
-
-    Returns a dict with:
-      - pod5_paths, fast5_paths, fastq_paths, bam_paths (lists of str)
-      - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam (bools)
-      - all_files_searched (int)
-    Behavior:
-      - If `input_data_path` is a file, returns that single file categorized.
-      - If it is a directory, scans either immediate children (recursive=False)
-        or entire tree (recursive=True). Uses Path.suffixes to detect .fastq.gz etc.
-    """
-    p = Path(input_data_path)
-    pod5_exts = {".pod5", ".p5"}
-    fast5_exts = {".fast5", ".f5"}
-    fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.xz", ".fq.xz"}
-    # normalize bam suffix with leading dot
-    if not bam_suffix.startswith("."):
-        bam_suffix = "." + bam_suffix
-    bam_suffix = bam_suffix.lower()
-
-    pod5_paths: List[str] = []
-    fast5_paths: List[str] = []
-    fastq_paths: List[str] = []
-    bam_paths: List[str] = []
-    other_paths: List[str] = []
-
-    def _file_ext_key(pp: Path) -> str:
-        # join suffixes to handle .fastq.gz
-        return "".join(pp.suffixes).lower() if pp.suffixes else pp.suffix.lower()
-
-    if p.exists() and p.is_file():
-        ext_key = _file_ext_key(p)
-        if ext_key in pod5_exts:
-            pod5_paths.append(str(p))
-        elif ext_key in fast5_exts:
-            fast5_paths.append(str(p))
-        elif ext_key in fastq_exts:
-            fastq_paths.append(str(p))
-        elif ext_key == bam_suffix:
-            bam_paths.append(str(p))
-        else:
-            other_paths.append(str(p))
-        total_searched = 1
-    elif p.exists() and p.is_dir():
-        if recursive:
-            iterator = p.rglob("*")
-        else:
-            iterator = p.iterdir()
-        total_searched = 0
-        for fp in iterator:
-            if not fp.is_file():
-                continue
-            total_searched += 1
-            ext_key = _file_ext_key(fp)
-            if ext_key in pod5_exts:
-                pod5_paths.append(str(fp))
-            elif ext_key in fast5_exts:
-                fast5_paths.append(str(fp))
-            elif ext_key in fastq_exts:
-                fastq_paths.append(str(fp))
-            elif ext_key == bam_suffix:
-                bam_paths.append(str(fp))
-            else:
-                # additional heuristic: check filename contains extension fragments (.pod5 etc)
-                name = fp.name.lower()
-                if any(e in name for e in pod5_exts):
-                    pod5_paths.append(str(fp))
-                elif any(e in name for e in fast5_exts):
-                    fast5_paths.append(str(fp))
-                elif any(e in name for e in [".fastq", ".fq"]):
-                    fastq_paths.append(str(fp))
-                elif name.endswith(bam_suffix):
-                    bam_paths.append(str(fp))
-                else:
-                    other_paths.append(str(fp))
-    else:
-        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")
-
-    return {
-        "pod5_paths": sorted(pod5_paths),
-        "fast5_paths": sorted(fast5_paths),
-        "fastq_paths": sorted(fastq_paths),
-        "bam_paths": sorted(bam_paths),
-        "other_paths": sorted(other_paths),
-        "input_is_pod5": len(pod5_paths) > 0,
-        "input_is_fast5": len(fast5_paths) > 0,
-        "input_is_fastq": len(fastq_paths) > 0,
-        "input_is_bam": len(bam_paths) > 0,
-        "all_files_searched": total_searched,
-    }
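A sketch of the removed discovery helper in use (directory hypothetical). Note that `follow_symlinks` is accepted but never referenced in the body above; its replacement lives at smftools/config/discover_input_files.py in 0.2.4.

    found = discover_input_files("/data/run1", bam_suffix="bam", recursive=True)
    if found["input_is_fastq"]:
        print(len(found["fastq_paths"]), "FASTQ files out of",
              found["all_files_searched"], "files scanned")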
--- a/smftools/informatics/helpers/index_fasta.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# index_fasta
-
-def index_fasta(fasta):
-    """
-    Generate a FASTA index file for an input fasta.
-
-    Parameters:
-        fasta (str): Path to the input fasta to make an index file for.
-    """
-    import subprocess
-
-    subprocess.run(["samtools", "faidx", fasta])
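Equivalent to running `samtools faidx` directly; a hypothetical call (samtools must be on PATH):

    index_fasta("/data/ref/ref.fa")  # writes /data/ref/ref.fa.fai next to the input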
--- a/smftools/informatics/helpers/make_dirs.py
+++ /dev/null
@@ -1,21 +0,0 @@
-## make_dirs
-
-# General
-def make_dirs(directories):
-    """
-    Takes a list of file paths and makes new directories if the directory does not already exist.
-
-    Parameters:
-        directories (list): A list of directories to make
-
-    Returns:
-        None
-    """
-    import os
-
-    for directory in directories:
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
-            print(f"Directory '{directory}' created successfully.")
-        else:
-            print(f"Directory '{directory}' already exists.")
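A hypothetical call. Because the helper uses os.mkdir rather than os.makedirs, each parent must already exist or appear earlier in the list:

    make_dirs(["out", "out/bams"])  # "out" is created before "out/bams"
    # make_dirs(["out/bams"]) alone raises FileNotFoundError if "out" is missing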
--- a/smftools/informatics/readwrite.py
+++ /dev/null
@@ -1,106 +0,0 @@
-## readwrite ##
-
-######################################################################################################
-## Datetime functionality
-def date_string():
-    """
-    Each time this is called, it returns the current date string
-    """
-    from datetime import datetime
-    current_date = datetime.now()
-    date_string = current_date.strftime("%Y%m%d")
-    date_string = date_string[2:]
-    return date_string
-
-def time_string():
-    """
-    Each time this is called, it returns the current time string
-    """
-    from datetime import datetime
-    current_time = datetime.now()
-    return current_time.strftime("%H:%M:%S")
-######################################################################################################
-
-######################################################################################################
-## Numpy, Pandas, Anndata functionality
-def adata_to_df(adata, layer=None):
-    """
-    Input: An adata object with a specified layer.
-    Output: A dataframe for the specific layer.
-    """
-    import pandas as pd
-    import anndata as ad
-
-    # Extract the data matrix from the given layer
-    if layer:
-        data_matrix = adata.layers[layer]
-    else:
-        data_matrix = adata.X
-    # Extract observation (read) annotations
-    obs_df = adata.obs
-    # Extract variable (position) annotations
-    var_df = adata.var
-    # Convert data matrix and annotations to pandas DataFrames
-    df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
-    return df
-
-def save_matrix(matrix, save_name):
-    """
-    Input: A numpy matrix and a save_name
-    Output: A txt file representation of the data matrix
-    """
-    import numpy as np
-    np.savetxt(f'{save_name}.txt', matrix)
-
-def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
-    """
-    Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-    Input: an output file path relative to the directory in which the function is called
-    """
-    import os
-    import anndata as ad
-    # Runtime warnings
-    import warnings
-    warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-    warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-    # List all files in the directory
-    files = os.listdir(os.getcwd())
-    # get current working directory
-    cwd = os.getcwd()
-    suffix = file_suffix
-    # Filter file names that contain the search string in their filename and keep them in a list
-    hdfs = [hdf for hdf in files if suffix in hdf]
-    # Sort file list by names and print the list of file names
-    hdfs.sort()
-    print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-    # Iterate over all of the hdf5 files and concatenate them.
-    final_adata = None
-    for hdf in hdfs:
-        print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-        temp_adata = ad.read_h5ad(hdf)
-        if final_adata:
-            print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-            final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-        else:
-            print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
-            final_adata = temp_adata
-    print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-    final_adata.write_h5ad(output_file, compression='gzip')
-
-    # Delete the individual h5ad files and only keep the final concatenated file
-    if delete_inputs:
-        files = os.listdir(os.getcwd())
-        hdfs = [hdf for hdf in files if suffix in hdf]
-        if output_file in hdfs:
-            hdfs.remove(output_file)
-        # Iterate over the files and delete them
-        for hdf in hdfs:
-            try:
-                os.remove(hdf)
-                print(f"Deleted file: {hdf}")
-            except OSError as e:
-                print(f"Error deleting file {hdf}: {e}")
-    else:
-        print('Keeping input files')
-######################################################################################################