spacer-count 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacer_count-0.1.0/PKG-INFO +8 -0
- spacer_count-0.1.0/README.md +0 -0
- spacer_count-0.1.0/pyproject.toml +21 -0
- spacer_count-0.1.0/setup.cfg +4 -0
- spacer_count-0.1.0/src/__init__.py +0 -0
- spacer_count-0.1.0/src/spacer_count.egg-info/PKG-INFO +8 -0
- spacer_count-0.1.0/src/spacer_count.egg-info/SOURCES.txt +10 -0
- spacer_count-0.1.0/src/spacer_count.egg-info/dependency_links.txt +1 -0
- spacer_count-0.1.0/src/spacer_count.egg-info/requires.txt +2 -0
- spacer_count-0.1.0/src/spacer_count.egg-info/top_level.txt +2 -0
- spacer_count-0.1.0/src/spacer_counter.py +207 -0
- spacer_count-0.1.0/tests/test_counter.py +29 -0
|
File without changes
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "spacer-count"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Count known spacer sequences found between flanking sequences in FASTQ reads"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"biopython>1.84",
|
|
9
|
+
"pandas>2.2",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[dependency-groups]
|
|
13
|
+
dev = [
|
|
14
|
+
"pytest>=9.0.2",
|
|
15
|
+
"pytest-cov>=7.0.0",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[tool.pytest.ini_options]
|
|
19
|
+
pythonpath = [
|
|
20
|
+
".", "src",
|
|
21
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/__init__.py
|
|
4
|
+
src/spacer_counter.py
|
|
5
|
+
src/spacer_count.egg-info/PKG-INFO
|
|
6
|
+
src/spacer_count.egg-info/SOURCES.txt
|
|
7
|
+
src/spacer_count.egg-info/dependency_links.txt
|
|
8
|
+
src/spacer_count.egg-info/requires.txt
|
|
9
|
+
src/spacer_count.egg-info/top_level.txt
|
|
10
|
+
tests/test_counter.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
import re
|
|
3
|
+
from multiprocessing import Pool
|
|
4
|
+
from functools import lru_cache, partial
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from Bio import SeqIO, Align, Seq
|
|
8
|
+
|
|
9
|
+
from os import listdir
|
|
10
|
+
|
|
11
|
+
class SpacerCounter:
    """Extract spacers located between two flanking sequences and count them.

    A spacer is the run of A/C/G/T bases found between the last five bases of
    the left flanking sequence and the first five bases of the right flanking
    sequence.  Extracted spacers are tallied against a reference table with
    the columns ``guide_id``, ``sequence`` and ``gene``.

    Attributes set by ``__init__``:
        spacer_df: reference table of known spacers (may be empty).
        spacer_size_lims: ``[min_len, max_len]`` accepted for a spacer.
        spacer_size_flex: tolerance that was applied to the size limits.
        re_pattern: compiled regex whose group 1 is the spacer.
    """

    def __init__(self, flanking_seqs, spacer_size=20, spacer_df=None, spacer_info_csv=None, spacer_size_flex=1):
        """Build the spacer-extraction pattern and load the reference table.

        Args:
            flanking_seqs: pair ``(left, right)`` of flanking sequences; each
                must be at least 5 bases long.  ``N`` matches any of A/C/G/T.
            spacer_size: expected spacer length; only used when neither
                ``spacer_info_csv`` nor ``spacer_df`` is given.
            spacer_df: reference DataFrame with a ``sequence`` column.
            spacer_info_csv: header-less CSV with the columns guide_id,
                sequence, gene.  Takes precedence over ``spacer_df``.
            spacer_size_flex: number of bases by which the spacer length may
                deviate from the reference lengths (or from ``spacer_size``).

        Raises:
            Exception: if either flanking sequence is shorter than 5 bases.
        """
        left_flanking_seq, right_flanking_seq = flanking_seqs

        if len(left_flanking_seq) < 5 or len(right_flanking_seq) < 5:
            raise Exception("Flanking sequences must be at least 5 bases long.")

        if spacer_info_csv is not None:
            print("Using provided spacer info CSV; ignoring spacer_size parameter.")
            self.spacer_df = self._read_spacer_csv(spacer_info_csv)
            self.spacer_size_lims = self._size_limits(self.spacer_df, spacer_size_flex)
        elif spacer_df is not None:
            print("Using provided spacer info DataFrame; ignoring spacer_size parameter.")
            self.spacer_df = spacer_df
            self.spacer_size_lims = self._size_limits(self.spacer_df, spacer_size_flex)
        else:
            print("No spacer info DataFrame provided; using spacer_size parameter.")
            # Clamp at zero: a negative bound is not a valid {m,n} quantifier.
            self.spacer_size_lims = [max(0, spacer_size - spacer_size_flex), spacer_size + spacer_size_flex]
            self.spacer_df = pd.DataFrame(columns=["guide_id", "sequence", 'gene'])

        self.spacer_size_flex = spacer_size_flex

        # Only the five bases adjacent to the spacer are used as anchors.
        left_flanking_seq = left_flanking_seq[-5:].replace('N', '[ACGT]')
        right_flanking_seq = right_flanking_seq[0:5].replace('N', '[ACGT]')

        self.re_pattern = re.compile("{0}((A|C|T|G){{{1},{2}}}){3}".format(
            left_flanking_seq, self.spacer_size_lims[0], self.spacer_size_lims[1], right_flanking_seq))

    @staticmethod
    def _read_spacer_csv(spacer_info_csv):
        """Load the header-less guide_id/sequence/gene CSV and normalise it."""
        spacer_df = pd.read_csv(spacer_info_csv, header=None, names=["guide_id", "sequence", 'gene'])
        spacer_df['sequence'] = spacer_df['sequence'].str.upper()
        spacer_df['guide_id'] = spacer_df['guide_id'].astype(str)
        spacer_df['gene'] = spacer_df['gene'].astype(str)
        return spacer_df

    @staticmethod
    def _size_limits(spacer_df, spacer_size_flex):
        """Return [shortest - flex, longest + flex] over the reference spacers."""
        lengths = spacer_df['sequence'].apply(len)
        # Clamp at zero: a negative bound is not a valid {m,n} quantifier.
        return [max(0, lengths.min() - spacer_size_flex), lengths.max() + spacer_size_flex]

    @classmethod
    def from_fasta_csv(cls, flanking_fasta_path, spacer_info_csv, spacer_size_flex=1):
        """Create an instance from a flanking-sequence FASTA and a spacer CSV.

        The FASTA file should contain two sequences with the ids
        "flanking_left" and "flanking_right".  The CSV file should contain
        three columns: guide_id, sequence and gene (with no header).

        Raises:
            ValueError: if either flanking record is missing from the FASTA.
            Exception: if a flanking sequence is shorter than 5 bases
                (raised by ``__init__``).
        """
        left_flanking_seq = None
        right_flanking_seq = None
        with open(flanking_fasta_path, "r") as f:
            for record in SeqIO.parse(f, "fasta"):
                if record.id == "flanking_left":
                    left_flanking_seq = str(record.seq).upper()
                elif record.id == "flanking_right":
                    right_flanking_seq = str(record.seq).upper()

        # Fail with a clear message instead of an UnboundLocalError.
        if left_flanking_seq is None or right_flanking_seq is None:
            raise ValueError(
                'Flanking FASTA must contain records with ids "flanking_left" and "flanking_right".')

        # The length check and the CSV loading are both done by __init__.
        return cls([left_flanking_seq, right_flanking_seq], 0, spacer_info_csv=spacer_info_csv, spacer_size_flex=spacer_size_flex)

    def count_spacers(self, fastq_path, basename=None, threads=1):
        """Count occurrences of each reference spacer in a FASTQ file.

        Spacers that do not exactly match a reference sequence are passed to
        ``align2correct``; those it cannot assign are reported as unknown.

        Args:
            fastq_path: path of the FASTQ file (handed to ``parse_fastq``).
            basename: if given, ``<basename>spacer_count.csv`` and
                ``<basename>unknown_spacer.csv`` are written.
            threads: when greater than 1, alignment runs in a process pool of
                that size; otherwise it runs sequentially.

        Returns:
            ``(output_df, unknown_df)``: the reference table with a ``count``
            column plus a final ``unknown_spacer`` row, and a table with one
            row per distinct unknown spacer sequence.
        """
        # Extract spacers from the fastq file, dropping reads without one.
        id_spacers = [(read_id, spacer) for read_id, spacer in self.parse_fastq(fastq_path) if spacer != ""]

        # One counter per distinct reference sequence.
        seq_count_dict = dict.fromkeys(self.spacer_df['sequence'], 0)

        unknown_spacer_list = []
        for read_id, spacer in id_spacers:
            if spacer in seq_count_dict:
                seq_count_dict[spacer] += 1
            else:
                unknown_spacer_list.append((read_id, spacer))

        corrected_spacers = []
        # Skip the alignment setup (and the process pool) when there is nothing to align.
        if unknown_spacer_list:
            spacer_tup = tuple(self.spacer_df['sequence'].tolist())
            align2correct_partial = partial(align2correct, spacer_tup)
            unknown_seqs = [spacer for _, spacer in unknown_spacer_list]

            if threads > 1:
                # Use multiprocessing to align unknown spacers in parallel
                with Pool(threads) as pool:
                    corrected_spacers = pool.map(align2correct_partial, unknown_seqs)
            else:
                # Align unknown spacers sequentially, this will benefit from lru_cache
                corrected_spacers = [align2correct_partial(spacer) for spacer in unknown_seqs]

        # Credit corrected spacers to their reference; tally the rest by raw sequence.
        unknown_dict = {}
        unresolved_count = 0
        for (_, raw_spacer), corrected in zip(unknown_spacer_list, corrected_spacers):
            if corrected is not None:
                seq_count_dict[corrected] += 1
            else:
                unknown_dict[raw_spacer] = unknown_dict.get(raw_spacer, 0) + 1
                unresolved_count += 1

        total = len(id_spacers)
        matched = total - unresolved_count
        # Guard against a file in which no read yielded a spacer.
        matched_fraction = matched / total if total else 0.0
        print(' Out of total {0} total spacers, {1} ({2:.2%}) were matched to a known spacer.'.format(
            total, matched, matched_fraction))
        print(' Among them, {0} needed alignment (not exact match). {1} remains as unknown even after alignment.'.format(
            len(unknown_spacer_list), unresolved_count))
        print()

        output_df = self.spacer_df.copy()
        output_df['count'] = output_df['sequence'].map(seq_count_dict).fillna(0).astype(int)
        output_df.loc[len(output_df.index)] = ['unknown_spacer', 'N' * 5 + '...' + 'N' * 5, 'unknown_gene', unresolved_count]

        # Build the table in one go rather than concatenating row by row.
        unknown_df = pd.DataFrame(
            [['unknown_spacer', unknown_seq, 'unknown_gene', count] for unknown_seq, count in unknown_dict.items()],
            columns=output_df.columns)

        if basename is not None:
            output_df.to_csv(basename + "spacer_count.csv", index=False)
            unknown_df.to_csv(basename + "unknown_spacer.csv", index=False)

        return output_df, unknown_df

    def parse_fastq(self, fastq_path):
        """Extract one spacer per read from a FASTQ file.

        Each read is searched on the forward strand first and, failing that,
        on its reverse complement.

        Returns:
            A list of ``(read_id, spacer)`` tuples in file order; ``spacer``
            is the empty string when the pattern was not found.
        """
        print("Extracting spacers from file: {0}".format(fastq_path))

        id_spacers = []
        no_guide_count = 0
        for read_id, seq in load_fasta_to_seqs(fastq_path):
            # Try the forward strand
            match = self.re_pattern.search(seq)
            if match is None:
                # Try the reverse strand
                rev_seq = str(Seq.Seq(seq).reverse_complement()).upper()
                match = self.re_pattern.search(rev_seq)

            if match is None:
                id_spacers.append((read_id, ""))
                no_guide_count += 1
            else:
                id_spacers.append((read_id, match.group(1)))

        total = len(id_spacers)
        with_spacer = total - no_guide_count
        # Guard against an empty FASTQ file.
        spacer_fraction = with_spacer / total if total else 0.0
        print(' Out of total {0} reads, {1} ({2:.2%}) likely contain a spacer. (Flexibility = {3})'.format(
            total, with_spacer, spacer_fraction, self.spacer_size_flex))

        return id_spacers
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@lru_cache(maxsize=1024)
def align2correct(spacer_tup, spacer):
    """Map an observed spacer onto a reference spacer by local alignment.

    Args:
        spacer_tup: tuple of reference spacer sequences (a tuple so that the
            call is hashable for ``lru_cache``).
        spacer: the observed spacer sequence to correct.

    Returns:
        The first reference in ``spacer_tup`` whose alignment score against
        ``spacer`` exceeds 90% of that reference's length, or ``None`` when
        no reference qualifies.
    """
    local_aligner = Align.PairwiseAligner()
    local_aligner.mode = 'local'
    local_aligner.match_score = 1
    local_aligner.open_gap_score = -0.5
    local_aligner.extend_gap_score = -0.5

    # References are tried in order; the first one above the threshold wins.
    return next(
        (
            candidate
            for candidate in spacer_tup
            if local_aligner.align(spacer, candidate).score > 0.9 * len(candidate)
        ),
        None,
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def load_fasta_to_seqs(fastq_path):
    """Read every record of a FASTQ file into memory.

    Paths ending in ``.gz`` are opened with ``gzip`` in text mode; any other
    path is opened as a plain text file.

    Returns:
        A list of ``(record_id, sequence)`` tuples in file order, with each
        sequence converted to an upper-case string.
    """
    is_gzipped = fastq_path.endswith(".gz")
    handle = gzip.open(fastq_path, "rt") if is_gzipped else open(fastq_path, "r")
    with handle:
        return [
            (record.id, str(record.seq).upper())
            for record in SeqIO.parse(handle, "fastq")
        ]
|
|
205
|
+
|
|
206
|
+
if __name__ == "__main__":
|
|
207
|
+
pass
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from src.spacer_counter import SpacerCounter, load_fasta_to_seqs, align2correct
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
class TestExtractSpacers:
    """Tests for FASTQ loading, spacer counting and alignment caching.

    All paths are relative, so the tests expect to be run from a directory
    that contains the ``data`` fixtures.
    """

    def test_loading_fastq(self):
        """Plain and gzip-compressed FASTQ files load the same records."""
        for fastq_path in ('data/long_read.fastq', 'data/long_read.fastq.gz'):
            records = load_fasta_to_seqs(fastq_path)
            assert len(records) == 10000
            first_id, _ = records[0]
            assert first_id == '0_NNB8S3_1'

    def test_counter(self):
        """Counting with a process pool yields the expected table sizes."""
        spacer_counter = SpacerCounter(
            ['GATCT', 'ACGCG'],
            spacer_size_flex=1,
            spacer_info_csv='data/spacer_info.csv',
        )
        assert spacer_counter.spacer_df is not None
        assert spacer_counter.spacer_df.shape == (254, 3)
        expected_pattern = re.compile("GATCT((A|C|T|G){19,21})ACGCG")
        assert spacer_counter.re_pattern == expected_pattern

        counts, unknowns = spacer_counter.count_spacers(
            'data/long_read.fastq', basename='data/test_', threads=8)
        # 254 reference rows plus the trailing 'unknown_spacer' row.
        assert counts.shape[0] == 255
        assert counts['count'][2] == 247
        assert unknowns.shape[0] == 149

    def test_lru_caching(self):
        """A single-threaded run reuses cached alignment results."""
        spacer_counter = SpacerCounter(
            ['GATCT', 'ACGCG'],
            spacer_size_flex=1,
            spacer_info_csv='data/spacer_info.csv',
        )
        spacer_counter.count_spacers(
            'data/long_read.fastq', basename='data/test_', threads=1)
        cache_stats = align2correct.cache_info()
        assert cache_stats.hits > 0
|