spacer-count 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.4
2
+ Name: spacer-count
3
+ Version: 0.1.0
4
+ Summary: Add your description here
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: biopython>1.84
8
+ Requires-Dist: pandas>2.2
File without changes
@@ -0,0 +1,21 @@
1
+ [project]
2
+ name = "spacer-count"
3
+ version = "0.1.0"
4
+ description = "Count spacer (guide) sequences found between flanking sequences in FASTQ reads"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "biopython>1.84",
9
+ "pandas>2.2",
10
+ ]
11
+
12
+ [dependency-groups]
13
+ dev = [
14
+ "pytest>=9.0.2",
15
+ "pytest-cov>=7.0.0",
16
+ ]
17
+
18
+ [tool.pytest.ini_options]
19
+ pythonpath = [
20
+ ".", "src",
21
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.4
2
+ Name: spacer-count
3
+ Version: 0.1.0
4
+ Summary: Add your description here
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: biopython>1.84
8
+ Requires-Dist: pandas>2.2
@@ -0,0 +1,10 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/__init__.py
4
+ src/spacer_counter.py
5
+ src/spacer_count.egg-info/PKG-INFO
6
+ src/spacer_count.egg-info/SOURCES.txt
7
+ src/spacer_count.egg-info/dependency_links.txt
8
+ src/spacer_count.egg-info/requires.txt
9
+ src/spacer_count.egg-info/top_level.txt
10
+ tests/test_counter.py
@@ -0,0 +1,2 @@
1
+ biopython>1.84
2
+ pandas>2.2
@@ -0,0 +1,2 @@
1
+ __init__
2
+ spacer_counter
@@ -0,0 +1,207 @@
1
+ import gzip
2
+ import re
3
+ from multiprocessing import Pool
4
+ from functools import lru_cache, partial
5
+
6
+ import pandas as pd
7
+ from Bio import SeqIO, Align, Seq
8
+
9
+ from os import listdir
10
+
11
class SpacerCounter:
    """Extract spacers from FASTQ reads and count them against a reference table.

    A spacer is the run of A/C/G/T bases found between the last five bases of
    the left flanking sequence and the first five bases of the right flanking
    sequence. Reads are searched on the forward strand first and then on the
    reverse complement.

    Attributes set by ``__init__``:
        spacer_df: DataFrame with columns ``guide_id``, ``sequence``, ``gene``
            (empty when no reference table was supplied).
        spacer_size_lims: ``[min_len, max_len]`` accepted spacer lengths.
        spacer_size_flex: the length tolerance that was applied.
        re_pattern: compiled regex whose group 1 is the spacer.
    """

    def __init__(self, flanking_seqs, spacer_size=20, spacer_df=None, spacer_info_csv=None, spacer_size_flex=1):
        """Build the spacer-matching regex and load the reference spacers.

        Args:
            flanking_seqs: pair ``(left, right)`` of flanking sequences, each
                at least 5 bases long. ``N`` matches any of A/C/G/T.
            spacer_size: expected spacer length; only used when neither
                ``spacer_info_csv`` nor ``spacer_df`` provides sequences.
            spacer_df: DataFrame with a ``sequence`` column (and, for
                ``count_spacers``, ``guide_id`` and ``gene``). Used as-is.
            spacer_info_csv: header-less CSV with columns guide_id, sequence,
                gene. Takes precedence over ``spacer_df``.
            spacer_size_flex: tolerance added on both sides of the spacer
                length range.

        Raises:
            ValueError: if either flanking sequence is shorter than 5 bases.
        """
        left_flanking_seq, right_flanking_seq = flanking_seqs

        if len(left_flanking_seq) < 5 or len(right_flanking_seq) < 5:
            raise ValueError("Flanking sequences must be at least 5 bases long.")

        if spacer_info_csv is not None:
            print("Using provided spacer info CSV; ignoring spacer_size parameter.")
            self.spacer_df = pd.read_csv(spacer_info_csv, header=None, names=["guide_id", "sequence", 'gene'])
            self.spacer_df['sequence'] = self.spacer_df['sequence'].str.upper()
            self.spacer_df['guide_id'] = self.spacer_df['guide_id'].astype(str)
            self.spacer_df['gene'] = self.spacer_df['gene'].astype(str)
            shortest, longest = self._length_range(self.spacer_df, spacer_size)

        elif spacer_df is not None:
            print("Using provided spacer info DataFrame; ignoring spacer_size parameter.")
            self.spacer_df = spacer_df
            shortest, longest = self._length_range(self.spacer_df, spacer_size)

        else:
            print("No spacer info DataFrame provided; using spacer_size parameter.")
            self.spacer_df = pd.DataFrame(columns=["guide_id", "sequence", 'gene'])
            shortest = longest = spacer_size

        # A negative lower bound would yield a "{-1,n}" quantifier, which the
        # regex engine silently treats as literal text, so clamp it at zero.
        self.spacer_size_lims = [max(0, shortest - spacer_size_flex), longest + spacer_size_flex]
        self.spacer_size_flex = spacer_size_flex

        # Reads are upper-cased when loaded, so the flanks must be upper-case
        # too; do this before expanding the N wildcard.
        left_anchor = str(left_flanking_seq[-5:]).upper().replace('N', '[ACGT]')
        right_anchor = str(right_flanking_seq[0:5]).upper().replace('N', '[ACGT]')

        self.re_pattern = re.compile("{0}((A|C|T|G){{{1},{2}}}){3}".format(
            left_anchor, self.spacer_size_lims[0], self.spacer_size_lims[1], right_anchor))

    @staticmethod
    def _length_range(spacer_df, fallback):
        """Return (min, max) length of the reference sequences, or fallback twice if there are none."""
        lengths = spacer_df['sequence'].apply(len)
        if lengths.empty:
            return fallback, fallback
        return int(lengths.min()), int(lengths.max())

    @classmethod
    def from_fasta_csv(cls, flanking_fasta_path, spacer_info_csv, spacer_size_flex=1):
        """Create an instance from a flanking-sequence FASTA and a spacer CSV.

        The fasta file should contain two sequences with ids "flanking_left"
        and "flanking_right". The csv file should contain three columns:
        guide_id, sequence, and gene (with no header).

        Raises:
            ValueError: if either flanking record is missing from the FASTA,
                or (from the constructor) is shorter than 5 bases.
        """
        left_flanking_seq = None
        right_flanking_seq = None
        with open(flanking_fasta_path, "r") as f:
            for record in SeqIO.parse(f, "fasta"):
                if record.id == "flanking_left":
                    left_flanking_seq = str(record.seq).upper()
                elif record.id == "flanking_right":
                    right_flanking_seq = str(record.seq).upper()

        if left_flanking_seq is None or right_flanking_seq is None:
            raise ValueError(
                'Flanking FASTA must contain records with ids "flanking_left" and "flanking_right".')

        # The constructor validates the flank lengths and reads the CSV itself;
        # spacer_size is ignored there because a CSV is supplied.
        return cls([left_flanking_seq, right_flanking_seq], 0, spacer_info_csv=spacer_info_csv, spacer_size_flex=spacer_size_flex)

    def count_spacers(self, fastq_path, basename=None, threads=1):
        """Count how often each reference spacer occurs in a FASTQ file.

        Spacers that match a reference sequence exactly are counted directly;
        the rest are passed to ``align2correct`` and counted under the
        reference it returns, or reported as unknown when it returns None.

        Args:
            fastq_path: FASTQ file to read (``.gz`` is handled by the loader).
            basename: if given, ``basename + "spacer_count.csv"`` and
                ``basename + "unknown_spacer.csv"`` are written.
            threads: number of worker processes used for alignment; values
                of 1 or less align sequentially in this process.

        Returns:
            ``(output_df, unknown_df)``: the reference table with a ``count``
            column plus a final ``unknown_spacer`` summary row, and a table
            with one row per distinct unmatched spacer sequence.
        """
        # Keep only the reads in which a spacer was found.
        id_spacers = [(read_id, spacer) for read_id, spacer in self.parse_fastq(fastq_path) if spacer != ""]

        seq_count_dict = dict.fromkeys(self.spacer_df['sequence'], 0)
        unknown_spacer_list = []
        for read_id, spacer in id_spacers:
            if spacer in seq_count_dict:
                seq_count_dict[spacer] += 1
            else:
                unknown_spacer_list.append((read_id, spacer))

        spacer_tup = tuple(self.spacer_df['sequence'].tolist())
        align2correct_partial = partial(align2correct, spacer_tup)
        unknown_seqs = [spacer for _, spacer in unknown_spacer_list]

        if threads > 1:
            # Use multiprocessing to align unknown spacers in parallel
            with Pool(threads) as pool:
                corrected_spacers = pool.map(align2correct_partial, unknown_seqs)
        else:
            # Align one read at a time so repeated sequences hit align2correct's lru_cache
            corrected_spacers = [align2correct_partial(spacer) for spacer in unknown_seqs]

        unknown_dict = {}
        for spacer, corrected in zip(unknown_seqs, corrected_spacers):
            if corrected is not None:
                seq_count_dict[corrected] += 1
            else:
                unknown_dict[spacer] = unknown_dict.get(spacer, 0) + 1

        total = len(id_spacers)
        still_unknown = sum(unknown_dict.values())
        matched = total - still_unknown
        # Avoid a ZeroDivisionError when the file yields no spacers at all.
        matched_fraction = matched / total if total else 0.0

        print('    Out of total {0} total spacers, {1} ({2:.2%}) were matched to a known spacer.'.format(
            total, matched, matched_fraction))
        print('    Among them, {0} needed alignment (not exact match). {1} remains as unknown even after alignment.'.format(
            len(unknown_spacer_list), still_unknown))
        print()

        output_df = self.spacer_df.copy()
        output_df['count'] = output_df['sequence'].map(seq_count_dict).fillna(0).astype(int)
        output_df.loc[len(output_df.index)] = ['unknown_spacer', 'N' * 5 + '...' + 'N' * 5, 'unknown_gene', still_unknown]

        # Build the table in one step instead of concatenating row by row.
        unknown_df = pd.DataFrame(
            [['unknown_spacer', unknown_seq, 'unknown_gene', count] for unknown_seq, count in unknown_dict.items()],
            columns=output_df.columns,
        )

        if basename is not None:
            output_df.to_csv(basename + "spacer_count.csv", index=False)
            unknown_df.to_csv(basename + "unknown_spacer.csv", index=False)

        return output_df, unknown_df

    def parse_fastq(self, fastq_path):
        """Extract one spacer per read from a FASTQ file.

        Each read is searched on the forward strand and, failing that, on its
        reverse complement.

        Returns:
            A list of ``(read_id, spacer)`` tuples in file order; ``spacer``
            is the empty string for reads where no spacer was found.
        """
        print("Extracting spacers from file: {0}".format(fastq_path))

        id_spacers = []
        no_guide_count = 0
        for read_id, seq in load_fasta_to_seqs(fastq_path):
            # Try the forward strand, then the reverse strand
            match = self.re_pattern.search(seq)
            if match is None:
                rev_seq = str(Seq.Seq(seq).reverse_complement()).upper()
                match = self.re_pattern.search(rev_seq)

            if match is None:
                id_spacers.append((read_id, ""))
                no_guide_count += 1
            else:
                id_spacers.append((read_id, match.group(1)))

        total = len(id_spacers)
        with_spacer = total - no_guide_count
        # Avoid a ZeroDivisionError on an empty FASTQ file.
        fraction = with_spacer / total if total else 0.0

        print('    Out of total {0} reads, {1} ({2:.2%}) likely contain a spacer. (Flexibility = {3})'.format(
            total, with_spacer, fraction, self.spacer_size_flex))

        return id_spacers
173
+
174
+
175
@lru_cache(maxsize=1024)
def align2correct(spacer_tup, spacer):
    """Map an unrecognised spacer onto a reference spacer by local alignment.

    The reference spacers are tried in the order given and the first one
    whose local alignment score against ``spacer`` exceeds 90% of that
    reference's length is returned. Matches score 1 and gap opening and
    extension each cost 0.5; the mismatch score is left at the aligner's
    default.

    Args:
        spacer_tup: tuple of reference spacer sequences (a tuple so that the
            arguments are hashable for ``lru_cache``).
        spacer: the observed spacer sequence to correct.

    Returns:
        The matching reference sequence, or None when no reference passes
        the threshold.
    """
    local_aligner = Align.PairwiseAligner()
    local_aligner.mode = 'local'
    local_aligner.match_score = 1
    local_aligner.open_gap_score = -0.5
    local_aligner.extend_gap_score = -0.5

    for candidate in spacer_tup:
        alignments = local_aligner.align(spacer, candidate)
        if alignments.score > 0.9 * len(candidate):
            return candidate

    return None
192
+
193
+
194
def load_fasta_to_seqs(fastq_path):
    """Read a FASTQ file into a list of ``(record id, upper-case sequence)`` tuples.

    Despite the name, the file is parsed as FASTQ. Paths ending in ``.gz``
    are opened with gzip in text mode; every other path is opened as plain
    text.
    """
    if fastq_path.endswith(".gz"):
        handle = gzip.open(fastq_path, "rt")
    else:
        handle = open(fastq_path, "r")

    with handle:
        return [
            (record.id, str(record.seq).upper())
            for record in SeqIO.parse(handle, "fastq")
        ]
205
+
206
+ if __name__ == "__main__":
207
+ pass
@@ -0,0 +1,29 @@
1
+ import pytest
2
+ from src.spacer_counter import SpacerCounter, load_fasta_to_seqs, align2correct
3
+ import re
4
+
5
class TestExtractSpacers:
    """End-to-end checks for FASTQ loading, spacer counting and alignment caching.

    The tests read fixture files under ``data/`` and therefore expect to be
    run from the directory that contains it.
    """

    def test_loading_fastq(self):
        # The plain and gzip-compressed fixtures must load identically.
        for fastq_path in ('data/long_read.fastq', 'data/long_read.fastq.gz'):
            id_sequences = load_fasta_to_seqs(fastq_path)
            assert len(id_sequences) == 10000
            assert id_sequences[0][0] == '0_NNB8S3_1'

    def test_counter(self):
        counter = SpacerCounter(['GATCT', 'ACGCG'], spacer_size_flex=1, spacer_info_csv='data/spacer_info.csv')
        assert counter.spacer_df is not None
        assert counter.spacer_df.shape == (254, 3)
        assert counter.re_pattern == re.compile("GATCT((A|C|T|G){19,21})ACGCG")

        output_df, unknown_df = counter.count_spacers('data/long_read.fastq', basename='data/test_', threads=8)
        # 254 reference rows plus the trailing "unknown_spacer" summary row.
        assert output_df.shape[0] == 255
        assert output_df['count'][2] == 247
        assert unknown_df.shape[0] == 149

    def test_lru_caching(self):
        # Start from an empty cache so the hit count reflects this run only,
        # not alignments left over from earlier calls in the same process.
        align2correct.cache_clear()

        counter = SpacerCounter(['GATCT', 'ACGCG'], spacer_size_flex=1, spacer_info_csv='data/spacer_info.csv')
        counter.count_spacers('data/long_read.fastq', basename='data/test_', threads=1)
        assert align2correct.cache_info().hits > 0