spacr 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/__init__.py +3 -2
- spacr/app_classify.py +10 -0
- spacr/app_mask.py +9 -0
- spacr/app_measure.py +9 -0
- spacr/app_sequencing.py +9 -0
- spacr/core.py +172 -1
- spacr/deep_spacr.py +296 -7
- spacr/gui.py +68 -0
- spacr/gui_core.py +319 -10
- spacr/gui_elements.py +772 -13
- spacr/gui_utils.py +304 -12
- spacr/io.py +887 -71
- spacr/logger.py +36 -0
- spacr/measure.py +206 -28
- spacr/ml.py +606 -142
- spacr/plot.py +797 -131
- spacr/sequencing.py +363 -8
- spacr/settings.py +1158 -38
- spacr/sp_stats.py +80 -12
- spacr/spacr_cellpose.py +115 -2
- spacr/submodules.py +747 -19
- spacr/timelapse.py +237 -53
- spacr/toxo.py +132 -6
- spacr/utils.py +2422 -80
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/METADATA +31 -17
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/RECORD +30 -30
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/LICENSE +0 -0
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/WHEEL +0 -0
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/entry_points.txt +0 -0
- {spacr-1.0.9.dist-info → spacr-1.1.1.dist-info}/top_level.txt +0 -0
spacr/sequencing.py
CHANGED
@@ -10,6 +10,23 @@ from IPython.display import display

# Function to map sequences to names (same as your original)
def map_sequences_to_names(csv_file, sequences, rc):
+    """
+    Maps DNA sequences to their corresponding names based on a CSV file.
+
+    Args:
+        csv_file (str): Path to the CSV file containing 'sequence' and 'name' columns.
+        sequences (list of str): List of DNA sequences to map.
+        rc (bool): If True, reverse complement the sequences in the CSV before mapping.
+
+    Returns:
+        list: A list of names corresponding to the input sequences. If a sequence is not found,
+              `pd.NA` is returned in its place.
+
+    Notes:
+        - The CSV file must contain columns named 'sequence' and 'name'.
+        - If `rc` is True, sequences in the CSV will be reverse complemented prior to mapping.
+        - Sequences in `sequences` are not altered—only sequences in the CSV are reverse complemented.
+    """
    def rev_comp(dna_sequence):
        complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
        reverse_seq = dna_sequence[::-1]
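The behavior documented here can be illustrated with a short, hypothetical sketch (not the package's implementation); it assumes only the docstring's contract: a reference CSV with 'sequence' and 'name' columns, optional reverse complement of the reference, and `pd.NA` for unmatched queries.

```python
import pandas as pd
from Bio.Seq import Seq

def map_sequences_to_names_sketch(csv_file, sequences, rc=False):
    """Illustrative re-implementation of the documented contract."""
    ref = pd.read_csv(csv_file)  # must provide 'sequence' and 'name' columns
    if rc:
        # Only the reference sequences are reverse complemented, never the queries
        ref['sequence'] = ref['sequence'].apply(lambda s: str(Seq(s).reverse_complement()))
    lookup = dict(zip(ref['sequence'], ref['name']))
    return [lookup.get(seq, pd.NA) for seq in sequences]
```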
@@ -24,6 +41,24 @@ def map_sequences_to_names(csv_file, sequences, rc):

# Functions to save data (same as your original)
def save_df_to_hdf5(df, hdf5_file, key='df', comp_type='zlib', comp_level=5):
+    """
+    Saves a pandas DataFrame to an HDF5 file, optionally appending to an existing dataset.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to save.
+        hdf5_file (str): Path to the target HDF5 file.
+        key (str, optional): Key under which to store the DataFrame. Defaults to 'df'.
+        comp_type (str, optional): Compression algorithm to use (e.g., 'zlib', 'bzip2', 'blosc'). Defaults to 'zlib'.
+        comp_level (int, optional): Compression level (0–9). Higher values yield better compression at the cost of speed. Defaults to 5.
+
+    Returns:
+        None
+
+    Notes:
+        - If the specified key already exists in the HDF5 file, the new DataFrame is appended to it.
+        - The combined DataFrame is saved in 'table' format to support appending and querying.
+        - Errors encountered during saving are printed to standard output.
+    """
    try:
        with pd.HDFStore(hdf5_file, 'a', complib=comp_type, complevel=comp_level) as store:
            if key in store:
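The append semantics described in this docstring map directly onto pandas' HDFStore API; a minimal, hypothetical example (file and key names are placeholders):

```python
import pandas as pd

new_rows = pd.DataFrame({'read': ['r1', 'r2'], 'grna_name': ['gRNA_1', 'gRNA_2']})

# 'table' format is what allows appending and querying, as the docstring notes
with pd.HDFStore('annotated_reads.h5', 'a', complib='zlib', complevel=5) as store:
    if 'df' in store:
        new_rows = pd.concat([store['df'], new_rows], ignore_index=True)
    store.put('df', new_rows, format='table')
```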
@@ -34,6 +69,23 @@ def save_df_to_hdf5(df, hdf5_file, key='df', comp_type='zlib', comp_level=5):
        print(f"Error while saving DataFrame to HDF5: {e}")

def save_unique_combinations_to_csv(unique_combinations, csv_file):
+    """
+    Saves or appends a DataFrame of unique gRNA combinations to a CSV file, aggregating duplicates.
+
+    Args:
+        unique_combinations (pd.DataFrame): DataFrame containing 'rowID', 'columnID', and 'grna_name' columns,
+                                            along with associated count or metric columns.
+        csv_file (str): Path to the CSV file where data will be saved.
+
+    Returns:
+        None
+
+    Notes:
+        - If the file exists, it reads the existing contents and appends the new data.
+        - Duplicate combinations (same 'rowID', 'columnID', 'grna_name') are summed.
+        - The resulting DataFrame is saved with index included.
+        - Any exception during the process is caught and printed to stdout.
+    """
    try:
        try:
            existing_df = pd.read_csv(csv_file)
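Summing duplicate (rowID, columnID, grna_name) rows, as described above, is a plain groupby-sum; an illustrative sketch (not the package's exact code):

```python
import os
import pandas as pd

def append_unique_combinations_sketch(unique_combinations, csv_file):
    # Merge new counts into an existing CSV, summing rows that share all three keys
    if os.path.exists(csv_file):
        existing = pd.read_csv(csv_file)
        unique_combinations = pd.concat([existing, unique_combinations], ignore_index=True)
    merged = unique_combinations.groupby(['rowID', 'columnID', 'grna_name']).sum(numeric_only=True)
    merged.to_csv(csv_file)  # group keys are written as the index
```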
@@ -50,6 +102,22 @@ def save_unique_combinations_to_csv(unique_combinations, csv_file):
        print(f"Error while saving unique combinations to CSV: {e}")

def save_qc_df_to_csv(qc_df, qc_csv_file):
+    """
+    Saves or appends a QC (quality control) DataFrame to a CSV file by summing overlapping entries.
+
+    Args:
+        qc_df (pd.DataFrame): DataFrame containing numeric QC metrics (e.g., counts, read stats).
+        qc_csv_file (str): Path to the CSV file where the QC data will be saved.
+
+    Returns:
+        None
+
+    Notes:
+        - If the file exists, it reads the existing QC data and adds the new values to it (element-wise).
+        - If the file doesn't exist, it creates a new one.
+        - The final DataFrame is saved without including the index.
+        - Any exception is caught and logged to stdout.
+    """
    try:
        try:
            existing_qc_df = pd.read_csv(qc_csv_file)
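The element-wise accumulation of QC metrics can be expressed with `DataFrame.add`; a hypothetical sketch under the same assumptions as the docstring (numeric columns, one summary table per file):

```python
import os
import pandas as pd

def accumulate_qc_sketch(qc_df, qc_csv_file):
    # Add the new QC counts to any existing ones, element-wise
    if os.path.exists(qc_csv_file):
        existing = pd.read_csv(qc_csv_file)
        qc_df = existing.add(qc_df, fill_value=0)
    qc_df.to_csv(qc_csv_file, index=False)
```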
@@ -64,9 +132,34 @@ def save_qc_df_to_csv(qc_df, qc_csv_file):
        print(f"Error while saving QC DataFrame to CSV: {e}")

def extract_sequence_and_quality(sequence, quality, start, end):
+    """
+    Extracts a subsequence and its corresponding quality scores.
+
+    Args:
+        sequence (str): DNA sequence string.
+        quality (str): Quality string corresponding to the sequence.
+        start (int): Start index of the region to extract.
+        end (int): End index of the region to extract (exclusive).
+
+    Returns:
+        tuple: (subsequence, subquality) as strings.
+    """
    return sequence[start:end], quality[start:end]

def create_consensus(seq1, qual1, seq2, qual2):
+    """
+    Constructs a consensus DNA sequence from two reads with associated quality scores.
+
+    Args:
+        seq1 (str): First DNA sequence.
+        qual1 (str): Quality scores for `seq1` (as ASCII characters or integer-encoded).
+        seq2 (str): Second DNA sequence.
+        qual2 (str): Quality scores for `seq2`.
+
+    Returns:
+        str: Consensus sequence, selecting the base with the highest quality at each position.
+             If one base is 'N', the non-'N' base is chosen regardless of quality.
+    """
    consensus_seq = []
    for i in range(len(seq1)):
        bases = [(seq1[i], qual1[i]), (seq2[i], qual2[i])]
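The consensus rule described by these two docstrings (prefer a non-'N' base, otherwise take the higher-quality base) is small enough to sketch in full; qualities are assumed to be directly comparable, e.g. Phred scores or their ASCII encodings:

```python
def get_consensus_base_sketch(bases):
    # bases: [(base1, qual1), (base2, qual2)]
    if bases[0][0] == 'N':
        return bases[1][0]
    if bases[1][0] == 'N':
        return bases[0][0]
    return bases[0][0] if bases[0][1] >= bases[1][1] else bases[1][0]

def create_consensus_sketch(seq1, qual1, seq2, qual2):
    # Position-by-position consensus of two equal-length reads
    return ''.join(get_consensus_base_sketch([(seq1[i], qual1[i]), (seq2[i], qual2[i])])
                   for i in range(len(seq1)))
```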
@@ -74,6 +167,15 @@ def create_consensus(seq1, qual1, seq2, qual2):
    return ''.join(consensus_seq)

def get_consensus_base(bases):
+    """
+    Selects the most reliable base from a list of two base-quality pairs.
+
+    Args:
+        bases (list of tuples): Each tuple contains (base, quality_score), expected length is 2.
+
+    Returns:
+        str: The consensus base. Prefers non-'N' bases and higher quality scores.
+    """
    # Prefer non-'N' bases, if 'N' exists, pick the other one.
    if bases[0][0] == 'N':
        return bases[1][0]
@@ -84,13 +186,78 @@ def get_consensus_base(bases):
    return bases[0][0] if bases[0][1] >= bases[1][1] else bases[1][0]

def reverse_complement(seq):
+    """
+    Computes the reverse complement of a DNA sequence.
+
+    Args:
+        seq (str): Input DNA sequence.
+
+    Returns:
+        str: Reverse complement of the input sequence.
+    """
    return str(Seq(seq).reverse_complement())

# Core logic for processing a chunk (same as your original)
def process_chunk(chunk_data):
-
+    """
+    Processes a chunk of sequencing reads to extract and map barcode sequences to corresponding names.
+
+    This function handles both single-end and paired-end FASTQ data. It searches for a target barcode
+    sequence in each read, extracts a consensus region around it, applies a regex to extract barcodes,
+    and maps those to known IDs using reference CSVs. Quality control data and unique combinations are
+    also computed.
+
+    Args:
+        chunk_data (tuple): Contains either 9 or 10 elements:
+
+            For paired-end mode (10 elements):
+                - r1_chunk (list): List of strings, each 4-line block from R1 FASTQ.
+                - r2_chunk (list): List of strings, each 4-line block from R2 FASTQ.
+                - regex (str): Regex pattern with named groups ('rowID', 'columnID', 'grna').
+                - target_sequence (str): Sequence to anchor barcode extraction.
+                - offset_start (int): Offset from target_sequence to start consensus extraction.
+                - expected_end (int): Length of the region to extract.
+                - column_csv (str): Path to column barcode reference CSV.
+                - grna_csv (str): Path to gRNA barcode reference CSV.
+                - row_csv (str): Path to row barcode reference CSV.
+                - fill_na (bool): Whether to fill unmapped names with raw barcode sequences.
+
+            For single-end mode (9 elements):
+                - Same as above, but r2_chunk is omitted.
+
+    Returns:
+        tuple:
+            - df (pd.DataFrame): Full dataframe with columns:
+              ['read', 'column_sequence', 'columnID', 'row_sequence', 'rowID',
+               'grna_sequence', 'grna_name']
+            - unique_combinations (pd.DataFrame): Count of each unique (rowID, columnID, grna_name) triplet.
+            - qc_df (pd.DataFrame): Summary of missing values and total reads.
+    """
    def paired_find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, regex):
-
+        """
+        Processes paired-end FASTQ read chunks to extract consensus barcode sequences and decode them
+        using a regex pattern.
+
+        For each R1–R2 read pair, this function identifies the `target_sequence`, extracts a window of
+        defined length with an offset, computes a consensus sequence using base quality scores, and
+        applies a regex to extract barcode components.
+
+        Args:
+            r1_chunk (list of str): List of 4-line strings for each R1 read in the chunk.
+            r2_chunk (list of str): List of 4-line strings for each R2 read in the chunk.
+            target_sequence (str): Nucleotide sequence used as anchor for barcode extraction.
+            offset_start (int): Position offset from `target_sequence` to begin extracting barcode.
+            expected_end (int): Total length of region to extract after offset.
+            regex (str): Regular expression with named groups ('rowID', 'columnID', 'grna')
+                         to parse barcodes from the extracted consensus sequence.
+
+        Returns:
+            tuple:
+                consensus_sequences (list of str): Consensus DNA sequences extracted from read pairs.
+                columns (list of str): Extracted column barcode sequences.
+                grnas (list of str): Extracted gRNA barcode sequences.
+                rows (list of str): Extracted row barcode sequences.
+        """
        consensus_sequences, columns, grnas, rows = [], [], [], []
        consensus_seq = None

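The named-group regex contract referenced throughout these docstrings ('rowID', 'columnID', 'grna') can be demonstrated with a hypothetical pattern; the real pattern and barcode layout are supplied by the user in the settings:

```python
import re

# Hypothetical layout: 8 nt row barcode, 7 nt gRNA barcode, 8 nt column barcode
regex = r'^(?P<rowID>[ACGTN]{8})(?P<grna>[ACGTN]{7})(?P<columnID>[ACGTN]{8})'
consensus = 'AACCGGTT' + 'ACGTACG' + 'TTGGCCAA'  # row + grna + column

match = re.match(regex, consensus)
if match:
    print(match.group('rowID'), match.group('grna'), match.group('columnID'))
```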
@@ -154,6 +321,26 @@ def process_chunk(chunk_data):
        return consensus_sequences, columns, grnas, rows

    def single_find_sequence_in_chunk_reads(r1_chunk, target_sequence, offset_start, expected_end, regex):
+        """
+        Processes single-end FASTQ read chunks to extract barcode sequences using a target motif and regex pattern.
+
+        For each R1 read, the function identifies the `target_sequence`, extracts a region starting at an offset
+        and of fixed length, pads if necessary, and applies a regex with named groups to decode barcodes.
+
+        Args:
+            r1_chunk (list of str): List of 4-line strings for each R1 read in the chunk.
+            target_sequence (str): Anchor sequence to locate the barcode region in R1.
+            offset_start (int): Position offset from the end of `target_sequence` to start barcode extraction.
+            expected_end (int): Total length of the barcode region to extract.
+            regex (str): Regular expression with named groups ('rowID', 'columnID', 'grna') to extract barcodes.
+
+        Returns:
+            tuple:
+                consensus_sequences (list of str): Extracted sequences used as barcode consensus (R1 only).
+                columns (list of str): Extracted column barcode subsequences.
+                grnas (list of str): Extracted gRNA barcode subsequences.
+                rows (list of str): Extracted row barcode subsequences.
+        """

        consensus_sequences, columns, grnas, rows = [], [], [], []

@@ -251,6 +438,20 @@ def process_chunk(chunk_data):

# Function to save data from the queue
def saver_process(save_queue, hdf5_file, save_h5, unique_combinations_csv, qc_csv_file, comp_type, comp_level):
+    """
+    Continuously reads data from a multiprocessing queue and saves it to disk in various formats.
+
+    This function is intended to run in a separate process. It terminates when it receives the "STOP" sentinel value.
+
+    Args:
+        save_queue (multiprocessing.Queue): Queue containing tuples of (df, unique_combinations, qc_df).
+        hdf5_file (str): Path to the HDF5 file to store full reads (only used if save_h5 is True).
+        save_h5 (bool): Whether to save the full reads DataFrame to HDF5.
+        unique_combinations_csv (str): Path to the CSV file for aggregated barcode combinations.
+        qc_csv_file (str): Path to the CSV file for quality control statistics.
+        comp_type (str): Compression algorithm for HDF5 (e.g., 'zlib').
+        comp_level (int): Compression level for HDF5.
+    """
    while True:
        item = save_queue.get()
        if item == "STOP":
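The writer pattern this docstring describes, a dedicated process draining a queue until it sees the "STOP" sentinel, is a standard multiprocessing idiom; a self-contained sketch:

```python
import multiprocessing as mp

def writer(queue):
    # Drain the queue until the sentinel arrives, then exit
    while True:
        item = queue.get()
        if item == "STOP":
            break
        print("would save:", item)

if __name__ == '__main__':
    q = mp.Queue()
    p = mp.Process(target=writer, args=(q,))
    p.start()
    q.put({'chunk': 1})
    q.put("STOP")
    p.join()
```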
@@ -262,7 +463,33 @@ def saver_process(save_queue, hdf5_file, save_h5, unique_combinations_csv, qc_csv_file, comp_type, comp_level):
            save_qc_df_to_csv(qc_df, qc_csv_file)

def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):
-
+    """
+    Processes paired-end FASTQ files in chunks to extract barcoded sequences and generate consensus reads.
+
+    This function identifies sequences matching a regular expression in both R1 and R2 reads, extracts barcodes,
+    and maps them to user-defined identifiers. Processed data is saved incrementally using a separate process.
+
+    Args:
+        r1_file (str): Path to the gzipped R1 FASTQ file.
+        r2_file (str): Path to the gzipped R2 FASTQ file.
+        regex (str): Regular expression with named capture groups: 'rowID', 'columnID', and 'grna'.
+        target_sequence (str): Anchor sequence to align from.
+        offset_start (int): Offset from anchor to start consensus extraction.
+        expected_end (int): Length of the consensus region to extract.
+        column_csv (str): Path to CSV file mapping column barcode sequences to IDs.
+        grna_csv (str): Path to CSV file mapping gRNA barcode sequences to names.
+        row_csv (str): Path to CSV file mapping row barcode sequences to IDs.
+        save_h5 (bool): Whether to save the full reads DataFrame to HDF5.
+        comp_type (str): Compression algorithm for HDF5 (e.g., 'zlib').
+        comp_level (int): Compression level for HDF5.
+        hdf5_file (str): Path to the HDF5 output file.
+        unique_combinations_csv (str): Path to CSV file for saving unique row/column/gRNA combinations.
+        qc_csv_file (str): Path to CSV file for saving QC summary (e.g., NaN counts).
+        chunk_size (int, optional): Number of reads per batch. Defaults to 10000.
+        n_jobs (int, optional): Number of parallel workers. Defaults to cpu_count() - 3.
+        test (bool, optional): If True, processes only a single chunk and prints the result. Defaults to False.
+        fill_na (bool, optional): If True, fills unmapped IDs with raw barcode sequences. Defaults to False.
+    """
    from .utils import count_reads_in_fastq, print_progress

    # Use cpu_count minus 3 cores if n_jobs isn't specified
@@ -344,7 +571,34 @@ def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):
    save_process.join()

def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):
-
+    """
+    Processes single-end FASTQ data in chunks to extract barcoded sequences and map them to identifiers.
+
+    This function reads gzipped R1 FASTQ data, detects barcode-containing sequences using a target anchor and regex,
+    and maps row, column, and gRNA barcodes to user-defined identifiers. Results are processed in parallel
+    and saved incrementally via a background process.
+
+    Args:
+        r1_file (str): Path to gzipped R1 FASTQ file.
+        r2_file (str): Placeholder for interface consistency; not used in single-end mode.
+        regex (str): Regular expression with named capture groups: 'rowID', 'columnID', and 'grna'.
+        target_sequence (str): Anchor sequence used to locate barcode region.
+        offset_start (int): Offset from anchor to start barcode parsing.
+        expected_end (int): Length of the barcode region to extract.
+        column_csv (str): Path to CSV file mapping column barcode sequences to IDs.
+        grna_csv (str): Path to CSV file mapping gRNA barcode sequences to names.
+        row_csv (str): Path to CSV file mapping row barcode sequences to IDs.
+        save_h5 (bool): Whether to save the full reads DataFrame to HDF5 format.
+        comp_type (str): Compression algorithm for HDF5 (e.g., 'zlib').
+        comp_level (int): Compression level for HDF5.
+        hdf5_file (str): Output HDF5 file path.
+        unique_combinations_csv (str): Output path for CSV summarizing row/column/gRNA combinations.
+        qc_csv_file (str): Output path for CSV summarizing missing values and total reads.
+        chunk_size (int, optional): Number of reads per batch. Defaults to 10,000.
+        n_jobs (int, optional): Number of parallel worker processes. Defaults to cpu_count() - 3.
+        test (bool, optional): If True, processes only the first chunk and prints its result. Defaults to False.
+        fill_na (bool, optional): If True, fills missing mapped IDs with their corresponding barcode sequences. Defaults to False.
+    """
    from .utils import count_reads_in_fastq, print_progress

    # Use cpu_count minus 3 cores if n_jobs isn't specified
@@ -422,7 +676,40 @@ def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):
    save_process.join()

def generate_barecode_mapping(settings={}):
-
+    """
+    Orchestrates barcode extraction and mapping from gzipped sequencing data using user-defined or default settings.
+
+    This function parses sequencing reads from single-end or paired-end FASTQ (.gz) files, extracts barcode regions
+    using a regular expression, maps them to row, column, and gRNA identifiers, and saves the results to disk.
+    Results include the full annotated reads (optional), barcode combination counts, and a QC summary.
+
+    Args:
+        settings (dict, optional): Dictionary containing parameters required for barcode mapping. If not provided,
+            default values will be applied. Important keys include:
+            - 'src' (str): Source directory containing gzipped FASTQ files.
+            - 'mode' (str): Either 'single' or 'paired' for single-end or paired-end processing.
+            - 'single_direction' (str): If 'single', specifies which read to use ('R1' or 'R2').
+            - 'regex' (str): Regular expression with capture groups 'rowID', 'columnID', and 'grna'.
+            - 'target_sequence' (str): Anchor sequence to locate barcode start position.
+            - 'offset_start' (int): Offset from the anchor to the barcode start.
+            - 'expected_end' (int): Expected barcode region length.
+            - 'column_csv' (str): CSV file mapping column barcodes to names.
+            - 'grna_csv' (str): CSV file mapping gRNA barcodes to names.
+            - 'row_csv' (str): CSV file mapping row barcodes to names.
+            - 'save_h5' (bool): Whether to save annotated reads to HDF5.
+            - 'comp_type' (str): Compression algorithm for HDF5.
+            - 'comp_level' (int): Compression level for HDF5.
+            - 'chunk_size' (int): Number of reads to process per batch.
+            - 'n_jobs' (int): Number of parallel processes for barcode mapping.
+            - 'test' (bool): If True, only processes the first chunk for testing.
+            - 'fill_na' (bool): If True, fills unmapped barcodes with raw sequence instead of NaN.
+
+    Side Effects:
+        Saves the following files in the output directory:
+        - `annotated_reads.h5` (optional): Annotated read information in HDF5 format.
+        - `unique_combinations.csv`: Count table of (rowID, columnID, grna_name) triplets.
+        - `qc.csv`: Summary of missing values and read counts.
+    """
    from .settings import set_default_generate_barecode_mapping
    from .utils import save_settings
    from .io import parse_gz_files
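Given the keys documented above, a call could look like the following; every value is a placeholder, and omitted keys fall back to the defaults supplied by `set_default_generate_barecode_mapping`:

```python
from spacr.sequencing import generate_barecode_mapping

settings = {
    'src': '/data/screen1/fastq',   # directory with gzipped FASTQ files (hypothetical path)
    'mode': 'paired',               # or 'single'
    'regex': r'(?P<rowID>[ACGTN]{8})(?P<grna>[ACGTN]{7})(?P<columnID>[ACGTN]{8})',
    'target_sequence': 'GGTTAA',    # anchor preceding the barcode region (hypothetical)
    'offset_start': 0,
    'expected_end': 23,
    'row_csv': '/data/barcodes/rows.csv',
    'column_csv': '/data/barcodes/columns.csv',
    'grna_csv': '/data/barcodes/grnas.csv',
    'save_h5': False,
    'chunk_size': 10000,
    'test': True,                   # process only the first chunk
}
generate_barecode_mapping(settings)
```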
@@ -490,7 +777,23 @@ def generate_barecode_mapping(settings={}):

# Function to read the CSV, compute reverse complement, and save it
def barecodes_reverse_complement(csv_file):
+    """
+    Reads a barcode CSV file, computes the reverse complement of each sequence, and saves the result to a new CSV.
+
+    This function assumes the input CSV contains a column named 'sequence' with DNA barcodes. It computes the
+    reverse complement for each sequence and saves the modified DataFrame to a new file with '_RC' appended
+    to the original filename.
+
+    Args:
+        csv_file (str): Path to the input CSV file. Must contain a column named 'sequence'.
+
+    Side Effects:
+        - Saves a new CSV file in the same directory with reverse-complemented sequences.
+        - Prints the path of the saved file.

+    Output:
+        New file path format: <original_filename>_RC.csv
+    """
    def reverse_complement(sequence):
        complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
        return ''.join(complement[base] for base in reversed(sequence))
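The per-sequence reverse complement and the '_RC' output naming described here reduce to a few pandas lines; an illustrative sketch:

```python
import os
import pandas as pd

def reverse_complement_csv_sketch(csv_file):
    comp = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    df = pd.read_csv(csv_file)  # expects a 'sequence' column
    df['sequence'] = df['sequence'].apply(lambda s: ''.join(comp[b] for b in reversed(s)))
    base, ext = os.path.splitext(csv_file)
    out_path = f"{base}_RC{ext}"
    df.to_csv(out_path, index=False)
    return out_path
```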
@@ -512,10 +815,48 @@ def barecodes_reverse_complement(csv_file):
    print(f"Reverse complement file saved as {new_filename}")

def graph_sequencing_stats(settings):
-
+    """
+    Analyze and visualize sequencing quality metrics to determine an optimal fraction threshold
+    that maximizes unique gRNA representation per well across plates.
+
+    This function reads one or more CSV files containing count data, filters out control wells,
+    calculates the fraction of reads per gRNA in each well, and identifies the minimum fraction
+    required to recover a target average number of unique gRNAs per well. It generates plots to
+    help visualize the chosen threshold and spatial distribution of unique gRNA counts.
+
+    Args:
+        settings (dict): Dictionary containing the following keys:
+            - 'count_data' (str or list of str): Paths to CSV file(s) with 'grna', 'count', 'rowID', 'columnID' columns.
+            - 'target_unique_count' (int): Target number of unique gRNAs per well to recover.
+            - 'filter_column' (str): Column name to filter out control wells.
+            - 'control_wells' (list): List of control well labels to exclude.
+            - 'log_x' (bool): Whether to log-scale the x-axis in the threshold plot.
+            - 'log_y' (bool): Whether to log-scale the y-axis in the threshold plot.
+
+    Returns:
+        float: Closest fraction threshold that approximates the target unique gRNA count per well.
+
+    Side Effects:
+        - Saves a PDF plot of unique gRNA count vs fraction threshold.
+        - Saves a spatial plate map of unique gRNA counts.
+        - Prints threshold and summary statistics.
+        - Displays intermediate DataFrames for inspection.
+    """
    from .utils import correct_metadata_column_names, correct_metadata

    def _plot_density(df, dependent_variable, dst=None):
+        """
+        Plot a kernel density estimate (KDE) of a specified variable from a DataFrame.
+
+        Args:
+            df (pd.DataFrame): DataFrame containing the data.
+            dependent_variable (str): Name of the column to plot.
+            dst (str, optional): Directory to save the plot. If None, the plot is not saved.
+
+        Side Effects:
+            - Displays the KDE plot.
+            - Saves the plot as 'dependent_variable_density.pdf' in the specified directory if dst is provided.
+        """
        """Plot a density plot of the dependent variable."""
        plt.figure(figsize=(10, 10))
        sns.kdeplot(df[dependent_variable], fill=True, alpha=0.6)
@@ -530,8 +871,22 @@ def graph_sequencing_stats(settings):


    def find_and_visualize_fraction_threshold(df, target_unique_count=5, log_x=False, log_y=False, dst=None):
        """
-
-        and visualize the relationship between
+        Identify the optimal fraction threshold that yields an average number of unique gRNAs per well
+        closest to a specified target, and visualize the relationship between threshold and unique count.
+
+        Args:
+            df (pd.DataFrame): Input DataFrame containing 'fraction', 'plateID', 'rowID', 'columnID', and 'grna' columns.
+            target_unique_count (int, optional): Desired average number of unique gRNAs per well. Default is 5.
+            log_x (bool, optional): Whether to apply a log scale to the x-axis in the plot.
+            log_y (bool, optional): Whether to apply a log scale to the y-axis in the plot.
+            dst (str, optional): Directory where the plot will be saved. If None, the plot is not saved.
+
+        Returns:
+            float: The fraction threshold value closest to achieving the target_unique_count.
+
+        Side Effects:
+            - Displays a line plot of unique gRNA counts vs. fraction thresholds.
+            - Saves the plot as 'fraction_threshold.pdf' in a subdirectory 'results/' under `dst` if provided.
        """

        def _line_plot(df, x='fraction_threshold', y='unique_count', log_x=False, log_y=False):
|