spacr 0.0.70__py3-none-any.whl → 0.0.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/sequencing.py CHANGED
@@ -1,6 +1,8 @@
- import os, re, time, math, subprocess
- import numpy as np
+ import os, gc, gzip, re, time, math, subprocess
  import pandas as pd
+ import numpy as np
+ from tqdm import tqdm
+ from Bio.Align import PairwiseAligner
  import matplotlib.pyplot as plt
  import seaborn as sns
  from Bio import pairwise2
@@ -8,6 +10,509 @@ import statsmodels.api as sm
  import statsmodels.formula.api as smf
  from scipy.stats import gmean
  from difflib import SequenceMatcher
+ from collections import Counter
+
+ def analyze_reads(settings):
+     """
+     Analyzes reads from gzipped fastq files and combines them based on specified settings.
+
+     Args:
+         settings (dict): A dictionary containing the following keys:
+             - 'src' (str): The path to the folder containing the input fastq files.
+             - 'upstream' (str, optional): The upstream sequence used for read combination. Defaults to 'CTTCTGGTAAATGGGGATGTCAAGTT'.
+             - 'downstream' (str, optional): The downstream sequence used for read combination. Defaults to 'GTTTAAGAGCTATGCTGGAAACAGCA'.
+             - 'barecode_length' (int, optional): The length of the barcode sequence. Defaults to 8.
+             - 'chunk_size' (int, optional): The number of reads to process and save at a time. Defaults to 1000000.
+
+     Returns:
+         None
+     """
+
+     def save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter):
+         """
+         Save a data chunk to an HDF5 file.
+
+         Parameters:
+         - output_file_path (str): The path to the output HDF5 file.
+         - data_chunk (list): The data chunk to be saved.
+         - chunk_counter (int): The counter for the current chunk.
+
+         Returns:
+         None
+         """
+         df = pd.DataFrame(data_chunk, columns=['combined_read', 'grna', 'plate_row', 'column', 'sample'])
+         with pd.HDFStore(output_file_path, mode='a', complevel=5, complib='blosc') as store:
+             store.put(f'reads/chunk_{chunk_counter}', df, format='table', append=True)
+
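Each chunk lands under its own HDF5 key, which is what lets map_barcodes (below) stream the table back without loading everything at once. A minimal read-side sketch, assuming pandas with PyTables installed; the file name is hypothetical:

    import pandas as pd

    with pd.HDFStore('sample1_combined.h5', mode='r') as store:
        # One key per saved chunk: /reads/chunk_0, /reads/chunk_1, ...
        for key in [k for k in store.keys() if k.startswith('/reads/chunk_')]:
            chunk = store.get(key)
            print(key, len(chunk))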
+     def reverse_complement(seq):
+         """
+         Returns the reverse complement of a DNA sequence.
+
+         Args:
+             seq (str): The DNA sequence to be reversed and complemented.
+
+         Returns:
+             str: The reverse complement of the input DNA sequence.
+
+         Example:
+             >>> reverse_complement('ATCG')
+             'CGAT'
+         """
+         complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
+         return ''.join(complement[base] for base in reversed(seq))
+
+     def get_avg_read_length(file_path, num_reads=100):
+         """
+         Calculate the average read length from a given file.
+
+         Args:
+             file_path (str): The path to the input file.
+             num_reads (int, optional): The number of reads to process. Defaults to 100.
+
+         Returns:
+             float: The average read length.
+
+         Raises:
+             FileNotFoundError: If the input file does not exist.
+         """
+         if not file_path:
+             return 0
+         total_length = 0
+         count = 0
+         with gzip.open(file_path, 'rt') as f:
+             for _ in range(num_reads):
+                 header = f.readline()  # Index line
+                 if not header:  # readline() returns '' at EOF; it never raises StopIteration
+                     break
+                 read = f.readline().strip()
+                 total_length += len(read)
+                 f.readline()  # Skip plus line
+                 f.readline()  # Skip quality line
+                 count += 1
+         return total_length / count if count > 0 else 0
+
+     def parse_gz_files(folder_path):
+         """
+         Parses the .fastq.gz files in the specified folder path and returns a dictionary
+         containing the sample names and their corresponding file paths.
+
+         Args:
+             folder_path (str): The path to the folder containing the .fastq.gz files.
+
+         Returns:
+             dict: A dictionary where the keys are the sample names and the values are
+             dictionaries containing the file paths for the 'R1' and 'R2' read directions.
+         """
+         files = os.listdir(folder_path)
+         gz_files = [f for f in files if f.endswith('.fastq.gz')]
+
+         samples_dict = {}
+         for gz_file in gz_files:
+             parts = gz_file.split('_')
+             sample_name = parts[0]
+             read_direction = parts[1]
+
+             if sample_name not in samples_dict:
+                 samples_dict[sample_name] = {}
+
+             if read_direction == "R1":
+                 samples_dict[sample_name]['R1'] = os.path.join(folder_path, gz_file)
+             elif read_direction == "R2":
+                 samples_dict[sample_name]['R2'] = os.path.join(folder_path, gz_file)
+
+         return samples_dict
+
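parse_gz_files takes the sample name from the first underscore-separated token and the read direction from the second, so it assumes Illumina-style file names shaped roughly like this (hypothetical example):

    # folder contents:          sampleA_R1_001.fastq.gz, sampleA_R2_001.fastq.gz
    # parse_gz_files(folder) -> {'sampleA': {'R1': '<folder>/sampleA_R1_001.fastq.gz',
    #                                        'R2': '<folder>/sampleA_R2_001.fastq.gz'}}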
+     def find_overlap(r1_read_rc, r2_read):
+         """
+         Find the best alignment between two DNA reads.
+
+         Parameters:
+         - r1_read_rc (str): The reverse complement of the first DNA read.
+         - r2_read (str): The second DNA read.
+
+         Returns:
+         - best_alignment (Alignment): The best alignment between the two DNA reads.
+         """
+         aligner = PairwiseAligner()
+         alignments = aligner.align(r1_read_rc, r2_read)
+         best_alignment = alignments[0]
+         return best_alignment
+
+     def combine_reads(samples_dict, src, chunk_size, barecode_length, upstream, downstream):
+         """
+         Combine reads from paired-end sequencing files and save the combined reads to a new file.
+
+         Args:
+             samples_dict (dict): A dictionary mapping sample names to file paths of paired-end sequencing files.
+             src (str): The source directory where the combined reads will be saved.
+             chunk_size (int): The number of reads to be processed and saved as a chunk.
+             barecode_length (int): The length of the barcode sequence.
+             upstream (str): The upstream sequence used for read splitting.
+             downstream (str): The downstream sequence used for read splitting.
+
+         Returns:
+             None
+         """
+         dst = os.path.join(src, 'combined_reads')
+         if not os.path.exists(dst):
+             os.makedirs(dst)
+
+         for sample, paths in samples_dict.items():
+             print(f'Processing: {sample} with the files: {paths}')
+             r1_path = paths.get('R1')
+             r2_path = paths.get('R2')
+
+             output_file_path = os.path.join(dst, f"{sample}_combined.h5")
+             qc_file_path = os.path.join(dst, f"{sample}_qc.csv")
+
+             r1_file = gzip.open(r1_path, 'rt') if r1_path else None
+             r2_file = gzip.open(r2_path, 'rt') if r2_path else None
+
+             chunk_counter = 0
+             data_chunk = []
+
+             success = 0
+             fail = 0
+
+             # Calculate initial average read length
+             avg_read_length_r1 = get_avg_read_length(r1_path, 100)
+             avg_read_length_r2 = get_avg_read_length(r2_path, 100)
+             avg_read_length = (avg_read_length_r1 + avg_read_length_r2) / 2 if avg_read_length_r1 and avg_read_length_r2 else 0
+
+             print(f'Initial avg_read_length: {avg_read_length}')
+
+             # Estimate the initial number of reads based on the file size (guard against a zero average length)
+             r1_size_est = os.path.getsize(r1_path) // (avg_read_length * 4) if r1_path and avg_read_length else 0
+             r2_size_est = os.path.getsize(r2_path) // (avg_read_length * 4) if r2_path and avg_read_length else 0
+             max_size = max(r1_size_est, r2_size_est) * 10
+
+             with tqdm(total=max_size, desc=f"Processing {sample}") as pbar:
+                 total_length_processed = 0
+                 read_count = 0
+
+                 while True:
+                     try:
+                         r1_index = next(r1_file).strip() if r1_file else None
+                         r1_read = next(r1_file).strip() if r1_file else None
+                         r1_plus = next(r1_file).strip() if r1_file else None
+                         r1_quality = next(r1_file).strip() if r1_file else None
+
+                         r2_index = next(r2_file).strip() if r2_file else None
+                         r2_read = next(r2_file).strip() if r2_file else None
+                         r2_plus = next(r2_file).strip() if r2_file else None
+                         r2_quality = next(r2_file).strip() if r2_file else None
+
+                         pbar.update(1)
+
+                         if r1_index and r2_index and r1_index.split(' ')[0] != r2_index.split(' ')[0]:
+                             fail += 1
+                             print(f"Index mismatch: {r1_index} != {r2_index}")
+                             continue
+
+                         r1_read_rc = reverse_complement(r1_read) if r1_read else ''
+                         r1_quality_rc = r1_quality[::-1] if r1_quality else ''
+
+                         r1_rc_split_index = r1_read_rc.find(upstream)
+                         r2_split_index = r2_read.find(upstream)
+
+                         if r1_rc_split_index == -1 or r2_split_index == -1:
+                             fail += 1
+                             continue
+                         else:
+                             success += 1
+
+                         read1_fragment = r1_read_rc[:r1_rc_split_index]
+                         read2_fragment = r2_read[r2_split_index:]
+                         read_combo = read1_fragment + read2_fragment
+
+                         combo_split_index_1 = read_combo.find(upstream)
+                         combo_split_index_2 = read_combo.find(downstream)
+
+                         barcode_1 = read_combo[combo_split_index_1 - barecode_length:combo_split_index_1]
+                         grna = read_combo[combo_split_index_1 + len(upstream):combo_split_index_2]
+                         barcode_2 = read_combo[combo_split_index_2 + len(downstream):combo_split_index_2 + len(downstream) + barecode_length]
+                         barcode_2 = reverse_complement(barcode_2)
+                         data_chunk.append((read_combo, grna, barcode_1, barcode_2, sample))
+
+                         read_count += 1
+                         total_length_processed += len(r1_read) + len(r2_read)
+
+                         # Periodically update the average read length and total
+                         if read_count % 10000 == 0:
+                             avg_read_length = total_length_processed / (read_count * 2)
+                             max_size = (os.path.getsize(r1_path) + os.path.getsize(r2_path)) // (avg_read_length * 4)
+                             pbar.total = max_size
+
+                         if len(data_chunk) >= chunk_size:
+                             save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter)
+                             chunk_counter += 1
+                             data_chunk = []
+
+                     except StopIteration:
+                         break
+
+             # Save any remaining data_chunk
+             if data_chunk:
+                 save_chunk_to_hdf5(output_file_path, data_chunk, chunk_counter)
+
+             # Save QC metrics
+             qc = {'success': success, 'failed': fail}
+             qc_df = pd.DataFrame([qc])
+             qc_df.to_csv(qc_file_path, index=False)
+
+     settings.setdefault('upstream', 'CTTCTGGTAAATGGGGATGTCAAGTT')
+     settings.setdefault('downstream', 'GTTTAAGAGCTATGCTGGAAACAGCA')
+     settings.setdefault('barecode_length', 8)
+     settings.setdefault('chunk_size', 1000000)
+
+     samples_dict = parse_gz_files(settings['src'])
+     combine_reads(samples_dict, settings['src'], settings['chunk_size'], settings['barecode_length'], settings['upstream'], settings['downstream'])
+
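Since every settings key except 'src' has a default, a minimal invocation looks like the sketch below (path hypothetical):

    settings = {'src': '/data/run1/fastq'}  # hypothetical folder of paired .fastq.gz files
    analyze_reads(settings)
    # writes <src>/combined_reads/<sample>_combined.h5 and <sample>_qc.csv per sample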
272
+ def map_barcodes(h5_file_path, settings={}):
273
+ """
274
+ Maps barcodes and performs quality control on sequencing data.
275
+
276
+ Args:
277
+ h5_file_path (str): The file path to the HDF5 file containing the sequencing data.
278
+ settings (dict, optional): Additional settings for the mapping and quality control process. Defaults to {}.
279
+
280
+ Returns:
281
+ None
282
+ """
283
+ def get_read_qc(df, df_cleaned):
284
+ """
285
+ Calculate quality control metrics for sequencing reads.
286
+
287
+ Parameters:
288
+ - df: DataFrame containing the sequencing reads.
289
+ - df_cleaned: DataFrame containing the cleaned sequencing reads.
290
+
291
+ Returns:
292
+ - qc_dict: Dictionary containing the following quality control metrics:
293
+ - 'reads': Total number of reads.
294
+ - 'cleaned_reads': Total number of cleaned reads.
295
+ - 'NaN_grna': Number of reads with missing 'grna_metadata'.
296
+ - 'NaN_plate_row': Number of reads with missing 'plate_row_metadata'.
297
+ - 'NaN_column': Number of reads with missing 'column_metadata'.
298
+ - 'NaN_plate': Number of reads with missing 'plate_metadata'.
299
+ - 'unique_grna': Counter object containing the count of unique 'grna_metadata' values.
300
+ - 'unique_plate_row': Counter object containing the count of unique 'plate_row_metadata' values.
301
+ - 'unique_column': Counter object containing the count of unique 'column_metadata' values.
302
+ - 'unique_plate': Counter object containing the count of unique 'plate_metadata' values.
303
+ """
304
+ qc_dict = {}
305
+ qc_dict['reads'] = len(df)
306
+ qc_dict['cleaned_reads'] = len(df_cleaned)
307
+ qc_dict['NaN_grna'] = df['grna_metadata'].isna().sum()
308
+ qc_dict['NaN_plate_row'] = df['plate_row_metadata'].isna().sum()
309
+ qc_dict['NaN_column'] = df['column_metadata'].isna().sum()
310
+ qc_dict['NaN_plate'] = df['plate_metadata'].isna().sum()
311
+ qc_dict['unique_grna'] = Counter(df['grna_metadata'].dropna().tolist())
312
+ qc_dict['unique_plate_row'] = Counter(df['plate_row_metadata'].dropna().tolist())
313
+ qc_dict['unique_column'] = Counter(df['column_metadata'].dropna().tolist())
314
+ qc_dict['unique_plate'] = Counter(df['plate_metadata'].dropna().tolist())
315
+
316
+ return qc_dict
317
+
318
+ def mapping_dicts(df, settings):
319
+ """
320
+ Maps the values in the DataFrame columns to corresponding metadata using dictionaries.
321
+
322
+ Args:
323
+ df (pandas.DataFrame): The DataFrame containing the data to be mapped.
324
+ settings (dict): A dictionary containing the settings for mapping.
325
+
326
+ Returns:
327
+ pandas.DataFrame: The DataFrame with the mapped metadata columns added.
328
+ """
329
+ grna_df = pd.read_csv(settings['grna'])
330
+ barcode_df = pd.read_csv(settings['barcodes'])
331
+
332
+ grna_dict = {row['sequence']: row['name'] for _, row in grna_df.iterrows()}
333
+ plate_row_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('p')}
334
+ column_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('c')}
335
+ plate_dict = settings['plate_dict']
336
+
337
+ df['grna_metadata'] = df['grna'].map(grna_dict)
338
+ df['grna_length'] = df['grna'].apply(len)
339
+ df['plate_row_metadata'] = df['plate_row'].map(plate_row_dict)
340
+ df['column_metadata'] = df['column'].map(column_dict)
341
+ df['plate_metadata'] = df['sample'].map(plate_dict)
342
+
343
+ return df
344
+
345
+ settings.setdefault('grna', '/home/carruthers/Documents/grna_barecodes.csv')
346
+ settings.setdefault('barcodes', '/home/carruthers/Documents/SCREEN_BARECODES.csv')
347
+ settings.setdefault('plate_dict', {'EO1': 'plate1', 'EO2': 'plate2', 'EO3': 'plate3', 'EO4': 'plate4', 'EO5': 'plate5', 'EO6': 'plate6', 'EO7': 'plate7', 'EO8': 'plate8'})
348
+ settings.setdefault('test', False)
349
+ settings.setdefault('verbose', True)
350
+ settings.setdefault('min_itemsize', 1000)
351
+
352
+ qc_file_path = os.path.splitext(h5_file_path)[0] + '_qc_step_2.csv'
353
+ unique_grna_file_path = os.path.splitext(h5_file_path)[0] + '_unique_grna.csv'
354
+ unique_plate_row_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate_row.csv'
355
+ unique_column_file_path = os.path.splitext(h5_file_path)[0] + '_unique_column.csv'
356
+ unique_plate_file_path = os.path.splitext(h5_file_path)[0] + '_unique_plate.csv'
357
+ new_h5_file_path = os.path.splitext(h5_file_path)[0] + '_cleaned.h5'
358
+
359
+ # Initialize the HDF5 store for cleaned data
360
+ store_cleaned = pd.HDFStore(new_h5_file_path, mode='a', complevel=5, complib='blosc')
361
+
362
+ # Initialize the overall QC metrics
363
+ overall_qc = {
364
+ 'reads': 0,
365
+ 'cleaned_reads': 0,
366
+ 'NaN_grna': 0,
367
+ 'NaN_plate_row': 0,
368
+ 'NaN_column': 0,
369
+ 'NaN_plate': 0,
370
+ 'unique_grna': Counter(),
371
+ 'unique_plate_row': Counter(),
372
+ 'unique_column': Counter(),
373
+ 'unique_plate': Counter()
374
+ }
375
+
376
+ with pd.HDFStore(h5_file_path, mode='r') as store:
377
+ keys = [key for key in store.keys() if key.startswith('/reads/chunk_')]
378
+
379
+ for key in keys:
380
+ df = store.get(key)
381
+ df = mapping_dicts(df, settings)
382
+ df_cleaned = df.dropna()
383
+ qc_dict = get_read_qc(df, df_cleaned)
384
+
385
+ # Accumulate QC metrics
386
+ overall_qc['reads'] += qc_dict['reads']
387
+ overall_qc['cleaned_reads'] += qc_dict['cleaned_reads']
388
+ overall_qc['NaN_grna'] += qc_dict['NaN_grna']
389
+ overall_qc['NaN_plate_row'] += qc_dict['NaN_plate_row']
390
+ overall_qc['NaN_column'] += qc_dict['NaN_column']
391
+ overall_qc['NaN_plate'] += qc_dict['NaN_plate']
392
+ overall_qc['unique_grna'].update(qc_dict['unique_grna'])
393
+ overall_qc['unique_plate_row'].update(qc_dict['unique_plate_row'])
394
+ overall_qc['unique_column'].update(qc_dict['unique_column'])
395
+ overall_qc['unique_plate'].update(qc_dict['unique_plate'])
396
+
397
+ df_cleaned = df_cleaned[df_cleaned['grna_length'] >= 30]
398
+
399
+ # Save cleaned data to the new HDF5 store
400
+ store_cleaned.put('reads/cleaned_data', df_cleaned, format='table', append=True)
401
+
402
+ del df_cleaned, df
403
+ gc.collect()
404
+
405
+ # Convert the Counter objects to DataFrames and save them to CSV files
406
+ unique_grna_df = pd.DataFrame(overall_qc['unique_grna'].items(), columns=['key', 'value'])
407
+ unique_plate_row_df = pd.DataFrame(overall_qc['unique_plate_row'].items(), columns=['key', 'value'])
408
+ unique_column_df = pd.DataFrame(overall_qc['unique_column'].items(), columns=['key', 'value'])
409
+ unique_plate_df = pd.DataFrame(overall_qc['unique_plate'].items(), columns=['key', 'value'])
410
+
411
+ unique_grna_df.to_csv(unique_grna_file_path, index=False)
412
+ unique_plate_row_df.to_csv(unique_plate_row_file_path, index=False)
413
+ unique_column_df.to_csv(unique_column_file_path, index=False)
414
+ unique_plate_df.to_csv(unique_plate_file_path, index=False)
415
+
416
+ # Remove the unique counts from overall_qc for the main QC CSV file
417
+ del overall_qc['unique_grna']
418
+ del overall_qc['unique_plate_row']
419
+ del overall_qc['unique_column']
420
+ del overall_qc['unique_plate']
421
+
422
+ # Combine all remaining QC metrics into a single DataFrame and save it to CSV
423
+ qc_df = pd.DataFrame([overall_qc])
424
+ qc_df.to_csv(qc_file_path, index=False)
425
+
426
+ # Close the HDF5 store
427
+ store_cleaned.close()
428
+
429
+ gc.collect()
430
+ return
431
+
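mapping_dicts expects both CSVs to carry 'sequence' and 'name' columns, and tells plate/row barcodes from column barcodes by the name prefix ('p' vs 'c'). A minimal sketch with hypothetical paths and sequences:

    # Assumed barcode CSV layout:
    #   sequence,name
    #   ACGTACGT,p1    <- plate/row barcode (name starts with 'p')
    #   TTGGCCAA,c1    <- column barcode (name starts with 'c')
    settings = {'grna': 'grna_barcodes.csv', 'barcodes': 'screen_barcodes.csv'}  # hypothetical paths
    map_barcodes('sample1_combined.h5', settings)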
+ def map_barcodes_v1(h5_file_path, settings=None):
+     """Earlier version of map_barcodes: writes one QC row per chunk instead of aggregated totals."""
+
+     def get_read_qc(df, df_cleaned):
+         qc_dict = {}
+         qc_dict['reads'] = len(df)
+         qc_dict['cleaned_reads'] = len(df_cleaned)
+         qc_dict['NaN_grna'] = df['grna_metadata'].isna().sum()
+         qc_dict['NaN_plate_row'] = df['plate_row_metadata'].isna().sum()
+         qc_dict['NaN_column'] = df['column_metadata'].isna().sum()
+         qc_dict['NaN_plate'] = df['plate_metadata'].isna().sum()
+
+         qc_dict['unique_grna'] = len(df['grna_metadata'].dropna().unique().tolist())
+         qc_dict['unique_plate_row'] = len(df['plate_row_metadata'].dropna().unique().tolist())
+         qc_dict['unique_column'] = len(df['column_metadata'].dropna().unique().tolist())
+         qc_dict['unique_plate'] = len(df['plate_metadata'].dropna().unique().tolist())
+         qc_dict['value_counts_grna'] = df['grna_metadata'].value_counts(dropna=True)
+         qc_dict['value_counts_plate_row'] = df['plate_row_metadata'].value_counts(dropna=True)
+         qc_dict['value_counts_column'] = df['column_metadata'].value_counts(dropna=True)
+
+         return qc_dict
+
+     def mapping_dicts(df, settings):
+         grna_df = pd.read_csv(settings['grna'])
+         barcode_df = pd.read_csv(settings['barcodes'])
+
+         grna_dict = {row['sequence']: row['name'] for _, row in grna_df.iterrows()}
+         plate_row_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('p')}
+         column_dict = {row['sequence']: row['name'] for _, row in barcode_df.iterrows() if row['name'].startswith('c')}
+         plate_dict = settings['plate_dict']
+
+         df['grna_metadata'] = df['grna'].map(grna_dict)
+         df['grna_length'] = df['grna'].apply(len)
+         df['plate_row_metadata'] = df['plate_row'].map(plate_row_dict)
+         df['column_metadata'] = df['column'].map(column_dict)
+         df['plate_metadata'] = df['sample'].map(plate_dict)
+
+         return df
+
+     settings = {} if settings is None else settings  # avoid a shared mutable default argument
+     settings.setdefault('grna', '/home/carruthers/Documents/grna_barcodes.csv')
+     settings.setdefault('barcodes', '/home/carruthers/Documents/SCREEN_BARCODES.csv')
+     settings.setdefault('plate_dict', {'EO1': 'plate1', 'EO2': 'plate2', 'EO3': 'plate3', 'EO4': 'plate4', 'EO5': 'plate5', 'EO6': 'plate6', 'EO7': 'plate7', 'EO8': 'plate8'})
+     settings.setdefault('test', False)
+     settings.setdefault('verbose', True)
+     settings.setdefault('min_itemsize', 1000)
+
+     qc_file_path = os.path.splitext(h5_file_path)[0] + '_qc_step_2.csv'
+     new_h5_file_path = os.path.splitext(h5_file_path)[0] + '_cleaned.h5'
+
+     # Initialize the HDF5 store for cleaned data
+     store_cleaned = pd.HDFStore(new_h5_file_path, mode='a', complevel=5, complib='blosc')
+
+     # Initialize the list of per-chunk QC metrics
+     qc_df_list = []
+
+     with pd.HDFStore(h5_file_path, mode='r') as store:
+         keys = [key for key in store.keys() if key.startswith('/reads/chunk_')]
+
+         for key in keys:
+             df = store.get(key)
+             df = mapping_dicts(df, settings)
+             df_cleaned = df.dropna()
+             qc_dict = get_read_qc(df, df_cleaned)
+             qc_df_list.append(qc_dict)
+             df_cleaned = df_cleaned[df_cleaned['grna_length'] >= 30]
+
+             # Save cleaned data to the new HDF5 store
+             store_cleaned.put('reads/cleaned_data', df_cleaned, format='table', append=True)
+
+     # Combine all QC metrics into a single DataFrame and save it to CSV
+     qc_df = pd.DataFrame(qc_df_list)
+     qc_df.to_csv(qc_file_path, index=False)
+
+     # Close the HDF5 store
+     store_cleaned.close()
+     return
+
+ def map_barcodes_folder(src, settings=None):
+     for file in os.listdir(src):
+         if file.endswith('.h5'):
+             print(file)
+             path = os.path.join(src, file)
+             map_barcodes(path, settings)
+             gc.collect()
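map_barcodes_folder then drives the same step for every .h5 file in a directory, e.g. (paths hypothetical):

    map_barcodes_folder('/data/run1/fastq/combined_reads',
                        settings={'grna': 'grna_barcodes.csv', 'barcodes': 'screen_barcodes.csv'})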

  def reverse_complement(dna_sequence):
      complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N':'N'}
@@ -146,7 +651,6 @@ def count_mismatches(seq1, seq2, align_length=10):
      mismatches = sum(c1 != c2 for c1, c2 in zip(seq1_aligned, seq2_aligned))
      return mismatches

-
  def get_sequence_data(r1,r2):
      forward_regex = re.compile(r'^(...GGTGCCACTT)TTTCAAGTTG.*?TTCTAGCTCT(AAAAC[A-Z]{18,22}AACTT)GACATCCCCA.*?AAGGCAAACA(CCCCCTTCGG....).*')
      r1fd = forward_regex.search(r1)
@@ -640,491 +1144,4 @@ def generate_fraction_map(df, gene_column, min_=10, plates=['p1','p2','p3','p4']
      independent_variables = independent_variables.drop('sum', axis=1)
      independent_variables.index.name = 'prc'
      independent_variables = independent_variables.loc[:, (independent_variables.sum() != 0)]
-     return independent_variables
-
- # Check if filename or path
- def split_filenames(df, filename_column):
-     plate_ls = []
-     well_ls = []
-     col_ls = []
-     row_ls = []
-     field_ls = []
-     obj_ls = []
-     ls = df[filename_column].tolist()
-     if '/' in ls[0]:
-         file_list = [os.path.basename(path) for path in ls]
-     else:
-         file_list = ls
-     print('first file',file_list[0])
-     for filename in file_list:
-         plate = filename.split('_')[0]
-         plate = plate.split('plate')[1]
-         well = filename.split('_')[1]
-         field = filename.split('_')[2]
-         object_nr = filename.split('_')[3]
-         object_nr = object_nr.split('.')[0]
-         object_nr = 'o'+str(object_nr)
-         if re.match('A..', well):
-             row = 'r1'
-         if re.match('B..', well):
-             row = 'r2'
-         if re.match('C..', well):
-             row = 'r3'
-         if re.match('D..', well):
-             row = 'r4'
-         if re.match('E..', well):
-             row = 'r5'
-         if re.match('F..', well):
-             row = 'r6'
-         if re.match('G..', well):
-             row = 'r7'
-         if re.match('H..', well):
-             row = 'r8'
-         if re.match('I..', well):
-             row = 'r9'
-         if re.match('J..', well):
-             row = 'r10'
-         if re.match('K..', well):
-             row = 'r11'
-         if re.match('L..', well):
-             row = 'r12'
-         if re.match('M..', well):
-             row = 'r13'
-         if re.match('N..', well):
-             row = 'r14'
-         if re.match('O..', well):
-             row = 'r15'
-         if re.match('P..', well):
-             row = 'r16'
-         if re.match('.01', well):
-             col = 'c1'
-         if re.match('.02', well):
-             col = 'c2'
-         if re.match('.03', well):
-             col = 'c3'
-         if re.match('.04', well):
-             col = 'c4'
-         if re.match('.05', well):
-             col = 'c5'
-         if re.match('.06', well):
-             col = 'c6'
-         if re.match('.07', well):
-             col = 'c7'
-         if re.match('.08', well):
-             col = 'c8'
-         if re.match('.09', well):
-             col = 'c9'
-         if re.match('.10', well):
-             col = 'c10'
-         if re.match('.11', well):
-             col = 'c11'
-         if re.match('.12', well):
-             col = 'c12'
-         if re.match('.13', well):
-             col = 'c13'
-         if re.match('.14', well):
-             col = 'c14'
-         if re.match('.15', well):
-             col = 'c15'
-         if re.match('.16', well):
-             col = 'c16'
-         if re.match('.17', well):
-             col = 'c17'
-         if re.match('.18', well):
-             col = 'c18'
-         if re.match('.19', well):
-             col = 'c19'
-         if re.match('.20', well):
-             col = 'c20'
-         if re.match('.21', well):
-             col = 'c21'
-         if re.match('.22', well):
-             col = 'c22'
-         if re.match('.23', well):
-             col = 'c23'
-         if re.match('.24', well):
-             col = 'c24'
-         plate_ls.append(plate)
-         well_ls.append(well)
-         field_ls.append(field)
-         obj_ls.append(object_nr)
-         row_ls.append(row)
-         col_ls.append(col)
-     df['file'] = ls
-     df['plate'] = plate_ls
-     df['well'] = well_ls
-     df['row'] = row_ls
-     df['col'] = col_ls
-     df['field'] = field_ls
-     df['obj'] = obj_ls
-     df['plate_well'] = df['plate']+'_'+df['well']
-     df = df.set_index(filename_column)
-     return df
-
- def rename_plate_metadata(df):
-     try:
-         df = df.drop(['plateID'], axis=1)
-         df = df.drop(['rowID'], axis=1)
-         df = df.drop(['columnID'], axis=1)
-         df = df.drop(['plate_row_col'], axis=1)
-         df = df.drop(['Unnamed: 0'], axis=1)
-         df = df.drop(['Unnamed: 0.1'], axis=1)
-     except:
-         next
-
-     df['plate'] = df['plate'].astype('string')
-     df.plate.replace('1', 'A', inplace=True)
-     df.plate.replace('2', 'B', inplace=True)
-     df.plate.replace('3', 'C', inplace=True)
-     df.plate.replace('4', 'D', inplace=True)
-     df.plate.replace('5', 'E', inplace=True)
-     df.plate.replace('6', 'F', inplace=True)
-     df.plate.replace('7', 'G', inplace=True)
-     df.plate.replace('8', 'H', inplace=True)
-     df.plate.replace('9', 'I', inplace=True)
-     df.plate.replace('10', 'J', inplace=True)
-
-     df.plate.replace('A', 'p1', inplace=True)# 1 - 1
-     df.plate.replace('B', 'p2', inplace=True)# 2 - 2
-     df.plate.replace('C', 'p3', inplace=True)# 3 - 3
-     df.plate.replace('E', 'p4', inplace=True)# 5 - 4
-
-     df.plate.replace('F', 'p5', inplace=True)# 6 - 5
-     df.plate.replace('G', 'p6', inplace=True)# 7 - 6
-     df.plate.replace('H', 'p7', inplace=True)# 8 - 7
-     df.plate.replace('I', 'p8', inplace=True)# 9 - 8
-
-     df['plateID'] = df['plate']
-
-     df.loc[(df['plateID'].isin(['D'])) & (df['col'].isin(['c1', 'c2', 'c3'])), 'plate'] = 'p1'
-     df.loc[(df['plateID'].isin(['D'])) & (df['col'].isin(['c4', 'c5', 'c6'])), 'plate'] = 'p2'
-     df.loc[(df['plateID'].isin(['D'])) & (df['col'].isin(['c7', 'c8', 'c9'])), 'plate'] = 'p3'
-     df.loc[(df['plateID'].isin(['D'])) & (df['col'].isin(['c10', 'c11', 'c12'])), 'plate'] = 'p4'
-
-     df.loc[(df['plateID'].isin(['J'])) & (df['col'].isin(['c1', 'c2', 'c3'])), 'plate'] = 'p5'
-     df.loc[(df['plateID'].isin(['J'])) & (df['col'].isin(['c4', 'c5', 'c6'])), 'plate'] = 'p6'
-     df.loc[(df['plateID'].isin(['J'])) & (df['col'].isin(['c7', 'c8', 'c9'])), 'plate'] = 'p7'
-     df.loc[(df['plateID'].isin(['J'])) & (df['col'].isin(['c10', 'c11', 'c12'])), 'plate'] = 'p8'
-
-     df.loc[(df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c1', 'c4', 'c7', 'c10'])), 'col'] = 'c1'
-     df.loc[(df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c2', 'c5', 'c8', 'c11'])), 'col'] = 'c2'
-     df.loc[(df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c3', 'c6', 'c9', 'c12'])), 'col'] = 'c3'
-
-     df.loc[(~df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c1'])), 'col'] = 'c25'
-     df.loc[(~df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c2'])), 'col'] = 'c26'
-     df.loc[(~df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c3'])), 'col'] = 'c27'
-
-     df.loc[(~df['plateID'].isin(['D', 'J'])) & (df['col'].isin(['c1'])), 'col'] = 'c25'
-
-     df = df.drop(['plateID'], axis=1)
-
-     df = df.loc[~df['plate'].isin(['D', 'J'])]
-
-     screen_cols = ['c1','c2','c3','c4','c5','c6','c7','c8','c9','c10','c11','c12','c13','c14','c15','c16','c17','c18','c19','c20','c21','c22','c23','c24']
-     screen_plates = ['p1','p2','p3','p4']
-     positive_control_plates = ['p5','p6','p7','p8']
-     positive_control_cols = screen_cols
-     negative_control_cols = ['c25','c26','c27']
-     #extra_plates = ['p9','p10']
-     cond_ls = []
-
-     cols = df.col.tolist()
-     for index, plate in enumerate(df.plate.tolist()):
-         co = cols[index]
-         if plate in screen_plates:
-             if co in screen_cols:
-                 cond = 'SCREEN'
-             if co in negative_control_cols:
-                 cond = 'NC'
-         if plate in positive_control_plates:
-             if co in positive_control_cols:
-                 cond = 'PC'
-             if co in negative_control_cols:
-                 cond = 'NC'
-         cond_ls.append(cond)
-
-     df['cond'] = cond_ls
-     df['plate'] = df['plate'].astype('string')
-     df['row'] = df['row'].astype('string')
-     df['col'] = df['col'].astype('string')
-     df['obj'] = df['obj'].astype('string')
-     df['prco'] = df['plate']+'_'+df['row']+'_'+df['col']+'_'+df['field']+'_'+df['obj']
-     df['prc'] = df['plate']+'_'+df['row']+'_'+df['col']
-     df = df.set_index(['prco'], drop=True)
-     df = df.sort_values(by = ['plate'], ascending = [True], na_position = 'first')
-     values, counts = np.unique(df['plate'], return_counts=True)
-     print('plates:', values)
-     print('well count:', counts)
-     return df
-
- def plot_reg_res(df, coef_col='coef', col_p='P>|t|'):
-     df['gene'] = df.index
-     df[coef_col] = pd.to_numeric(df[coef_col], errors='coerce')
-     df[col_p] = pd.to_numeric(df[col_p], errors='coerce')
-     df = df.sort_values(by = [coef_col], ascending = [False], na_position = 'first')
-     df['color'] = 'None'
-     df.loc[df['gene'].str.contains('239740'), 'color'] = '239740'
-     df.loc[df['gene'].str.contains('205250'), 'color'] = '205250'
-
-     df.loc[df['gene'].str.contains('000000'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000001'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000002'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000003'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000004'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000005'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000006'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000007'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000008'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000009'), 'color'] = '000000'
-     df.loc[df['gene'].str.contains('000010'), 'color'] = '000000'
-     fig, ax = plt.subplots(figsize=(10,10))
-     df.loc[df[col_p] == 0.000, col_p] = 0.001
-     df['logp'] = -np.log10(df[col_p])
-     sns.scatterplot(data = df, x = coef_col, y = 'logp', legend = False, ax = ax,
-                     hue= 'color', hue_order = ['239740','205250','None', '000000'],
-                     palette = ['purple', 'teal', 'lightgrey', 'black'],
-                     size = 'color', sizes = (100, 10))
-     g14 = df[df['gene'].str.contains('239740')]
-     r18 = df[df['gene'].str.contains('205250')]
-     res = pd.concat([g14, r18], axis=0)
-     res = res[[coef_col, col_p]]
-     print(res)
-     return df, res
-
- def reg_model(iv_loc,dv_loc):
-     independent_variables = pd.read_csv(iv_loc)
-     dependent_variable = pd.read_csv(dv_loc)
-     independent_variables = independent_variables.set_index('prc')
-     columns = independent_variables.columns
-     new_columns = [col.replace('TGGT1_', '') for col in columns]
-     independent_variables.columns = new_columns
-
-     dependent_variable = dependent_variable.set_index('prc')
-
-     reg_input = pd.DataFrame(pd.merge(independent_variables, dependent_variable, left_index=True, right_index=True))
-     reg_input = reg_input.dropna(axis=0, how='any')
-     reg_input = reg_input.dropna(axis=1, how='any')
-     print('Number of wells',len(reg_input))
-     x = reg_input.drop(['score'], axis=1)
-     x = sm.add_constant(x)
-     y = np.log10(reg_input['score']+1)
-     model = sm.OLS(y, x).fit()
-     predictions = model.predict(x)
-     results_summary = model.summary()
-     print(results_summary)
-     results_as_html = results_summary.tables[1].as_html()
-     results_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
-     df, res = plot_reg_res(df=results_df)
-     return df, res
-
- def mixed_model(iv_loc,dv_loc):
-     independent_variables = pd.read_csv(iv_loc)
-     dependent_variable = pd.read_csv(dv_loc)
-     independent_variables = independent_variables.set_index('prc')
-     columns = independent_variables.columns
-     new_columns = [col.replace('TGGT1_', '') for col in columns]
-     independent_variables.columns = new_columns
-     dependent_variable = dependent_variable.set_index('prc')
-     reg_input = pd.DataFrame(pd.merge(independent_variables, dependent_variable, left_index=True, right_index=True))
-     reg_input = reg_input.dropna(axis=0, how='any')
-
-     y = np.log10(reg_input['score']+1)
-     X = reg_input.drop('score', axis=1)
-     X.columns = pd.MultiIndex.from_tuples([tuple(col.split('_')) for col in X.columns],
-                                           names=['main_variable', 'sub_variable'])
-     # Melt the DataFrame
-     X_long = X.melt(ignore_index=False, var_name=['main_variable', 'sub_variable'], value_name='value')
-     X_long = X_long[X_long['value']>0]
-
-     # Create a new column to represent the nested structure of gRNA within gene
-     X_long['gene_gRNA'] = X_long['main_variable'].astype(str) + "_" + X_long['sub_variable'].astype(str)
-
-     # Add 'score' to the DataFrame
-     X_long['score'] = y
-
-     # Create and convert the plate, row, and column variables to type category
-     X_long.reset_index(inplace=True)
-     split_values = X_long['prc'].str.split('_', expand=True)
-     X_long[['plate', 'row', 'col']] = split_values
-     X_long['plate'] = X_long['plate'].str[1:]
-     X_long['plate'] = X_long['plate'].astype(int)
-     X_long['row'] = X_long['row'].str[1:]
-     X_long['row'] = X_long['row'].astype(int)
-     X_long['col'] = X_long['col'].str[1:]
-     X_long['col'] = X_long['col'].astype(int)
-     X_long = X_long.set_index('prc')
-     # Create a new column to represent the nested structure of plate, row, and column
-     X_long['plate_row_col'] = X_long['plate'].astype(str) + "_" + X_long['row'].astype(str) + "_" + X_long['col'].astype(str)
-     n_group = pd.DataFrame(X_long.groupby(['gene_gRNA']).count()['main_variable'])
-     n_group = n_group.rename({'main_variable': 'n_group'}, axis=1)
-     n_group = n_group.reset_index(drop=False)
-     X_long = pd.merge(X_long, n_group, on='gene_gRNA')
-     X_long = X_long[X_long['n_group']>1]
-     #print(X_long.isna().sum())
-
-     X_long['main_variable'] = X_long['main_variable'].astype('category')
-     X_long['sub_variable'] = X_long['sub_variable'].astype('category')
-     X_long['plate'] = X_long['plate'].astype('category')
-     X_long['row'] = X_long['row'].astype('category')
-     X_long['col'] = X_long['col'].astype('category')
-     X_long = pd.DataFrame(X_long)
-     print(X_long)
-
-     md = smf.mixedlm("score ~ C(main_variable)", X_long,
-                      groups=X_long["sub_variable"])
-
-     # Define your nonlinear function here
-     def nonlinear_function(x, *params):
-         pass # Implement non linear function here
-
-     mdf = md.fit(method='bfgs', maxiter=1000)
-     print(mdf.summary())
-     summary = mdf.summary()
-     df = pd.DataFrame(summary.tables[1])
-     df, res = plot_reg_res(df, coef_col='Coef.', col_p='P>|z|')
-     return df, res
-
- def calculate_accuracy(df):
-     df.loc[df['pc_score'] <= 0.5, 'pred'] = 0
-     df.loc[df['pc_score'] >= 0.5, 'pred'] = 1
-     df.loc[df['cond'] == 'NC', 'lab'] = 0
-     df.loc[df['cond'] == 'PC', 'lab'] = 1
-     df = df[df['cond'] != 'SCREEN']
-     df_nc = df[df['cond'] != 'NC']
-     df_pc = df[df['cond'] != 'PC']
-     correct = []
-     all_ls = []
-     pred_list = df['pred'].tolist()
-     lab_list = df['lab'].tolist()
-     for i,v in enumerate(pred_list):
-         all_ls.append(1)
-         if v == lab_list[i]:
-             correct.append(1)
-     print('total accuracy',len(correct)/len(all_ls))
-     correct = []
-     all_ls = []
-     pred_list = df_pc['pred'].tolist()
-     lab_list = df_pc['lab'].tolist()
-     for i,v in enumerate(pred_list):
-         all_ls.append(1)
-         if v == lab_list[i]:
-             correct.append(1)
-     print('positives accuracy', len(correct)/len(all_ls))
-     correct = []
-     all_ls = []
-     pred_list = df_nc['pred'].tolist()
-     lab_list = df_nc['lab'].tolist()
-     for i,v in enumerate(pred_list):
-         all_ls.append(1)
-         if v == lab_list[i]:
-             correct.append(1)
-     print('negatives accuracy',len(correct)/len(all_ls))
-
- def preprocess_image_data(df, resnet_loc, min_count=25, metric='mean', plot=True, score='pc_score'):
-     print('number of cells', len(df))
-     resnet_preds = pd.read_csv(resnet_loc)
-     res_df = split_filenames(df=resnet_preds, filename_column='path')
-     pred_df = rename_plate_metadata(df=res_df)
-     pred_df['prcfo'] = pred_df['plate']+'_'+pred_df['row']+'_'+pred_df['col']+'_'+pred_df['field']+'_'+pred_df['obj']
-     print('number of resnet scores', len(df))
-     merged_df = pd.merge(df, pred_df, on='prcfo', how='inner', suffixes=('', '_right'))
-     merged_df = merged_df.rename(columns={'pred': 'pc_score'})
-
-     merged_df = merged_df[(merged_df['pc_score'] <= 0.25) | (merged_df['pc_score'] >= 0.75)]
-
-     merged_df['recruitment'] = merged_df['Toxo_channel_1_quartiles75']/merged_df['Cytosol_channel_1_quartiles75']
-     merged_df = pd.DataFrame(merged_df[merged_df['duplicates'] == 1.0])
-     columns_to_drop = [col for col in merged_df.columns if col.endswith('_right')]
-     merged_df = merged_df.drop(columns_to_drop, axis=1)
-     well_group = pd.DataFrame(merged_df.groupby(['prc']).count()['cond'])
-     well_group = well_group.rename({'cond': 'cell_count'}, axis=1)
-     merged_df = pd.merge(merged_df, well_group, on='prc', how='inner', suffixes=('', '_right'))
-     columns_to_drop = [col for col in merged_df.columns if col.endswith('_right')]
-     merged_df = merged_df.drop(columns_to_drop, axis=1)
-     #merged_df = merged_df.drop(['duplicates', 'outlier', 'prcfo.1'], axis=1)
-     merged_df = merged_df.drop(['duplicates', 'prcfo.1'], axis=1)
-     merged_df = pd.DataFrame(merged_df[merged_df['cell_count'] >= min_count])
-
-     if metric == 'mean':
-         well_scores_score = pd.DataFrame(merged_df.groupby(['prc']).mean()['pc_score'])
-         well_scores_score = well_scores_score.rename({'pc_score': 'mean_pc_score'}, axis=1)
-         well_scores_rec = pd.DataFrame(merged_df.groupby(['prc']).mean()['recruitment'])
-         well_scores_rec = well_scores_rec.rename({'recruitment': 'mean_recruitment'}, axis=1)
-     if metric == 'geomean':
-         well_scores_score = pd.DataFrame(merged_df.groupby(['prc'])['pc_score'].apply(gmean))
-         well_scores_score = well_scores_score.rename({'pc_score': 'mean_pc_score'}, axis=1)
-         well_scores_rec = pd.DataFrame(merged_df.groupby(['prc'])['recruitment'].apply(gmean))
-         well_scores_rec = well_scores_rec.rename({'recruitment': 'mean_recruitment'}, axis=1)
-     if metric == 'median':
-         well_scores_score = pd.DataFrame(merged_df.groupby(['prc']).median()['pc_score'])
-         well_scores_score = well_scores_score.rename({'pc_score': 'mean_pc_score'}, axis=1)
-         well_scores_rec = pd.DataFrame(merged_df.groupby(['prc']).median()['recruitment'])
-         well_scores_rec = well_scores_rec.rename({'recruitment': 'mean_recruitment'}, axis=1)
-     if metric == 'quntile':
-         well_scores_score = pd.DataFrame(merged_df.groupby(['prc']).quantile(0.75)['pc_score'])
-         well_scores_score = well_scores_score.rename({'pc_score': 'mean_pc_score'}, axis=1)
-         well_scores_rec = pd.DataFrame(merged_df.groupby(['prc']).quantile(0.75)['recruitment'])
-         well_scores_rec = well_scores_rec.rename({'recruitment': 'mean_recruitment'}, axis=1)
-     well = pd.DataFrame(pd.DataFrame(merged_df.select_dtypes(include=['object'])).groupby(['prc']).first())
-     well['mean_pc_score'] = well_scores_score['mean_pc_score']
-     well['mean_recruitment'] = well_scores_rec['mean_recruitment']
-     nc = well[well['cond'] == 'NC']
-     max_nc = nc['mean_recruitment'].max()
-     pc = well[well['cond'] == 'PC']
-     screen = well[well['cond'] == 'SCREEN']
-     screen = screen[screen['mean_recruitment'] <= max_nc]
-     if plot:
-         x_axis = 'mean_pc_score'
-         fig, ax = plt.subplots(1,3,figsize=(30,10))
-         sns.histplot(data=nc, x=x_axis, kde=False, stat='density', element="step", ax=ax[0], color='lightgray', log_scale=False)
-         sns.histplot(data=pc, x=x_axis, kde=False, stat='density', element="step", ax=ax[0], color='teal', log_scale=False)
-         sns.histplot(data=screen, x=x_axis, kde=False, stat='density', element="step", ax=ax[1], color='purple', log_scale=False)
-         sns.histplot(data=nc, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='lightgray', log_scale=False)
-         sns.histplot(data=pc, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='teal', log_scale=False)
-         sns.histplot(data=screen, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='purple', log_scale=False)
-         ax[0].set_title('NC vs PC wells')
-         ax[1].set_title('Screen wells')
-         ax[2].set_title('NC vs PC vs Screen wells')
-         ax[0].spines['top'].set_visible(False)
-         ax[0].spines['right'].set_visible(False)
-         ax[1].spines['top'].set_visible(False)
-         ax[1].spines['right'].set_visible(False)
-         ax[2].spines['top'].set_visible(False)
-         ax[2].spines['right'].set_visible(False)
-         ax[0].set_xlim([0, 1])
-         ax[1].set_xlim([0, 1])
-         ax[2].set_xlim([0, 1])
-         loc = '/media/olafsson/umich/matt_graphs/resnet_score_well_av.pdf'
-         fig.savefig(loc, dpi = 600, format='pdf', bbox_inches='tight')
-         x_axis = 'mean_recruitment'
-         fig, ax = plt.subplots(1,3,figsize=(30,10))
-         sns.histplot(data=nc, x=x_axis, kde=False, stat='density', element="step", ax=ax[0], color='lightgray', log_scale=False)
-         sns.histplot(data=pc, x=x_axis, kde=False, stat='density', element="step", ax=ax[0], color='teal', log_scale=False)
-         sns.histplot(data=screen, x=x_axis, kde=False, stat='density', element="step", ax=ax[1], color='purple', log_scale=False)
-         sns.histplot(data=nc, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='lightgray', log_scale=False)
-         sns.histplot(data=pc, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='teal', log_scale=False)
-         sns.histplot(data=screen, x=x_axis, kde=False, stat='density', element="step", ax=ax[2], color='purple', log_scale=False)
-         ax[0].set_title('NC vs PC wells')
-         ax[1].set_title('Screen wells')
-         ax[2].set_title('NC vs PC vs Screen wells')
-         ax[0].spines['top'].set_visible(False)
-         ax[0].spines['right'].set_visible(False)
-         ax[1].spines['top'].set_visible(False)
-         ax[1].spines['right'].set_visible(False)
-         ax[2].spines['top'].set_visible(False)
-         ax[2].spines['right'].set_visible(False)
-         loc = '/media/olafsson/umich/matt_graphs/mean_recruitment_well_av.pdf'
-         fig.savefig(loc, dpi = 600, format='pdf', bbox_inches='tight')
-     plates = ['p1','p2','p3','p4']
-     screen = screen[screen['plate'].isin(plates)]
-     if score == 'pc_score':
-         dv = pd.DataFrame(screen['mean_pc_score'])
-         dv = dv.rename({'mean_pc_score': 'score'}, axis=1)
-     if score == 'recruitment':
-         dv = pd.DataFrame(screen['mean_recruitment'])
-         dv = dv.rename({'mean_recruitment': 'score'}, axis=1)
-     print('dependant variable well count:', len(well))
-     dv_loc = '/media/olafsson/Data2/methods_paper/data/dv.csv'
-     dv.to_csv(dv_loc)
-     calculate_accuracy(df=merged_df)
-     return merged_df, well
+     return independent_variables