spacr 0.4.12__py3-none-any.whl → 0.4.60__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
spacr/plot.py CHANGED
@@ -1658,36 +1658,36 @@ def generate_plate_heatmap(df, plate_number, variable, grouping, min_max, min_co
  num_parts = len(df['prc'].iloc[0].split('_'))
  if num_parts == 4:
  split = df['prc'].str.split('_', expand=True)
- df['row_name'] = split[2]
+ df['rowID'] = split[2]
  df['prc'] = f"{plate_number}" + '_' + split[2] + '_' + split[3]

- # Construct 'prc' based on 'plate', 'row_name', and 'column' columns
- #df['prc'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column'].astype(str)
+ # Construct 'prc' based on 'plateID', 'rowID', and 'columnID' columns
+ #df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)

  if 'column_name' not in df.columns:
  if 'column' in df.columns:
- df['column_name'] = df['column']
+ df['columnID'] = df['column']
  if 'column_name' in df.columns:
- df['column_name'] = df['column_name']
+ df['columnID'] = df['column_name']

- df['plate'], df['row_name'], df['column_name'] = zip(*df['prc'].str.split('_'))
+ df['plateID'], df['rowID'], df['columnID'] = zip(*df['prc'].str.split('_'))

  # Filtering the dataframe based on the plate_number
- df = df[df['plate'] == plate_number].copy() # Create another copy after filtering
+ df = df[df['plateID'] == plate_number].copy() # Create another copy after filtering

  # Ensure proper ordering
  row_order = [f'r{i}' for i in range(1, 17)]
  col_order = [f'c{i}' for i in range(1, 28)] # Exclude c15 as per your earlier code

- df['row_name'] = pd.Categorical(df['row_name'], categories=row_order, ordered=True)
- df['column_name'] = pd.Categorical(df['column_name'], categories=col_order, ordered=True)
- df['count'] = df.groupby(['row_name', 'column_name'])['row_name'].transform('count')
+ df['rowID'] = pd.Categorical(df['rowID'], categories=row_order, ordered=True)
+ df['columnID'] = pd.Categorical(df['columnID'], categories=col_order, ordered=True)
+ df['count'] = df.groupby(['rowID', 'columnID'])['rowID'].transform('count')

  if min_count > 0:
  df = df[df['count'] >= min_count]

  # Explicitly set observed=True to avoid FutureWarning
- grouped = df.groupby(['row_name', 'column_name'], observed=True) # Group by row and column
+ grouped = df.groupby(['rowID', 'columnID'], observed=True) # Group by row and column

  if grouping == 'mean':
  plate = grouped[variable].mean().reset_index()
@@ -1699,7 +1699,7 @@ def generate_plate_heatmap(df, plate_number, variable, grouping, min_max, min_co
  else:
  raise ValueError(f"Unsupported grouping: {grouping}")

- plate_map = pd.pivot_table(plate, values=variable, index='row_name', columns='column_name').fillna(0)
+ plate_map = pd.pivot_table(plate, values=variable, index='rowID', columns='columnID').fillna(0)

  if min_max == 'all':
  min_max = [plate_map.min().min(), plate_map.max().max()]
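
For reference, the renamed metadata columns feed a pivot like the one sketched below. This is a minimal, self-contained illustration (not the spacr function itself) of how a 'prc' value splits into plateID/rowID/columnID and pivots into a plate-shaped matrix; the toy data and values are invented.

import pandas as pd

# Hypothetical toy data; real values come from spacr's measurement tables.
df = pd.DataFrame({'prc': ['p1_r1_c1', 'p1_r1_c2', 'p1_r2_c1'],
                   'value': [0.2, 0.8, 0.5]})

# Split 'prc' into the metadata columns used throughout 0.4.60.
df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)

# Pivot into a row-by-column plate map, mirroring the pd.pivot_table call above.
plate_map = pd.pivot_table(df, values='value', index='rowID', columns='columnID').fillna(0)
print(plate_map)
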
@@ -1842,7 +1842,7 @@ def print_mask_and_flows(stack, mask, flows, overlay=True, max_size=1000, thickn
  fig.tight_layout()
  plt.show()

- def plot_resize(images, resized_images, labels, resized_labels):
+ def plot_resize_v1(images, resized_images, labels, resized_labels):
  # Display an example image and label before and after resizing
  fig, ax = plt.subplots(2, 2, figsize=(20, 20))

@@ -1866,6 +1866,48 @@ def plot_resize(images, resized_images, labels, resized_labels):
  ax[1, 1].set_title('Resized Label')
  plt.show()

+
+ def plot_resize(images, resized_images, labels, resized_labels):
+ def prepare_image(img):
+ if img.ndim == 2:
+ return img, 'gray'
+ elif img.ndim == 3:
+ if img.shape[-1] == 1:
+ return np.squeeze(img, axis=-1), 'gray'
+ elif img.shape[-1] == 3:
+ return img, None # RGB
+ elif img.shape[-1] == 4:
+ return img, None # RGBA
+ else:
+ # fallback: average across channels to show as grayscale
+ return np.mean(img, axis=-1), 'gray'
+ else:
+ raise ValueError(f"Unsupported image shape: {img.shape}")
+
+ fig, ax = plt.subplots(2, 2, figsize=(20, 20))
+
+ # Original Image
+ img, cmap = prepare_image(images[0])
+ ax[0, 0].imshow(img, cmap=cmap)
+ ax[0, 0].set_title('Original Image')
+
+ # Resized Image
+ img, cmap = prepare_image(resized_images[0])
+ ax[0, 1].imshow(img, cmap=cmap)
+ ax[0, 1].set_title('Resized Image')
+
+ # Labels (assumed grayscale or single-channel)
+ lbl, cmap = prepare_image(labels[0])
+ ax[1, 0].imshow(lbl, cmap=cmap)
+ ax[1, 0].set_title('Original Label')
+
+ lbl, cmap = prepare_image(resized_labels[0])
+ ax[1, 1].imshow(lbl, cmap=cmap)
+ ax[1, 1].set_title('Resized Label')
+
+ plt.tight_layout()
+ plt.show()
+
  def normalize_and_visualize(image, normalized_image, title=""):
  """Utility function for visualization"""
  fig, ax = plt.subplots(1, 2, figsize=(12, 6))
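
The reworked plot_resize accepts grayscale, single-channel, RGB, or RGBA arrays via prepare_image. A quick way to exercise it with synthetic inputs (the shapes below are arbitrary and only for illustration, assuming the function is importable from spacr.plot):

import numpy as np
from spacr.plot import plot_resize

# Synthetic example inputs; shapes are invented for the demo.
images = [np.random.rand(256, 256, 3)]           # RGB image
resized_images = [np.random.rand(128, 128, 3)]   # resized RGB image
labels = [np.random.randint(0, 2, (256, 256))]   # 2-D label mask
resized_labels = [np.random.randint(0, 2, (128, 128))]

# Displays the first image/label pair before and after resizing.
plot_resize(images, resized_images, labels, resized_labels)
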
@@ -2529,8 +2571,8 @@ class spacrGraph:
  # # Group by 'prc' column if representation is 'well'
  # if self.representation == 'well':
  # df = df.groupby(['prc', self.grouping_column])[self.data_column].agg(self.summary_func).reset_index()
- # if self.representation == 'plate':
- # df = df.groupby(['plate', self.grouping_column])[self.data_column].agg(self.summary_func).reset_index()
+ # if self.representation == 'plateID':
+ # df = df.groupby(['plateID', self.grouping_column])[self.data_column].agg(self.summary_func).reset_index()
  # if self.order:
  # df[self.grouping_column] = pd.Categorical(df[self.grouping_column], categories=self.order, ordered=True)
  # else:
@@ -2539,8 +2581,8 @@ class spacrGraph:

  def preprocess_data(self):
  """
- Preprocess the data: remove NaNs, optionally ensure 'plate' column is created,
- then group by either 'prc', 'plate', or do no grouping at all if representation == 'object'.
+ Preprocess the data: remove NaNs, optionally ensure 'plateID' column is created,
+ then group by either 'prc', 'plateID', or do no grouping at all if representation == 'object'.
  """
  # 1) Remove NaNs in both the grouping column and each data column
  df = self.df.dropna(subset=[self.grouping_column] + self.data_column)
@@ -2555,21 +2597,21 @@ class spacrGraph:
  # Group by ['prc', grouping_column]
  group_cols = ['prc', self.grouping_column]

- elif self.representation == 'plate':
- # Make sure 'plate' exists (split from 'prc' if needed)
- if 'plate' not in df.columns:
+ elif self.representation == 'plateID':
+ # Make sure 'plateID' exists (split from 'prc' if needed)
+ if 'plateID' not in df.columns:
  if 'prc' in df.columns:
- df[['plate', 'row', 'column']] = df['prc'].str.split('_', expand=True)
+ df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
  else:
  raise KeyError(
- "Representation is 'plate', but no 'plate' column found. "
+ "Representation is 'plateID', but no 'plateID' column found. "
  "Also cannot split from 'prc' because 'prc' column is missing."
  )
- # If the grouping column IS 'plate', only group by ['plate'] once
- if self.grouping_column == 'plate':
- group_cols = ['plate']
+ # If the grouping column IS 'plateID', only group by ['plateID'] once
+ if self.grouping_column == 'plateID':
+ group_cols = ['plateID']
  else:
- group_cols = ['plate', self.grouping_column]
+ group_cols = ['plateID', self.grouping_column]

  else:
  raise ValueError(f"Unknown representation: {self.representation}")
@@ -3384,7 +3426,7 @@ def plot_data_from_db(settings):
  dfs.append(dft)

  df = pd.concat(dfs, axis=0)
- df['prc'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str)
+ df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)

  if settings['cell_plate_metadata'] != None:
  df = df.dropna(subset='host_cell')
@@ -3461,19 +3503,19 @@ def plot_data_from_csv(settings):
  dfs = []
  for i, src in enumerate(srcs):
  dft = pd.read_csv(src)
- if 'plate' not in dft.columns:
- dft['plate'] = f"plate{i+1}"
+ if 'plateID' not in dft.columns:
+ dft['plateID'] = f"plate{i+1}"
  dft['common'] = 'spacr'
  dfs.append(dft)

  df = pd.concat(dfs, axis=0)

  if 'prc' in df.columns:
- # Check if 'plate', 'row', and 'column' are all missing from df.columns
- if not all(col in df.columns for col in ['plate', 'row_name', 'column_name']):
+ # Check if 'plateID', 'rowID', and 'columnID' are all missing from df.columns
+ if not all(col in df.columns for col in ['plate', 'rowID', 'columnID']):
  try:
- # Split 'prc' into 'plate', 'row', and 'column'
- df[['plate', 'row_name', 'column_name']] = df['prc'].str.split('_', expand=True)
+ # Split 'prc' into 'plateID', 'rowID', and 'columnID'
+ df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
  except Exception as e:
  print(f"Could not split the prc column: {e}")

@@ -3827,7 +3869,7 @@ def plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_col
  pairwise_results = chi_pairwise(raw_counts, verbose=settings.get('verbose', False))

  # Plot based on level setting
- if level in ['well', 'plate']:
+ if level in ['well', 'plateID']:
  # Aggregate by well for mean ± SD visualization
  well_proportions = (
  df.groupby([group_column, prc_column, bin_column])
spacr/sequencing.py CHANGED
@@ -43,9 +43,9 @@ def save_unique_combinations_to_csv(unique_combinations, csv_file):
  if not existing_df.empty:
  unique_combinations = pd.concat([existing_df, unique_combinations])
  unique_combinations = unique_combinations.groupby(
- ['row_name', 'column_name', 'grna_name'], as_index=False).sum()
+ ['rowID', 'columnID', 'grna_name'], as_index=False).sum()

- unique_combinations.to_csv(csv_file, index=False)
+ unique_combinations.to_csv(csv_file, index=True)
  except Exception as e:
  print(f"Error while saving unique combinations to CSV: {e}")

@@ -92,7 +92,8 @@ def process_chunk(chunk_data):
  def paired_find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, regex):

  consensus_sequences, columns, grnas, rows = [], [], [], []
-
+ consensus_seq = None
+
  for r1_lines, r2_lines in zip(r1_chunk, r2_chunk):
  _, r1_sequence, _, r1_quality = r1_lines.split('\n')
  _, r2_sequence, _, r2_quality = r2_lines.split('\n')
@@ -123,23 +124,32 @@ def process_chunk(chunk_data):
  match = re.match(regex, consensus_seq)
  if match:
  consensus_sequences.append(consensus_seq)
- column_sequence = match.group('column')
+
+ #print(f"r1_seq: {r1_seq}")
+ #print(f"r2_seq: {r2_seq}")
+ #print(f"consensus_sequences: {consensus_sequences}")
+
+ column_sequence = match.group('columnID')
  grna_sequence = match.group('grna')
- row_sequence = match.group('row_name')
+ row_sequence = match.group('rowID')
  columns.append(column_sequence)
  grnas.append(grna_sequence)
  rows.append(row_sequence)
+
+ #print(f"row bc: {row_sequence} col bc: {column_sequence} grna bc: {grna_sequence}")
+ #print(f"row bc: {rows} col bc: {columns} grna bc: {grnas}")

  if len(consensus_sequences) == 0:
  print(f"WARNING: No sequences matched {regex} in chunk")
  print(f"Are bacode sequences in the correct orientation?")
  print(f"Is {consensus_seq} compatible with {regex} ?")
-
- if len(consensus_seq) >= expected_end:
- consensus_seq_rc = reverse_complement(consensus_seq)
- match = re.match(regex, consensus_seq_rc)
- if match:
- print(f"Reverse complement of last sequence in chunk matched {regex}")
+
+ if consensus_seq:
+ if len(consensus_seq) >= expected_end:
+ consensus_seq_rc = reverse_complement(consensus_seq)
+ match = re.match(regex, consensus_seq_rc)
+ if match:
+ print(f"Reverse complement of last sequence in chunk matched {regex}")

  return consensus_sequences, columns, grnas, rows

@@ -174,9 +184,9 @@ def process_chunk(chunk_data):
  match = re.match(regex, consensus_seq)
  if match:
  consensus_sequences.append(consensus_seq)
- column_sequence = match.group('column')
+ column_sequence = match.group('columnID')
  grna_sequence = match.group('grna')
- row_sequence = match.group('row_name')
+ row_sequence = match.group('rowID')
  columns.append(column_sequence)
  grnas.append(grna_sequence)
  rows.append(row_sequence)
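
The group labels looked up here ('rowID', 'grna', 'columnID') must exist as named groups in the barcode regex. A minimal sketch with a made-up pattern and read (the real regex and barcode lengths come from the settings and barcode CSVs, not from this example):

import re

# Hypothetical barcode layout: 8 nt row barcode, 20 nt gRNA, 8 nt column barcode.
regex = r"(?P<rowID>[ACGT]{8})(?P<grna>[ACGT]{20})(?P<columnID>[ACGT]{8})"
consensus_seq = "ACGTACGT" + "A" * 20 + "TTGGCCAA"  # invented read

match = re.match(regex, consensus_seq)
if match:
    # The renamed group labels are what process_chunk now looks up.
    row_sequence = match.group('rowID')
    grna_sequence = match.group('grna')
    column_sequence = match.group('columnID')
    print(row_sequence, grna_sequence, column_sequence)
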
@@ -194,10 +204,10 @@ def process_chunk(chunk_data):

  return consensus_sequences, columns, grnas, rows

+ if len(chunk_data) == 10:
+ r1_chunk, r2_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, fill_na = chunk_data
  if len(chunk_data) == 9:
- r1_chunk, r2_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv = chunk_data
- if len(chunk_data) == 8:
- r1_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv = chunk_data
+ r1_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, fill_na = chunk_data
  r2_chunk = None

  if r2_chunk is None:
@@ -212,9 +222,9 @@ def process_chunk(chunk_data):
  df = pd.DataFrame({
  'read': consensus_sequences,
  'column_sequence': columns,
- 'column_name': column_names,
+ 'columnID': column_names,
  'row_sequence': rows,
- 'row_name': row_names,
+ 'rowID': row_names,
  'grna_sequence': grnas,
  'grna_name': grna_names
  })
@@ -223,8 +233,20 @@ def process_chunk(chunk_data):
  qc_df.columns = df.columns
  qc_df.index = ["NaN_Counts"]
  qc_df['total_reads'] = len(df)
+
+ if fill_na:
+ df2 = df.copy()
+ if 'columnID' in df2.columns:
+ df2['columnID'] = df2['columnID'].fillna(df2['column_sequence'])
+ if 'rowID' in df2.columns:
+ df2['rowID'] = df2['rowID'].fillna(df2['row_sequence'])
+ if 'grna_name' in df2.columns:
+ df2['grna_name'] = df2['grna_name'].fillna(df2['grna_sequence'])
+
+ unique_combinations = df2.groupby(['rowID', 'columnID', 'grna_name']).size().reset_index(name='count')
+ else:
+ unique_combinations = df.groupby(['rowID', 'columnID', 'grna_name']).size().reset_index(name='count')

- unique_combinations = df.groupby(['row_name', 'column_name', 'grna_name']).size().reset_index(name='count')
  return df, unique_combinations, qc_df

  # Function to save data from the queue
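
The new fill_na branch falls back to the raw barcode sequence whenever a name lookup failed, so unmatched reads still contribute to the counts. A small pandas sketch of that behaviour on invented data (the values are illustrative only):

import pandas as pd

# Invented reads: the second row has no matching rowID name.
df = pd.DataFrame({
    'rowID': ['r1', None],
    'row_sequence': ['ACGTACGT', 'TTTTCCCC'],
    'columnID': ['c1', 'c2'],
    'grna_name': ['grna_1', 'grna_2'],
})

# With fill_na=True the missing name is replaced by its barcode sequence
# before counting unique (rowID, columnID, grna_name) combinations.
df['rowID'] = df['rowID'].fillna(df['row_sequence'])
unique_combinations = df.groupby(['rowID', 'columnID', 'grna_name']).size().reset_index(name='count')
print(unique_combinations)
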
@@ -239,7 +261,7 @@ def saver_process(save_queue, hdf5_file, save_h5, unique_combinations_csv, qc_cs
  save_unique_combinations_to_csv(unique_combinations, unique_combinations_csv)
  save_qc_df_to_csv(qc_df, qc_csv_file)

- def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False):
+ def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):

  from .utils import count_reads_in_fastq, print_progress

@@ -295,14 +317,12 @@ def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, off
  break

  chunk_count += 1
- chunk_data = (r1_chunk, r2_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv)
+ chunk_data = (r1_chunk, r2_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, fill_na)

- # Process chunks in parallel
+ # Process chunks in parallel-
  result = pool.apply_async(process_chunk, (chunk_data,))
  df, unique_combinations, qc_df = result.get()

-
- # Queue the results for saving
  save_queue.put((df, unique_combinations, qc_df))

  end_time = time.time()
@@ -323,7 +343,7 @@ def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, off
  save_queue.put("STOP")
  save_process.join()

- def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False):
+ def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):

  from .utils import count_reads_in_fastq, print_progress

@@ -373,10 +393,11 @@ def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, off
  break

  chunk_count += 1
- chunk_data = (r1_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv)
+ chunk_data = (r1_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, fill_na)

  # Process chunks in parallel
  result = pool.apply_async(process_chunk, (chunk_data,))
+
  df, unique_combinations, qc_df = result.get()

  # Queue the results for saving
@@ -414,6 +435,8 @@ def generate_barecode_mapping(settings={}):
  print(f'Using regex: {regex} to extract barcode information')

  samples_dict = parse_gz_files(settings['src'])
+
+ print(samples_dict)

  print(f'If compression is low and save_h5 is True, saving might take longer than processing.')

@@ -462,7 +485,8 @@ def generate_barecode_mapping(settings={}):
  qc_csv_file=qc_csv_file,
  chunk_size=settings['chunk_size'],
  n_jobs=settings['n_jobs'],
- test=settings['test'])
+ test=settings['test'],
+ fill_na=settings['fill_na'])

  # Function to read the CSV, compute reverse complement, and save it
  def barecodes_reverse_complement(csv_file):
@@ -489,7 +513,7 @@ def barecodes_reverse_complement(csv_file):

  def graph_sequencing_stats(settings):

- from .utils import correct_metadata_column_names
+ from .utils import correct_metadata_column_names, correct_metadata

  def _plot_density(df, dependent_variable, dst=None):
  """Plot a density plot of the dependent variable."""
@@ -532,7 +556,7 @@ def graph_sequencing_stats(settings):
  # Iterate through the fraction thresholds
  for threshold in fraction_thresholds:
  filtered_df = df[df['fraction'] >= threshold]
- unique_count = filtered_df.groupby(['plate', 'row_name', 'column'])['grna'].nunique().mean()
+ unique_count = filtered_df.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().mean()
  results.append((threshold, unique_count))

  results_df = pd.DataFrame(results, columns=['fraction_threshold', 'unique_count'])
@@ -568,8 +592,19 @@ def graph_sequencing_stats(settings):
  dfs = []
  for i, count_data in enumerate(settings['count_data']):
  df = pd.read_csv(count_data)
- df['plate'] = f'plate{i+1}'
- df['prc'] = df['plate'].astype(str) + '_' + df['row_name'].astype(str) + '_' + df['column_name'].astype(str)
+
+ df = correct_metadata(df)
+
+ if 'plateID' not in df.columns:
+ df['plateID'] = f'plate{i+1}'
+
+ display(df)
+
+ if all(col in df.columns for col in ['plateID', 'rowID', 'columnID']):
+ df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)
+ else:
+ raise ValueError("The DataFrame must contain 'plateID', 'rowID', and 'columnID' columns.")
+
  df['total_count'] = df.groupby(['prc'])['count'].transform('sum')
  df['fraction'] = df['count'] / df['total_count']
  dfs.append(df)
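
The per-well fraction computed here is each barcode's count divided by the total count of its 'prc' well, and wells then keep only gRNAs above a fraction threshold before unique counts are taken. A compact sketch with invented counts:

import pandas as pd

# Invented gRNA counts for two wells.
df = pd.DataFrame({
    'prc': ['plate1_r1_c1', 'plate1_r1_c1', 'plate1_r1_c2'],
    'grna': ['g1', 'g2', 'g1'],
    'count': [90, 10, 50],
})

df['total_count'] = df.groupby('prc')['count'].transform('sum')
df['fraction'] = df['count'] / df['total_count']

# Keep only gRNAs above an example threshold, as in the loop over fraction_thresholds.
threshold = 0.5
print(df[df['fraction'] >= threshold])
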
@@ -588,20 +623,20 @@ def graph_sequencing_stats(settings):
  # Apply the closest threshold to the DataFrame
  df = df[df['fraction'] >= closest_threshold]

- # Group by 'plate', 'row_name', 'column' and compute unique counts of 'grna'
- unique_counts = df.groupby(['plate', 'row_name', 'column'])['grna'].nunique().reset_index(name='unique_counts')
- unique_count_mean = df.groupby(['plate', 'row_name', 'column'])['grna'].nunique().mean()
- unique_count_std = df.groupby(['plate', 'row_name', 'column'])['grna'].nunique().std()
+ # Group by 'plateID', 'rowID', 'columnID' and compute unique counts of 'grna'
+ unique_counts = df.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().reset_index(name='unique_counts')
+ unique_count_mean = df.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().mean()
+ unique_count_std = df.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().std()

  # Merge the unique counts back into the original DataFrame
- df = pd.merge(df, unique_counts, on=['plate', 'row_name', 'column'], how='left')
+ df = pd.merge(df, unique_counts, on=['plateID', 'rowID', 'columnID'], how='left')

  print(f"unique_count mean: {unique_count_mean} std: {unique_count_std}")
  #_plot_density(df, dependent_variable='unique_counts')

- has_underscore = df['row_name'].str.contains('_').any()
+ has_underscore = df['rowID'].str.contains('_').any()
  if has_underscore:
- df['row_name'] = df['row_name'].apply(lambda x: x.split('_')[1])
+ df['rowID'] = df['rowID'].apply(lambda x: x.split('_')[1])

  plot_plates(df=df, variable='unique_counts', grouping='mean', min_max='allq', cmap='viridis',min_count=0, verbose=True, dst=dst)