spacr 0.4.15__py3-none-any.whl → 0.4.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/core.py +52 -9
- spacr/deep_spacr.py +2 -3
- spacr/gui_core.py +247 -41
- spacr/gui_elements.py +133 -2
- spacr/gui_utils.py +17 -15
- spacr/io.py +540 -55
- spacr/ml.py +141 -258
- spacr/plot.py +76 -34
- spacr/sequencing.py +73 -38
- spacr/settings.py +136 -128
- spacr/submodules.py +619 -213
- spacr/timelapse.py +25 -25
- spacr/toxo.py +23 -23
- spacr/utils.py +162 -89
- {spacr-0.4.15.dist-info → spacr-0.4.60.dist-info}/METADATA +2 -1
- {spacr-0.4.15.dist-info → spacr-0.4.60.dist-info}/RECORD +20 -20
- {spacr-0.4.15.dist-info → spacr-0.4.60.dist-info}/LICENSE +0 -0
- {spacr-0.4.15.dist-info → spacr-0.4.60.dist-info}/WHEEL +0 -0
- {spacr-0.4.15.dist-info → spacr-0.4.60.dist-info}/entry_points.txt +0 -0
- {spacr-0.4.15.dist-info → spacr-0.4.60.dist-info}/top_level.txt +0 -0
spacr/plot.py
CHANGED
@@ -1658,36 +1658,36 @@ def generate_plate_heatmap(df, plate_number, variable, grouping, min_max, min_co
     num_parts = len(df['prc'].iloc[0].split('_'))
     if num_parts == 4:
         split = df['prc'].str.split('_', expand=True)
-        df['
+        df['rowID'] = split[2]
         df['prc'] = f"{plate_number}" + '_' + split[2] + '_' + split[3]

-    # Construct 'prc' based on '
-    #df['prc'] = df['
+    # Construct 'prc' based on 'plateID', 'rowID', and 'columnID' columns
+    #df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)

     if 'column_name' not in df.columns:
         if 'column' in df.columns:
-            df['
+            df['columnID'] = df['column']
     if 'column_name' in df.columns:
-        df['
+        df['columnID'] = df['column_name']

-    df['
+    df['plateID'], df['rowID'], df['columnID'] = zip(*df['prc'].str.split('_'))

     # Filtering the dataframe based on the plate_number
-    df = df[df['
+    df = df[df['plateID'] == plate_number].copy()  # Create another copy after filtering

     # Ensure proper ordering
     row_order = [f'r{i}' for i in range(1, 17)]
     col_order = [f'c{i}' for i in range(1, 28)]  # Exclude c15 as per your earlier code

-    df['
-    df['
-    df['count'] = df.groupby(['
+    df['rowID'] = pd.Categorical(df['rowID'], categories=row_order, ordered=True)
+    df['columnID'] = pd.Categorical(df['columnID'], categories=col_order, ordered=True)
+    df['count'] = df.groupby(['rowID', 'columnID'])['rowID'].transform('count')

     if min_count > 0:
         df = df[df['count'] >= min_count]

     # Explicitly set observed=True to avoid FutureWarning
-    grouped = df.groupby(['
+    grouped = df.groupby(['rowID', 'columnID'], observed=True)  # Group by row and column

     if grouping == 'mean':
         plate = grouped[variable].mean().reset_index()
@@ -1699,7 +1699,7 @@ def generate_plate_heatmap(df, plate_number, variable, grouping, min_max, min_co
     else:
         raise ValueError(f"Unsupported grouping: {grouping}")

-    plate_map = pd.pivot_table(plate, values=variable, index='
+    plate_map = pd.pivot_table(plate, values=variable, index='rowID', columns='columnID').fillna(0)

     if min_max == 'all':
         min_max = [plate_map.min().min(), plate_map.max().max()]
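The two hunks above rename the plate metadata columns to plateID, rowID, and columnID before pivoting into a per-plate heatmap. A minimal sketch of that split-and-pivot flow, assuming a pandas DataFrame with a 'prc' column of the form 'plate_row_column'; the helper name prc_to_plate_map and the sample data are illustrative, not spacr API:

import pandas as pd

def prc_to_plate_map(df, plate_number, variable):
    # Illustrative helper mirroring the renamed columns in generate_plate_heatmap.
    df = df.copy()
    # Split 'prc' ("plate_row_column") into the new plateID/rowID/columnID columns.
    df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
    df = df[df['plateID'] == plate_number]
    # Pivot into a row-by-column plate layout, filling empty wells with 0.
    return pd.pivot_table(df, values=variable, index='rowID', columns='columnID').fillna(0)

df = pd.DataFrame({'prc': ['p1_r1_c1', 'p1_r1_c2', 'p1_r2_c1'], 'score': [0.2, 0.5, 0.9]})
print(prc_to_plate_map(df, 'p1', 'score'))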
@@ -1842,7 +1842,7 @@ def print_mask_and_flows(stack, mask, flows, overlay=True, max_size=1000, thickn
     fig.tight_layout()
     plt.show()

-def
+def plot_resize_v1(images, resized_images, labels, resized_labels):
     # Display an example image and label before and after resizing
     fig, ax = plt.subplots(2, 2, figsize=(20, 20))

@@ -1866,6 +1866,48 @@ def plot_resize(images, resized_images, labels, resized_labels):
     ax[1, 1].set_title('Resized Label')
     plt.show()

+
+def plot_resize(images, resized_images, labels, resized_labels):
+    def prepare_image(img):
+        if img.ndim == 2:
+            return img, 'gray'
+        elif img.ndim == 3:
+            if img.shape[-1] == 1:
+                return np.squeeze(img, axis=-1), 'gray'
+            elif img.shape[-1] == 3:
+                return img, None  # RGB
+            elif img.shape[-1] == 4:
+                return img, None  # RGBA
+            else:
+                # fallback: average across channels to show as grayscale
+                return np.mean(img, axis=-1), 'gray'
+        else:
+            raise ValueError(f"Unsupported image shape: {img.shape}")
+
+    fig, ax = plt.subplots(2, 2, figsize=(20, 20))
+
+    # Original Image
+    img, cmap = prepare_image(images[0])
+    ax[0, 0].imshow(img, cmap=cmap)
+    ax[0, 0].set_title('Original Image')
+
+    # Resized Image
+    img, cmap = prepare_image(resized_images[0])
+    ax[0, 1].imshow(img, cmap=cmap)
+    ax[0, 1].set_title('Resized Image')
+
+    # Labels (assumed grayscale or single-channel)
+    lbl, cmap = prepare_image(labels[0])
+    ax[1, 0].imshow(lbl, cmap=cmap)
+    ax[1, 0].set_title('Original Label')
+
+    lbl, cmap = prepare_image(resized_labels[0])
+    ax[1, 1].imshow(lbl, cmap=cmap)
+    ax[1, 1].set_title('Resized Label')
+
+    plt.tight_layout()
+    plt.show()
+
 def normalize_and_visualize(image, normalized_image, title=""):
     """Utility function for visualization"""
     fig, ax = plt.subplots(1, 2, figsize=(12, 6))
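The replacement plot_resize above routes every panel through a prepare_image helper so grayscale inputs get cmap='gray' while RGB and RGBA arrays pass through with their own colors. A standalone sketch of that dispatch, assuming numpy arrays in (H, W[, C]) layout; this is a restatement for illustration, not the spacr function itself:

import numpy as np

def prepare_image(img):
    # Return (array_to_show, cmap): grayscale inputs get 'gray', RGB/RGBA get None.
    if img.ndim == 2:
        return img, 'gray'
    if img.ndim == 3 and img.shape[-1] == 1:
        return np.squeeze(img, axis=-1), 'gray'
    if img.ndim == 3 and img.shape[-1] in (3, 4):
        return img, None
    if img.ndim == 3:
        return np.mean(img, axis=-1), 'gray'  # fallback: collapse extra channels
    raise ValueError(f"Unsupported image shape: {img.shape}")

for shape in [(64, 64), (64, 64, 1), (64, 64, 3), (64, 64, 5)]:
    arr, cmap = prepare_image(np.zeros(shape))
    print(shape, '->', arr.shape, cmap)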
@@ -2529,8 +2571,8 @@ class spacrGraph:
         # # Group by 'prc' column if representation is 'well'
         # if self.representation == 'well':
         #     df = df.groupby(['prc', self.grouping_column])[self.data_column].agg(self.summary_func).reset_index()
-        # if self.representation == '
-        #     df = df.groupby(['
+        # if self.representation == 'plateID':
+        #     df = df.groupby(['plateID', self.grouping_column])[self.data_column].agg(self.summary_func).reset_index()
         # if self.order:
         #     df[self.grouping_column] = pd.Categorical(df[self.grouping_column], categories=self.order, ordered=True)
         # else:
@@ -2539,8 +2581,8 @@ class spacrGraph:

     def preprocess_data(self):
         """
-        Preprocess the data: remove NaNs, optionally ensure '
-        then group by either 'prc', '
+        Preprocess the data: remove NaNs, optionally ensure 'plateID' column is created,
+        then group by either 'prc', 'plateID', or do no grouping at all if representation == 'object'.
         """
         # 1) Remove NaNs in both the grouping column and each data column
         df = self.df.dropna(subset=[self.grouping_column] + self.data_column)
@@ -2555,21 +2597,21 @@ class spacrGraph:
             # Group by ['prc', grouping_column]
             group_cols = ['prc', self.grouping_column]

-        elif self.representation == '
-            # Make sure '
-            if '
+        elif self.representation == 'plateID':
+            # Make sure 'plateID' exists (split from 'prc' if needed)
+            if 'plateID' not in df.columns:
                 if 'prc' in df.columns:
-                    df[['
+                    df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
                 else:
                     raise KeyError(
-                        "Representation is '
+                        "Representation is 'plateID', but no 'plateID' column found. "
                         "Also cannot split from 'prc' because 'prc' column is missing."
                     )
-            # If the grouping column IS '
-            if self.grouping_column == '
-                group_cols = ['
+            # If the grouping column IS 'plateID', only group by ['plateID'] once
+            if self.grouping_column == 'plateID':
+                group_cols = ['plateID']
             else:
-                group_cols = ['
+                group_cols = ['plateID', self.grouping_column]

         else:
             raise ValueError(f"Unknown representation: {self.representation}")
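preprocess_data now accepts 'plateID' as a representation and derives the column from 'prc' when it is missing. A simplified sketch of the branching that picks the grouping keys; the function is standalone for illustration, and the 'well' and 'object' labels are inferred from the commented-out code and the docstring rather than confirmed values:

import pandas as pd

def resolve_group_cols(df, representation, grouping_column):
    # Simplified stand-in for the branching inside spacrGraph.preprocess_data.
    if representation == 'well':
        return ['prc', grouping_column]
    if representation == 'plateID':
        if 'plateID' not in df.columns:
            if 'prc' not in df.columns:
                raise KeyError("Representation is 'plateID', but neither 'plateID' nor 'prc' exists.")
            df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
        # Avoid grouping by 'plateID' twice when it is also the grouping column.
        return ['plateID'] if grouping_column == 'plateID' else ['plateID', grouping_column]
    if representation == 'object':
        return []  # no grouping at the object level
    raise ValueError(f"Unknown representation: {representation}")

df = pd.DataFrame({'prc': ['p1_r1_c1', 'p1_r1_c2'], 'treatment': ['ctrl', 'drug']})
print(resolve_group_cols(df, 'plateID', 'treatment'))  # ['plateID', 'treatment']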
@@ -3384,7 +3426,7 @@ def plot_data_from_db(settings):
         dfs.append(dft)

     df = pd.concat(dfs, axis=0)
-    df['prc'] = df['
+    df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)

     if settings['cell_plate_metadata'] != None:
         df = df.dropna(subset='host_cell')
@@ -3461,19 +3503,19 @@ def plot_data_from_csv(settings):
     dfs = []
     for i, src in enumerate(srcs):
         dft = pd.read_csv(src)
-        if '
-            dft['
+        if 'plateID' not in dft.columns:
+            dft['plateID'] = f"plate{i+1}"
         dft['common'] = 'spacr'
         dfs.append(dft)

     df = pd.concat(dfs, axis=0)

     if 'prc' in df.columns:
-        # Check if '
-        if not all(col in df.columns for col in ['plate', '
+        # Check if 'plateID', 'rowID', and 'columnID' are all missing from df.columns
+        if not all(col in df.columns for col in ['plate', 'rowID', 'columnID']):
             try:
-                # Split 'prc' into '
-                df[['
+                # Split 'prc' into 'plateID', 'rowID', and 'columnID'
+                df[['plateID', 'rowID', 'columnID']] = df['prc'].str.split('_', expand=True)
             except Exception as e:
                 print(f"Could not split the prc column: {e}")

@@ -3827,7 +3869,7 @@ def plot_proportion_stacked_bars(settings, df, group_column, bin_column, prc_col
     pairwise_results = chi_pairwise(raw_counts, verbose=settings.get('verbose', False))

     # Plot based on level setting
-    if level in ['well', '
+    if level in ['well', 'plateID']:
         # Aggregate by well for mean ± SD visualization
         well_proportions = (
             df.groupby([group_column, prc_column, bin_column])
spacr/sequencing.py
CHANGED
@@ -43,9 +43,9 @@ def save_unique_combinations_to_csv(unique_combinations, csv_file):
         if not existing_df.empty:
             unique_combinations = pd.concat([existing_df, unique_combinations])
             unique_combinations = unique_combinations.groupby(
-                ['
+                ['rowID', 'columnID', 'grna_name'], as_index=False).sum()

-        unique_combinations.to_csv(csv_file, index=
+        unique_combinations.to_csv(csv_file, index=True)
     except Exception as e:
         print(f"Error while saving unique combinations to CSV: {e}")

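save_unique_combinations_to_csv now sums counts on the renamed rowID/columnID keys and writes the index. A minimal sketch of that append-then-regroup pattern; the file name and sample counts are illustrative, not spacr defaults:

import os
import pandas as pd

def append_unique_combinations(new_counts, csv_file):
    # Merge new per-chunk counts into an existing CSV by summing on the barcode keys,
    # mirroring groupby(['rowID', 'columnID', 'grna_name'], as_index=False).sum() above.
    if os.path.exists(csv_file):
        existing = pd.read_csv(csv_file, index_col=0)
        new_counts = pd.concat([existing, new_counts])
        new_counts = new_counts.groupby(['rowID', 'columnID', 'grna_name'], as_index=False).sum()
    new_counts.to_csv(csv_file, index=True)

chunk = pd.DataFrame({'rowID': ['r1', 'r1'], 'columnID': ['c1', 'c2'],
                      'grna_name': ['g1', 'g2'], 'count': [10, 4]})
append_unique_combinations(chunk, 'unique_combinations.csv')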
@@ -92,7 +92,8 @@ def process_chunk(chunk_data):
     def paired_find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, regex):

         consensus_sequences, columns, grnas, rows = [], [], [], []
-
+        consensus_seq = None
+
         for r1_lines, r2_lines in zip(r1_chunk, r2_chunk):
             _, r1_sequence, _, r1_quality = r1_lines.split('\n')
             _, r2_sequence, _, r2_quality = r2_lines.split('\n')
@@ -123,23 +124,32 @@ def process_chunk(chunk_data):
             match = re.match(regex, consensus_seq)
             if match:
                 consensus_sequences.append(consensus_seq)
-
+
+                #print(f"r1_seq: {r1_seq}")
+                #print(f"r2_seq: {r2_seq}")
+                #print(f"consensus_sequences: {consensus_sequences}")
+
+                column_sequence = match.group('columnID')
                 grna_sequence = match.group('grna')
-                row_sequence = match.group('
+                row_sequence = match.group('rowID')
                 columns.append(column_sequence)
                 grnas.append(grna_sequence)
                 rows.append(row_sequence)
+
+                #print(f"row bc: {row_sequence} col bc: {column_sequence} grna bc: {grna_sequence}")
+                #print(f"row bc: {rows} col bc: {columns} grna bc: {grnas}")

         if len(consensus_sequences) == 0:
             print(f"WARNING: No sequences matched {regex} in chunk")
             print(f"Are bacode sequences in the correct orientation?")
             print(f"Is {consensus_seq} compatible with {regex} ?")
-
-            if
-
-
-
-
+
+            if consensus_seq:
+                if len(consensus_seq) >= expected_end:
+                    consensus_seq_rc = reverse_complement(consensus_seq)
+                    match = re.match(regex, consensus_seq_rc)
+                    if match:
+                        print(f"Reverse complement of last sequence in chunk matched {regex}")

         return consensus_sequences, columns, grnas, rows

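The extraction above now reads barcodes from the regex named groups 'rowID', 'columnID', and 'grna'. A self-contained sketch of that named-group matching; the pattern, spacers, and sequence below are made up and do not describe spacr's actual barcode layout:

import re

# Hypothetical layout: 8 nt row barcode, 7 nt column barcode, 20 nt gRNA, fixed spacers.
regex = r"^(?P<rowID>[ACGT]{8})ACTG(?P<columnID>[ACGT]{7})TTAA(?P<grna>[ACGT]{20})"
consensus_seq = "AACCGGTT" + "ACTG" + "TTTAACC" + "TTAA" + "ACGT" * 5

match = re.match(regex, consensus_seq)
if match:
    # Same accessors the updated process_chunk uses.
    row_sequence = match.group('rowID')
    column_sequence = match.group('columnID')
    grna_sequence = match.group('grna')
    print(row_sequence, column_sequence, grna_sequence)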
@@ -174,9 +184,9 @@ def process_chunk(chunk_data):
             match = re.match(regex, consensus_seq)
             if match:
                 consensus_sequences.append(consensus_seq)
-                column_sequence = match.group('
+                column_sequence = match.group('columnID')
                 grna_sequence = match.group('grna')
-                row_sequence = match.group('
+                row_sequence = match.group('rowID')
                 columns.append(column_sequence)
                 grnas.append(grna_sequence)
                 rows.append(row_sequence)
@@ -194,10 +204,10 @@ def process_chunk(chunk_data):

         return consensus_sequences, columns, grnas, rows

+    if len(chunk_data) == 10:
+        r1_chunk, r2_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, fill_na = chunk_data
     if len(chunk_data) == 9:
-        r1_chunk,
-    if len(chunk_data) == 8:
-        r1_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv = chunk_data
+        r1_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, fill_na = chunk_data
         r2_chunk = None

     if r2_chunk is None:
@@ -212,9 +222,9 @@ def process_chunk(chunk_data):
     df = pd.DataFrame({
         'read': consensus_sequences,
         'column_sequence': columns,
-        '
+        'columnID': column_names,
         'row_sequence': rows,
-        '
+        'rowID': row_names,
         'grna_sequence': grnas,
         'grna_name': grna_names
     })
@@ -223,8 +233,20 @@ def process_chunk(chunk_data):
     qc_df.columns = df.columns
     qc_df.index = ["NaN_Counts"]
     qc_df['total_reads'] = len(df)
+
+    if fill_na:
+        df2 = df.copy()
+        if 'columnID' in df2.columns:
+            df2['columnID'] = df2['columnID'].fillna(df2['column_sequence'])
+        if 'rowID' in df2.columns:
+            df2['rowID'] = df2['rowID'].fillna(df2['row_sequence'])
+        if 'grna_name' in df2.columns:
+            df2['grna_name'] = df2['grna_name'].fillna(df2['grna_sequence'])
+
+        unique_combinations = df2.groupby(['rowID', 'columnID', 'grna_name']).size().reset_index(name='count')
+    else:
+        unique_combinations = df.groupby(['rowID', 'columnID', 'grna_name']).size().reset_index(name='count')

-    unique_combinations = df.groupby(['row_name', 'column_name', 'grna_name']).size().reset_index(name='count')
     return df, unique_combinations, qc_df

 # Function to save data from the queue
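With the new fill_na flag, reads whose barcodes did not map to a name keep their raw sequences instead of dropping out of the per-well counts: the name columns are back-filled from the matching *_sequence columns before the groupby. A small sketch of the effect, with column names as in the diff and made-up data:

import pandas as pd

df = pd.DataFrame({
    'rowID':           ['r1',   None,   'r1'],
    'row_sequence':    ['AAAA', 'CCCC', 'AAAA'],
    'columnID':        ['c1',   'c1',   'c1'],
    'column_sequence': ['GGGG', 'GGGG', 'GGGG'],
    'grna_name':       ['g1',   'g1',   None],
    'grna_sequence':   ['TTTT', 'TTTT', 'ACGT'],
})

# fill_na=True branch: unmatched names fall back to the raw barcode sequence,
# so those reads still contribute rows to unique_combinations.
df2 = df.copy()
df2['rowID'] = df2['rowID'].fillna(df2['row_sequence'])
df2['columnID'] = df2['columnID'].fillna(df2['column_sequence'])
df2['grna_name'] = df2['grna_name'].fillna(df2['grna_sequence'])

unique_combinations = df2.groupby(['rowID', 'columnID', 'grna_name']).size().reset_index(name='count')
print(unique_combinations)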
@@ -239,7 +261,7 @@ def saver_process(save_queue, hdf5_file, save_h5, unique_combinations_csv, qc_cs
         save_unique_combinations_to_csv(unique_combinations, unique_combinations_csv)
         save_qc_df_to_csv(qc_df, qc_csv_file)

-def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False):
+def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):

     from .utils import count_reads_in_fastq, print_progress

@@ -295,14 +317,12 @@ def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, off
                 break

             chunk_count += 1
-            chunk_data = (r1_chunk, r2_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv)
+            chunk_data = (r1_chunk, r2_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, fill_na)

-            # Process chunks in parallel
+            # Process chunks in parallel-
             result = pool.apply_async(process_chunk, (chunk_data,))

             df, unique_combinations, qc_df = result.get()
-
-            # Queue the results for saving
             save_queue.put((df, unique_combinations, qc_df))

         end_time = time.time()
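paired_read_chunked_processing hands each chunk tuple to a worker via pool.apply_async and pushes the result onto a queue drained by a separate saver process. A stripped-down sketch of that worker/saver shape, with trivial stand-in work in place of spacr's barcode extraction and file I/O:

import multiprocessing as mp

def process_chunk(chunk_data):
    # Stand-in for the real per-chunk barcode extraction.
    chunk, fill_na = chunk_data
    return sum(chunk), fill_na

def saver_process(save_queue):
    # Consume results until the "STOP" sentinel, like the saver in sequencing.py.
    while True:
        item = save_queue.get()
        if item == "STOP":
            break
        print("saved:", item)

if __name__ == "__main__":
    save_queue = mp.Queue()
    saver = mp.Process(target=saver_process, args=(save_queue,))
    saver.start()
    with mp.Pool(processes=2) as pool:
        for chunk in ([1, 2, 3], [4, 5, 6]):
            result = pool.apply_async(process_chunk, ((chunk, True),))
            save_queue.put(result.get())  # blocking get, as in the diff
    save_queue.put("STOP")
    saver.join()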
@@ -323,7 +343,7 @@ def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, off
     save_queue.put("STOP")
     save_process.join()

-def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False):
+def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):

     from .utils import count_reads_in_fastq, print_progress

@@ -373,10 +393,11 @@ def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, off
                 break

             chunk_count += 1
-            chunk_data = (r1_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv)
+            chunk_data = (r1_chunk, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, fill_na)

             # Process chunks in parallel
             result = pool.apply_async(process_chunk, (chunk_data,))
+
             df, unique_combinations, qc_df = result.get()

             # Queue the results for saving
@@ -414,6 +435,8 @@ def generate_barecode_mapping(settings={}):
     print(f'Using regex: {regex} to extract barcode information')

     samples_dict = parse_gz_files(settings['src'])
+
+    print(samples_dict)

     print(f'If compression is low and save_h5 is True, saving might take longer than processing.')

@@ -462,7 +485,8 @@ def generate_barecode_mapping(settings={}):
             qc_csv_file=qc_csv_file,
             chunk_size=settings['chunk_size'],
             n_jobs=settings['n_jobs'],
-            test=settings['test']
+            test=settings['test'],
+            fill_na=settings['fill_na'])

 # Function to read the CSV, compute reverse complement, and save it
 def barecodes_reverse_complement(csv_file):
@@ -489,7 +513,7 @@ def barecodes_reverse_complement(csv_file):

 def graph_sequencing_stats(settings):

-    from .utils import correct_metadata_column_names
+    from .utils import correct_metadata_column_names, correct_metadata

     def _plot_density(df, dependent_variable, dst=None):
         """Plot a density plot of the dependent variable."""
@@ -532,7 +556,7 @@ def graph_sequencing_stats(settings):
     # Iterate through the fraction thresholds
     for threshold in fraction_thresholds:
         filtered_df = df[df['fraction'] >= threshold]
-        unique_count = filtered_df.groupby(['
+        unique_count = filtered_df.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().mean()
         results.append((threshold, unique_count))

     results_df = pd.DataFrame(results, columns=['fraction_threshold', 'unique_count'])
@@ -568,8 +592,19 @@ def graph_sequencing_stats(settings):
     dfs = []
     for i, count_data in enumerate(settings['count_data']):
         df = pd.read_csv(count_data)
-
-        df
+
+        df = correct_metadata(df)
+
+        if 'plateID' not in df.columns:
+            df['plateID'] = f'plate{i+1}'
+
+        display(df)
+
+        if all(col in df.columns for col in ['plateID', 'rowID', 'columnID']):
+            df['prc'] = df['plateID'].astype(str) + '_' + df['rowID'].astype(str) + '_' + df['columnID'].astype(str)
+        else:
+            raise ValueError("The DataFrame must contain 'plateID', 'rowID', and 'columnID' columns.")
+
         df['total_count'] = df.groupby(['prc'])['count'].transform('sum')
         df['fraction'] = df['count'] / df['total_count']
         dfs.append(df)
@@ -588,20 +623,20 @@ def graph_sequencing_stats(settings):
     # Apply the closest threshold to the DataFrame
     df = df[df['fraction'] >= closest_threshold]

-    # Group by '
-    unique_counts = df.groupby(['
-    unique_count_mean = df.groupby(['
-    unique_count_std = df.groupby(['
+    # Group by 'plateID', 'rowID', 'columnID' and compute unique counts of 'grna'
+    unique_counts = df.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().reset_index(name='unique_counts')
+    unique_count_mean = df.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().mean()
+    unique_count_std = df.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().std()

     # Merge the unique counts back into the original DataFrame
-    df = pd.merge(df, unique_counts, on=['
+    df = pd.merge(df, unique_counts, on=['plateID', 'rowID', 'columnID'], how='left')

     print(f"unique_count mean: {unique_count_mean} std: {unique_count_std}")
     #_plot_density(df, dependent_variable='unique_counts')

-    has_underscore = df['
+    has_underscore = df['rowID'].str.contains('_').any()
     if has_underscore:
-        df['
+        df['rowID'] = df['rowID'].apply(lambda x: x.split('_')[1])

     plot_plates(df=df, variable='unique_counts', grouping='mean', min_max='allq', cmap='viridis',min_count=0, verbose=True, dst=dst)

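graph_sequencing_stats sweeps fraction thresholds and, for each one, reports how many unique gRNAs per well (grouped by plateID, rowID, columnID) survive on average. A compact sketch of that sweep over a made-up counts table:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'plateID':  ['p1'] * 6,
    'rowID':    ['r1', 'r1', 'r1', 'r2', 'r2', 'r2'],
    'columnID': ['c1'] * 6,
    'grna':     ['g1', 'g2', 'g3', 'g1', 'g2', 'g3'],
    'count':    [90, 9, 1, 50, 45, 5],
})

# Per-well read fractions, as computed before the sweep.
df['prc'] = df['plateID'] + '_' + df['rowID'] + '_' + df['columnID']
df['total_count'] = df.groupby('prc')['count'].transform('sum')
df['fraction'] = df['count'] / df['total_count']

# For each threshold, the mean number of unique gRNAs left per well.
results = []
for threshold in np.arange(0.0, 0.5, 0.05):
    filtered = df[df['fraction'] >= threshold]
    unique_count = filtered.groupby(['plateID', 'rowID', 'columnID'])['grna'].nunique().mean()
    results.append((threshold, unique_count))

results_df = pd.DataFrame(results, columns=['fraction_threshold', 'unique_count'])
print(results_df)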