spacr 0.3.60__py3-none-any.whl → 0.3.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacr/app_annotate.py +0 -8
- spacr/core.py +12 -7
- spacr/gui_utils.py +24 -8
- spacr/io.py +134 -157
- spacr/ml.py +3 -4
- spacr/plot.py +82 -23
- spacr/settings.py +4 -13
- spacr/submodules.py +299 -5
- spacr/utils.py +96 -3
- {spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/METADATA +1 -1
- {spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/RECORD +15 -15
- {spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/LICENSE +0 -0
- {spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/WHEEL +0 -0
- {spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/entry_points.txt +0 -0
- {spacr-0.3.60.dist-info → spacr-0.3.62.dist-info}/top_level.txt +0 -0
spacr/app_annotate.py
CHANGED
@@ -4,14 +4,6 @@ from .gui import MainApp
|
|
4
4
|
from .gui_elements import set_dark_style, spacrButton
|
5
5
|
|
6
6
|
def convert_to_number(value):
|
7
|
-
|
8
|
-
"""
|
9
|
-
Converts a string value to an integer if possible, otherwise converts to a float.
|
10
|
-
Args:
|
11
|
-
value (str): The string representation of the number.
|
12
|
-
Returns:
|
13
|
-
int or float: The converted number.
|
14
|
-
"""
|
15
7
|
try:
|
16
8
|
return int(value)
|
17
9
|
except ValueError:
|
spacr/core.py
CHANGED
@@ -465,10 +465,8 @@ def generate_image_umap(settings={}):
|
|
465
465
|
display(settings_df)
|
466
466
|
|
467
467
|
db_paths = get_db_paths(settings['src'])
|
468
|
-
|
469
468
|
tables = settings['tables'] + ['png_list']
|
470
469
|
all_df = pd.DataFrame()
|
471
|
-
#image_paths = []
|
472
470
|
|
473
471
|
for i,db_path in enumerate(db_paths):
|
474
472
|
df = _read_and_join_tables(db_path, table_names=tables)
|
@@ -476,7 +474,7 @@ def generate_image_umap(settings={}):
|
|
476
474
|
all_df = pd.concat([all_df, df], axis=0)
|
477
475
|
#image_paths.extend(image_paths_tmp)
|
478
476
|
|
479
|
-
all_df['cond'] = all_df['
|
477
|
+
all_df['cond'] = all_df['column_name'].apply(map_condition, neg=settings['neg'], pos=settings['pos'], mix=settings['mix'])
|
480
478
|
|
481
479
|
if settings['exclude_conditions']:
|
482
480
|
if isinstance(settings['exclude_conditions'], str):
|
@@ -495,7 +493,10 @@ def generate_image_umap(settings={}):
|
|
495
493
|
|
496
494
|
# Extract and reset the index for the column to compare
|
497
495
|
col_to_compare = all_df[settings['col_to_compare']].reset_index(drop=True)
|
498
|
-
|
496
|
+
|
497
|
+
#if settings['only_top_features']:
|
498
|
+
# column_list = None
|
499
|
+
|
499
500
|
# Preprocess the data to obtain numeric data
|
500
501
|
numeric_data = preprocess_data(all_df, settings['filter_by'], settings['remove_highly_correlated'], settings['log_data'], settings['exclude'])
|
501
502
|
|
@@ -571,7 +572,11 @@ def generate_image_umap(settings={}):
|
|
571
572
|
print(f'Saved {reduction_method} embedding to {embedding_path} and grid to {grid_path}')
|
572
573
|
|
573
574
|
# Add cluster labels to the dataframe
|
574
|
-
|
575
|
+
if len(labels) > 0:
|
576
|
+
all_df['cluster'] = labels
|
577
|
+
else:
|
578
|
+
all_df['cluster'] = 1 # Assign a default cluster label
|
579
|
+
print("No clusters found. Consider reducing 'min_samples' or increasing 'eps' for DBSCAN.")
|
575
580
|
|
576
581
|
# Save the results to a CSV file
|
577
582
|
results_dir = os.path.join(settings['src'][0], 'results')
|
@@ -653,7 +658,7 @@ def reducer_hyperparameter_search(settings={}, reduction_params=None, dbscan_par
|
|
653
658
|
df = _read_and_join_tables(db_path, table_names=tables)
|
654
659
|
all_df = pd.concat([all_df, df], axis=0)
|
655
660
|
|
656
|
-
all_df['cond'] = all_df['
|
661
|
+
all_df['cond'] = all_df['column_name'].apply(map_condition, neg=settings['neg'], pos=settings['pos'], mix=settings['mix'])
|
657
662
|
|
658
663
|
if settings['exclude_conditions']:
|
659
664
|
if isinstance(settings['exclude_conditions'], str):
|
@@ -882,7 +887,7 @@ def generate_screen_graphs(settings):
|
|
882
887
|
db_loc = [os.path.join(src, 'measurements', 'measurements.db')]
|
883
888
|
|
884
889
|
# Read and merge data from the database
|
885
|
-
df, _ = _read_and_merge_data(db_loc, settings['tables'], verbose=True, nuclei_limit=settings['nuclei_limit'], pathogen_limit=settings['pathogen_limit']
|
890
|
+
df, _ = _read_and_merge_data(db_loc, settings['tables'], verbose=True, nuclei_limit=settings['nuclei_limit'], pathogen_limit=settings['pathogen_limit'])
|
886
891
|
|
887
892
|
# Annotate the data
|
888
893
|
df = annotate_conditions(df, cells=settings['cells'], cell_loc=None, pathogens=settings['controls'], pathogen_loc=settings['controls_loc'], treatments=None, treatment_loc=None)
|
spacr/gui_utils.py
CHANGED
@@ -225,14 +225,30 @@ def annotate(settings):
|
|
225
225
|
conn.close()
|
226
226
|
|
227
227
|
root = tk.Tk()
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
228
|
+
|
229
|
+
root.geometry(f"{root.winfo_screenwidth()}x{root.winfo_screenheight()}")
|
230
|
+
|
231
|
+
db_path = os.path.join(settings['src'], 'measurements/measurements.db')
|
232
|
+
|
233
|
+
app = AnnotateApp(root,
|
234
|
+
db_path=db_path,
|
235
|
+
src=settings['src'],
|
236
|
+
image_type=settings['image_type'],
|
237
|
+
channels=settings['channels'],
|
238
|
+
image_size=settings['img_size'],
|
239
|
+
annotation_column=settings['annotation_column'],
|
240
|
+
normalize=settings['normalize'],
|
241
|
+
percentiles=settings['percentiles'],
|
242
|
+
measurement=settings['measurement'],
|
243
|
+
threshold=settings['threshold'],
|
244
|
+
normalize_channels=settings['normalize_channels'])
|
245
|
+
|
246
|
+
#next_button = tk.Button(root, text="Next", command=app.next_page)
|
247
|
+
#next_button.grid(row=app.grid_rows, column=app.grid_cols - 1)
|
248
|
+
#back_button = tk.Button(root, text="Back", command=app.previous_page)
|
249
|
+
#back_button.grid(row=app.grid_rows, column=app.grid_cols - 2)
|
250
|
+
#exit_button = tk.Button(root, text="Exit", command=app.shutdown)
|
251
|
+
#exit_button.grid(row=app.grid_rows, column=app.grid_cols - 3)
|
236
252
|
|
237
253
|
app.load_images()
|
238
254
|
root.mainloop()
|
spacr/io.py
CHANGED
@@ -1777,7 +1777,7 @@ def _read_and_join_tables(db_path, table_names=['cell', 'cytoplasm', 'nucleus',
|
|
1777
1777
|
png_list_df['cell_id'] = png_list_df['cell_id'].str[1:].astype(int)
|
1778
1778
|
png_list_df.rename(columns={'cell_id': 'object_label'}, inplace=True)
|
1779
1779
|
if 'cell' in dataframes:
|
1780
|
-
join_cols = ['object_label', 'plate', 'row_name', 'column_name']
|
1780
|
+
join_cols = ['object_label', 'plate', 'row_name', 'column_name','field']
|
1781
1781
|
dataframes['cell'] = pd.merge(dataframes['cell'], png_list_df, on=join_cols, how='left')
|
1782
1782
|
else:
|
1783
1783
|
print("Cell table not found in database tables.")
|
@@ -2089,150 +2089,6 @@ def _read_db(db_loc, tables):
|
|
2089
2089
|
conn.close()
|
2090
2090
|
return dfs
|
2091
2091
|
|
2092
|
-
def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathogen_limit=False, uninfected=False):
|
2093
|
-
"""
|
2094
|
-
Read and merge data from SQLite databases and perform data preprocessing.
|
2095
|
-
|
2096
|
-
Parameters:
|
2097
|
-
- locs (list): A list of file paths to the SQLite database files.
|
2098
|
-
- tables (list): A list of table names to read from the databases.
|
2099
|
-
- verbose (bool): Whether to print verbose output. Default is False.
|
2100
|
-
- nuclei_limit (bool): Whether to include multinucleated cells. Default is False.
|
2101
|
-
- pathogen_limit (bool): Whether to include cells with multiple infections. Default is False.
|
2102
|
-
- uninfected (bool): Whether to include non-infected cells. Default is False.
|
2103
|
-
|
2104
|
-
Returns:
|
2105
|
-
- merged_df (pandas.DataFrame): The merged and preprocessed dataframe.
|
2106
|
-
- obj_df_ls (list): A list of pandas DataFrames, each containing the data for a specific object type.
|
2107
|
-
"""
|
2108
|
-
|
2109
|
-
from .utils import _split_data
|
2110
|
-
|
2111
|
-
#Extract plate DataFrames
|
2112
|
-
all_dfs = []
|
2113
|
-
for loc in locs:
|
2114
|
-
db_dfs = _read_db(loc, tables)
|
2115
|
-
all_dfs.append(db_dfs)
|
2116
|
-
|
2117
|
-
#Extract Tables from DataFrames and concatinate rows
|
2118
|
-
for i, dfs in enumerate(all_dfs):
|
2119
|
-
if 'cell' in tables:
|
2120
|
-
cell = dfs[0]
|
2121
|
-
print(f'plate: {i+1} cells:{len(cell)}')
|
2122
|
-
|
2123
|
-
if 'nucleus' in tables:
|
2124
|
-
nucleus = dfs[1]
|
2125
|
-
print(f'plate: {i+1} nucleus:{len(nucleus)} ')
|
2126
|
-
|
2127
|
-
if 'pathogen' in tables:
|
2128
|
-
pathogen = dfs[2]
|
2129
|
-
|
2130
|
-
print(f'plate: {i+1} pathogens:{len(pathogen)}')
|
2131
|
-
if 'cytoplasm' in tables:
|
2132
|
-
if not 'pathogen' in tables:
|
2133
|
-
cytoplasm = dfs[2]
|
2134
|
-
else:
|
2135
|
-
cytoplasm = dfs[3]
|
2136
|
-
print(f'plate: {i+1} cytoplasms: {len(cytoplasm)}')
|
2137
|
-
|
2138
|
-
if i > 0:
|
2139
|
-
if 'cell' in tables:
|
2140
|
-
cells = pd.concat([cells, cell], axis = 0)
|
2141
|
-
if 'nucleus' in tables:
|
2142
|
-
nucleus = pd.concat([nucleus, nucleus], axis = 0)
|
2143
|
-
if 'pathogen' in tables:
|
2144
|
-
pathogens = pd.concat([pathogens, pathogen], axis = 0)
|
2145
|
-
if 'cytoplasm' in tables:
|
2146
|
-
cytoplasms = pd.concat([cytoplasms, cytoplasm], axis = 0)
|
2147
|
-
else:
|
2148
|
-
if 'cell' in tables:
|
2149
|
-
cells = cell.copy()
|
2150
|
-
if 'nucleus' in tables:
|
2151
|
-
nucleus = nucleus.copy()
|
2152
|
-
if 'pathogen' in tables:
|
2153
|
-
pathogens = pathogen.copy()
|
2154
|
-
if 'cytoplasm' in tables:
|
2155
|
-
cytoplasms = cytoplasm.copy()
|
2156
|
-
|
2157
|
-
#Add an o in front of all object and cell lables to convert them to strings
|
2158
|
-
if 'cell' in tables:
|
2159
|
-
cells = cells.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
|
2160
|
-
cells = cells.assign(prcfo = lambda x: x['prcf'] + '_' + x['object_label'])
|
2161
|
-
cells_g_df, metadata = _split_data(cells, 'prcfo', 'object_label')
|
2162
|
-
print(f'cells: {len(cells)}')
|
2163
|
-
print(f'cells grouped: {len(cells_g_df)}')
|
2164
|
-
if 'cytoplasm' in tables:
|
2165
|
-
cytoplasms = cytoplasms.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
|
2166
|
-
cytoplasms = cytoplasms.assign(prcfo = lambda x: x['prcf'] + '_' + x['object_label'])
|
2167
|
-
cytoplasms_g_df, _ = _split_data(cytoplasms, 'prcfo', 'object_label')
|
2168
|
-
merged_df = cells_g_df.merge(cytoplasms_g_df, left_index=True, right_index=True)
|
2169
|
-
print(f'cytoplasms: {len(cytoplasms)}')
|
2170
|
-
print(f'cytoplasms grouped: {len(cytoplasms_g_df)}')
|
2171
|
-
if 'nucleus' in tables:
|
2172
|
-
nucleus = nucleus.dropna(subset=['cell_id'])
|
2173
|
-
nucleus = nucleus.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
|
2174
|
-
nucleus = nucleus.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
|
2175
|
-
nucleus = nucleus.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
|
2176
|
-
nucleus['nucleus_prcfo_count'] = nucleus.groupby('prcfo')['prcfo'].transform('count')
|
2177
|
-
if nuclei_limit == False:
|
2178
|
-
#nucleus = nucleus[~nucleus['prcfo'].duplicated()]
|
2179
|
-
nucleus = nucleus[nucleus['nucleus_prcfo_count']==1]
|
2180
|
-
nucleus_g_df, _ = _split_data(nucleus, 'prcfo', 'cell_id')
|
2181
|
-
print(f'nucleus: {len(nucleus)}')
|
2182
|
-
print(f'nucleus grouped: {len(nucleus_g_df)}')
|
2183
|
-
if 'cytoplasm' in tables:
|
2184
|
-
merged_df = merged_df.merge(nucleus_g_df, left_index=True, right_index=True)
|
2185
|
-
else:
|
2186
|
-
merged_df = cells_g_df.merge(nucleus_g_df, left_index=True, right_index=True)
|
2187
|
-
if 'pathogen' in tables:
|
2188
|
-
pathogens = pathogens.dropna(subset=['cell_id'])
|
2189
|
-
pathogens = pathogens.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
|
2190
|
-
pathogens = pathogens.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
|
2191
|
-
pathogens = pathogens.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
|
2192
|
-
pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')
|
2193
|
-
if uninfected == False:
|
2194
|
-
pathogens = pathogens[pathogens['pathogen_prcfo_count']>=1]
|
2195
|
-
if pathogen_limit == False:
|
2196
|
-
pathogens = pathogens[pathogens['pathogen_prcfo_count']<=1]
|
2197
|
-
pathogens_g_df, _ = _split_data(pathogens, 'prcfo', 'cell_id')
|
2198
|
-
print(f'pathogens: {len(pathogens)}')
|
2199
|
-
print(f'pathogens grouped: {len(pathogens_g_df)}')
|
2200
|
-
merged_df = merged_df.merge(pathogens_g_df, left_index=True, right_index=True)
|
2201
|
-
|
2202
|
-
#Add prc column (plate row column)
|
2203
|
-
metadata = metadata.assign(prc = lambda x: x['plate'] + '_' + x['row_name'] + '_' +x['column_name'])
|
2204
|
-
|
2205
|
-
#Count cells per well
|
2206
|
-
cells_well = pd.DataFrame(metadata.groupby('prc')['object_label'].nunique())
|
2207
|
-
|
2208
|
-
cells_well.reset_index(inplace=True)
|
2209
|
-
cells_well.rename(columns={'object_label': 'cells_per_well'}, inplace=True)
|
2210
|
-
metadata = pd.merge(metadata, cells_well, on='prc', how='inner', suffixes=('', '_drop_col'))
|
2211
|
-
object_label_cols = [col for col in metadata.columns if '_drop_col' in col]
|
2212
|
-
metadata.drop(columns=object_label_cols, inplace=True)
|
2213
|
-
|
2214
|
-
#Add prcfo column (plate row column field object)
|
2215
|
-
metadata = metadata.assign(prcfo = lambda x: x['plate'] + '_' + x['row_name'] + '_' +x['column_name']+ '_' +x['field']+ '_' +x['object_label'])
|
2216
|
-
metadata.set_index('prcfo', inplace=True)
|
2217
|
-
|
2218
|
-
merged_df = metadata.merge(merged_df, left_index=True, right_index=True)
|
2219
|
-
|
2220
|
-
merged_df = merged_df.dropna(axis=1)
|
2221
|
-
|
2222
|
-
print(f'Generated dataframe with: {len(merged_df.columns)} columns and {len(merged_df)} rows')
|
2223
|
-
|
2224
|
-
obj_df_ls = []
|
2225
|
-
if 'cell' in tables:
|
2226
|
-
obj_df_ls.append(cells)
|
2227
|
-
if 'cytoplasm' in tables:
|
2228
|
-
obj_df_ls.append(cytoplasms)
|
2229
|
-
if 'nucleus' in tables:
|
2230
|
-
obj_df_ls.append(nucleus)
|
2231
|
-
if 'pathogen' in tables:
|
2232
|
-
obj_df_ls.append(pathogens)
|
2233
|
-
|
2234
|
-
return merged_df, obj_df_ls
|
2235
|
-
|
2236
2092
|
def _results_to_csv(src, df, df_well):
|
2237
2093
|
"""
|
2238
2094
|
Save the given dataframes as CSV files in the specified directory.
|
@@ -2420,7 +2276,7 @@ def _read_db(db_loc, tables):
|
|
2420
2276
|
conn.close() # Close the connection
|
2421
2277
|
return dfs
|
2422
2278
|
|
2423
|
-
def
|
2279
|
+
def _read_and_merge_data_v1(locs, tables, verbose=False, nuclei_limit=False, pathogen_limit=False):
|
2424
2280
|
|
2425
2281
|
from .utils import _split_data
|
2426
2282
|
|
@@ -2532,11 +2388,6 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathog
|
|
2532
2388
|
pathogens = pathogens.assign(prcfo = lambda x: x['prcf'] + '_' + x['cell_id'])
|
2533
2389
|
pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')
|
2534
2390
|
|
2535
|
-
print(f"before noninfected: {len(pathogens)}")
|
2536
|
-
if uninfected == False:
|
2537
|
-
pathogens = pathogens[pathogens['pathogen_prcfo_count']>=1]
|
2538
|
-
print(f"after noninfected: {len(pathogens)}")
|
2539
|
-
|
2540
2391
|
if isinstance(pathogen_limit, bool):
|
2541
2392
|
if pathogen_limit == False:
|
2542
2393
|
pathogens = pathogens[pathogens['pathogen_prcfo_count']<=1]
|
@@ -2592,7 +2443,135 @@ def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=False, pathog
|
|
2592
2443
|
if 'pathogen' in tables:
|
2593
2444
|
obj_df_ls.append(pathogens)
|
2594
2445
|
|
2595
|
-
return merged_df, obj_df_ls
|
2446
|
+
return merged_df, obj_df_ls
|
2447
|
+
|
2448
|
+
def _read_and_merge_data(locs, tables, verbose=False, nuclei_limit=10, pathogen_limit=10):
|
2449
|
+
from .io import _read_db
|
2450
|
+
from .utils import _split_data
|
2451
|
+
|
2452
|
+
# Initialize an empty dictionary to store DataFrames by table name
|
2453
|
+
data_dict = {table: [] for table in tables}
|
2454
|
+
|
2455
|
+
# Extract plate DataFrames
|
2456
|
+
for loc in locs:
|
2457
|
+
db_dfs = _read_db(loc, tables)
|
2458
|
+
for table, df in zip(tables, db_dfs):
|
2459
|
+
data_dict[table].append(df)
|
2460
|
+
|
2461
|
+
# Concatenate rows across locations for each table
|
2462
|
+
for table, dfs in data_dict.items():
|
2463
|
+
if dfs:
|
2464
|
+
data_dict[table] = pd.concat(dfs, axis=0)
|
2465
|
+
if verbose:
|
2466
|
+
print(f"{table}: {len(data_dict[table])}")
|
2467
|
+
|
2468
|
+
# Initialize merged DataFrame with 'cells' if available
|
2469
|
+
merged_df = pd.DataFrame()
|
2470
|
+
|
2471
|
+
# Process each table
|
2472
|
+
if 'cell' in data_dict:
|
2473
|
+
cells = data_dict['cell'].copy()
|
2474
|
+
cells = cells.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
|
2475
|
+
cells = cells.assign(prcfo=lambda x: x['prcf'] + '_' + x['object_label'])
|
2476
|
+
cells_g_df, metadata = _split_data(cells, 'prcfo', 'object_label')
|
2477
|
+
merged_df = cells_g_df.copy()
|
2478
|
+
if verbose:
|
2479
|
+
print(f'cells: {len(cells)}, cells grouped: {len(cells_g_df)}')
|
2480
|
+
|
2481
|
+
if 'cytoplasm' in data_dict:
|
2482
|
+
cytoplasms = data_dict['cytoplasm'].copy()
|
2483
|
+
cytoplasms = cytoplasms.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
|
2484
|
+
cytoplasms = cytoplasms.assign(prcfo=lambda x: x['prcf'] + '_' + x['object_label'])
|
2485
|
+
|
2486
|
+
if not 'cell' in data_dict:
|
2487
|
+
merged_df, metadata = _split_data(cytoplasms, 'prcfo', 'object_label')
|
2488
|
+
|
2489
|
+
if verbose:
|
2490
|
+
print(f'nucleus: {len(cytoplasms)}, cytoplasms grouped: {len(merged_df)}')
|
2491
|
+
|
2492
|
+
else:
|
2493
|
+
cytoplasms_g_df, _ = _split_data(cytoplasms, 'prcfo', 'object_label')
|
2494
|
+
merged_df = merged_df.merge(cytoplasms_g_df, left_index=True, right_index=True)
|
2495
|
+
|
2496
|
+
if verbose:
|
2497
|
+
print(f'cytoplasms: {len(cytoplasms)}, cytoplasms grouped: {len(cytoplasms_g_df)}')
|
2498
|
+
|
2499
|
+
if 'nucleus' in data_dict:
|
2500
|
+
nucleus = data_dict['nucleus'].copy()
|
2501
|
+
nucleus = nucleus.dropna(subset=['cell_id'])
|
2502
|
+
nucleus = nucleus.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
|
2503
|
+
nucleus = nucleus.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
|
2504
|
+
nucleus = nucleus.assign(prcfo=lambda x: x['prcf'] + '_' + x['cell_id'])
|
2505
|
+
nucleus['nucleus_prcfo_count'] = nucleus.groupby('prcfo')['prcfo'].transform('count')
|
2506
|
+
if not nuclei_limit:
|
2507
|
+
nucleus = nucleus[nucleus['nucleus_prcfo_count'] == 1]
|
2508
|
+
|
2509
|
+
if all(key not in data_dict for key in ['cell', 'cytoplasm']):
|
2510
|
+
merged_df, metadata = _split_data(nucleus, 'prcfo', 'cell_id')
|
2511
|
+
|
2512
|
+
if verbose:
|
2513
|
+
print(f'nucleus: {len(nucleus)}, nucleus grouped: {len(merged_df)}')
|
2514
|
+
|
2515
|
+
else:
|
2516
|
+
nucleus_g_df, _ = _split_data(nucleus, 'prcfo', 'cell_id')
|
2517
|
+
merged_df = merged_df.merge(nucleus_g_df, left_index=True, right_index=True)
|
2518
|
+
|
2519
|
+
if verbose:
|
2520
|
+
print(f'nucleus: {len(nucleus)}, nucleus grouped: {len(nucleus_g_df)}')
|
2521
|
+
|
2522
|
+
if 'pathogen' in data_dict:
|
2523
|
+
pathogens = data_dict['pathogen'].copy()
|
2524
|
+
pathogens = pathogens.dropna(subset=['cell_id'])
|
2525
|
+
pathogens = pathogens.assign(object_label=lambda x: 'o' + x['object_label'].astype(int).astype(str))
|
2526
|
+
pathogens = pathogens.assign(cell_id=lambda x: 'o' + x['cell_id'].astype(int).astype(str))
|
2527
|
+
pathogens = pathogens.assign(prcfo=lambda x: x['prcf'] + '_' + x['cell_id'])
|
2528
|
+
pathogens['pathogen_prcfo_count'] = pathogens.groupby('prcfo')['prcfo'].transform('count')
|
2529
|
+
|
2530
|
+
if isinstance(pathogen_limit, bool) and not pathogen_limit:
|
2531
|
+
pathogens = pathogens[pathogens['pathogen_prcfo_count'] <= 1]
|
2532
|
+
elif isinstance(pathogen_limit, (float, int)):
|
2533
|
+
pathogens = pathogens[pathogens['pathogen_prcfo_count'] <= int(pathogen_limit)]
|
2534
|
+
|
2535
|
+
if all(key not in data_dict for key in ['cell', 'cytoplasm', 'nucleus']):
|
2536
|
+
merged_df, metadata = _split_data(pathogens, 'prcfo', 'cell_id')
|
2537
|
+
|
2538
|
+
if verbose:
|
2539
|
+
print(f'pathogens: {len(pathogens)}, pathogens grouped: {len(merged_df)}')
|
2540
|
+
|
2541
|
+
else:
|
2542
|
+
pathogens_g_df, _ = _split_data(pathogens, 'prcfo', 'cell_id')
|
2543
|
+
merged_df = merged_df.merge(pathogens_g_df, left_index=True, right_index=True)
|
2544
|
+
|
2545
|
+
if verbose:
|
2546
|
+
print(f'pathogens: {len(pathogens)}, pathogens grouped: {len(pathogens_g_df)}')
|
2547
|
+
|
2548
|
+
if 'png_list' in data_dict:
|
2549
|
+
png_list = data_dict['png_list'].copy()
|
2550
|
+
png_list_g_df_numeric, png_list_g_df_non_numeric = _split_data(png_list, 'prcfo', 'cell_id')
|
2551
|
+
png_list_g_df_non_numeric.drop(columns=['plate','row_name','column_name','field','file_name','cell_id', 'prcf'], inplace=True)
|
2552
|
+
if verbose:
|
2553
|
+
print(f'png_list: {len(png_list)}, png_list grouped: {len(png_list_g_df_numeric)}')
|
2554
|
+
merged_df = merged_df.merge(png_list_g_df_numeric, left_index=True, right_index=True)
|
2555
|
+
merged_df = merged_df.merge(png_list_g_df_non_numeric, left_index=True, right_index=True)
|
2556
|
+
|
2557
|
+
# Add prc (plate row column) and prcfo (plate row column field object) columns
|
2558
|
+
metadata = metadata.assign(prc=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'])
|
2559
|
+
cells_well = metadata.groupby('prc')['object_label'].nunique().reset_index(name='cells_per_well')
|
2560
|
+
metadata = metadata.merge(cells_well, on='prc')
|
2561
|
+
metadata = metadata.assign(prcfo=lambda x: x['plate'] + '_' + x['row_name'] + '_' + x['column_name'] + '_' + x['field'] + '_' + x['object_label'])
|
2562
|
+
metadata.set_index('prcfo', inplace=True)
|
2563
|
+
|
2564
|
+
# Merge metadata with final merged DataFrame
|
2565
|
+
merged_df = metadata.merge(merged_df, left_index=True, right_index=True).dropna(axis=1)
|
2566
|
+
merged_df.drop(columns=['label_list_morphology', 'label_list_intensity'], errors='ignore', inplace=True)
|
2567
|
+
|
2568
|
+
if verbose:
|
2569
|
+
print(f'Generated dataframe with: {len(merged_df.columns)} columns and {len(merged_df)} rows')
|
2570
|
+
|
2571
|
+
# Prepare object DataFrames for output
|
2572
|
+
obj_df_ls = [data_dict[table] for table in ['cell', 'cytoplasm', 'nucleus', 'pathogen'] if table in data_dict]
|
2573
|
+
|
2574
|
+
return merged_df, obj_df_ls
|
2596
2575
|
|
2597
2576
|
def _read_mask(mask_path):
|
2598
2577
|
mask = imageio2.imread(mask_path)
|
@@ -2929,8 +2908,8 @@ def generate_training_dataset(settings):
|
|
2929
2908
|
tables=tables,
|
2930
2909
|
verbose=False,
|
2931
2910
|
nuclei_limit=settings['nuclei_limit'],
|
2932
|
-
pathogen_limit=settings['pathogen_limit']
|
2933
|
-
|
2911
|
+
pathogen_limit=settings['pathogen_limit'])
|
2912
|
+
|
2934
2913
|
[png_list_df] = _read_db(db_loc=db_path, tables=['png_list'])
|
2935
2914
|
filtered_png_list_df = png_list_df[png_list_df['prcfo'].isin(df.index)]
|
2936
2915
|
return filtered_png_list_df
|
@@ -2952,8 +2931,7 @@ def generate_training_dataset(settings):
|
|
2952
2931
|
tables=tables,
|
2953
2932
|
verbose=False,
|
2954
2933
|
nuclei_limit=settings['nuclei_limit'],
|
2955
|
-
pathogen_limit=settings['pathogen_limit']
|
2956
|
-
uninfected=settings['uninfected'])
|
2934
|
+
pathogen_limit=settings['pathogen_limit'])
|
2957
2935
|
|
2958
2936
|
print('length df 1', len(df))
|
2959
2937
|
df = annotate_conditions(df, cells=['HeLa'], pathogens=['pathogen'], treatments=settings['classes'],
|
@@ -3034,7 +3012,6 @@ def generate_training_dataset(settings):
|
|
3034
3012
|
|
3035
3013
|
if 'pathogen' not in settings['tables']:
|
3036
3014
|
settings['pathogen_limit'] = 0
|
3037
|
-
settings['uninfected'] = True
|
3038
3015
|
|
3039
3016
|
# Set default settings and save
|
3040
3017
|
settings = set_generate_training_dataset_defaults(settings)
|
spacr/ml.py
CHANGED
@@ -1172,15 +1172,14 @@ def generate_ml_scores(settings):
|
|
1172
1172
|
db_loc = [src+'/measurements/measurements.db']
|
1173
1173
|
tables = ['cell', 'nucleus', 'pathogen','cytoplasm']
|
1174
1174
|
|
1175
|
-
nuclei_limit, pathogen_limit
|
1175
|
+
nuclei_limit, pathogen_limit = settings['nuclei_limit'], settings['pathogen_limit']
|
1176
1176
|
|
1177
1177
|
df, _ = _read_and_merge_data(db_loc,
|
1178
1178
|
tables,
|
1179
1179
|
settings['verbose'],
|
1180
1180
|
nuclei_limit,
|
1181
|
-
pathogen_limit
|
1182
|
-
|
1183
|
-
|
1181
|
+
pathogen_limit)
|
1182
|
+
|
1184
1183
|
if settings['annotation_column'] is not None:
|
1185
1184
|
|
1186
1185
|
settings['location_column'] = settings['annotation_column']
|