masster-0.3.17-py3-none-any.whl → masster-0.3.19-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/load.py +2 -2
- masster/sample/plot.py +2 -1
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +275 -225
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +148 -121
- masster/study/plot.py +306 -106
- masster/study/processing.py +9 -5
- masster/study/study.py +2 -6
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/METADATA +1 -1
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/RECORD +18 -18
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/WHEEL +0 -0
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/entry_points.txt +0 -0
- {masster-0.3.17.dist-info → masster-0.3.19.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
@@ -45,12 +45,12 @@ def add(
     """Add samples from a folder to the study.
 
     Args:
-        folder (str, optional): Path to folder containing sample files.
+        folder (str, optional): Path to folder containing sample files.
             Defaults to study folder or current working directory.
         reset (bool, optional): Whether to reset the study before adding samples.
             Defaults to False.
         adducts (optional): Adducts to use for sample loading. Defaults to None.
-        max_files (int, optional): Maximum number of files to process.
+        max_files (int, optional): Maximum number of files to process.
             Defaults to None (no limit).
         fast (bool, optional): Whether to use optimized loading that skips ms1_df
             for better performance. Defaults to True.
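For orientation, here is a minimal usage sketch of add() as documented above; the import path and folder name are assumptions for illustration, not taken from the diff:

from masster import Study  # assumed import path; Study lives in masster/study/study.py

study = Study()
study.add(
    folder="./raw_data",  # hypothetical folder of .mzML/.raw/.wiff/.sample5 files
    reset=False,          # keep samples already in the study
    adducts=None,         # default adduct set
    max_files=None,       # no cap on the number of files
    fast=True,            # skip ms1_df loading for speed
)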
@@ -104,21 +104,27 @@ def add(
         for file in files:
             if max_files is not None and counter >= max_files:
                 break
-
+
             # Get filename without extension for blacklist check
             basename = os.path.basename(file)
             filename_no_ext = os.path.splitext(basename)[0]
-
+
             # Check if this filename (without extension) is already in blacklist
             if filename_no_ext not in blacklist:
                 files_to_process.append(file)
-                if len(files_to_process) + counter >= (max_files or float("inf")):
+                if len(files_to_process) + counter >= (max_files or float("inf")):
                     break
-
+
         # Batch process all files of this extension using ultra-optimized method
         if files_to_process:
             self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
-            successful = self._add_samples_batch(files_to_process, reset=reset, adducts=adducts, blacklist=blacklist, fast=fast)
+            successful = self._add_samples_batch(
+                files_to_process,
+                reset=reset,
+                adducts=adducts,
+                blacklist=blacklist,
+                fast=fast,
+            )
             counter += successful
             if successful > 0:
                 not_zero = True
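The cap test above leans on `or` to substitute infinity when max_files is None. A self-contained sketch of that idiom, with a note on its one sharp edge (a falsy cap such as 0 also selects the fallback):

def reached_cap(pending: int, done: int, max_files: int | None) -> bool:
    # None -> no limit; note that 0 also falls through to float("inf")
    return pending + done >= (max_files or float("inf"))

assert reached_cap(3, 2, 5)         # 5 >= 5
assert not reached_cap(3, 2, None)  # no limit
assert not reached_cap(3, 2, 0)     # 0 silently behaves like "no limit"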
@@ -140,7 +146,7 @@ def add(
 def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
     """
     Add a single sample to the study.
-
+
     Args:
         file (str): Path to the sample file
         type (str, optional): File type to force. Defaults to None (auto-detect).
@@ -148,31 +154,31 @@ def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
         adducts (optional): Adducts to use for sample loading. Defaults to None.
         fast (bool, optional): Whether to use optimized loading that skips ms1_df
             for better performance. Defaults to True.
-
+
     Returns:
         bool: True if successful, False otherwise.
     """
     if fast:
         # Use optimized method for better performance
         success = self._add_sample_optimized(
-            file,
-            type=type,
-            reset=reset,
+            file,
+            type=type,
+            reset=reset,
             adducts=adducts,
             skip_color_reset=False,  # Do color reset for individual calls
-            skip_schema_check=True
+            skip_schema_check=True,  # Skip schema check for performance (safe with diagonal concat)
         )
     else:
         # Use standard method with full ms1_df loading
         success = self._add_sample_standard(
-            file,
-            type=type,
-            reset=reset,
+            file,
+            type=type,
+            reset=reset,
             adducts=adducts,
             skip_color_reset=False,  # Do color reset for individual calls
-            skip_schema_check=True
+            skip_schema_check=True,  # Skip schema check for performance
         )
-
+
     return success
 
 
@@ -1193,17 +1199,18 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
 
+
 def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
     """
     Optimized batch addition of samples.
-
+
     Args:
         files (list): List of file paths to process
         reset (bool): Whether to reset features before processing
         adducts: Adducts to use for sample loading
         blacklist (set): Set of filenames already processed
         fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
-
+
     Performance optimizations:
     1. No per-sample color reset
     2. No schema enforcement during addition
@@ -1212,126 +1219,135 @@ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, f
     """
     if not files:
         return 0
-
+
     if blacklist is None:
         blacklist = set()
-
+
     self.logger.debug(f"Starting batch addition of {len(files)} samples (fast={fast})...")
-
+
     successful_additions = 0
     failed_additions = 0
-
+
     # Progress reporting setup
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     for i, file in enumerate(
         tqdm(
             files,
             total=len(files),
             desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Batch add",
             disable=tqdm_disable,
-        )
+        ),
     ):
         try:
             # Choose between optimized and standard loading
             if fast:
                 success = self._add_sample_optimized(
-                    file,
-                    reset=reset,
+                    file,
+                    reset=reset,
                     adducts=adducts,
                     skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True  # Skip schema enforcement
+                    skip_schema_check=True,  # Skip schema enforcement
                 )
             else:
                 success = self._add_sample_standard(
-                    file,
-                    reset=reset,
+                    file,
+                    reset=reset,
                     adducts=adducts,
                     skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True  # Skip schema enforcement
+                    skip_schema_check=True,  # Skip schema enforcement
                 )
-
+
             if success:
                 # Add to blacklist for filename tracking
                 basename = os.path.basename(file)
                 filename_no_ext = os.path.splitext(basename)[0]
                 blacklist.add(filename_no_ext)
                 successful_additions += 1
-
+
         except Exception as e:
             self.logger.warning(f"Failed to add sample {file}: {e}")
             failed_additions += 1
             continue
-
+
     # Final cleanup operations done once at the end
     if successful_additions > 0:
         self.logger.debug("Performing final batch cleanup...")
-
+
         # Optional: Only do schema enforcement if specifically needed (usually not required)
         # self._ensure_features_df_schema_order()
-
+
         # Color assignment done once for all samples
         self._sample_color_reset_optimized()
-
+
     self.logger.debug(f"Batch addition complete: {successful_additions} successful, {failed_additions} failed")
-
+
     return successful_additions
 
-def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+
+def _add_sample_optimized(
+    self,
+    file,
+    type=None,
+    reset=False,
+    adducts=None,
+    skip_color_reset=True,
+    skip_schema_check=True,
+):
     """
     Optimized add_sample with performance improvements integrated.
-
+
     Removes:
     - Schema enforcement (_ensure_features_df_schema_order)
     - Complex column alignment and type casting
     - Per-addition color reset
     - Unnecessary column reordering
-
+
     Returns True if successful, False otherwise.
     """
     self.logger.debug(f"Adding: {file}")
-
+
     # Basic validation
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
         return False
-
+
     if not os.path.exists(file):
         self.logger.error(f"File {file} does not exist.")
         return False
-
+
     if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         self.logger.error(f"Unsupported file type: {file}")
         return False
-
+
     # Load sample
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
     # Use standard loading method temporarily to test if this fixes the astuple error
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
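The batch loop drives tqdm with a timestamp-prefixed description so the progress bar visually matches the logger's line format, and disables it when the log level sits above INFO. A standalone sketch of the same pattern (the file list and label string are stand-ins for the study's own attributes):

from datetime import datetime

from tqdm import tqdm

files = ["a.mzML", "b.mzML", "c.mzML"]  # placeholder list
log_label = "Study | "                  # stand-in for self.log_label
tqdm_disable = False                    # True when the log level is above INFO

for file in tqdm(
    files,
    total=len(files),
    desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {log_label}Batch add",
    disable=tqdm_disable,  # suppress the bar entirely when quiet
):
    pass  # per-file work goes here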
@@ -1345,7 +1361,7 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1357,7 +1373,7 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1375,11 +1391,11 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df)
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1387,92 +1403,100 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # OPTIMIZED: Use diagonal concatenation without any schema enforcement
     # This is the fastest concatenation method in Polars and handles type mismatches automatically
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     # REMOVED ALL EXPENSIVE OPERATIONS:
-    # - No _ensure_features_df_schema_order()
+    # - No _ensure_features_df_schema_order()
     # - No complex column alignment
     # - No type casting loops
     # - No sample_color_reset()
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (optimized)")
     return True
 
 
-def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+def _add_sample_standard(
+    self,
+    file,
+    type=None,
+    reset=False,
+    adducts=None,
+    skip_color_reset=True,
+    skip_schema_check=True,
+):
     """
     Standard add_sample method that uses full sample loading (includes ms1_df).
-
+
     This method uses the standard sample.load() method which loads all data
     including ms1_df, providing full functionality but potentially slower performance
     for large MS1 datasets.
-
+
     Returns True if successful, False otherwise.
     """
     self.logger.debug(f"Adding (standard): {file}")
-
+
     # Basic validation
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
         return False
-
+
     if not os.path.exists(file):
         self.logger.error(f"File {file} does not exist.")
         return False
-
+
     if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         self.logger.error(f"Unsupported file type: {file}")
         return False
-
+
     # Load sample using standard method (includes ms1_df)
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
     # Use standard loading method that loads all data including ms1_df
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
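Both code paths assign feature_uid by offsetting an integer range past the current maximum and then concatenate diagonally. A self-contained Polars sketch of those two steps on toy data:

import polars as pl

features = pl.DataFrame({"feature_uid": [1, 2], "mz": [100.1, 200.2]})
new = pl.DataFrame({"mz": [300.3], "rt": [12.5]})  # extra column, no uid yet

# Continue the uid sequence past the current maximum.
offset = features["feature_uid"].max() + 1
new = new.with_columns(pl.int_range(pl.len()).add(offset).alias("feature_uid"))

# how="diagonal" unions the column names; "rt" is null-filled for the old rows.
combined = pl.concat([features, new], how="diagonal")
print(combined)

One caveat to the in-code comment above: diagonal concatenation aligns column names, not dtypes; shared columns with mismatched dtypes still raise unless how="diagonal_relaxed" is used, which casts them to a common supertype.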
@@ -1486,7 +1510,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1498,7 +1522,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1516,11 +1540,11 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df)
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1528,52 +1552,53 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
     return True
+    ## COMMENT AR: Is this intentional?
     # Use standard loading method that loads all data including ms1_df
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
@@ -1587,7 +1612,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1599,7 +1624,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1617,11 +1642,11 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df)
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1629,28 +1654,28 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
     return True
 
@@ -1662,36 +1687,38 @@ def _sample_color_reset_optimized(self):
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     # Cache the colormap if not already cached
-    if not hasattr(self, "_cached_colormap"):
+    if not hasattr(self, "_cached_colormap"):
         try:
             from cmap import Colormap
-            self._cached_colormap = Colormap("turbo")
+
+            self._cached_colormap = Colormap("turbo")
         except ImportError:
             self.logger.warning("cmap package not available, using default colors")
             return
-
+
     cm = self._cached_colormap
     n_samples = len(self.samples_df)
-
+
     # Pre-allocate colors list for better performance
     colors = [None] * n_samples
-
+
     # Vectorized color generation
     for i in range(n_samples):
         normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
         color_rgba = cm(normalized_value)
-
+
         if len(color_rgba) >= 3:
             r, g, b = color_rgba[:3]
             if max(color_rgba[:3]) <= 1.0:
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
             colors[i] = f"#{r:02x}{g:02x}{b:02x}"
-
+
     # Update the sample_color column efficiently
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color")
+        pl.Series("sample_color", colors).alias("sample_color"),
     )
-
+
     self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
+
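For context, the color assignment maps sample i of n into the middle 80% of the turbo colormap and renders it as a hex string. A minimal standalone sketch mirroring the arithmetic above, assuming the optional cmap package is installed:

from cmap import Colormap

cm = Colormap("turbo")
n_samples = 4

for i in range(n_samples):
    # Spread samples across [0.1, 0.9] of the colormap range.
    normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
    color_rgba = cm(normalized_value)
    r, g, b = color_rgba[:3]
    if max(color_rgba[:3]) <= 1.0:  # floats in [0, 1] -> 8-bit channels
        r, g, b = int(r * 255), int(g * 255), int(b * 255)
    print(f"#{r:02x}{g:02x}{b:02x}")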