masster 0.5.17__py3-none-any.whl → 0.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/study/load.py CHANGED
@@ -161,7 +161,8 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         bool: True if successful, False otherwise.
     """
 
-    success = self._add_sample_optimized(
+    success = _add_sample_noms1(
+        self,
         file,
         type=type,
         reset=reset,
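
Note: the hunk above reroutes add_sample from the removed self._add_sample_optimized wrapper to the module-level _add_sample_noms1 helper, passing the study object explicitly as the first argument. A minimal sketch of that delegation pattern, with simplified signatures and a placeholder body rather than masster's actual logic:

# Sketch only: the helper body below is illustrative, not the real implementation.
def _add_sample_noms1(study, file, type=None, reset=False, adducts=None):
    """Add `file` to `study` without materializing the full MS1 table."""
    # ... real loading/feature-finding logic lives in masster/study/load.py ...
    return True

def add_sample(self, file, type=None, reset=False, adducts=None):
    # The public entry point now forwards to the module-level helper and
    # passes the study instance explicitly as the first argument.
    success = _add_sample_noms1(self, file, type=type, reset=reset, adducts=adducts)
    return success
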
@@ -1031,511 +1032,6 @@ def _process_sample_for_parallel_fill(
 
     return new_features, new_mapping, counter
 
-'''
-def _load_ms1_optimized(self, sample_path, mz_ranges, rt_ranges):
-    """
-    OPTIMIZED: Load only the MS1 data we actually need instead of the entire file.
-    Pre-filter by m/z and RT ranges to reduce memory usage and processing time.
-    """
-    try:
-        # Load full MS1 data (we'll optimize this further later)
-        ms1_data = self._load_ms1(filename=sample_path)
-        if ms1_data is None or ms1_data.is_empty():
-            return ms1_data
-
-        # OPTIMIZATION: Pre-filter to only relevant m/z ranges to reduce data size
-        if mz_ranges:
-            # Build comprehensive m/z filter covering all ranges
-            mz_min = min(r[0] for r in mz_ranges)
-            mz_max = max(r[1] for r in mz_ranges)
-
-            # Pre-filter by broad m/z range first (much faster than multiple OR conditions)
-            ms1_filtered = ms1_data.filter(
-                (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
-            )
-
-            # If we have RT ranges, also pre-filter by RT
-            if rt_ranges and len(rt_ranges) > 0:
-                rt_min = min(r[0] for r in rt_ranges)
-                rt_max = max(r[1] for r in rt_ranges)
-                ms1_filtered = ms1_filtered.filter(
-                    (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
-                )
-
-            return ms1_filtered
-
-        return ms1_data
-
-    except Exception:
-        return pl.DataFrame()
-'''
-
-'''
-def _create_empty_features(self, consensus_uids, consensus_info, sample_uid, features_df_max_uid):
-    """Create empty features for consensus UIDs when no MS1 data is available."""
-    new_features = []
-    new_mapping = []
-
-    for i, consensus_uid in enumerate(consensus_uids):
-        cons = consensus_info[consensus_uid]
-        feature_uid = features_df_max_uid + i + 1
-
-        # Create minimal empty feature
-        empty_eic = Chromatogram(
-            rt=np.array([cons["rt_start_mean"], cons["rt_end_mean"]]),
-            inty=np.array([0.0, 0.0]),
-            label=f"EIC mz={cons['mz']:.4f}",
-            file="",
-            mz=cons["mz"],
-            feature_start=cons["rt_start_mean"],
-            feature_end=cons["rt_end_mean"],
-            feature_apex=cons["rt"],
-        )
-
-        new_feature = {
-            "sample_uid": sample_uid,
-            "feature_uid": feature_uid,
-            "feature_id": None,
-            "mz": cons["mz"],
-            "rt": cons["rt"],
-            "rt_original": 0.0 if cons["rt"] == 0.0 else None,
-            "rt_start": cons["rt_start_mean"],
-            "rt_end": cons["rt_end_mean"],
-            "rt_delta": cons["rt_end_mean"] - cons["rt_start_mean"],
-            "mz_start": None,
-            "mz_end": None,
-            "inty": 0.0,
-            "quality": None,
-            "charge": None,
-            "iso": None,
-            "iso_of": None,
-            "adduct": None,
-            "adduct_mass": None,
-            "adduct_group": None,
-            "chrom": empty_eic,
-            "filled": True,
-            "chrom_area": 0.0,
-            "chrom_coherence": None,
-            "chrom_prominence": None,
-            "chrom_prominence_scaled": None,
-            "chrom_height_scaled": None,
-            "ms2_scans": None,
-            "ms2_specs": None,
-        }
-
-        new_features.append(new_feature)
-        new_mapping.append({
-            "consensus_uid": consensus_uid,
-            "sample_uid": sample_uid,
-            "feature_uid": feature_uid,
-        })
-
-    return new_features, new_mapping, len(new_features)
-'''
-
-'''
-def _create_feature_fast(self, consensus_uid, sample_uid, features_df_max_uid, consensus_info):
-    """
-    OPTIMIZED: Create a minimal empty feature quickly without expensive operations.
-    Used for RT=0 features and other cases where we just need a placeholder feature.
-    """
-    cons = consensus_info[consensus_uid]
-    feature_uid = features_df_max_uid
-
-    # Create minimal empty feature
-    empty_eic = Chromatogram(
-        rt=np.array([cons["rt_start_mean"], cons["rt_end_mean"]]),
-        inty=np.array([0.0, 0.0]),
-        label=f"EIC mz={cons['mz']:.4f}",
-        file="",
-        mz=cons["mz"],
-        feature_start=cons["rt_start_mean"],
-        feature_end=cons["rt_end_mean"]
-    )
-
-    new_feature = {
-        "uid": feature_uid,
-        "sample_uid": sample_uid,
-        "mz": cons["mz"],
-        "rt": cons["rt"],
-        "mz_centroid": None,
-        "rt_centroid": None,
-        "iso": None,
-        "iso_of": None,
-        "adduct": None,
-        "adduct_mass": None,
-        "adduct_group": None,
-        "chrom": empty_eic,
-        "filled": True,
-        "chrom_area": 0.0,
-        "chrom_coherence": None,
-        "chrom_prominence": None,
-        "chrom_prominence_scaled": None,
-        "chrom_height_scaled": None,
-        "ms2_scans": None,
-        "ms2_specs": None,
-    }
-
-    new_features = [new_feature]
-    new_mapping = [{
-        "consensus_uid": consensus_uid,
-        "sample_uid": sample_uid,
-        "feature_uid": feature_uid,
-    }]
-
-    return new_features, new_mapping, 1
-'''
-
-'''
-def _process_rt_zero_features_batch(self, rt_zero_consensus_uids, consensus_info, sample_uid,
-                                    features_df_max_uid, rt_zero_features):
-    """
-    OPTIMIZED: Process all RT=0 features in a batch since they share similar characteristics.
-    RT=0 features are typically not real peaks but artifacts or noise.
-    """
-    new_features = []
-    new_mapping = []
-
-    for consensus_uid in rt_zero_consensus_uids:
-        new_features_batch, new_mapping_batch, _ = self._create_feature_fast(
-            consensus_uid, sample_uid, features_df_max_uid, consensus_info
-        )
-        new_features.extend(new_features_batch)
-        new_mapping.extend(new_mapping_batch)
-        features_df_max_uid += 1
-
-        # Track RT=0 features for statistics
-        rt_zero_features.append(1)
-
-    return new_features, new_mapping, features_df_max_uid
-'''
-
-'''
-def _process_normal_rt_features_batch(self, normal_rt_consensus_uids, consensus_info, ms1_data,
-                                      sample_uid, sample_path, mz_tol, rt_tol, features_df_max_uid):
-    """
-    OPTIMIZED: Process normal RT features in batch with pre-filtered MS1 data.
-    Only loads chromatograms once per batch instead of per feature.
-    """
-    new_features = []
-    new_mapping = []
-
-    if len(normal_rt_consensus_uids) == 0:
-        return new_features, new_mapping, features_df_max_uid
-
-    # OPTIMIZATION: Pre-filter MS1 data by m/z range to reduce data size
-    all_mzs = [consensus_info[cuid]["mz"] for cuid in normal_rt_consensus_uids]
-    mz_min = min(all_mzs) - max(0.01, min(all_mzs) * mz_tol / 1e6)
-    mz_max = max(all_mzs) + max(0.01, max(all_mzs) * mz_tol / 1e6)
-
-    # Pre-filter MS1 data once for all features
-    ms1_filtered = ms1_data.filter(
-        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
-    )
-
-    # Early exit if no data in m/z range
-    if ms1_filtered.shape[0] == 0:
-        # Create empty features for all consensus UIDs
-        for consensus_uid in normal_rt_consensus_uids:
-            new_features_batch, new_mapping_batch, _ = self._create_feature_fast(
-                consensus_uid, sample_uid, features_df_max_uid, consensus_info
-            )
-            new_features.extend(new_features_batch)
-            new_mapping.extend(new_mapping_batch)
-            features_df_max_uid += 1
-        return new_features, new_mapping, features_df_max_uid
-
-    # Process each feature with pre-filtered data
-    for consensus_uid in normal_rt_consensus_uids:
-        info = consensus_info[consensus_uid]
-        mz, rt = info["mz"], info["rt"]
-
-        # Extract chromatogram using pre-loaded MS1 data (FIXED!)
-        sample_obj = self._load_ms1(sample_path)  # Get the sample object for extract_eic method
-        eic = sample_obj.extract_eic(
-            mz, mz_tol, rt, rt_tol, ms1_data=ms1_filtered  # Use the pre-filtered data!
-        )
-
-        # Find best peak
-        best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol)
-
-        # Create feature
-        new_feature = {
-            "uid": features_df_max_uid,
-            "sample_uid": sample_uid,
-            "mz": mz,
-            "rt": rt,
-            "mz_centroid": None,
-            "rt_centroid": None,
-            "iso": None,
-            "iso_of": None,
-            "adduct": None,
-            "adduct_mass": None,
-            "adduct_group": None,
-            "chrom": eic if best_peak else Chromatogram(
-                rt=np.array([rt, rt]),
-                inty=np.array([0.0, 0.0]),
-                label=f"EIC mz={mz:.4f}",
-                file="",
-                mz=mz,
-                feature_start=rt,
-                feature_end=rt
-            ),
-            "filled": True,
-            "chrom_area": best_peak.get("area", 0.0) if best_peak else 0.0,
-            "chrom_coherence": best_peak.get("coherence") if best_peak else None,
-            "chrom_prominence": best_peak.get("prominence") if best_peak else None,
-            "chrom_prominence_scaled": best_peak.get("prominence_scaled") if best_peak else None,
-            "chrom_height_scaled": best_peak.get("height_scaled") if best_peak else None,
-            "ms2_scans": None,
-            "ms2_specs": None,
-        }
-
-        new_features.append(new_feature)
-        new_mapping.append({
-            "consensus_uid": consensus_uid,
-            "sample_uid": sample_uid,
-            "feature_uid": features_df_max_uid,
-        })
-        features_df_max_uid += 1
-
-    return new_features, new_mapping, features_df_max_uid
-'''
-
-'''def _batch_process_features(self, consensus_uids, consensus_info, ms1_data, sample_uid, sample_path,
-                              mz_tol, rt_tol, features_df_max_uid, rt_zero_features):
-    """
-    OPTIMIZED: Process all missing features for a sample in a single batch operation.
-    This avoids repeated filtering of the MS1 dataframe.
-    """
-    new_features = []
-    new_mapping = []
-
-    # OPTIMIZATION: Process RT=0 features separately (they need special handling)
-    rt_zero_data = {}
-    if rt_zero_features:
-        rt_zero_data = self._process_rt_zero_features_batch(
-            rt_zero_features, consensus_info, ms1_data, mz_tol, rt_tol
-        )
-
-    # OPTIMIZATION: Build comprehensive filter for all normal RT features at once
-    normal_rt_features = [uid for uid in consensus_uids if uid not in rt_zero_features]
-    normal_rt_data = {}
-    if normal_rt_features:
-        normal_rt_data = self._process_normal_rt_features_batch(
-            normal_rt_features, consensus_info, ms1_data, mz_tol, rt_tol
-        )
-
-    # Combine results and create features
-    all_feature_data = {**rt_zero_data, **normal_rt_data}
-
-    for i, consensus_uid in enumerate(consensus_uids):
-        feature_uid = features_df_max_uid + i + 1
-        cons = consensus_info[consensus_uid]
-
-        # Get pre-processed data for this feature
-        feature_ms1_data = all_feature_data.get(consensus_uid, pl.DataFrame())
-
-        # Create feature using optimized chromatogram creation
-        new_feature, area = self._create_feature_fast(
-            consensus_uid, cons, feature_ms1_data, sample_uid, sample_path,
-            feature_uid, mz_tol, rt_tol
-        )
-
-        new_features.append(new_feature)
-        new_mapping.append({
-            "consensus_uid": consensus_uid,
-            "sample_uid": sample_uid,
-            "feature_uid": feature_uid,
-        })
-
-    return new_features, new_mapping, len(new_features)
-
-    # Process each missing feature
-    for consensus_uid in sample_missing:
-        cons = consensus_info[consensus_uid]
-        mz = cons["mz"]
-        rt = cons["rt"]
-        rt_start_mean = cons["rt_start_mean"]
-        rt_end_mean = cons["rt_end_mean"]
-
-        # Filter MS1 data for this feature
-        if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
-            # Special handling for RT=0 (library-derived features)
-            if rt == 0.0:
-                # Simple RT=0 processing: find max intensity across full m/z range
-                d_full = file.ms1_df.filter(
-                    (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
-                )
-
-                if not d_full.is_empty():
-                    max_inty = d_full["inty"].max()
-                    if max_inty > 0:
-                        max_rt = d_full.filter(pl.col("inty") == max_inty)["rt"].min()
-
-                        # Use default rt_tol for RT=0 features
-                        eic_rt_tol = rt_tol
-
-                        # Filter around max RT
-                        d = d_full.filter(
-                            (pl.col("rt") >= max_rt - eic_rt_tol) &
-                            (pl.col("rt") <= max_rt + eic_rt_tol)
-                        )
-
-                        # Update consensus RT info
-                        rt = max_rt
-                        rt_start_mean = max_rt - eic_rt_tol
-                        rt_end_mean = max_rt + eic_rt_tol
-                    else:
-                        d = pl.DataFrame()
-                else:
-                    d = pl.DataFrame()
-            else:
-                # Normal RT-based filtering for non-zero RT
-                d = file.ms1_df.filter(
-                    (pl.col("mz") >= mz - mz_tol)
-                    & (pl.col("mz") <= mz + mz_tol)
-                    & (pl.col("rt") >= rt_start_mean - rt_tol)
-                    & (pl.col("rt") <= rt_end_mean + rt_tol),
-                )
-        else:
-            d = pl.DataFrame()
-
-        # Create chromatogram
-        if d.is_empty():
-            eic = Chromatogram(
-                rt=np.array([rt_start_mean, rt_end_mean]),
-                inty=np.array([0.0, 0.0]),
-                label=f"EIC mz={mz:.4f}",
-                file=sample_path,
-                mz=mz,
-                mz_tol=mz_tol,
-                feature_start=rt_start_mean,
-                feature_end=rt_end_mean,
-                feature_apex=rt,
-            )
-            max_inty = 0.0
-            area = 0.0
-            chrom_coherence = None
-            chrom_prominence = None
-            chrom_prominence_scaled = None
-            chrom_height_scaled = None
-            peak_rt_start = rt_start_mean
-            peak_rt_end = rt_end_mean
-            peak_rt_delta = rt_end_mean - rt_start_mean
-        else:
-            eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
-
-            if len(eic_rt) > 4:
-                eic = Chromatogram(
-                    eic_rt["rt"].to_numpy(),
-                    eic_rt["inty"].to_numpy(),
-                    label=f"EIC mz={mz:.4f}",
-                    file=sample_path,
-                    mz=mz,
-                    mz_tol=mz_tol,
-                    feature_start=rt_start_mean,
-                    feature_end=rt_end_mean,
-                    feature_apex=rt,
-                ).find_peaks()
-                max_inty = np.max(eic.inty)
-                area = eic.feature_area
-
-                # Extract chromatogram peak properties from first peak (if available)
-                if len(eic.peak_rts) > 0 and eic.feature_start is not None and eic.feature_end is not None:
-                    chrom_coherence = round(eic.feature_coherence, 3) if eic.feature_coherence is not None else None
-                    chrom_prominence = round(eic.peak_prominences[0], 3) if len(eic.peak_prominences) > 0 else None
-                    chrom_prominence_scaled = round(eic.peak_prominences[0] / (np.mean(eic.inty) + 1e-10), 3) if len(eic.peak_prominences) > 0 else None
-                    chrom_height_scaled = round(eic.peak_heights[0] / (np.mean(eic.inty) + 1e-10), 3) if len(eic.peak_heights) > 0 else None
-                    peak_rt_start = eic.feature_start
-                    peak_rt_end = eic.feature_end
-                    peak_rt_delta = peak_rt_end - peak_rt_start
-                else:
-                    chrom_coherence = None
-                    chrom_prominence = None
-                    chrom_prominence_scaled = None
-                    chrom_height_scaled = None
-                    peak_rt_start = rt_start_mean
-                    peak_rt_end = rt_end_mean
-                    peak_rt_delta = rt_end_mean - rt_start_mean
-            else:
-                eic = Chromatogram(
-                    eic_rt["rt"].to_numpy(),
-                    eic_rt["inty"].to_numpy(),
-                    label=f"EIC mz={mz:.4f}",
-                    file=sample_path,
-                    mz=mz,
-                    mz_tol=mz_tol,
-                    feature_start=rt_start_mean,
-                    feature_end=rt_end_mean,
-                    feature_apex=rt,
-                )
-                max_inty = 0.0
-                area = 0.0
-                chrom_coherence = None
-                chrom_prominence = None
-                chrom_prominence_scaled = None
-                chrom_height_scaled = None
-                peak_rt_start = rt_start_mean
-                peak_rt_end = rt_end_mean
-                peak_rt_delta = rt_end_mean - rt_start_mean
-
-        # Generate feature UID (will be adjusted later to ensure global uniqueness)
-        feature_uid = features_df_max_uid + len(new_features) + 1
-
-        # Handle rt_original: for RT=0 features, set to 0; otherwise estimate from closest feature
-        if rt == 0.0 or (hasattr(cons, 'get') and cons.get("rt") == 0.0):
-            estimated_rt_original = 0.0
-        else:
-            estimated_rt_original = _estimate_rt_original_for_filled_feature(
-                self, sample_uid, rt, logger=self.logger if hasattr(self, 'logger') else None
-            )
-
-        # Create new feature entry with updated chromatogram properties
-        new_feature = {
-            "sample_uid": sample_uid,
-            "feature_uid": feature_uid,
-            "feature_id": None,
-            "mz": mz,
-            "rt": rt,
-            "rt_original": estimated_rt_original,
-            "rt_start": peak_rt_start,
-            "rt_end": peak_rt_end,
-            "rt_delta": peak_rt_delta,
-            "mz_start": None,
-            "mz_end": None,
-            "inty": max_inty,
-            "quality": None,
-            "charge": None,
-            "iso": None,
-            "iso_of": None,
-            "adduct": None,
-            "adduct_mass": None,
-            "adduct_group": None,
-            "chrom": eic,
-            "filled": True,
-            "chrom_area": area,
-            "chrom_coherence": chrom_coherence,
-            "chrom_prominence": chrom_prominence,
-            "chrom_prominence_scaled": chrom_prominence_scaled,
-            "chrom_height_scaled": chrom_height_scaled,
-            "ms2_scans": None,
-            "ms2_specs": None,
-        }
-
-        new_features.append(new_feature)
-        new_mapping.append(
-            {
-                "consensus_uid": consensus_uid,
-                "sample_uid": sample_uid,
-                "feature_uid": feature_uid,
-            },
-        )
-        counter += 1
-
-    return new_features, new_mapping, counter
-'''
-
 def _fill_chrom_impl(
     self,
     uids=None,
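
Note: the deleted blocks above repeatedly pre-filter the MS1 table to a broad m/z window (and optionally an RT window) before any per-feature work, using polars expressions on the mz, rt, and inty columns. A self-contained sketch of that pre-filtering step, with made-up toy data and the same column names as the removed code:

import polars as pl

# Toy MS1 table; real tables come from Sample._load_ms1 in masster.
ms1_data = pl.DataFrame({
    "mz": [150.05, 300.10, 450.20],
    "rt": [12.0, 55.0, 120.0],
    "inty": [1.0e4, 5.0e5, 2.0e3],
})

mz_ranges = [(299.9, 300.3), (450.0, 450.5)]
rt_ranges = [(50.0, 130.0)]

# One broad filter over the union of all m/z windows, as in the removed helper.
mz_min = min(r[0] for r in mz_ranges)
mz_max = max(r[1] for r in mz_ranges)
ms1_filtered = ms1_data.filter((pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max))

# Optionally narrow further by the union of the RT windows.
if rt_ranges:
    rt_min = min(r[0] for r in rt_ranges)
    rt_max = max(r[1] for r in rt_ranges)
    ms1_filtered = ms1_filtered.filter((pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max))

print(ms1_filtered)  # only rows inside both broad windows remain
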
@@ -2198,10 +1694,19 @@ def _add_sample_noms1(
             self.logger.warning(f"Failed to add sample {file}: {e}")
             return False
 
-    # Check if features map was created successfully
-    #if ddaobj._oms_features_map is None:
-    #    self.logger.warning(f"Failed to add sample {file}: No features map created")
-    #    return False
+    # Check polarity compatibility
+    sample_polarity = getattr(ddaobj, 'polarity', None)
+    study_polarity = getattr(self, 'polarity', None)
+
+    if sample_polarity is not None and study_polarity is not None:
+        # Normalize polarity names for comparison
+        sample_pol_norm = "positive" if sample_polarity in ["pos", "positive"] else "negative" if sample_polarity in ["neg", "negative"] else sample_polarity
+        study_pol_norm = "positive" if study_polarity in ["pos", "positive"] else "negative" if study_polarity in ["neg", "negative"] else study_polarity
+
+        if sample_pol_norm != study_pol_norm:
+            self.logger.warning(f"Sample {sample_name} polarity ({sample_polarity}) differs from study polarity ({study_polarity}). Skipping sample.")
+            return False
+
 
     #self.features_maps.append(ddaobj._oms_features_map)
 
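
Note: the added check normalizes the "pos"/"positive" and "neg"/"negative" spellings before comparing sample and study polarity, and skips the sample on a mismatch. A small standalone sketch of the same normalization; the helper name is illustrative, not part of masster:

def _normalize_polarity(polarity):
    # Map common spellings onto a canonical value; anything else passes through unchanged.
    if polarity in ("pos", "positive"):
        return "positive"
    if polarity in ("neg", "negative"):
        return "negative"
    return polarity

# A mismatch such as "pos" vs. "negative" is only detected after normalization.
assert _normalize_polarity("pos") == _normalize_polarity("positive")
assert _normalize_polarity("pos") != _normalize_polarity("neg")
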
@@ -2310,285 +1815,3 @@ def _add_sample_noms1(
         f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)",
     )
     return True
-
-'''
-def _add_sample_standard(
-    self,
-    file,
-    type=None,
-    reset=False,
-    adducts=None,
-    skip_color_reset=True,
-    skip_schema_check=True,
-):
-    """
-    Standard add_sample method that uses full sample loading (includes ms1_df).
-
-    This method uses the standard sample.load() method which loads all data
-    including ms1_df, providing full functionality but potentially slower performance
-    for large MS1 datasets.
-
-    Returns True if successful, False otherwise.
-    """
-    self.logger.debug(f"Adding (standard): {file}")
-
-    # Basic validation
-    basename = os.path.basename(file)
-    sample_name = os.path.splitext(basename)[0]
-
-    if sample_name in self.samples_df["sample_name"].to_list():
-        self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
-        return False
-
-    if not os.path.exists(file):
-        self.logger.error(f"File {file} does not exist.")
-        return False
-
-    if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
-        self.logger.error(f"Unsupported file type: {file}")
-        return False
-
-    # Load sample using standard method (includes ms1_df)
-    ddaobj = Sample()
-    ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-    # Use standard loading method that loads all data including ms1_df
-
-    if file.endswith(".sample5"):
-        ddaobj.load(file)
-        # restore _oms_features_map
-        ddaobj._get_feature_map()
-    else:
-        try:
-            ddaobj.load(file)
-            ddaobj.find_features()
-            ddaobj.find_adducts(adducts=adducts)
-            ddaobj.find_ms2()
-        except Exception as e:
-            self.logger.warning(f"Failed to add sample {file}: {e}")
-            return False
-
-    # Check if features map was created successfully
-    if ddaobj._oms_features_map is None:
-        self.logger.warning(f"Failed to add sample {file}: No features map created")
-        return False
-
-    self.features_maps.append(ddaobj._oms_features_map)
-
-    # Determine sample type
-    sample_type = "sample" if type is None else type
-    if "qc" in sample_name.lower():
-        sample_type = "qc"
-    if "blank" in sample_name.lower():
-        sample_type = "blank"
-
-    map_id_value = len(self.features_maps) - 1
-
-    # Handle file paths
-    if file.endswith(".sample5"):
-        final_sample_path = file
-        # self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
-    else:
-        if self.folder is not None:
-            if not os.path.exists(self.folder):
-                os.makedirs(self.folder)
-            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
-        else:
-            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
-        ddaobj.save(final_sample_path)
-        self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
-    # Efficient scan counting
-    ms1_count = ms2_count = 0
-    if (
-        hasattr(ddaobj, "scans_df")
-        and ddaobj.scans_df is not None
-        and not ddaobj.scans_df.is_empty()
-    ):
-        scan_counts = (
-            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
-        )
-        ms_levels = scan_counts.get("ms_level", [])
-        counts = scan_counts.get("len", [])
-        for level, count in zip(ms_levels, counts):
-            if level == 1:
-                ms1_count = count
-            elif level == 2:
-                ms2_count = count
-
-    # Create sample entry
-    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-    new_sample = pl.DataFrame(
-        {
-            "sample_uid": [int(len(self.samples_df) + 1)],
-            "sample_name": [sample_name],
-            "sample_path": [final_sample_path],
-            "sample_type": [sample_type],
-            "map_id": [map_id_value],
-            "sample_source": [getattr(ddaobj, "file_source", file)],
-            "sample_color": [None],  # Will be set in batch at end
-            "sample_group": [""],
-            "sample_batch": [1],
-            "sample_sequence": [next_sequence],
-            "num_features": [int(ddaobj._oms_features_map.size())],
-            "num_ms1": [ms1_count],
-            "num_ms2": [ms2_count],
-        },
-    )
-
-    self.samples_df = pl.concat([self.samples_df, new_sample])
-
-    # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df)
-
-    # Add required columns with minimal operations
-    columns_to_add = [
-        pl.lit(current_sample_uid).alias("sample_uid"),
-        pl.lit(False).alias("filled"),
-        pl.lit(-1.0).alias("chrom_area"),
-    ]
-
-    # Only add rt_original if it doesn't exist
-    if "rt_original" not in ddaobj.features_df.columns:
-        columns_to_add.append(pl.col("rt").alias("rt_original"))
-
-    f_df = ddaobj.features_df.with_columns(columns_to_add)
-
-    if self.features_df.is_empty():
-        # First sample
-        self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid"),
-        )
-    else:
-        # Subsequent samples - minimal overhead
-        offset = self.features_df["feature_uid"].max() + 1
-        f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
-        )
-
-    # Use diagonal concatenation for flexibility
-    self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
-    self.logger.debug(
-        f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
-    )
-    return True
-    ## COMMENT AR: Is this intentional?
-    # Use standard loading method that loads all data including ms1_df
-    ddaobj.load(file)
-
-    if ddaobj.features_df is None and not reset:
-        ddaobj._oms_features_map = None
-
-    if ddaobj._oms_features_map is None or reset:
-        ddaobj.find_features()
-        ddaobj.find_adducts(adducts=adducts)
-        ddaobj.find_ms2()
-
-    self.features_maps.append(ddaobj._oms_features_map)
-
-    # Determine sample type
-    sample_type = "sample" if type is None else type
-    if "qc" in sample_name.lower():
-        sample_type = "qc"
-    if "blank" in sample_name.lower():
-        sample_type = "blank"
-
-    map_id_value = len(self.features_maps) - 1
-
-    # Handle file paths
-    if file.endswith(".sample5"):
-        final_sample_path = file
-        # self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
-    else:
-        if self.folder is not None:
-            if not os.path.exists(self.folder):
-                os.makedirs(self.folder)
-            final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
-        else:
-            final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
-        ddaobj.save(final_sample_path)
-        self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
-    # Efficient scan counting
-    ms1_count = ms2_count = 0
-    if (
-        hasattr(ddaobj, "scans_df")
-        and ddaobj.scans_df is not None
-        and not ddaobj.scans_df.is_empty()
-    ):
-        scan_counts = (
-            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
-        )
-        ms_levels = scan_counts.get("ms_level", [])
-        counts = scan_counts.get("len", [])
-        for level, count in zip(ms_levels, counts):
-            if level == 1:
-                ms1_count = count
-            elif level == 2:
-                ms2_count = count
-
-    # Create sample entry
-    next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-    new_sample = pl.DataFrame(
-        {
-            "sample_uid": [int(len(self.samples_df) + 1)],
-            "sample_name": [sample_name],
-            "sample_path": [final_sample_path],
-            "sample_type": [sample_type],
-            "map_id": [map_id_value],
-            "sample_source": [getattr(ddaobj, "file_source", file)],
-            "sample_color": [None],  # Will be set in batch at end
-            "sample_group": [""],
-            "sample_batch": [1],
-            "sample_sequence": [next_sequence],
-            "num_features": [int(ddaobj._oms_features_map.size())],
-            "num_ms1": [ms1_count],
-            "num_ms2": [ms2_count],
-        },
-    )
-
-    self.samples_df = pl.concat([self.samples_df, new_sample])
-
-    # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df)
-
-    # Add required columns with minimal operations
-    columns_to_add = [
-        pl.lit(current_sample_uid).alias("sample_uid"),
-        pl.lit(False).alias("filled"),
-        pl.lit(-1.0).alias("chrom_area"),
-    ]
-
-    # Only add rt_original if it doesn't exist
-    if "rt_original" not in ddaobj.features_df.columns:
-        columns_to_add.append(pl.col("rt").alias("rt_original"))
-
-    f_df = ddaobj.features_df.with_columns(columns_to_add)
-
-    if self.features_df.is_empty():
-        # First sample
-        self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid"),
-        )
-    else:
-        # Subsequent samples - minimal overhead
-        offset = self.features_df["feature_uid"].max() + 1
-        f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
-        )
-
-    # Use diagonal concatenation for flexibility
-    self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
-    self.logger.debug(
-        f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
-    )
-    return True
-
-'''
-'''def _sample_color_reset_optimized(self):
-    """
-    Optimized version of sample color reset using set_samples_color.
-    """
-    return self.set_samples_color(by=None)
-'''
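
Note: the removed _add_sample_standard body assigned study-wide feature_uid values by offsetting a per-sample integer range and then appending the new rows with a diagonal concat, so samples with differing column sets can still be stacked. A self-contained sketch of that pattern with polars; the toy columns and values are illustrative only:

import polars as pl

# Existing study-level feature table and a newly loaded sample's features (toy data).
study_features = pl.DataFrame({"feature_uid": [1, 2, 3], "mz": [100.0, 200.0, 300.0]})
sample_features = pl.DataFrame({"mz": [150.0, 250.0], "rt": [10.0, 20.0]})

# Continue numbering after the highest uid already present in the study.
offset = study_features["feature_uid"].max() + 1
sample_features = sample_features.with_columns(
    pl.int_range(pl.len()).add(offset).alias("feature_uid"),
)

# Diagonal concat tolerates the differing column sets; missing columns become null.
study_features = pl.concat([study_features, sample_features], how="diagonal")
print(study_features)
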