masster 0.3.17-py3-none-any.whl → 0.3.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/study/load.py CHANGED
@@ -45,12 +45,12 @@ def add(
     """Add samples from a folder to the study.
 
     Args:
-        folder (str, optional): Path to folder containing sample files.
+        folder (str, optional): Path to folder containing sample files.
            Defaults to study folder or current working directory.
        reset (bool, optional): Whether to reset the study before adding samples.
            Defaults to False.
        adducts (optional): Adducts to use for sample loading. Defaults to None.
-        max_files (int, optional): Maximum number of files to process.
+        max_files (int, optional): Maximum number of files to process.
            Defaults to None (no limit).
        fast (bool, optional): Whether to use optimized loading that skips ms1_df
            for better performance. Defaults to True.
@@ -104,21 +104,27 @@ def add(
         for file in files:
             if max_files is not None and counter >= max_files:
                 break
-
+
             # Get filename without extension for blacklist check
             basename = os.path.basename(file)
             filename_no_ext = os.path.splitext(basename)[0]
-
+
             # Check if this filename (without extension) is already in blacklist
             if filename_no_ext not in blacklist:
                 files_to_process.append(file)
-                if len(files_to_process) + counter >= (max_files or float('inf')):
+                if len(files_to_process) + counter >= (max_files or float("inf")):
                     break
-
+
         # Batch process all files of this extension using ultra-optimized method
        if files_to_process:
            self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
-            successful = self._add_samples_batch(files_to_process, reset=reset, adducts=adducts, blacklist=blacklist, fast=fast)
+            successful = self._add_samples_batch(
+                files_to_process,
+                reset=reset,
+                adducts=adducts,
+                blacklist=blacklist,
+                fast=fast,
+            )
            counter += successful
            if successful > 0:
                not_zero = True
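
The `(max_files or float("inf"))` guard in this hunk (only its quote style changes in this release) is the usual "None means no limit" idiom; because `0` is also falsy, `max_files=0` falls through to unlimited as well. A minimal sketch with a hypothetical `effective_limit` helper:

    def effective_limit(max_files):
        # None -- and, as a side effect, 0 -- mean "no limit".
        return max_files or float("inf")

    assert effective_limit(None) == float("inf")
    assert effective_limit(5) == 5
    assert effective_limit(0) == float("inf")  # 0 is treated as unlimited
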
@@ -140,7 +146,7 @@ def add(
 def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
     """
     Add a single sample to the study.
-
+
     Args:
         file (str): Path to the sample file
         type (str, optional): File type to force. Defaults to None (auto-detect).
@@ -148,31 +154,31 @@ def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
         adducts (optional): Adducts to use for sample loading. Defaults to None.
         fast (bool, optional): Whether to use optimized loading that skips ms1_df
             for better performance. Defaults to True.
-
+
     Returns:
         bool: True if successful, False otherwise.
     """
     if fast:
         # Use optimized method for better performance
         success = self._add_sample_optimized(
-            file,
-            type=type,
-            reset=reset,
+            file,
+            type=type,
+            reset=reset,
             adducts=adducts,
             skip_color_reset=False,  # Do color reset for individual calls
-            skip_schema_check=True  # Skip schema check for performance (safe with diagonal concat)
+            skip_schema_check=True,  # Skip schema check for performance (safe with diagonal concat)
         )
     else:
         # Use standard method with full ms1_df loading
         success = self._add_sample_standard(
-            file,
-            type=type,
-            reset=reset,
+            file,
+            type=type,
+            reset=reset,
             adducts=adducts,
             skip_color_reset=False,  # Do color reset for individual calls
-            skip_schema_check=True  # Skip schema check for performance
+            skip_schema_check=True,  # Skip schema check for performance
         )
-
+
     return success
 
 
@@ -1193,17 +1199,18 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
 
+
 def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
     """
     Optimized batch addition of samples.
-
+
     Args:
         files (list): List of file paths to process
         reset (bool): Whether to reset features before processing
         adducts: Adducts to use for sample loading
         blacklist (set): Set of filenames already processed
         fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
-
+
     Performance optimizations:
         1. No per-sample color reset
         2. No schema enforcement during addition
@@ -1212,126 +1219,135 @@ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, f
     """
     if not files:
         return 0
-
+
     if blacklist is None:
         blacklist = set()
-
+
     self.logger.debug(f"Starting batch addition of {len(files)} samples (fast={fast})...")
-
+
     successful_additions = 0
     failed_additions = 0
-
+
     # Progress reporting setup
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     for i, file in enumerate(
         tqdm(
             files,
             total=len(files),
             desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Batch add",
             disable=tqdm_disable,
-        )
+        ),
     ):
         try:
             # Choose between optimized and standard loading
             if fast:
                 success = self._add_sample_optimized(
-                    file,
-                    reset=reset,
+                    file,
+                    reset=reset,
                     adducts=adducts,
                     skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True  # Skip schema enforcement
+                    skip_schema_check=True,  # Skip schema enforcement
                 )
             else:
                 success = self._add_sample_standard(
-                    file,
-                    reset=reset,
+                    file,
+                    reset=reset,
                     adducts=adducts,
                     skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True  # Skip schema enforcement
+                    skip_schema_check=True,  # Skip schema enforcement
                 )
-
+
             if success:
                 # Add to blacklist for filename tracking
                 basename = os.path.basename(file)
                 filename_no_ext = os.path.splitext(basename)[0]
                 blacklist.add(filename_no_ext)
                 successful_additions += 1
-
+
         except Exception as e:
             self.logger.warning(f"Failed to add sample {file}: {e}")
             failed_additions += 1
             continue
-
+
     # Final cleanup operations done once at the end
     if successful_additions > 0:
         self.logger.debug("Performing final batch cleanup...")
-
+
         # Optional: Only do schema enforcement if specifically needed (usually not required)
         # self._ensure_features_df_schema_order()
-
+
         # Color assignment done once for all samples
         self._sample_color_reset_optimized()
-
+
     self.logger.debug(f"Batch addition complete: {successful_additions} successful, {failed_additions} failed")
-
+
     return successful_additions
 
-def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+
+def _add_sample_optimized(
+    self,
+    file,
+    type=None,
+    reset=False,
+    adducts=None,
+    skip_color_reset=True,
+    skip_schema_check=True,
+):
     """
     Optimized add_sample with performance improvements integrated.
-
+
     Removes:
     - Schema enforcement (_ensure_features_df_schema_order)
     - Complex column alignment and type casting
     - Per-addition color reset
     - Unnecessary column reordering
-
+
     Returns True if successful, False otherwise.
     """
     self.logger.debug(f"Adding: {file}")
-
+
     # Basic validation
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
         return False
-
+
     if not os.path.exists(file):
         self.logger.error(f"File {file} does not exist.")
         return False
-
+
     if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         self.logger.error(f"Unsupported file type: {file}")
         return False
-
+
     # Load sample
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
     # Use standard loading method temporarily to test if this fixes the astuple error
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
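
The progress-bar setup in the hunk above is self-contained enough to lift out. A standalone sketch, with `log_label` and `log_level` as stand-ins for the study's `self.log_label` and `self.log_level` attributes (their values here are assumptions):

    from datetime import datetime
    from tqdm import tqdm

    log_label = "Study | "  # stand-in for self.log_label
    log_level = "WARNING"   # stand-in for self.log_level
    files = ["a.mzML", "b.mzML", "c.mzML"]

    # The bar only renders when the logger is at INFO verbosity or finer.
    tqdm_disable = log_level not in ["TRACE", "DEBUG", "INFO"]

    for i, file in enumerate(
        tqdm(
            files,
            total=len(files),
            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {log_label}Batch add",
            disable=tqdm_disable,
        ),
    ):
        pass  # per-file work goes here

The `desc` string mimics the logger's timestamped prefix so the bar lines up with surrounding log output; the trailing comma after `tqdm(...)` is the same formatting convention this release applies throughout.
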
@@ -1345,7 +1361,7 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1357,7 +1373,7 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1375,11 +1391,11 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df) - 1
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
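
One behavioral change hides among the formatting fixes above: `current_sample_uid` moves from `len(self.samples_df) - 1` to `len(self.samples_df)`, i.e. from a 0-based row index to what appears to be a 1-based uid, matching the 1-based `feature_uid` ranges assigned in the next hunk. A minimal sketch of the difference, assuming a Polars `samples_df`:

    import polars as pl

    samples_df = pl.DataFrame({"sample_name": ["s1", "s2"]})
    new_sample = pl.DataFrame({"sample_name": ["s3"]})
    samples_df = pl.concat([samples_df, new_sample])

    old_uid = len(samples_df) - 1  # 2: 0-based row index of the new sample
    new_uid = len(samples_df)      # 3: 1-based uid (0.3.19 behavior)
    assert (old_uid, new_uid) == (2, 3)
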
@@ -1387,92 +1403,100 @@ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # OPTIMIZED: Use diagonal concatenation without any schema enforcement
     # This is the fastest concatenation method in Polars and handles type mismatches automatically
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     # REMOVED ALL EXPENSIVE OPERATIONS:
-    # - No _ensure_features_df_schema_order()
+    # - No _ensure_features_df_schema_order()
     # - No complex column alignment
     # - No type casting loops
     # - No sample_color_reset()
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (optimized)")
     return True
 
 
-def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+def _add_sample_standard(
+    self,
+    file,
+    type=None,
+    reset=False,
+    adducts=None,
+    skip_color_reset=True,
+    skip_schema_check=True,
+):
     """
     Standard add_sample method that uses full sample loading (includes ms1_df).
-
+
     This method uses the standard sample.load() method which loads all data
     including ms1_df, providing full functionality but potentially slower performance
     for large MS1 datasets.
-
+
     Returns True if successful, False otherwise.
     """
     self.logger.debug(f"Adding (standard): {file}")
-
+
     # Basic validation
     basename = os.path.basename(file)
     sample_name = os.path.splitext(basename)[0]
-
+
     if sample_name in self.samples_df["sample_name"].to_list():
         self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
         return False
-
+
     if not os.path.exists(file):
         self.logger.error(f"File {file} does not exist.")
         return False
-
+
     if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
         self.logger.error(f"Unsupported file type: {file}")
         return False
-
+
     # Load sample using standard method (includes ms1_df)
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
     # Use standard loading method that loads all data including ms1_df
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
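
The "diagonal" strategy both methods rely on is Polars' schema-merging concatenation: columns missing from either frame are added and null-filled, which is why per-sample column alignment can be skipped. A small sketch of the behavior:

    import polars as pl

    a = pl.DataFrame({"mz": [100.05], "rt": [12.3]})
    b = pl.DataFrame({"mz": [200.10], "rt": [45.6], "chrom_area": [-1.0]})

    # Rows from `a` get a null chrom_area; no manual column alignment needed.
    combined = pl.concat([a, b], how="diagonal")
    print(combined.columns)                  # ['mz', 'rt', 'chrom_area']
    print(combined["chrom_area"].to_list())  # [None, -1.0]
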
@@ -1486,7 +1510,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1498,7 +1522,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1516,11 +1540,11 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df) - 1
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1528,52 +1552,53 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
     return True
+    ## COMMENT AR: Is this intentional?
     # Use standard loading method that loads all data including ms1_df
     ddaobj.load(file)
-
+
     if ddaobj.features_df is None and not reset:
         ddaobj.features = None
-
+
     if ddaobj.features is None or reset:
         ddaobj.find_features()
         ddaobj.find_adducts(adducts=adducts)
         ddaobj.find_ms2()
-
+
     self.features_maps.append(ddaobj.features)
-
+
     # Determine sample type
     sample_type = "sample" if type is None else type
     if "qc" in sample_name.lower():
         sample_type = "qc"
     if "blank" in sample_name.lower():
         sample_type = "blank"
-
+
     map_id_value = len(self.features_maps) - 1
-
+
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
@@ -1587,7 +1612,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
         ddaobj.save(final_sample_path)
         self.logger.debug(f"Saved converted sample: {final_sample_path}")
-
+
     # Efficient scan counting
     ms1_count = ms2_count = 0
     if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
@@ -1599,7 +1624,7 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
                 ms1_count = count
             elif level == 2:
                 ms2_count = count
-
+
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
     new_sample = pl.DataFrame({
@@ -1617,11 +1642,11 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         "num_ms1": [ms1_count],
         "num_ms2": [ms2_count],
     })
-
+
     self.samples_df = pl.concat([self.samples_df, new_sample])
-
+
     # SIMPLIFIED feature processing
-    current_sample_uid = len(self.samples_df) - 1
+    current_sample_uid = len(self.samples_df)
 
     # Add required columns with minimal operations
     columns_to_add = [
@@ -1629,28 +1654,28 @@ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_
         pl.lit(False).alias("filled"),
         pl.lit(-1.0).alias("chrom_area"),
     ]
-
+
     # Only add rt_original if it doesn't exist
     if "rt_original" not in ddaobj.features_df.columns:
         columns_to_add.append(pl.col("rt").alias("rt_original"))
-
+
     f_df = ddaobj.features_df.with_columns(columns_to_add)
-
+
     if self.features_df.is_empty():
         # First sample
         self.features_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(1).alias("feature_uid")
+            pl.int_range(pl.len()).add(1).alias("feature_uid"),
         )
     else:
         # Subsequent samples - minimal overhead
         offset = self.features_df["feature_uid"].max() + 1
         f_df = f_df.with_columns(
-            pl.int_range(pl.len()).add(offset).alias("feature_uid")
+            pl.int_range(pl.len()).add(offset).alias("feature_uid"),
         )
-
+
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
-
+
     self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
     return True
 
@@ -1662,36 +1687,38 @@ def _sample_color_reset_optimized(self):
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     # Cache the colormap if not already cached
-    if not hasattr(self, '_cached_colormap'):
+    if not hasattr(self, "_cached_colormap"):
         try:
             from cmap import Colormap
-            self._cached_colormap = Colormap('turbo')
+
+            self._cached_colormap = Colormap("turbo")
         except ImportError:
             self.logger.warning("cmap package not available, using default colors")
             return
-
+
     cm = self._cached_colormap
     n_samples = len(self.samples_df)
-
+
     # Pre-allocate colors list for better performance
     colors = [None] * n_samples
-
+
     # Vectorized color generation
     for i in range(n_samples):
         normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
         color_rgba = cm(normalized_value)
-
+
         if len(color_rgba) >= 3:
             r, g, b = color_rgba[:3]
             if max(color_rgba[:3]) <= 1.0:
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
             colors[i] = f"#{r:02x}{g:02x}{b:02x}"
-
+
     # Update the sample_color column efficiently
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color")
+        pl.Series("sample_color", colors).alias("sample_color"),
     )
-
+
     self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
+
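
For reference, the color-assignment loop in the final hunk runs on its own. A standalone sketch using the same `cmap` package and `turbo` map as the code above (the sample count is illustrative):

    from cmap import Colormap

    cm = Colormap("turbo")
    n_samples = 4
    colors = [None] * n_samples

    for i in range(n_samples):
        # Sample the middle 80% of the map so no sample lands on an extreme color.
        normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
        color_rgba = cm(normalized_value)
        if len(color_rgba) >= 3:
            r, g, b = color_rgba[:3]
            if max(color_rgba[:3]) <= 1.0:  # floats in [0, 1] -> 8-bit channels
                r, g, b = int(r * 255), int(g * 255), int(b * 255)
            colors[i] = f"#{r:02x}{g:02x}{b:02x}"

    print(colors)  # four hex strings spread across the turbo colormap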