masster-0.3.17-py3-none-any.whl → masster-0.3.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster has been flagged as possibly problematic.

masster/_version.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations


-__version__ = "0.3.17"
+__version__ = "0.3.19"


 def get_version():
masster/sample/h5.py CHANGED
@@ -900,7 +900,7 @@ def _load_sample5(self, filename: str, map: bool = True):
 def _load_sample5_study(self, filename: str, map: bool = True):
     """
     Optimized variant of _load_sample5 for study loading that skips reading ms1_df.
-
+
     This is used when adding samples to studies where ms1_df data is not needed,
     improving loading throughput by skipping the potentially large ms1_df dataset.

masster/sample/helpers.py CHANGED
@@ -176,7 +176,7 @@ def _get_feature_uids(self, features=None, verbose=True):
     if not isinstance(features, pd.DataFrame):
         if verbose:
             self.logger.error(
-                "Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame."
+                "Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.",
             )
         return []

@@ -298,7 +298,7 @@ def get_eic(self, mz, mz_tol=None):
     """
     # Use default mz_tol from sample parameters if not provided
     if mz_tol is None:
-        if hasattr(self, 'parameters') and hasattr(self.parameters, 'eic_mz_tol'):
+        if hasattr(self, "parameters") and hasattr(self.parameters, "eic_mz_tol"):
            mz_tol = self.parameters.eic_mz_tol
        else:
            mz_tol = 0.01  # fallback default
@@ -323,11 +323,7 @@ def get_eic(self, mz, mz_tol=None):
         return None

     # Aggregate intensities per retention time. Use sum in case multiple points per rt.
-    chrom = (
-        matches.group_by("rt")
-        .agg([pl.col("inty").sum().alias("inty")])
-        .sort("rt")
-    )
+    chrom = matches.group_by("rt").agg([pl.col("inty").sum().alias("inty")]).sort("rt")

     # Attach to Sample
     self.chrom_df = chrom
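
The collapsed one-liner is the usual Polars group-by idiom; behavior is unchanged. A runnable toy example, with a small frame standing in for `matches`, shows the semantics (sum per retention time, sorted):

    import polars as pl

    # Toy stand-in for the `matches` frame filtered by m/z in get_eic()
    matches = pl.DataFrame({
        "rt": [10.0, 10.0, 12.5, 13.0],
        "inty": [100.0, 50.0, 75.0, 20.0],
    })

    # Identical semantics to the multi-line form it replaces: group, sum, sort.
    chrom = matches.group_by("rt").agg([pl.col("inty").sum().alias("inty")]).sort("rt")
    print(chrom)  # rt 10.0 -> 150.0, 12.5 -> 75.0, 13.0 -> 20.0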
masster/sample/load.py CHANGED
@@ -119,7 +119,7 @@ def load_study(
 ):
     """
     Optimized load method for study use that skips loading ms1_df for better performance.
-
+
     This method is identical to load() but uses _load_sample5_study() for .sample5 files,
     which skips reading the potentially large ms1_df dataset to improve throughput when
     adding samples to studies.
@@ -983,7 +983,7 @@ def index_file(self):
             self.set_source(self.file_source.replace(".sample5", ".mzml"))
         else:
             raise FileNotFoundError(
-                f"File {self.file_source} not found. Did the path change? Consider running source()."
+                f"File {self.file_source} not found. Did the path change? Consider running source().",
             )
         self.index_file()
     else:
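
Both `load_study()` and `_load_sample5_study()` exist to avoid reading the large `ms1_df` dataset when a sample is only being attached to a study. A minimal sketch of that "skip the expensive dataset" pattern with h5py; the layout (one HDF5 group per dataframe) and names are illustrative, not masster's actual .sample5 schema:

    import h5py

    def load_sample5(path: str, skip_ms1: bool = False) -> dict:
        """Load datasets from a .sample5-style HDF5 file, optionally skipping ms1_df."""
        out = {}
        with h5py.File(path, "r") as f:
            for name in f.keys():
                if skip_ms1 and name == "ms1_df":
                    continue  # the large dataset is never read from disk
                # assumes each top-level member is a group of column datasets
                out[name] = {col: f[name][col][()] for col in f[name].keys()}
        return out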
masster/sample/plot.py CHANGED
@@ -87,9 +87,10 @@ def _is_notebook_environment():
     # Check if marimo is in modules
     if "marimo" in sys.modules:
         return True
-
+
     # Check for marimo in the call stack or environment
     import inspect
+
     frame = inspect.currentframe()
     try:
         while frame:
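
For context, `_is_notebook_environment()` decides whether plots render inline. A simplified, hedged sketch of the detection idea this hunk touches: check `sys.modules` first, then walk the interpreter call stack; masster's real function checks more environments than this:

    import sys
    import inspect

    def _looks_like_marimo() -> bool:
        if "marimo" in sys.modules:
            return True
        frame = inspect.currentframe()
        try:
            while frame:
                # a caller module named marimo.* suggests a marimo notebook
                if "marimo" in frame.f_globals.get("__name__", ""):
                    return True
                frame = frame.f_back
        finally:
            del frame  # break the reference cycle currentframe() can create
        return False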
masster/study/export.py CHANGED
@@ -445,7 +445,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
     mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
     mtd_lines.append(
-        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]"
+        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
     )
     mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
     mtd_lines.append("")
@@ -499,8 +499,16 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     # Use the matrix as-is since it already has the correct sample columns
     # The matrix columns are sample names, which is what we want for the assay columns

-    # round to int
-    abundance_matrix = abundance_matrix.round(0)
+    # round to int - handle both Polars and Pandas DataFrames
+    if hasattr(abundance_matrix, 'with_columns'):
+        # Polars DataFrame
+        numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
+        abundance_matrix = abundance_matrix.with_columns([
+            abundance_matrix[col].round(0) for col in numeric_cols
+        ])
+    else:
+        # Pandas DataFrame
+        abundance_matrix = abundance_matrix.round(0)

     # Use actual number of samples from the abundance matrix
     n_assays = len(abundance_matrix.columns)
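
The new branch dispatches on the Polars-only `with_columns` attribute so one code path handles either frame type. A self-contained sketch of that dual-path rounding:

    import polars as pl
    import pandas as pd

    def round_numeric(df):
        if hasattr(df, "with_columns"):  # Polars
            numeric = [c for c in df.columns if df[c].dtype.is_numeric()]
            return df.with_columns([df[c].round(0) for c in numeric])
        return df.round(0)  # pandas rounds numeric columns and leaves the rest

    print(round_numeric(pl.DataFrame({"a": [1.4, 2.6], "s": ["x", "y"]})))
    print(round_numeric(pd.DataFrame({"a": [1.4, 2.6], "s": ["x", "y"]})))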
@@ -570,9 +578,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         ]
         # Add abundance values for each assay
         consensus_uid = row["consensus_uid"]
-        if consensus_uid in abundance_matrix.index:
-            abundance_values = abundance_matrix.loc[consensus_uid].tolist()
-            sml_row += [str(val) if pd.notna(val) else "null" for val in abundance_values]
+        # Check if consensus_uid exists in the abundance_matrix (Polars)
+        filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+        if filtered_matrix.height > 0:
+            # Get the first (and should be only) matching row
+            abundance_row = filtered_matrix.row(0, named=True)
+            # Extract values excluding the consensus_uid column
+            abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+            sml_row += [str(val) if val is not None else "null" for val in abundance_values]
         else:
             sml_row += ["null"] * n_assays
         sml_row += ["null", "null"]
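
The pandas idiom (`uid in df.index` plus `.loc`) has no direct Polars equivalent, so the lookup becomes a filter on an explicit `consensus_uid` column. A toy sketch of the replacement, with illustrative column names:

    import polars as pl

    abundance_matrix = pl.DataFrame({
        "consensus_uid": [1, 2],
        "sample_a": [100.0, None],
        "sample_b": [250.0, 80.0],
    })

    uid = 1
    hit = abundance_matrix.filter(pl.col("consensus_uid") == uid)
    if hit.height > 0:
        row = hit.row(0, named=True)  # dict: column name -> value
        values = [row[c] for c in abundance_matrix.columns if c != "consensus_uid"]
        cells = [str(v) if v is not None else "null" for v in values]
    else:
        cells = ["null"] * (len(abundance_matrix.columns) - 1)
    print(cells)  # ['100.0', '250.0']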
@@ -615,11 +628,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             str(row.get("retention_time_in_seconds_start", "null")),
             str(row.get("retention_time_in_seconds_end", "null")),
         ]
-        # Add abundance values for each assay - same as SML
+        # Add abundance values for each assay - same as SML (Polars)
         consensus_uid = row["consensus_uid"]
-        if consensus_uid in abundance_matrix.index:
-            abundance_values = abundance_matrix.loc[consensus_uid].tolist()
-            smf_row += [str(val) if pd.notna(val) else "null" for val in abundance_values]
+        filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+        if filtered_matrix.height > 0:
+            # Get the first (and should be only) matching row
+            abundance_row = filtered_matrix.row(0, named=True)
+            # Extract values excluding the consensus_uid column
+            abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+            smf_row += [str(val) if val is not None else "null" for val in abundance_values]
         else:
             smf_row += ["null"] * n_assays
         smf_lines.append("\t".join(smf_row))
masster/study/h5.py CHANGED
@@ -94,7 +94,7 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
             numeric_cols.append(col)

     logger.debug(
-        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns",
     )

     # Process numeric columns in batch (most efficient)
@@ -277,7 +277,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
             results[chunk_start] = chunk_result
         except Exception as e:
             logger.warning(
-                f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}"
+                f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}",
             )
             # Fallback to simple string conversion for this chunk
             chunk = data_list[chunk_start : chunk_start + chunk_size]
@@ -435,7 +435,7 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
             group.create_dataset(col, data=data_as_str, compression=compression)
         else:
             logger.warning(
-                f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column."
+                f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
             )
     elif dtype == "string":
         # Handle string columns
@@ -698,17 +698,17 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # Get available columns from HDF5 file
     hdf5_columns = list(group.keys())
     logger.debug(f"HDF5 columns available: {hdf5_columns}")
-
+
     # Handle column name migrations for backward compatibility first
     if df_name == "samples_df":
         # Migrate old column names to new names
         column_migrations = {
             "size": "num_features",
-            "file_source": "sample_source",
+            "file_source": "sample_source",
             "ms1": "num_ms1",
-            "ms2": "num_ms2"
+            "ms2": "num_ms2",
         }
-
+
     # Create a mapping of what's actually available after migrations
     effective_columns = hdf5_columns.copy()
     for old_name, new_name in column_migrations.items():
@@ -720,14 +720,14 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # First pass: load all existing columns (including migrated ones)
     for col in schema_columns or []:
         source_col = col
-
+
         # Check if we need to load from a migrated column name
         if df_name == "samples_df":
             column_migrations = {
                 "size": "num_features",
-                "file_source": "sample_source",
+                "file_source": "sample_source",
                 "ms1": "num_ms1",
-                "ms2": "num_ms2"
+                "ms2": "num_ms2",
             }
             # Reverse lookup - find old name for new name
             reverse_migrations = {v: k for k, v in column_migrations.items()}
@@ -736,7 +736,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
             if old_name in group:
                 source_col = old_name
                 logger.info(f"Loading '{col}' from old column name '{old_name}'")
-
+
         if source_col not in group:
             missing_columns.append(col)
             continue
@@ -829,12 +829,12 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     if df_name == "samples_df":
         column_migrations = {
             "size": "num_features",
-            "file_source": "sample_source",
+            "file_source": "sample_source",
             "ms1": "num_ms1",
-            "ms2": "num_ms2"
+            "ms2": "num_ms2",
        }
         migrated_old_names = set(column_migrations.keys())
-
+
     extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

     for col in extra_columns:
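
The same migration dict appears in three hunks. A compact sketch of the lookup it implements, resolving a schema column to whichever name is actually stored on disk; the helper name and shape are illustrative, not masster's API:

    COLUMN_MIGRATIONS = {
        "size": "num_features",
        "file_source": "sample_source",
        "ms1": "num_ms1",
        "ms2": "num_ms2",
    }
    REVERSE_MIGRATIONS = {new: old for old, new in COLUMN_MIGRATIONS.items()}

    def resolve_source_column(col: str, stored_columns: set) -> str | None:
        """Return the name to read `col` from, preferring the new name."""
        if col in stored_columns:
            return col
        old = REVERSE_MIGRATIONS.get(col)
        if old is not None and old in stored_columns:
            return old  # legacy file: read the old column, store under the new name
        return None  # genuinely missing

    print(resolve_source_column("num_ms1", {"ms1", "ms2", "size"}))  # -> "ms1"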
@@ -974,7 +974,7 @@ def _save_study5_compressed(self, filename=None):

         pbar.update(1)
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
         )

         # Store samples_df - use optimized batch processing
@@ -987,7 +987,7 @@ def _save_study5_compressed(self, filename=None):
         # Store features_df - use fast method that skips chrom and ms2_specs columns
         if self.features_df is not None and not self.features_df.is_empty():
             self.logger.debug(
-                f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)"
+                f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
             )
             _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
         pbar.update(1)
@@ -1066,7 +1066,7 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
             numeric_cols.append(col)

     logger.debug(
-        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns",
     )

     # Process numeric columns in batch (most efficient)
@@ -1184,7 +1184,7 @@ def _save_study5(self, filename=None):

         pbar.update(1)
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
         )

         # Store samples_df - use optimized batch processing
@@ -1309,7 +1309,7 @@ def _load_study5(self, filename=None):
     ) as pbar:
         # Load metadata
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata",
         )
         if "metadata" in f:
             metadata = f["metadata"]
@@ -1371,7 +1371,7 @@ def _load_study5(self, filename=None):

         # Load samples_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
         )
         if "samples" in f and len(f["samples"].keys()) > 0:
             self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
@@ -1411,7 +1411,7 @@ def _load_study5(self, filename=None):
         pbar.update(1)
         # Load samples_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
         )
         if "samples" in f and len(f["samples"].keys()) > 0:
             self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
@@ -1452,12 +1452,16 @@ def _load_study5(self, filename=None):

         # Load features_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features",
         )
         if "features" in f and len(f["features"].keys()) > 0:
             object_columns = ["chrom", "ms2_scans", "ms2_specs"]
             self.features_df = _load_dataframe_from_group(
-                f["features"], schema, "features_df", self.logger, object_columns
+                f["features"],
+                schema,
+                "features_df",
+                self.logger,
+                object_columns,
             )
         else:
             self.features_df = None
@@ -1465,7 +1469,7 @@ def _load_study5(self, filename=None):

         # Load consensus_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus",
         )
         if "consensus" in f and len(f["consensus"].keys()) > 0:
             # Only include adducts in object_columns if it actually exists in the file
@@ -1474,7 +1478,11 @@ def _load_study5(self, filename=None):
                 object_columns.append("adducts")

             self.consensus_df = _load_dataframe_from_group(
-                f["consensus"], schema, "consensus_df", self.logger, object_columns
+                f["consensus"],
+                schema,
+                "consensus_df",
+                self.logger,
+                object_columns,
             )

             # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
@@ -1507,22 +1515,28 @@ def _load_study5(self, filename=None):

         # Load consensus_mapping_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
         )
         if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
             self.consensus_mapping_df = _load_dataframe_from_group(
-                f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+                f["consensus_mapping"],
+                schema,
+                "consensus_mapping_df",
+                self.logger,
             )
         else:
             self.consensus_mapping_df = None
         pbar.update(1)
         # Load consensus_mapping_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
         )
         if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
             self.consensus_mapping_df = _load_dataframe_from_group(
-                f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+                f["consensus_mapping"],
+                schema,
+                "consensus_mapping_df",
+                self.logger,
             )
         else:
             self.consensus_mapping_df = None
@@ -1530,34 +1544,38 @@ def _load_study5(self, filename=None):

         # Load consensus_ms2
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2",
         )
         if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
             object_columns = ["spec"]
             self.consensus_ms2 = _load_dataframe_from_group(
-                f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns
+                f["consensus_ms2"],
+                schema,
+                "consensus_ms2",
+                self.logger,
+                object_columns,
             )
         else:
             self.consensus_ms2 = None
         pbar.update(1)

         # Check and migrate old string-based map_id to integer indices
-        if (self.samples_df is not None and
-                not self.samples_df.is_empty() and
-                self.samples_df['map_id'].dtype == pl.Utf8):
+        if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
             self.logger.info("Detected old string-based map_id format, migrating to integer indices")
-
+
             # Convert string-based map_id to integer indices
             sample_count = len(self.samples_df)
             new_map_ids = list(range(sample_count))
-
+
             self.samples_df = self.samples_df.with_columns(
-                pl.lit(new_map_ids).alias("map_id")
+                pl.lit(new_map_ids).alias("map_id"),
             )
-
+
             # Ensure the column is Int64 type
             self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
-            self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})")
+
+            self.logger.info(
+                f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
            )

         self.logger.debug("Study loaded")
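
A runnable sketch of the map_id migration at the end of this hunk: a legacy string column is replaced by positional integer indices and cast to Int64. Toy frame; `pl.Series` stands in here for the `pl.lit(...).alias(...)` used in the actual code:

    import polars as pl

    samples_df = pl.DataFrame({"sample": ["a", "b", "c"], "map_id": ["m0", "m1", "m2"]})

    if samples_df["map_id"].dtype == pl.Utf8:
        new_map_ids = list(range(len(samples_df)))
        samples_df = samples_df.with_columns(
            pl.Series("map_id", new_map_ids),
        ).cast({"map_id": pl.Int64})

    print(samples_df)  # map_id is now 0, 1, 2 as Int64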