masster-0.5.0-py3-none-any.whl → masster-0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



masster/_version.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 
-__version__ = "0.5.0"
+__version__ = "0.5.1"
 
 
 def get_version():
masster/data/libs/aa.csv ADDED
@@ -0,0 +1,22 @@
+name,smiles,inchikey,formula,db_id,db
+L-Glutamic acid,N[C@@H](CCC(O)=O)C(O)=O,WHUUTDBJXJRKMK-VKHMYHEASA-N,C5H9NO4,CID:33032,pubchem
+L-Tyrosine,N[C@@H](CC1=CC=C(O)C=C1)C(O)=O,OUYCCCASQSFEME-QMMMGPOBSA-N,C9H11NO3,CID:6057,pubchem
+L-Phenylalanine,N[C@@H](CC1=CC=CC=C1)C(O)=O,COLNVLDHVKWLRT-QMMMGPOBSA-N,C9H11NO2,CID:6140,pubchem
+L-Alanine,C[C@H](N)C(O)=O,QNAYBMKLOCPYGJ-REOHCLBHSA-N,C3H7NO2,CID:5950,pubchem
+L-Proline,OC(=O)[C@@H]1CCCN1,ONIBWKKTOPOVIA-BYPYZUCNSA-N,C5H9NO2,CID:145742,pubchem
+L-Threonine,C[C@@H](O)[C@H](N)C(O)=O,AYFVYJQAPQTCCC-GBXIJSLDSA-N,C4H9NO3,CID:6288,pubchem
+L-Asparagine,N[C@@H](CC(N)=O)C(O)=O,DCXYFEDJOCDNAF-REOHCLBHSA-N,C4H8N2O3,CID:6267,pubchem
+L-Isoleucine,CC[C@H](C)[C@H](N)C(O)=O,AGPKZVBTJJNPAG-WHFBIAKZSA-N,C6H13NO2,CID:6306,pubchem
+L-Histidine,N[C@@H](CC1=CN=CN1)C(O)=O,HNDVDQJCIGZPNO-YFKPBYRVSA-N,C6H9N3O2,CID:6274,pubchem
+L-Lysine,NCCCC[C@H](N)C(O)=O,KDXKERNSBIXSRK-YFKPBYRVSA-N,C6H14N2O2,CID:5962,pubchem
+L-Serine,N[C@@H](CO)C(O)=O,MTCFGRXMJLQNBG-REOHCLBHSA-N,C3H7NO3,CID:5951,pubchem
+L-Aspartic acid,N[C@@H](CC(O)=O)C(O)=O,CKLJMWTZIZZHCS-REOHCLBHSA-N,C4H7NO4,CID:5960,pubchem
+L-Cystine,N[C@@H](CSSC[C@H](N)C(O)=O)C(O)=O,LEVWYRKDKASIDU-IMJSIDKUSA-N,C6H12N2O4S2,CID:67678,pubchem
+L-Arginine,N[C@@H](CCCNC(N)=N)C(O)=O,ODKSFYDXXFIFQN-BYPYZUCNSA-N,C6H14N4O2,CID:6322,pubchem
+L-Cysteine,N[C@@H](CS)C(O)=O,XUJNEKJLAYXESH-REOHCLBHSA-N,C3H7NO2S,CID:5862,pubchem
+L-Glutamine,N[C@@H](CCC(N)=O)C(O)=O,ZDXPYRJPNDTMRX-VKHMYHEASA-N,C5H10N2O3,CID:5961,pubchem
+L-Leucine,CC(C)C[C@H](N)C(O)=O,ROHFNLRQFUQHCH-YFKPBYRVSA-N,C6H13NO2,CID:6106,pubchem
+L-Methionine,CSCC[C@H](N)C(O)=O,FFEARJCKVFRZRR-BYPYZUCNSA-N,C5H11NO2S,CID:6137,pubchem
+L-Valine,CC(C)[C@H](N)C(O)=O,KZSNJWFQEVHDMF-BYPYZUCNSA-N,C5H11NO2,CID:6287,pubchem
+L-Tryptophan,N[C@@H](CC1=CNC2=C1C=CC=C2)C(O)=O,QIVBCDIJIAJPQS-VIFPVBQESA-N,C11H12N2O2,CID:6305,pubchem
+Glycine,NCC(O)=O,QNAYBMKLOCPYGJ-UHFFFAOYSA-N,C2H5NO2,CID:750,Glycine
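
The new aa.csv library ships 21 amino-acid entries with name, SMILES, InChIKey, formula, db_id and db columns. A minimal sketch of inspecting it with Polars; the in-wheel path is taken from the RECORD section below, and in normal use the file is consumed through Study.lib_load() rather than read directly:

import polars as pl

# Quick look at the bundled amino-acid library (illustration only;
# masster itself loads this CSV via lib_load()).
aa = pl.read_csv("masster/data/libs/aa.csv")
print(aa.columns)  # ['name', 'smiles', 'inchikey', 'formula', 'db_id', 'db']
print(aa.height)   # 21 compounds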
masster/lib/lib.py CHANGED
@@ -123,11 +123,13 @@ class Lib:
     "inchi": pl.Series([], dtype=pl.Utf8),
     "inchikey": pl.Series([], dtype=pl.Utf8),
     "formula": pl.Series([], dtype=pl.Utf8),
+    "iso": pl.Series([], dtype=pl.Int64),
     "adduct": pl.Series([], dtype=pl.Utf8),
     "m": pl.Series([], dtype=pl.Float64),
     "z": pl.Series([], dtype=pl.Int8),
     "mz": pl.Series([], dtype=pl.Float64),
     "rt": pl.Series([], dtype=pl.Float64),
+    "quant_group": pl.Series([], dtype=pl.Int64),
     "db_id": pl.Series([], dtype=pl.Utf8),
     "db": pl.Series([], dtype=pl.Utf8),
 })
@@ -245,11 +247,13 @@ class Lib:
     "inchi": compound_data.get("inchi", ""),
     "inchikey": compound_data.get("inchikey", ""),
     "formula": compound_data["formula"],
+    "iso": 0,  # Default to zero
     "adduct": adduct,
     "m": adducted_mass,
     "z": charge,
     "mz": mz,
     "rt": compound_data.get("rt", None),
+    "quant_group": counter,  # Use same as lib_uid for default
     "db_id": compound_data.get("db_id", None),
     "db": compound_data.get("db", None),
 }
@@ -526,12 +530,14 @@ class Lib:
     "source_id": match_row.get("source_id"),
     "name": match_row["name"],
     "formula": match_row["formula"],
+    "iso": match_row.get("iso", 0),
     "adduct": match_row["adduct"],
     "smiles": match_row["smiles"],
     "inchi": match_row["inchi"],
     "inchikey": match_row["inchikey"],
     "lib_mz": match_row["mz"],
     "lib_rt": match_row["rt"],
+    "quant_group": match_row.get("quant_group"),
     "delta_mz": abs(feature_mz - match_row["mz"]),
     "delta_rt": abs(feature_rt - match_row["rt"]) if feature_rt is not None and match_row["rt"] is not None else None,
 }
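
The hunks above add two Int64 columns to the Lib schema: iso (number of 13C substitutions, default 0) after formula, and quant_group (shared by all isotopomers of a compound, defaulting to the entry's own uid) after rt. A minimal sketch of the resulting empty schema, assuming Polars and showing only the columns touched by this diff:

import polars as pl

# Sketch of the updated empty library schema (subset of columns).
lib_df = pl.DataFrame({
    "formula": pl.Series([], dtype=pl.Utf8),
    "iso": pl.Series([], dtype=pl.Int64),          # new in 0.5.1
    "adduct": pl.Series([], dtype=pl.Utf8),
    "mz": pl.Series([], dtype=pl.Float64),
    "rt": pl.Series([], dtype=pl.Float64),
    "quant_group": pl.Series([], dtype=pl.Int64),  # new in 0.5.1
    "db_id": pl.Series([], dtype=pl.Utf8),
})
print(lib_df.schema)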
masster/study/defaults/fill_def.py CHANGED
@@ -58,7 +58,7 @@ class fill_defaults:
         "dtype": int,
         "description": "Minimum absolute samples threshold",
         "default": 5,
-        "min_value": 1,
+        "min_value": 0,
         "max_value": 100,
     },
 },
masster/study/h5.py CHANGED
@@ -2007,6 +2007,9 @@ def _load_study5(self, filename=None):
         f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
     )
 
+    # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
+    self._sanitize_null_ids()
+
     self.logger.debug("Study loaded")
 
 
masster/study/id.py CHANGED
@@ -15,6 +15,7 @@ def lib_load(
     lib_source,
     polarity: str | None = None,
     adducts: list | None = None,
+    iso: str | None = None,
 ):
     """Load a compound library into the study.
 
@@ -23,6 +24,7 @@ def lib_load(
         lib_source: either a CSV file path (str) or a Lib instance
         polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV path
         adducts: specific adducts to generate - used when lib_source is a CSV path
+        iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)
 
     Side effects:
         sets study.lib_df to a Polars DataFrame and stores the lib object on
@@ -97,6 +99,56 @@ def lib_load(
     # Store pointer and DataFrame on study
     study._lib = lib_obj
 
+    # Add source_id column with filename (without path) if loading from CSV
+    if isinstance(lib_source, str):
+        import os
+        filename_only = os.path.basename(lib_source)
+        filtered_lf = filtered_lf.with_columns(pl.lit(filename_only).alias("source_id"))
+
+    # Ensure required columns exist and set correct values
+    required_columns = {
+        "quant_group": pl.Int64,
+        "iso": pl.Int64
+    }
+
+    for col_name, col_dtype in required_columns.items():
+        if col_name == "quant_group":
+            # Set quant_group using cmpd_uid (same for isotopomers of same compound)
+            if "cmpd_uid" in filtered_lf.columns:
+                filtered_lf = filtered_lf.with_columns(pl.col("cmpd_uid").cast(col_dtype).alias("quant_group"))
+            else:
+                # Fallback to lib_uid if cmpd_uid doesn't exist
+                filtered_lf = filtered_lf.with_columns(pl.col("lib_uid").cast(col_dtype).alias("quant_group"))
+        elif col_name == "iso":
+            if col_name not in filtered_lf.columns:
+                # Default to zero for iso
+                filtered_lf = filtered_lf.with_columns(pl.lit(0).cast(col_dtype).alias(col_name))
+
+    # Generate 13C isotopes if requested
+    original_count = len(filtered_lf)
+    if iso == '13C':
+        filtered_lf = _generate_13c_isotopes(filtered_lf)
+        # Update the log message to show the correct count after isotope generation
+        if isinstance(lib_source, str):
+            import os
+            filename_only = os.path.basename(lib_source)
+            print(f"Generated 13C isotopes: {len(filtered_lf)} total entries ({original_count} original + {len(filtered_lf) - original_count} isotopes) from {filename_only}")
+
+    # Reorder columns to place quant_group after rt and iso after formula
+    column_order = []
+    columns_list = list(filtered_lf.columns)
+
+    for col in columns_list:
+        if col not in column_order:  # Only add if not already added
+            column_order.append(col)
+        if col == "rt" and "quant_group" in columns_list and "quant_group" not in column_order:
+            column_order.append("quant_group")
+        elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
+            column_order.append("iso")
+
+    # Apply the column ordering
+    filtered_lf = filtered_lf.select(column_order)
+
     # Add to existing lib_df instead of replacing
     if (
         hasattr(study, "lib_df")
@@ -127,7 +179,7 @@ def lib_load(
     if hasattr(study, "update_history"):
         study.update_history(
             ["lib_load"],
-            {"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts},
+            {"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts, "iso": iso},
         )
 
 
@@ -349,6 +401,7 @@ def _update_identification_results(study, results, logger):
         "rt_delta": match["rt_delta"],
         "matcher": match["matcher"],
         "score": match["score"],
+        "iso": 0,  # Default to zero
     })
 
     # Convert to DataFrame and append to existing results
@@ -356,6 +409,13 @@ def _update_identification_results(study, results, logger):
 
     if not new_results_df.is_empty():
         if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
+            # Check if existing id_df has the iso column
+            if "iso" not in study.id_df.columns:
+                # Add iso column to existing id_df with default value 0
+                study.id_df = study.id_df.with_columns(pl.lit(0).alias("iso"))
+                if logger:
+                    logger.debug("Added 'iso' column to existing id_df for schema compatibility")
+
             study.id_df = pl.concat([study.id_df, new_results_df])
         else:
             study.id_df = new_results_df
@@ -1043,8 +1103,10 @@ def lib_reset(study):
     - study.id_df (identification results DataFrame)
     - study.lib_df (library DataFrame)
     - study._lib (library object reference)
+    - Consensus features created by lib_to_consensus() (number_samples = -1 or 0)
     - 'identify' from study.history
     - 'lib_load' from study.history (if exists)
+    - 'lib_to_consensus' from study.history (if exists)
     - Resets id_top_* columns in consensus_df to null
 
     Args:
@@ -1053,6 +1115,36 @@ def lib_reset(study):
     # Get logger from study if available
     logger = getattr(study, "logger", None)
 
+    # Remove consensus features created by lib_to_consensus()
+    # These are identified by number_samples = -1 or 0
+    if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
+        if logger:
+            logger.debug("Checking for consensus features created by lib_to_consensus()")
+
+        try:
+            # Filter for features with number_samples = -1 or 0
+            # Since consensus_select doesn't support a list of discrete values, use direct filtering
+            lib_consensus_features = study.consensus_df.filter(
+                (pl.col("number_samples") == -1) | (pl.col("number_samples") == 0)
+            )
+
+            if lib_consensus_features is not None and not lib_consensus_features.is_empty():
+                num_lib_features = len(lib_consensus_features)
+                if logger:
+                    logger.info(f"Removing {num_lib_features} consensus features created by lib_to_consensus()")
+
+                # Use consensus_delete to remove these features and all dependent data
+                study.consensus_delete(lib_consensus_features)
+
+                if logger:
+                    logger.debug("Successfully removed library-derived consensus features")
+            else:
+                if logger:
+                    logger.debug("No library-derived consensus features found to remove")
+        except Exception as e:
+            if logger:
+                logger.warning(f"Error removing library-derived consensus features: {e}")
+
     # Remove id_df
     if hasattr(study, "id_df"):
         if logger:
@@ -1099,6 +1191,11 @@ def lib_reset(study):
         if logger:
             logger.debug("Removing 'lib_load' from history")
         del study.history["lib_load"]
+
+    if "lib_to_consensus" in study.history:
+        if logger:
+            logger.debug("Removing 'lib_to_consensus' from history")
+        del study.history["lib_to_consensus"]
 
     if logger:
         logger.info("Library and identification data reset completed")
@@ -1438,3 +1535,447 @@ def _format_adduct_name(components: list[dict]) -> str:
     )
 
     return f"[M{formula}]{charge_str}"
+
+
+def _generate_13c_isotopes(lib_df):
+    """
+    Generate 13C isotope variants for library entries.
+
+    For each compound with n carbon atoms, creates n+1 entries:
+    - iso=0: original compound (no 13C)
+    - iso=1: one 13C isotope (+1.00335 Da)
+    - iso=2: two 13C isotopes (+2.00670 Da)
+    - ...
+    - iso=n: n 13C isotopes (+n*1.00335 Da)
+
+    All isotopomers share the same quant_group.
+
+    Args:
+        lib_df: Polars DataFrame with library entries
+
+    Returns:
+        Polars DataFrame with additional 13C isotope entries
+    """
+    if lib_df.is_empty():
+        return lib_df
+
+    # First, ensure all original entries have iso=0
+    original_df = lib_df.with_columns(pl.lit(0).alias("iso"))
+
+    isotope_entries = []
+    next_lib_uid = lib_df["lib_uid"].max() + 1 if len(lib_df) > 0 else 1
+
+    # Mass difference for one 13C isotope
+    c13_mass_shift = 1.00335  # Mass difference between 13C and 12C
+
+    for row in original_df.iter_rows(named=True):
+        formula = row.get("formula", "")
+        if not formula:
+            continue
+
+        # Count carbon atoms in the formula
+        carbon_count = _count_carbon_atoms(formula)
+        if carbon_count == 0:
+            continue
+
+        # Get the original quant_group to keep it consistent across isotopes.
+        # All isotopomers of the same compound should have the same quant_group.
+        quant_group = row.get("quant_group", row.get("cmpd_uid", row.get("lib_uid", 1)))
+
+        # Generate isotope variants (1 to n 13C atoms)
+        for iso_num in range(1, carbon_count + 1):
+            # Calculate mass shift for this number of 13C isotopes
+            mass_shift = iso_num * c13_mass_shift
+
+            # Create new entry
+            isotope_entry = dict(row)  # Copy all fields
+            isotope_entry["lib_uid"] = next_lib_uid
+            isotope_entry["iso"] = iso_num
+            isotope_entry["m"] = row["m"] + mass_shift
+            isotope_entry["mz"] = (row["m"] + mass_shift) / abs(row["z"]) if row["z"] != 0 else row["m"] + mass_shift
+            isotope_entry["quant_group"] = quant_group  # Keep same quant_group
+
+            isotope_entries.append(isotope_entry)
+            next_lib_uid += 1
+
+    # Combine original entries (now with iso=0) with isotope entries
+    if isotope_entries:
+        isotope_df = pl.DataFrame(isotope_entries)
+        # Ensure schema compatibility by aligning data types
+        try:
+            return pl.concat([original_df, isotope_df])
+        except Exception:
+            # If concat fails due to schema mismatch, convert to compatible types
+            original_schema = original_df.schema
+
+            # Cast isotope_df columns to match original_df schema where possible
+            cast_exprs = []
+            for col_name in isotope_df.columns:
+                if col_name in original_schema:
+                    target_dtype = original_schema[col_name]
+                    cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
+                else:
+                    cast_exprs.append(pl.col(col_name))
+
+            isotope_df_cast = isotope_df.select(cast_exprs)
+            return pl.concat([original_df, isotope_df_cast])
+    else:
+        return original_df
+
+
+def _count_carbon_atoms(formula: str) -> int:
+    """
+    Count the number of carbon atoms in a molecular formula.
+
+    Args:
+        formula: Molecular formula string like "C6H12O6"
+
+    Returns:
+        Number of carbon atoms
+    """
+    import re
+
+    if not formula or not isinstance(formula, str):
+        return 0
+
+    # Look for carbon followed by an optional number:
+    # 'C' followed by digits, or just 'C' (which means 1)
+    carbon_matches = re.findall(r'C(\d*)', formula)
+
+    total_carbons = 0
+    for match in carbon_matches:
+        if match == '':
+            # Just 'C' without a number means 1 carbon
+            total_carbons += 1
+        else:
+            # 'C' followed by a number
+            total_carbons += int(match)
+
+    return total_carbons
+
+
+def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_tol: float = 2.0):
+    """Create consensus features from library entries instead of features_df.
+
+    This method takes all rows from lib_df and creates corresponding entries in
+    consensus_df with the same columns as merge(). Instead of relying on
+    features_df, it populates consensus features directly from library data.
+
+    Before creating new features, it checks for pre-existing consensus features:
+    - If rt in lib_df is null: picks the consensus feature with matching mz and largest inty_mean
+    - If rt is not null: picks the consensus feature with matching mz and rt within tolerance
+    - If a match is found, skips to the next library entry
+
+    Args:
+        study: Study instance with lib_df populated
+        chrom_fhwm: Chromatographic full width at half maximum in seconds,
+            used to infer rt_start_mean and rt_end_mean (default: 5.0)
+        mz_tol: m/z tolerance for matching existing consensus features (default: 0.01)
+        rt_tol: RT tolerance for matching existing consensus features (default: 2.0)
+
+    Side effects:
+        Adds rows to study.consensus_df and study.consensus_mapping_df.
+        Calls study.find_ms2() at the end.
+    """
+    # Get logger from study if available
+    logger = getattr(study, "logger", None)
+
+    # Validate inputs
+    if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
+        if logger:
+            logger.error("Library (study.lib_df) is empty; call lib_load() first")
+        raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
+
+    if logger:
+        logger.info(f"Creating consensus features from {len(study.lib_df)} library entries")
+
+    # Initialize consensus DataFrames if they don't exist
+    if not hasattr(study, "consensus_df") or study.consensus_df is None:
+        study.consensus_df = pl.DataFrame()
+    if not hasattr(study, "consensus_mapping_df") or study.consensus_mapping_df is None:
+        study.consensus_mapping_df = pl.DataFrame()
+
+    # Get cached adducts for consistent adduct handling
+    cached_adducts_df = None
+    cached_valid_adducts = None
+    try:
+        cached_adducts_df = _get_adducts(study)
+        if not cached_adducts_df.is_empty():
+            cached_valid_adducts = set(cached_adducts_df["name"].to_list())
+        else:
+            cached_valid_adducts = set()
+    except Exception as e:
+        if logger:
+            logger.warning(f"Could not retrieve study adducts: {e}")
+        cached_valid_adducts = set()
+
+    # Always allow '?' adducts
+    cached_valid_adducts.add("?")
+
+    # Get starting consensus_uid counter
+    if not study.consensus_df.is_empty():
+        max_existing_uid = study.consensus_df["consensus_uid"].max()
+        consensus_uid_counter = int(max_existing_uid) + 1 if max_existing_uid is not None else 0
+    else:
+        consensus_uid_counter = 0
+
+    # Track [M+H] iso=0 and [M-H] iso=0 entries for adduct grouping
+    base_adduct_groups = {}  # key: (mz, adduct_base), value: adduct_group
+
+    # Process each library entry
+    consensus_metadata = []
+    consensus_mapping_list = []
+    matched_count = 0
+    skipped_count = 0
+
+    for lib_row in study.lib_df.iter_rows(named=True):
+        # Extract basic library data
+        lib_uid = lib_row.get("lib_uid")
+        mz = lib_row.get("mz")
+        rt = lib_row.get("rt")
+        iso = lib_row.get("iso", 0)
+        adduct = lib_row.get("adduct")
+        z = lib_row.get("z", 1)  # charge
+
+        # Skip entries without essential data
+        if mz is None:
+            if logger:
+                logger.warning(f"Skipping library entry {lib_uid} - no m/z value")
+            continue
+
+        # Check for pre-existing consensus features
+        existing_match = None
+        if not study.consensus_df.is_empty():
+            # Filter by m/z tolerance first
+            mz_matches = study.consensus_df.filter(
+                (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
+            )
+
+            if not mz_matches.is_empty():
+                if rt is None:
+                    # If rt is null, pick the consensus feature with largest inty_mean
+                    existing_match = mz_matches.sort("inty_mean", descending=True).head(1)
+                else:
+                    # If rt is not null, filter by RT tolerance and pick largest inty_mean
+                    rt_tolerance = chrom_fhwm  # Use chrom_fhwm as RT tolerance range
+                    rt_matches = mz_matches.filter(
+                        (pl.col("rt") >= rt - rt_tolerance) & (pl.col("rt") <= rt + rt_tolerance)
+                    )
+                    if not rt_matches.is_empty():
+                        existing_match = rt_matches.sort("inty_mean", descending=True).head(1)
+
+        if existing_match is not None and len(existing_match) > 0:
+            # Found a matching consensus feature, skip this library entry
+            matched_count += 1
+            if logger and matched_count <= 5:  # Log first few matches
+                match_uid = existing_match["consensus_uid"][0]
+                match_mz = existing_match["mz"][0]
+                match_rt = existing_match["rt"][0]
+                logger.debug(f"Library entry {lib_uid} (mz={mz:.4f}, rt={rt}) matched existing consensus {match_uid} (mz={match_mz:.4f}, rt={match_rt})")
+            continue
+
+        # No match found, create a new consensus feature.
+        # Handle missing RT - use 0 as placeholder
+        if rt is None:
+            rt = 0.0
+            if logger and skipped_count < 5:  # Log first few
+                logger.debug(f"Library entry {lib_uid} has no RT, using 0.0")
+
+        # Calculate RT range based on chrom_fhwm
+        half_width = chrom_fhwm / 2.0
+        rt_start = rt - half_width
+        rt_end = rt + half_width
+
+        # Get adduct information
+        adduct_top = adduct if adduct else "?"
+        adduct_charge_top = None
+        adduct_mass_shift_top = None
+        adduct_mass_neutral_top = None
+
+        # Parse adduct to get charge and mass shift
+        if adduct_top and cached_adducts_df is not None and not cached_adducts_df.is_empty():
+            # Look for exact match in study adducts
+            matching_adduct = cached_adducts_df.filter(pl.col("name") == adduct_top)
+            if not matching_adduct.is_empty():
+                adduct_row = matching_adduct.row(0, named=True)
+                adduct_charge_top = adduct_row["charge"]
+                adduct_mass_shift_top = adduct_row["mass_shift"]
+
+        # Fallback to default values if not found
+        if adduct_charge_top is None:
+            adduct_charge_top = int(z) if z else 1
+            # Default based on study polarity
+            study_polarity = getattr(study, "polarity", "positive")
+            if study_polarity in ["negative", "neg"]:
+                if adduct_charge_top > 0:
+                    adduct_charge_top = -adduct_charge_top
+                adduct_mass_shift_top = -1.007825
+                if adduct_top == "?":
+                    adduct_top = "[M-?]1-"
+            else:
+                if adduct_charge_top < 0:
+                    adduct_charge_top = -adduct_charge_top
+                adduct_mass_shift_top = 1.007825
+                if adduct_top == "?":
+                    adduct_top = "[M+?]1+"
+
+        # Calculate neutral mass
+        if adduct_charge_top and adduct_mass_shift_top is not None:
+            adduct_mass_neutral_top = mz * abs(adduct_charge_top) - adduct_mass_shift_top
+
+        # Determine adduct group for isotopologues and related adducts
+        adduct_group = consensus_uid_counter  # Default: each entry gets its own group
+        adduct_of = 0  # Default: this is the base adduct
+
+        # Track base adducts ([M+H] iso=0 or [M-H] iso=0) for grouping
+        base_adduct_key = None
+        if iso == 0 and adduct_top in ["[M+H]+", "[M+H]1+", "[M-H]-", "[M-H]1-"]:
+            # This is a base adduct with iso=0
+            base_adduct_key = (round(mz, 4), adduct_top)
+            base_adduct_groups[base_adduct_key] = consensus_uid_counter
+        elif iso > 0:
+            # This is an isotopologue; try to find the base adduct.
+            # Calculate the base m/z (subtract isotope mass shifts)
+            c13_mass_shift = 1.00335
+            base_mz = mz - (iso * c13_mass_shift / abs(adduct_charge_top))
+
+            # Look for matching base adduct
+            for (stored_mz, stored_adduct), stored_group in base_adduct_groups.items():
+                if abs(stored_mz - base_mz) < mz_tol and stored_adduct == adduct_top:
+                    adduct_group = stored_group
+                    adduct_of = stored_group
+                    break
+
+        # Create adduct values list with proper structure (structured data with fields: adduct, count, percentage, mass)
+        adduct_values = [{"adduct": adduct_top, "count": 1, "percentage": 100.0, "mass": 0.0}]
+
+        # Generate unique consensus_id string
+        import uuid
+        consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
+
+        # Build consensus metadata with requested modifications for new entries
+        metadata = {
+            "consensus_uid": consensus_uid_counter,
+            "consensus_id": consensus_id_str,
+            "quality": 1.0,
+            "number_samples": 0.0,  # Set to 0.0 for library entries
+            "rt": float(rt),
+            "mz": float(mz),
+            "rt_min": float(rt),  # Set to rt as requested
+            "rt_max": float(rt),  # Set to rt as requested
+            "rt_mean": float(rt),  # Set to rt as requested
+            "rt_start_mean": float(rt_start),
+            "rt_end_mean": float(rt_end),
+            "rt_delta_mean": 0.0,  # Set to 0.0 as requested
+            "mz_min": float(mz),  # Set to mz as requested
+            "mz_max": float(mz),  # Set to mz as requested
+            "mz_mean": float(mz),  # Set to mz as requested
+            "mz_start_mean": float(mz),  # Set to mz as requested
+            "mz_end_mean": float(mz),  # Set to mz as requested
+            "inty_mean": -1.0,  # Set to -1.0 as requested
+            "bl": -1.0,
+            "chrom_coherence_mean": -1.0,  # Set to -1.0 as requested
+            "chrom_prominence_mean": -1.0,  # Set to -1.0 as requested
+            "chrom_prominence_scaled_mean": -1.0,  # Set to -1.0 as requested
+            "chrom_height_scaled_mean": -1.0,  # Set to -1.0 as requested
+            "iso": iso,  # Set to iso from lib_df as requested
+            "iso_mean": float(iso),  # Set to iso from lib_df as requested
+            "charge_mean": float(abs(z)) if z else 1.0,  # Set to z as requested
+            "number_ms2": 0,  # Will be updated by find_ms2
+            "adducts": adduct_values,
+            "adduct_charge_top": adduct_charge_top,
+            "adduct_group": adduct_group,  # Use calculated adduct group
+            "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
+            "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
+            "adduct_of": adduct_of,  # Use calculated adduct_of
+            "adduct_top": adduct_top,
+            "id_top_name": None,  # Set to null as requested
+            "id_top_class": None,  # Set to null as requested
+            "id_top_adduct": None,  # Set to null as requested
+            "id_top_score": None,  # Set to null as requested
+        }
+
+        consensus_metadata.append(metadata)
+
+        # Create mapping entry (maps to library entry as "virtual" feature).
+        # Use lib_uid as the feature_uid and a virtual sample_uid of 0.
+        # Match existing consensus_mapping_df column order: consensus_uid, feature_uid, sample_uid
+        consensus_mapping_list.append({
+            "consensus_uid": consensus_uid_counter,
+            "feature_uid": lib_uid,  # Use lib_uid as feature reference
+            "sample_uid": 0,  # Virtual sample for library entries
+        })
+
+        consensus_uid_counter += 1
+
+    # Log matching statistics
+    if logger:
+        total_processed = matched_count + len(consensus_metadata)
+        logger.info(f"Processed {total_processed} library entries: {matched_count} matched existing consensus features, {len(consensus_metadata)} created new features")
+
+    # Convert to DataFrames with proper schema alignment
+    if consensus_metadata:
+        new_consensus_df = pl.DataFrame(consensus_metadata, strict=False)
+
+        # Ensure schema compatibility with existing consensus_df
+        if not study.consensus_df.is_empty():
+            # Cast columns to match existing schema
+            existing_schema = study.consensus_df.schema
+            cast_exprs = []
+            for col_name in new_consensus_df.columns:
+                if col_name in existing_schema:
+                    target_dtype = existing_schema[col_name]
+                    if target_dtype == pl.Null:
+                        # For Null columns, use lit(None) to maintain Null type
+                        cast_exprs.append(pl.lit(None).alias(col_name))
+                    else:
+                        cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
+                else:
+                    cast_exprs.append(pl.col(col_name))
+
+            new_consensus_df = new_consensus_df.select(cast_exprs)
+
+        new_consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
+
+        # Append to existing DataFrames
+        if not study.consensus_df.is_empty():
+            study.consensus_df = pl.concat([study.consensus_df, new_consensus_df])
+        else:
+            study.consensus_df = new_consensus_df
+
+        if not study.consensus_mapping_df.is_empty():
+            study.consensus_mapping_df = pl.concat([study.consensus_mapping_df, new_consensus_mapping_df])
+        else:
+            study.consensus_mapping_df = new_consensus_mapping_df
+
+        if logger:
+            logger.info(f"Added {len(consensus_metadata)} consensus features from library")
+    else:
+        if logger:
+            logger.warning("No valid consensus features created from library")
+        return
+
+    # Store operation in history
+    if hasattr(study, "update_history"):
+        study.update_history(
+            ["lib_to_consensus"],
+            {"chrom_fhwm": chrom_fhwm, "lib_entries": len(study.lib_df)},
+        )
+
+    # Perform find_ms2 at the end
+    try:
+        if hasattr(study, "find_ms2"):
+            if logger:
+                logger.info("Running find_ms2 to link MS2 spectra to library-derived consensus features")
+            study.find_ms2()
+        else:
+            if logger:
+                logger.warning("find_ms2 method not available on study object")
+    except Exception as e:
+        if logger:
+            logger.warning(f"find_ms2 failed: {e}")
+
+    if logger:
+        logger.info(f"lib_to_consensus completed: {len(consensus_metadata)} features added")
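
_generate_13c_isotopes() expands each library entry into a ladder of n+1 isotopologues, shifting the mass by iso * 1.00335 Da and dividing by |z| for m/z, with all rungs sharing one quant_group. A standalone arithmetic sketch for a five-carbon compound; the starting [M+H]+ value below is illustrative and not taken from the package:

C13_SHIFT = 1.00335  # Da per 12C -> 13C substitution, as used above

mz0 = 148.0604  # e.g. an [M+H]+ ion of a C5 compound such as L-glutamic acid
z = 1
for iso in range(0, 5 + 1):
    mz = mz0 + iso * C13_SHIFT / abs(z)  # observed spacing shrinks with |z|
    print(f"iso={iso}: mz={mz:.4f}")
# All six entries would share one quant_group, so isotopologue intensities
# of the same compound can later be summed for quantification.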
masster/study/load.py CHANGED
@@ -261,9 +261,14 @@ def _fill_chrom_single_impl(
     min_number_abs = 1
     if isinstance(min_samples_rel, float) and min_samples_rel > 0:
         min_number_rel = int(min_samples_rel * len(self.samples_df))
-    if isinstance(min_samples_abs, int) and min_samples_abs > 0:
-        min_number_abs = int(min_samples_abs)
+    if isinstance(min_samples_abs, int) and min_samples_abs >= 0:
+        min_number_abs = int(min_samples_abs) if min_samples_abs > 0 else 0
     min_number = max(min_number_rel, min_number_abs)
+
+    # Special case: if min_samples_abs is explicitly 0, allow 0-sample features (like library features)
+    if isinstance(min_samples_abs, int) and min_samples_abs == 0:
+        min_number = 0
+
     self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
 
     if min_number > 0:
@@ -277,7 +282,7 @@ def _fill_chrom_single_impl(
     )
     self.logger.debug("Identifying missing features...")
     # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
-    missing_combinations = self._get_missing_consensus_sample_combinations(uids)
+    missing_combinations = _get_missing_consensus_sample_combinations(self, uids)
     if not missing_combinations:
         self.logger.info("No missing features found to fill.")
         return
@@ -754,10 +759,14 @@ def _fill_chrom_impl(
     min_number_abs = 1
     if isinstance(min_samples_rel, float) and min_samples_rel > 0:
         min_number_rel = int(min_samples_rel * len(self.samples_df))
-    if isinstance(min_samples_abs, int) and min_samples_abs > 0:
-        min_number_abs = int(min_samples_abs)
+    if isinstance(min_samples_abs, int) and min_samples_abs >= 0:
+        min_number_abs = int(min_samples_abs) if min_samples_abs > 0 else 0
     min_number = max(min_number_rel, min_number_abs)
 
+    # Special case: if min_samples_abs is explicitly 0, allow 0-sample features (like library features)
+    if isinstance(min_samples_abs, int) and min_samples_abs == 0:
+        min_number = 0
+
     self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
 
     if min_number > 0:
@@ -770,7 +779,7 @@ def _fill_chrom_impl(
 
     # Get missing consensus/sample combinations using the optimized method
     self.logger.debug("Identifying missing features...")
-    missing_combinations = self._get_missing_consensus_sample_combinations(uids)
+    missing_combinations = _get_missing_consensus_sample_combinations(self, uids)
 
     if not missing_combinations or len(missing_combinations) == 0:
         self.logger.info("No missing features found to fill.")
@@ -846,7 +855,7 @@ def _fill_chrom_impl(
     future_to_sample = {}
     for sample_info in samples_to_process:
         future = executor.submit(
-            self._process_sample_for_parallel_fill,
+            _process_sample_for_parallel_fill, self,
             sample_info,
             consensus_info,
             uids,
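
Together with the min_samples_abs >= 0 change above, gap filling can now run over consensus features observed in zero samples, which is exactly what lib_to_consensus() creates (number_samples = 0). A hedged end-to-end sketch; the method names come from this diff, but the argument lists are abbreviated and may differ from the full API:

def fill_library_features(study):
    # Names taken from this diff; signatures abbreviated and unverified.
    study.lib_load("aa.csv", polarity="positive", iso="13C")  # 13C-expanded library
    study.lib_to_consensus(chrom_fhwm=5.0)  # adds 0-sample consensus features
    study.fill_chrom(min_samples_abs=0)     # min_number drops to 0, so the
                                            # library features are gap-filled too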
masster/study/merge.py CHANGED
@@ -505,13 +505,99 @@ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map
 
 
+def _generate_feature_maps_from_samples(study):
+    """
+    Generate feature maps using Study-level features_df instead of Sample-level loading.
+    This uses the study's existing features_df, which is already loaded.
+
+    Args:
+        study: Study object containing features_df
+
+    Returns:
+        list: List of temporary FeatureMap objects built from Study-level data
+    """
+    import pyopenms as oms
+
+    temp_feature_maps = []
+
+    study.logger.info(f"Building feature maps using Study-level features_df from {len(study.samples_df)} samples")
+
+    # Use the features_df from the study that's already loaded
+    if not hasattr(study, 'features_df') or study.features_df is None or study.features_df.is_empty():
+        study.logger.warning("No features_df available - features must be loaded first")
+        return temp_feature_maps
+
+    # Group features by sample
+    study.logger.info(f"Processing {len(study.features_df)} features grouped by sample")
+
+    # Get unique sample names/indices
+    if 'sample_uid' in study.features_df.columns:
+        sample_groups = study.features_df.group_by('sample_uid')
+        study.logger.debug("Grouping features by 'sample_uid' column")
+    elif 'sample_id' in study.features_df.columns:
+        sample_groups = study.features_df.group_by('sample_id')
+        study.logger.debug("Grouping features by 'sample_id' column")
+    elif 'sample' in study.features_df.columns:
+        sample_groups = study.features_df.group_by('sample')
+        study.logger.debug("Grouping features by 'sample' column")
+    else:
+        study.logger.warning("No sample grouping column found in features_df")
+        study.logger.info(f"Available columns: {study.features_df.columns}")
+        return temp_feature_maps
+
+    # Process each sample group
+    processed_samples = 0
+    for sample_key, sample_features in sample_groups:
+        try:
+            feature_map = oms.FeatureMap()
+            feature_count = 0
+
+            # Build features from this sample's rows
+            for row in sample_features.iter_rows(named=True):
+                try:
+                    feature = oms.Feature()
+
+                    # Set feature properties
+                    if row.get("feature_id") is not None:
+                        feature.setUniqueId(int(row["feature_id"]))
+                    if row.get("mz") is not None:
+                        feature.setMZ(float(row["mz"]))
+                    if row.get("rt") is not None:
+                        feature.setRT(float(row["rt"]))
+                    if row.get("inty") is not None:
+                        feature.setIntensity(float(row["inty"]))
+                    if row.get("quality") is not None:
+                        feature.setOverallQuality(float(row["quality"]))
+                    if row.get("charge") is not None:
+                        feature.setCharge(int(row["charge"]))
+
+                    feature_map.push_back(feature)
+                    feature_count += 1
+
+                except (ValueError, TypeError) as e:
+                    study.logger.warning(f"Skipping feature in sample {sample_key} due to conversion error: {e}")
+                    continue
+
+            temp_feature_maps.append(feature_map)
+            processed_samples += 1
+            study.logger.debug(f"Built feature map for sample {sample_key} with {feature_count} features")
+
+        except Exception as e:
+            study.logger.warning(f"Failed to process sample group {sample_key}: {e}")
+            # Add an empty feature map for failed samples to maintain sample order
+            temp_feature_maps.append(oms.FeatureMap())
+
+    study.logger.info(f"Generated {len(temp_feature_maps)} feature maps from {processed_samples} samples using Study-level features_df")
+    return temp_feature_maps
+
+
 def _generate_feature_maps_on_demand(study):
     """
-    Generate feature maps on-demand from study.features_df for merge operations.
+    Generate feature maps on-demand using Sample-level _load_ms1() for merge operations.
     Returns temporary feature maps that are not cached in the study.
 
     Args:
-        study: Study object containing features_df and samples_df
+        study: Study object containing samples
 
     Returns:
         list: List of temporary FeatureMap objects
@@ -520,6 +606,15 @@ def _generate_feature_maps_on_demand(study):
     import pyopenms as oms
     import numpy as np
 
+    # Check if we should use Sample-level loading instead of features_df
+    use_sample_loading = True  # Default to Sample-level loading as requested
+
+    # Use Sample-level loading if requested and samples_df is available
+    if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
+        study.logger.debug("Building feature maps using Sample-level _load_ms1() instead of features_df")
+        return _generate_feature_maps_from_samples(study)
+
+    # Fallback to original features_df approach
     if study.features_df is None or len(study.features_df) == 0:
         study.logger.error("No features_df available for generating feature maps")
         return []
@@ -62,16 +62,8 @@ def _generate_feature_maps_on_demand_for_align(study):
     if feature_row["inty"] is None:
         study.logger.warning("Skipping feature due to missing inty")
         continue
-
-    # Handle missing feature_id by generating a new one
-    if feature_row["feature_id"] is None:
-        # Use a simple incremental ID for alignment purposes
-        feature_id = len(temp_feature_maps) * 100000 + feature_map.size() + 1
-        study.logger.debug(f"Generated new feature_id {feature_id} for feature with missing ID in sample {sample_name}")
-    else:
-        feature_id = int(feature_row["feature_id"])
-
-    feature.setUniqueId(feature_id)
+
+    feature.setUniqueId(int(feature_row["feature_id"]))
     feature.setMZ(float(feature_row["mz"]))
     feature.setRT(float(feature_row["rt"]))
     feature.setIntensity(float(feature_row["inty"]))
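
_generate_feature_maps_from_samples() walks study.features_df one sample group at a time and fills one pyopenms FeatureMap per group. A dependency-light sketch of that grouping pattern with toy data, using a plain list as a stand-in for oms.FeatureMap (note that in recent Polars versions the group_by key arrives as a 1-tuple):

import polars as pl

features_df = pl.DataFrame({
    "sample_uid": [1, 1, 2],
    "feature_id": [10, 11, 20],
    "mz": [148.060, 149.064, 166.086],
    "rt": [35.2, 35.3, 41.8],
    "inty": [1.0e6, 2.5e5, 8.0e5],
})

feature_maps = []
for (sample_key,), sample_features in features_df.group_by("sample_uid"):
    # One "feature map" (here: a list of row dicts) per sample
    fmap = list(sample_features.iter_rows(named=True))
    feature_maps.append(fmap)
    print(f"sample {sample_key}: {len(fmap)} features")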
masster/study/study.py CHANGED
@@ -440,6 +440,11 @@ class Study:
 
     # === Identification and Library Matching ===
     lib_load = lib_load
+
+    def lib_to_consensus(self, **kwargs):
+        """Create consensus features from library entries."""
+        from masster.study.id import lib_to_consensus as _lib_to_consensus
+        return _lib_to_consensus(self, **kwargs)
     identify = identify
     get_id = get_id
     id_reset = id_reset
@@ -562,6 +567,83 @@ class Study:
             except Exception as e:
                 self.logger.error(f"Failed to reload current module {current_module}: {e}")
 
+    def _sanitize_null_ids(self):
+        """
+        Sanitize null feature_id and consensus_id values by replacing them with new IDs.
+        For feature_id: generates large sequential integers that can be converted by merge/align functions.
+        For consensus_id: uses 16-character UUID strings (as expected by the merge function).
+        """
+        import uuid
+        import polars as pl
+        import time
+
+        # Sanitize the features_df feature_id column
+        if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+            # Check for null feature_ids
+            null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
+            if null_feature_ids > 0:
+                self.logger.info(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
+
+                # Find the maximum existing feature_id (convert strings to int if possible)
+                max_existing_id = 0
+                existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
+                for fid in existing_ids:
+                    try:
+                        int_id = int(fid)
+                        max_existing_id = max(max_existing_id, int_id)
+                    except (ValueError, TypeError):
+                        # Skip non-integer IDs
+                        pass
+
+                # Generate new sequential integer IDs starting from max + timestamp offset.
+                # Use a timestamp to ensure uniqueness across different sanitization runs.
+                base_id = max(max_existing_id + 1, int(time.time() * 1000000))  # Microsecond timestamp
+                new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
+                uid_index = 0
+
+                # Create a list to store all feature_ids
+                feature_ids = []
+                for feature_id in self.features_df["feature_id"].to_list():
+                    if feature_id is None:
+                        feature_ids.append(new_int_ids[uid_index])
+                        uid_index += 1
+                    else:
+                        feature_ids.append(feature_id)
+
+                # Update the DataFrame with sanitized feature_ids
+                self.features_df = self.features_df.with_columns(
+                    pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
+                )
+
+                self.logger.info(f"Successfully sanitized {null_feature_ids} feature_id values")
+
+        # Sanitize the consensus_df consensus_id column
+        if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
+            if "consensus_id" in self.consensus_df.columns:
+                null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
+                if null_consensus_ids > 0:
+                    self.logger.info(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
+
+                    # Generate new UIDs for null values using the same method as merge()
+                    new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
+                    uid_index = 0
+
+                    # Create a list to store all consensus_ids
+                    consensus_ids = []
+                    for consensus_id in self.consensus_df["consensus_id"].to_list():
+                        if consensus_id is None:
+                            consensus_ids.append(new_uids[uid_index])
+                            uid_index += 1
+                        else:
+                            consensus_ids.append(consensus_id)
+
+                    # Update the DataFrame with sanitized consensus_ids
+                    self.consensus_df = self.consensus_df.with_columns(
+                        pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
+                    )
+
+                    self.logger.info(f"Successfully sanitized {null_consensus_ids} consensus_id values")
+
     def __dir__(self):
         """
         Custom __dir__ implementation to hide internal methods starting with '_'
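
_sanitize_null_ids() replaces null feature_id values with sequential integers seeded from max(existing id + 1, microsecond timestamp), and null consensus_id values with 16-character UUID strings. A toy run of the feature_id half, assuming Polars:

import time
import polars as pl

df = pl.DataFrame({"feature_id": ["101", None, "205", None]})

# The max existing id is 205, so the seed is max(206, microsecond timestamp)
base_id = max(206, int(time.time() * 1_000_000))
new_ids = iter(str(base_id + i) for i in range(df["feature_id"].null_count()))
sanitized = [fid if fid is not None else next(new_ids) for fid in df["feature_id"]]
df = df.with_columns(pl.Series("feature_id", sanitized, dtype=pl.Utf8))
print(df["feature_id"].null_count())  # 0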
masster/study/study5_schema.json CHANGED
@@ -327,6 +327,9 @@
     "formula": {
         "dtype": "pl.String"
     },
+    "iso": {
+        "dtype": "pl.Int64"
+    },
     "adduct": {
         "dtype": "pl.String"
     },
@@ -342,6 +345,9 @@
     "rt": {
         "dtype": "pl.Null"
     },
+    "quant_group": {
+        "dtype": "pl.Int64"
+    },
     "db_id": {
         "dtype": "pl.String"
     },
@@ -369,6 +375,9 @@
     },
     "score": {
         "dtype": "pl.Float64"
+    },
+    "iso": {
+        "dtype": "pl.Int64"
     }
 }
 }
masster-0.5.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.0
+Version: 0.5.1
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
masster-0.5.1.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 masster/__init__.py,sha256=ueZ224WPNRRjQEYTaQUol818nwQgJwB93HbEfmtPRmg,1041
-masster/_version.py,sha256=P5MV1QkrG88uczENqaXQiyOnJWkFQaZ4FsWNCqYySbg,256
+masster/_version.py,sha256=dkqPLCQGfsGL65orxLHNgDpbEE9aMOWq4b_vYspojyk,256
 masster/chromatogram.py,sha256=iYpdv8C17zVnlWvOFgAn9ns2uFGiF-GgoYf5QVVAbHs,19319
 masster/logger.py,sha256=tR65N23zfrNpcZNbZm2ot_Aual9XrGB1MWjLrovZkMs,16749
 masster/spectrum.py,sha256=XJSUrqXZSzfpWnD8v5IMClXMRZLKLYIk014qaMOS9_k,49738
@@ -8,6 +8,7 @@ masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_Q
 masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5,sha256=dSd2cIgYYdRcNSzkhqlZCeWKi3x8Hhhcx8BFMuiVG4c,11382948
 masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5,sha256=wER8CHSBz54Yx1kwmU7ghPPWVwYvxv_lXGB8-8a1xpQ,9508434
 masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5,sha256=h2OOAWWTwKXzTNewhiYeL-cMYdp_JYLPya8Q9Nv9Lvw,12389587
+masster/data/libs/aa.csv,sha256=Sja1DyMsiaM2NfLcct4kAAcXYwPCukJJW8sDkup9w_c,1924
 masster/data/libs/ccm.csv,sha256=Q6nylV1152uTpX-ydqWeGrc6L9kgv45xN_fBZ4f7Tvo,12754
 masster/data/libs/urine.csv,sha256=iRrR4N8Wzb8KDhHJA4LqoQC35pp93FSaOKvXPrgFHis,653736
 masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data,sha256=01vC6m__Qqm2rLvlTMZoeKIKowFvovBTUnrNl8Uav3E,24576
@@ -15,7 +16,7 @@ masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecR
 masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan,sha256=ahi1Y3UhAj9Bj4Q2MlbgPekNdkJvMOoMXVOoR6CeIxc,13881220
 masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2,sha256=TFB0HW4Agkig6yht7FtgjUdbXax8jjKaHpSZSvuU5vs,3252224
 masster/lib/__init__.py,sha256=TcePNx3SYZHz6763TL9Sg4gUNXaRWjlrOtyS6vsu-hg,178
-masster/lib/lib.py,sha256=mxUYBCBmkSBZB82557smSHCS25BAusuCewvW8zwsLGg,27130
+masster/lib/lib.py,sha256=SSN06UtiM-hIdjS3eCiIHsJ_8S4YHRGOLGmdPIh-efo,27481
 masster/sample/__init__.py,sha256=HL0m1ept0PMAYUCQtDDnkdOS12IFl6oLAq4TZQz83uY,170
 masster/sample/adducts.py,sha256=S7meba3L1tSdjoDhkSiTI71H2NJLu4i1dtJwfDKWI1M,32591
 masster/sample/h5.py,sha256=B0gAmhrnoFoybotqsqiT8s-PkeZWUdIQfI-4cnM52Zc,115430
@@ -39,22 +40,22 @@ masster/sample/defaults/sample_def.py,sha256=keoXyMyrm_iLgbYqfIbqCpJ3XHBVlNwCNmb
 masster/study/__init__.py,sha256=55axdFuqRX4aXtJ8ocnhcLB32fNtmmJpCi58moO0r4g,237
 masster/study/analysis.py,sha256=L-wXBnGZCLB5UUDrjIdOiMG9zdej3Tw_SftcEmmTukM,84264
 masster/study/export.py,sha256=Rp1vc5iDl-XFWo_RBVCJDGBNSKakq9f8aC2FeUCP9GA,59398
-masster/study/h5.py,sha256=eINlVmcJuntwbkkZHwzm10c63Kg7zib49vkzLDj1PyU,84790
+masster/study/h5.py,sha256=6_nyjMGg_dkKkrx_Mv77wGg5SmWsVOZxu7HZasoXbRU,84916
 masster/study/helpers.py,sha256=dU2YxAGPmu1w55mpcgNoHPpg2fNW-vK944aJy3YwLsU,163555
-masster/study/id.py,sha256=L5S0etAeEmtkzE06M32PNo3rp-WE01sLU6M5_TdNC3w,55266
-masster/study/load.py,sha256=x0OvfOoSY-6lQ8_B9KeDUh_E99eYRm1WGLnCY3CGqF8,71222
-masster/study/merge.py,sha256=MvvEwS2_UCWIrxROtuYTkJOu3pk8EasorbrfWug2e68,159736
+masster/study/id.py,sha256=r_vZQYNxqNXf_pjgk_CLkl1doLnLa956mTuVmlHN52o,80075
+masster/study/load.py,sha256=W4mljmYVR71sas4no7vKWIVfdnQjb-rTcEUhE0ZMr0k,71696
+masster/study/merge.py,sha256=XF4NxNuLSxwf2j1__ReIInXVRGDRoSHFeKdcCSayKU4,164298
 masster/study/parameters.py,sha256=bTvmcwX9INxzcrEAmTiFH8qeWVhwkvMTZjuP394pz5o,3279
 masster/study/plot.py,sha256=pAN5uQKYPUpupQVtKBloWjKOKpM_C9o2e3VWkJ-aZN8,102041
-masster/study/processing.py,sha256=hf8FEPONXz4x6G1fke6SEUnCJCv47cGeo--6h1c1Sis,56441
+masster/study/processing.py,sha256=TKeTzRLmaMxUKCt66pXPfx_7xc-R5__ZwEZdFHOxg6A,55916
 masster/study/save.py,sha256=47AP518epJJ9TjaGGyrLKsMsyjIk8_J4ka7bmsnRtFQ,9268
-masster/study/study.py,sha256=YLt6tqCPkWvmKGBvhiyh1LeZgr165f9eqgiIV5OLomY,38393
-masster/study/study5_schema.json,sha256=ghBeAXFS4a4Uavdn6TUVs9GaR1QOTnADCjQTOkN0tjU,7563
+masster/study/study.py,sha256=vbP_bPa62-KYN0OTUN6PpSyCoFcW-TdbLbx67ShkEx0,42930
+masster/study/study5_schema.json,sha256=0IZxM9VVI0TUlx74BPzJDT44kySi6NZZ6iLR0j8bU_s,7736
 masster/study/defaults/__init__.py,sha256=m3Z5KXGqsTdh7GjYzZoENERt39yRg0ceVRV1DeCt1P0,610
 masster/study/defaults/align_def.py,sha256=hHQbGgsOqMRHHr0Wn8Onr8XeaRz3-fFE0qGE-OMst80,20324
 masster/study/defaults/export_def.py,sha256=eXl3h4aoLX88XkHTpqahLd-QZ2gjUqrmjq8IJULXeWo,1203
 masster/study/defaults/fill_chrom_def.py,sha256=hB6-tyC9bhx-IpGj2HC8FinQdW4VLYj_pn5t1rlj-Ew,8887
-masster/study/defaults/fill_def.py,sha256=TdDqOt-fva44JptLvxOy7GNUCR5isOKz1jR2xj_V8sQ,8869
+masster/study/defaults/fill_def.py,sha256=H-ZNKyiXxBLWdLoCMqxfvphNyc9wrDVFMC7TyRNYEm0,8869
 masster/study/defaults/find_consensus_def.py,sha256=2KRRMsCDP7pwNrLCC6eI5uQgMXqiNdiI6pSvxNJ8L5M,8598
 masster/study/defaults/find_ms2_def.py,sha256=RL0DFG41wQ05U8UQKUGr3vzSl3mU0m0knQus8DpSoJE,5070
 masster/study/defaults/identify_def.py,sha256=96rxoCAPQj_yX-3mRoD2LTkTLJgG27eJQqwarLv5jL0,10580
@@ -66,8 +67,8 @@ masster/wizard/README.md,sha256=mL1A3YWJZOefpJ6D0-HqGLkVRmUlOpwyVFdvJBeeoZM,1414
 masster/wizard/__init__.py,sha256=a2hcZnHASjfuw1lqZhZnvTR58rc33rRnoGAY_JfvGhI,683
 masster/wizard/example.py,sha256=xEZFTH9UZ8HKOm6s3JL8Js0Uw5ChnISWBHSZCL32vsM,7983
 masster/wizard/wizard.py,sha256=esgaifLRyaGxytif9qOkTy-21VxlUQxrvl47K-l-BpE,37666
-masster-0.5.0.dist-info/METADATA,sha256=HSAp3U_YG6chyHAIJ6YAzyQbN0OQ1L2dxs_S0p_CGns,45113
-masster-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-masster-0.5.0.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
-masster-0.5.0.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
-masster-0.5.0.dist-info/RECORD,,
+masster-0.5.1.dist-info/METADATA,sha256=01v713yHW9RJPqFXY89wd5e21Ls3crfs6kEBDhDrUlc,45113
+masster-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+masster-0.5.1.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
+masster-0.5.1.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
+masster-0.5.1.dist-info/RECORD,,