masster 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/sample/save.py CHANGED
@@ -808,7 +808,7 @@ def export_dda_stats(self, filename="stats.csv"):
808
808
  self.logger.success(f"DDA statistics exported to {filename}")
809
809
 
810
810
 
811
- def export_xlsx(self, filename="features.xlsx"):
811
+ def export_excel(self, filename="features.xlsx"):
812
812
  """
813
813
  Export the features DataFrame to an Excel file.
814
814
 
@@ -857,6 +857,143 @@ def export_xlsx(self, filename="features.xlsx"):
857
857
  self.logger.debug(f"Exported {len(clean_df)} features with {len(exportable_columns)} columns")
858
858
 
859
859
 
860
def export_slaw(self, filename="features_slaw.csv"):
    """
    Export the features DataFrame to a SLAW-formatted CSV file.

    The output is a comma-separated file (minimal quoting) with one row per
    feature. Columns follow the SLAW layout: feature metadata (m/z, RT, group,
    annotation, charge, isotope, MS2 information) followed by a single
    quantification column named ``quant_<data file stem>.csv``. Because only
    one sample is exported, every ``*_mean`` / ``*_min`` / ``*_max`` triple
    carries the same per-feature value, except the RT ranges, which use
    ``rt_start`` / ``rt_end`` when those columns exist.

    Columns missing from ``features_df`` are filled with a constant placeholder
    (``None``, ``""``, ``0`` or a running index, depending on the column).

    Parameters:
        filename (str): The path to the output CSV file. Defaults to 'features_slaw.csv'.

    Side Effects:
        Writes the exported data to the specified CSV file and logs the export
        operation. A ``PermissionError`` while writing is logged, not raised.
    """
    if self.features_df is None:
        self.logger.warning("No features found. Cannot export to SLAW format.")
        return

    # Imported after the guard so the "no features" path returns before any import.
    import csv

    import polars as pl

    filename = os.path.abspath(filename)

    # The quantification column is named after the sample's data file.
    if self.file_path is not None:
        base_name = os.path.splitext(os.path.basename(self.file_path))[0]
    else:
        base_name = "sample"
    quant_column_name = f"quant_{base_name}.csv"

    df = self.features_df
    n_rows = len(df)
    present = set(df.columns)

    def constant(value):
        """Return a Series repeating *value* once per feature."""
        return pl.Series([value] * n_rows)

    def column(*names, default=None):
        """Return the first existing column among *names*, else a constant Series."""
        for name in names:
            if name in present:
                return df.get_column(name)
        return constant(default)

    # Charge: an adduct_charge of 0 means "unknown", so fall back to the sign
    # implied by the acquisition polarity.
    default_charge = 1 if self.polarity == "positive" else -1
    if "adduct_charge" in present:
        charge_series = df.select(
            pl.when(pl.col("adduct_charge") == 0)
            .then(default_charge)
            .otherwise(pl.col("adduct_charge"))
            .alias("charge")
        ).get_column("charge")
    else:
        charge_series = constant(default_charge)

    # Group: features with adduct_group == 0 each receive their own group index,
    # numbered upwards from max(adduct_group) + 1 so they never collide with
    # an existing group.
    if "adduct_group" in present:
        max_adduct_group = df.get_column("adduct_group").max()
        if max_adduct_group is None:
            max_adduct_group = 0
        group_series = df.select(
            pl.when(pl.col("adduct_group") == 0)
            .then(max_adduct_group + 1 + pl.int_range(pl.len()).over(pl.col("adduct_group") == 0))
            .otherwise(pl.col("adduct_group"))
            .alias("group")
        ).get_column("group")
    else:
        group_series = constant(None)

    # Ion / annotation: "?" placeholders in the adduct become "H". replace_all
    # (rather than replace, which stops after the first match) keeps this in
    # line with the plain str.replace("?", "H") used by export_mztab.
    if "adduct" in present:
        ion_series = df.get_column("adduct").str.replace_all(r"\?", "H")
        if "iso" in present:
            ion_expr = pl.col("adduct").str.replace_all(r"\?", "H")
            # annotation = adduct for iso == 0, adduct + " +{iso}" otherwise
            annotation_series = df.select(
                pl.when(pl.col("iso") == 0)
                .then(ion_expr)
                .otherwise(ion_expr + " +" + pl.col("iso").cast(pl.Utf8))
                .alias("annotation")
            ).get_column("annotation")
        else:
            annotation_series = ion_series
    else:
        ion_series = constant("")
        annotation_series = constant("")

    # Identifier columns fall back to a 1-based running index.
    if "feature_id" in present:
        feature_ids = df.get_column("feature_id")
    else:
        feature_ids = pl.Series(range(1, n_rows + 1))
    main_ids = df.get_column("main_id") if "main_id" in present else feature_ids

    if "rt_end" in present and "rt_start" in present:
        peak_width = df.get_column("rt_end") - df.get_column("rt_start")
    else:
        peak_width = constant(None)

    if "ms2_scans" in present and df["ms2_scans"].dtype == pl.List:
        ms2_count = df.get_column("ms2_scans").list.len()
    else:
        ms2_count = constant(None)

    mz = column("mz")
    rt = column("rt")
    rt_low = column("rt_start", "rt")
    rt_high = column("rt_end", "rt")
    height = column("height")
    inty = column("inty")
    sn = column("sn")
    single_sample = constant(1)  # one sample: every feature is detected exactly once

    # Column order follows the SLAW specification; do not reorder.
    slaw_data = {
        "feature_id": feature_ids,
        "mz": mz,
        "rt": rt,
        "group": group_series,
        "annotation": annotation_series,
        "neutral_mass": column("adduct_neutral_mass"),
        "charge": charge_series,
        "main_id": main_ids,
        "ion": ion_series,
        "iso": column("iso", default=0),
        "clique": column("clique"),
        "num_detection": single_sample,
        "total_detection": single_sample,
        "mz_mean": mz,
        "mz_min": mz,
        "mz_max": mz,
        "rt_mean": rt,
        "rt_min": rt_low,
        "rt_max": rt_high,
        "rt_cor_mean": rt,
        "rt_cor_min": rt_low,
        "rt_cor_max": rt_high,
        "height_mean": height,
        "height_min": height,
        "height_max": height,
        "intensity_mean": inty,
        "intensity_min": inty,
        "intensity_max": inty,
        "SN_mean": sn,
        "SN_min": sn,
        "SN_max": sn,
        "peakwidth_mean": peak_width,
        "peakwidth_min": peak_width,
        "peakwidth_max": peak_width,
        "ms2_mgf_id": column("ms2_mgf_id", default=""),
        "ms2_num_fused": ms2_count,
        "ms2_source": column("ms2_source", default=""),
        "isotopic_pattern_annot": column("isotopic_pattern_annot", default=""),
        "isotopic_pattern_rel": column("isotopic_pattern_rel", default=""),
        "isotopic_pattern_abs": column("isotopic_pattern_abs", default=""),
        quant_column_name: inty,
    }

    slaw_df = pl.DataFrame(slaw_data)

    # The CSV is written through pandas: comma separator, quoting only where needed.
    pandas_df = slaw_df.to_pandas()
    try:
        pandas_df.to_csv(filename, sep=",", index=False, quoting=csv.QUOTE_MINIMAL)
    except PermissionError:
        self.logger.error(f"Permission denied: Cannot write to {filename}. The file may be open in another program. Please close it and try again.")
    else:
        self.logger.success(f"Features exported to {filename} (SLAW format)")
        self.logger.debug(f"Exported {len(slaw_df)} features with {len(slaw_df.columns)} columns")
995
+
996
+
860
997
  def export_chrom(self, filename="chrom.csv"):
861
998
  # saves self.chrom_df to a csv file. Remove the scan_uid and chrom columns if the file already exists
862
999
  if self.chrom_df is None:
@@ -872,3 +1009,589 @@ def export_chrom(self, filename="chrom.csv"):
872
1009
  if "chrom" in data.columns:
873
1010
  data = data.drop("chrom")
874
1011
  data.to_csv(filename, index=False)
1012
+
1013
+
1014
def _mztab_safe_str(value, default="null"):
    """Convert value to string, replacing None and blank strings with *default*."""
    if value is None:
        return default
    text = str(value)
    return text if text.strip() != "" else default


def _mztab_sample_name(sample):
    """Pick the mzTab-id: the sample label, else the data file stem, else 'sample'."""
    label = getattr(sample, "label", None)
    if label:
        return label
    file_path = getattr(sample, "file_path", None)
    if file_path:
        return os.path.splitext(os.path.basename(file_path))[0]
    return "sample"


def _mztab_reliability(score):
    """Map an identification score to the reliability code written to the SML table."""
    if score is None:
        return "4"  # Unknown compound
    if score >= 0.8:
        return "2a"  # High confidence compound match
    if score >= 0.5:
        return "2b"  # Moderate confidence match
    if score >= 0.2:
        return "3"  # Compound class level
    return "4"  # Unknown compound


def _mztab_load_identifications(sample):
    """Return (all identifications, top-scoring identification per feature_uid).

    Best effort: any failure while retrieving identifications is logged at debug
    level and reported as ``(None, None)`` so the export proceeds without them.
    """
    import polars as pl

    try:
        from masster.sample.id import get_id

        full_id_data = get_id(sample)
        if full_id_data is None or full_id_data.is_empty():
            sample.logger.info("No identification data available for mzTab export")
            return full_id_data, None
        top_id_data = (
            full_id_data.group_by("feature_uid")
            .agg(pl.all().sort_by("score", descending=True).first())
            .sort("feature_uid")
        )
        return full_id_data, top_id_data
    except Exception as e:
        sample.logger.debug(f"Could not retrieve identification data: {e}")
        return None, None


def _mztab_collect_mgf(sample):
    """Collect one record per non-empty MS2 spectrum plus a feature_uid -> MGF index map."""
    import polars as pl

    records = []
    mapping: dict[int, list[int]] = {}
    for feature_row in sample.features_df.iter_rows(named=True):
        feature_uid = feature_row["feature_uid"]
        feature_id = feature_row.get("feature_id", feature_uid)

        ms2_scans = feature_row.get("ms2_scans")
        if ms2_scans is None:
            continue
        if not isinstance(ms2_scans, list):
            ms2_scans = [ms2_scans]

        for scan_uid in ms2_scans:
            spec = sample.get_spectrum(scan_uid)
            if spec is None or len(spec.mz) == 0:
                continue

            mgf_index = len(records) + 1  # MGF indexes are 1-based and sequential
            records.append(
                {
                    "mgf_index": mgf_index,
                    "feature_uid": feature_uid,
                    "feature_id": feature_id,
                    "rtinseconds": feature_row.get("rt", 0),
                    "pepmass": feature_row.get("mz", 0),
                    "energy": spec.energy if hasattr(spec, "energy") else 0,
                    "mslevel": spec.ms_level if hasattr(spec, "ms_level") else 2,
                    "title": f"uid:{feature_uid}, rt:{feature_row.get('rt', 0):.2f}, mz:{feature_row.get('mz', 0):.4f}",
                    "spec_mz": spec.mz,
                    "spec_int": spec.inty,
                    "spec_len": len(spec.mz),
                },
            )
            mapping.setdefault(feature_uid, []).append(mgf_index)

    return (pl.DataFrame(records) if records else None), mapping


def _mztab_metadata_lines(sample, version, has_library):
    """Build the COM/MTD metadata section for a single-sample mzTab-M file."""
    sample_name = _mztab_sample_name(sample)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if has_library:
        database_lines = [
            'MTD\tdatabase[1]\t[, , "compound library", ]',
            "MTD\tdatabase[1]-prefix\tcmpd",
        ]
    else:
        database_lines = [
            'MTD\tdatabase[1]\t[, , "PubChem", ]',
            "MTD\tdatabase[1]-prefix\tCID",
        ]
    database_lines += [
        "MTD\tdatabase[1]-version\tUnknown",
        "MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/",
    ]

    if getattr(sample, "polarity", "positive") in ["negative", "neg"]:
        scan_polarity_cv = "[MS, MS:1000129, negative scan, ]"
    else:
        scan_polarity_cv = "[MS, MS:1000130, positive scan, ]"

    # Empty strings and leading "\n" produce the blank separator lines of the layout.
    return [
        f"COM\tfile generated by MASSter {version} on {timestamp}",
        "\nMTD\tmzTab-version\t2.2.0-M",
        f"MTD\tmzTab-id\t{sample_name}",
        "",
        # CV definitions
        "MTD\tcv[1]-label\tMS",
        "MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary",
        "MTD\tcv[1]-version\t4.1.199",
        "MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo",
        "",
        # Quantification units and identification confidence
        "MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
        "MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
        "MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
        "",
        # Software
        "MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]",
        f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {version}]",
        "MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]",
        "",
        *database_lines,
        # Single sample metadata
        f"\nMTD\tsample[1]\t{sample_name}",
        f"MTD\tsample[1]-description\t{sample_name}",
        "MTD\tms_run[1]-location\tfile://unknown",
        f"MTD\tms_run[1]-scan_polarity\t{scan_polarity_cv}",
        "MTD\tassay[1]\tAssay_1",
        "MTD\tassay[1]-sample_ref\tsample[1]",
        "MTD\tassay[1]-ms_run_ref\tms_run[1]",
        "",
        "MTD\tstudy_variable[1]\tundefined",
        "MTD\tstudy_variable[1]-assay_refs\tassay[1]",
        "MTD\tstudy_variable[1]-description\tSingle sample",
    ]


def _mztab_sml_lines(sample, adducts, top_id_data, mgf_mapping):
    """Build the SML (small molecule) table: one row per feature with its best identification."""
    header = [
        "SMH",
        "SML_ID",
        "SMF_ID_REFS",
        "database_identifier",
        "chemical_formula",
        "smiles",
        "inchi",
        "chemical_name",
        "uri",
        "theoretical_neutral_mass",
        "adduct_ions",
        "reliability",
        "best_id_confidence_measure",
        "best_id_confidence_value",
        "opt_global_mgf_index",
        "abundance_assay[1]",
        "abundance_study_variable[1]",
        "abundance_variation_study_variable[1]",
    ]
    lines = ["\t".join(header)]

    # top_id_data holds one row per feature_uid, so index it once instead of
    # filtering the frame for every feature.
    best_by_uid = {}
    if top_id_data is not None:
        best_by_uid = {
            id_row["feature_uid"]: id_row
            for id_row in top_id_data.iter_rows(named=True)
            if id_row["feature_uid"] is not None
        }

    for idx, row in enumerate(sample.features_df.iter_rows(named=True), 1):
        feature_uid = row["feature_uid"]
        id_info = best_by_uid.get(feature_uid)

        database_identifier = "null"
        chemical_formula = "null"
        smiles_val = "null"
        inchi_val = "null"
        chemical_name = "null"
        best_id_confidence_measure = "null"
        best_id_confidence_value = "null"
        reliability = "4"  # Default: unknown compound
        theoretical_neutral_mass = "null"

        if id_info:
            # Use cmpd_uid as database identifier with prefix
            if id_info.get("cmpd_uid") is not None:
                database_identifier = f"cmpd:{id_info['cmpd_uid']}"

            chemical_formula = _mztab_safe_str(id_info.get("formula"))
            smiles_val = _mztab_safe_str(id_info.get("smiles"))
            inchi_val = _mztab_safe_str(id_info.get("inchi"))
            chemical_name = _mztab_safe_str(id_info.get("name"))

            if id_info.get("neutral_mass") is not None:
                theoretical_neutral_mass = _mztab_safe_str(id_info["neutral_mass"])
            elif id_info.get("mass") is not None:
                theoretical_neutral_mass = _mztab_safe_str(id_info["mass"])

            if id_info.get("matcher") is not None:
                best_id_confidence_measure = f"[MS, MS:1002888, {id_info['matcher']}, ]"

            score = id_info.get("score")
            best_id_confidence_value = _mztab_safe_str(score)
            reliability = _mztab_reliability(score)

        mgf_indexes = mgf_mapping.get(feature_uid, [])
        abundance_str = _mztab_safe_str(row.get("inty"))

        lines.append(
            "\t".join(
                [
                    "SML",
                    str(idx),
                    str(idx),  # SMF_ID_REFS - same as SML_ID for single features
                    database_identifier,
                    chemical_formula,
                    smiles_val,
                    inchi_val,
                    chemical_name,
                    _mztab_safe_str(row.get("uri")),
                    theoretical_neutral_mass,
                    adducts[idx - 1],
                    reliability,
                    best_id_confidence_measure,
                    best_id_confidence_value,
                    ",".join(map(str, mgf_indexes)) if mgf_indexes else "null",
                    abundance_str,  # abundance_assay[1]
                    abundance_str,  # abundance_study_variable[1] (same for single sample)
                    "null",  # abundance_variation_study_variable[1] (no variation for single sample)
                ],
            ),
        )
    return lines


def _mztab_smf_lines(sample, adducts, full_id_data):
    """Build the SMF (small molecule feature) table, referencing SME rows by generated id."""
    import polars as pl

    header = [
        "SFH",
        "SMF_ID",
        "SME_ID_REFS",
        "SME_ID_REF_ambiguity_code",
        "adduct_ion",
        "isotopomer",
        "exp_mass_to_charge",
        "charge",
        "retention_time_in_seconds",
        "retention_time_in_seconds_start",
        "retention_time_in_seconds_end",
        "abundance_assay[1]",
        "abundance_study_variable[1]",
        "abundance_variation_study_variable[1]",
    ]
    lines = ["\t".join(header)]
    has_ids = full_id_data is not None and not full_id_data.is_empty()

    for idx, row in enumerate(sample.features_df.iter_rows(named=True), 1):
        feature_uid = row["feature_uid"]

        sme_refs = "null"
        sme_ambiguity = "null"
        if has_ids:
            matches = full_id_data.filter(pl.col("feature_uid") == feature_uid)
            if matches.height > 0:
                # Same id scheme as the SME table: feature_uid * 1000 + rank (1-based).
                sme_ids = [str(feature_uid * 1000 + rank) for rank in range(1, matches.height + 1)]
                sme_refs = "|".join(sme_ids)
                if len(sme_ids) > 1:
                    candidates = (match.get("cmpd_uid") for match in matches.iter_rows(named=True))
                    unique_cmpds = {cmpd for cmpd in candidates if cmpd is not None}
                    # "1": ambiguous identification; "2": multiple evidence for same molecule
                    sme_ambiguity = "1" if len(unique_cmpds) > 1 else "2"

        iso_value = row.get("iso", 0)
        if iso_value is not None and round(iso_value) != 0:
            isotopomer = f'[MS,MS:1002957,"isotopomer MS peak","+{round(iso_value)}"]'
        else:
            isotopomer = "null"

        abundance_str = _mztab_safe_str(row.get("inty"))

        lines.append(
            "\t".join(
                [
                    "SMF",
                    str(idx),
                    sme_refs,
                    sme_ambiguity,
                    adducts[idx - 1],  # adduct_ion
                    isotopomer,
                    _mztab_safe_str(row.get("mz")),  # exp_mass_to_charge
                    _mztab_safe_str(row.get("charge")),
                    _mztab_safe_str(row.get("rt")),  # retention_time_in_seconds
                    _mztab_safe_str(row.get("rt_start")),
                    _mztab_safe_str(row.get("rt_end")),
                    abundance_str,  # abundance_assay[1]
                    abundance_str,  # abundance_study_variable[1]
                    "null",  # abundance_variation_study_variable[1]
                ],
            ),
        )
    return lines


def _mztab_sme_lines(sample, full_id_data):
    """Build the SME (small molecule evidence) table: one row per identification, ranked by score."""
    import polars as pl

    header = [
        "SEH",
        "SME_ID",
        "evidence_input_id",
        "database_identifier",
        "chemical_formula",
        "smiles",
        "inchi",
        "chemical_name",
        "uri",
        "derivatized_form",
        "adduct_ion",
        "exp_mass_to_charge",
        "charge",
        "theoretical_mass_to_charge",
        "spectra_ref",
        "identification_method",
        "ms_level",
        "id_confidence_measure[1]",
        "rank",
    ]
    lines = [
        "COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
        "\t".join(header),
    ]

    features = sample.features_df
    for feature_uid in features.select("feature_uid").to_series().unique():
        feature_data = features.filter(pl.col("feature_uid") == feature_uid)
        if feature_data.height == 0:
            continue
        feature_row = feature_data.row(0, named=True)

        matches = full_id_data.filter(pl.col("feature_uid") == feature_uid)
        if matches.height == 0:
            continue
        matches = matches.sort("score", descending=True)

        # These values depend only on the feature, not on the individual match.
        feature_mz = feature_row.get("mz", 0)
        feature_rt = feature_row.get("rt", 0)
        feature_id = feature_row.get("feature_id", feature_uid)
        evidence_id = f"feature_uid={feature_uid}:feature_id={feature_id}:mz={feature_mz:.4f}:rt={feature_rt:.2f}"
        exp_mz = _mztab_safe_str(feature_mz)
        charge = _mztab_safe_str(feature_row.get("charge", "1"))

        for rank, sme_row in enumerate(matches.iter_rows(named=True), 1):
            sme_id = feature_uid * 1000 + rank

            db_id = "null"
            if sme_row.get("db_id") is not None and sme_row["db_id"] != "":
                db_id = _mztab_safe_str(sme_row["db_id"])
            elif sme_row.get("cmpd_uid") is not None:
                db_id = f"cmpd:{sme_row['cmpd_uid']}"

            adduct_ion = "null"
            if sme_row.get("adduct") is not None and sme_row["adduct"] != "":
                adduct_ion = _mztab_safe_str(sme_row["adduct"]).replace("?", "H")

            matcher = sme_row.get("matcher")
            if matcher is not None:
                id_method = f"[MS, MS:1002888, {matcher}, ]"
            else:
                id_method = "[MS, MS:1002888, small molecule confidence measure, ]"

            # A missing matcher is reported as MS level 2 instead of failing on None.
            if matcher is not None and "ms1" in str(matcher).lower():
                ms_level = "[MS, MS:1000511, ms level, 1]"
            else:
                ms_level = "[MS,MS:1000511, ms level, 2]"

            lines.append(
                "\t".join(
                    [
                        "SME",
                        str(sme_id),
                        evidence_id,
                        db_id,
                        _mztab_safe_str(sme_row.get("formula")),
                        _mztab_safe_str(sme_row.get("smiles")),
                        _mztab_safe_str(sme_row.get("inchi")),
                        _mztab_safe_str(sme_row.get("name")),
                        "null",  # uri
                        "null",  # derivatized_form
                        adduct_ion,
                        exp_mz,
                        charge,
                        _mztab_safe_str(sme_row.get("mz")),  # theoretical_mass_to_charge
                        "ms_run[1]:spectrum=0",  # spectra_ref (dummy placeholder)
                        id_method,
                        ms_level,
                        _mztab_safe_str(sme_row.get("score")),
                        str(rank),
                    ],
                ),
            )
    return lines


def _mztab_mgf_lines(mgf_data):
    """Build the COM-prefixed MGF table with pipe-separated peak lists."""

    def text_or_null(value):
        return str(value) if value is not None else "null"

    header = [
        "COM",
        "MGH",
        "mgf_id",
        "prec_id",
        "prec_rt",
        "prec_mz",
        "prec_int",
        "energy",
        "level",
        "title",
        "spec_tic",
        "spec_len",
        "spec_mz",
        "spec_int",
    ]
    lines = ["\t".join(header)]

    for row in mgf_data.iter_rows(named=True):
        spectrum_mz = row["spec_mz"]
        spectrum_inty = row["spec_int"]
        spec_tic = sum(spectrum_inty) if spectrum_inty else 0
        spec_len = row["spec_len"] if row["spec_len"] is not None else 0

        spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
        spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""

        lines.append(
            "\t".join(
                [
                    "COM",
                    "MGF",
                    text_or_null(row["mgf_index"]),
                    text_or_null(row["feature_id"]),
                    f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
                    f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
                    "null",  # prec_int
                    text_or_null(row["energy"]),
                    text_or_null(row["mslevel"]),
                    text_or_null(row["title"]),
                    f"{int(spec_tic)}" if spec_tic > 0 else "null",
                    str(spec_len) if spec_len > 0 else "null",
                    spec_mz_str if spec_mz_str else "null",
                    spec_int_str if spec_int_str else "null",
                ],
            ),
        )
    return lines


def export_mztab(self, filename=None, title=None, description=None, include_mgf=False, **kwargs):
    """
    Export the sample as an mzTab-M file.

    The file consists of a metadata (MTD) section, a small-molecule (SML) table
    and a small-molecule-feature (SMF) table. A small-molecule-evidence (SME)
    table is added when identification data is available, and a COM-prefixed
    MGF table when ``include_mgf`` is set and at least one non-empty MS2
    spectrum exists. All sections are built in memory and written in one pass,
    so a failure while building them leaves no partial file behind.

    Args:
        filename (str, optional): Path to the output mzTab-M file. Defaults to "sample.mztab".
        title (str, optional): Accepted for API compatibility; currently not written to the file.
        description (str, optional): Accepted for API compatibility; currently not written to the file.
        include_mgf (bool, optional): Include MGF table with MS2 spectra. Defaults to False.
        **kwargs: Accepted for API compatibility; currently ignored.

    Returns:
        None. If ``features_df`` is None, a warning is logged and nothing is written.
    """
    if self.features_df is None:
        self.logger.warning("No features found. Cannot export to mzTab-M format.")
        return

    from masster._version import __version__

    if filename is None:
        filename = "sample.mztab"
    if not os.path.isabs(filename):
        filename = os.path.abspath(filename)

    full_id_data, top_id_data = _mztab_load_identifications(self)

    # MGF records are only collected on request; the mapping stays empty otherwise.
    if include_mgf:
        mgf_data, mgf_mapping = _mztab_collect_mgf(self)
    else:
        mgf_data, mgf_mapping = None, {}

    has_library = (
        full_id_data is not None
        and hasattr(self, "lib_df")
        and self.lib_df is not None
        and not self.lib_df.is_empty()
    )

    # Adduct per feature, in features_df row order, with "?" written as "H".
    adducts = [
        "null" if row.get("adduct") is None else str(row["adduct"]).replace("?", "H")
        for row in self.features_df.iter_rows(named=True)
    ]

    sections = [
        _mztab_metadata_lines(self, __version__, has_library),
        _mztab_sml_lines(self, adducts, top_id_data, mgf_mapping),
        _mztab_smf_lines(self, adducts, full_id_data),
    ]
    if full_id_data is not None and not full_id_data.is_empty():
        sections.append(_mztab_sme_lines(self, full_id_data))
    if mgf_data is not None and len(mgf_data) > 0:
        sections.append(_mztab_mgf_lines(mgf_data))

    # Sections are separated by a single blank line.
    with open(filename, "w", encoding="utf-8") as f:
        for position, section in enumerate(sections):
            if position:
                f.write("\n")
            f.writelines(line + "\n" for line in section)

    self.logger.success(f"Exported mzTab-M to {filename}")