masster-0.4.0-py3-none-any.whl → masster-0.4.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
@@ -10,10 +10,10 @@ import pyopenms as oms

 from tqdm import tqdm

-from
-from
-from
-from
+from masster.chromatogram import Chromatogram
+from masster.study.defaults import fill_defaults
+from masster.sample.sample import Sample
+from masster.spectrum import Spectrum


 # Pre-import heavy modules to avoid repeated loading in add_sample()
@@ -94,9 +94,7 @@ def add(

     if len(files) > 0:
         # Limit files if max_files is specified
-        remaining_slots = (
-            max_files - counter if max_files is not None else len(files)
-        )
+        remaining_slots = max_files - counter if max_files is not None else len(files)
         files = files[:remaining_slots]

         self.logger.debug(f"Found {len(files)} {ext} files")
@@ -119,9 +117,7 @@ def add(

     # Batch process all files of this extension using ultra-optimized method
     if files_to_process:
-        self.logger.debug(
-            f"Batch processing {len(files_to_process)} {ext} files",
-        )
+        self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
         successful = self._add_samples_batch(
             files_to_process,
             reset=reset,
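For orientation, the collapsed ternary above caps each pass at the remaining file quota. A minimal runnable sketch with made-up values (`files`, `max_files`, and `counter` mirror the names in the hunk; the data is illustrative):

```python
# Illustrative values only; in masster these come from the study's add() call.
files = ["a.mzML", "b.mzML", "c.mzML"]
max_files = 2   # None would mean "no cap"
counter = 0     # files already added in earlier passes

remaining_slots = max_files - counter if max_files is not None else len(files)
files = files[:remaining_slots]
print(files)  # ['a.mzML', 'b.mzML']
```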
@@ -260,8 +256,7 @@ def _fill_chrom_single_impl(
     if min_number > 0:
         original_count = len(uids)
         uids = self.consensus_df.filter(
-            (pl.col("number_samples") >= min_number)
-            & (pl.col("consensus_uid").is_in(uids)),
+            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
         )["consensus_uid"].to_list()
         self.logger.debug(
             f"Features to fill: {original_count} -> {len(uids)}",
@@ -276,15 +271,13 @@ def _fill_chrom_single_impl(
     # Build lookup dictionaries
     self.logger.debug("Building lookup dictionaries...")
     consensus_info = {}
-    consensus_subset = self.consensus_df.select(
-        [
-            "consensus_uid",
-            "rt_start_mean",
-            "rt_end_mean",
-            "mz",
-            "rt",
-        ],
-    ).filter(pl.col("consensus_uid").is_in(uids))
+    consensus_subset = self.consensus_df.select([
+        "consensus_uid",
+        "rt_start_mean",
+        "rt_end_mean",
+        "mz",
+        "rt",
+    ]).filter(pl.col("consensus_uid").is_in(uids))

     for row in consensus_subset.iter_rows(named=True):
         consensus_info[row["consensus_uid"]] = {
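Several hunks in this file reshape the same Polars idiom: project a handful of columns, filter by UID membership, then iterate rows as dicts. A self-contained sketch with toy data (the column names follow the hunk; the values are invented):

```python
import polars as pl

consensus_df = pl.DataFrame({
    "consensus_uid": [1, 2, 3],
    "mz": [101.1, 202.2, 303.3],
    "rt": [10.0, 20.0, 30.0],
})

uids = [1, 3]
subset = consensus_df.select(["consensus_uid", "mz", "rt"]).filter(
    pl.col("consensus_uid").is_in(uids)
)

# iter_rows(named=True) yields one dict per row, as in the hunk above
consensus_info = {row["consensus_uid"]: row for row in subset.iter_rows(named=True)}
print(consensus_info)  # {1: {...}, 3: {...}}
```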
@@ -451,13 +444,11 @@ def _fill_chrom_single_impl(
             }

             new_features.append(new_feature)
-            new_mapping.append(
-                {
-                    "consensus_uid": consensus_uid,
-                    "sample_uid": sample_uid,
-                    "feature_uid": feature_uid,
-                },
-            )
+            new_mapping.append({
+                "consensus_uid": consensus_uid,
+                "sample_uid": sample_uid,
+                "feature_uid": feature_uid,
+            })
             counter += 1

     # Add new features to DataFrames
@@ -480,10 +471,7 @@ def _fill_chrom_single_impl(
     for row in rows_to_add:
         # Cast numeric columns to ensure consistency
         for key, value in row.items():
-            if (
-                key in ["mz", "rt", "intensity", "area", "height"]
-                and value is not None
-            ):
+            if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
                 row[key] = float(value)
             elif key in ["sample_id", "feature_id"] and value is not None:
                 row[key] = int(value)
@@ -530,7 +518,7 @@ def fill_single(self, **kwargs):
         min_samples_abs: Absolute minimum sample threshold (default: 2)
     """
     # parameters initialization
-    from
+    from masster.study.defaults import fill_defaults

     params = fill_defaults()

@@ -702,13 +690,11 @@ def _process_sample_for_parallel_fill(
         }

         new_features.append(new_feature)
-        new_mapping.append(
-            {
-                "consensus_uid": consensus_uid,
-                "sample_uid": sample_uid,
-                "feature_uid": feature_uid,
-            },
-        )
+        new_mapping.append({
+            "consensus_uid": consensus_uid,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+        })
         counter += 1

     return new_features, new_mapping, counter
@@ -754,8 +740,7 @@ def _fill_chrom_impl(
     if min_number > 0:
         original_count = len(uids)
         uids = self.consensus_df.filter(
-            (pl.col("number_samples") >= min_number)
-            & (pl.col("consensus_uid").is_in(uids)),
+            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
         )["consensus_uid"].to_list()
         self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")

@@ -782,15 +767,13 @@ def _fill_chrom_impl(
     # Build lookup dictionaries
     self.logger.debug("Building lookup dictionaries...")
     consensus_info = {}
-    consensus_subset = self.consensus_df.select(
-        [
-            "consensus_uid",
-            "rt_start_mean",
-            "rt_end_mean",
-            "mz",
-            "rt",
-        ],
-    ).filter(pl.col("consensus_uid").is_in(uids))
+    consensus_subset = self.consensus_df.select([
+        "consensus_uid",
+        "rt_start_mean",
+        "rt_end_mean",
+        "mz",
+        "rt",
+    ]).filter(pl.col("consensus_uid").is_in(uids))

     for row in consensus_subset.iter_rows(named=True):
         consensus_info[row["consensus_uid"]] = {
@@ -807,13 +790,11 @@ def _fill_chrom_impl(
     for row in self.samples_df.filter(
         pl.col("sample_uid").is_in(unique_sample_uids),
     ).iter_rows(named=True):
-        samples_to_process.append(
-            {
-                "sample_name": row["sample_name"],
-                "sample_uid": row["sample_uid"],
-                "sample_path": row["sample_path"],
-            },
-        )
+        samples_to_process.append({
+            "sample_name": row["sample_name"],
+            "sample_uid": row["sample_uid"],
+            "sample_path": row["sample_path"],
+        })

     total_missing = len(missing_combinations_df)
     self.logger.debug(
@@ -821,9 +802,7 @@ def _fill_chrom_impl(
     )

     # Calculate current max feature_uid to avoid conflicts
-    features_df_max_uid = (
-        self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
-    )
+    features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0

     # Process samples in parallel
     all_new_features: list[dict] = []
@@ -897,10 +876,7 @@ def _fill_chrom_impl(
     for row in rows_to_add:
         # Cast numeric columns to ensure consistency
         for key, value in row.items():
-            if (
-                key in ["mz", "rt", "intensity", "area", "height"]
-                and value is not None
-            ):
+            if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
                 row[key] = float(value)
             elif key in ["sample_id", "feature_id"] and value is not None:
                 row[key] = int(value)
@@ -949,10 +925,7 @@ def fill(self, **kwargs):
     """
     # parameters initialization
     params = fill_defaults()
-    num_workers = kwargs.get(
-        "num_workers",
-        4,
-    )  # Default parameter not in defaults class
+    num_workers = kwargs.get("num_workers", 4)  # Default parameter not in defaults class

     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
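fill() seeds a defaults object and then lets keyword arguments override it, with num_workers handled outside the defaults class. A rough sketch of that pattern, assuming a dataclass-like fill_defaults and an attribute-based override (the field names and the setattr branch are illustrative, not masster's exact implementation):

```python
from dataclasses import dataclass

@dataclass
class fill_defaults:
    min_samples_rel: float = 0.5
    min_samples_abs: int = 2

def fill(**kwargs):
    params = fill_defaults()
    num_workers = kwargs.get("num_workers", 4)  # not part of the defaults object
    for key, value in kwargs.items():
        if isinstance(value, fill_defaults):
            params = value  # a complete defaults object replaces params wholesale
        elif hasattr(params, key):
            setattr(params, key, value)  # individual keys override single fields
    return params, num_workers

print(fill(min_samples_abs=3, num_workers=8))
```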
@@ -1015,9 +988,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
         .agg(pl.count("feature_uid").alias("count"))
     )

-    total_existing = (
-        consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
-    )
+    total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0

     # If >95% filled, likely no gaps (common case)
     if total_existing >= total_possible * 0.95:
@@ -1036,12 +1007,10 @@ def _get_missing_consensus_sample_combinations(self, uids):

         # Get existing combinations for target UIDs only
         existing_by_consensus = {}
-        for consensus_uid, feature_uid in self.consensus_mapping_df.select(
-            [
-                "consensus_uid",
-                "feature_uid",
-            ],
-        ).iter_rows():
+        for consensus_uid, feature_uid in self.consensus_mapping_df.select([
+            "consensus_uid",
+            "feature_uid",
+        ]).iter_rows():
             if consensus_uid in uids_set and feature_uid in feature_to_sample:
                 if consensus_uid not in existing_by_consensus:
                     existing_by_consensus[consensus_uid] = set()
@@ -1049,9 +1018,7 @@ def _get_missing_consensus_sample_combinations(self, uids):

         # Get sample info once
         all_samples = list(
-            self.samples_df.select(
-                ["sample_uid", "sample_name", "sample_path"],
-            ).iter_rows(),
+            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows(),
         )

         # Check for missing combinations
@@ -1059,17 +1026,13 @@ def _get_missing_consensus_sample_combinations(self, uids):
             existing_samples = existing_by_consensus.get(consensus_uid, set())
             for sample_uid, sample_name, sample_path in all_samples:
                 if sample_uid not in existing_samples:
-                    missing_combinations.append(
-                        (consensus_uid, sample_uid, sample_name, sample_path),
-                    )
+                    missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))

         return missing_combinations

     else:
         # For studies with many gaps, use bulk operations
-        self.logger.debug(
-            f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization",
-        )
+        self.logger.debug(f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization")

         # Build efficient lookups
         uids_set = set(uids)
@@ -1080,20 +1043,16 @@ def _get_missing_consensus_sample_combinations(self, uids):
         # Build existing combinations set
         existing_combinations = {
             (consensus_uid, feature_to_sample[feature_uid])
-            for consensus_uid, feature_uid in self.consensus_mapping_df.select(
-                [
-                    "consensus_uid",
-                    "feature_uid",
-                ],
-            ).iter_rows()
+            for consensus_uid, feature_uid in self.consensus_mapping_df.select([
+                "consensus_uid",
+                "feature_uid",
+            ]).iter_rows()
             if consensus_uid in uids_set and feature_uid in feature_to_sample
         }

         # Get all sample info
         all_samples = list(
-            self.samples_df.select(
-                ["sample_uid", "sample_name", "sample_path"],
-            ).iter_rows(),
+            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows(),
         )

         # Generate all missing combinations
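Both branches above compute the same thing: every (consensus, sample) pair with no mapped feature is a gap that fill() must close. A self-contained sketch of the bulk variant with toy data:

```python
# feature_uid -> sample_uid, plus (consensus_uid, feature_uid) mappings; toy data
feature_to_sample = {11: "s1", 12: "s2", 13: "s1"}
mapping = [(1, 11), (1, 12), (2, 13)]
all_samples = ["s1", "s2"]
uids = [1, 2]

existing_combinations = {
    (consensus_uid, feature_to_sample[feature_uid])
    for consensus_uid, feature_uid in mapping
    if feature_uid in feature_to_sample
}
missing = [
    (uid, sample)
    for uid in uids
    for sample in all_samples
    if (uid, sample) not in existing_combinations
]
print(missing)  # [(2, 's2')]: consensus 2 has no feature in sample s2
```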
@@ -1162,10 +1121,7 @@ def sanitize(self):
         for ms2_specs in row_data["ms2_specs"]:
             if not isinstance(ms2_specs, Spectrum):
                 try:
-                    new_ms2_specs = Spectrum(
-                        mz=np.array([0]),
-                        inty=np.array([0]),
-                    )
+                    new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
                     if hasattr(ms2_specs, "__dict__"):
                         new_ms2_specs.from_dict(ms2_specs.__dict__)
                     else:
@@ -1204,8 +1160,8 @@ def sanitize(self):
 def load_features(self):
     """
     Load features by reconstructing FeatureMaps from the processed features_df data.
-
-    This ensures that the loaded FeatureMaps contain the same processed features
+
+    This ensures that the loaded FeatureMaps contain the same processed features
     as stored in features_df, rather than loading raw features from .featureXML files
     which may not match the processed data after filtering, alignment, etc.
     """
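sanitize() swaps anything that is not a proper Spectrum for a fresh placeholder and copies the old state across. A sketch of that repair pattern with a stand-in Spectrum class; only the behavior visible in the hunk is modeled, and from_dict is assumed to copy attributes:

```python
import numpy as np

class Spectrum:  # stand-in for masster.spectrum.Spectrum
    def __init__(self, mz, inty):
        self.mz = mz
        self.inty = inty

    def from_dict(self, d):
        self.__dict__.update(d)  # assumed behavior: bulk-copy attributes

class LegacySpectrum:  # a deserialized object of the wrong type
    def __init__(self):
        self.mz = np.array([100.0, 200.0])
        self.inty = np.array([1.0, 2.0])

ms2_specs = LegacySpectrum()
if not isinstance(ms2_specs, Spectrum):
    new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
    if hasattr(ms2_specs, "__dict__"):
        new_ms2_specs.from_dict(ms2_specs.__dict__)
    ms2_specs = new_ms2_specs

print(ms2_specs.mz)  # the legacy payload, now carried by a proper Spectrum
```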
@@ -1213,25 +1169,25 @@ def load_features(self):
     import pyopenms as oms
     from tqdm import tqdm
     from datetime import datetime
-
+
     self.features_maps = []
-
+
     # Check if features_df exists and is not empty
     if self.features_df is None:
         self.logger.warning("features_df is None. Falling back to XML loading.")
         self._load_features_from_xml()
         return
-
+
     if len(self.features_df) == 0:
         self.logger.warning("features_df is empty. Falling back to XML loading.")
         self._load_features_from_xml()
         return
-
+
     # If we get here, we should use the new method
     self.logger.debug("Reconstructing FeatureMaps from features_df.")
-
+
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     # Process each sample in order
     for sample_index, row_dict in tqdm(
         enumerate(self.samples_df.iter_rows(named=True)),
@@ -1239,39 +1195,37 @@ def load_features(self):
         desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Reconstruct FeatureMaps from DataFrame",
         disable=tdqm_disable,
     ):
-        sample_uid = row_dict["sample_uid"]
-        sample_name = row_dict["sample_name"]
-
+        sample_uid = row_dict['sample_uid']
+        sample_name = row_dict['sample_name']
+
         # Get features for this sample from features_df
-        sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
-
+        sample_features = self.features_df.filter(pl.col('sample_uid') == sample_uid)
+
         # Create new FeatureMap
         feature_map = oms.FeatureMap()
-
+
         # Convert DataFrame features to OpenMS Features
         for feature_row in sample_features.iter_rows(named=True):
             feature = oms.Feature()
-
+
             # Set properties from DataFrame (handle missing values gracefully)
             try:
-                feature.setUniqueId(int(feature_row["feature_id"]))
-                feature.setMZ(float(feature_row["mz"]))
-                feature.setRT(float(feature_row["rt"]))
-                feature.setIntensity(float(feature_row["inty"]))
-                feature.setOverallQuality(float(feature_row["quality"]))
-                feature.setCharge(int(feature_row["charge"]))
-
+                feature.setUniqueId(int(feature_row['feature_id']))
+                feature.setMZ(float(feature_row['mz']))
+                feature.setRT(float(feature_row['rt']))
+                feature.setIntensity(float(feature_row['inty']))
+                feature.setOverallQuality(float(feature_row['quality']))
+                feature.setCharge(int(feature_row['charge']))
+
                 # Add to feature map
                 feature_map.push_back(feature)
             except (ValueError, TypeError) as e:
                 self.logger.warning(f"Skipping feature due to conversion error: {e}")
                 continue
-
+
         self.features_maps.append(feature_map)
-
-        self.logger.debug(
-            f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
-        )
+
+    self.logger.debug(f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.")
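The loop above maps DataFrame rows onto pyopenms Feature objects. A runnable sketch against a tiny hand-made table (column names follow the hunk; the values are invented):

```python
import polars as pl
import pyopenms as oms

features_df = pl.DataFrame({
    "feature_id": [1, 2],
    "mz": [150.05, 320.12],
    "rt": [12.3, 45.6],
    "inty": [1.0e5, 2.0e4],
    "quality": [0.9, 0.7],
    "charge": [1, 2],
})

feature_map = oms.FeatureMap()
for feature_row in features_df.iter_rows(named=True):
    feature = oms.Feature()
    feature.setUniqueId(int(feature_row["feature_id"]))
    feature.setMZ(float(feature_row["mz"]))
    feature.setRT(float(feature_row["rt"]))
    feature.setIntensity(float(feature_row["inty"]))
    feature.setOverallQuality(float(feature_row["quality"]))
    feature.setCharge(int(feature_row["charge"]))
    feature_map.push_back(feature)

print(feature_map.size())  # 2
```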
@@ -1326,14 +1280,7 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.logger.debug(f"Loaded consensus map from {filename}.")


-def _add_samples_batch(
-    self,
-    files,
-    reset=False,
-    adducts=None,
-    blacklist=None,
-    fast=True,
-):
+def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
     """
     Optimized batch addition of samples.

@@ -1356,9 +1303,7 @@ def _add_samples_batch(
     if blacklist is None:
         blacklist = set()

-    self.logger.debug(
-        f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...",
-    )
+    self.logger.debug(f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...")

     successful_additions = 0
     failed_additions = 0
@@ -1415,9 +1360,7 @@ def _add_samples_batch(
     # Color assignment done once for all samples
     self._sample_color_reset_optimized()

-    self.logger.debug(
-        f"Add samples complete: {successful_additions} successful, {failed_additions} failed",
-    )
+    self.logger.debug(f"Add samples complete: {successful_additions} successful, {failed_additions} failed")

     return successful_additions

@@ -1463,7 +1406,7 @@ def _add_sample_optimized(
     # Load sample
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
+
     # Try optimized loading first (study-specific, skips ms1_df for better performance)

     if file.endswith(".sample5"):
@@ -1499,7 +1442,7 @@ def _add_sample_optimized(
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
-        #
+        #self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
     else:
         if self.folder is not None:
             if not os.path.exists(self.folder):
@@ -1512,14 +1455,8 @@ def _add_sample_optimized(

     # Efficient scan counting
     ms1_count = ms2_count = 0
-    if (
-        hasattr(ddaobj, "scans_df")
-        and ddaobj.scans_df is not None
-        and not ddaobj.scans_df.is_empty()
-    ):
-        scan_counts = (
-            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
-        )
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
         ms_levels = scan_counts.get("ms_level", [])
         counts = scan_counts.get("len", [])
         for level, count in zip(ms_levels, counts):
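The flattened scan-counting idiom groups scans by MS level and reads the counts back as plain lists. A minimal sketch; the level-to-counter assignment in the loop body is an assumption, since the hunk ends at the zip():

```python
import polars as pl

scans_df = pl.DataFrame({"ms_level": [1, 2, 2, 1, 2]})

# group_by(...).len() yields columns "ms_level" and "len"
scan_counts = scans_df.group_by("ms_level").len().to_dict(as_series=False)

ms1_count = ms2_count = 0
for level, count in zip(scan_counts.get("ms_level", []), scan_counts.get("len", [])):
    if level == 1:
        ms1_count = count
    elif level == 2:
        ms2_count = count

print(ms1_count, ms2_count)  # 2 3
```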
@@ -1530,23 +1467,21 @@ def _add_sample_optimized(

     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-    new_sample = pl.DataFrame(
-        {
-            "sample_uid": [int(len(self.samples_df) + 1)],
-            "sample_name": [sample_name],
-            "sample_path": [final_sample_path],
-            "sample_type": [sample_type],
-            "map_id": [map_id_value],
-            "sample_source": [getattr(ddaobj, "file_source", file)],
-            "sample_color": [None],  # Will be set in batch at end
-            "sample_group": [""],
-            "sample_batch": [1],
-            "sample_sequence": [next_sequence],
-            "num_features": [int(ddaobj._oms_features_map.size())],
-            "num_ms1": [ms1_count],
-            "num_ms2": [ms2_count],
-        },
-    )
+    new_sample = pl.DataFrame({
+        "sample_uid": [int(len(self.samples_df) + 1)],
+        "sample_name": [sample_name],
+        "sample_path": [final_sample_path],
+        "sample_type": [sample_type],
+        "map_id": [map_id_value],
+        "sample_source": [getattr(ddaobj, "file_source", file)],
+        "sample_color": [None],  # Will be set in batch at end
+        "sample_group": [""],
+        "sample_batch": [1],
+        "sample_sequence": [next_sequence],
+        "num_features": [int(ddaobj._oms_features_map.size())],
+        "num_ms1": [ms1_count],
+        "num_ms2": [ms2_count],
+    })

     self.samples_df = pl.concat([self.samples_df, new_sample])

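New samples are registered by building a one-row DataFrame and concatenating it onto samples_df; the feature tables later use how="diagonal" so frames with differing columns are unioned and padded with nulls. A small sketch of the diagonal append:

```python
import polars as pl

samples_df = pl.DataFrame({"sample_uid": [1], "sample_name": ["blank"]})

new_sample = pl.DataFrame({
    "sample_uid": [2],
    "sample_name": ["QC_01"],
    "num_ms2": [1500],  # a column the existing table does not have yet
})

# how="diagonal" keeps the union of columns; missing cells become null
samples_df = pl.concat([samples_df, new_sample], how="diagonal")
print(samples_df)
```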
@@ -1588,9 +1523,7 @@ def _add_sample_optimized(
     # - No type casting loops
     # - No sample_color_reset()

-    self.logger.debug(
-        f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)",
-    )
+    self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)")
     return True

@@ -1634,7 +1567,7 @@ def _add_sample_standard(
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
     # Use standard loading method that loads all data including ms1_df
-
+
     if file.endswith(".sample5"):
         ddaobj.load(file)
         # restore _oms_features_map
@@ -1668,7 +1601,7 @@ def _add_sample_standard(
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
-        #
+        #self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
     else:
         if self.folder is not None:
             if not os.path.exists(self.folder):
@@ -1681,14 +1614,8 @@ def _add_sample_standard(

     # Efficient scan counting
     ms1_count = ms2_count = 0
-    if (
-        hasattr(ddaobj, "scans_df")
-        and ddaobj.scans_df is not None
-        and not ddaobj.scans_df.is_empty()
-    ):
-        scan_counts = (
-            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
-        )
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
         ms_levels = scan_counts.get("ms_level", [])
         counts = scan_counts.get("len", [])
         for level, count in zip(ms_levels, counts):
@@ -1699,23 +1626,21 @@ def _add_sample_standard(

     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-    new_sample = pl.DataFrame(
-        {
-            "sample_uid": [int(len(self.samples_df) + 1)],
-            "sample_name": [sample_name],
-            "sample_path": [final_sample_path],
-            "sample_type": [sample_type],
-            "map_id": [map_id_value],
-            "sample_source": [getattr(ddaobj, "file_source", file)],
-            "sample_color": [None],  # Will be set in batch at end
-            "sample_group": [""],
-            "sample_batch": [1],
-            "sample_sequence": [next_sequence],
-            "num_features": [int(ddaobj._oms_features_map.size())],
-            "num_ms1": [ms1_count],
-            "num_ms2": [ms2_count],
-        },
-    )
+    new_sample = pl.DataFrame({
+        "sample_uid": [int(len(self.samples_df) + 1)],
+        "sample_name": [sample_name],
+        "sample_path": [final_sample_path],
+        "sample_type": [sample_type],
+        "map_id": [map_id_value],
+        "sample_source": [getattr(ddaobj, "file_source", file)],
+        "sample_color": [None],  # Will be set in batch at end
+        "sample_group": [""],
+        "sample_batch": [1],
+        "sample_sequence": [next_sequence],
+        "num_features": [int(ddaobj._oms_features_map.size())],
+        "num_ms1": [ms1_count],
+        "num_ms2": [ms2_count],
+    })

     self.samples_df = pl.concat([self.samples_df, new_sample])

@@ -1750,9 +1675,7 @@ def _add_sample_standard(
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")

-    self.logger.debug(
-        f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
-    )
+    self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)")
     return True
     ## COMMENT AR: Is this intentional?
     # Use standard loading method that loads all data including ms1_df
@@ -1780,7 +1703,7 @@ def _add_sample_standard(
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
-        #
+        #self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
     else:
         if self.folder is not None:
             if not os.path.exists(self.folder):
@@ -1793,14 +1716,8 @@ def _add_sample_standard(

     # Efficient scan counting
     ms1_count = ms2_count = 0
-    if (
-        hasattr(ddaobj, "scans_df")
-        and ddaobj.scans_df is not None
-        and not ddaobj.scans_df.is_empty()
-    ):
-        scan_counts = (
-            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
-        )
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
         ms_levels = scan_counts.get("ms_level", [])
         counts = scan_counts.get("len", [])
         for level, count in zip(ms_levels, counts):
@@ -1811,23 +1728,21 @@ def _add_sample_standard(

     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-    new_sample = pl.DataFrame(
-        {
-            "sample_uid": [int(len(self.samples_df) + 1)],
-            "sample_name": [sample_name],
-            "sample_path": [final_sample_path],
-            "sample_type": [sample_type],
-            "map_id": [map_id_value],
-            "sample_source": [getattr(ddaobj, "file_source", file)],
-            "sample_color": [None],  # Will be set in batch at end
-            "sample_group": [""],
-            "sample_batch": [1],
-            "sample_sequence": [next_sequence],
-            "num_features": [int(ddaobj._oms_features_map.size())],
-            "num_ms1": [ms1_count],
-            "num_ms2": [ms2_count],
-        },
-    )
+    new_sample = pl.DataFrame({
+        "sample_uid": [int(len(self.samples_df) + 1)],
+        "sample_name": [sample_name],
+        "sample_path": [final_sample_path],
+        "sample_type": [sample_type],
+        "map_id": [map_id_value],
+        "sample_source": [getattr(ddaobj, "file_source", file)],
+        "sample_color": [None],  # Will be set in batch at end
+        "sample_group": [""],
+        "sample_batch": [1],
+        "sample_sequence": [next_sequence],
+        "num_features": [int(ddaobj._oms_features_map.size())],
+        "num_ms1": [ms1_count],
+        "num_ms2": [ms2_count],
+    })

     self.samples_df = pl.concat([self.samples_df, new_sample])

@@ -1862,9 +1777,7 @@ def _add_sample_standard(
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")

-    self.logger.debug(
-        f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
-    )
+    self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)")
     return True
