masster 0.4.4-py3-none-any.whl → 0.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/chromatogram.py +2 -2
- masster/data/libs/urine.csv +3 -3
- masster/logger.py +8 -8
- masster/sample/adducts.py +337 -263
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +557 -278
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +2 -2
- masster/sample/load.py +25 -11
- masster/sample/plot.py +5 -5
- masster/sample/processing.py +115 -85
- masster/sample/sample.py +28 -15
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +34 -11
- masster/spectrum.py +2 -2
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +354 -204
- masster/study/h5.py +557 -155
- masster/study/helpers.py +487 -194
- masster/study/id.py +536 -347
- masster/study/load.py +228 -138
- masster/study/plot.py +68 -68
- masster/study/processing.py +455 -253
- masster/study/save.py +14 -4
- masster/study/study.py +122 -40
- masster/study/study5_schema.json +149 -149
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/METADATA +5 -3
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/RECORD +34 -34
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/WHEEL +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/entry_points.txt +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/top_level.txt +0 -0
masster/study/load.py
CHANGED
@@ -94,7 +94,9 @@ def add(
 
         if len(files) > 0:
             # Limit files if max_files is specified
-            remaining_slots = max_files - counter if max_files is not None else len(files)
+            remaining_slots = (
+                max_files - counter if max_files is not None else len(files)
+            )
             files = files[:remaining_slots]
 
             self.logger.debug(f"Found {len(files)} {ext} files")
@@ -117,7 +119,9 @@ def add(
 
         # Batch process all files of this extension using ultra-optimized method
         if files_to_process:
-            self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
+            self.logger.debug(
+                f"Batch processing {len(files_to_process)} {ext} files",
+            )
             successful = self._add_samples_batch(
                 files_to_process,
                 reset=reset,
@@ -140,6 +144,9 @@ def add(
         )
     else:
         self.logger.debug(f"Successfully added {counter} samples to the study.")
+
+    # Return a simple summary to suppress marimo's automatic object display
+    return f"Added {counter} samples to study"
 
 
 # TODO type is not used
@@ -256,7 +263,8 @@ def _fill_chrom_single_impl(
     if min_number > 0:
         original_count = len(uids)
         uids = self.consensus_df.filter(
-            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
+            (pl.col("number_samples") >= min_number)
+            & (pl.col("consensus_uid").is_in(uids)),
         )["consensus_uid"].to_list()
         self.logger.debug(
             f"Features to fill: {original_count} -> {len(uids)}",
@@ -271,13 +279,15 @@ def _fill_chrom_single_impl(
     # Build lookup dictionaries
     self.logger.debug("Building lookup dictionaries...")
     consensus_info = {}
-    consensus_subset = self.consensus_df.select([
-        "consensus_uid",
-        "rt_start_mean",
-        "rt_end_mean",
-        "mz",
-        "rt",
-    ]).filter(pl.col("consensus_uid").is_in(uids))
+    consensus_subset = self.consensus_df.select(
+        [
+            "consensus_uid",
+            "rt_start_mean",
+            "rt_end_mean",
+            "mz",
+            "rt",
+        ],
+    ).filter(pl.col("consensus_uid").is_in(uids))
 
     for row in consensus_subset.iter_rows(named=True):
         consensus_info[row["consensus_uid"]] = {
@@ -444,11 +454,13 @@ def _fill_chrom_single_impl(
             }
 
             new_features.append(new_feature)
-            new_mapping.append({
-                "consensus_uid": consensus_uid,
-                "sample_uid": sample_uid,
-                "feature_uid": feature_uid,
-            })
+            new_mapping.append(
+                {
+                    "consensus_uid": consensus_uid,
+                    "sample_uid": sample_uid,
+                    "feature_uid": feature_uid,
+                },
+            )
             counter += 1
 
     # Add new features to DataFrames
@@ -471,7 +483,10 @@ def _fill_chrom_single_impl(
     for row in rows_to_add:
         # Cast numeric columns to ensure consistency
         for key, value in row.items():
-            if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+            if (
+                key in ["mz", "rt", "intensity", "area", "height"]
+                and value is not None
+            ):
                 row[key] = float(value)
             elif key in ["sample_id", "feature_id"] and value is not None:
                 row[key] = int(value)
@@ -690,11 +705,13 @@ def _process_sample_for_parallel_fill(
             }
 
             new_features.append(new_feature)
-            new_mapping.append({
-                "consensus_uid": consensus_uid,
-                "sample_uid": sample_uid,
-                "feature_uid": feature_uid,
-            })
+            new_mapping.append(
+                {
+                    "consensus_uid": consensus_uid,
+                    "sample_uid": sample_uid,
+                    "feature_uid": feature_uid,
+                },
+            )
             counter += 1
 
     return new_features, new_mapping, counter
@@ -740,7 +757,8 @@ def _fill_chrom_impl(
     if min_number > 0:
         original_count = len(uids)
         uids = self.consensus_df.filter(
-            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
+            (pl.col("number_samples") >= min_number)
+            & (pl.col("consensus_uid").is_in(uids)),
         )["consensus_uid"].to_list()
         self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
 
@@ -767,13 +785,15 @@ def _fill_chrom_impl(
     # Build lookup dictionaries
     self.logger.debug("Building lookup dictionaries...")
     consensus_info = {}
-    consensus_subset = self.consensus_df.select([
-        "consensus_uid",
-        "rt_start_mean",
-        "rt_end_mean",
-        "mz",
-        "rt",
-    ]).filter(pl.col("consensus_uid").is_in(uids))
+    consensus_subset = self.consensus_df.select(
+        [
+            "consensus_uid",
+            "rt_start_mean",
+            "rt_end_mean",
+            "mz",
+            "rt",
+        ],
+    ).filter(pl.col("consensus_uid").is_in(uids))
 
     for row in consensus_subset.iter_rows(named=True):
         consensus_info[row["consensus_uid"]] = {
@@ -790,11 +810,13 @@ def _fill_chrom_impl(
     for row in self.samples_df.filter(
         pl.col("sample_uid").is_in(unique_sample_uids),
     ).iter_rows(named=True):
-        samples_to_process.append({
-            "sample_name": row["sample_name"],
-            "sample_uid": row["sample_uid"],
-            "sample_path": row["sample_path"],
-        })
+        samples_to_process.append(
+            {
+                "sample_name": row["sample_name"],
+                "sample_uid": row["sample_uid"],
+                "sample_path": row["sample_path"],
+            },
+        )
 
     total_missing = len(missing_combinations_df)
     self.logger.debug(
@@ -802,7 +824,9 @@ def _fill_chrom_impl(
     )
 
     # Calculate current max feature_uid to avoid conflicts
-    features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
+    features_df_max_uid = (
+        self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
+    )
 
     # Process samples in parallel
     all_new_features: list[dict] = []
@@ -876,7 +900,10 @@ def _fill_chrom_impl(
     for row in rows_to_add:
         # Cast numeric columns to ensure consistency
         for key, value in row.items():
-            if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
+            if (
+                key in ["mz", "rt", "intensity", "area", "height"]
+                and value is not None
+            ):
                 row[key] = float(value)
             elif key in ["sample_id", "feature_id"] and value is not None:
                 row[key] = int(value)
@@ -925,7 +952,10 @@ def fill(self, **kwargs):
     """
     # parameters initialization
     params = fill_defaults()
-    num_workers = kwargs.get("num_workers", 4)  # Default parameter not in defaults class
+    num_workers = kwargs.get(
+        "num_workers",
+        4,
+    )  # Default parameter not in defaults class
 
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
@@ -988,7 +1018,9 @@ def _get_missing_consensus_sample_combinations(self, uids):
         .agg(pl.count("feature_uid").alias("count"))
     )
 
-    total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+    total_existing = (
+        consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+    )
 
     # If >95% filled, likely no gaps (common case)
     if total_existing >= total_possible * 0.95:
@@ -1007,10 +1039,12 @@ def _get_missing_consensus_sample_combinations(self, uids):
 
         # Get existing combinations for target UIDs only
        existing_by_consensus = {}
-        for consensus_uid, feature_uid in self.consensus_mapping_df.select([
-            "consensus_uid",
-            "feature_uid",
-        ]).iter_rows():
+        for consensus_uid, feature_uid in self.consensus_mapping_df.select(
+            [
+                "consensus_uid",
+                "feature_uid",
+            ],
+        ).iter_rows():
             if consensus_uid in uids_set and feature_uid in feature_to_sample:
                 if consensus_uid not in existing_by_consensus:
                     existing_by_consensus[consensus_uid] = set()
@@ -1018,7 +1052,9 @@ def _get_missing_consensus_sample_combinations(self, uids):
 
         # Get sample info once
         all_samples = list(
-            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows(),
+            self.samples_df.select(
+                ["sample_uid", "sample_name", "sample_path"],
+            ).iter_rows(),
         )
 
         # Check for missing combinations
@@ -1026,13 +1062,17 @@ def _get_missing_consensus_sample_combinations(self, uids):
             existing_samples = existing_by_consensus.get(consensus_uid, set())
             for sample_uid, sample_name, sample_path in all_samples:
                 if sample_uid not in existing_samples:
-                    missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))
+                    missing_combinations.append(
+                        (consensus_uid, sample_uid, sample_name, sample_path),
+                    )
 
         return missing_combinations
 
     else:
         # For studies with many gaps, use bulk operations
-        self.logger.debug(f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization")
+        self.logger.debug(
+            f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization",
+        )
 
         # Build efficient lookups
         uids_set = set(uids)
@@ -1043,16 +1083,20 @@ def _get_missing_consensus_sample_combinations(self, uids):
         # Build existing combinations set
         existing_combinations = {
             (consensus_uid, feature_to_sample[feature_uid])
-            for consensus_uid, feature_uid in self.consensus_mapping_df.select([
-                "consensus_uid",
-                "feature_uid",
-            ]).iter_rows()
+            for consensus_uid, feature_uid in self.consensus_mapping_df.select(
+                [
+                    "consensus_uid",
+                    "feature_uid",
+                ],
+            ).iter_rows()
             if consensus_uid in uids_set and feature_uid in feature_to_sample
         }
 
         # Get all sample info
         all_samples = list(
-            self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows(),
+            self.samples_df.select(
+                ["sample_uid", "sample_name", "sample_path"],
+            ).iter_rows(),
         )
 
         # Generate all missing combinations
@@ -1121,7 +1165,10 @@ def sanitize(self):
     for ms2_specs in row_data["ms2_specs"]:
         if not isinstance(ms2_specs, Spectrum):
             try:
-                new_ms2_specs = Spectrum(mz=np.array([0]), inty=np.array([0]))
+                new_ms2_specs = Spectrum(
+                    mz=np.array([0]),
+                    inty=np.array([0]),
+                )
                 if hasattr(ms2_specs, "__dict__"):
                     new_ms2_specs.from_dict(ms2_specs.__dict__)
                 else:
@@ -1160,8 +1207,8 @@ def sanitize(self):
 def load_features(self):
     """
     Load features by reconstructing FeatureMaps from the processed features_df data.
-
-    This ensures that the loaded FeatureMaps contain the same processed features
+
+    This ensures that the loaded FeatureMaps contain the same processed features
     as stored in features_df, rather than loading raw features from .featureXML files
     which may not match the processed data after filtering, alignment, etc.
     """
@@ -1169,25 +1216,25 @@ def load_features(self):
     import pyopenms as oms
     from tqdm import tqdm
     from datetime import datetime
-
+
     self.features_maps = []
-
+
     # Check if features_df exists and is not empty
     if self.features_df is None:
         self.logger.warning("features_df is None. Falling back to XML loading.")
         self._load_features_from_xml()
         return
-
+
     if len(self.features_df) == 0:
         self.logger.warning("features_df is empty. Falling back to XML loading.")
         self._load_features_from_xml()
         return
-
+
     # If we get here, we should use the new method
     self.logger.debug("Reconstructing FeatureMaps from features_df.")
-
+
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     # Process each sample in order
     for sample_index, row_dict in tqdm(
         enumerate(self.samples_df.iter_rows(named=True)),
@@ -1195,37 +1242,39 @@ def load_features(self):
         desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Reconstruct FeatureMaps from DataFrame",
         disable=tdqm_disable,
     ):
-        sample_uid = row_dict['sample_uid']
-        sample_name = row_dict['sample_name']
-
+        sample_uid = row_dict["sample_uid"]
+        sample_name = row_dict["sample_name"]
+
         # Get features for this sample from features_df
-        sample_features = self.features_df.filter(pl.col('sample_uid') == sample_uid)
-
+        sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
+
         # Create new FeatureMap
         feature_map = oms.FeatureMap()
-
+
         # Convert DataFrame features to OpenMS Features
         for feature_row in sample_features.iter_rows(named=True):
             feature = oms.Feature()
-
+
             # Set properties from DataFrame (handle missing values gracefully)
             try:
-                feature.setUniqueId(int(feature_row['feature_id']))
-                feature.setMZ(float(feature_row['mz']))
-                feature.setRT(float(feature_row['rt']))
-                feature.setIntensity(float(feature_row['inty']))
-                feature.setOverallQuality(float(feature_row['quality']))
-                feature.setCharge(int(feature_row['charge']))
-
+                feature.setUniqueId(int(feature_row["feature_id"]))
+                feature.setMZ(float(feature_row["mz"]))
+                feature.setRT(float(feature_row["rt"]))
+                feature.setIntensity(float(feature_row["inty"]))
+                feature.setOverallQuality(float(feature_row["quality"]))
+                feature.setCharge(int(feature_row["charge"]))
+
                 # Add to feature map
                 feature_map.push_back(feature)
             except (ValueError, TypeError) as e:
                 self.logger.warning(f"Skipping feature due to conversion error: {e}")
                 continue
-
+
         self.features_maps.append(feature_map)
-
-    self.logger.debug(f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.")
+
+    self.logger.debug(
+        f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
+    )
 
 
 def _load_features_from_xml(self):
@@ -1280,7 +1329,14 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.logger.debug(f"Loaded consensus map from {filename}.")
 
 
-def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
+def _add_samples_batch(
+    self,
+    files,
+    reset=False,
+    adducts=None,
+    blacklist=None,
+    fast=True,
+):
     """
     Optimized batch addition of samples.
 
@@ -1303,7 +1359,9 @@ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, f
     if blacklist is None:
         blacklist = set()
 
-    self.logger.debug(f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...")
+    self.logger.debug(
+        f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...",
+    )
 
     successful_additions = 0
     failed_additions = 0
@@ -1360,7 +1418,9 @@ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, f
     # Color assignment done once for all samples
     self._sample_color_reset_optimized()
 
-    self.logger.debug(f"Add samples complete: {successful_additions} successful, {failed_additions} failed")
+    self.logger.debug(
+        f"Add samples complete: {successful_additions} successful, {failed_additions} failed",
+    )
 
     return successful_additions
 
@@ -1406,7 +1466,7 @@ def _add_sample_optimized(
     # Load sample
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
+
     # Try optimized loading first (study-specific, skips ms1_df for better performance)
 
     if file.endswith(".sample5"):
@@ -1442,7 +1502,7 @@ def _add_sample_optimized(
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
-        #self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+        # self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
     else:
         if self.folder is not None:
             if not os.path.exists(self.folder):
@@ -1455,8 +1515,14 @@ def _add_sample_optimized(
 
     # Efficient scan counting
     ms1_count = ms2_count = 0
-    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
-        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+    if (
+        hasattr(ddaobj, "scans_df")
+        and ddaobj.scans_df is not None
+        and not ddaobj.scans_df.is_empty()
+    ):
+        scan_counts = (
+            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+        )
         ms_levels = scan_counts.get("ms_level", [])
         counts = scan_counts.get("len", [])
         for level, count in zip(ms_levels, counts):
@@ -1467,21 +1533,23 @@ def _add_sample_optimized(
 
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-    new_sample = pl.DataFrame({
-        "sample_uid": [int(len(self.samples_df) + 1)],
-        "sample_name": [sample_name],
-        "sample_path": [final_sample_path],
-        "sample_type": [sample_type],
-        "map_id": [map_id_value],
-        "sample_source": [getattr(ddaobj, "file_source", file)],
-        "sample_color": [None],  # Will be set in batch at end
-        "sample_group": [""],
-        "sample_batch": [1],
-        "sample_sequence": [next_sequence],
-        "num_features": [int(ddaobj._oms_features_map.size())],
-        "num_ms1": [ms1_count],
-        "num_ms2": [ms2_count],
-    })
+    new_sample = pl.DataFrame(
+        {
+            "sample_uid": [int(len(self.samples_df) + 1)],
+            "sample_name": [sample_name],
+            "sample_path": [final_sample_path],
+            "sample_type": [sample_type],
+            "map_id": [map_id_value],
+            "sample_source": [getattr(ddaobj, "file_source", file)],
+            "sample_color": [None],  # Will be set in batch at end
+            "sample_group": [""],
+            "sample_batch": [1],
+            "sample_sequence": [next_sequence],
+            "num_features": [int(ddaobj._oms_features_map.size())],
+            "num_ms1": [ms1_count],
+            "num_ms2": [ms2_count],
+        },
+    )
 
     self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1523,7 +1591,9 @@ def _add_sample_optimized(
     # - No type casting loops
     # - No sample_color_reset()
 
-    self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)")
+    self.logger.debug(
+        f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)",
+    )
     return True
 
 
@@ -1567,7 +1637,7 @@ def _add_sample_standard(
     ddaobj = Sample()
     ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
     # Use standard loading method that loads all data including ms1_df
-
+
     if file.endswith(".sample5"):
         ddaobj.load(file)
         # restore _oms_features_map
@@ -1601,7 +1671,7 @@ def _add_sample_standard(
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
-        #self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
+        # self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
     else:
         if self.folder is not None:
             if not os.path.exists(self.folder):
@@ -1614,8 +1684,14 @@ def _add_sample_standard(
 
     # Efficient scan counting
     ms1_count = ms2_count = 0
-    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
-        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+    if (
+        hasattr(ddaobj, "scans_df")
+        and ddaobj.scans_df is not None
+        and not ddaobj.scans_df.is_empty()
+    ):
+        scan_counts = (
+            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+        )
         ms_levels = scan_counts.get("ms_level", [])
         counts = scan_counts.get("len", [])
         for level, count in zip(ms_levels, counts):
@@ -1626,21 +1702,23 @@ def _add_sample_standard(
 
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-    new_sample = pl.DataFrame({
-        "sample_uid": [int(len(self.samples_df) + 1)],
-        "sample_name": [sample_name],
-        "sample_path": [final_sample_path],
-        "sample_type": [sample_type],
-        "map_id": [map_id_value],
-        "sample_source": [getattr(ddaobj, "file_source", file)],
-        "sample_color": [None],  # Will be set in batch at end
-        "sample_group": [""],
-        "sample_batch": [1],
-        "sample_sequence": [next_sequence],
-        "num_features": [int(ddaobj._oms_features_map.size())],
-        "num_ms1": [ms1_count],
-        "num_ms2": [ms2_count],
-    })
+    new_sample = pl.DataFrame(
+        {
+            "sample_uid": [int(len(self.samples_df) + 1)],
+            "sample_name": [sample_name],
+            "sample_path": [final_sample_path],
+            "sample_type": [sample_type],
+            "map_id": [map_id_value],
+            "sample_source": [getattr(ddaobj, "file_source", file)],
+            "sample_color": [None],  # Will be set in batch at end
+            "sample_group": [""],
+            "sample_batch": [1],
+            "sample_sequence": [next_sequence],
+            "num_features": [int(ddaobj._oms_features_map.size())],
+            "num_ms1": [ms1_count],
+            "num_ms2": [ms2_count],
+        },
+    )
 
     self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1675,7 +1753,9 @@ def _add_sample_standard(
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
 
-    self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)")
+    self.logger.debug(
+        f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
+    )
     return True
     ## COMMENT AR: Is this intentional?
     # Use standard loading method that loads all data including ms1_df
@@ -1703,7 +1783,7 @@ def _add_sample_standard(
     # Handle file paths
     if file.endswith(".sample5"):
         final_sample_path = file
-        #self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
+        # self.logger.trace(f"Using existing .sample5 file: {final_sample_path}")
     else:
         if self.folder is not None:
             if not os.path.exists(self.folder):
@@ -1716,8 +1796,14 @@ def _add_sample_standard(
 
     # Efficient scan counting
     ms1_count = ms2_count = 0
-    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
-        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+    if (
+        hasattr(ddaobj, "scans_df")
+        and ddaobj.scans_df is not None
+        and not ddaobj.scans_df.is_empty()
+    ):
+        scan_counts = (
+            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+        )
         ms_levels = scan_counts.get("ms_level", [])
         counts = scan_counts.get("len", [])
         for level, count in zip(ms_levels, counts):
@@ -1728,21 +1814,23 @@ def _add_sample_standard(
 
     # Create sample entry
     next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-    new_sample = pl.DataFrame({
-        "sample_uid": [int(len(self.samples_df) + 1)],
-        "sample_name": [sample_name],
-        "sample_path": [final_sample_path],
-        "sample_type": [sample_type],
-        "map_id": [map_id_value],
-        "sample_source": [getattr(ddaobj, "file_source", file)],
-        "sample_color": [None],  # Will be set in batch at end
-        "sample_group": [""],
-        "sample_batch": [1],
-        "sample_sequence": [next_sequence],
-        "num_features": [int(ddaobj._oms_features_map.size())],
-        "num_ms1": [ms1_count],
-        "num_ms2": [ms2_count],
-    })
+    new_sample = pl.DataFrame(
+        {
+            "sample_uid": [int(len(self.samples_df) + 1)],
+            "sample_name": [sample_name],
+            "sample_path": [final_sample_path],
+            "sample_type": [sample_type],
+            "map_id": [map_id_value],
+            "sample_source": [getattr(ddaobj, "file_source", file)],
+            "sample_color": [None],  # Will be set in batch at end
+            "sample_group": [""],
+            "sample_batch": [1],
+            "sample_sequence": [next_sequence],
+            "num_features": [int(ddaobj._oms_features_map.size())],
+            "num_ms1": [ms1_count],
+            "num_ms2": [ms2_count],
+        },
+    )
 
     self.samples_df = pl.concat([self.samples_df, new_sample])
 
@@ -1777,7 +1865,9 @@ def _add_sample_standard(
     # Use diagonal concatenation for flexibility
     self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
 
-    self.logger.debug(f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)")
+    self.logger.debug(
+        f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (standard)",
+    )
     return True
 
 