masster 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/sample/helpers.py +9 -2
- masster/sample/load.py +11 -7
- masster/sample/plot.py +43 -34
- masster/study/defaults/study_def.py +20 -0
- masster/study/h5.py +120 -23
- masster/study/helpers.py +482 -11
- masster/study/load.py +21 -10
- masster/study/plot.py +9 -2
- masster/study/study.py +24 -13
- masster/study/study5_schema.json +14 -5
- {masster-0.3.14.dist-info → masster-0.3.15.dist-info}/METADATA +1 -1
- {masster-0.3.14.dist-info → masster-0.3.15.dist-info}/RECORD +16 -16
- {masster-0.3.14.dist-info → masster-0.3.15.dist-info}/WHEEL +0 -0
- {masster-0.3.14.dist-info → masster-0.3.15.dist-info}/entry_points.txt +0 -0
- {masster-0.3.14.dist-info → masster-0.3.15.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
|
@@ -211,7 +211,7 @@ def get_tic(owner, sample=None, label=None):
|
|
|
211
211
|
return chrom
|
|
212
212
|
|
|
213
213
|
|
|
214
|
-
def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
|
|
214
|
+
def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
|
|
215
215
|
"""
|
|
216
216
|
Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
|
|
217
217
|
|
|
@@ -225,13 +225,20 @@ def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
|
|
|
225
225
|
owner: Study or Sample instance
|
|
226
226
|
sample: Sample identifier (required if owner is Study)
|
|
227
227
|
mz (float): Target m/z value
|
|
228
|
-
mz_tol (float): m/z tolerance (
|
|
228
|
+
mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
|
|
229
229
|
rt_unit (str): Retention time unit for the chromatogram
|
|
230
230
|
label (str): Optional label for the chromatogram
|
|
231
231
|
|
|
232
232
|
Returns:
|
|
233
233
|
Chromatogram
|
|
234
234
|
"""
|
|
235
|
+
# Use default mz_tol from study parameters if not provided
|
|
236
|
+
if mz_tol is None:
|
|
237
|
+
if hasattr(owner, 'parameters') and hasattr(owner.parameters, 'eic_mz_tol'):
|
|
238
|
+
mz_tol = owner.parameters.eic_mz_tol
|
|
239
|
+
else:
|
|
240
|
+
mz_tol = 0.01 # fallback default
|
|
241
|
+
|
|
235
242
|
if mz is None:
|
|
236
243
|
raise ValueError("mz must be provided for EIC computation")
|
|
237
244
|
|
|
@@ -1298,7 +1305,7 @@ def compress_chrom(self):
|
|
|
1298
1305
|
# =====================================================================================
|
|
1299
1306
|
|
|
1300
1307
|
|
|
1301
|
-
def name_replace(self, replace_dict):
|
|
1308
|
+
def sample_name_replace(self, replace_dict):
|
|
1302
1309
|
"""
|
|
1303
1310
|
Replace sample names in samples_df based on a dictionary mapping.
|
|
1304
1311
|
|
|
@@ -1364,7 +1371,7 @@ def name_replace(self, replace_dict):
|
|
|
1364
1371
|
self.logger.info(f"Successfully replaced {replaced_count} sample names")
|
|
1365
1372
|
|
|
1366
1373
|
|
|
1367
|
-
def
|
|
1374
|
+
def sample_name_reset(self):
|
|
1368
1375
|
"""
|
|
1369
1376
|
Reset sample names to the basename of sample_path without extensions.
|
|
1370
1377
|
|
|
@@ -1446,7 +1453,7 @@ def set_source(self, filename):
|
|
|
1446
1453
|
failed_count = 0
|
|
1447
1454
|
|
|
1448
1455
|
# Get all current file_source values
|
|
1449
|
-
current_sources = self.samples_df.get_column("file_source").to_list()
|
|
1456
|
+
current_sources = self.samples_df.get_column("sample_source").to_list()
|
|
1450
1457
|
sample_names = self.samples_df.get_column("sample_name").to_list()
|
|
1451
1458
|
|
|
1452
1459
|
new_sources = []
|
|
@@ -1924,13 +1931,21 @@ def consensus_select(
|
|
|
1924
1931
|
chrom_prominence_scaled_mean=None,
|
|
1925
1932
|
chrom_height_scaled_mean=None,
|
|
1926
1933
|
rt_delta_mean=None,
|
|
1934
|
+
sortby=None,
|
|
1935
|
+
descending=True,
|
|
1927
1936
|
):
|
|
1928
1937
|
"""
|
|
1929
1938
|
Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
|
|
1930
1939
|
|
|
1931
1940
|
Parameters:
|
|
1932
|
-
mz: m/z
|
|
1933
|
-
|
|
1941
|
+
mz: m/z filter with flexible formats:
|
|
1942
|
+
- float: m/z value ± default tolerance (uses study.parameters.eic_mz_tol)
|
|
1943
|
+
- tuple (mz_min, mz_max): range where mz_max > mz_min
|
|
1944
|
+
- tuple (mz_center, mz_tol): range where mz_tol < mz_center (interpreted as mz_center ± mz_tol)
|
|
1945
|
+
rt: retention time filter with flexible formats:
|
|
1946
|
+
- float: RT value ± default tolerance (uses study.parameters.eic_rt_tol)
|
|
1947
|
+
- tuple (rt_min, rt_max): range where rt_max > rt_min
|
|
1948
|
+
- tuple (rt_center, rt_tol): range where rt_tol < rt_center (interpreted as rt_center ± rt_tol)
|
|
1934
1949
|
inty_mean: mean intensity filter (tuple for range, single value for minimum)
|
|
1935
1950
|
consensus_uid: consensus UID filter (list, single value, or tuple for range)
|
|
1936
1951
|
consensus_id: consensus ID filter (list or single value)
|
|
@@ -1943,6 +1958,8 @@ def consensus_select(
|
|
|
1943
1958
|
chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
|
|
1944
1959
|
chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
|
|
1945
1960
|
rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
|
|
1961
|
+
sortby: column name(s) to sort by (string, list of strings, or None for no sorting)
|
|
1962
|
+
descending: sort direction (True for descending, False for ascending, default is True)
|
|
1946
1963
|
|
|
1947
1964
|
Returns:
|
|
1948
1965
|
polars.DataFrame: Filtered consensus DataFrame
|
|
@@ -1957,11 +1974,32 @@ def consensus_select(
|
|
|
1957
1974
|
# Filter by m/z
|
|
1958
1975
|
if mz is not None:
|
|
1959
1976
|
consensus_len_before_filter = len(consensus)
|
|
1977
|
+
|
|
1960
1978
|
if isinstance(mz, tuple) and len(mz) == 2:
|
|
1961
|
-
|
|
1979
|
+
# Check if second value is smaller than first (indicating mz, mz_tol format)
|
|
1980
|
+
if mz[1] < mz[0]:
|
|
1981
|
+
# First is mz, second is mz_tol
|
|
1982
|
+
mz_center, mz_tol = mz
|
|
1983
|
+
min_mz = mz_center - mz_tol
|
|
1984
|
+
max_mz = mz_center + mz_tol
|
|
1985
|
+
else:
|
|
1986
|
+
# Standard (min_mz, max_mz) format
|
|
1987
|
+
min_mz, max_mz = mz
|
|
1962
1988
|
consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
|
|
1963
1989
|
else:
|
|
1964
|
-
|
|
1990
|
+
# Single float value - use default mz tolerance from study parameters
|
|
1991
|
+
default_mz_tol = getattr(self, 'parameters', None)
|
|
1992
|
+
if default_mz_tol and hasattr(default_mz_tol, 'eic_mz_tol'):
|
|
1993
|
+
default_mz_tol = default_mz_tol.eic_mz_tol
|
|
1994
|
+
else:
|
|
1995
|
+
# Fallback to align_defaults if study parameters not available
|
|
1996
|
+
from masster.study.defaults.align_def import align_defaults
|
|
1997
|
+
default_mz_tol = align_defaults().mz_max_diff
|
|
1998
|
+
|
|
1999
|
+
min_mz = mz - default_mz_tol
|
|
2000
|
+
max_mz = mz + default_mz_tol
|
|
2001
|
+
consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
|
|
2002
|
+
|
|
1965
2003
|
self.logger.debug(
|
|
1966
2004
|
f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1967
2005
|
)
|
|
@@ -1969,11 +2007,32 @@ def consensus_select(
|
|
|
1969
2007
|
# Filter by retention time
|
|
1970
2008
|
if rt is not None:
|
|
1971
2009
|
consensus_len_before_filter = len(consensus)
|
|
2010
|
+
|
|
1972
2011
|
if isinstance(rt, tuple) and len(rt) == 2:
|
|
1973
|
-
|
|
2012
|
+
# Check if second value is smaller than first (indicating rt, rt_tol format)
|
|
2013
|
+
if rt[1] < rt[0]:
|
|
2014
|
+
# First is rt, second is rt_tol
|
|
2015
|
+
rt_center, rt_tol = rt
|
|
2016
|
+
min_rt = rt_center - rt_tol
|
|
2017
|
+
max_rt = rt_center + rt_tol
|
|
2018
|
+
else:
|
|
2019
|
+
# Standard (min_rt, max_rt) format
|
|
2020
|
+
min_rt, max_rt = rt
|
|
1974
2021
|
consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
|
|
1975
2022
|
else:
|
|
1976
|
-
|
|
2023
|
+
# Single float value - use default rt tolerance from study parameters
|
|
2024
|
+
default_rt_tol = getattr(self, 'parameters', None)
|
|
2025
|
+
if default_rt_tol and hasattr(default_rt_tol, 'eic_rt_tol'):
|
|
2026
|
+
default_rt_tol = default_rt_tol.eic_rt_tol
|
|
2027
|
+
else:
|
|
2028
|
+
# Fallback to align_defaults if study parameters not available
|
|
2029
|
+
from masster.study.defaults.align_def import align_defaults
|
|
2030
|
+
default_rt_tol = align_defaults().rt_max_diff
|
|
2031
|
+
|
|
2032
|
+
min_rt = rt - default_rt_tol
|
|
2033
|
+
max_rt = rt + default_rt_tol
|
|
2034
|
+
consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
|
|
2035
|
+
|
|
1977
2036
|
self.logger.debug(
|
|
1978
2037
|
f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1979
2038
|
)
|
|
@@ -2170,6 +2229,27 @@ def consensus_select(
|
|
|
2170
2229
|
else:
|
|
2171
2230
|
self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
|
|
2172
2231
|
|
|
2232
|
+
# Sort the results if sortby is specified
|
|
2233
|
+
if sortby is not None:
|
|
2234
|
+
if isinstance(sortby, str):
|
|
2235
|
+
# Single column
|
|
2236
|
+
if sortby in consensus.columns:
|
|
2237
|
+
consensus = consensus.sort(sortby, descending=descending)
|
|
2238
|
+
else:
|
|
2239
|
+
self.logger.warning(f"Sort column '{sortby}' not found in consensus DataFrame")
|
|
2240
|
+
elif isinstance(sortby, (list, tuple)):
|
|
2241
|
+
# Multiple columns
|
|
2242
|
+
valid_columns = [col for col in sortby if col in consensus.columns]
|
|
2243
|
+
invalid_columns = [col for col in sortby if col not in consensus.columns]
|
|
2244
|
+
|
|
2245
|
+
if invalid_columns:
|
|
2246
|
+
self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
|
|
2247
|
+
|
|
2248
|
+
if valid_columns:
|
|
2249
|
+
consensus = consensus.sort(valid_columns, descending=descending)
|
|
2250
|
+
else:
|
|
2251
|
+
self.logger.warning(f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.")
|
|
2252
|
+
|
|
2173
2253
|
return consensus
|
|
2174
2254
|
|
|
2175
2255
|
|
|
@@ -2276,6 +2356,357 @@ def consensus_delete(self, consensus):
|
|
|
2276
2356
|
self.consensus_filter(consensus)
|
|
2277
2357
|
|
|
2278
2358
|
|
|
2359
|
+
# =====================================================================================
|
|
2360
|
+
# SAMPLE MANAGEMENT AND DELETION FUNCTIONS
|
|
2361
|
+
# =====================================================================================
|
|
2362
|
+
|
|
2363
|
+
|
|
2364
|
+
def samples_select(
|
|
2365
|
+
self,
|
|
2366
|
+
sample_uid=None,
|
|
2367
|
+
sample_name=None,
|
|
2368
|
+
sample_type=None,
|
|
2369
|
+
sample_group=None,
|
|
2370
|
+
sample_batch=None,
|
|
2371
|
+
sample_sequence=None,
|
|
2372
|
+
num_features=None,
|
|
2373
|
+
num_ms1=None,
|
|
2374
|
+
num_ms2=None,
|
|
2375
|
+
):
|
|
2376
|
+
"""
|
|
2377
|
+
Select samples from samples_df based on specified criteria and return the filtered DataFrame.
|
|
2378
|
+
|
|
2379
|
+
Parameters:
|
|
2380
|
+
sample_uid: sample UID filter (list, single value, or tuple for range)
|
|
2381
|
+
sample_name: sample name filter (list or single value)
|
|
2382
|
+
sample_type: sample type filter (list or single value)
|
|
2383
|
+
sample_group: sample group filter (list or single value)
|
|
2384
|
+
sample_batch: sample batch filter (list, single value, or tuple for range)
|
|
2385
|
+
sample_sequence: sample sequence filter (list, single value, or tuple for range)
|
|
2386
|
+
num_features: number of features filter (tuple for range, single value for minimum)
|
|
2387
|
+
num_ms1: number of MS1 spectra filter (tuple for range, single value for minimum)
|
|
2388
|
+
num_ms2: number of MS2 spectra filter (tuple for range, single value for minimum)
|
|
2389
|
+
|
|
2390
|
+
Returns:
|
|
2391
|
+
polars.DataFrame: Filtered samples DataFrame
|
|
2392
|
+
"""
|
|
2393
|
+
if self.samples_df is None or self.samples_df.is_empty():
|
|
2394
|
+
self.logger.warning("No samples found in study.")
|
|
2395
|
+
return pl.DataFrame()
|
|
2396
|
+
|
|
2397
|
+
# Early return if no filters provided
|
|
2398
|
+
filter_params = [
|
|
2399
|
+
sample_uid,
|
|
2400
|
+
sample_name,
|
|
2401
|
+
sample_type,
|
|
2402
|
+
sample_group,
|
|
2403
|
+
sample_batch,
|
|
2404
|
+
sample_sequence,
|
|
2405
|
+
num_features,
|
|
2406
|
+
num_ms1,
|
|
2407
|
+
num_ms2,
|
|
2408
|
+
]
|
|
2409
|
+
if all(param is None for param in filter_params):
|
|
2410
|
+
return self.samples_df.clone()
|
|
2411
|
+
|
|
2412
|
+
initial_count = len(self.samples_df)
|
|
2413
|
+
|
|
2414
|
+
# Pre-check available columns once for efficiency
|
|
2415
|
+
available_columns = set(self.samples_df.columns)
|
|
2416
|
+
|
|
2417
|
+
# Build all filter conditions first, then apply them all at once
|
|
2418
|
+
filter_conditions = []
|
|
2419
|
+
warnings = []
|
|
2420
|
+
|
|
2421
|
+
# Filter by sample_uid
|
|
2422
|
+
if sample_uid is not None:
|
|
2423
|
+
if isinstance(sample_uid, (list, tuple)):
|
|
2424
|
+
if len(sample_uid) == 2 and not isinstance(sample_uid, list):
|
|
2425
|
+
# Treat as range
|
|
2426
|
+
min_uid, max_uid = sample_uid
|
|
2427
|
+
filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
|
|
2428
|
+
else:
|
|
2429
|
+
# Treat as list
|
|
2430
|
+
filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
|
|
2431
|
+
else:
|
|
2432
|
+
filter_conditions.append(pl.col("sample_uid") == sample_uid)
|
|
2433
|
+
|
|
2434
|
+
# Filter by sample_name
|
|
2435
|
+
if sample_name is not None:
|
|
2436
|
+
if isinstance(sample_name, list):
|
|
2437
|
+
filter_conditions.append(pl.col("sample_name").is_in(sample_name))
|
|
2438
|
+
else:
|
|
2439
|
+
filter_conditions.append(pl.col("sample_name") == sample_name)
|
|
2440
|
+
|
|
2441
|
+
# Filter by sample_type
|
|
2442
|
+
if sample_type is not None:
|
|
2443
|
+
if "sample_type" in available_columns:
|
|
2444
|
+
if isinstance(sample_type, list):
|
|
2445
|
+
filter_conditions.append(pl.col("sample_type").is_in(sample_type))
|
|
2446
|
+
else:
|
|
2447
|
+
filter_conditions.append(pl.col("sample_type") == sample_type)
|
|
2448
|
+
else:
|
|
2449
|
+
warnings.append("'sample_type' column not found in samples_df")
|
|
2450
|
+
|
|
2451
|
+
# Filter by sample_group
|
|
2452
|
+
if sample_group is not None:
|
|
2453
|
+
if "sample_group" in available_columns:
|
|
2454
|
+
if isinstance(sample_group, list):
|
|
2455
|
+
filter_conditions.append(pl.col("sample_group").is_in(sample_group))
|
|
2456
|
+
else:
|
|
2457
|
+
filter_conditions.append(pl.col("sample_group") == sample_group)
|
|
2458
|
+
else:
|
|
2459
|
+
warnings.append("'sample_group' column not found in samples_df")
|
|
2460
|
+
|
|
2461
|
+
# Filter by sample_batch
|
|
2462
|
+
if sample_batch is not None:
|
|
2463
|
+
if "sample_batch" in available_columns:
|
|
2464
|
+
if isinstance(sample_batch, (list, tuple)):
|
|
2465
|
+
if len(sample_batch) == 2 and not isinstance(sample_batch, list):
|
|
2466
|
+
# Treat as range
|
|
2467
|
+
min_batch, max_batch = sample_batch
|
|
2468
|
+
filter_conditions.append((pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch))
|
|
2469
|
+
else:
|
|
2470
|
+
# Treat as list
|
|
2471
|
+
filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
|
|
2472
|
+
else:
|
|
2473
|
+
filter_conditions.append(pl.col("sample_batch") == sample_batch)
|
|
2474
|
+
else:
|
|
2475
|
+
warnings.append("'sample_batch' column not found in samples_df")
|
|
2476
|
+
|
|
2477
|
+
# Filter by sample_sequence
|
|
2478
|
+
if sample_sequence is not None:
|
|
2479
|
+
if "sample_sequence" in available_columns:
|
|
2480
|
+
if isinstance(sample_sequence, (list, tuple)):
|
|
2481
|
+
if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
|
|
2482
|
+
# Treat as range
|
|
2483
|
+
min_seq, max_seq = sample_sequence
|
|
2484
|
+
filter_conditions.append((pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq))
|
|
2485
|
+
else:
|
|
2486
|
+
# Treat as list
|
|
2487
|
+
filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
|
|
2488
|
+
else:
|
|
2489
|
+
filter_conditions.append(pl.col("sample_sequence") == sample_sequence)
|
|
2490
|
+
else:
|
|
2491
|
+
warnings.append("'sample_sequence' column not found in samples_df")
|
|
2492
|
+
|
|
2493
|
+
# Filter by num_features
|
|
2494
|
+
if num_features is not None:
|
|
2495
|
+
if "num_features" in available_columns:
|
|
2496
|
+
if isinstance(num_features, tuple) and len(num_features) == 2:
|
|
2497
|
+
min_features, max_features = num_features
|
|
2498
|
+
filter_conditions.append((pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features))
|
|
2499
|
+
else:
|
|
2500
|
+
filter_conditions.append(pl.col("num_features") >= num_features)
|
|
2501
|
+
else:
|
|
2502
|
+
warnings.append("'num_features' column not found in samples_df")
|
|
2503
|
+
|
|
2504
|
+
# Filter by num_ms1
|
|
2505
|
+
if num_ms1 is not None:
|
|
2506
|
+
if "num_ms1" in available_columns:
|
|
2507
|
+
if isinstance(num_ms1, tuple) and len(num_ms1) == 2:
|
|
2508
|
+
min_ms1, max_ms1 = num_ms1
|
|
2509
|
+
filter_conditions.append((pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1))
|
|
2510
|
+
else:
|
|
2511
|
+
filter_conditions.append(pl.col("num_ms1") >= num_ms1)
|
|
2512
|
+
else:
|
|
2513
|
+
warnings.append("'num_ms1' column not found in samples_df")
|
|
2514
|
+
|
|
2515
|
+
# Filter by num_ms2
|
|
2516
|
+
if num_ms2 is not None:
|
|
2517
|
+
if "num_ms2" in available_columns:
|
|
2518
|
+
if isinstance(num_ms2, tuple) and len(num_ms2) == 2:
|
|
2519
|
+
min_ms2, max_ms2 = num_ms2
|
|
2520
|
+
filter_conditions.append((pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2))
|
|
2521
|
+
else:
|
|
2522
|
+
filter_conditions.append(pl.col("num_ms2") >= num_ms2)
|
|
2523
|
+
else:
|
|
2524
|
+
warnings.append("'num_ms2' column not found in samples_df")
|
|
2525
|
+
|
|
2526
|
+
# Log all warnings once at the end for efficiency
|
|
2527
|
+
for warning in warnings:
|
|
2528
|
+
self.logger.warning(warning)
|
|
2529
|
+
|
|
2530
|
+
# Apply all filters at once using lazy evaluation for optimal performance
|
|
2531
|
+
if filter_conditions:
|
|
2532
|
+
# Combine all conditions with AND
|
|
2533
|
+
combined_filter = filter_conditions[0]
|
|
2534
|
+
for condition in filter_conditions[1:]:
|
|
2535
|
+
combined_filter = combined_filter & condition
|
|
2536
|
+
|
|
2537
|
+
# Apply the combined filter using lazy evaluation
|
|
2538
|
+
samples = self.samples_df.lazy().filter(combined_filter).collect()
|
|
2539
|
+
else:
|
|
2540
|
+
samples = self.samples_df.clone()
|
|
2541
|
+
|
|
2542
|
+
final_count = len(samples)
|
|
2543
|
+
|
|
2544
|
+
if final_count == 0:
|
|
2545
|
+
self.logger.warning("No samples remaining after applying selection criteria.")
|
|
2546
|
+
else:
|
|
2547
|
+
self.logger.info(f"Samples selected: {final_count} (out of {initial_count})")
|
|
2548
|
+
|
|
2549
|
+
return samples
|
|
2550
|
+
|
|
2551
|
+
|
|
2552
|
+
def samples_delete(self, samples):
|
|
2553
|
+
"""
|
|
2554
|
+
Delete samples and all related data from the study based on sample identifiers.
|
|
2555
|
+
|
|
2556
|
+
This function eliminates all data related to the specified samples (and their sample_uids)
|
|
2557
|
+
from all dataframes including:
|
|
2558
|
+
- samples_df: Removes the sample rows
|
|
2559
|
+
- features_df: Removes all features belonging to these samples
|
|
2560
|
+
- consensus_mapping_df: Removes mappings for features from these samples
|
|
2561
|
+
- consensus_ms2: Removes MS2 spectra for features from these samples
|
|
2562
|
+
- feature_maps: Removes the corresponding feature maps
|
|
2563
|
+
|
|
2564
|
+
Also updates map_id values to maintain sequential indices after deletion.
|
|
2565
|
+
|
|
2566
|
+
Parameters:
|
|
2567
|
+
samples: Samples to delete. Can be:
|
|
2568
|
+
- list of int: List of sample_uids to delete
|
|
2569
|
+
- polars.DataFrame: DataFrame obtained from samples_select (will use sample_uid column)
|
|
2570
|
+
- int: Single sample_uid to delete
|
|
2571
|
+
|
|
2572
|
+
Returns:
|
|
2573
|
+
None (modifies study DataFrames and feature_maps in place)
|
|
2574
|
+
"""
|
|
2575
|
+
if self.samples_df is None or self.samples_df.is_empty():
|
|
2576
|
+
self.logger.warning("No samples found in study.")
|
|
2577
|
+
return
|
|
2578
|
+
|
|
2579
|
+
# Early return if no samples provided
|
|
2580
|
+
if samples is None:
|
|
2581
|
+
self.logger.warning("No samples provided for deletion.")
|
|
2582
|
+
return
|
|
2583
|
+
|
|
2584
|
+
initial_sample_count = len(self.samples_df)
|
|
2585
|
+
|
|
2586
|
+
# Determine sample_uids to remove
|
|
2587
|
+
if isinstance(samples, pl.DataFrame):
|
|
2588
|
+
if "sample_uid" not in samples.columns:
|
|
2589
|
+
self.logger.error("samples DataFrame must contain 'sample_uid' column")
|
|
2590
|
+
return
|
|
2591
|
+
sample_uids_to_remove = samples["sample_uid"].to_list()
|
|
2592
|
+
elif isinstance(samples, (list, tuple)):
|
|
2593
|
+
sample_uids_to_remove = list(samples) # Convert tuple to list if needed
|
|
2594
|
+
elif isinstance(samples, int):
|
|
2595
|
+
sample_uids_to_remove = [samples]
|
|
2596
|
+
else:
|
|
2597
|
+
self.logger.error("samples parameter must be a DataFrame, list, tuple, or int")
|
|
2598
|
+
return
|
|
2599
|
+
|
|
2600
|
+
# Early return if no UIDs to remove
|
|
2601
|
+
if not sample_uids_to_remove:
|
|
2602
|
+
self.logger.warning("No sample UIDs provided for deletion.")
|
|
2603
|
+
return
|
|
2604
|
+
|
|
2605
|
+
# Convert to set for faster lookup if list is large
|
|
2606
|
+
if len(sample_uids_to_remove) > 100:
|
|
2607
|
+
sample_uids_set = set(sample_uids_to_remove)
|
|
2608
|
+
# Use the set for filtering if it's significantly smaller
|
|
2609
|
+
if len(sample_uids_set) < len(sample_uids_to_remove) * 0.8:
|
|
2610
|
+
sample_uids_to_remove = list(sample_uids_set)
|
|
2611
|
+
|
|
2612
|
+
self.logger.info(f"Deleting {len(sample_uids_to_remove)} samples and all related data...")
|
|
2613
|
+
|
|
2614
|
+
# Get feature_uids that need to be removed from features_df
|
|
2615
|
+
feature_uids_to_remove = []
|
|
2616
|
+
initial_features_count = 0
|
|
2617
|
+
if self.features_df is not None and not self.features_df.is_empty():
|
|
2618
|
+
initial_features_count = len(self.features_df)
|
|
2619
|
+
feature_uids_to_remove = self.features_df.filter(
|
|
2620
|
+
pl.col("sample_uid").is_in(sample_uids_to_remove),
|
|
2621
|
+
)["feature_uid"].to_list()
|
|
2622
|
+
|
|
2623
|
+
# Get map_ids to remove from feature_maps (needed before samples_df deletion)
|
|
2624
|
+
map_ids_to_remove = []
|
|
2625
|
+
if hasattr(self, 'feature_maps') and self.feature_maps is not None:
|
|
2626
|
+
# Get map_ids for samples to be deleted
|
|
2627
|
+
map_ids_df = self.samples_df.filter(
|
|
2628
|
+
pl.col("sample_uid").is_in(sample_uids_to_remove)
|
|
2629
|
+
).select("map_id")
|
|
2630
|
+
if not map_ids_df.is_empty():
|
|
2631
|
+
map_ids_to_remove = map_ids_df["map_id"].to_list()
|
|
2632
|
+
|
|
2633
|
+
# 1. Remove samples from samples_df
|
|
2634
|
+
self.samples_df = self.samples_df.filter(
|
|
2635
|
+
~pl.col("sample_uid").is_in(sample_uids_to_remove),
|
|
2636
|
+
)
|
|
2637
|
+
|
|
2638
|
+
# 2. Remove corresponding features from features_df
|
|
2639
|
+
removed_features_count = 0
|
|
2640
|
+
if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
|
|
2641
|
+
self.features_df = self.features_df.filter(
|
|
2642
|
+
~pl.col("sample_uid").is_in(sample_uids_to_remove),
|
|
2643
|
+
)
|
|
2644
|
+
removed_features_count = initial_features_count - len(self.features_df)
|
|
2645
|
+
|
|
2646
|
+
# 3. Remove from consensus_mapping_df
|
|
2647
|
+
removed_mapping_count = 0
|
|
2648
|
+
if feature_uids_to_remove and self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
|
|
2649
|
+
initial_mapping_count = len(self.consensus_mapping_df)
|
|
2650
|
+
self.consensus_mapping_df = self.consensus_mapping_df.filter(
|
|
2651
|
+
~pl.col("feature_uid").is_in(feature_uids_to_remove),
|
|
2652
|
+
)
|
|
2653
|
+
removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
|
|
2654
|
+
|
|
2655
|
+
# 4. Remove from consensus_ms2 if it exists
|
|
2656
|
+
removed_ms2_count = 0
|
|
2657
|
+
if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
|
|
2658
|
+
initial_ms2_count = len(self.consensus_ms2)
|
|
2659
|
+
self.consensus_ms2 = self.consensus_ms2.filter(
|
|
2660
|
+
~pl.col("sample_uid").is_in(sample_uids_to_remove),
|
|
2661
|
+
)
|
|
2662
|
+
removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
|
|
2663
|
+
|
|
2664
|
+
# 5. Remove from feature_maps and update map_id
|
|
2665
|
+
removed_maps_count = 0
|
|
2666
|
+
if hasattr(self, 'feature_maps') and self.feature_maps is not None and map_ids_to_remove:
|
|
2667
|
+
# Remove feature maps in reverse order to maintain indices
|
|
2668
|
+
for map_id in sorted(map_ids_to_remove, reverse=True):
|
|
2669
|
+
if 0 <= map_id < len(self.feature_maps):
|
|
2670
|
+
self.feature_maps.pop(map_id)
|
|
2671
|
+
removed_maps_count += 1
|
|
2672
|
+
|
|
2673
|
+
# Update map_id values in samples_df to maintain sequential indices
|
|
2674
|
+
if len(self.samples_df) > 0:
|
|
2675
|
+
new_map_ids = list(range(len(self.samples_df)))
|
|
2676
|
+
self.samples_df = self.samples_df.with_columns(
|
|
2677
|
+
pl.lit(new_map_ids).alias("map_id")
|
|
2678
|
+
)
|
|
2679
|
+
|
|
2680
|
+
# Calculate and log results
|
|
2681
|
+
removed_sample_count = initial_sample_count - len(self.samples_df)
|
|
2682
|
+
final_sample_count = len(self.samples_df)
|
|
2683
|
+
|
|
2684
|
+
# Create comprehensive summary message
|
|
2685
|
+
summary_parts = [
|
|
2686
|
+
f"Deleted {removed_sample_count} samples",
|
|
2687
|
+
]
|
|
2688
|
+
|
|
2689
|
+
if removed_features_count > 0:
|
|
2690
|
+
summary_parts.append(f"{removed_features_count} features")
|
|
2691
|
+
|
|
2692
|
+
if removed_mapping_count > 0:
|
|
2693
|
+
summary_parts.append(f"{removed_mapping_count} consensus mappings")
|
|
2694
|
+
|
|
2695
|
+
if removed_ms2_count > 0:
|
|
2696
|
+
summary_parts.append(f"{removed_ms2_count} MS2 spectra")
|
|
2697
|
+
|
|
2698
|
+
if removed_maps_count > 0:
|
|
2699
|
+
summary_parts.append(f"{removed_maps_count} feature maps")
|
|
2700
|
+
|
|
2701
|
+
summary_parts.append(f"Remaining samples: {final_sample_count}")
|
|
2702
|
+
|
|
2703
|
+
self.logger.info(". ".join(summary_parts))
|
|
2704
|
+
|
|
2705
|
+
# Update map_id indices if needed
|
|
2706
|
+
if removed_maps_count > 0 and final_sample_count > 0:
|
|
2707
|
+
self.logger.debug(f"Updated map_id values to range from 0 to {final_sample_count - 1}")
|
|
2708
|
+
|
|
2709
|
+
|
|
2279
2710
|
# =====================================================================================
|
|
2280
2711
|
# COLOR PALETTE AND VISUALIZATION FUNCTIONS
|
|
2281
2712
|
# =====================================================================================
|
|
@@ -2712,3 +3143,43 @@ def _ensure_features_df_schema_order(self):
|
|
|
2712
3143
|
|
|
2713
3144
|
except Exception as e:
|
|
2714
3145
|
self.logger.warning(f"Failed to reorder features_df columns: {e}")
|
|
3146
|
+
|
|
3147
|
+
|
|
3148
|
+
def migrate_map_id_to_index(self):
|
|
3149
|
+
"""
|
|
3150
|
+
Migrate map_id from string-based OpenMS unique IDs to integer indices.
|
|
3151
|
+
|
|
3152
|
+
This function converts the map_id column from string type (with OpenMS unique IDs)
|
|
3153
|
+
to integer type where each map_id corresponds to the index of the feature map
|
|
3154
|
+
in self.features_maps.
|
|
3155
|
+
|
|
3156
|
+
This migration is needed for studies that were created before the map_id format
|
|
3157
|
+
change from OpenMS unique IDs to feature map indices.
|
|
3158
|
+
"""
|
|
3159
|
+
if self.samples_df is None or self.samples_df.is_empty():
|
|
3160
|
+
self.logger.warning("No samples to migrate")
|
|
3161
|
+
return
|
|
3162
|
+
|
|
3163
|
+
# Check if migration is needed
|
|
3164
|
+
current_dtype = self.samples_df['map_id'].dtype
|
|
3165
|
+
if current_dtype == pl.Int64:
|
|
3166
|
+
self.logger.info("map_id column is already Int64 type - no migration needed")
|
|
3167
|
+
return
|
|
3168
|
+
|
|
3169
|
+
self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
|
|
3170
|
+
|
|
3171
|
+
# Create new map_id values based on sample order
|
|
3172
|
+
# Each sample gets a map_id that corresponds to its position in features_maps
|
|
3173
|
+
sample_count = len(self.samples_df)
|
|
3174
|
+
new_map_ids = list(range(sample_count))
|
|
3175
|
+
|
|
3176
|
+
# Update the map_id column
|
|
3177
|
+
self.samples_df = self.samples_df.with_columns(
|
|
3178
|
+
pl.lit(new_map_ids).alias("map_id")
|
|
3179
|
+
)
|
|
3180
|
+
|
|
3181
|
+
# Ensure the column is Int64 type
|
|
3182
|
+
self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
|
|
3183
|
+
|
|
3184
|
+
self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
|
|
3185
|
+
self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
|
masster/study/load.py
CHANGED
|
@@ -184,7 +184,9 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
|
|
|
184
184
|
sample_type = "qc"
|
|
185
185
|
if "blank" in sample_name.lower():
|
|
186
186
|
sample_type = "blank"
|
|
187
|
-
|
|
187
|
+
|
|
188
|
+
# Use the index of the feature map in self.features_maps as map_id
|
|
189
|
+
map_id_value = len(self.features_maps) - 1
|
|
188
190
|
|
|
189
191
|
# Determine the final sample path based on file type
|
|
190
192
|
if file.endswith(".sample5"):
|
|
@@ -219,30 +221,39 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
|
|
|
219
221
|
ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
|
|
220
222
|
ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
|
|
221
223
|
|
|
224
|
+
# Calculate next sequence number
|
|
225
|
+
next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
|
|
226
|
+
|
|
222
227
|
new_sample = pl.DataFrame(
|
|
223
228
|
{
|
|
224
229
|
"sample_uid": [int(len(self.samples_df) + 1)],
|
|
225
230
|
"sample_name": [sample_name],
|
|
226
231
|
"sample_path": [final_sample_path], # Use the determined path
|
|
227
232
|
"sample_type": [sample_type],
|
|
228
|
-
"size": [int(ddaobj.features.size())],
|
|
229
233
|
"map_id": [map_id_value],
|
|
230
|
-
"
|
|
231
|
-
"ms1": [ms1_count],
|
|
232
|
-
"ms2": [ms2_count],
|
|
234
|
+
"sample_source": [getattr(ddaobj, "file_source", file)],
|
|
233
235
|
"sample_color": [None], # Will be set by set_sample_color below
|
|
236
|
+
"sample_group": [""], # Default empty string
|
|
237
|
+
"sample_batch": [1], # Default batch 1
|
|
238
|
+
"sample_sequence": [next_sequence], # Increasing sequence number
|
|
239
|
+
"num_features": [int(ddaobj.features.size())],
|
|
240
|
+
"num_ms1": [ms1_count],
|
|
241
|
+
"num_ms2": [ms2_count],
|
|
234
242
|
},
|
|
235
243
|
schema={
|
|
236
244
|
"sample_uid": pl.Int64,
|
|
237
245
|
"sample_name": pl.Utf8,
|
|
238
246
|
"sample_path": pl.Utf8,
|
|
239
247
|
"sample_type": pl.Utf8,
|
|
240
|
-
"
|
|
241
|
-
"
|
|
242
|
-
"file_source": pl.Utf8,
|
|
243
|
-
"ms1": pl.Int64,
|
|
244
|
-
"ms2": pl.Int64,
|
|
248
|
+
"map_id": pl.Int64,
|
|
249
|
+
"sample_source": pl.Utf8,
|
|
245
250
|
"sample_color": pl.Utf8,
|
|
251
|
+
"sample_group": pl.Utf8,
|
|
252
|
+
"sample_batch": pl.Int64,
|
|
253
|
+
"sample_sequence": pl.Int64,
|
|
254
|
+
"num_features": pl.Int64,
|
|
255
|
+
"num_ms1": pl.Int64,
|
|
256
|
+
"num_ms2": pl.Int64,
|
|
246
257
|
},
|
|
247
258
|
)
|
|
248
259
|
self.samples_df = pl.concat([self.samples_df, new_sample])
|
masster/study/plot.py
CHANGED
|
@@ -375,7 +375,7 @@ def plot_consensus_2d(
|
|
|
375
375
|
])
|
|
376
376
|
|
|
377
377
|
if cmap is None:
|
|
378
|
-
cmap = "
|
|
378
|
+
cmap = "viridis"
|
|
379
379
|
elif cmap == "grey":
|
|
380
380
|
cmap = "Greys256"
|
|
381
381
|
|
|
@@ -898,7 +898,7 @@ def plot_bpc(
|
|
|
898
898
|
def plot_eic(
|
|
899
899
|
self,
|
|
900
900
|
mz,
|
|
901
|
-
mz_tol=
|
|
901
|
+
mz_tol=None,
|
|
902
902
|
samples=None,
|
|
903
903
|
title: str | None = None,
|
|
904
904
|
filename: str | None = None,
|
|
@@ -912,6 +912,9 @@ def plot_eic(
|
|
|
912
912
|
Parameters mirror `plot_bpc` with additional `mz` and `mz_tol` arguments. The function
|
|
913
913
|
retrieves a Sample object for each sample UID, calls `sample.get_eic(mz, mz_tol)`, and
|
|
914
914
|
overlays the resulting chromatograms.
|
|
915
|
+
|
|
916
|
+
Args:
|
|
917
|
+
mz_tol: m/z tolerance in Da. If None, uses study.parameters.eic_mz_tol as default.
|
|
915
918
|
"""
|
|
916
919
|
# Local imports to avoid heavy top-level deps / circular imports
|
|
917
920
|
from bokeh.plotting import figure, show, output_file
|
|
@@ -919,6 +922,10 @@ def plot_eic(
|
|
|
919
922
|
from bokeh.io.export import export_png
|
|
920
923
|
from masster.study.helpers import get_eic
|
|
921
924
|
|
|
925
|
+
# Use study's eic_mz_tol parameter as default if not provided
|
|
926
|
+
if mz_tol is None:
|
|
927
|
+
mz_tol = self.parameters.eic_mz_tol
|
|
928
|
+
|
|
922
929
|
if mz is None:
|
|
923
930
|
self.logger.error("mz must be provided for EIC plotting")
|
|
924
931
|
return
|