masster 0.5.0__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- {masster-0.5.0 → masster-0.5.3}/PKG-INFO +1 -1
- {masster-0.5.0 → masster-0.5.3}/pyproject.toml +1 -1
- {masster-0.5.0 → masster-0.5.3}/src/masster/_version.py +1 -1
- masster-0.5.3/src/masster/data/libs/aa.csv +22 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/lib/lib.py +6 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/load.py +5 -4
- masster-0.5.0/src/masster/study/defaults/fill_chrom_def.py → masster-0.5.3/src/masster/study/defaults/align_def.py +97 -63
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/fill_def.py +10 -2
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/merge_def.py +20 -69
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/export.py +25 -5
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/h5.py +162 -41
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/helpers.py +430 -53
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/id.py +542 -1
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/load.py +1002 -165
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/merge.py +691 -989
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/plot.py +43 -38
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/processing.py +337 -288
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/study.py +63 -58
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/study5_schema.json +9 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/wizard/wizard.py +20 -6
- {masster-0.5.0 → masster-0.5.3}/uv.lock +1 -1
- masster-0.5.0/src/masster/study/defaults/align_def.py +0 -498
- {masster-0.5.0 → masster-0.5.3}/.github/workflows/publish.yml +0 -0
- {masster-0.5.0 → masster-0.5.3}/.github/workflows/security.yml +0 -0
- {masster-0.5.0 → masster-0.5.3}/.github/workflows/test.yml +0 -0
- {masster-0.5.0 → masster-0.5.3}/.gitignore +0 -0
- {masster-0.5.0 → masster-0.5.3}/.pre-commit-config.yaml +0 -0
- {masster-0.5.0 → masster-0.5.3}/LICENSE +0 -0
- {masster-0.5.0 → masster-0.5.3}/Makefile +0 -0
- {masster-0.5.0 → masster-0.5.3}/README.md +0 -0
- {masster-0.5.0 → masster-0.5.3}/TESTING.md +0 -0
- {masster-0.5.0 → masster-0.5.3}/demo/example_batch_process.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/demo/example_sample_process.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/__init__.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/chromatogram.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/libs/ccm.csv +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/libs/urine.csv +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/lib/__init__.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/logger.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/__init__.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/adducts.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/defaults/__init__.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/defaults/find_adducts_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/defaults/find_features_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/defaults/find_ms2_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/defaults/sample_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/h5.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/helpers.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/lib.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/parameters.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/plot.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/processing.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/quant.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/sample.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/sample5_schema.json +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/save.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/sample/sciex.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/spectrum.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/__init__.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/analysis.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/__init__.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/export_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/find_consensus_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/find_ms2_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/identify_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/integrate_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/defaults/study_def.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/parameters.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/study/save.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/wizard/README.md +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/wizard/__init__.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/src/masster/wizard/example.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/conftest.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_chromatogram.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_defaults.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_imports.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_integration.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_logger.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_parameters.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_sample.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_spectrum.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_study.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tests/test_version.py +0 -0
- {masster-0.5.0 → masster-0.5.3}/tox.ini +0 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name,smiles,inchikey,formula,db_id,db
|
|
2
|
+
L-Glutamic acid,N[C@@H](CCC(O)=O)C(O)=O,WHUUTDBJXJRKMK-VKHMYHEASA-N,C5H9NO4,CID:33032,pubchem
|
|
3
|
+
L-Tyrosine,N[C@@H](CC1=CC=C(O)C=C1)C(O)=O,OUYCCCASQSFEME-QMMMGPOBSA-N,C9H11NO3,CID:6057,pubchem
|
|
4
|
+
L-Phenylalanine,N[C@@H](CC1=CC=CC=C1)C(O)=O,COLNVLDHVKWLRT-QMMMGPOBSA-N,C9H11NO2,CID:6140,pubchem
|
|
5
|
+
L-Alanine,C[C@H](N)C(O)=O,QNAYBMKLOCPYGJ-REOHCLBHSA-N,C3H7NO2,CID:5950,pubchem
|
|
6
|
+
L-Proline,OC(=O)[C@@H]1CCCN1,ONIBWKKTOPOVIA-BYPYZUCNSA-N,C5H9NO2,CID:145742,pubchem
|
|
7
|
+
L-Threonine,C[C@@H](O)[C@H](N)C(O)=O,AYFVYJQAPQTCCC-GBXIJSLDSA-N,C4H9NO3,CID:6288,pubchem
|
|
8
|
+
L-Asparagine,N[C@@H](CC(N)=O)C(O)=O,DCXYFEDJOCDNAF-REOHCLBHSA-N,C4H8N2O3,CID:6267,pubchem
|
|
9
|
+
L-Isoleucine,CC[C@H](C)[C@H](N)C(O)=O,AGPKZVBTJJNPAG-WHFBIAKZSA-N,C6H13NO2,CID:6306,pubchem
|
|
10
|
+
L-Histidine,N[C@@H](CC1=CN=CN1)C(O)=O,HNDVDQJCIGZPNO-YFKPBYRVSA-N,C6H9N3O2,CID:6274,pubchem
|
|
11
|
+
L-Lysine,NCCCC[C@H](N)C(O)=O,KDXKERNSBIXSRK-YFKPBYRVSA-N,C6H14N2O2,CID:5962,pubchem
|
|
12
|
+
L-Serine,N[C@@H](CO)C(O)=O,MTCFGRXMJLQNBG-REOHCLBHSA-N,C3H7NO3,CID:5951,pubchem
|
|
13
|
+
L-Aspartic acid,N[C@@H](CC(O)=O)C(O)=O,CKLJMWTZIZZHCS-REOHCLBHSA-N,C4H7NO4,CID:5960,pubchem
|
|
14
|
+
L-Cystine,N[C@@H](CSSC[C@H](N)C(O)=O)C(O)=O,LEVWYRKDKASIDU-IMJSIDKUSA-N,C6H12N2O4S2,CID:67678,pubchem
|
|
15
|
+
L-Arginine,N[C@@H](CCCNC(N)=N)C(O)=O,ODKSFYDXXFIFQN-BYPYZUCNSA-N,C6H14N4O2,CID:6322,pubchem
|
|
16
|
+
L-Cysteine,N[C@@H](CS)C(O)=O,XUJNEKJLAYXESH-REOHCLBHSA-N,C3H7NO2S,CID:5862,pubchem
|
|
17
|
+
L-Glutamine,N[C@@H](CCC(N)=O)C(O)=O,ZDXPYRJPNDTMRX-VKHMYHEASA-N,C5H10N2O3,CID:5961,pubchem
|
|
18
|
+
L-Leucine,CC(C)C[C@H](N)C(O)=O,ROHFNLRQFUQHCH-YFKPBYRVSA-N,C6H13NO2,CID:6106,pubchem
|
|
19
|
+
L-Methionine,CSCC[C@H](N)C(O)=O,FFEARJCKVFRZRR-BYPYZUCNSA-N,C5H11NO2S,CID:6137,pubchem
|
|
20
|
+
L-Valine,CC(C)[C@H](N)C(O)=O,KZSNJWFQEVHDMF-BYPYZUCNSA-N,C5H11NO2,CID:6287,pubchem
|
|
21
|
+
L-Tryptophan,N[C@@H](CC1=CNC2=C1C=CC=C2)C(O)=O,QIVBCDIJIAJPQS-VIFPVBQESA-N,C11H12N2O2,CID:6305,pubchem
|
|
22
|
+
Glycine,NCC(O)=O,DHMQDGOQFOQNFH-UHFFFAOYSA-N,C2H5NO2,CID:750,pubchem
|
|
@@ -123,11 +123,13 @@ class Lib:
|
|
|
123
123
|
"inchi": pl.Series([], dtype=pl.Utf8),
|
|
124
124
|
"inchikey": pl.Series([], dtype=pl.Utf8),
|
|
125
125
|
"formula": pl.Series([], dtype=pl.Utf8),
|
|
126
|
+
"iso": pl.Series([], dtype=pl.Int64),
|
|
126
127
|
"adduct": pl.Series([], dtype=pl.Utf8),
|
|
127
128
|
"m": pl.Series([], dtype=pl.Float64),
|
|
128
129
|
"z": pl.Series([], dtype=pl.Int8),
|
|
129
130
|
"mz": pl.Series([], dtype=pl.Float64),
|
|
130
131
|
"rt": pl.Series([], dtype=pl.Float64),
|
|
132
|
+
"quant_group": pl.Series([], dtype=pl.Int64),
|
|
131
133
|
"db_id": pl.Series([], dtype=pl.Utf8),
|
|
132
134
|
"db": pl.Series([], dtype=pl.Utf8),
|
|
133
135
|
})
|
|
@@ -245,11 +247,13 @@ class Lib:
|
|
|
245
247
|
"inchi": compound_data.get("inchi", ""),
|
|
246
248
|
"inchikey": compound_data.get("inchikey", ""),
|
|
247
249
|
"formula": compound_data["formula"],
|
|
250
|
+
"iso": 0, # Default to zero
|
|
248
251
|
"adduct": adduct,
|
|
249
252
|
"m": adducted_mass,
|
|
250
253
|
"z": charge,
|
|
251
254
|
"mz": mz,
|
|
252
255
|
"rt": compound_data.get("rt", None),
|
|
256
|
+
"quant_group": counter, # Use same as lib_uid for default
|
|
253
257
|
"db_id": compound_data.get("db_id", None),
|
|
254
258
|
"db": compound_data.get("db", None),
|
|
255
259
|
}
|
|
@@ -526,12 +530,14 @@ class Lib:
|
|
|
526
530
|
"source_id": match_row.get("source_id"),
|
|
527
531
|
"name": match_row["name"],
|
|
528
532
|
"formula": match_row["formula"],
|
|
533
|
+
"iso": match_row.get("iso", 0),
|
|
529
534
|
"adduct": match_row["adduct"],
|
|
530
535
|
"smiles": match_row["smiles"],
|
|
531
536
|
"inchi": match_row["inchi"],
|
|
532
537
|
"inchikey": match_row["inchikey"],
|
|
533
538
|
"lib_mz": match_row["mz"],
|
|
534
539
|
"lib_rt": match_row["rt"],
|
|
540
|
+
"quant_group": match_row.get("quant_group"),
|
|
535
541
|
"delta_mz": abs(feature_mz - match_row["mz"]),
|
|
536
542
|
"delta_rt": abs(feature_rt - match_row["rt"]) if feature_rt is not None and match_row["rt"] is not None else None,
|
|
537
543
|
}
|
|
@@ -155,13 +155,14 @@ def load_noms1(
|
|
|
155
155
|
|
|
156
156
|
# check if file is mzML
|
|
157
157
|
if filename.lower().endswith(".mzml"):
|
|
158
|
-
|
|
158
|
+
_load_mzML(self, filename)
|
|
159
159
|
elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
|
|
160
|
-
|
|
160
|
+
_load_wiff(self, filename)
|
|
161
161
|
elif filename.lower().endswith(".raw"):
|
|
162
|
-
|
|
162
|
+
_load_raw(self, filename)
|
|
163
163
|
elif filename.lower().endswith(".sample5"):
|
|
164
|
-
|
|
164
|
+
from masster.sample.h5 import _load_sample5_study
|
|
165
|
+
_load_sample5_study(self, filename) # Use optimized version for study loading
|
|
165
166
|
else:
|
|
166
167
|
raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")
|
|
167
168
|
|
|
@@ -1,65 +1,104 @@
|
|
|
1
|
-
"""Parameter class for Study
|
|
1
|
+
"""Parameter class for Study align method."""
|
|
2
2
|
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import Any
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
@dataclass
|
|
8
|
-
class
|
|
8
|
+
class align_defaults:
|
|
9
9
|
"""
|
|
10
|
-
Parameter class for Study
|
|
10
|
+
Parameter class for Study align method.
|
|
11
11
|
|
|
12
|
-
This class encapsulates parameters for
|
|
13
|
-
|
|
12
|
+
This class encapsulates parameters for feature alignment across samples,
|
|
13
|
+
including retention time and m/z tolerances, warping parameters, and
|
|
14
|
+
alignment algorithm settings.
|
|
14
15
|
|
|
15
16
|
Attributes:
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
17
|
+
algorithm (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD). Default is 'kd'.
|
|
18
|
+
rt_tol (float): Maximum retention time difference for alignment (seconds). Default is 5.0.
|
|
19
|
+
mz_max_diff (float): Maximum m/z difference for alignment (Da). Default is 0.01.
|
|
20
|
+
rt_pair_distance_frac (float): Fraction of RT difference for pair distance. Default is 0.5.
|
|
21
|
+
mz_pair_max_distance (float): Maximum m/z pair distance. Default is 0.01.
|
|
22
|
+
num_used_points (int): Number of points used for alignment. Default is 1000.
|
|
23
|
+
save_features (bool): Whether to save features after alignment. Default is False.
|
|
24
|
+
skip_blanks (bool): Whether to skip blank samples. Default is False.
|
|
25
|
+
|
|
26
|
+
KD algorithm specific parameters:
|
|
27
|
+
warp_mz_tol (float): m/z tolerance for the LOWESS fit. Default is 0.05.
|
|
21
28
|
"""
|
|
22
29
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
30
|
+
rt_tol: float = 5.0
|
|
31
|
+
mz_max_diff: float = 0.01
|
|
32
|
+
rt_pair_distance_frac: float = 0.5
|
|
33
|
+
mz_pair_max_distance: float = 0.01
|
|
34
|
+
num_used_points: int = 1000
|
|
35
|
+
save_features: bool = False
|
|
36
|
+
skip_blanks: bool = False
|
|
37
|
+
algorithm: str = "kd"
|
|
38
|
+
|
|
39
|
+
# KD algorithm specific parameters
|
|
40
|
+
warp_mz_tol: float = 0.05
|
|
28
41
|
|
|
29
42
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
30
43
|
default_factory=lambda: {
|
|
31
|
-
"
|
|
32
|
-
"dtype":
|
|
33
|
-
"description": "
|
|
34
|
-
"default":
|
|
44
|
+
"rt_tol": {
|
|
45
|
+
"dtype": float,
|
|
46
|
+
"description": "Maximum retention time difference for alignment (seconds)",
|
|
47
|
+
"default": 5.0,
|
|
48
|
+
"min_value": 1.0,
|
|
49
|
+
"max_value": 30.0,
|
|
35
50
|
},
|
|
36
|
-
"
|
|
51
|
+
"mz_max_diff": {
|
|
37
52
|
"dtype": float,
|
|
38
|
-
"description": "m/z
|
|
39
|
-
"default": 0.
|
|
53
|
+
"description": "Maximum m/z difference for alignment (Da)",
|
|
54
|
+
"default": 0.01,
|
|
40
55
|
"min_value": 0.001,
|
|
41
|
-
"max_value": 0.
|
|
56
|
+
"max_value": 0.05,
|
|
42
57
|
},
|
|
43
|
-
"
|
|
58
|
+
"rt_pair_distance_frac": {
|
|
44
59
|
"dtype": float,
|
|
45
|
-
"description": "RT
|
|
46
|
-
"default":
|
|
47
|
-
"min_value": 1
|
|
48
|
-
"max_value":
|
|
60
|
+
"description": "Fraction of RT difference for pair distance calculation",
|
|
61
|
+
"default": 0.2,
|
|
62
|
+
"min_value": 0.1,
|
|
63
|
+
"max_value": 1.0,
|
|
49
64
|
},
|
|
50
|
-
"
|
|
65
|
+
"mz_pair_max_distance": {
|
|
51
66
|
"dtype": float,
|
|
52
|
-
"description": "
|
|
53
|
-
"default": 0.
|
|
54
|
-
"min_value": 0.
|
|
55
|
-
"max_value":
|
|
67
|
+
"description": "Maximum m/z pair distance (Da)",
|
|
68
|
+
"default": 0.01,
|
|
69
|
+
"min_value": 0.001,
|
|
70
|
+
"max_value": 0.2,
|
|
56
71
|
},
|
|
57
|
-
"
|
|
72
|
+
"num_used_points": {
|
|
58
73
|
"dtype": int,
|
|
59
|
-
"description": "
|
|
60
|
-
"default":
|
|
61
|
-
"min_value":
|
|
62
|
-
"max_value":
|
|
74
|
+
"description": "Number of points used for alignment",
|
|
75
|
+
"default": 1000,
|
|
76
|
+
"min_value": 10,
|
|
77
|
+
"max_value": 10000,
|
|
78
|
+
},
|
|
79
|
+
"save_features": {
|
|
80
|
+
"dtype": bool,
|
|
81
|
+
"description": "Whether to save features after alignment",
|
|
82
|
+
"default": False,
|
|
83
|
+
},
|
|
84
|
+
"skip_blanks": {
|
|
85
|
+
"dtype": bool,
|
|
86
|
+
"description": "Whether to skip blank samples during alignment",
|
|
87
|
+
"default": False,
|
|
88
|
+
},
|
|
89
|
+
"algorithm": {
|
|
90
|
+
"dtype": str,
|
|
91
|
+
"description": "Alignment algorithm to use",
|
|
92
|
+
"default": "pc",
|
|
93
|
+
"allowed_values": ["pc", "kd"],
|
|
94
|
+
},
|
|
95
|
+
# KD algorithm specific parameters
|
|
96
|
+
"warp_mz_tol": {
|
|
97
|
+
"dtype": float,
|
|
98
|
+
"description": "m/z tolerance for the LOWESS fit in KD algorithm (Da)",
|
|
99
|
+
"default": 0.05,
|
|
100
|
+
"min_value": 0.001,
|
|
101
|
+
"max_value": 1.0,
|
|
63
102
|
},
|
|
64
103
|
},
|
|
65
104
|
repr=False,
|
|
@@ -111,14 +150,6 @@ class fill_chrom_defaults:
|
|
|
111
150
|
metadata = self._param_metadata[param_name]
|
|
112
151
|
expected_dtype = metadata["dtype"]
|
|
113
152
|
|
|
114
|
-
# Handle optional types
|
|
115
|
-
if isinstance(expected_dtype, str) and expected_dtype.startswith("Optional"):
|
|
116
|
-
if value is None:
|
|
117
|
-
return True
|
|
118
|
-
# Extract the inner type for validation
|
|
119
|
-
if "list" in expected_dtype:
|
|
120
|
-
expected_dtype = list
|
|
121
|
-
|
|
122
153
|
# Type checking
|
|
123
154
|
if expected_dtype is int:
|
|
124
155
|
if not isinstance(value, int):
|
|
@@ -132,8 +163,8 @@ class fill_chrom_defaults:
|
|
|
132
163
|
value = float(value)
|
|
133
164
|
except (ValueError, TypeError):
|
|
134
165
|
return False
|
|
135
|
-
elif expected_dtype is
|
|
136
|
-
if not isinstance(value,
|
|
166
|
+
elif expected_dtype is bool:
|
|
167
|
+
if not isinstance(value, bool):
|
|
137
168
|
return False
|
|
138
169
|
|
|
139
170
|
# Range validation for numeric types
|
|
@@ -143,6 +174,11 @@ class fill_chrom_defaults:
|
|
|
143
174
|
if "max_value" in metadata and value > metadata["max_value"]:
|
|
144
175
|
return False
|
|
145
176
|
|
|
177
|
+
# Allowed values validation for string types
|
|
178
|
+
if expected_dtype is str and "allowed_values" in metadata:
|
|
179
|
+
if value not in metadata["allowed_values"]:
|
|
180
|
+
return False
|
|
181
|
+
|
|
146
182
|
return True
|
|
147
183
|
|
|
148
184
|
def set(self, param_name: str, value: Any, validate: bool = True) -> bool:
|
|
@@ -167,20 +203,18 @@ class fill_chrom_defaults:
|
|
|
167
203
|
if param_name in self._param_metadata:
|
|
168
204
|
expected_dtype = self._param_metadata[param_name]["dtype"]
|
|
169
205
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
if validate:
|
|
183
|
-
return False
|
|
206
|
+
if expected_dtype is int and not isinstance(value, int):
|
|
207
|
+
try:
|
|
208
|
+
value = int(value)
|
|
209
|
+
except (ValueError, TypeError):
|
|
210
|
+
if validate:
|
|
211
|
+
return False
|
|
212
|
+
elif expected_dtype is float and not isinstance(value, float):
|
|
213
|
+
try:
|
|
214
|
+
value = float(value)
|
|
215
|
+
except (ValueError, TypeError):
|
|
216
|
+
if validate:
|
|
217
|
+
return False
|
|
184
218
|
|
|
185
219
|
setattr(self, param_name, value)
|
|
186
220
|
return True
|
|
@@ -21,10 +21,11 @@ class fill_defaults:
|
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
23
|
uids: Optional[list] = None
|
|
24
|
-
mz_tol: float = 0.
|
|
24
|
+
mz_tol: float = 0.050
|
|
25
25
|
rt_tol: float = 10.0
|
|
26
26
|
min_samples_rel: float = 0.00
|
|
27
27
|
min_samples_abs: int = 5
|
|
28
|
+
threads: int = 6
|
|
28
29
|
|
|
29
30
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
30
31
|
default_factory=lambda: {
|
|
@@ -58,9 +59,16 @@ class fill_defaults:
|
|
|
58
59
|
"dtype": int,
|
|
59
60
|
"description": "Minimum absolute samples threshold",
|
|
60
61
|
"default": 5,
|
|
61
|
-
"min_value":
|
|
62
|
+
"min_value": 0,
|
|
62
63
|
"max_value": 100,
|
|
63
64
|
},
|
|
65
|
+
"threads": {
|
|
66
|
+
"dtype": int,
|
|
67
|
+
"description": "Number of parallel threads",
|
|
68
|
+
"default": 6,
|
|
69
|
+
"min_value": 1,
|
|
70
|
+
"max_value": 32,
|
|
71
|
+
},
|
|
64
72
|
},
|
|
65
73
|
repr=False,
|
|
66
74
|
)
|
|
@@ -13,7 +13,7 @@ class merge_defaults:
|
|
|
13
13
|
method selection, grouping tolerances, and algorithm-specific parameters.
|
|
14
14
|
|
|
15
15
|
Attributes:
|
|
16
|
-
method (str): Merge method to use ('kd', 'qt', '
|
|
16
|
+
method (str): Merge method to use ('kd', 'qt', 'kd_chunked', 'qt_chunked'). Default is "kd".
|
|
17
17
|
min_samples (int): Minimum number of samples for a consensus feature. Default is 2.
|
|
18
18
|
rt_tol (float): RT tolerance for grouping (seconds). Default is 5.0.
|
|
19
19
|
mz_tol (float): m/z tolerance for grouping (Da for all methods). Default is 0.05.
|
|
@@ -25,38 +25,31 @@ class merge_defaults:
|
|
|
25
25
|
link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
method: str = "
|
|
28
|
+
method: str = "kd"
|
|
29
29
|
min_samples: int = 2
|
|
30
30
|
rt_tol: float = 5.0
|
|
31
|
-
mz_tol: float = 0.
|
|
31
|
+
mz_tol: float = 0.05
|
|
32
32
|
chunk_size: int = 500
|
|
33
33
|
nr_partitions: int = 1000
|
|
34
34
|
min_rel_cc_size: float = 0.1
|
|
35
35
|
max_pairwise_log_fc: float = -1.0
|
|
36
36
|
max_nr_conflicts: int = 0
|
|
37
37
|
link_ms2: bool = True
|
|
38
|
+
extract_ms1: bool = True
|
|
39
|
+
|
|
40
|
+
# Cross-chunk merging parameters
|
|
41
|
+
dechunking: str = "hierarchical"
|
|
38
42
|
|
|
39
43
|
# Parallel processing parameters
|
|
40
44
|
threads: Optional[int] = None
|
|
41
|
-
|
|
42
|
-
# KD-Strict specific parameters
|
|
43
|
-
optimize_rt_tol: bool = False
|
|
44
|
-
rt_tol_range: tuple = (0.5, 4.0)
|
|
45
|
-
rt_tol_steps: int = 7
|
|
46
|
-
secondary_merge_rt_tol: float = 1.0
|
|
47
|
-
secondary_merge_mz_tol: float = 0.005
|
|
48
|
-
min_sample_overlap: float = 0.8
|
|
49
|
-
max_rt_spread: float = 2.0 # Will default to 2x rt_tol
|
|
50
|
-
min_coherence: float = 0.0
|
|
51
45
|
|
|
52
46
|
_param_metadata: dict[str, dict[str, Any]] = field(
|
|
53
47
|
default_factory=lambda: {
|
|
54
48
|
"method": {
|
|
55
49
|
"dtype": str,
|
|
56
50
|
"description": "Merge method (algorithm) to use",
|
|
57
|
-
"default": "
|
|
58
|
-
"allowed_values": ["
|
|
59
|
-
"kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
|
|
51
|
+
"default": "kd",
|
|
52
|
+
"allowed_values": ["kd", "qt",
|
|
60
53
|
"kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
|
|
61
54
|
},
|
|
62
55
|
"min_samples": {
|
|
@@ -118,7 +111,17 @@ class merge_defaults:
|
|
|
118
111
|
"description": "Whether to link MS2 spectra to consensus features",
|
|
119
112
|
"default": True,
|
|
120
113
|
},
|
|
121
|
-
|
|
114
|
+
"extract_ms1": {
|
|
115
|
+
"dtype": bool,
|
|
116
|
+
"description": "Whether to extract MS1 chromatograms for consensus features",
|
|
117
|
+
"default": True,
|
|
118
|
+
},
|
|
119
|
+
"dechunking": {
|
|
120
|
+
"dtype": str,
|
|
121
|
+
"description": "Cross-chunk merging algorithm for chunked methods",
|
|
122
|
+
"default": "hierarchical",
|
|
123
|
+
"allowed_values": ["hierarchical", "kdtree"],
|
|
124
|
+
},
|
|
122
125
|
"threads": {
|
|
123
126
|
"dtype": [int, type(None)],
|
|
124
127
|
"description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
|
|
@@ -126,58 +129,6 @@ class merge_defaults:
|
|
|
126
129
|
"min_value": 1,
|
|
127
130
|
"max_value": 32,
|
|
128
131
|
},
|
|
129
|
-
# KD-Strict specific parameters
|
|
130
|
-
"optimize_rt_tol": {
|
|
131
|
-
"dtype": bool,
|
|
132
|
-
"description": "Enable RT tolerance optimization for kd-strict method",
|
|
133
|
-
"default": False,
|
|
134
|
-
},
|
|
135
|
-
"rt_tol_range": {
|
|
136
|
-
"dtype": tuple,
|
|
137
|
-
"description": "RT tolerance range for optimization (min, max) in seconds",
|
|
138
|
-
"default": (0.8, 2.0),
|
|
139
|
-
},
|
|
140
|
-
"rt_tol_steps": {
|
|
141
|
-
"dtype": int,
|
|
142
|
-
"description": "Number of steps for RT tolerance optimization",
|
|
143
|
-
"default": 5,
|
|
144
|
-
"min_value": 3,
|
|
145
|
-
"max_value": 20,
|
|
146
|
-
},
|
|
147
|
-
"secondary_merge_rt_tol": {
|
|
148
|
-
"dtype": float,
|
|
149
|
-
"description": "RT tolerance for secondary clustering in kd-strict (seconds)",
|
|
150
|
-
"default": 0.5,
|
|
151
|
-
"min_value": 0.1,
|
|
152
|
-
"max_value": 5.0,
|
|
153
|
-
},
|
|
154
|
-
"secondary_merge_mz_tol": {
|
|
155
|
-
"dtype": float,
|
|
156
|
-
"description": "m/z tolerance for secondary clustering in kd-strict (Da)",
|
|
157
|
-
"default": 0.005,
|
|
158
|
-
"min_value": 0.001,
|
|
159
|
-
"max_value": 0.1,
|
|
160
|
-
},
|
|
161
|
-
"min_sample_overlap": {
|
|
162
|
-
"dtype": float,
|
|
163
|
-
"description": "Minimum sample overlap ratio for merging features (0.0-1.0)",
|
|
164
|
-
"default": 0.8,
|
|
165
|
-
"min_value": 0.0,
|
|
166
|
-
"max_value": 1.0,
|
|
167
|
-
},
|
|
168
|
-
"max_rt_spread": {
|
|
169
|
-
"dtype": float,
|
|
170
|
-
"description": "Maximum allowed RT spread in seconds (None = 3x rt_tol)",
|
|
171
|
-
"default": None,
|
|
172
|
-
"min_value": 0.1,
|
|
173
|
-
},
|
|
174
|
-
"min_coherence": {
|
|
175
|
-
"dtype": float,
|
|
176
|
-
"description": "Minimum chromatographic coherence score (0.0 = disabled)",
|
|
177
|
-
"default": 0.0,
|
|
178
|
-
"min_value": 0.0,
|
|
179
|
-
"max_value": 1.0,
|
|
180
|
-
},
|
|
181
132
|
},
|
|
182
133
|
repr=False,
|
|
183
134
|
)
|
|
@@ -551,7 +551,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
|
|
|
551
551
|
else:
|
|
552
552
|
self.logger.info("No identification data available for mzTab export")
|
|
553
553
|
except Exception as e:
|
|
554
|
-
self.logger.
|
|
554
|
+
self.logger.debug(f"Could not retrieve identification data: {e}")
|
|
555
555
|
id_data = None
|
|
556
556
|
top_id_data = None
|
|
557
557
|
full_id_data = None
|
|
@@ -1190,10 +1190,12 @@ def export_xlsx(self, filename: str | None = None) -> None:
|
|
|
1190
1190
|
"""
|
|
1191
1191
|
Export the study data to an Excel workbook with multiple worksheets.
|
|
1192
1192
|
|
|
1193
|
-
The Excel file contains
|
|
1194
|
-
-
|
|
1195
|
-
-
|
|
1193
|
+
The Excel file contains five worksheets:
|
|
1194
|
+
- samples: Samples dataframe
|
|
1195
|
+
- consensus: Consensus features dataframe
|
|
1196
1196
|
- identification: Identification results with library annotations (get_id)
|
|
1197
|
+
- gaps: Gaps matrix showing filled vs non-filled features (get_gaps_matrix)
|
|
1198
|
+
- matrix: Consensus matrix with samples as columns (get_consensus_matrix)
|
|
1197
1199
|
|
|
1198
1200
|
Args:
|
|
1199
1201
|
filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
|
|
@@ -1263,7 +1265,25 @@ def export_xlsx(self, filename: str | None = None) -> None:
|
|
|
1263
1265
|
f"Error getting identification data: {e}. Skipping identification worksheet.",
|
|
1264
1266
|
)
|
|
1265
1267
|
|
|
1266
|
-
# 4.
|
|
1268
|
+
# 4. Gaps matrix (filled vs non-filled features)
|
|
1269
|
+
try:
|
|
1270
|
+
gaps_df = self.get_gaps_matrix()
|
|
1271
|
+
if gaps_df is not None and not gaps_df.is_empty():
|
|
1272
|
+
gaps_pandas = gaps_df.to_pandas()
|
|
1273
|
+
worksheets["gaps"] = gaps_pandas
|
|
1274
|
+
self.logger.debug(
|
|
1275
|
+
f"Added gaps worksheet with {len(gaps_pandas)} rows",
|
|
1276
|
+
)
|
|
1277
|
+
else:
|
|
1278
|
+
self.logger.warning(
|
|
1279
|
+
"get_gaps_matrix() returned empty data, skipping gaps worksheet",
|
|
1280
|
+
)
|
|
1281
|
+
except Exception as e:
|
|
1282
|
+
self.logger.debug(
|
|
1283
|
+
f"Error getting gaps data: {e}. Skipping gaps worksheet.",
|
|
1284
|
+
)
|
|
1285
|
+
|
|
1286
|
+
# 5. Consensus matrix (last worksheet)
|
|
1267
1287
|
try:
|
|
1268
1288
|
matrix_df = self.get_consensus_matrix()
|
|
1269
1289
|
if matrix_df is not None and not matrix_df.is_empty():
|