masster 0.4.14__tar.gz → 0.4.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- {masster-0.4.14 → masster-0.4.16}/PKG-INFO +1 -1
- {masster-0.4.14 → masster-0.4.16}/pyproject.toml +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/__init__.py +2 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/_version.py +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/__init__.py +1 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/find_consensus_def.py +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/merge_def.py +69 -25
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py +65 -106
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/id.py +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/load.py +11 -6
- masster-0.4.16/src/masster/study/merge.py +1607 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/processing.py +0 -902
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/save.py +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/study.py +28 -31
- masster-0.4.16/src/masster/wizard/README.md +373 -0
- masster-0.4.16/src/masster/wizard/__init__.py +11 -0
- masster-0.4.16/src/masster/wizard/example.py +223 -0
- masster-0.4.16/src/masster/wizard/test_structure.py +49 -0
- masster-0.4.16/src/masster/wizard/test_wizard.py +285 -0
- masster-0.4.16/src/masster/wizard/wizard.py +1175 -0
- masster-0.4.16/src/masster/wizard.py +1175 -0
- {masster-0.4.14 → masster-0.4.16}/uv.lock +1 -1
- {masster-0.4.14 → masster-0.4.16}/.github/workflows/publish.yml +0 -0
- {masster-0.4.14 → masster-0.4.16}/.github/workflows/security.yml +0 -0
- {masster-0.4.14 → masster-0.4.16}/.github/workflows/test.yml +0 -0
- {masster-0.4.14 → masster-0.4.16}/.gitignore +0 -0
- {masster-0.4.14 → masster-0.4.16}/.pre-commit-config.yaml +0 -0
- {masster-0.4.14 → masster-0.4.16}/LICENSE +0 -0
- {masster-0.4.14 → masster-0.4.16}/Makefile +0 -0
- {masster-0.4.14 → masster-0.4.16}/README.md +0 -0
- {masster-0.4.14 → masster-0.4.16}/TESTING.md +0 -0
- {masster-0.4.14 → masster-0.4.16}/demo/example_batch_process.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/demo/example_sample_process.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/chromatogram.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/libs/ccm.csv +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/libs/urine.csv +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/lib/__init__.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/lib/lib.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/logger.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/__init__.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/adducts.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/__init__.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/find_adducts_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/find_features_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/find_ms2_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/sample_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/h5.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/helpers.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/lib.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/load.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/parameters.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/plot.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/processing.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/quant.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/sample.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/sample5_schema.json +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/save.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/sciex.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/spectrum.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/__init__.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/align_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/export_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/fill_chrom_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/fill_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/find_ms2_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/identify_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/integrate_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/study_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/export.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/helpers.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/parameters.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/plot.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/study5_schema.json +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/conftest.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_chromatogram.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_defaults.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_imports.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_integration.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_logger.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_parameters.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_sample.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_spectrum.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_study.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_version.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tox.ini +0 -0
{masster-0.4.14 → masster-0.4.16}/src/masster/__init__.py

@@ -16,6 +16,7 @@ from masster.lib import Lib
 from masster.sample.sample import Sample
 from masster.spectrum import Spectrum
 from masster.study.study import Study
+from masster.wizard import Wizard, wizard_def
 
 
 __all__ = [
@@ -24,6 +25,7 @@ __all__ = [
     "Sample",
     "Spectrum",
     "Study",
+    "Wizard",
     "__version__",
     # "get_version",
 ]
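The two hunks above re-export the new wizard module at the package top level. A minimal sketch of what that enables, assuming masster 0.4.16 is installed; nothing beyond the imports themselves is implied about the Wizard API:

```python
# Quick check that the new top-level exports resolve; the Wizard API itself
# is not shown in this diff, so only the imports are exercised here.
import masster
from masster import Wizard, wizard_def  # new in 0.4.16

print(masster.__version__)
print("Wizard" in masster.__all__)  # True after this change
```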
{masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/find_consensus_def.py

@@ -32,7 +32,7 @@ class find_consensus_defaults:
                 "dtype": str,
                 "description": "Feature grouping algorithm",
                 "default": "qt",
-                "allowed_values": ["qt", "kd", "unlabeled", "
+                "allowed_values": ["qt", "kd", "unlabeled", "kd-nowarp"],
             },
             "min_samples": {
                 "dtype": int,
{masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/merge_def.py

@@ -9,55 +9,99 @@ class merge_defaults:
     """
     Parameter class for Study merge method.
 
-    This class encapsulates parameters for
-
+    This class encapsulates parameters for all merge algorithms including
+    method selection, grouping tolerances, and algorithm-specific parameters.
 
     Attributes:
-
-        min_samples (int): Minimum number of samples for a consensus feature. Default is
+        method (str): Merge method to use ('kd', 'qt', 'kd-nowarp', 'chunked'). Default is "kd".
+        min_samples (int): Minimum number of samples for a consensus feature. Default is 50.
+        rt_tol (float): RT tolerance for grouping (seconds). Default is 2.0.
+        mz_tol (float): m/z tolerance for grouping (Da for all methods). Default is 0.01.
+        chunk_size (int): Chunk size for 'chunked' method. Default is 500.
+        nr_partitions (int): Number of partitions in m/z dimension for KD algorithms. Default is 500.
+        min_rel_cc_size (float): Minimum relative connected component size for conflict resolution. Default is 0.3.
+        max_pairwise_log_fc (float): Maximum pairwise log fold change for conflict resolution. Default is 0.5.
+        max_nr_conflicts (int): Maximum number of conflicts allowed in consensus feature. Default is 0.
         link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
-        mz_tol (float): m/z tolerance for grouping (Da). Default is 0.01.
-        rt_tol (float): RT tolerance for grouping (seconds). Default is 1.0.
     """
 
-
-    min_samples: int =
-
+    method: str = "kd"
+    min_samples: int = 10
+    rt_tol: float = 5.0
     mz_tol: float = 0.01
-
+    chunk_size: int = 300
+    nr_partitions: int = 1000
+    min_rel_cc_size: float = 0.2
+    max_pairwise_log_fc: float = -1.0
+    max_nr_conflicts: int = 0
+    link_ms2: bool = True
 
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
-            "
+            "method": {
                 "dtype": str,
-                "description": "
-                "default": "
-                "allowed_values": ["
+                "description": "Merge method (algorithm) to use",
+                "default": "kd",
+                "allowed_values": ["kd", "qt", "kd-nowarp", "chunked"],
             },
             "min_samples": {
                 "dtype": int,
                 "description": "Minimum number of samples for a consensus feature",
-                "default":
+                "default": 50,
                 "min_value": 1,
             },
-            "
-                "dtype":
-                "description": "
-                "default":
+            "rt_tol": {
+                "dtype": float,
+                "description": "RT tolerance for grouping (seconds)",
+                "default": 2.0,
+                "min_value": 0.1,
+                "max_value": 60.0,
             },
             "mz_tol": {
                 "dtype": float,
-                "description": "m/z tolerance for grouping (Da)",
+                "description": "m/z tolerance for grouping (Da for all methods)",
                 "default": 0.01,
                 "min_value": 0.001,
                 "max_value": 1.0,
             },
-            "
+            "chunk_size": {
+                "dtype": int,
+                "description": "Chunk size for 'chunked' method",
+                "default": 500,
+                "min_value": 10,
+            },
+            "nr_partitions": {
+                "dtype": int,
+                "description": "Number of partitions in m/z dimension for KD algorithms",
+                "default": 500,
+                "min_value": 10,
+                "max_value": 10000,
+            },
+            "min_rel_cc_size": {
                 "dtype": float,
-                "description": "
-                "default":
-                "min_value": 0.
-                "max_value":
+                "description": "Minimum relative connected component size for conflict resolution",
+                "default": 0.3,
+                "min_value": 0.0,
+                "max_value": 1.0,
+            },
+            "max_pairwise_log_fc": {
+                "dtype": float,
+                "description": "Maximum pairwise log fold change for conflict resolution",
+                "default": 0.5,
+                "min_value": 0.0,
+                "max_value": 10.0,
+            },
+            "max_nr_conflicts": {
+                "dtype": int,
+                "description": "Maximum number of conflicts allowed in consensus feature",
+                "default": 0,
+                "min_value": 0,
+                "max_value": 1000,
+            },
+            "link_ms2": {
+                "dtype": bool,
+                "description": "Whether to link MS2 spectra to consensus features",
+                "default": True,
             },
         },
         repr=False,
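Since merge_def.py now carries the full parameter set for every merge algorithm, a hedged usage sketch may help. It assumes merge_defaults is an ordinary dataclass importable from the path shown in the file listing; the validate_method helper is illustrative and not part of masster:

```python
# Sketch only: construct the new merge parameter object and check the chosen
# method against the metadata the class now declares. validate_method() is a
# hypothetical helper, not a masster API.
from masster.study.defaults.merge_def import merge_defaults

params = merge_defaults(method="chunked", rt_tol=2.0, mz_tol=0.01, chunk_size=500)

def validate_method(p: merge_defaults) -> None:
    meta = p._param_metadata["method"]
    if p.method not in meta["allowed_values"]:
        raise ValueError(f"method must be one of {meta['allowed_values']}, got {p.method!r}")

validate_method(params)
print(params.method, params.min_samples, params.nr_partitions)
```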
{masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py

@@ -56,6 +56,45 @@ def _decode_bytes_attr(attr_value):
     return str(attr_value) if attr_value is not None else ""
 
 
+def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFrame:
+    """Create an empty DataFrame with the correct schema based on study5_schema.json."""
+    if df_name not in schema:
+        # Fallback to basic empty DataFrame if schema not found
+        return pl.DataFrame()
+
+    df_schema = schema[df_name]["columns"]
+    empty_data = {}
+    polars_schema = {}
+
+    for col_name, col_info in df_schema.items():
+        dtype_str = col_info["dtype"]
+        # Convert string representation to actual Polars dtype
+        if dtype_str == "pl.Int64":
+            polars_dtype = pl.Int64
+        elif dtype_str == "pl.Int32":
+            polars_dtype = pl.Int32
+        elif dtype_str == "pl.Float64":
+            polars_dtype = pl.Float64
+        elif dtype_str == "pl.Utf8":
+            polars_dtype = pl.Utf8
+        elif dtype_str == "pl.String":
+            polars_dtype = pl.String
+        elif dtype_str == "pl.Boolean":
+            polars_dtype = pl.Boolean
+        elif dtype_str == "pl.Object":
+            polars_dtype = pl.Object
+        elif dtype_str == "pl.Null":
+            polars_dtype = pl.Null
+        else:
+            # Fallback to string if unknown type
+            polars_dtype = pl.String
+
+        empty_data[col_name] = []
+        polars_schema[col_name] = polars_dtype
+
+    return pl.DataFrame(empty_data, schema=polars_schema)
+
+
 def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=10000):
     """
     Save an entire DataFrame to HDF5 with optimized batch processing and memory efficiency.
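The helper added above maps the string dtype names stored in study5_schema.json onto Polars dtypes. A standalone sketch of that mapping, using a hypothetical two-column schema rather than the real schema file:

```python
# Standalone illustration of the dtype-string -> Polars dtype mapping the new
# helper performs. The schema dict below is a made-up two-column example.
import polars as pl

DTYPE_MAP = {
    "pl.Int64": pl.Int64,
    "pl.Float64": pl.Float64,
    "pl.Utf8": pl.Utf8,
    "pl.Boolean": pl.Boolean,
}

schema = {
    "samples_df": {
        "columns": {
            "sample_uid": {"dtype": "pl.Int64"},
            "sample_name": {"dtype": "pl.Utf8"},
        }
    }
}

columns = schema["samples_df"]["columns"]
# Unknown dtype strings fall back to a string column, mirroring the helper.
polars_schema = {name: DTYPE_MAP.get(info["dtype"], pl.String) for name, info in columns.items()}
empty = pl.DataFrame({name: [] for name in columns}, schema=polars_schema)
print(empty.shape, empty.schema)  # empty, but correctly typed
```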
{masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py

@@ -1080,7 +1119,7 @@ def _save_study5_compressed(self, filename):
     if not filename.endswith(".study5"):
         filename += ".study5"
 
-    self.logger.debug(f"
+    self.logger.debug(f"Save study")
 
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -1132,7 +1171,7 @@ def _save_study5_compressed(self, filename):
 
     with tqdm(
         total=total_steps,
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study ({sum(count for _, count in dataframes_to_save)} total rows)",
         disable=tdqm_disable,
     ) as pbar:
         # Create groups for organization
@@ -1186,8 +1225,11 @@ def _save_study5_compressed(self, filename):
                 )
                 pbar.update(1)
 
-
+            # Store features_df - use fast method that skips chrom and ms2_specs columns
             if self.features_df is not None and not self.features_df.is_empty():
+                pbar.set_description(
+                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving features ({len(self.features_df)} rows, compressed)"
+                )
                 self.logger.debug(
                     f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
                 )
{masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py

@@ -1411,7 +1453,7 @@ def _save_study5(self, filename):
     if not filename.endswith(".study5"):
         filename += ".study5"
 
-    self.logger.info(
+    self.logger.info("Save study...")
 
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -1463,7 +1505,7 @@ def _save_study5(self, filename):
 
     with tqdm(
         total=total_steps,
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study",
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study ({sum(count for _, count in dataframes_to_save)} total rows)",
         disable=tdqm_disable,
     ) as pbar:
         # Create groups for organization
@@ -1498,12 +1540,12 @@ def _save_study5(self, filename):
                 metadata_group.create_dataset("parameters", data="")
 
             pbar.update(1)
-            pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
-            )
 
             # Store samples_df - use optimized batch processing
             if self.samples_df is not None and not self.samples_df.is_empty():
+                pbar.set_description(
+                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving samples ({len(self.samples_df)} rows)"
+                )
                 samples_group = f.create_group("samples")
                 self.logger.debug(
                     f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
@@ -1519,6 +1561,9 @@ def _save_study5(self, filename):
 
             # Store features_df - use optimized batch processing
             if self.features_df is not None and not self.features_df.is_empty():
+                pbar.set_description(
+                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving features ({len(self.features_df)} rows)"
+                )
                 self.logger.debug(
                     f"Saving features_df with {len(self.features_df)} rows using optimized method",
                 )
@@ -1533,6 +1578,9 @@ def _save_study5(self, filename):
 
             # Store consensus_df - use optimized batch processing
            if self.consensus_df is not None and not self.consensus_df.is_empty():
+                pbar.set_description(
+                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving consensus ({len(self.consensus_df)} rows)"
+                )
                 self.logger.debug(
                     f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
                 )
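Both save paths now fold row counts into the progress-bar text and update the description per dataframe. A self-contained sketch of that reporting pattern; the dataframe names and counts below are made up:

```python
# Illustrative only: mimics the timestamped, row-count-aware tqdm descriptions
# the save functions now emit. Dataframe names and counts are hypothetical.
import time
from datetime import datetime
from tqdm import tqdm

dataframes_to_save = [("samples", 12), ("features", 48210), ("consensus", 3150)]
total_rows = sum(count for _, count in dataframes_to_save)

with tqdm(total=len(dataframes_to_save), desc=f"Saving study ({total_rows} total rows)", unit="step") as pbar:
    for name, rows in dataframes_to_save:
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        pbar.set_description(f"{ts} | INFO | Saving {name} ({rows} rows)")
        time.sleep(0.05)  # stand-in for the actual HDF5 write
        pbar.update(1)
```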
{masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py

@@ -1690,8 +1738,9 @@ def _load_study5(self, filename=None):
     # Use progress bar to show loading progress
     with tqdm(
         total=len(loading_steps),
-        desc=
+        desc="Loading study",
         disable=tdqm_disable,
+        unit="step"
     ) as pbar:
         # Load metadata
         pbar.set_description(
@@ -1792,83 +1841,7 @@ def _load_study5(self, filename=None):
                 self.logger.debug(
                     "No samples data found in study5 file. Initializing empty samples_df.",
                 )
-                self.samples_df =
-                    {
-                        "sample_uid": [],
-                        "sample_name": [],
-                        "sample_path": [],
-                        "sample_type": [],
-                        "size": [],
-                        "map_id": [],
-                        "sample_source": [],
-                        "num_ms1": [],
-                        "num_ms2": [],
-                        "sample_group": [],
-                        "sample_batch": [],
-                        "sample_sequence": [],
-                    },
-                    schema={
-                        "sample_uid": pl.Int64,
-                        "sample_name": pl.Utf8,
-                        "sample_path": pl.Utf8,
-                        "sample_type": pl.Utf8,
-                        "size": pl.Int64,
-                        "map_id": pl.Int64,
-                        "sample_source": pl.Utf8,
-                        "num_ms1": pl.Int64,
-                        "num_ms2": pl.Int64,
-                        "sample_group": pl.Utf8,
-                        "sample_batch": pl.Int64,
-                        "sample_sequence": pl.Int64,
-                    },
-                )
-            pbar.update(1)
-            # Load samples_df
-            pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
-            )
-            if "samples" in f and len(f["samples"].keys()) > 0:
-                self.samples_df = _load_dataframe_from_group(
-                    f["samples"],
-                    schema,
-                    "samples_df",
-                    self.logger,
-                )
-            else:
-                # Initialize empty samples_df with the correct schema if no data exists
-                self.logger.debug(
-                    "No samples data found in study5 file. Initializing empty samples_df.",
-                )
-                self.samples_df = pl.DataFrame(
-                    {
-                        "sample_uid": [],
-                        "sample_name": [],
-                        "sample_path": [],
-                        "sample_type": [],
-                        "size": [],
-                        "map_id": [],
-                        "sample_source": [],
-                        "num_ms1": [],
-                        "num_ms2": [],
-                        "sample_group": [],
-                        "sample_batch": [],
-                        "sample_sequence": [],
-                    },
-                    schema={
-                        "sample_uid": pl.Int64,
-                        "sample_name": pl.Utf8,
-                        "sample_path": pl.Utf8,
-                        "sample_type": pl.Utf8,
-                        "size": pl.Int64,
-                        "map_id": pl.Int64,
-                        "sample_source": pl.Utf8,
-                        "num_ms1": pl.Int64,
-                        "num_ms2": pl.Int64,
-                        "sample_group": pl.Utf8,
-                        "sample_batch": pl.Int64,
-                        "sample_sequence": pl.Int64,
-                    },
-                )
+                self.samples_df = _create_empty_dataframe_from_schema("samples_df", schema)
             pbar.update(1)
 
             # Load features_df
@@ -1885,7 +1858,7 @@ def _load_study5(self, filename=None):
                     object_columns,
                 )
             else:
-                self.features_df =
+                self.features_df = _create_empty_dataframe_from_schema("features_df", schema)
             pbar.update(1)
 
             # Load consensus_df
@@ -1942,7 +1915,7 @@ def _load_study5(self, filename=None):
                     ],
                 )
             else:
-                self.consensus_df =
+                self.consensus_df = _create_empty_dataframe_from_schema("consensus_df", schema)
             pbar.update(1)
 
             # Load consensus_mapping_df
@@ -1957,21 +1930,7 @@ def _load_study5(self, filename=None):
                    self.logger,
                )
            else:
-                self.consensus_mapping_df =
-            pbar.update(1)
-            # Load consensus_mapping_df
-            pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
-            )
-            if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
-                self.consensus_mapping_df = _load_dataframe_from_group(
-                    f["consensus_mapping"],
-                    schema,
-                    "consensus_mapping_df",
-                    self.logger,
-                )
-            else:
-                self.consensus_mapping_df = None
+                self.consensus_mapping_df = _create_empty_dataframe_from_schema("consensus_mapping_df", schema)
             pbar.update(1)
 
             # Load consensus_ms2
@@ -1988,7 +1947,7 @@ def _load_study5(self, filename=None):
                     object_columns,
                 )
             else:
-                self.consensus_ms2 =
+                self.consensus_ms2 = _create_empty_dataframe_from_schema("consensus_ms2", schema)
             pbar.update(1)
 
             # Load lib_df
@@ -2004,7 +1963,7 @@ def _load_study5(self, filename=None):
                     [],
                 )
             else:
-                self.lib_df =
+                self.lib_df = _create_empty_dataframe_from_schema("lib_df", schema)
             pbar.update(1)
 
             # Load id_df
@@ -2020,7 +1979,7 @@ def _load_study5(self, filename=None):
                     [],
                 )
             else:
-                self.id_df =
+                self.id_df = _create_empty_dataframe_from_schema("id_df", schema)
             pbar.update(1)
 
             # Check and migrate old string-based map_id to integer indices
{masster-0.4.14 → masster-0.4.16}/src/masster/study/id.py

@@ -1291,7 +1291,7 @@ def _get_adducts(study, adducts_list: list = None, **kwargs):
 
     logger = getattr(study, "logger", None)
     if logger:
-        logger.
+        logger.trace(
             f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})",
         )
 
{masster-0.4.14 → masster-0.4.16}/src/masster/study/load.py

@@ -214,13 +214,18 @@ def load(self, filename=None):
 
     # self.logger.info(f"Loading study from {filename}")
     self._load_study5(filename)
-
-
-    if
-
-
+
+    # After loading the study, check if we have consensus features before loading consensus XML
+    if (self.consensus_df is not None and not self.consensus_df.is_empty()):
+        consensus_xml_path = filename.replace(".study5", ".consensusXML")
+        if os.path.exists(consensus_xml_path):
+            self._load_consensusXML(filename=consensus_xml_path)
+            # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
+        else:
+            self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
     else:
-        self.logger.
+        self.logger.debug("No consensus features found, skipping consensusXML loading")
+
     self.filename = filename
 
 
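The load() change above auto-loads a consensusXML file sitting next to the .study5 file whenever the loaded study already contains consensus features, and warns if that sibling file is missing. A small sketch of the sibling-path convention; the paths are examples only:

```python
# Sketch of the sibling-file lookup load() now performs; the study path is hypothetical.
import os

study_path = "experiments/plasma_batch1.study5"  # example path
consensus_xml_path = study_path.replace(".study5", ".consensusXML")

if os.path.exists(consensus_xml_path):
    print(f"load() would also call _load_consensusXML on {consensus_xml_path}")
else:
    print(f"load() would warn that no consensus XML exists at {consensus_xml_path}")
```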