masster 0.5.11.tar.gz → 0.5.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster has been flagged as possibly problematic.

Files changed (96)
  1. {masster-0.5.11 → masster-0.5.13}/PKG-INFO +1 -1
  2. {masster-0.5.11 → masster-0.5.13}/pyproject.toml +1 -1
  3. {masster-0.5.11 → masster-0.5.13}/src/masster/_version.py +1 -1
  4. {masster-0.5.11 → masster-0.5.13}/src/masster/study/h5.py +41 -2
  5. {masster-0.5.11 → masster-0.5.13}/src/masster/study/id.py +4 -3
  6. masster-0.5.13/src/masster/study/importers.py +222 -0
  7. {masster-0.5.11 → masster-0.5.13}/src/masster/study/merge.py +6 -9
  8. {masster-0.5.11 → masster-0.5.13}/src/masster/study/plot.py +84 -12
  9. {masster-0.5.11 → masster-0.5.13}/src/masster/study/study.py +4 -0
  10. {masster-0.5.11 → masster-0.5.13}/src/masster/study/study5_schema.json +3 -0
  11. {masster-0.5.11 → masster-0.5.13}/uv.lock +1 -1
  12. {masster-0.5.11 → masster-0.5.13}/.github/workflows/publish.yml +0 -0
  13. {masster-0.5.11 → masster-0.5.13}/.github/workflows/security.yml +0 -0
  14. {masster-0.5.11 → masster-0.5.13}/.github/workflows/test.yml +0 -0
  15. {masster-0.5.11 → masster-0.5.13}/.gitignore +0 -0
  16. {masster-0.5.11 → masster-0.5.13}/.pre-commit-config.yaml +0 -0
  17. {masster-0.5.11 → masster-0.5.13}/LICENSE +0 -0
  18. {masster-0.5.11 → masster-0.5.13}/Makefile +0 -0
  19. {masster-0.5.11 → masster-0.5.13}/README.md +0 -0
  20. {masster-0.5.11 → masster-0.5.13}/TESTING.md +0 -0
  21. {masster-0.5.11 → masster-0.5.13}/demo/example_batch_process.py +0 -0
  22. {masster-0.5.11 → masster-0.5.13}/demo/example_sample_process.py +0 -0
  23. {masster-0.5.11 → masster-0.5.13}/src/masster/__init__.py +0 -0
  24. {masster-0.5.11 → masster-0.5.13}/src/masster/chromatogram.py +0 -0
  25. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  26. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  27. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  28. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  29. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  30. {masster-0.5.11 → masster-0.5.13}/src/masster/data/libs/aa.csv +0 -0
  31. {masster-0.5.11 → masster-0.5.13}/src/masster/data/libs/ccm.csv +0 -0
  32. {masster-0.5.11 → masster-0.5.13}/src/masster/data/libs/hilic.csv +0 -0
  33. {masster-0.5.11 → masster-0.5.13}/src/masster/data/libs/urine.csv +0 -0
  34. {masster-0.5.11 → masster-0.5.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  35. {masster-0.5.11 → masster-0.5.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  36. {masster-0.5.11 → masster-0.5.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  37. {masster-0.5.11 → masster-0.5.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  38. {masster-0.5.11 → masster-0.5.13}/src/masster/lib/__init__.py +0 -0
  39. {masster-0.5.11 → masster-0.5.13}/src/masster/lib/lib.py +0 -0
  40. {masster-0.5.11 → masster-0.5.13}/src/masster/logger.py +0 -0
  41. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/__init__.py +0 -0
  42. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/adducts.py +0 -0
  43. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/__init__.py +0 -0
  44. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  45. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/find_features_def.py +0 -0
  46. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  47. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  48. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/sample_def.py +0 -0
  49. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/h5.py +0 -0
  50. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/helpers.py +0 -0
  51. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/lib.py +0 -0
  52. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/load.py +0 -0
  53. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/parameters.py +0 -0
  54. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/plot.py +0 -0
  55. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/processing.py +0 -0
  56. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/quant.py +0 -0
  57. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/sample.py +0 -0
  58. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/sample5_schema.json +0 -0
  59. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/save.py +0 -0
  60. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/sciex.py +0 -0
  61. {masster-0.5.11 → masster-0.5.13}/src/masster/spectrum.py +0 -0
  62. {masster-0.5.11 → masster-0.5.13}/src/masster/study/__init__.py +0 -0
  63. {masster-0.5.11 → masster-0.5.13}/src/masster/study/analysis.py +0 -0
  64. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/__init__.py +0 -0
  65. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/align_def.py +0 -0
  66. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/export_def.py +0 -0
  67. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/fill_def.py +0 -0
  68. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/find_consensus_def.py +0 -0
  69. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/find_ms2_def.py +0 -0
  70. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/identify_def.py +0 -0
  71. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  72. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/integrate_def.py +0 -0
  73. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/merge_def.py +0 -0
  74. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/study_def.py +0 -0
  75. {masster-0.5.11 → masster-0.5.13}/src/masster/study/export.py +0 -0
  76. {masster-0.5.11 → masster-0.5.13}/src/masster/study/helpers.py +0 -0
  77. {masster-0.5.11 → masster-0.5.13}/src/masster/study/load.py +0 -0
  78. {masster-0.5.11 → masster-0.5.13}/src/masster/study/parameters.py +0 -0
  79. {masster-0.5.11 → masster-0.5.13}/src/masster/study/processing.py +0 -0
  80. {masster-0.5.11 → masster-0.5.13}/src/masster/study/save.py +0 -0
  81. {masster-0.5.11 → masster-0.5.13}/src/masster/wizard/README.md +0 -0
  82. {masster-0.5.11 → masster-0.5.13}/src/masster/wizard/__init__.py +0 -0
  83. {masster-0.5.11 → masster-0.5.13}/src/masster/wizard/example.py +0 -0
  84. {masster-0.5.11 → masster-0.5.13}/src/masster/wizard/wizard.py +0 -0
  85. {masster-0.5.11 → masster-0.5.13}/tests/conftest.py +0 -0
  86. {masster-0.5.11 → masster-0.5.13}/tests/test_chromatogram.py +0 -0
  87. {masster-0.5.11 → masster-0.5.13}/tests/test_defaults.py +0 -0
  88. {masster-0.5.11 → masster-0.5.13}/tests/test_imports.py +0 -0
  89. {masster-0.5.11 → masster-0.5.13}/tests/test_integration.py +0 -0
  90. {masster-0.5.11 → masster-0.5.13}/tests/test_logger.py +0 -0
  91. {masster-0.5.11 → masster-0.5.13}/tests/test_parameters.py +0 -0
  92. {masster-0.5.11 → masster-0.5.13}/tests/test_sample.py +0 -0
  93. {masster-0.5.11 → masster-0.5.13}/tests/test_spectrum.py +0 -0
  94. {masster-0.5.11 → masster-0.5.13}/tests/test_study.py +0 -0
  95. {masster-0.5.11 → masster-0.5.13}/tests/test_version.py +0 -0
  96. {masster-0.5.11 → masster-0.5.13}/tox.ini +0 -0
{masster-0.5.11 → masster-0.5.13}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.11
+Version: 0.5.13
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
{masster-0.5.11 → masster-0.5.13}/pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "masster"
-version = "0.5.11"
+version = "0.5.13"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }
{masster-0.5.11 → masster-0.5.13}/src/masster/_version.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 
-__version__ = "0.5.11"
+__version__ = "0.5.13"
 
 
 def get_version():
{masster-0.5.11 → masster-0.5.13}/src/masster/study/h5.py
@@ -874,8 +874,47 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
 
     # Create DataFrame with regular columns first
     if regular_data:
-        df = pl.DataFrame(regular_data)
-        # print(f"DEBUG: Created DataFrame with regular columns, shape: {df.shape}")
+        # Final safety check: convert any remaining numpy object arrays to Python lists
+        # and handle numpy scalars within lists
+        safe_regular_data = {}
+        import numpy as np
+
+        def convert_numpy_scalars(value):
+            """Convert numpy scalars to Python native types recursively."""
+            if isinstance(value, np.generic):
+                return value.item()  # Convert numpy scalar to Python scalar
+            elif isinstance(value, list):
+                return [convert_numpy_scalars(item) for item in value]
+            else:
+                return value
+
+        for k, v in regular_data.items():
+            if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+                # Convert numpy object array to Python list
+                safe_regular_data[k] = [convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, 'tolist') else list(v))]
+            elif isinstance(v, list):
+                # Handle lists that might contain numpy scalars
+                safe_regular_data[k] = [convert_numpy_scalars(item) for item in v]
+            else:
+                safe_regular_data[k] = convert_numpy_scalars(v)
+
+        # Create DataFrame with proper error handling
+        try:
+            df = pl.DataFrame(safe_regular_data)
+        except Exception as e:
+            # If direct creation fails, try creating column by column to identify and handle problematic columns
+            df = pl.DataFrame()
+            for k, v in safe_regular_data.items():
+                try:
+                    df = df.with_columns([pl.Series(k, v)])
+                except Exception:
+                    # Skip problematic columns or convert them to string as a fallback
+                    try:
+                        df = df.with_columns([pl.Series(k, [str(item) for item in v])])
+                    except Exception:
+                        # Last resort: skip the column entirely
+                        continue
+
     # Add Object columns one by one
     for col, values in object_data.items():
         # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
{masster-0.5.11 → masster-0.5.13}/src/masster/study/id.py
@@ -661,7 +661,8 @@ def _update_consensus_id_columns(study, logger=None):
         ("id_top_name", pl.String),
         ("id_top_class", pl.String),
         ("id_top_adduct", pl.String),
-        ("id_top_score", pl.Float64)
+        ("id_top_score", pl.Float64),
+        ("id_source", pl.String)
     ]:
         if col_name not in study.consensus_df.columns:
             study.consensus_df = study.consensus_df.with_columns(
@@ -1076,7 +1077,7 @@ def id_reset(study):
 
     # Check which columns exist before trying to update them
     id_columns_to_reset = []
-    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
+    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
        if col in study.consensus_df.columns:
            if col == "id_top_score":
                id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
@@ -1170,7 +1171,7 @@ def lib_reset(study):
 
     # Check which columns exist before trying to update them
     id_columns_to_reset = []
-    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
+    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
        if col in study.consensus_df.columns:
            if col == "id_top_score":
                id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
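Illustrative sketch (not part of the diff): with this change, id_reset() and lib_reset() also blank the new id_source column alongside the other id_top_* columns, assuming the column is present in consensus_df.

study.id_reset()
assert study.consensus_df["id_source"].is_null().all()  # oracle/library annotations cleared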
masster-0.5.13/src/masster/study/importers.py
@@ -0,0 +1,222 @@
+"""
+import.py
+
+Module providing import functionality for Study class, specifically for importing
+oracle identification data into consensus features.
+"""
+
+from __future__ import annotations
+
+import os
+import pandas as pd
+import polars as pl
+
+
+def import_oracle(
+    self,
+    folder,
+    min_id_level=None,
+    max_id_level=None,
+):
+    """
+    Import oracle identification data and map it to consensus features.
+
+    This method reads oracle identification results from folder/diag/summary_by_feature.csv
+    and maps them to consensus features using the 'uit' (feature_uid) column. The oracle
+    data is used to populate identification columns in consensus_df.
+
+    Parameters:
+        folder (str): Path to oracle folder containing diag/summary_by_feature.csv
+        min_id_level (int, optional): Minimum identification level to include
+        max_id_level (int, optional): Maximum identification level to include
+
+    Returns:
+        None: Updates consensus_df in-place with oracle identification data
+
+    Raises:
+        FileNotFoundError: If the oracle summary file doesn't exist
+        ValueError: If consensus_df is empty or doesn't have required columns
+
+    Example:
+        >>> study.import_oracle(
+        ...     folder="path/to/oracle_results",
+        ...     min_id_level=2,
+        ...     max_id_level=4
+        ... )
+    """
+
+    self.logger.info(f"Starting oracle import from folder: {folder}")
+
+    # Validate inputs
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        raise ValueError("consensus_df is empty or not available. Run merge() first.")
+
+    if "consensus_uid" not in self.consensus_df.columns:
+        raise ValueError("consensus_df must contain 'consensus_uid' column")
+
+    # Check if oracle file exists
+    oracle_file_path = os.path.join(folder, "diag", "summary_by_feature.csv")
+    if not os.path.exists(oracle_file_path):
+        raise FileNotFoundError(f"Oracle summary file not found: {oracle_file_path}")
+
+    self.logger.debug(f"Loading oracle data from: {oracle_file_path}")
+
+    try:
+        # Read oracle data using pandas first for easier processing
+        oracle_data = pd.read_csv(oracle_file_path)
+        self.logger.info(f"Oracle data loaded successfully with {len(oracle_data)} rows")
+    except Exception as e:
+        self.logger.error(f"Could not read {oracle_file_path}: {e}")
+        raise
+
+    # Select relevant columns from oracle data
+    required_oracle_cols = ["title", "id_level", "id_label", "id_ion", "id_class", "score"]
+    missing_cols = [col for col in required_oracle_cols if col not in oracle_data.columns]
+    if missing_cols:
+        raise ValueError(f"Oracle data missing required columns: {missing_cols}")
+
+    oracle_subset = oracle_data[required_oracle_cols].copy()
+
+    # Extract consensus_uid from title column (format: "uid:XYZ, ...")
+    self.logger.debug("Extracting consensus UIDs from oracle titles using pattern 'uid:(\\d+)'")
+    oracle_subset["consensus_uid"] = oracle_subset["title"].str.extract(r"uid:(\d+)")
+
+    # Remove rows where consensus_uid extraction failed
+    oracle_subset = oracle_subset.dropna(subset=["consensus_uid"])
+    oracle_subset["consensus_uid"] = oracle_subset["consensus_uid"].astype(int)
+
+    self.logger.debug(f"Extracted consensus UIDs for {len(oracle_subset)} oracle entries")
+
+    # Apply id_level filters if specified
+    initial_count = len(oracle_subset)
+    if min_id_level is not None:
+        oracle_subset = oracle_subset[oracle_subset["id_level"] >= min_id_level]
+        self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_subset)} entries")
+
+    if max_id_level is not None:
+        oracle_subset = oracle_subset[oracle_subset["id_level"] <= max_id_level]
+        self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_subset)} entries")
+
+    if len(oracle_subset) == 0:
+        self.logger.warning("No oracle entries remain after filtering")
+        return
+
+    # Sort by id_level (descending) to prioritize higher confidence identifications
+    # and remove duplicates by consensus_uid, keeping the first (highest id_level)
+    oracle_subset = oracle_subset.sort_values(by=["id_level"], ascending=False)
+    oracle_subset = oracle_subset.drop_duplicates(subset=["consensus_uid"], keep="first")
+
+    self.logger.debug(f"After deduplication by consensus_uid: {len(oracle_subset)} unique identifications")
+
+    # Convert to polars for efficient joining
+    oracle_pl = pl.DataFrame(oracle_subset)
+
+    self.logger.debug(f"Oracle data ready for consensus mapping: {len(oracle_pl)} entries")
+
+    if oracle_pl.is_empty():
+        self.logger.warning("No oracle entries could be processed")
+        return
+
+    # Group by consensus_uid and select the best identification (highest id_level)
+    # In case of ties, take the first one
+    best_ids = (
+        oracle_pl
+        .group_by("consensus_uid")
+        .agg([
+            pl.col("id_level").max().alias("max_id_level")
+        ])
+        .join(oracle_pl, on="consensus_uid")
+        .filter(pl.col("id_level") == pl.col("max_id_level"))
+        .group_by("consensus_uid")
+        .first()  # In case of ties, take the first
+    )
+
+    self.logger.debug(f"Selected best identifications for {len(best_ids)} consensus features")
+
+    # Prepare the identification columns
+    id_columns = {
+        "id_top_name": best_ids.select("consensus_uid", "id_label"),
+        "id_top_adduct": best_ids.select("consensus_uid", "id_ion"),
+        "id_top_class": best_ids.select("consensus_uid", "id_class"),
+        "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
+        "id_source": best_ids.select(
+            "consensus_uid",
+            pl.when(pl.col("id_level") == 1)
+            .then(pl.lit("lipidoracle ms1"))
+            .otherwise(pl.lit("lipidoracle ms2"))
+            .alias("id_source")
+        )
+    }
+
+    # Initialize identification columns in consensus_df if they don't exist
+    for col_name in id_columns.keys():
+        if col_name not in self.consensus_df.columns:
+            if col_name == "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.lit(None, dtype=pl.Float64).alias(col_name)
+                )
+            else:
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.lit(None, dtype=pl.String).alias(col_name)
+                )
+
+    # Update consensus_df with oracle identifications
+    for col_name, id_data in id_columns.items():
+        oracle_column = id_data.columns[1]  # second column (after consensus_uid)
+
+        # Create update dataframe
+        update_data = id_data.rename({oracle_column: col_name})
+
+        # Join and update
+        self.consensus_df = (
+            self.consensus_df
+            .join(update_data, on="consensus_uid", how="left", suffix="_oracle")
+            .with_columns(
+                pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name)
+            )
+            .drop(f"{col_name}_oracle")
+        )
+
+    # Replace NaN values with None in identification columns
+    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score", "id_source"]
+    for col_name in id_col_names:
+        if col_name in self.consensus_df.columns:
+            # For string columns, replace empty strings and "nan" with None
+            if col_name != "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(
+                        pl.col(col_name).is_null() |
+                        (pl.col(col_name) == "") |
+                        (pl.col(col_name) == "nan") |
+                        (pl.col(col_name) == "NaN")
+                    )
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+            # For numeric columns, replace NaN with None
+            else:
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+
+    # Count how many consensus features were updated
+    updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
+    total_consensus = len(self.consensus_df)
+
+    self.logger.info(
+        f"Oracle import complete: {updated_count}/{total_consensus} "
+        f"consensus features now have identifications ({updated_count/total_consensus*100:.1f}%)"
+    )
+
+    # Update history
+    self.update_history(["import_oracle"], {
+        "folder": folder,
+        "min_id_level": min_id_level,
+        "max_id_level": max_id_level,
+        "updated_features": updated_count,
+        "total_features": total_consensus
+    })
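Illustrative usage sketch for the new importer (not part of the diff; the import path, study constructor argument, folder name, and filter value are assumptions — import_oracle only requires that <folder>/diag/summary_by_feature.csv exists and that merge() has been run):

import polars as pl
from masster import Study  # assumed top-level import; adjust to your installation

study = Study(...)                        # an existing Study with consensus features
study.import_oracle(
    folder="results/oracle_run_01",       # hypothetical oracle output folder
    min_id_level=2,                       # optional: keep only identifications at level >= 2
)

# Features annotated by the oracle carry "lipidoracle ms1" or "lipidoracle ms2" in id_source.
print(
    study.consensus_df
    .filter(pl.col("id_source").is_not_null())
    .select("consensus_uid", "id_top_name", "id_top_score", "id_source")
    .head()
)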
{masster-0.5.11 → masster-0.5.13}/src/masster/study/merge.py
@@ -1792,6 +1792,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
         "id_top_class": None,
         "id_top_adduct": None,
         "id_top_score": None,
+        "id_source": None,
     }
 
 
@@ -2194,6 +2195,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
                 "id_top_class": None,
                 "id_top_adduct": None,
                 "id_top_score": None,
+                "id_source": None,
             },
         )
 
@@ -2255,15 +2257,13 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
             {
                 "consensus_uid": row["consensus_uid"],
                 "rt": row["rt"],
-                "mz": row["mz"],  # Add missing mz field
+                "mz": row["mz"],
                 "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
                 "adduct_top": row.get("adduct_top"),
                 "inty_mean": row.get("inty_mean", 0),
             },
         )
 
-    # Use optimized adduct grouping
-    #study.logger.info(f"About to call adduct grouping for {len(consensus_data)} consensus features")
     adduct_group_list, adduct_of_list = __merge_adduct_grouping(
         study, consensus_data, rt_tol/3, mz_tol
     )
@@ -2718,8 +2718,6 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
         study.logger.debug("No consensus features for adduct identification by mass shift")
         return
 
-    study.logger.info(f"Identifying coeluting adducts by mass shifts in {len(study.consensus_df)} consensus features...")
-
     # Get adducts DataFrame if not provided
     if cached_adducts_df is None or cached_adducts_df.is_empty():
         try:
@@ -3025,8 +3023,7 @@
             pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
             pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
         ])
-
-        study.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
+        study.logger.success(f"Adduct information updated for {updated_count} consensus features.")
     else:
         study.logger.debug("No consensus features updated based on mass shift analysis")
 
@@ -3395,7 +3392,7 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
             adduct_of_list = [0] * len(consensus_data)
             return adduct_group_list, adduct_of_list
 
-        study.logger.info(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
+        study.logger.debug(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
 
     except Exception as e:
         study.logger.warning(f"Could not build local intensity matrix: {e}. Creating single-feature groups.")
@@ -3405,7 +3402,7 @@
 
     # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
     adduct_pairs_with_likelihood = _get_adduct_deltas_with_likelihood(study)
-    study.logger.info(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
+    study.logger.debug(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
 
     # Build hash map for O(1) mass shift lookup
     mass_shift_map = {}  # rounded_delta -> [(likelihood, adduct1, adduct2), ...]
{masster-0.5.11 → masster-0.5.13}/src/masster/study/plot.py
@@ -630,6 +630,7 @@ def plot_consensus_2d(
     height=450,
     mz_range=None,
     rt_range=None,
+    legend="bottom_right",
 ):
     """
     Plot consensus features in a 2D scatter plot with retention time vs m/z.
@@ -652,6 +653,9 @@
         height (int): Plot height in pixels (default: 900)
         mz_range (tuple, optional): m/z range for filtering consensus features (min_mz, max_mz)
         rt_range (tuple, optional): Retention time range for filtering consensus features (min_rt, max_rt)
+        legend (str, optional): Legend position for categorical data. Options: 'top_right', 'top_left',
+            'bottom_right', 'bottom_left', 'right', 'left', 'top', 'bottom'.
+            If None, legend is hidden. Only applies to categorical coloring (default: "bottom_right")
     """
     if self.consensus_df is None:
         self.logger.error("No consensus map found.")
@@ -783,13 +787,20 @@
             # Sorting would break the correspondence between legend labels and point colors
             unique_values = [v for v in data_pd[colorby].unique() if v is not None]
 
-            if len(unique_values) <= 20:
-                palette = Category20[min(20, max(3, len(unique_values)))]
+            # Use the custom palette from cmap if available, otherwise fall back to defaults
+            if len(palette) >= len(unique_values):
+                # Use custom colormap palette - sample evenly across the palette
+                import numpy as np
+                indices = np.linspace(0, len(palette) - 1, len(unique_values)).astype(int)
+                categorical_palette = [palette[i] for i in indices]
+            elif len(unique_values) <= 20:
+                # Fall back to Category20 if custom palette is too small
+                categorical_palette = Category20[min(20, max(3, len(unique_values)))]
             else:
                 # For many categories, use a subset of the viridis palette
-                palette = viridis(min(256, len(unique_values)))
+                categorical_palette = viridis(min(256, len(unique_values)))
 
-            color_mapper = factor_cmap(colorby, palette, unique_values)
+            color_mapper = factor_cmap(colorby, categorical_palette, unique_values)
         else:
             # Handle numeric coloring with LinearColorMapper
             color_mapper = LinearColorMapper(
@@ -809,21 +820,65 @@
     if is_categorical:
         # For categorical data, create separate renderers for each category
         # This enables proper legend interactivity where each category can be toggled independently
-        unique_values = [v for v in data_pd[colorby].unique() if v is not None]
+        all_unique_values = list(data_pd[colorby].unique())
+        unique_values = [v for v in all_unique_values if v is not None]
+        has_none_values = None in all_unique_values
 
-        if len(unique_values) <= 20:
-            palette = Category20[min(20, max(3, len(unique_values)))]
+        # Use the custom palette from cmap if available, otherwise fall back to defaults
+        if len(palette) >= len(unique_values):
+            # Use custom colormap palette - sample evenly across the palette
+            import numpy as np
+            indices = np.linspace(0, len(palette) - 1, len(unique_values)).astype(int)
+            categorical_palette = [palette[i] for i in indices]
+        elif len(unique_values) <= 20:
+            # Fall back to Category20 if custom palette is too small
+            categorical_palette = Category20[min(20, max(3, len(unique_values)))]
         else:
-            palette = viridis(min(256, len(unique_values)))
+            categorical_palette = viridis(min(256, len(unique_values)))
 
-        # Create a separate renderer for each category
+        # Handle None values with black color FIRST so they appear in the background
+        if has_none_values:
+            # Filter data for None values
+            none_data = data.filter(pl.col(colorby).is_null())
+            none_data_pd = none_data.to_pandas()
+            none_source = bp.ColumnDataSource(none_data_pd)
+
+            if scaling.lower() in ["dyn", "dynamic"]:
+                # Calculate appropriate radius for dynamic scaling
+                rt_range = data["rt"].max() - data["rt"].min()
+                mz_range = data["mz"].max() - data["mz"].min()
+                dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+
+                renderer = p.circle(
+                    x="rt",
+                    y="mz",
+                    radius=dynamic_radius,
+                    fill_color="lightgray",
+                    line_color=None,
+                    alpha=alpha,
+                    source=none_source,
+                    legend_label="None",
+                )
+            else:
+                renderer = p.scatter(
+                    x="rt",
+                    y="mz",
+                    size="markersize",
+                    fill_color="lightgray",
+                    line_color=None,
+                    alpha=alpha,
+                    source=none_source,
+                    legend_label="None",
+                )
+
+        # Create a separate renderer for each non-None category (plotted on top of None values)
         for i, category in enumerate(unique_values):
            # Filter data for this category
            category_data = data.filter(pl.col(colorby) == category)
            category_data_pd = category_data.to_pandas()
            category_source = bp.ColumnDataSource(category_data_pd)
 
-            color = palette[i % len(palette)]
+            color = categorical_palette[i % len(categorical_palette)]
 
            if scaling.lower() in ["dyn", "dynamic"]:
                # Calculate appropriate radius for dynamic scaling
@@ -942,8 +997,25 @@
         p.add_layout(color_bar, "right")
     else:
         # For categorical data, configure the legend that was automatically created
-        p.legend.location = "top_right"
-        p.legend.click_policy = "hide"
+        if legend is not None:
+            # Map legend position parameter to Bokeh legend position
+            legend_position_map = {
+                "top_right": "top_right",
+                "top_left": "top_left",
+                "bottom_right": "bottom_right",
+                "bottom_left": "bottom_left",
+                "right": "right",
+                "left": "left",
+                "top": "top",
+                "bottom": "bottom"
+            }
+
+            bokeh_legend_pos = legend_position_map.get(legend, "bottom_right")
+            p.legend.location = bokeh_legend_pos
+            p.legend.click_policy = "hide"
+        else:
+            # Hide legend when legend=None
+            p.legend.visible = False
 
     if filename is not None:
         # Convert relative paths to absolute paths using study folder as base
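Illustrative sketch of the new legend parameter (not part of the diff; the colorby column name is an assumption, chosen to trigger the categorical-coloring path that the legend option applies to):

study.plot_consensus_2d(colorby="id_top_class", legend="top_left")   # place the legend top-left
study.plot_consensus_2d(colorby="id_top_class", legend=None)         # hide the legend entirely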
{masster-0.5.11 → masster-0.5.13}/src/masster/study/study.py
@@ -109,6 +109,7 @@ from masster.study.parameters import set_parameters_property
 from masster.study.save import save, save_consensus, save_samples
 from masster.study.export import export_mgf, export_mztab, export_xlsx, export_parquet
 from masster.study.id import lib_load, identify, get_id, id_reset, lib_reset, _get_adducts
+from masster.study.importers import import_oracle
 
 from masster.logger import MassterLogger
 from masster.study.defaults.study_def import study_defaults
@@ -454,6 +455,9 @@ class Study:
     reset_id = id_reset
     lib_reset = lib_reset
     reset_lib = lib_reset
+
+    # === Oracle Import Operations ===
+    import_oracle = import_oracle
 
     # === Parameter Management ===
     update_history = update_history
{masster-0.5.11 → masster-0.5.13}/src/masster/study/study5_schema.json
@@ -114,6 +114,9 @@
             },
             "id_top_score": {
                 "dtype": "pl.Float64"
+            },
+            "id_source": {
+                "dtype": "pl.String"
             }
         }
     },
{masster-0.5.11 → masster-0.5.13}/uv.lock
@@ -1420,7 +1420,7 @@ wheels = [
 
 [[package]]
 name = "masster"
-version = "0.5.11"
+version = "0.5.13"
 source = { editable = "." }
 dependencies = [
     { name = "alpharaw" },