spec2function 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,233 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any, Dict, Iterable, Optional, Tuple, Union
6
+ import json
7
+
8
+ import pandas as pd
9
+
10
+ from .single_analysis import SingleSpectrumAnalyzer
11
+ from .set_analysis import MetaboliteSetAnalyzer
12
+
13
# A spectrum supplied either as raw JSON text or as an already-decoded dict.
# bytes/bytearray are listed because parse_json_spectrum accepts them and
# hands them to json.loads exactly like str.
SpectrumInput = Union[str, bytes, bytearray, Dict[str, Any]]
# A metabolite table supplied either as a CSV file path or as a DataFrame.
CSVInput = Union[str, Path, pd.DataFrame]
15
+
16
+
17
def parse_json_spectrum(json_input: SpectrumInput) -> Dict[str, Any]:
    """Convert a JSON spectrum description into m/z and intensity lists.

    Args:
        json_input: JSON text (``str``, ``bytes`` or ``bytearray``) or an
            already-decoded dict. The object must have a ``"peaks"`` entry
            holding ``[mz, intensity, ...]`` sequences and may have a
            ``"precursor_mz"`` entry.

    Returns:
        A dict with keys ``"mz"`` and ``"intensity"`` (parallel lists of
        floats) and ``"precursor_mz"`` (float, ``0.0`` when missing or falsy).

    Raises:
        TypeError: if ``json_input`` is neither JSON text nor a dict.
        ValueError: if the text is not valid JSON, does not decode to a JSON
            object, has no peaks, or has no usable peak; also propagated from
            ``float()`` when a peak value is not numeric.
    """
    if isinstance(json_input, (str, bytes, bytearray)):
        data = json.loads(json_input)
        # Valid JSON may decode to a list, string or number; without this
        # check the .get() below would fail with an opaque AttributeError.
        if not isinstance(data, dict):
            raise ValueError("JSON input must decode to an object with a 'peaks' field.")
    elif isinstance(json_input, dict):
        data = json_input
    else:
        raise TypeError("json_input must be a JSON string or a dict.")

    peaks = data.get("peaks") or []
    if not peaks:
        raise ValueError("No peaks found in JSON input.")

    # Entries that are not sequences of at least two items are skipped rather
    # than rejected; only a spectrum with no usable entry is an error.
    usable = [
        peak
        for peak in peaks
        if isinstance(peak, (list, tuple)) and len(peak) >= 2
    ]
    if not usable:
        raise ValueError("No valid m/z-intensity pairs found in JSON input.")

    return {
        "mz": [float(peak[0]) for peak in usable],
        "intensity": [float(peak[1]) for peak in usable],
        # A missing, None or zero precursor all collapse to 0.0.
        "precursor_mz": float(data.get("precursor_mz") or 0.0),
    }
46
+
47
+
48
def load_set_dataframe(data: CSVInput) -> pd.DataFrame:
    """Return the metabolite table as a DataFrame.

    A DataFrame argument is returned as a copy, so the caller's object is not
    the one handed back. Any other argument is treated as a filesystem path
    and read with ``pandas.read_csv``.
    """
    if not isinstance(data, pd.DataFrame):
        csv_path = Path(data)
        return pd.read_csv(csv_path)
    return data.copy()
52
+
53
+
54
def filter_set_dataframe(
    df: pd.DataFrame,
    min_abs_logfc: float = 0.1,
    max_pvalue: float = 0.05,
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """Keep rows that pass the fold-change and p-value thresholds.

    The fold-change column is the first column whose lower-cased label
    contains ``"logfc"``; the p-value column is the first whose lower-cased
    label contains ``"pval"`` or ``"p.value"``. A filter is skipped when its
    column is not found or its threshold is disabled.

    Args:
        df: Input table; it is copied and never modified.
        min_abs_logfc: Minimum absolute value in the fold-change column.
            Falsy or non-positive values disable this filter.
        max_pvalue: Maximum value in the p-value column. ``None`` or a value
            of 1 or more disables this filter.

    Returns:
        ``(filtered, info)`` where ``info`` records the input row count
        (``"total"``), the surviving row count (``"kept"``), the detected
        column labels (``None`` when not found) and the thresholds used.
    """
    filtered = df.copy()
    # Column labels need not be strings (e.g. integer labels), so go through
    # str() instead of calling .lower() on the raw label.
    labelled = [(col, str(col).lower()) for col in filtered.columns]
    logfc_col = next((col for col, low in labelled if "logfc" in low), None)
    pval_col = next(
        (col for col, low in labelled if "pval" in low or "p.value" in low),
        None,
    )

    if logfc_col is not None and min_abs_logfc and min_abs_logfc > 0:
        filtered = filtered[filtered[logfc_col].abs() >= min_abs_logfc]

    if pval_col is not None and max_pvalue is not None and max_pvalue < 1:
        filtered = filtered[filtered[pval_col] <= max_pvalue]

    info = {
        "total": len(df),
        "kept": len(filtered),
        "logfc_col": logfc_col,
        "pval_col": pval_col,
        "min_abs_logfc": min_abs_logfc,
        "max_pvalue": max_pvalue,
    }
    return filtered, info
82
+
83
+
84
+ def _filter_papers(papers: Iterable[Dict[str, Any]], selected_pmids: Optional[Iterable[str]]) -> list:
85
+ if not selected_pmids:
86
+ return list(papers or [])
87
+ selected = {str(pmid) for pmid in selected_pmids}
88
+ return [p for p in (papers or []) if str(p.get("pmid")) in selected]
89
+
90
+
91
+ def _ensure_analyzer(
92
+ analyzer: Optional[MetaboliteSetAnalyzer],
93
+ project_root: Optional[Path],
94
+ *,
95
+ device=None,
96
+ enable_gpt_pubmed: bool = True,
97
+ ) -> MetaboliteSetAnalyzer:
98
+ if analyzer is not None:
99
+ return analyzer
100
+ if project_root is None:
101
+ raise ValueError("project_root is required when analyzer is not provided.")
102
+ return MetaboliteSetAnalyzer.create_from_spec2function_root(
103
+ project_root,
104
+ device=device,
105
+ enable_gpt_pubmed=enable_gpt_pubmed,
106
+ )
107
+
108
+
109
@dataclass
class MS2BioTextWorkflow:
    """Run single-spectrum and metabolite-set analyses through one analyzer.

    The workflow only parses and filters inputs and assembles results; all
    inference and annotation calls are delegated to ``analyzer``.
    """

    # Analyzer whose single_inference, generate_annotation and
    # run_semi_supervised_analysis methods are called by the run_* methods.
    analyzer: MetaboliteSetAnalyzer

    @classmethod
    def from_spec2function_root(
        cls,
        project_root: Path,
        *,
        device=None,
        enable_gpt_pubmed: bool = True,
    ) -> "MS2BioTextWorkflow":
        """Build a workflow around an analyzer created from ``project_root``.

        ``device`` and ``enable_gpt_pubmed`` are forwarded unchanged to
        ``MetaboliteSetAnalyzer.create_from_spec2function_root``.
        """
        return cls(
            analyzer=MetaboliteSetAnalyzer.create_from_spec2function_root(
                project_root,
                device=device,
                enable_gpt_pubmed=enable_gpt_pubmed,
            )
        )

    def run_single(
        self,
        json_input: SpectrumInput,
        *,
        top_k: int = 10,
        user_focus: Optional[str] = None,
        selected_pmids: Optional[Iterable[str]] = None,
        include_annotation: bool = True,
    ) -> Dict[str, Any]:
        """Analyze one spectrum and optionally attach an annotation.

        The spectrum is parsed with ``parse_json_spectrum`` and passed to the
        analyzer's ``single_inference``. When ``include_annotation`` is true,
        the result's ``"papers"`` are narrowed to ``selected_pmids`` (all
        papers when none are selected) and the output of
        ``generate_annotation`` is stored under ``result["annotation"]``.

        Returns:
            The dict returned by ``single_inference``, modified in place when
            an annotation is added.
        """
        parsed = parse_json_spectrum(json_input)
        result = self.analyzer.single_inference(
            parsed["mz"],
            parsed["intensity"],
            precursor_mz=parsed["precursor_mz"],
            top_k=top_k,
        )
        if not include_annotation:
            return result

        chosen_papers = _filter_papers(result.get("papers", []), selected_pmids)
        result["annotation"] = self.analyzer.generate_annotation(
            retrieved_fragments=result.get("retrieved_fragments", []),
            papers=chosen_papers,
            user_focus=user_focus,
        )
        return result

    def run_set(
        self,
        data: CSVInput,
        *,
        background_info: Optional[str] = None,
        min_abs_logfc: float = 0.1,
        max_pvalue: float = 0.05,
        min_features: int = 5,
    ) -> Dict[str, Any]:
        """Filter a metabolite table and run the set-level analysis on it.

        The table is loaded with ``load_set_dataframe`` and filtered with
        ``filter_set_dataframe``. If fewer than ``min_features`` rows remain,
        the analyzer is not called and a dict with ``"error"`` and
        ``"filter"`` keys is returned instead.

        Returns:
            The dict returned by ``run_semi_supervised_analysis`` with the
            filter summary added under ``"filter"``, or the error dict above.
        """
        table = load_set_dataframe(data)
        selected, filter_info = filter_set_dataframe(table, min_abs_logfc, max_pvalue)

        n_selected = len(selected)
        if n_selected < min_features:
            return {
                "error": f"Too few features selected ({n_selected})",
                "filter": filter_info,
            }

        result = self.analyzer.run_semi_supervised_analysis(
            selected, background_info=background_info
        )
        result["filter"] = filter_info
        return result
178
+
179
+
180
def run_single(
    json_input: SpectrumInput,
    *,
    project_root: Optional[Path] = None,
    analyzer: Optional[MetaboliteSetAnalyzer] = None,
    device=None,
    enable_gpt_pubmed: bool = True,
    top_k: int = 10,
    user_focus: Optional[str] = None,
    selected_pmids: Optional[Iterable[str]] = None,
    include_annotation: bool = True,
) -> Dict[str, Any]:
    """Analyze one spectrum without constructing a workflow by hand.

    The analyzer is resolved with ``_ensure_analyzer``: ``analyzer`` is used
    if given, otherwise one is built from ``project_root`` using ``device``
    and ``enable_gpt_pubmed``. The remaining arguments are passed straight to
    ``MS2BioTextWorkflow.run_single``, whose result is returned.

    Raises:
        ValueError: if neither ``analyzer`` nor ``project_root`` is given.
    """
    resolved = _ensure_analyzer(
        analyzer,
        project_root,
        device=device,
        enable_gpt_pubmed=enable_gpt_pubmed,
    )
    return MS2BioTextWorkflow(analyzer=resolved).run_single(
        json_input,
        top_k=top_k,
        user_focus=user_focus,
        selected_pmids=selected_pmids,
        include_annotation=include_annotation,
    )
206
+
207
+
208
def run_set(
    data: CSVInput,
    *,
    project_root: Optional[Path] = None,
    analyzer: Optional[MetaboliteSetAnalyzer] = None,
    device=None,
    enable_gpt_pubmed: bool = True,
    background_info: Optional[str] = None,
    min_abs_logfc: float = 0.1,
    max_pvalue: float = 0.05,
    min_features: int = 5,
) -> Dict[str, Any]:
    """Analyze a metabolite table without constructing a workflow by hand.

    The analyzer is resolved with ``_ensure_analyzer``: ``analyzer`` is used
    if given, otherwise one is built from ``project_root`` using ``device``
    and ``enable_gpt_pubmed``. The remaining arguments are passed straight to
    ``MS2BioTextWorkflow.run_set``, whose result is returned.

    Raises:
        ValueError: if neither ``analyzer`` nor ``project_root`` is given.
    """
    resolved = _ensure_analyzer(
        analyzer,
        project_root,
        device=device,
        enable_gpt_pubmed=enable_gpt_pubmed,
    )
    return MS2BioTextWorkflow(analyzer=resolved).run_set(
        data,
        background_info=background_info,
        min_abs_logfc=min_abs_logfc,
        max_pvalue=max_pvalue,
        min_features=min_features,
    )
@@ -0,0 +1,91 @@
1
+ Metadata-Version: 2.4
2
+ Name: spec2function
3
+ Version: 0.1.1
4
+ Summary: Deep learning model for MS2 data annotation
5
+ Home-page: https://huggingface.co/cgxjdzz/ms2function-assets
6
+ Author: User
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: torch
13
+ Requires-Dist: transformers
14
+ Requires-Dist: numpy
15
+ Requires-Dist: pandas
16
+ Requires-Dist: scikit-learn
17
+ Requires-Dist: tqdm
18
+ Requires-Dist: wandb
19
+ Requires-Dist: huggingface_hub
20
+ Dynamic: author
21
+ Dynamic: classifier
22
+ Dynamic: description
23
+ Dynamic: description-content-type
24
+ Dynamic: home-page
25
+ Dynamic: license-file
26
+ Dynamic: requires-dist
27
+ Dynamic: requires-python
28
+ Dynamic: summary
29
+
30
+ # Spec2Function
31
+
32
+ Spec2Function provides MS2 spectrum annotation and metabolite set analysis powered by
33
+ MS2BioText models. Large model/data assets are hosted on Hugging Face Hub and are
34
+ downloaded automatically on first use.
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ pip install spec2function
40
+ ```
41
+
42
+ Or for local development:
43
+
44
+ ```bash
45
+ pip install -e .
46
+ ```
47
+
48
+ ## Assets (Hugging Face Hub)
49
+
50
+ Default asset repo: `cgxjdzz/spec2function-assets`
51
+
52
+ The package looks for the following files:
53
+
54
+ - `models/best_model.pth`
55
+ - `models/config.json`
56
+ - `data/hmdb_subsections_WITH_NAME.jsonl`
57
+ - `data/all_jsonl_embeddings.pt`
58
+
59
+ Environment overrides:
60
+
61
+ - `MS2FUNCTION_ASSET_DIR` (use a local assets directory instead of downloading)
62
+ - `MS2FUNCTION_ASSET_REPO` (override HF repo id)
63
+ - `HUGGINGFACE_HUB_TOKEN` or `HF_TOKEN` (for private repos)
64
+
65
+ ## Quickstart (single spectrum)
66
+
67
+ ```python
68
+ from pathlib import Path
69
+ from Spec2Function import run_single
70
+
71
+ json_input = {
72
+ "peaks": [[100.1, 200.0], [150.2, 300.0]],
73
+ "precursor_mz": 250.3,
74
+ }
75
+
76
+ result = run_single(json_input, project_root=Path(r"d:\path\to\Spec2Function"))
77
+ print(result)
78
+ ```
79
+
80
+ ## Quickstart (metabolite set)
81
+
82
+ ```python
83
+ from pathlib import Path
84
+ from Spec2Function import run_set
85
+
86
+ result = run_set(
87
+ r"d:\path\to\your.csv",
88
+ project_root=Path(r"d:\path\to\Spec2Function"),
89
+ )
90
+ print(result)
91
+ ```
@@ -0,0 +1,23 @@
1
+ Spec2Function/MS2BioTextDataset.py,sha256=HRQwAhX4DWbMKZ1ImRBnoVnGaHGmhKxBsqoU92FEZ44,133704
2
+ Spec2Function/__init__.py,sha256=ygARHwFRVtdc1w5k-wBtRykYy1myXcdvwI3zTj-0YBc,443
3
+ Spec2Function/assets.py,sha256=0z2Ltca7hIF-fPJqS8MbPRJymKm0Z64pKTVqVOslG3Q,2390
4
+ Spec2Function/biotext_processor.py,sha256=aqy04iupcT-2Xxm_kwCMs9S9TWJSdq2fn1IFPY5rBlA,18225
5
+ Spec2Function/config.py,sha256=Bk1WAM3o35gBSfPnf4e2HYkMfqML1DO9G2JYAGgSv8M,4299
6
+ Spec2Function/data_augmentation.py,sha256=v2q8JbVNbkDPjPdQRlhFoac5O79_K-y8dDNSNp1D3sY,11227
7
+ Spec2Function/gpt_inference.py,sha256=9x68FluHNFf0zyTTj3om0pKdZQYq-vy21g0ofqbsjns,33776
8
+ Spec2Function/llm_client.py,sha256=VdGwepR1YzBDM-C2Z98-c4mi6H6gff2OCz_EYsIP0yI,3747
9
+ Spec2Function/model_manager.py,sha256=8OYdegqOnQcmqQNpZqe9KqJFexm3OhdD-SxGcjUgOz4,51141
10
+ Spec2Function/pubmed.py,sha256=WXeRyueZnsEk55cXOqummQ3IBi6cKo3-uOZnE327FdY,8718
11
+ Spec2Function/read_raw_data.py,sha256=J1kP326xQ5xjk2FR3uNj7rkSCveYd-IBvfCP0NbhCcU,5504
12
+ Spec2Function/utils.py,sha256=EszXC5EJTxSiFLLyTo2e6V251Xogbr0yA4s0uQxgUvU,6286
13
+ Spec2Function/workflow.py,sha256=UU9dZ8zq70yBaHsE7BlHGotJajG9mbHCE9k5dY4hSaA,6836
14
+ Spec2Function/model/MS2BioText.py,sha256=ZPF3Ay13wE-oEHeU0V__HIy1WDJVVoOlFdA7jG5s9OU,20361
15
+ Spec2Function/model/MSBERT.py,sha256=mJ5xSAXl9G65ONcR75675o6HiMWSwmLrkEL0gTMmV54,9873
16
+ Spec2Function/model/__init__.py,sha256=yhAmaYM2zaLZC13-vwGcHzQEnoMvdAb6JTJZhd_anCw,1238
17
+ Spec2Function/model/config.py,sha256=5OYH1ah7_pTY1bmHVgDvg6u4jbdeft-OJXp_bdZtOoo,8867
18
+ Spec2Function/model/utils.py,sha256=y0rGKbZ_CXEUIo26nM4LZe76aWwto9WxqenwdqxvEL0,5768
19
+ spec2function-0.1.1.dist-info/licenses/LICENSE,sha256=McZ8PZH1HyFW5b29gkn4VsVQ7pTeUw45Ls9Rs-mMMhU,1081
20
+ spec2function-0.1.1.dist-info/METADATA,sha256=7hY1As9b5nXtoMzPbzv-c0WNocrBhUhDEeyZK9pQs0A,2127
21
+ spec2function-0.1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
22
+ spec2function-0.1.1.dist-info/top_level.txt,sha256=bMENaDm5SnPDfBwmi0N5uC7b-O8UZKzpHP3OZXQq9sU,14
23
+ spec2function-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 MS2Function contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ Spec2Function