spec2function 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,251 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ PubMed literature search for Spec2Function
4
+ """
5
+ from Bio import Entrez
6
+ from typing import List, Dict
7
+ import time
8
+
9
+
10
class PubMedSearcher:
    """PubMed literature search helper built on Biopython's ``Entrez`` module.

    Every public search method returns a list of paper dictionaries with the
    keys ``pmid``, ``title``, ``year``, ``authors``, ``journal``, ``abstract``
    and ``relevance``. Failures are reported with ``print`` and result in an
    empty list rather than an exception.
    """

    # Year stored when PubDate has neither a parsable Year nor MedlineDate.
    _FALLBACK_YEAR = 2023
    # Number of author last names listed before " et al." is appended.
    _MAX_LISTED_AUTHORS = 3

    def __init__(self, email: str = "your_email@example.com"):
        """
        Args:
            email: Your email address (required by PubMed API).
        """
        # NOTE: this sets a module-level attribute, so it is shared by every
        # user of ``Bio.Entrez`` in the process, not only this instance.
        Entrez.email = email

    def search_by_metabolites(self, metabolite_names: List[str],
                              max_results: int = 5) -> List[Dict]:
        """
        Search by multiple metabolite names (combined query).

        Only the first three names are used. Each cleaned name is split on
        whitespace, the words are de-duplicated (order preserved) and at most
        six of them are OR-joined as quoted ``[Title/Abstract]`` terms.

        Args:
            metabolite_names: Metabolite names to derive keywords from.
            max_results: Maximum number of PubMed IDs requested.

        Returns:
            List of paper dictionaries; ``relevance`` is the rank-based value
            ``90 - 3 * rank``. Empty when no keyword could be extracted,
            nothing was found, or the request failed.
        """
        clean_names = [self._clean_metabolite_name(name) for name in metabolite_names[:3]]

        # Extract keywords by splitting on whitespace
        keywords = []
        for name in clean_names:
            keywords.extend(name.split())

        # Deduplicate and limit count (avoid overly long queries)
        keywords = list(dict.fromkeys(keywords))[:6]

        if not keywords:
            print("Warning: No valid keywords extracted")
            return []

        # Build query with OR
        query = ' OR '.join(f'"{kw}"[Title/Abstract]' for kw in keywords)

        print(f" Keywords: {keywords}")
        print(f"PubMed query: {query}")

        try:
            id_list = self._esearch_ids(query, max_results)
            print(f"Found {len(id_list)} papers")

            if not id_list:
                return []

            # Short pause before the follow-up request (presumably to stay
            # within NCBI request-rate limits -- confirm).
            time.sleep(0.5)
            records = self._efetch_records(id_list)

            papers = []
            for i, article in enumerate(records.get('PubmedArticle', [])):
                try:
                    papers.append(
                        self._parse_article(article, 90 - i * 3,
                                            unknown_journal='Unknown journal')
                    )
                except Exception as e:
                    # Best effort: one malformed record must not drop the rest.
                    print(f"Warning: Error parsing article: {e}")
                    continue

            return papers

        except Exception as e:
            print(f"PubMed search error: {e}")
            return []

    def _clean_metabolite_name(self, name: str) -> str:
        """Clean metabolite names and remove noisy tokens."""
        import re

        # Remove stereochemical markers like (R)-, (S)-, (E)-, (Z)-, (+)-, (-)-
        name = re.sub(r'\([RSZE+\-]\)-', '', name)

        # Remove numeric prefixes like 1,1,2-
        name = re.sub(r'^\d+,[\d,]+-', '', name)

        # Collapse extra whitespace (this also trims both ends)
        return ' '.join(name.split())

    def search_by_metabolite(self, metabolite_name: str,
                             max_results: int = 5) -> List[Dict]:
        """Search PubMed by a single metabolite name.

        Three progressively broader queries are tried (quoted title/abstract
        phrase, unquoted title/abstract, plain term); the first one yielding
        any ID determines the result.

        Args:
            metabolite_name: Metabolite name to search for.
            max_results: Maximum number of PubMed IDs requested per query.

        Returns:
            List of paper dictionaries (see ``_fetch_paper_details``), or an
            empty list when nothing was found.
        """
        # 1. Clean the metabolite name
        clean_name = self._clean_metabolite_name(metabolite_name)

        if not clean_name:
            # Nothing searchable is left; do not send empty queries.
            print(f"Warning: No papers found for: {metabolite_name}")
            return []

        # 2. Try multiple query strategies
        queries = [
            f'"{clean_name}"[Title/Abstract]',  # exact match
            f'{clean_name}[Title/Abstract]',    # loose match
            f'{clean_name}',                    # broadest
        ]

        for i, query in enumerate(queries):
            print(f"Try #{i+1}: {query}")

            try:
                id_list = self._esearch_ids(query, max_results)
                print(f" Found {len(id_list)} results")

                if id_list:  # Return on first hit
                    return self._fetch_paper_details(id_list)

            except Exception as e:
                print(f" Query failed: {e}")
                continue

        print(f"Warning: No papers found for: {metabolite_name}")
        return []

    def _fetch_paper_details(self, id_list: List[str]) -> List[Dict]:
        """Fetch detailed article info for a list of PubMed IDs.

        Returns:
            One dictionary per successfully parsed article; ``relevance`` is
            the rank-based value ``95 - 5 * rank``. Empty when the fetch fails.
        """
        time.sleep(0.3)

        try:
            records = self._efetch_records(id_list)
        except Exception as e:
            print(f"Fetch error: {e}")
            return []

        papers = []
        for i, article in enumerate(records.get('PubmedArticle', [])):
            try:
                papers.append(self._parse_article(article, 95 - i * 5))
            except Exception as e:
                print(f"Warning: Parse error: {e}")
                continue

        return papers

    def _esearch_ids(self, query: str, max_results: int) -> List[str]:
        """Run a relevance-sorted PubMed ``esearch`` and return its ``IdList``."""
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            retmax=max_results,
            sort="relevance",
        )
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing the response fails.
            handle.close()
        return record.get("IdList", [])

    def _efetch_records(self, id_list: List[str]):
        """Fetch the XML records for ``id_list`` and return the parsed result."""
        handle = Entrez.efetch(db="pubmed", id=id_list, retmode="xml")
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    def _parse_article(self, article: Dict, relevance: int,
                       unknown_journal: str = 'Unknown') -> Dict:
        """Convert one parsed ``PubmedArticle`` record into a flat dictionary.

        Args:
            article: Record exposing ``MedlineCitation`` -> ``Article``.
            relevance: Score stored unchanged under ``relevance``.
            unknown_journal: Value used when the journal has no ``Title``.

        Returns:
            Dictionary with ``pmid``, ``title``, ``year``, ``authors``,
            ``journal``, ``abstract`` and ``relevance``.

        Raises:
            KeyError: If a mandatory part of the record is missing.
        """
        medline = article['MedlineCitation']
        article_data = medline['Article']

        title = article_data.get('ArticleTitle', 'No title')

        # Year: prefer PubDate/Year, fall back to the first four characters
        # of MedlineDate.
        pub_date = article_data['Journal']['JournalIssue']['PubDate']
        year = pub_date.get('Year', pub_date.get('MedlineDate', 'Unknown'))
        try:
            year = int(str(year)[:4])
        except ValueError:
            year = self._FALLBACK_YEAR

        # Authors: last names of the first few entries, " et al." when the
        # list is longer than that.
        author_list = article_data.get('AuthorList', [])
        authors = [
            author['LastName']
            for author in author_list[:self._MAX_LISTED_AUTHORS]
            if 'LastName' in author
        ]
        authors_str = ', '.join(authors)
        if len(author_list) > self._MAX_LISTED_AUTHORS:
            authors_str += ' et al.'

        journal = article_data['Journal'].get('Title', unknown_journal)

        # Abstract: AbstractText may be a list of sections or a single value.
        abstract = ''
        if 'Abstract' in article_data:
            abstract_texts = article_data['Abstract'].get('AbstractText', [])
            if abstract_texts:
                if isinstance(abstract_texts, list):
                    abstract = ' '.join(str(text) for text in abstract_texts)
                else:
                    abstract = str(abstract_texts)

        return {
            'pmid': str(medline['PMID']),
            'title': title,
            'year': year,
            'authors': authors_str,
            'journal': journal,
            'abstract': abstract,
            'relevance': relevance,
        }
@@ -0,0 +1,154 @@
1
+ import os
2
+ import pandas as pd
3
+ import xml.etree.ElementTree as ET
4
+ from pathlib import Path
5
+ import numpy as np
6
+ import pickle
7
+
8
def parse_ms_xml_folder(folder_path):
    """
    Parse a folder of XML files containing MS/MS data.

    Files are processed in sorted name order so the result is reproducible.
    A file that cannot be parsed is reported with ``print`` and skipped.

    Args:
        folder_path (str): Path of the folder holding the ``.xml`` files.

    Returns:
        tuple: (ms_data, meta_data)
            - ms_data: dict keyed by file name without extension; each value
              is a dict with ``mz`` (list of float), ``intensity`` (list of
              float) and ``molecule_id`` (the file's ``database-id``).
            - meta_data: DataFrame with one row per parsed file and the
              columns ``file_name``, ``HMDB.ID``, ``Polarity``,
              ``precursor_mass`` and ``splash_id``. Missing or empty elements
              are stored as NaN; present values are kept as text.
    """
    ms_data = {}
    meta_data_list = []

    # Collect all XML files; sorted because os.listdir order is arbitrary.
    xml_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.xml'))

    for file_name in xml_files:
        file_path = os.path.join(folder_path, file_name)

        try:
            root = ET.parse(file_path).getroot()

            # Entries are keyed by the file name without its extension.
            file_name_without_ext = os.path.splitext(file_name)[0]

            # Extract the MS/MS peaks; a peak lacking either child element
            # is ignored.
            mz_list = []
            intensity_list = []
            for peak in root.findall('.//ms-ms-peak'):
                mz = peak.find('mass-charge')
                intensity = peak.find('intensity')
                if mz is not None and intensity is not None:
                    mz_list.append(float(mz.text))
                    intensity_list.append(float(intensity.text))

            database_id = _xml_child_text_or_nan(root, 'database-id')
            # ionization-mode is reported as Polarity
            polarity = _xml_child_text_or_nan(root, 'ionization-mode')
            # adduct-mass is reported as precursor_mass
            precursor_mass = _xml_child_text_or_nan(root, 'adduct-mass')
            splash_id = _xml_child_text_or_nan(root, 'splash-key')

            ms_data[file_name_without_ext] = {
                'mz': mz_list,
                'intensity': intensity_list,
                'molecule_id': database_id  # database-id doubles as molecule_id
            }

            meta_data_list.append({
                'file_name': file_name_without_ext,
                'HMDB.ID': database_id,
                'Polarity': polarity,
                'precursor_mass': precursor_mass,
                'splash_id': splash_id
            })

        except Exception as e:
            # Best effort: report the broken file and continue with the rest.
            print(f"处理文件 {file_name} 时出错: {e}")

    meta_data = pd.DataFrame(meta_data_list)

    return ms_data, meta_data


def _xml_child_text_or_nan(root, tag):
    """Return the text of the direct child ``tag`` of ``root``, or NaN if absent/empty."""
    elem = root.find(tag)
    if elem is not None and elem.text:
        return elem.text
    return np.nan
97
+
98
def save_ms_data(ms_data, output_file):
    """
    Write the MS data dictionary to a pickle file.

    Args:
        ms_data (dict): MS data dictionary to serialize.
        output_file (str): Path of the pickle file to create or overwrite.
    """
    import pickle

    with open(output_file, mode='wb') as sink:
        pickle.dump(ms_data, sink)

    # Confirmation message for the user.
    print("MS数据已保存到 {}".format(output_file))
110
+
111
def save_meta_data(meta_data, output_file):
    """
    Write the metadata table to a CSV file without the index column.

    Args:
        meta_data (DataFrame): Metadata DataFrame to export.
        output_file (str): Path of the CSV file to create or overwrite.
    """
    meta_data.to_csv(output_file, index=False)

    # Confirmation message for the user.
    print("元数据已保存到 {}".format(output_file))
121
+
122
def main(folder_path="/Users/cgxjdzz/Desktop/NTU phd/ms2_database_feifan/HMDB raw/hmdb_experimental_msms_spectra",
         output_dir="/Users/cgxjdzz/Desktop/NTU phd/ms2_database_feifan/MS2BioText"):
    """
    Example usage: parse a folder of XML spectra and save the results.

    Writes ``new_ms_data.pkl`` and ``new_meta_data.csv`` into ``output_dir``
    and prints a short preview of the parsed data.

    Args:
        folder_path (str): Folder containing the XML files. The default is
            the original author's local path; pass your own folder.
        output_dir (str): Directory for the output files; created if it does
            not exist. The default is the original author's local path.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Parse the XML files
    ms_data, meta_data = parse_ms_xml_folder(folder_path)

    # Print a sample of the result (first file only)
    print("MS数据样例:")
    first_entry = next(iter(ms_data.items()), None)
    if first_entry is not None:
        file_name, data = first_entry
        print(f"文件: {file_name}")
        print(f"质荷比数量: {len(data['mz'])}")
        print(f"前5个质荷比: {data['mz'][:5]}")
        print(f"前5个强度值: {data['intensity'][:5]}")
        print(f"molecule_id: {data['molecule_id']}")
        print()

    print("元数据:")
    print(meta_data.head())

    # Save the data
    ms_data_file = os.path.join(output_dir, "new_ms_data.pkl")
    meta_data_file = os.path.join(output_dir, "new_meta_data.csv")

    save_ms_data(ms_data, ms_data_file)
    save_meta_data(meta_data, meta_data_file)


if __name__ == "__main__":
    main()
Spec2Function/utils.py ADDED
@@ -0,0 +1,216 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Utility functions for Spec2Function
4
+ """
5
+ import numpy as np
6
+ import base64
7
+ from typing import List, Dict, Tuple
8
+
9
+
10
def parse_mgf(file_content: str) -> List[Dict]:
    """
    Parse an MGF file.

    Only content between ``BEGIN IONS`` and ``END IONS`` is read. A spectrum
    without any peak is discarded. Malformed ``PEPMASS``/``CHARGE`` values
    and unparsable peak lines are ignored instead of aborting the parse.

    Args:
        file_content: MGF file content as text.
    Returns:
        List of spectra, each containing:
        {
            'title': str,
            'precursor_mz': float,
            'charge': int,
            'mz': List[float],
            'intensity': List[float]
        }
    """
    spectra = []
    current_spectrum = None

    for raw_line in file_content.strip().split('\n'):
        line = raw_line.strip()

        if line.startswith('BEGIN IONS'):
            current_spectrum = {
                'title': '',
                'precursor_mz': 0.0,
                'charge': 0,
                'mz': [],
                'intensity': []
            }
            continue

        if line.startswith('END IONS'):
            if current_spectrum and current_spectrum['mz']:
                spectra.append(current_spectrum)
            current_spectrum = None
            continue

        if current_spectrum is None or not line:
            # Outside an ion block, or an empty line inside one.
            continue

        if line.startswith('TITLE='):
            current_spectrum['title'] = line.split('=', 1)[1]

        elif line.startswith('PEPMASS='):
            # Only the first token is used; anything after it is ignored.
            try:
                current_spectrum['precursor_mz'] = float(line.split('=')[1].split()[0])
            except (IndexError, ValueError):
                # Empty or non-numeric value: keep the 0.0 default.
                pass

        elif line.startswith('CHARGE='):
            # The sign is discarded, so both "2+" and "2-" yield 2.
            charge_str = line.split('=')[1].replace('+', '').replace('-', '')
            try:
                current_spectrum['charge'] = int(charge_str)
            except ValueError:
                current_spectrum['charge'] = 0

        elif not line.startswith(('TITLE', 'PEPMASS', 'CHARGE', 'RTINSECONDS', 'SCANS')):
            # Peak line: m/z intensity
            parts = line.split()
            if len(parts) >= 2:
                try:
                    mz = float(parts[0])
                    intensity = float(parts[1])
                except ValueError:
                    # Not a peak (e.g. an unrecognised KEY=VALUE header).
                    continue
                current_spectrum['mz'].append(mz)
                current_spectrum['intensity'].append(intensity)

    return spectra
75
+
76
+
77
def parse_msp(file_content: str) -> List[Dict]:
    """
    Parse an MSP file.

    A record starts at a ``Name:`` line and ends at a blank line, at the next
    ``Name:`` line, or at the end of the input. Header keys are matched
    case-insensitively. Peak lines are read only after a ``Num Peaks:`` line
    and only up to the announced count. Records without peaks are discarded.
    Malformed ``PrecursorMZ``/``Num Peaks`` values are ignored instead of
    aborting the parse.

    Args:
        file_content: MSP file content as text.
    Returns:
        List of spectra (same format as parse_mgf)
    """
    spectra = []
    current_spectrum = None
    num_peaks = 0
    peaks_read = 0

    for raw_line in file_content.strip().split('\n'):
        line = raw_line.strip()

        if not line:
            # A blank line terminates the current record.
            if current_spectrum and current_spectrum['mz']:
                spectra.append(current_spectrum)
            current_spectrum = None
            num_peaks = 0
            peaks_read = 0
            continue

        key = line.lower()

        if key.startswith('name:'):
            # Records are not always separated by a blank line: store the
            # previous one before starting the next.
            if current_spectrum and current_spectrum['mz']:
                spectra.append(current_spectrum)
            current_spectrum = {
                'title': line.split(':', 1)[1].strip(),
                'precursor_mz': 0.0,
                'charge': 0,
                'mz': [],
                'intensity': []
            }
            # Per-record counters must not leak into the next record.
            num_peaks = 0
            peaks_read = 0

        elif current_spectrum is not None:
            if key.startswith('precursormz:'):
                try:
                    current_spectrum['precursor_mz'] = float(line.split(':', 1)[1].strip())
                except ValueError:
                    # Non-numeric value: keep the 0.0 default.
                    pass

            elif key.startswith('num peaks:'):
                try:
                    num_peaks = int(line.split(':', 1)[1].strip())
                except ValueError:
                    num_peaks = 0

            elif peaks_read < num_peaks:
                # Peak line: the first two tokens are m/z and intensity.
                parts = line.split()
                if len(parts) >= 2:
                    try:
                        mz = float(parts[0])
                        intensity = float(parts[1])
                    except ValueError:
                        continue
                    current_spectrum['mz'].append(mz)
                    current_spectrum['intensity'].append(intensity)
                    peaks_read += 1

    # Handle the last spectrum
    if current_spectrum and current_spectrum['mz']:
        spectra.append(current_spectrum)

    return spectra
137
+
138
+
139
def parse_json_spectrum(json_data: Dict) -> Dict:
    """
    Convert a JSON spectrum description into the common spectrum format.

    Args:
        json_data: Mapping with an optional ``peaks`` entry (sequence of
            items whose first element is m/z and second is intensity) and
            an optional ``precursor_mz`` entry.
    Returns:
        Spectrum dictionary with the keys ``title`` (always
        'Single Spectrum'), ``precursor_mz`` (0.0 when absent), ``charge``
        (always 0), ``mz`` and ``intensity``.
    """
    mz_values = []
    intensity_values = []
    for peak in json_data.get('peaks', []):
        mz_values.append(peak[0])
        intensity_values.append(peak[1])

    spectrum = {
        'title': 'Single Spectrum',
        'precursor_mz': json_data.get('precursor_mz', 0.0),
        'charge': 0,
    }
    spectrum['mz'] = mz_values
    spectrum['intensity'] = intensity_values
    return spectrum
149
+
150
+
151
def decode_uploaded_file(contents: str, filename: str) -> Tuple[str, str]:
    """
    Decode a file uploaded via Dash Upload.

    Args:
        contents: Base64-encoded content string of the form
            ``<header>,<base64 payload>``.
        filename: Original filename.
    Returns:
        (file_content, file_type) where ``file_content`` is the decoded text
        (UTF-8, falling back to Latin-1) and ``file_type`` is the lower-cased
        text after the last ``.`` of ``filename`` (the whole lower-cased name
        when it contains no dot).
    Raises:
        ValueError: If ``contents`` has no ``,`` separating header and payload.
    """
    # Split on the LAST comma: base64 text never contains one, whereas the
    # header in front of it may.
    _header, separator, content_string = contents.rpartition(',')
    if not separator:
        raise ValueError("contents must look like '<header>,<base64 data>'")

    decoded = base64.b64decode(content_string)

    try:
        file_content = decoded.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this cannot fail.
        file_content = decoded.decode('latin-1')

    file_type = filename.split('.')[-1].lower()

    return file_content, file_type
171
+
172
+
173
def preprocess_spectrum(mz: List[float], intensity: List[float],
                        max_peaks: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    """
    Preprocess an MS2 spectrum.

    Keeps the ``max_peaks`` most intense peaks, orders them by ascending m/z
    and divides the intensities by their maximum when that maximum is
    positive.

    Args:
        mz: m/z values
        intensity: Intensity values
        max_peaks: Maximum number of peaks to keep; a value of 0 or less
            keeps no peaks.

    Returns:
        (mz_array, intensity_array) - float32 arrays, preprocessed and
        normalized. Both are empty for empty input.
    """
    mz = np.array(mz, dtype=np.float32)
    intensity = np.array(intensity, dtype=np.float32)

    # Sort by intensity and keep top peaks
    if len(intensity) > max_peaks:
        keep = max(max_peaks, 0)
        order = np.argsort(intensity)
        # Slice from an explicit start index: "order[-keep:]" would return
        # every peak when keep == 0.
        top_indices = order[order.size - keep:]
        mz = mz[top_indices]
        intensity = intensity[top_indices]

    # Sort by m/z
    sorted_indices = np.argsort(mz)
    mz = mz[sorted_indices]
    intensity = intensity[sorted_indices]

    # Normalize intensity; max() of an empty array would raise, so check size.
    if intensity.size > 0 and intensity.max() > 0:
        intensity = intensity / intensity.max()

    return mz, intensity
205
+
206
+
207
def format_similarity_score(score: float) -> str:
    """Render a similarity score as fixed-point text with three decimals."""
    return "{:.3f}".format(score)
210
+
211
+
212
def truncate_text(text: str, max_length: int = 200) -> str:
    """Cut ``text`` to ``max_length`` characters and append "..." if it was longer.

    Text of at most ``max_length`` characters is returned unchanged; a
    shortened result is therefore three characters longer than ``max_length``.
    """
    if len(text) > max_length:
        shortened = text[:max_length]
        return shortened + "..."
    return text