chromaquant-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chromaquant/Handle/__init__.py +12 -0
- chromaquant/Handle/handleDirectories.py +89 -0
- chromaquant/Manual/HydroUI.py +418 -0
- chromaquant/Manual/QuantUPP.py +373 -0
- chromaquant/Manual/Quantification.py +1305 -0
- chromaquant/Manual/__init__.py +10 -0
- chromaquant/Manual/duplicateMatch.py +211 -0
- chromaquant/Manual/fpm_match.py +798 -0
- chromaquant/Manual/label-type.py +179 -0
- chromaquant/Match/AutoFpmMatch.py +1133 -0
- chromaquant/Match/__init__.py +12 -0
- chromaquant/Quant/AutoQuantification.py +1329 -0
- chromaquant/Quant/__init__.py +12 -0
- chromaquant/__init__.py +10 -0
- chromaquant/__main__.py +493 -0
- chromaquant/properties.json +4 -0
- chromaquant-0.3.1.dist-info/METADATA +189 -0
- chromaquant-0.3.1.dist-info/RECORD +22 -0
- chromaquant-0.3.1.dist-info/WHEEL +4 -0
- chromaquant-0.3.1.dist-info/entry_points.txt +2 -0
- chromaquant-0.3.1.dist-info/licenses/LICENSE.txt +18 -0
- chromaquant-0.3.1.dist-info/licenses/LICENSES_bundled.txt +1035 -0
chromaquant/Match/AutoFpmMatch.py
@@ -0,0 +1,1133 @@
"""

COPYRIGHT STATEMENT:

ChromaQuant – A quantification software for complex gas chromatographic data

Copyright (c) 2024, by Julia Hancock
Affiliation: Dr. Julie Elaine Rorrer
URL: https://www.rorrerlab.com/

License: BSD 3-Clause License

---

SCRIPT WHICH MATCHES FID AND MS PEAKS

Julia Hancock
Started 12/29/2023

"""
""" PACKAGES """
import sys
import pandas as pd
import os
from molmass import Formula
import math
import numpy as np
from chemformula import ChemFormula
import json
from datetime import datetime
import logging
import scipy.stats

""" FID AND MS MATCHING MAIN FUNCTION"""
def main_AutoFpmMatch(sname,sphase,splab_TF,model,directories):

    """ PARAMETERS """
    print("[AutoFpmMatch] Defining parameters...")
    #Default third order fit arguments for gas FID and MS peak matching
    #a (x^3)
    a_tof = 0.0252
    #b (x^2)
    b_tof = -0.5274
    #c (x)
    c_tof = 4.8067
    #d
    d_tof = -3.0243
    #Combine into a list
    fit_const = [a_tof,b_tof,c_tof,d_tof]

    """ PROCESSING SYSTEM ARGUMENTS """
    print("[AutoFpmMatch] Processing system arguments...")
    #Specify the allowable error for linear, third order, and speculative peak matching
    peakError = 0.06

    #Specify the allowable error for direct FID-MS RT matching
    peakErrorRT = 0.05

    #Specify the restrictions and preferences to be implemented in speculative labeling
    #The first list contains properties which must match in order for something to be labelled
    #The second dictionary contains properties which are preferred in deciding between multiple matches
    #The dictionary should have key:value pairs of the form "kc_rsc":"allowable error between speculative entry and sample value"
    #The preferences listed are applied in order such that the first preference is more valued than the last
    restrictList = [['Gas'],{'Reaction Temperature (C)':5}]

""" COMPOUND TYPE ASSIGNMENT VARIABLES """
|
|
67
|
+
print("[AutoFpmMatch] Defining compound type variables...")
|
|
68
|
+
#This dictionary contain lists of substrings to be checked against compound name strings to
|
|
69
|
+
#assign a compound type
|
|
70
|
+
|
|
71
|
+
#Six compound types exist: linear alkanes (L), branched alkanes (B), aromatics (A), cycloalkanes (C),
|
|
72
|
+
#alkenes/alkynes (E), and other (O)
|
|
73
|
+
|
|
74
|
+
#Each compound type abbreviation will have an entry in the dictionary corresponding to a list of
|
|
75
|
+
#substrings to be checked against a compound name string
|
|
76
|
+
|
|
77
|
+
contains = {'L':['methane','ethane','propane','butane','pentane','hexane','heptane','octane','nonane',\
|
|
78
|
+
'decane','undecane','hendecane','dodecane','tridecane','tetradecane','pentadecane','hexadecane','heptadecane','octadecane','nonadecane',\
|
|
79
|
+
'icosane','eicosane','heneicosane','henicosane','docosane','tricosane','tetracosane','pentacosane','hexacosane','cerane','heptacosane','octacosane','nonacosane',\
|
|
80
|
+
'triacontane','hentriacontane','untriacontane','dotriacontane','dicetyl','tritriacontane','tetratriacontane','pentatriacontane','hexatriacontane','heptatriacontane','octatriacontane','nonatriacontane',\
|
|
81
|
+
'tetracontane','hentetracontane','dotetracontane','tritetracontane','tetratetracontane','pentatetracontane','hexatetracontane','heptatetracontane','octatetracontane','nonatetracontane','pentacontane'],\
|
|
82
|
+
|
|
83
|
+
'B':['iso','neo','methyl','ethyl','propyl','butyl','pentyl','hexyl','heptyl','octyl','nonyl',\
|
|
84
|
+
'decyl','undecyl','dodecyl','tridecyl','tetradecyl','pentadecyl','hexadecyl','heptadecyl','octadecyl','nonadecyl',\
|
|
85
|
+
'icosyl','eicosyl','heneicosyl','henicosyl','docosyl','tricosyl','tetracosyl','pentacosyl','hexacosyl','heptacosyl','octacosyl','nonacosyl',\
|
|
86
|
+
'triacontyl','hentriacontyl','untriacontyl','dotriacontyl','tritriacontyl','tetratriacontyl','pentatriacontyl','hexatriacontyl','heptatriacontyl','octatriacontyl','nonatriacontyl',\
|
|
87
|
+
'tetracontyl','hentetracontyl','dotetracontyl','tritetracontyl','tetratetracontyl','pentatetracontyl','hexatetracontyl','heptatetracontyl','octatetracontyl','nonatetracontyl','pentacontyl'],
|
|
88
|
+
|
|
89
|
+
'A':['benzyl','benzo','phenyl','benzene','toluene','xylene','mesitylene','durene','naphthalene','fluorene','anthracene','phenanthrene','phenalene',\
|
|
90
|
+
'tetracene','chrysene','triphenylene','pyrene','pentacene','perylene','corannulene','coronene','ovalene','indan','indene','tetralin'],\
|
|
91
|
+
|
|
92
|
+
'C':['cyclo','menthane'],\
|
|
93
|
+
|
|
94
|
+
'E':['ene','yne'],\
|
|
95
|
+
|
|
96
|
+
'O':[]}
|
|
97
|
+
|
|
98
|
+
#Tuple of contains keys in order of priority
|
|
99
|
+
keyLoop = ('A','C','E','B','L')
|
|
100
|
+
|
|
101
|
+
#Tuple of elements to be excluded and automatically labelled as 'O'
|
|
102
|
+
elementExclude = ('He','Li','Be','B','N','O','F','Ne','Na','Mg','Al','Si','P',\
|
|
103
|
+
'S','Cl','Ar','K','Ca','Sc','Ti','V','Cr','Mn','Fe','Co',\
|
|
104
|
+
'Ni','Cu','Zn')
|
|
105
|
+
|
|
106
|
+
""" DIRECTORIES """
|
|
107
|
+
print("[AutoFpmMatch] Finding directories...")
|
|
108
|
+
|
|
109
|
+
#Unpack directories from passed variable
|
|
110
|
+
#Primary files directory
|
|
111
|
+
files = directories['files']
|
|
112
|
+
#Resources directory
|
|
113
|
+
RE_Dir = directories['resources']
|
|
114
|
+
#Theme directory
|
|
115
|
+
theme_Dir = directories['theme']
|
|
116
|
+
#Response factor directory
|
|
117
|
+
RF_Dir = directories['rf']
|
|
118
|
+
#Data directory
|
|
119
|
+
DF_Dir = directories['data']
|
|
120
|
+
#Images directory
|
|
121
|
+
img_Dir = directories['images']
|
|
122
|
+
#Data file log directory
|
|
123
|
+
DFlog_Dir = os.path.join(DF_Dir,sname,'log')
|
|
124
|
+
#Data file breakdowns directory
|
|
125
|
+
DFbreak_Dir = os.path.join(DF_Dir,sname,'breakdowns')
|
|
126
|
+
#Raw data file directory
|
|
127
|
+
Raw_Dir = os.path.join(DF_Dir,sname,'raw data')
|
|
128
|
+
|
|
129
|
+
#Dictionary of substrings to add to sample name to create file names
|
|
130
|
+
sub_Dict = {'Gas TCD+FID':['_GS2_TCD_CSO.csv'],
|
|
131
|
+
'Gas Labelled MS Peaks':['_GS1_UA_Comp_UPP.csv'],
|
|
132
|
+
'Gas FID+MS':['_GS2_FIDpMS.csv'],
|
|
133
|
+
'Liquid FID':['_LQ1_FID_CSO.csv'],
|
|
134
|
+
'Liquid Labelled MS Peaks':['_LQ1_UA_Comp_UPP.csv'],
|
|
135
|
+
'Liquid FID+MS':['_LQ1_FIDpMS.csv'],
|
|
136
|
+
'Info':['_INFO.json']}
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
""" LOGGING """
|
|
140
|
+
print("[AutoFpmMatch] Initializing logging [WIP]...")
|
|
141
|
+
#Get current datetime
|
|
142
|
+
now = datetime.now()
|
|
143
|
+
#Get current datetime string
|
|
144
|
+
nows = now.strftime('%Y%m%d')
|
|
145
|
+
|
|
146
|
+
#If log directory does not exist within sample folder, create it
|
|
147
|
+
if not os.path.exists(DFlog_Dir):
|
|
148
|
+
os.makedirs(DFlog_Dir)
|
|
149
|
+
|
|
150
|
+
#Instantiate a logger
|
|
151
|
+
logger = logging.getLogger(__name__)
|
|
152
|
+
#Initialize logging file using current datetime
|
|
153
|
+
fh = logging.FileHandler(os.path.join(DFlog_Dir,'quantlog_'+nows+'.log'))
|
|
154
|
+
logger.addHandler(fh)
|
|
155
|
+
#Set logging level
|
|
156
|
+
logger.setLevel(logging.INFO)
|
|
157
|
+
#Create a formatter and assign to logger
|
|
158
|
+
formatter = logging.Formatter('[%(filename)s] %(asctime)s - [%(levelname)s]: %(message)s')
|
|
159
|
+
fh.setFormatter(formatter)
|
|
160
|
+
|
|
161
|
+
""" FUNCTIONS """
|
|
162
|
+
print("[AutoFpmMatch] Defining functions...")
|
|
163
|
+
#Function for selecting FID peak, MS peak, and FIDpMS pathnames according to sample name and phase
|
|
164
|
+
def fileNamer(sname,sphase,sub_Dict,pathData):
|
|
165
|
+
"""
|
|
166
|
+
Parameters
|
|
167
|
+
----------
|
|
168
|
+
sname : STR
|
|
169
|
+
The name of the sample.
|
|
170
|
+
sphase : STR
|
|
171
|
+
A string that describes whether sample is gas ("G") or liquid ("L").
|
|
172
|
+
sub_Dict : Dict
|
|
173
|
+
A dictionary of substrings to add to sample name to create file names
|
|
174
|
+
pathData : STR
|
|
175
|
+
A string containing the pathname to the datafiles directory
|
|
176
|
+
|
|
177
|
+
Returns
|
|
178
|
+
-------
|
|
179
|
+
paths : List
|
|
180
|
+
A list of pathnames to return.
|
|
181
|
+
|
|
182
|
+
"""
|
|
183
|
+
#If sample phase is liquid, set pathnames accordingly
|
|
184
|
+
if sphase == "L":
|
|
185
|
+
pathFID = os.path.join(pathData,sname+sub_Dict['Liquid FID'][0])
|
|
186
|
+
pathMS = os.path.join(pathData,sname+sub_Dict['Liquid Labelled MS Peaks'][0])
|
|
187
|
+
pathFIDpMS = os.path.join(pathData,sname+sub_Dict['Liquid FID+MS'][0])
|
|
188
|
+
|
|
189
|
+
#Else if sample phase is gas, set pathnames accordingly
|
|
190
|
+
elif sphase == "G":
|
|
191
|
+
pathFID = os.path.join(pathData,sname+sub_Dict['Gas TCD+FID'][0])
|
|
192
|
+
pathMS = os.path.join(pathData,sname+sub_Dict['Gas Labelled MS Peaks'][0])
|
|
193
|
+
pathFIDpMS = os.path.join(pathData,sname+sub_Dict['Gas FID+MS'][0])
|
|
194
|
+
|
|
195
|
+
#Otherwise, set all paths to None
|
|
196
|
+
else:
|
|
197
|
+
pathFID = None
|
|
198
|
+
pathMS = None
|
|
199
|
+
pathFIDpMS = None
|
|
200
|
+
|
|
201
|
+
paths = [pathFID,pathMS,pathFIDpMS]
|
|
202
|
+
|
|
203
|
+
return paths
|
|
204
|
+
|
|
205
|
+
    #Function for checking if FIDpMS file exists – creates it if necessary and imports/returns the data
    def checkFile(fpmDir,fDir):
        """
        Parameters
        ----------
        fpmDir : STR
            A string containing the pathname of the FIDpMS file in question.
        fDir : STR
            A string containing the pathname of the FID file of the same sample/phase as the FIDpMS file.

        Returns
        -------
        fpmDF : DataFrame
            A DataFrame containing the contents of the FIDpMS file.
        exists : BOOL
            A boolean describing whether or not the relevant file exists and has manually added peaks.

        """
        #If FIDpMS file does not exist in data file directory, create it and return False
        if not os.path.exists(fpmDir):
            #Log that file wasn't found and a new one is being created
            logger.info('FIDpMS file not found for sample and phase, creating new...')
            #Read FID dataframe
            fDF = pd.read_csv(fDir)
            #Filter FID dataframe to only include FID rows, as gas samples may have TCD rows, and set to fpmDF
            fpmDF = fDF.loc[fDF['Signal Name'] == 'FID1A'].copy()
            #Rename the RT and Area columns to FID RT and FID Area, and rename the Height column to MS RT
            fpmDF = fpmDF.rename(columns={'RT':'FID RT','Area':'FID Area','Height':'MS RT'})
            #Clear the contents of the MS RT column
            fpmDF['MS RT'] = np.nan
            #Create list of new columns to create
            lnc = ['Formula','Match Factor','Compound Source','Compound Type Abbreviation']

            #Loop through lnc, adding nan columns for each entry
            for i in lnc:
                fpmDF[i] = np.nan

            #Remove the Injection Data File Name and Signal Name columns
            fpmDF = fpmDF.drop(['Injection Data File Name','Signal Name'],axis=1).copy()
            #Save fpmDF to provided pathname
            fpmDF.to_csv(fpmDir, index=False)

            return fpmDF, False

        #Otherwise..
        else:
            fpmDF = pd.read_csv(fpmDir)
            #If the FIDpMS file exists and there exist any peaks..
            if fpmDF['Compound Name'].any():
                #Define a new series containing the Compound Source of all rows with labelled peaks
                fpmDF_labelled = fpmDF.loc[~fpmDF['Compound Name'].isna()]['Compound Source']
                #If those peaks are manually assigned or have a blank source, return the dataframe and True
                if 'Manual' in fpmDF_labelled.values.tolist() or pd.isna(fpmDF_labelled.values).any():
                    #Create a log entry
                    logger.info('FIDpMS file exists and contains manual and/or blank sourced entries')
                    return fpmDF, True
                #Otherwise, if there exist no manually assigned peaks or labelled peaks with a blank source, return False
                else:
                    #Create a log entry
                    logger.info('FIDpMS file exists but does not contain manual or blank sourced entries')
                    return fpmDF, False

            #If the FIDpMS file exists but has no peaks, return False
            else:
                #Create a log entry
                logger.info('FIDpMS file exists but contains no labelled peaks')
                return fpmDF, False

    #Function describing a third order fit for gas analysis
    def defaultGas(FIDRT,fpmDF,fit_const=[a_tof,b_tof,c_tof,d_tof]):
        """
        A function used to describe the default fit for gas analysis peak matching

        Parameters
        ----------
        FIDRT : Float
            A float describing the FID retention time requiring a corresponding MS retention time.
        fpmDF : DataFrame
            A dataframe containing FID and MS peak info.
        fit_const : List, optional
            A list of floats describing a third order fit. The default is [a_tof,b_tof,c_tof,d_tof].

        Returns
        -------
        MSRT : Float
            A float describing the calculated MS RT using the third order fit and the FID RT
        """

        MSRT = fit_const[0]*FIDRT**3+fit_const[1]*FIDRT**2+fit_const[2]*FIDRT+fit_const[3]

        return MSRT

    #TODO: Function for creating a third order fit using manually matched peaks

    #Function for creating a linear fit using manually matched peaks
    def RTlinfit(fpmDF):

        #Get a new dataframe containing only rows with labelled peaks
        fpmDF_lab = fpmDF.loc[~fpmDF['Compound Name'].isna()]

        #Get a new dataframe containing only rows with manual/blank peaks
        fpmDF_mb = fpmDF_lab.loc[(fpmDF_lab['Compound Source']=='Manual') | (fpmDF_lab['Compound Source'].isna())]

        #If dataframe contains any rows with 'Manual' as a source..
        if 'Manual' in fpmDF_lab['Compound Source'].tolist():
            #If dataframe also contains rows with 'nan' as a source..
            if pd.isna(fpmDF_lab['Compound Source'].values).any():

                #Manual and blank counts are appropriately assigned
                manual_count = fpmDF_lab['Compound Source'].value_counts()['Manual']
                blank_count = fpmDF_lab['Compound Source'].isna().sum()
                #All count will include both manual and blank entries
                all_count = manual_count + blank_count

            #Otherwise, all count will only include manual entries
            else:
                manual_count = fpmDF_lab['Compound Source'].value_counts()['Manual']
                blank_count = 0
                all_count = manual_count

        #Else if dataframe contains any rows with 'nan' as a source..
        elif pd.isna(fpmDF_lab['Compound Source'].values).any():
            #All count will include only blank entries
            manual_count = 0
            blank_count = fpmDF_lab['Compound Source'].isna().sum()
            all_count = fpmDF_lab['Compound Source'].isna().sum()

        #Otherwise, log that the provided dataframe has no manual or blank entries and return None or 0 for all returns
        else:
            logger.error('Linear fit function provided a dataframe without manual or blank entries')
            return None, [0,0,0], [0,0,0]

        #If the blank count is larger than zero, log a warning stating that one or more entries contain a blank source
        if blank_count > 0:
            logger.warning("One or more labelled peaks in the FIDpMS file have no entry for Compound Source")
        #Otherwise, pass
        else:
            pass

        #Predefine variables for use in linear fitting
        peakDrift = 0 #Peak drift, the linear slope describing drift between FID and MS RT's
        peakOffset = 0 #Peak offset, the initial offset between FID and MS RT's
        peakDiff = 0 #Peak difference, the difference between a given FID and MS RT
        r2 = 0 #Coefficient of determination, the r^2 value of a linear fit

        #If all_count is equal to 1..
        if all_count == 1:

            #Set the peak offset to the peak difference for the single labelled peak
            peakDiff = fpmDF_mb['FID RT'].iloc[0] - fpmDF_mb['MS RT'].iloc[0]
            peakOffset = peakDiff

        else:

            #Loop through every labelled peak, calculating the peak difference
            for i, row in fpmDF_mb.iterrows():
                peakDiff = row['FID RT'] - row['MS RT']
                #Add this peak difference to a new column in the dataframe
                fpmDF_mb.at[i,'peakDiff'] = peakDiff
            #Get a linear fit for peak drift and peak offset using peak differences as y-values and FID RT's as x-values
            peakDrift, peakOffset, r_value, p_value, std_err = scipy.stats.linregress(fpmDF_mb['FID RT'],fpmDF_mb['peakDiff'])
            #Get a coefficient of determination
            r2 = r_value**2
        #Get a list of all peak counts
        counts = [all_count,manual_count,blank_count]

        return fpmDF_mb, [peakDrift,peakOffset,r2], counts

    #Function that estimates unknown MS RT's and matches FID and MS peaks using a provided linear fit
    def matchPeaksLinear(fpmDF,mDF,linfits,peakError=0.06):
        """
        Parameters
        ----------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        mDF : DataFrame
            Dataframe containing MS info about identified compounds (UA_UPP)
        linfits : List
            List containing info about a linear fit for estimated MS RT's in the form [m,b,r2]
        peakError : Float, optional
            Allowable error between estimated MS RT's and actual MS RT's. The default is 0.06.

        Returns
        -------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        """

        def matchOne(fpmDF,fpmiter,linfits,peakError):
            """
            Parameters
            ----------
            fpmDF : DataFrame
                Dataframe containing FID and MS peak info
            fpmiter : List
                List containing current index and row in fpmDF of interest in form [i,row]
            linfits : List
                List containing info about a linear fit for estimated MS RT's in the form [m,b,r2]
            peakError : float
                Allowable error between estimated MS RT's and actual MS RT's

            Returns
            -------
            fpmDF : DataFrame
                Dataframe containing FID and MS peak info
            """

            #Unpack fpmDF iterating info
            fpmi = int(fpmiter[0])
            fpmrow = fpmiter[1]

            #Estimate an MS RT for the row's FID RT using the linear fit
            est_MSRT = fpmrow['FID RT'] - (peakDrift*fpmrow['FID RT'] + peakOffset)
            #Compare the estimated MS RT to all real MS RT's, seeing if there is a match within error
            mDF_match = mDF.loc[(mDF['Component RT'] >= est_MSRT-peakError) & (mDF['Component RT'] <= est_MSRT+peakError)].copy()
            #If there is more than one match, select the entry with the smallest error
            if len(mDF_match) > 1:
                #Add an RT error to all mDF_match entries
                for i, row in mDF_match.iterrows():
                    mDF_match.at[i,'RT Error'] = abs(row['Component RT']-est_MSRT)

                #Set mDF_match to the row with minimum RT Error
                mDF_match = mDF_match.nsmallest(1,'RT Error')

            #Reset the mDF_match index
            mDF_match = mDF_match.reset_index().copy()

            #If the length of mDF_match is greater than zero..
            if len(mDF_match) > 0:

                #Add the MS info to the FIDpMS dataframe
                fpmDF.at[fpmi,'MS RT'] = mDF_match.at[0,'Component RT']
                fpmDF.at[fpmi,'Compound Name'] = mDF_match.at[0,'Compound Name']
                fpmDF.at[fpmi,'Formula'] = mDF_match.at[0,'Formula']
                fpmDF.at[fpmi,'Match Factor'] = mDF_match.at[0,'Match Factor']
                fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using a linear fit of manual peak assignments'

            #Otherwise, pass
            else:
                pass

            return fpmDF

        #Get peak drift and peak offset parameters from linfits, as well as coefficient of determination
        peakDrift = linfits[0]
        peakOffset = linfits[1]
        r2 = linfits[2]

        #Loop through every row in the dataframe
        for i, row in fpmDF.iterrows():
            #If the row's compound name is not blank
            if not pd.isna(row['Compound Name']):
                #If the row's compound source is either manual or blank, skip it
                if row['Compound Source'] == 'Manual' or pd.isna(row['Compound Source']):
                    pass
                #Otherwise..
                else:
                    #Match one FID peak
                    fpmDF = matchOne(fpmDF, [i,row], linfits, peakError)
            #Otherwise, if the row's compound name is blank..
            else:
                #Match one FID peak
                fpmDF = matchOne(fpmDF, [i,row], linfits, peakError)

        return fpmDF

    #Function that estimates unknown MS RT's and matches FID and MS peaks using a provided third order fit
    def matchPeaksThird(fpmDF,mDF,fit_const,peakError=0.06):
        """
        Parameters
        ----------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        mDF : DataFrame
            Dataframe containing MS info about identified compounds (UA_UPP)
        fit_const : List
            A list of floats describing a third order fit.
        peakError : Float, optional
            Allowable error between estimated MS RT's and actual MS RT's. The default is 0.06.

        Returns
        -------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        """

        def matchOne(fpmDF,fpmiter,fit_const,peakError):
            """
            Parameters
            ----------
            fpmDF : DataFrame
                Dataframe containing FID and MS peak info
            fpmiter : List
                List containing current index and row in fpmDF of interest in form [i,row]
            fit_const : List
                A list of floats describing a third order fit.
            peakError : float
                Allowable error between estimated MS RT's and actual MS RT's

            Returns
            -------
            fpmDF : DataFrame
                Dataframe containing FID and MS peak info
            """

            #Unpack fpmDF iterating info
            fpmi = int(fpmiter[0])
            fpmrow = fpmiter[1]

            #Define x as fpmrow['FID RT'] for convenience
            x = fpmrow['FID RT']
            #Estimate an MS RT for the row's FID RT using the third order fit
            est_MSRT = fit_const[0]*x**3 + fit_const[1]*x**2 + fit_const[2]*x + fit_const[3]
            #Compare the estimated MS RT to all real MS RT's, seeing if there is a match within error
            mDF_match = mDF.loc[(mDF['Component RT'] >= est_MSRT-peakError) & (mDF['Component RT'] <= est_MSRT+peakError)].copy()
            #If there is more than one match, select the entry with the smallest error
            if len(mDF_match) > 1:
                #Add an RT error to all mDF_match entries
                for i, row in mDF_match.iterrows():
                    mDF_match.at[i,'RT Error'] = abs(mDF_match.at[i,'Component RT']-est_MSRT)

                #Set mDF_match to the row with minimum RT Error
                mDF_match = mDF_match.nsmallest(1,'RT Error')

            #Reset the mDF_match index
            mDF_match = mDF_match.reset_index().copy()

            #If the length of mDF_match is greater than zero..
            if len(mDF_match) > 0:

                #Add the MS info to the FIDpMS dataframe
                fpmDF.at[fpmi,'MS RT'] = mDF_match.at[0,'Component RT']
                fpmDF.at[fpmi,'Compound Name'] = mDF_match.at[0,'Compound Name']
                fpmDF.at[fpmi,'Formula'] = mDF_match.at[0,'Formula']
                fpmDF.at[fpmi,'Match Factor'] = mDF_match.at[0,'Match Factor']
                fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using a predetermined third-order fit'

            #Otherwise, pass
            else:
                pass

            return fpmDF

        #Loop through every row in the dataframe
        for i, row in fpmDF.iterrows():
            #If the row's compound name is not blank
            if not pd.isna(row['Compound Name']):
                #If the row's compound source is either manual or a gasPeaks known peak match or blank, skip it
                if row['Compound Source'] == 'Manual' or row['Compound Source'] == 'Automatically assigned using gas pairs provided in resources' or pd.isna(row['Compound Source']):
                    pass
                #Otherwise..
                else:
                    #Match one FID peak
                    fpmDF = matchOne(fpmDF, [i,row], fit_const, peakError)
            #Otherwise, if the row's compound name is blank..
            else:
                #Match one FID peak
                fpmDF = matchOne(fpmDF, [i,row], fit_const, peakError)

        return fpmDF

    #Function that performs a subset of speculative labeling, using known peaks hard-coded in a file gasPairs_FIDpMS.csv
    def matchKnownPeaks(fpmDF,mDF,gp_rsc):
        def matchOne(fpmDF,fpmiter,gp_rsc):
            """
            Parameters
            ----------
            fpmDF : DataFrame
                Dataframe containing FID and MS peak info
            fpmiter : List
                List containing current index and row in fpmDF of interest in form [i,row]
            gp_rsc : DataFrame
                Dataframe containing opened gasPairs resource.

            Returns
            -------
            fpmDF : DataFrame
                Dataframe containing FID and MS peak info
            """

            #Unpack fpmDF iterating info
            fpmi = int(fpmiter[0])
            fpmrow = fpmiter[1]

            #Search the gasPairs resource to see if any known peaks/RT's match the FID peak list
            for i, row in gp_rsc.iterrows():
                #Set gp_match to an empty Series
                gp_match = pd.Series()
                #Define error as two times the standard deviation for the FID RT in the gasPeaks resource
                gp_error = row['Stdev FID RT']*2
                #Extract the FID RT from the resource
                gp_FIDRT = row['Average FID RT']
                #If the current fpmrow FID RT is within the error bounds of an entry in the resource, match it
                #NOTE: prefers the first match, even if the next match is closer. Most resource RT's are more than
                #2*error away from each other
                if (fpmrow['FID RT'] >= gp_FIDRT - gp_error) and (fpmrow['FID RT'] <= gp_FIDRT + gp_error):
                    gp_match = row
                    break
                #Otherwise, pass
                else:
                    pass

            #If gp_match is empty, pass
            if gp_match.empty:
                pass
            #Otherwise, add the match info
            else:
                #Add the resource match info to the FIDpMS dataframe
                fpmDF.at[fpmi,'Compound Name'] = gp_match['Species']
                fpmDF.at[fpmi,'Formula'] = gp_match['Formula']
                fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using gas pairs provided in resources'

            return fpmDF

        #Loop through every row in the dataframe
        for i, row in fpmDF.iterrows():
            #If the row's compound name is not blank
            if not pd.isna(row['Compound Name']):
                #If the row's compound source is either manual or blank, skip it
                if row['Compound Source'] == 'Manual' or pd.isna(row['Compound Source']):
                    pass
                #Otherwise..
                else:
                    #Match one FID peak
                    fpmDF = matchOne(fpmDF, [i,row], gp_rsc)
            #Otherwise, if the row's compound name is blank..
            else:
                #Match one FID peak
                fpmDF = matchOne(fpmDF, [i,row], gp_rsc)

        return fpmDF

    #Function that performs speculative labeling to label FID peaks which do not have a match
    def specLab(fpmDF,kc_rsc,sinfo,counts,peakError,restrictList):

        #Unpack restrictList
        trueRestrict, prefer = restrictList
        #Log that speculative labeling is being performed
        logger.info('Performing speculative labeling on {0} with {1} peaks, {2} of which are labelled: {3} sourced manually and {4} with an unknown source'.format(sinfo['Sample Name'],len(fpmDF),counts[0],counts[1],counts[2]))

        #Loop through every entry in fpmDF
        for i, row in fpmDF.iterrows():
            #Define a Boolean for use in determining whether to run the next if statement or not
            Bool_kc_check = True

            #If the compound name is blank or either form of "No Match"..
            if pd.isna(row['Compound Name']) or row['Compound Name'] == 'No Match' or row['Compound Name'] == 'No match':

                #Get a copy of kc_rsc
                kc_check = kc_rsc.copy()
                #Find rows where the FID peak RT is within provided error
                kc_check = kc_check.loc[(kc_check['FID RT']>=row['FID RT']-peakError) & (kc_check['FID RT']<=row['FID RT']+peakError)]
                #Filter out rows that label the peak as No Match or No match
                kc_check = kc_check.loc[(kc_check['Compound Name']!='No Match') & (kc_check['Compound Name']!='No match')]

                #For every entry in trueRestrict, filter out rows where the entry property does not match
                for entry in trueRestrict:
                    kc_check = kc_check.loc[kc_check[entry]==sinfo[entry]]

                #If kc_check has more than one row...
                if len(kc_check)>1:

                    #Make a copy of kc_check
                    kc_check_2 = kc_check.copy()

                    #Loop through every entry in prefer
                    for key in prefer:

                        #Select rows in which the given entry property in prefer has a value within the provided range
                        kc_check_2 = kc_check.loc[(kc_check[key]>=sinfo[key]-prefer[key])&(kc_check[key]<=sinfo[key]+prefer[key])]
                        #If this results in a DataFrame with more than one entry, filter the original kc_check
                        if len(kc_check_2)>1:
                            kc_check = kc_check_2.copy()
                            pass
                        #If this results in a DataFrame with one entry, break the loop
                        elif len(kc_check_2)==1:
                            kc_check = kc_check_2.iloc[0].copy()
                            #Define a Boolean for use in determining whether to run the next if statement or not
                            Bool_kc_check = False
                            break
                        #If this results in a DataFrame with fewer than one entry (the only other possible option)..
                        else:
                            #Pass and do not apply this preference
                            pass

                    #If kc_check still has more than one row..
                    if len(kc_check)>1 and Bool_kc_check:
                        #Get the row with the highest match factor
                        kc_check = kc_check.loc[kc_check['Match Factor'].idxmax()]

                    #Otherwise, pass
                    else:
                        pass

                #Else if kc_check has only one row..
                elif len(kc_check)==1:
                    #Convert the DataFrame into a Series
                    kc_check = kc_check.iloc[0]
                #Otherwise, pass
                else:
                    pass

                #If kc_check is not empty..
                if len(kc_check) > 0:
                    #Add the new kc_check entry to fpmDF for the given row
                    fpmDF.at[i,'Compound Name'] = kc_check['Compound Name']
                    fpmDF.at[i,'Formula'] = kc_check['Formula']
                    fpmDF.at[i,'Compound Source'] = 'Speculated based on {0}, which used {1} at {2}C and {3}psi'.format(kc_check['Sample Name'],kc_check['Catalyst'],kc_check['Reaction Temperature (C)'],kc_check['Reaction pressure (psi)'])
                #Otherwise, pass
                else:
                    pass

            #Otherwise, pass
            else:
                pass

        return fpmDF

    #Function that matches FID and MS peaks by their retention time
    def matchRT(fpmDF,mDF,peakError=0.06):
        """
        Parameters
        ----------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        mDF : DataFrame
            Dataframe containing MS info about identified compounds (UA_UPP)
        peakError : Float, optional
            Allowable error between estimated MS RT's and actual MS RT's. The default is 0.06.

        Returns
        -------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        """

        def matchOne(fpmDF,fpmiter,peakError):
            """
            Parameters
            ----------
            fpmDF : DataFrame
                Dataframe containing FID and MS peak info
            fpmiter : List
                List containing current index and row in fpmDF of interest in form [i,row]
            peakError : float
                Allowable error between estimated MS RT's and actual MS RT's

            Returns
            -------
            fpmDF : DataFrame
                Dataframe containing FID and MS peak info
            """

            #Unpack fpmDF iterating info
            fpmi = int(fpmiter[0])
            fpmrow = fpmiter[1]

            #Compare the FID RT to the MS RT, collecting all matches within the specified peak error
            mDF_match = mDF.loc[(mDF['Component RT'] >= fpmrow['FID RT']-peakError) & (mDF['Component RT'] <= fpmrow['FID RT']+peakError)].copy()
            #If there is more than one MS RT match, select the entry with the smallest error from the FID RT
            if len(mDF_match) > 1:
                #Add an RT error to all mDF_match entries
                for i, row in mDF_match.iterrows():
                    mDF_match.at[i,'RT Error'] = abs(fpmrow['FID RT']-row['Component RT'])

                #Set mDF_match to the row with minimum RT Error
                mDF_match = mDF_match.nsmallest(1,'RT Error')

            #Reset the mDF_match index
            mDF_match = mDF_match.reset_index().copy()

            #If the length of mDF_match is greater than zero..
            if len(mDF_match) > 0:

                #Add the MS info to the FIDpMS dataframe
                fpmDF.at[fpmi,'MS RT'] = mDF_match.at[0,'Component RT']
                fpmDF.at[fpmi,'Compound Name'] = mDF_match.at[0,'Compound Name']
                fpmDF.at[fpmi,'Formula'] = mDF_match.at[0,'Formula']
                fpmDF.at[fpmi,'Match Factor'] = mDF_match.at[0,'Match Factor']
                fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned by comparing FID and MS retention times'

            #Otherwise, pass
            else:
                pass

            return fpmDF

        #Loop through every row in the dataframe
        for i, row in fpmDF.iterrows():
            #If the row's compound name is not blank
            if not pd.isna(row['Compound Name']):
                #If the row's compound source is either manual or blank, skip it
                if row['Compound Source'] == 'Manual' or pd.isna(row['Compound Source']):
                    pass
                #Otherwise..
                else:
                    #Match one FID peak
                    fpmDF = matchOne(fpmDF, [i,row], peakError)
            #Otherwise, if the row's compound name is blank..
            else:
                #Match one FID peak
                fpmDF = matchOne(fpmDF, [i,row], peakError)

        return fpmDF

    #Function that performs compound type abbreviation assignment
    def ctaAssign(importDF, contains, keyLoop, elementExclude):

        #Function that returns a compound type abbreviation corresponding to a compound
        def assignType(compoundName,contains,keyLoop):

            #Define default compound type abbreviation as 'O'
            CTA = 'O'

            #Function that accepts a list of substrings to check against a string and returns a boolean
            def stringSearch(string,subList):
                #Define export boolean default value
                checkTF = False
                #For every substring in subList...
                for i in range(len(subList)):

                    #If the substring can be found in the string...
                    if subList[i] in string:
                        #Assign boolean to True and break
                        checkTF = True
                        break
                    #Otherwise, pass
                    else:
                        pass

                return checkTF

            #Loop through every key (compound type abbreviation) in contains
            for i in keyLoop:

                #If at least one substring in the key's list is found in compoundName...
                if stringSearch(compoundName,contains[i]):
                    #Assign the compound type abbreviation to the current key and break the loop
                    CTA = i
                    break
                #Otherwise, pass
                else:
                    pass

            return CTA

        #Function that checks if formula string contains any of a list of elements
        def checkElements(compoundFormula,elementList):
            #Assign default export boolean to False
            checkTF = False

            #For every substring in elementList...
            for i in range(len(elementList)):
                #If the substring can be found in the compound formula...
                if elementList[i] in compoundFormula:
                    #Set boolean to True and break
                    checkTF = True
                    break
                #Otherwise, pass
                else:
                    pass

            return checkTF

        #For every entry in the csv, assign a compound type abbreviation
        for i, row in importDF.iterrows():

            #Retrieve compound name and formula from row entry
            compoundName = row['Compound Name']
            compoundFormula = row['Formula']

            #If the compound formula is a string...
            if isinstance(compoundFormula,str):

                #If the formula contains excluded elements...
                if checkElements(compoundFormula,elementExclude):

                    #Assign 'O' to the row's compound type abbreviation entry
                    importDF.at[i,'Compound Type Abbreviation'] = 'O'

                #Otherwise...
                else:

                    #If the compound name is a string...
                    if isinstance(compoundName,str):

                        #Change compound name to lowercase
                        compoundName = compoundName.lower()
                        #Get a corresponding compound type abbreviation
                        CTA = assignType(compoundName, contains, keyLoop)
                        #Assign this CTA to the row's compound type abbreviation entry
                        importDF.at[i,'Compound Type Abbreviation'] = CTA

                    #Otherwise, pass
                    else:
                        pass

        return importDF

    #Define function that loops through every row in a DataFrame and modifies rows with duplicate compounds
    def duplicateHandle(DF):

        #Define function that searches for rows in a DataFrame with duplicate compound names
        def duplicateSearch(DF,cmp_name):

            #Get a new dataframe that is a copy of the first argument
            DF_out = DF.copy()

            #Filter the dataframe using the provided compound name
            DF_out = DF_out[DF_out['Compound Name'] == cmp_name]

            #Define a Boolean describing whether or not there are duplicate rows
            duplicate_TF = False

            #If the DF_out dataframe is longer than one (if there are duplicate rows)...
            if len(DF_out) > 1:

                #Assign the Boolean to true
                duplicate_TF = True

                #Define the dataframe to be returned
                DF_return = DF_out.copy()

            #Otherwise, define the return dataframe as empty
            else:
                DF_return = pd.DataFrame()

            #Return the boolean and the filtered DataFrame
            return duplicate_TF, DF_return

        #Define function that handles a given DataFrame of duplicates
        def duplicateLogic(DF_search):

            #Define the output DataFrame as a copy of the input
            DF_logic = DF_search.copy()

            #Get the row in the DataFrame with the largest area
            maxSeries = DF_logic.loc[DF_logic['FID Area'].idxmax()]

            #Get the name and compound type of this compound
            max_name = maxSeries['Compound Name']
            max_type = maxSeries['Compound Type Abbreviation']

            #Get the remaining entries in the DataFrame
            DF_logic = DF_logic.drop([maxSeries.name],axis=0)

            #For every row in the remaining entries DataFrame, rename the compound to 'Isomer of..'
            for i, row in DF_logic.iterrows():

                #Get the new compound name
                new_cmp_name = 'Isomer of ' + max_name

                #Replace the compound name
                DF_logic.at[i,'Compound Name'] = new_cmp_name

                #If the compound type of the maxSeries is linear alkanes...
                if max_type == 'L':

                    #Set the current row's compound type to branched alkanes
                    DF_logic.at[i,'Compound Type Abbreviation'] = 'B'

                #Otherwise, pass
                else:
                    pass

            #Return the logic DataFrame
            return DF_logic

        #Define a function that replaces rows in the primary DataFrame with matches in the secondary, assuming the indices match
        def duplicateReplace(pDF,sDF):

            #For every entry in the secondary DataFrame...
            for i, row in sDF.iterrows():

                #Get the row's name, which is the numeric index in the DataFrame
                row_name = row.name

                #For every index in the row...
                for j in row.index:

                    #Replace the corresponding entry in the pDF at the preserved sDF index
                    pDF.at[row_name,j] = row[j]

            return pDF

        #Define a list of compound names already handled
        cmp_nameList = []

        #Create a copy of the argument DataFrame to be used
        DF_in = DF.copy()

        #Define the DataFrame to be returned, defaulting to the unmodified copy
        DF_done = DF_in

        #Initiate a DataFrame for the logic output
        DF_logic = pd.DataFrame()

        #For every row in the provided DataFrame
        for i, row in DF_in.iterrows():

            #Get the compound name in that row
            cmp_name = row['Compound Name']

            #If the compound name is in the list of compound names handled, pass
            if cmp_name in cmp_nameList:
                pass

            #Otherwise...
            else:

                #If the compound name is 'No Match' or 'No match' or nan, pass
                if cmp_name == 'No Match' or cmp_name == 'No match' or pd.isna(cmp_name):
                    pass

                #Otherwise...
                else:

                    #Run the duplicate search function for that compound name
                    duplicate_TF, DF_search = duplicateSearch(DF_in,cmp_name)

                    #If duplicate_TF is True...
                    if duplicate_TF:
                        #Run the duplicate logic function
                        DF_logic = duplicateLogic(DF_search)

                        #Run the duplicate replace function
                        DF_done = duplicateReplace(DF_in,DF_logic)

                    #Otherwise, pass
                    else:
                        pass

                #Add the compound name to the compound name list
                cmp_nameList.append(cmp_name)

        return DF_done

""" DATA IMPORTS """
|
|
1042
|
+
print("[AutoFpmMatch] Importing data...")
|
|
1043
|
+
#Import sample information from json file
|
|
1044
|
+
with open(os.path.join(DF_Dir,sname,sname+sub_Dict['Info'][0])) as sinfo_f:
|
|
1045
|
+
sinfo = json.load(sinfo_f)
|
|
1046
|
+
|
|
1047
|
+
#Change ISO date-time strings into datetime objects
|
|
1048
|
+
sinfo['Start Time'] = datetime.fromisoformat(sinfo['Start Time'])
|
|
1049
|
+
sinfo['End Time'] = datetime.fromisoformat(sinfo['End Time'])
|
|
1050
|
+
|
|
1051
|
+
#Calculate a reaction time using the start, end, and heat time values and add to sinfo
|
|
1052
|
+
sinfo['Reaction Time (hr)'] = abs(sinfo['End Time']-sinfo['Start Time']).total_seconds()/3600 - sinfo['Heat Time']
|
|
1053
|
+
|
|
1054
|
+
#Run the file naming function
|
|
1055
|
+
paths = fileNamer(sname,sphase,sub_Dict,Raw_Dir)
|
|
1056
|
+
|
|
1057
|
+
#Import MS UPP data
|
|
1058
|
+
mDF = pd.read_csv(paths[1])
|
|
1059
|
+
|
|
1060
|
+
#Get only relevant columns of MS UPP data
|
|
1061
|
+
mDF = mDF.loc[:,['Component RT','Compound Name','Formula','Match Factor']]
|
|
1062
|
+
|
|
1063
|
+
#Import known compounds resource
|
|
1064
|
+
kc_rsc = pd.read_csv(os.path.join(RE_Dir,'known_compounds.csv'))
|
|
1065
|
+
#Filter known compounds to only include rows with the same catalyst
|
|
1066
|
+
#AND compounds which were not identified by the current sample
|
|
1067
|
+
kc_rsc = kc_rsc.loc[(kc_rsc['Catalyst']==sinfo['Catalyst Type'])&(kc_rsc['Sample Name']!=sinfo['Sample Name'])]
|
|
1068
|
+
|
|
1069
|
+
#Import gasPairs_FIDpMS.csv resource
|
|
1070
|
+
gp_rsc = pd.read_csv(os.path.join(RE_Dir,'gasPairs_FIDpMS.csv'))
|
|
1071
|
+
|
|
1072
|
+
""" CODE """
|
|
1073
|
+
print("[AutoFpmMatch] Checking files...")
|
|
1074
|
+
#Run the file checking function
|
|
1075
|
+
fpmDF, tf = checkFile(paths[2],paths[0])
|
|
1076
|
+
|
|
1077
|
+
#If the specified model is linear...
|
|
1078
|
+
if model == "L":
|
|
1079
|
+
#If the file contains manually matched peaks..
|
|
1080
|
+
if tf:
|
|
1081
|
+
print("[AutoFpmMatch] Matching by linear fit...")
|
|
1082
|
+
#Run the linear fit function
|
|
1083
|
+
fpmDF_mb, linfits, counts = RTlinfit(fpmDF)
|
|
1084
|
+
#Run the peak matching function
|
|
1085
|
+
fpmDF = matchPeaksLinear(fpmDF,mDF,linfits,peakError)
|
|
1086
|
+
|
|
1087
|
+
else:
|
|
1088
|
+
pass
|
|
1089
|
+
|
|
1090
|
+
#Otherwise, if the specified model is third order...
|
|
1091
|
+
elif model == "T":
|
|
1092
|
+
print("[AutoFpmMatch] Matching by third order fit...")
|
|
1093
|
+
#Run the gasPeaks_FIDpMS resource matching function
|
|
1094
|
+
fpmDF = matchKnownPeaks(fpmDF,mDF,gp_rsc)
|
|
1095
|
+
#Run the third order peak matching function
|
|
1096
|
+
fpmDF = matchPeaksThird(fpmDF,mDF,fit_const,peakError)
|
|
1097
|
+
|
|
1098
|
+
#Otherwise, if the specified model is retention time match...
|
|
1099
|
+
elif model == "R":
|
|
1100
|
+
print("[AutoFpmMatch] Matching by retention time...")
|
|
1101
|
+
#Run the liquid retention time matching function
|
|
1102
|
+
fpmDF = matchRT(fpmDF,mDF,peakErrorRT)
|
|
1103
|
+
|
|
1104
|
+
#Otherwise, pass
|
|
1105
|
+
else:
|
|
1106
|
+
pass
|
|
1107
|
+
|
|
1108
|
+
#Run the speculative labeling function
|
|
1109
|
+
if splab_TF == "True":
|
|
1110
|
+
print("[AutoFpmMatch] Running speculative labeling...")
|
|
1111
|
+
fpmDF = specLab(fpmDF, kc_rsc, sinfo, counts, peakError, restrictList)
|
|
1112
|
+
else:
|
|
1113
|
+
pass
|
|
1114
|
+
|
|
1115
|
+
print("[AutoFpmMatch] Matching complete.")
|
|
1116
|
+
|
|
1117
|
+
print("[AutoFpmMatch] Assigning compound type abbreviations...")
|
|
1118
|
+
#Run the compound type abbreviation assignment function
|
|
1119
|
+
fpmDF = ctaAssign(fpmDF, contains, keyLoop, elementExclude)
|
|
1120
|
+
print("[AutoFpmMatch] Handling duplicates...")
|
|
1121
|
+
#Run the duplicate handling function
|
|
1122
|
+
fpmDF = duplicateHandle(fpmDF)
|
|
1123
|
+
|
|
1124
|
+
print("[AutoFpmMatch] Saving results...")
|
|
1125
|
+
#Save the FIDpMS data
|
|
1126
|
+
fpmDF.to_csv(paths[2],index=False)
|
|
1127
|
+
|
|
1128
|
+
print("[AutoFpmMatch] Matching complete.")
|
|
1129
|
+
#Close main function by returning
|
|
1130
|
+
return None
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
|
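For reference, the sketch below shows one way the entry point above might be driven from Python. It is a minimal, hypothetical example: the sample name and directory paths are placeholders, and the import path simply assumes the module location listed at the top of this diff (chromaquant/Match/AutoFpmMatch.py). The directory keys mirror the unpacking at the start of main_AutoFpmMatch, and the 'data' and 'resources' folders are expected to hold the per-sample files and CSV resources the function reads.

# Hypothetical driver for main_AutoFpmMatch; paths and sample name are placeholders only.
from chromaquant.Match.AutoFpmMatch import main_AutoFpmMatch

# Keys mirror the unpacking at the top of main_AutoFpmMatch.
directories = {
    'files': '/path/to/files',
    'resources': '/path/to/resources',   # expected to contain known_compounds.csv and gasPairs_FIDpMS.csv
    'theme': '/path/to/theme',
    'rf': '/path/to/response-factors',
    'data': '/path/to/data',             # expected to contain <sample>/<sample>_INFO.json and <sample>/raw data/
    'images': '/path/to/images',
}

# Arguments: sample name, phase ("G" gas / "L" liquid),
# speculative labeling flag (the string "True" enables it),
# model ("L" linear fit, "T" third order fit, "R" retention time), directories.
main_AutoFpmMatch('SAMPLE01', 'G', 'False', 'R', directories)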