chromaquant 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,798 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+ COPYRIGHT STATEMENT:
6
+
7
+ ChromaQuant – A quantification software for complex gas chromatographic data
8
+
9
+ Copyright (c) 2024, by Julia Hancock
10
+ Affiliation: Dr. Julie Elaine Rorrer
11
+ URL: https://www.rorrerlab.com/
12
+
13
+ License: BSD 3-Clause License
14
+
15
+ ---
16
+
17
+ SCRIPT WHICH MATCHES FID AND MS PEAKS
18
+
19
+ Julia Hancock
20
+ Started 12/29/2023
21
+
22
+ """
23
+ """ PACKAGES """
24
+ import sys
25
+ import pandas as pd
26
+ import os
27
+ from molmass import Formula
28
+ import math
29
+ import numpy as np
30
+ from chemformula import ChemFormula
31
+ import json
32
+ from datetime import datetime
33
+ import logging
34
+ import scipy
35
+
36
+ """ PARAMETERS """
37
#Default coefficients of the cubic a*x^3 + b*x^2 + c*x + d that is used to
#pair gas FID and MS peaks
a_tof, b_tof, c_tof, d_tof = 0.03, -0.5839, 5, -3.2099

#Same coefficients gathered in order of decreasing power
fit_const = [a_tof, b_tof, c_tof, d_tof]
48
+
49
+
50
+ """ SAMPLE INFO """
51
#Name of the sample to process
sname = 'example'

#Phase of the sample: "L" (liquid) or "G" (gas)
sphase = "G"

#Whether speculative labeling should be performed
splab_TF = False

#Allowable retention-time error shared by linear and speculative peak matching
peakError = 0.06

#Peak matching model: "T" (third order) or "L" (linear)
model = "T"

#Restrictions and preferences applied during speculative labeling:
#  * first element  - list of properties that must match for a label to be applied
#  * second element - dictionary of preferred properties used to decide between
#    multiple matches, written as
#    "kc_rsc":"allowable error between speculative entry and sample value"
#Preferences are applied in order, so earlier entries outweigh later ones
restrictList = [
    ['Gas'],
    {'Reaction Temperature (C)': 5},
]

#Timestamp marking the start of execution, used to report the run time
exec_start = datetime.now()
74
+
75
+ """ COMPOUND TYPE ASSIGNMENT VARIABLES """
76
#Lookup tables used to assign a compound type from a compound name string.
#
#Six compound types exist: linear alkanes (L), branched alkanes (B), aromatics (A),
#cycloalkanes (C), alkenes/alkynes (E), and other (O).
#
#Each abbreviation maps to the list of substrings that are checked against a
#compound name string.
contains = {
    'L': [
        'methane', 'ethane', 'propane', 'butane', 'pentane', 'hexane', 'heptane', 'octane', 'nonane',
        'decane', 'undecane', 'hendecane', 'dodecane', 'tridecane', 'tetradecane', 'pentadecane',
        'hexadecane', 'heptadecane', 'octadecane', 'nonadecane',
        'icosane', 'eicosane', 'heneicosane', 'henicosane', 'docosane', 'tricosane', 'tetracosane',
        'pentacosane', 'hexacosane', 'cerane', 'heptacosane', 'octacosane', 'nonacosane',
        'triacontane', 'hentriacontane', 'untriacontane', 'dotriacontane', 'dicetyl', 'tritriacontane',
        'tetratriacontane', 'pentatriacontane', 'hexatriacontane', 'heptatriacontane',
        'octatriacontane', 'nonatriacontane',
        'tetracontane', 'hentetracontane', 'dotetracontane', 'tritetracontane', 'tetratetracontane',
        'pentatetracontane', 'hexatetracontane', 'heptatetracontane', 'octatetracontane',
        'nonatetracontane', 'pentacontane',
    ],
    'B': [
        'iso', 'methyl', 'ethyl', 'propyl', 'butyl', 'pentyl', 'hexyl', 'heptyl', 'octyl', 'nonyl',
        'decyl', 'undecyl', 'dodecyl', 'tridecyl', 'tetradecyl', 'pentadecyl', 'hexadecyl',
        'heptadecyl', 'octadecyl', 'nonadecyl',
        'icosyl', 'eicosyl', 'heneicosyl', 'henicosyl', 'docosyl', 'tricosyl', 'tetracosyl',
        'pentacosyl', 'hexacosyl', 'heptacosyl', 'octacosyl', 'nonacosyl',
        'triacontyl', 'hentriacontyl', 'untriacontyl', 'dotriacontyl', 'tritriacontyl',
        'tetratriacontyl', 'pentatriacontyl', 'hexatriacontyl', 'heptatriacontyl',
        'octatriacontyl', 'nonatriacontyl',
        'tetracontyl', 'hentetracontyl', 'dotetracontyl', 'tritetracontyl', 'tetratetracontyl',
        'pentatetracontyl', 'hexatetracontyl', 'heptatetracontyl', 'octatetracontyl',
        'nonatetracontyl', 'pentacontyl',
    ],
    'A': [
        'benzyl', 'benzo', 'phenyl', 'benzene', 'toluene', 'xylene', 'mesitylene', 'durene',
        'naphthalene', 'fluorene', 'anthracene', 'phenanthrene', 'phenalene',
        'tetracene', 'chrysene', 'triphenylene', 'pyrene', 'pentacene', 'perylene', 'corannulene',
        'coronene', 'ovalene', 'indan', 'indene', 'tetralin',
    ],
    'C': ['cyclo', 'menthane'],
    'E': ['ene', 'yne'],
    'O': [],
}

#Keys of contains listed in order of priority
keyLoop = ('A', 'C', 'E', 'B', 'L')

#Elements whose presence automatically labels a compound as 'O'
elementExclude = (
    'He', 'Li', 'Be', 'B', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P',
    'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co',
    'Ni', 'Cu', 'Zn',
)
113
+
114
+
115
+ """ DIRECTORIES """
116
#Main directory (absolute path on the author's machine, with trailing slash)
cwd = "/Users/connards/Desktop/University/Rorrer Lab/Scripts/AutoQuant/"

#Dictionary of every directory the script reads from or writes to.
#All values end with a slash because file names are appended by string concatenation.
direcDict = {
    'cwd': cwd,                                #Main directory
    'resources': cwd + 'resources/',           #Resources directory
    'DF_Dir': cwd + "data/" + sname + "/",     #Data file directory
}
direcDict['DF_raw'] = direcDict['DF_Dir'] + "raw data/"    #Raw data files directory
direcDict['DFlog_Dir'] = direcDict['DF_Dir'] + "log/"      #Data file log directory

#Suffixes appended to the sample name to build each data file name
sub_Dict = {
    'Gas TCD+FID': ['_GS2_TCD_CSO.csv'],
    'Gas Labelled MS Peaks': ['_GS1_UA_Comp_UPP.csv'],
    'Gas FID+MS': ['_GS2_FIDpMS.csv'],
    'Liquid FID': ['_LQ1_FID_CSO.csv'],
    'Liquid Labelled MS Peaks': ['_LQ1_UA_Comp_UPP.csv'],
    'Liquid FID+MS': ['_LQ1_FIDpMS.csv'],
    'Info': ['_INFO.json'],
}
134
+
135
+
136
+ """ LOGGING """
137
#Get current datetime
now = datetime.now()
#Date stamp (YYYYMMDD) used in the log file name
nows = now.strftime('%Y%m%d')

#Make sure the log directory exists inside the sample folder.
#exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(direcDict['DFlog_Dir'], exist_ok=True)

#Instantiate a logger
logger = logging.getLogger(__name__)
#Set logging level
logger.setLevel(logging.INFO)

#Create the formatter first so the handler is fully configured before it is attached
formatter = logging.Formatter('[%(filename)s] %(asctime)s - [%(levelname)s]: %(message)s')
#Initialize logging file using the current date
fh = logging.FileHandler(direcDict['DFlog_Dir']+'quantlog_'+nows+'.log')
fh.setFormatter(formatter)
logger.addHandler(fh)
156
+
157
+ """ FUNCTIONS """
158
+
159
+ #Function for selecting FID peak, MS peak, and FIDpMS pathnames according to sample name and phase
160
def fileNamer(sname,sphase,sub_Dict,pathData):
    """
    Build the FID, MS and FIDpMS pathnames for a sample of a given phase.

    Parameters
    ----------
    sname : STR
        The name of the sample.
    sphase : STR
        "G" for a gas sample or "L" for a liquid sample.
    sub_Dict : Dict
        Dictionary mapping a file description to a one-element list holding the
        suffix that is appended to the sample name.
    pathData : STR
        Pathname of the directory containing the data files.

    Returns
    -------
    paths : List
        [pathFID, pathMS, pathFIDpMS]; all three are None when sphase is neither
        "L" nor "G".
    """
    #sub_Dict keys for each phase, ordered as FID, MS, FIDpMS
    phase_keys = {
        "L": ('Liquid FID', 'Liquid Labelled MS Peaks', 'Liquid FID+MS'),
        "G": ('Gas TCD+FID', 'Gas Labelled MS Peaks', 'Gas FID+MS'),
    }

    selected = phase_keys.get(sphase)

    #Unknown phase: no pathnames can be built
    if selected is None:
        return [None, None, None]

    return [os.path.join(pathData, sname + sub_Dict[key][0]) for key in selected]
200
+
201
+ #Function for checking if FIDpMS file exists – creates it if necessary and imports/returns the data
202
def checkFile(fpmDir,fDir):
    """
    Load the FIDpMS file for a sample, building it from the FID file when missing.

    Parameters
    ----------
    fpmDir : STR
        Pathname of the FIDpMS file in question.
    fDir : STR
        Pathname of the FID file of the same sample/phase; only read when the
        FIDpMS file does not exist yet.

    Returns
    -------
    fpmDF : DataFrame
        Contents of the FIDpMS file (either read from disk or newly created).
    exists : BOOL
        True only when the file already existed and at least one labelled peak
        has 'Manual' or a blank value as its Compound Source.
    """
    #The file already exists: read it and inspect its labelled peaks
    if os.path.exists(fpmDir):
        fpmDF = pd.read_csv(fpmDir)

        #No labelled peaks at all
        if not fpmDF['Compound Name'].any():
            logger.info('FIDpMS file exists but contains no labelled peaks')
            return fpmDF, False

        #Compound sources of every row that has a compound name
        labelled_sources = fpmDF.loc[fpmDF['Compound Name'].notna(), 'Compound Source']
        has_manual = 'Manual' in labelled_sources.tolist()

        if has_manual or labelled_sources.isna().any():
            logger.info('FIDpMS file exists and contains manual and/or blank sourced entries')
            return fpmDF, True

        logger.info('FIDpMS file exists but does not contains manual or blank sourced entries')
        return fpmDF, False

    #The file is missing: derive a fresh one from the FID data
    logger.info('FIDpMS file not found for sample and phase, creating new...')
    fid_table = pd.read_csv(fDir)

    #Keep FID rows only, since gas samples may also carry TCD rows
    fpmDF = fid_table.loc[fid_table['Signal Name'] == 'FID1A'].copy()

    #The Height column is recycled as the (initially empty) MS RT column
    fpmDF = fpmDF.rename(columns={'RT':'FID RT','Area':'FID Area','Height':'MS RT'})
    fpmDF['MS RT'] = np.nan

    #Append the empty columns that are filled in by peak matching
    for blank_column in ('Formula','Match Factor','Compound Source','Compound Type Abbreviation'):
        fpmDF[blank_column] = np.nan

    #Remove the columns that are not carried over into the FIDpMS file
    fpmDF = fpmDF.drop(columns=['Injection Data File Name','Signal Name']).copy()

    #Save the new file to the provided pathname
    fpmDF.to_csv(fpmDir, index=False)

    return fpmDF, False
268
+
269
+ #Function describing a third order fit for gas analysis
270
def defaultGas(FIDRT,fpmDF,fit_const=[a_tof,b_tof,c_tof,d_tof]):
    """
    Evaluate the default third order fit used for gas analysis peak matching.

    Parameters
    ----------
    FIDRT : Float
        FID retention time for which an MS retention time is required.
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info (not used by the calculation).
    fit_const : List, optional
        Coefficients of the third order fit ordered from the cubic term down to
        the constant term. The default is [a_tof,b_tof,c_tof,d_tof].

    Returns
    -------
    MSRT : Float
        MS retention time calculated from the FID retention time.
    """
    #Name each coefficient by the power of FIDRT it multiplies
    cubic = fit_const[0]
    quadratic = fit_const[1]
    linear = fit_const[2]
    constant = fit_const[3]

    MSRT = cubic*FIDRT**3 + quadratic*FIDRT**2 + linear*FIDRT + constant

    return MSRT
292
+
293
+ #TODO: Function for creating a third order fit using manually matched peaks
294
+
295
+ #Function for creating a linear fit using manually matched peaks
296
def RTlinfit(fpmDF):
    """
    Fit the drift between FID and MS retention times using manually matched peaks.

    Only labelled rows (non-blank 'Compound Name') whose 'Compound Source' is
    'Manual' or blank take part in the fit. The quantity that is fitted is the
    peak difference (FID RT - MS RT) as a linear function of FID RT.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. It is not modified.

    Returns
    -------
    fpmDF_mb : DataFrame or None
        Copy of the manual/blank sourced rows, with an added 'peakDiff' column when
        more than one such row exists. None when there are no such rows.
    linfits : List
        [peakDrift, peakOffset, r2]. With a single usable peak the drift and r2 are 0
        and the offset is that peak's difference.
    counts : List
        [all_count, manual_count, blank_count], where all_count = manual + blank.
    """
    #Local import: a bare "import scipy" does not guarantee the stats submodule is loaded
    from scipy import stats

    #Labelled peaks, split by whether their source is manual or blank
    labelled = fpmDF.loc[fpmDF['Compound Name'].notna()]
    is_manual = labelled['Compound Source'] == 'Manual'
    is_blank = labelled['Compound Source'].isna()

    #Copy so the peakDiff column added below never touches the caller's dataframe
    fpmDF_mb = labelled.loc[is_manual | is_blank].copy()

    manual_count = int(is_manual.sum())
    blank_count = int(is_blank.sum())
    all_count = manual_count + blank_count

    #Nothing to fit: return the same three-item shape as the normal path so that
    #callers unpacking (fpmDF_mb, linfits, counts) do not fail
    if all_count == 0:
        logger.error('Linear fit function provided a dataframe without manual or blank entries')
        return None, [0,0,0], [0,0,0]

    #Warn when one or more labelled peaks have no recorded source
    if blank_count > 0:
        logger.warning("One or more labelled peaks in the FIDpMS file have no entry for Compound Source")

    peakDrift = 0   #Peak drift, the linear slope describing drift between FID and MS RT's
    peakOffset = 0  #Peak offset, the initial offset between FID and MS RT's
    r2 = 0          #Coefficient of determination of the linear fit

    if all_count == 1:
        #A single peak cannot define a slope; use its difference as a constant offset
        peakOffset = fpmDF_mb['FID RT'].iloc[0] - fpmDF_mb['MS RT'].iloc[0]
    else:
        #Peak differences are the y-values and FID RT's the x-values of the fit
        fpmDF_mb['peakDiff'] = fpmDF_mb['FID RT'] - fpmDF_mb['MS RT']
        fit = stats.linregress(fpmDF_mb['FID RT'], fpmDF_mb['peakDiff'])
        peakDrift = fit.slope
        peakOffset = fit.intercept
        r2 = fit.rvalue**2

    counts = [all_count, manual_count, blank_count]

    return fpmDF_mb, [peakDrift,peakOffset,r2], counts
368
+
369
+ #Function that estimates unknown MS RT's and matches FID and MS peaks using a provided linear fit
370
def matchPeaksLinear(fpmDF,mDF,linfits,peakError=0.06):
    """
    Estimate MS RT's from FID RT's with a linear fit and match FID peaks to MS peaks.

    Rows that already have a compound name with a 'Manual' or blank compound source
    are left untouched; every other row is (re)matched.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. It is modified in place.
    mDF : DataFrame
        Dataframe containing MS info about identified compounds (UA_UPP)
    linfits : List
        List containing info about a linear fit for estimated MS RT's in the form [m,b,r2]
    peakError : Float, optional
        Allowable error between estimated MS RT's and actual MS RT's. The default is 0.06.

    Returns
    -------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info
    """
    #Get peak drift and peak offset parameters from linfits (r2 is not needed for matching)
    peakDrift = linfits[0]
    peakOffset = linfits[1]

    def matchOne(fpmDF,fpmiter,linfits,peakError):
        """
        Match a single FID peak against the MS peaks.

        Parameters
        ----------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        fpmiter : List
            List containing current index and row in fpmDF of interest in form [i,row]
        linfits : List
            Linear fit info in the form [m,b,r2]
        peakError : float
            Allowable error between estimated MS RT's and actual MS RT's

        Returns
        -------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        """
        #Unpack fpmDF iterating info
        fpmi = int(fpmiter[0])
        fpmrow = fpmiter[1]

        #The fit models (FID RT - MS RT), so the estimate is the FID RT minus the fitted difference
        est_MSRT = fpmrow['FID RT'] - (peakDrift*fpmrow['FID RT'] + peakOffset)

        #Keep the MS peaks whose RT lies within the allowed error of the estimate
        in_window = (mDF['Component RT'] >= est_MSRT-peakError) & (mDF['Component RT'] <= est_MSRT+peakError)
        mDF_match = mDF.loc[in_window].copy()

        #No MS peak close enough: leave the row as it is
        if len(mDF_match) == 0:
            return fpmDF

        #With several candidates keep the one whose actual MS RT is closest to the estimate.
        #The error must be measured per candidate; comparing the FID RT with the estimate
        #gives the same number for every candidate and cannot rank them.
        if len(mDF_match) > 1:
            mDF_match['RT Error'] = (mDF_match['Component RT'] - est_MSRT).abs()
            mDF_match = mDF_match.nsmallest(1,'RT Error')

        #Reset the index so the selected row can be addressed as row 0
        mDF_match = mDF_match.reset_index(drop=True)

        #Add the MS info to the FIDpMS dataframe
        fpmDF.at[fpmi,'MS RT'] = mDF_match.at[0,'Component RT']
        fpmDF.at[fpmi,'Compound Name'] = mDF_match.at[0,'Compound Name']
        fpmDF.at[fpmi,'Formula'] = mDF_match.at[0,'Formula']
        fpmDF.at[fpmi,'Match Factor'] = mDF_match.at[0,'Match Factor']
        fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using a linear fit of manual peak assignments'

        return fpmDF

    #Loop through every row in the dataframe
    for i, row in fpmDF.iterrows():
        labelled = not pd.isna(row['Compound Name'])
        #Labelled rows with a manual or blank source were assigned by the user; skip them
        if labelled and (row['Compound Source'] == 'Manual' or pd.isna(row['Compound Source'])):
            continue
        #Match one FID peak
        fpmDF = matchOne(fpmDF, [i,row], linfits, peakError)

    return fpmDF
466
+
467
+ #Function that estimates unknown MS RT's and matches FID and MS peaks using a provided third order fit
468
def matchPeaksThird(fpmDF,mDF,fit_const,peakError=0.06):
    """
    Estimate MS RT's from FID RT's with a third order fit and match FID and MS peaks.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. It is modified in place.
    mDF : DataFrame
        Dataframe containing MS info about identified compounds (UA_UPP)
    fit_const : List
        Coefficients of the third order fit, from the cubic term to the constant.
    peakError : Float, optional
        Allowable error between estimated and actual MS RT's (signature default 0.06).

    Returns
    -------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info
    """
    #Sources which mark a labelled peak as already settled
    gas_pair_source = 'Automatically assigned using gas pairs provided in resources'

    def is_settled(peak):
        #A peak is settled when it has a name and its source is manual, gas-pair based or blank
        if pd.isna(peak['Compound Name']):
            return False
        source = peak['Compound Source']
        return source == 'Manual' or source == gas_pair_source or pd.isna(source)

    for label, peak in fpmDF.iterrows():
        if is_settled(peak):
            continue

        target = int(label)

        #Evaluate the third order fit at this peak's FID RT
        x = peak['FID RT']
        est_MSRT = fit_const[0]*x**3 + fit_const[1]*x**2 + fit_const[2]*x + fit_const[3]

        #MS peaks whose RT lies within the allowed error of the estimate
        ms_rt = mDF['Component RT']
        candidates = mDF.loc[(ms_rt >= est_MSRT-peakError) & (ms_rt <= est_MSRT+peakError)].copy()

        #Several candidates: keep the one closest to the estimate
        if len(candidates) > 1:
            candidates['RT Error'] = (candidates['Component RT'] - est_MSRT).abs()
            candidates = candidates.nsmallest(1,'RT Error')

        #Nothing within error, leave the row untouched
        if len(candidates) == 0:
            continue

        #Address the selected MS peak as row 0
        chosen = candidates.reset_index().copy()

        #Copy the MS info into the FIDpMS dataframe
        fpmDF.at[target,'MS RT'] = chosen.at[0,'Component RT']
        fpmDF.at[target,'Compound Name'] = chosen.at[0,'Compound Name']
        fpmDF.at[target,'Formula'] = chosen.at[0,'Formula']
        fpmDF.at[target,'Match Factor'] = chosen.at[0,'Match Factor']
        fpmDF.at[target,'Compound Source'] = 'Automatically assigned using a predetermined third-order fit'

    return fpmDF
561
+
562
+ #Function that performs a subset of speculative labeling, using known peaks hard-coded in a file gasPairs_FIDpMS.csv
563
def matchKnownPeaks(fpmDF,mDF,gp_rsc):
    """
    Label FID peaks using the known gas peaks listed in the gasPairs resource.

    Rows that already have a compound name with a 'Manual' or blank compound source
    are left untouched; every other row is compared against the resource.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. It is modified in place.
    mDF : DataFrame
        Dataframe containing MS info about identified compounds (not used here).
    gp_rsc : DataFrame
        Opened gasPairs resource; the columns read are 'Average FID RT',
        'Stdev FID RT', 'Species' and 'Formula'.

    Returns
    -------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info
    """
    def matchOne(fpmDF,fpmiter,gp_rsc):
        """
        Label a single FID peak from the gasPairs resource if one entry fits.

        Parameters
        ----------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        fpmiter : List
            List containing current index and row in fpmDF of interest in form [i,row]
        gp_rsc : DataFrame
            Dataframe containing opened gasPairs resource.

        Returns
        -------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        """
        #Unpack fpmDF iterating info
        fpmi = int(fpmiter[0])
        fpmrow = fpmiter[1]

        #Initialise before the loop so an empty resource simply yields no match
        gp_match = None

        #Search the gasPairs resource for a known peak matching this FID RT
        for _, known in gp_rsc.iterrows():
            #Allowed error is two standard deviations of the resource FID RT
            gp_error = known['Stdev FID RT']*2
            gp_FIDRT = known['Average FID RT']
            #NOTE: prefers the first match, even if the next match is closer. Most resource
            #RT's are more than 2*error away from each other
            if gp_FIDRT - gp_error <= fpmrow['FID RT'] <= gp_FIDRT + gp_error:
                gp_match = known
                break

        #Add the resource match info to the FIDpMS dataframe when a match was found
        if gp_match is not None:
            fpmDF.at[fpmi,'Compound Name'] = gp_match['Species']
            fpmDF.at[fpmi,'Formula'] = gp_match['Formula']
            fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using gas pairs provided in resources'

        return fpmDF

    #Loop through every row in the dataframe
    for i, row in fpmDF.iterrows():
        labelled = not pd.isna(row['Compound Name'])
        #Labelled rows with a manual or blank source were assigned by the user; skip them
        if labelled and (row['Compound Source'] == 'Manual' or pd.isna(row['Compound Source'])):
            continue
        #Match one FID peak
        fpmDF = matchOne(fpmDF, [i,row], gp_rsc)

    return fpmDF
634
+
635
+
636
+ #Function that performs speculative labeling to label FID peaks which do not have a match
637
def specLab(fpmDF,kc_rsc,sinfo,counts,peakError,restrictList):
    """
    Speculatively label FID peaks that have no match, using known compounds.

    A peak is a candidate for labeling when its compound name is blank, 'No Match'
    or 'No match'. Entries of kc_rsc within peakError of the peak's FID RT are
    filtered by the hard restrictions, narrowed by the preferences in order, and,
    if several remain, the one with the highest match factor is used.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. It is modified in place.
    kc_rsc : DataFrame
        Known compounds resource. Columns read: 'FID RT', 'Compound Name', 'Formula',
        'Match Factor', 'Sample Name', 'Catalyst', 'Reaction Temperature (C)',
        'Reaction pressure (psi)', plus every property named in restrictList.
    sinfo : Dict
        Sample information; must hold 'Sample Name' and every property named in restrictList.
    counts : List
        Peak counts in the form [all_count, manual_count, blank_count], used for logging only.
    peakError : Float
        Allowable FID RT error between a peak and a known compound entry.
    restrictList : List
        [trueRestrict, prefer]: a list of properties that must be equal, and a dictionary
        of preferred properties mapped to their allowable error.

    Returns
    -------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info
    """
    #Unpack restrictList
    trueRestrict, prefer = restrictList
    #Log that speculative labeling is being performed
    logger.info('Performing speculative labeling on {0} with {1} peaks, {2} of which are labelled: {3} sourced manually and {4} with an unknown source'.format(sinfo['Sample Name'],len(fpmDF),counts[0],counts[1],counts[2]))

    #Both spellings that mark a peak (or a resource entry) as unmatched
    noMatchLabels = ('No Match','No match')

    #Loop through every entry in fpmDF
    for i, row in fpmDF.iterrows():
        name = row['Compound Name']

        #Only unlabelled peaks or peaks flagged with either form of "No Match" are speculated
        if not (pd.isna(name) or name in noMatchLabels):
            continue

        #Known compounds whose FID RT is within the provided error of this peak
        lower = row['FID RT']-peakError
        upper = row['FID RT']+peakError
        kc_check = kc_rsc.loc[(kc_rsc['FID RT']>=lower) & (kc_rsc['FID RT']<=upper)]
        #Filter out entries that themselves label the peak as unmatched
        kc_check = kc_check.loc[(kc_check['Compound Name']!='No Match') & (kc_check['Compound Name']!='No match')]

        #Hard restrictions: the property must equal the sample's value
        for entry in trueRestrict:
            kc_check = kc_check.loc[kc_check[entry]==sinfo[entry]]

        #Nothing left to speculate from
        if len(kc_check) == 0:
            continue

        chosen = None

        #Several candidates: narrow them down with the preferences, most valued first
        if len(kc_check) > 1:
            for key in prefer:
                narrowed = kc_check.loc[(kc_check[key]>=sinfo[key]-prefer[key])&(kc_check[key]<=sinfo[key]+prefer[key])]
                if len(narrowed) > 1:
                    #Still ambiguous, keep narrowing with the next preference
                    kc_check = narrowed
                elif len(narrowed) == 1:
                    #A single candidate satisfies this preference, use it
                    chosen = narrowed.iloc[0]
                    break
                #A preference no candidate satisfies is not applied

        if chosen is None:
            if len(kc_check) > 1:
                #Still ambiguous: take the highest match factor, or the first candidate
                #when no match factor is available to compare
                matchFactors = kc_check['Match Factor']
                if matchFactors.notna().any():
                    chosen = kc_check.loc[matchFactors.idxmax()]
                else:
                    chosen = kc_check.iloc[0]
            else:
                chosen = kc_check.iloc[0]

        #Add the speculated entry to fpmDF for the given row
        fpmDF.at[i,'Compound Name'] = chosen['Compound Name']
        fpmDF.at[i,'Formula'] = chosen['Formula']
        fpmDF.at[i,'Compound Source'] = 'Speculated based on {0}, which used {1} at {2}C and {3}psi'.format(chosen['Sample Name'],chosen['Catalyst'],chosen['Reaction Temperature (C)'],chosen['Reaction pressure (psi)'])

    return fpmDF
721
+
722
+
723
+ """ DATA IMPORTS """
724
#Load the sample information stored next to the data files as <sample name>_INFO.json
with open(direcDict['DF_Dir'] + sname + sub_Dict['Info'][0]) as sinfo_f:
    sinfo = json.load(sinfo_f)

#Replace the ISO date-time strings with datetime objects
sinfo.update({stamp: datetime.fromisoformat(sinfo[stamp]) for stamp in ('Start Time', 'End Time')})

#Reaction time in hours: elapsed time between start and end minus the heat time
sinfo['Reaction Time (hr)'] = (
    abs(sinfo['End Time'] - sinfo['Start Time']).total_seconds() / 3600
    - sinfo['Heat Time']
)

#Build the FID, MS and FIDpMS pathnames for this sample and phase
paths = fileNamer(sname, sphase, sub_Dict, direcDict['DF_raw'])

#Import the MS UPP data, keeping only the columns used for peak matching
mDF = pd.read_csv(paths[1]).loc[:, ['Component RT', 'Compound Name', 'Formula', 'Match Factor']]

#Import the known compounds resource, keeping entries that share the sample's
#catalyst AND that were not identified by the current sample itself
kc_rsc = pd.read_csv(direcDict['resources'] + 'known_compounds.csv')
kc_rsc = kc_rsc.loc[
    kc_rsc['Catalyst'].eq(sinfo['Catalyst Type'])
    & kc_rsc['Sample Name'].ne(sinfo['Sample Name'])
]

#Import gasPairs_FIDpMS.csv resource
gp_rsc = pd.read_csv(direcDict['resources'] + 'gasPairs_FIDpMS.csv')
752
+
753
+ """ CODE """
754
#Load (or create) the FIDpMS file; tf is True when it holds manual/blank sourced labels
fpmDF, tf = checkFile(paths[2],paths[0])

#Peak counts [all, manual, blank]; only the linear fit produces them directly
counts = None

#If the specified model is linear...
if model == "L":
    #A linear fit needs manually matched peaks, so nothing is matched without them
    if tf:
        #Run the linear fit function
        fpmDF_mb, linfits, counts = RTlinfit(fpmDF)
        #Run the peak matching function
        fpmDF = matchPeaksLinear(fpmDF,mDF,linfits,peakError)

#Otherwise, if the specified model is third order...
elif model == "T":
    #Run the gasPairs_FIDpMS resource matching function
    fpmDF = matchKnownPeaks(fpmDF,mDF,gp_rsc)
    #Run the third order peak matching function
    fpmDF = matchPeaksThird(fpmDF,mDF,fit_const,peakError)

#Run the speculative labeling function when requested.
#splab_TF is a boolean; the string "True" is still accepted for backward compatibility.
if splab_TF is True or splab_TF == "True":
    print("Running speculative labelling...")

    #Without a linear fit no counts exist yet; derive them so specLab can log them
    if counts is None:
        labelled_sources = fpmDF.loc[fpmDF['Compound Name'].notna(),'Compound Source']
        manual_count = int((labelled_sources == 'Manual').sum())
        blank_count = int(labelled_sources.isna().sum())
        counts = [manual_count + blank_count, manual_count, blank_count]

    fpmDF = specLab(fpmDF, kc_rsc, sinfo, counts, peakError, restrictList)

#Save the FIDpMS data. The index is left out, as in checkFile, so that re-reading the
#file on the next run does not add an extra 'Unnamed: 0' column each time.
fpmDF.to_csv(paths[2], index=False)

#End time for execution time
exec_end = datetime.now()
#Execution time in milliseconds
exec_time = (exec_end-exec_start).total_seconds()*10**3
print("Time to execute: {:.03f}ms".format(exec_time))
794
+
795
+
796
+
797
+
798
+