chromaquant 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chromaquant/Handle/__init__.py +12 -0
- chromaquant/Handle/handleDirectories.py +89 -0
- chromaquant/Manual/HydroUI.py +418 -0
- chromaquant/Manual/QuantUPP.py +373 -0
- chromaquant/Manual/Quantification.py +1305 -0
- chromaquant/Manual/__init__.py +10 -0
- chromaquant/Manual/duplicateMatch.py +211 -0
- chromaquant/Manual/fpm_match.py +798 -0
- chromaquant/Manual/label-type.py +179 -0
- chromaquant/Match/AutoFpmMatch.py +1133 -0
- chromaquant/Match/__init__.py +12 -0
- chromaquant/Quant/AutoQuantification.py +1329 -0
- chromaquant/Quant/__init__.py +12 -0
- chromaquant/__init__.py +10 -0
- chromaquant/__main__.py +493 -0
- chromaquant/properties.json +4 -0
- chromaquant-0.3.1.dist-info/METADATA +189 -0
- chromaquant-0.3.1.dist-info/RECORD +22 -0
- chromaquant-0.3.1.dist-info/WHEEL +4 -0
- chromaquant-0.3.1.dist-info/entry_points.txt +2 -0
- chromaquant-0.3.1.dist-info/licenses/LICENSE.txt +18 -0
- chromaquant-0.3.1.dist-info/licenses/LICENSES_bundled.txt +1035 -0
@@ -0,0 +1,373 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

COPYRIGHT STATEMENT:

ChromaQuant – A quantification software for complex gas chromatographic data

Copyright (c) 2024, by Julia Hancock
Affiliation: Dr. Julie Elaine Rorrer
URL: https://www.rorrerlab.com/

License: BSD 3-Clause License

---

UNKNOWNS ANALYSIS POST PROCESSING
Intended to sort through raw UA output to find best hits considering
compound constraints.

Julia Hancock
08/25/2023

First version (v1) completion: 08/31/2023

Improvement notes: -Add places to throw error and redirect user through console when user-inputted data goes wrong
                   -Add GUI
                   -Separate functions into packages, redesign nested function trees
                   -Check if saving data will cause an overwrite - if it does, add an additional suffix
"""

""" PACKAGES """

import pandas as pd
import numpy as np
import os
from pathlib import Path
import re

""" PARAMETERS """

#DIRECTORIES
#Name of folder to search for files
infolder_name = "MBPR025-033_GasMS_UA_Comp"
#Name of folder to export results
outfolder_name = infolder_name + "_UPP"

#PARAMETERS
#Limit within which peak RTs are treated as identical
PeakRTLim = 0.005

""" DIRECTORIES """

#Find the main file's path and its parent directory
main_filepath = Path(__file__)
main_directory = main_filepath.parent
import_directory = main_directory / "Imports"
export_directory = main_directory / "Exports"

#Directory for finding relevant files
fileDir = import_directory / infolder_name
#Directory for exporting constrained data
expfileDir = export_directory / outfolder_name

#Create parallel lists of full paths and bare filenames for every file in the
#provided directory (collecting the names here avoids reusing the os.walk loop
#variable "files" later, which would only hold the last directory visited)
fileLoc = []
fileNames = []
for path, subdirs, files in os.walk(fileDir):
    for name in files:
        fileLoc.append(os.path.join(path, name))
        fileNames.append(name)

""" COMPOUND CONSTRAINTS """
#Establish lists for two levels of element restrictions:
#1st list (softBar) - elements that will be allowed in a compound match only if there are
#                     no matches containing allowed elements alone. When the time comes,
#                     the list will be searched for matches in order of priority
#2nd list (noBar) - elements that will always be allowed

#Class for elements to enable compound constraints
class element:
    def __init__(self, Symbol, Name, Priority=float("nan")):
        #Element must always have a symbol and name
        self.Symbol = Symbol
        self.Name = Name
        #Priority is optional - a positive integer allowing more precise control over
        #choosing compound matches. NaN never compares equal to itself, so rather than
        #testing "Priority == float('nan')" the attribute is assigned only when a
        #valid positive integer is supplied
        if isinstance(Priority, int) and Priority > 0:
            self.Priority = Priority

#softBar list of semi-allowed elements
#softBar = [element("O","Oxygen",1),element("N","Nitrogen",2),element("Si","Silicon",4)]
softBar = [element("O","Oxygen",1),element("Si","Silicon",4)]
#noBar list of allowed elements
noBar = [element("H","Hydrogen"),element("C","Carbon")]
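
#Example (editor's sketch): softBar elements carry a Priority attribute while noBar
#elements do not, so hasattr() distinguishes the two restriction levels
assert softBar[0].Priority == 1 and softBar[1].Priority == 4
assert not hasattr(noBar[0], "Priority")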

""" FUNCTIONS """
#Function to unpack a .csv file into a dataframe
def unpackUA(filepath):
    Df = pd.read_csv(filepath)
    return Df

#Function to add match data to a dataframe as a new first row
def concatDF(dataSlice, DFin):
    #Assumes DFin provides the ten columns listed below and that dataSlice
    #contains at least these same labels
    col = ['Component RT','Compound Name','Formula','Match Factor','Previous Best Compound Name',
           'Previous Best Formula','Previous Best Match Factor','Previous Worst Compound Name',
           'Previous Worst Formula','Previous Worst Match Factor']
    listOut = [dataSlice[c] for c in col]
    DFout = pd.concat([pd.DataFrame([listOut], columns=DFin.columns), DFin], ignore_index=True)

    return DFout

#Function to append the best- and worst-match rows to a selected match series
def concatSeries(dataSlice, bestSlice, worstSlice):
    #Assumes all Series have these labels: ['Component RT','Compound Name','Formula','Match Factor']

    #Define dictionaries of new index names for bestSlice and worstSlice
    bindex = {'Component RT':'Previous Best Component RT','Compound Name':'Previous Best Compound Name',
              'Formula':'Previous Best Formula','Match Factor':'Previous Best Match Factor'}
    windex = {'Component RT':'Previous Worst Component RT','Compound Name':'Previous Worst Compound Name',
              'Formula':'Previous Worst Formula','Match Factor':'Previous Worst Match Factor'}

    #Rename the bestSlice and worstSlice indices on copies so the callers' Series are untouched
    bestSlice = bestSlice.copy()
    worstSlice = worstSlice.copy()
    bestSlice.rename(index=bindex, inplace=True)
    worstSlice.rename(index=windex, inplace=True)

    #Lists of indices from the best/worst slices to add to dataSlice
    bindexList = ['Previous Best Compound Name','Previous Best Formula','Previous Best Match Factor']
    windexList = ['Previous Worst Compound Name','Previous Worst Formula','Previous Worst Match Factor']

    #Concatenate into the return Series
    returnSlice = pd.concat([dataSlice, bestSlice.loc[bindexList], worstSlice.loc[windexList]], axis=0)
    return returnSlice
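
#Example (editor's sketch, hypothetical data): combine a chosen match with the
#best and worst rows it displaced
_sel = pd.Series({'Component RT': 1.0, 'Compound Name': 'Hexane', 'Formula': 'C6H14', 'Match Factor': 95})
_top = pd.Series({'Component RT': 1.0, 'Compound Name': 'Siloxane', 'Formula': 'C6H18OSi2', 'Match Factor': 97})
assert concatSeries(_sel, _top, _sel)['Previous Best Formula'] == 'C6H18OSi2'
del _sel, _top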

#Function to group retention times, taking the median to be the value of grouped peaks
def groupRT(rawDF):

    #Redefine for clarity
    filterDF = rawDF.copy()

    #Set up an empty list for output RTs (RT_permF), a list of temporary (original) RTs
    #(RT_temp) seeded with only the first original RT, and the median of that list
    RT_permF = []
    RT_temp = [rawDF['Component RT'][0]]
    RT_temp_median = RT_temp[0]

    #For all raw retention times, group times within PeakRTLim of each other
    for i in range(1, len(rawDF['Component RT'])):
        #Current retention time
        RT_current = rawDF['Component RT'][i]

        #If the current retention time is within the median plus/minus the peak limit, extend the group
        if RT_temp_median - PeakRTLim < RT_current < RT_temp_median + PeakRTLim:
            #Append to the list of like retention times
            RT_temp.append(RT_current)
            #Recalculate the median, rounding to 4 decimal places
            RT_temp_median = round(np.median(RT_temp), 4)
            #If the end of the dataframe has been reached, write out what's left
            if i == len(rawDF['Component RT']) - 1:
                filterDF.loc[i-len(RT_temp)+1:i, 'Component RT'] = RT_temp_median
                RT_permF.extend(np.full(len(RT_temp), RT_temp_median))
                RT_temp_median = RT_current
                RT_temp = [RT_current]

        #Otherwise, save RT_temp_median to all RT_temp positions, then reset RT_temp and RT_temp_median
        else:
            #Set the previous group's retention times to the median. The group spans rows
            #i-len(RT_temp) through i-1; .loc slices are inclusive on both ends, so the
            #endpoint must be i-1 to exclude the current (non-matching) row
            filterDF.loc[i-len(RT_temp):i-1, 'Component RT'] = RT_temp_median
            RT_permF.extend(np.full(len(RT_temp), RT_temp_median))
            RT_temp_median = RT_current
            RT_temp = [RT_current]

    #Delete working variables and return
    del RT_permF, RT_temp, RT_temp_median, RT_current
    return filterDF
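
#Example (editor's sketch, hypothetical data): three RTs within PeakRTLim of the
#running median collapse to that median; the distant peak at 2.000 stays put
_demoRT = pd.DataFrame({'Component RT': [1.000, 1.003, 1.004, 2.000]})
assert groupRT(_demoRT)['Component RT'].tolist() == [1.003, 1.003, 1.003, 2.000]
del _demoRT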

#Function to return True if the formula contains only noBar elements
def donoBar(formula, noBar):

    #Find all elements present in the formula
    elements = re.findall('[A-Z][a-z]?', formula)
    #Get the list of allowed element symbols from the noBar list
    allowed_elements = [e.Symbol for e in noBar]

    #..If the set difference between the lists is not empty (the formula has
    # elements besides the allowed ones), return False
    if set(elements).difference(set(allowed_elements)):
        tf = False
    #..Otherwise, return True
    else:
        tf = True

    return tf
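
#Example (editor's sketch): a pure hydrocarbon passes, a siloxane does not
assert donoBar("C6H14", noBar) is True
assert donoBar("C6H18OSi2", noBar) is False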

#Function to return True if the formula contains only noBar elements and softBar
#elements at or above the given priority (lower numbers = higher priority)
def dosoftBar(formula, noBar, softBar, priority):

    #Find all elements present in the formula
    elements = re.findall('[A-Z][a-z]?', formula)
    #Get a dataframe of element symbols and priorities from softBar
    ePDF = pd.DataFrame.from_dict({"Symbol": [obj.Symbol for obj in softBar],
                                   "Priority": [obj.Priority for obj in softBar]})
    #Get the list of symbols with the provided priority number or lower, then add the noBar elements
    allowed_elements = ePDF.loc[ePDF['Priority'] <= priority, 'Symbol'].to_list()
    allowed_elements.extend([e.Symbol for e in noBar])
    #Delete the elements dataframe
    del ePDF

    #..If the set difference between the lists is not empty (the formula has
    # elements besides the allowed ones), return False
    if set(elements).difference(set(allowed_elements)):
        tf = False
    #..Otherwise, return True
    else:
        tf = True

    return tf
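
#Example (editor's sketch): with priority 1 only oxygen is admitted alongside
#C and H; silicon (priority 4) requires relaxing the search to priority 4
assert dosoftBar("C2H6O", noBar, softBar, 1) is True
assert dosoftBar("C2H6OSi", noBar, softBar, 1) is False
assert dosoftBar("C2H6OSi", noBar, softBar, 4) is True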

#Function to choose best matches according to compound constraints
def constrain(filterDF, constList):
    """
    This function loops through the dataframe, selecting the best match out of duplicate retention time matches.

    INPUTS: filterDF - the dataframe to be filtered
            constList - a list containing constraints in the form [noBar, softBar]

    OUTPUTS: constDF - a dataframe containing the best matches for each retention time

    APPROACH: 1) Get a list of all retention times in the dataframe;
              2) Loop through each retention time, getting a slice of the dataframe;
              3) Loop through compound constraints to pick the best match in the slice;
              4) Append the result to a new, constrained dataframe

    SELECTING BEST MATCH: 1) If the first formula of the sorted slice contains only noBar elements, add it to the constrained dataframe
                          2) Otherwise, test the next formula
                          3) If all other formulas have elements besides noBar, go back to the first value and
                             allow its formula if it contains only highest priority elements
                          4) If it contains lower priority/blocklist elements, repeat down the slice
                          5) If all formulas contain lower priority elements, allow the next priority and repeat the search
                          6) If all formulas contain elements not listed in noBar or softBar, add a "No Match" row
    """

    #Unpack constList into noBar and softBar
    noBar, softBar = constList
    #Get the unique priorities written in softBar, sorted ascending (highest priority first)
    priorList = sorted(set(x.Priority for x in softBar))
    #Get the list of all retention times
    arrayRF = filterDF['Component RT'].unique()
    #Create a dataframe for the outputted data
    constDF = pd.DataFrame(columns=['Component RT','Compound Name','Formula','Match Factor','Previous Best Compound Name',
                                    'Previous Best Formula','Previous Best Match Factor','Previous Worst Compound Name',
                                    'Previous Worst Formula','Previous Worst Match Factor'])

    #For every listed retention time, select the best match
    for RTi in arrayRF:

        #Get a slice containing all possible compounds at the given RT
        compound_slice = filterDF.loc[filterDF["Component RT"] == RTi]
        #Remove Unknowns from the slice (an empty slice is handled by the try/except below)
        compound_slice = compound_slice.loc[~compound_slice["Compound Name"].str.contains("Unknown")]
        #Sort the slice by match factor and reset the indices
        test_slice = compound_slice.sort_values(by=['Match Factor'], ascending=False).reset_index(drop=True)

        #Find the rows with the best and worst match factors
        try:
            best_match = test_slice.iloc[0, :]
            worst_match = test_slice.iloc[len(test_slice)-1, :]
        except IndexError:
            best_match = pd.Series(dtype='float64', index=['Component RT','Compound Name','Formula','Match Factor'])
            worst_match = pd.Series(dtype='float64', index=['Component RT','Compound Name','Formula','Match Factor'])

        #Set the search Boolean to True and the pass counter to 0
        search_tf = True
        counted_loops = 0
        #While loop to continue the search until a match is either found or ruled out
        while search_tf and counted_loops < 100:

            #For every row in the slice sorted by match factor..
            for index, row in test_slice.iterrows():
                #..If the formula meets the noBar criteria on the first pass, choose the row and break
                if donoBar(row['Formula'], noBar) and counted_loops == 0:
                    constSeries = concatSeries(row, best_match, worst_match)
                    constDF = concatDF(constSeries, constDF)
                    search_tf = False
                    break
                #..Otherwise, if the pass number falls within the list of softBar priorities,
                # determine whether the formula meets the softBar criteria at that priority
                # (<= rather than < so the lowest-priority pass is not skipped)
                elif 0 < counted_loops <= len(priorList):
                    #Try/except in case the formula is missing or the priority lookup fails
                    try:
                        if dosoftBar(row['Formula'], noBar, softBar, priorList[counted_loops-1]):
                            constSeries = concatSeries(row, best_match, worst_match)
                            constDF = concatDF(constSeries, constDF)
                            search_tf = False
                            break
                    except (IndexError, TypeError):
                        pass
                #..Otherwise, if every priority has been exhausted, add a "No Match" row
                # with formula NaN
                elif counted_loops > len(priorList):
                    constSeries = concatSeries(pd.Series({"Component RT": RTi, "Compound Name": "No Match",
                                                          "Match Factor": float('nan'), "Formula": float('nan')}),
                                               best_match, worst_match)
                    constDF = concatDF(constSeries, constDF)
                    search_tf = False
                    break

            #Count one while loop
            counted_loops += 1

    return constDF
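
#Example (editor's sketch, hypothetical data): the siloxane has the higher match
#factor, but the first (noBar) pass prefers the pure hydrocarbon at the same RT
_demoC = pd.DataFrame({'Component RT': [1.003, 1.003],
                       'Compound Name': ['Siloxane', 'Hexane'],
                       'Formula': ['C6H18OSi2', 'C6H14'],
                       'Match Factor': [97, 95]})
assert constrain(_demoC, [noBar, softBar]).loc[0, 'Compound Name'] == 'Hexane'
del _demoC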

#Function to save dataframes to .csv files
def outputCSV(constDF_Dict, file_directory, infilenames):
    #Create the names of exported files by inserting "_UPP" before the .csv extension
    outfilenames = [x[:x.index('.csv')] + '_UPP' + x[x.index('.csv'):] for x in infilenames]
    #Create the list of filepaths from the export directory + filename
    filepathList = [file_directory / name for name in outfilenames]
    #If the export directory does not exist, create it (done on the directory itself
    #so an empty file list does not raise an IndexError)
    file_directory.mkdir(parents=True, exist_ok=True)

    #For every filename, save a .csv
    for i in range(len(infilenames)):
        constDF_Dict[infilenames[i]].to_csv(filepathList[i])

    return None
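
#Example (editor's sketch, hypothetical name): the export name gains "_UPP"
#before the extension, e.g. "GasMS_UA.csv" -> "GasMS_UA_UPP.csv"
_n = "GasMS_UA.csv"
assert _n[:_n.index('.csv')] + '_UPP' + _n[_n.index('.csv'):] == "GasMS_UA_UPP.csv"
del _n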

""" CODE """

#Unpack all .csv files in the provided directory
print("[MAIN] Unpacking data from provided directory...")
UAData_raw = {}

for i in range(len(fileNames)):
    UAData_raw[fileNames[i]] = unpackUA(fileLoc[i])

print("[MAIN] Data unpacked.")

#Dictionaries for the filtered and constrained data for each file
filterDF_Dict = {}
constDF_Dict = {}
#Dictionary for the constrained data to be outputted (reserved; currently unused)
UAData_PP = {}
#For all files, run the constraint workflow
for i in range(len(fileNames)):
    #Group retention times
    print("[" + fileNames[i] + "] Grouping retention times...")
    filterDF = groupRT(UAData_raw[fileNames[i]])
    filterDF_Dict[fileNames[i]] = filterDF
    #Apply constraints
    print("[" + fileNames[i] + "] Applying compound constraints...")
    constDF = constrain(filterDF, [noBar, softBar])
    constDF_Dict[fileNames[i]] = constDF

#Save results
print("[MAIN] Saving results...")
outputCSV(constDF_Dict, expfileDir, fileNames)
print("[MAIN] Files saved to " + str(expfileDir))

#Complete program
print("[MAIN] Unknowns post processing finished.")