chromaquant 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chromaquant/Handle/__init__.py +2 -1
- chromaquant/Handle/fileChecks.py +172 -0
- chromaquant/Handle/handleDirectories.py +1 -1
- chromaquant/Hydro/__init__.py +12 -0
- chromaquant/Hydro/hydroMain.py +496 -0
- chromaquant/Match/AutoFpmMatch.py +48 -48
- chromaquant/Match/MatchSub/__init__.py +13 -0
- chromaquant/Match/MatchSub/matchTools.py +282 -0
- chromaquant/Match/MatchSub/peakTools.py +259 -0
- chromaquant/Match/__init__.py +2 -1
- chromaquant/Match/matchMain.py +233 -0
- chromaquant/Quant/QuantSub/__init__.py +15 -0
- chromaquant/Quant/QuantSub/gasFID.py +241 -0
- chromaquant/Quant/QuantSub/gasTCD.py +425 -0
- chromaquant/Quant/QuantSub/liquidFID.py +310 -0
- chromaquant/Quant/QuantSub/parseTools.py +162 -0
- chromaquant/Quant/__init__.py +1 -1
- chromaquant/Quant/quantMain.py +417 -0
- chromaquant/UAPP/__init__.py +12 -0
- chromaquant/UAPP/uappMain.py +427 -0
- chromaquant/__main__.py +426 -393
- chromaquant/oldui.py +492 -0
- chromaquant/properties.json +1 -1
- {chromaquant-0.3.1.dist-info → chromaquant-0.4.0.dist-info}/METADATA +3 -3
- chromaquant-0.4.0.dist-info/RECORD +38 -0
- {chromaquant-0.3.1.dist-info → chromaquant-0.4.0.dist-info}/WHEEL +1 -1
- chromaquant-0.3.1.dist-info/RECORD +0 -22
- {chromaquant-0.3.1.dist-info → chromaquant-0.4.0.dist-info}/entry_points.txt +0 -0
- {chromaquant-0.3.1.dist-info → chromaquant-0.4.0.dist-info}/licenses/LICENSE.txt +0 -0
- {chromaquant-0.3.1.dist-info → chromaquant-0.4.0.dist-info}/licenses/LICENSES_bundled.txt +0 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""
|
|
2
|
+
|
|
3
|
+
COPYRIGHT STATEMENT:
|
|
4
|
+
|
|
5
|
+
ChromaQuant – A quantification software for complex gas chromatographic data
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2024, by Julia Hancock
|
|
8
|
+
Affiliation: Dr. Julie Elaine Rorrer
|
|
9
|
+
URL: https://www.rorrerlab.com/
|
|
10
|
+
|
|
11
|
+
License: BSD 3-Clause License
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
SUBPACKAGE FOR MATCHING FID AND MS PEAKS ACCORDING TO A PASSED MODEL
|
|
16
|
+
|
|
17
|
+
Julia Hancock
|
|
18
|
+
Started 12/10/2024
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
""" PACKAGES """
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
""" FUNCTIONS """
|
|
26
|
+
|
|
27
|
+
#Third order function for testing
|
|
28
|
+
def fit(FID_RT):
    """
    Third order polynomial, kept for testing, that maps an FID RT to an estimated MS RT.

    Parameters
    ----------
    FID_RT : Float
        FID retention time. Anything supporting *, ** and +/- with floats works.

    Returns
    -------
    Float
        0.0252*FID_RT**3 - 0.5274*FID_RT**2 + 4.8067*FID_RT - 3.0243
    """
    return 0.0252*FID_RT**3 - 0.5274*FID_RT**2 + 4.8067*FID_RT - 3.0243
|
|
29
|
+
|
|
30
|
+
#Function that estimates unknown MS RT's and matches FID and MS peaks using a provided fit
|
|
31
|
+
def matchPeaks(fpmDF,mDF,fit,peakError=0.06):
    """
    Match FID peaks to MS peaks using MS retention times estimated by a fit.

    For every eligible row in fpmDF, the row's 'FID RT' is passed to fit to
    estimate an MS RT. Rows in mDF whose 'Component RT' lies within peakError
    of that estimate (bounds inclusive) are candidates. The candidate whose
    'Component RT' is closest to the estimate is copied into the fpmDF row;
    ties go to the candidate appearing first in mDF.

    A row is skipped when it already has a compound name and its compound
    source is 'Manual' or blank. Rows without any candidate are left as-is.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. Columns read: 'FID RT',
        'Compound Name', 'Compound Source'. Columns written: 'MS RT',
        'Compound Name', 'Formula', 'Match Factor', 'Compound Source'.
    mDF : DataFrame
        Dataframe containing MS info about identified compounds (UA_UPP).
        Columns read: 'Component RT', 'Compound Name', 'Formula', 'Match Factor'.
    fit : Function
        Function that returns an estimated MS RT with a passed FID RT
    peakError : Float, optional
        Allowable error between estimated MS RT's and actual MS RT's. The default is 0.06.

    Returns
    -------
    fpmDF : DataFrame
        The passed dataframe, modified in place, containing FID and MS peak info
    """

    #The MS retention times do not change between rows, so look them up once
    component_RT = mDF['Component RT']

    #Loop through every row in the dataframe
    for fpmi, fpmrow in fpmDF.iterrows():

        #Skip rows that already have a name whose source is manual or blank
        #(the blank check comes first so a missing value is never compared to a string)
        source = fpmrow['Compound Source']
        if not pd.isna(fpmrow['Compound Name']) and (pd.isna(source) or source == 'Manual'):
            continue

        #Estimate an MS RT for the row's FID RT using the fit
        est_MSRT = fit(fpmrow['FID RT'])

        #Compare the estimated MS RT to all real MS RT's, seeing if there is a match within error
        in_window = (component_RT >= est_MSRT-peakError) & (component_RT <= est_MSRT+peakError)
        if not in_window.any():
            continue
        mDF_match = mDF.loc[in_window]

        #Select the candidate with the smallest error between its actual MS RT and the estimate
        #(argmin returns the first minimum, so ties keep the earliest candidate)
        RT_error = (mDF_match['Component RT'] - est_MSRT).abs()
        best = RT_error.to_numpy().argmin()

        #Add the MS info to the FIDpMS dataframe
        fpmDF.at[fpmi,'MS RT'] = mDF_match['Component RT'].iloc[best]
        fpmDF.at[fpmi,'Compound Name'] = mDF_match['Compound Name'].iloc[best]
        fpmDF.at[fpmi,'Formula'] = mDF_match['Formula'].iloc[best]
        fpmDF.at[fpmi,'Match Factor'] = mDF_match['Match Factor'].iloc[best]
        fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using a linear fit of manual peak assignments'

    return fpmDF
|
|
123
|
+
|
|
124
|
+
#Function that matches FID and MS peaks by their retention time
|
|
125
|
+
def matchRT(fpmDF,mDF,peakError=0.06):
    """
    Match FID peaks to MS peaks by directly comparing their retention times.

    For every eligible row in fpmDF, rows in mDF whose 'Component RT' lies
    within peakError of the row's 'FID RT' (bounds inclusive) are candidates.
    The candidate whose 'Component RT' is closest to the FID RT is copied into
    the fpmDF row; ties go to the candidate appearing first in mDF.

    A row is skipped when it already has a compound name and its compound
    source is 'Manual' or blank. Rows without any candidate are left as-is.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. Columns read: 'FID RT',
        'Compound Name', 'Compound Source'. Columns written: 'MS RT',
        'Compound Name', 'Formula', 'Match Factor', 'Compound Source'.
    mDF : DataFrame
        Dataframe containing MS info about identified compounds (UA_UPP).
        Columns read: 'Component RT', 'Compound Name', 'Formula', 'Match Factor'.
    peakError : Float, optional
        Allowable error between FID RT's and actual MS RT's. The default is 0.06.

    Returns
    -------
    fpmDF : DataFrame
        The passed dataframe, modified in place, containing FID and MS peak info
    """

    #The MS retention times do not change between rows, so look them up once
    component_RT = mDF['Component RT']

    #Loop through every row in the dataframe
    for fpmi, fpmrow in fpmDF.iterrows():

        #Skip rows that already have a name whose source is manual or blank
        #(the blank check comes first so a missing value is never compared to a string)
        source = fpmrow['Compound Source']
        if not pd.isna(fpmrow['Compound Name']) and (pd.isna(source) or source == 'Manual'):
            continue

        #Compare the FID RT to the MS RT, collecting all matches within the specified peak error
        FID_RT = fpmrow['FID RT']
        in_window = (component_RT >= FID_RT-peakError) & (component_RT <= FID_RT+peakError)
        if not in_window.any():
            continue
        mDF_match = mDF.loc[in_window]

        #Select the candidate with the smallest error from the FID RT
        #(argmin returns the first minimum, so ties keep the earliest candidate)
        RT_error = (FID_RT - mDF_match['Component RT']).abs()
        best = RT_error.to_numpy().argmin()

        #Add the MS info to the FIDpMS dataframe
        fpmDF.at[fpmi,'MS RT'] = mDF_match['Component RT'].iloc[best]
        fpmDF.at[fpmi,'Compound Name'] = mDF_match['Compound Name'].iloc[best]
        fpmDF.at[fpmi,'Formula'] = mDF_match['Formula'].iloc[best]
        fpmDF.at[fpmi,'Match Factor'] = mDF_match['Match Factor'].iloc[best]
        fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned by comparing FID and MS retention times'

    return fpmDF
|
|
210
|
+
|
|
211
|
+
#Function that performs a subset of speculative labeling, using known peaks hard-coded in a file gasPairs_FIDpMS.csv
|
|
212
|
+
def matchKnownPeaks(fpmDF,mDF,gp_rsc):
    """
    Label FID peaks using known peaks listed in the gasPairs resource.

    For every eligible row in fpmDF, the row's 'FID RT' is compared with each
    resource entry. An entry matches when the FID RT lies within two times the
    entry's 'Stdev FID RT' of its 'Average FID RT' (bounds inclusive).

    NOTE: prefers the first matching entry, even if a later entry is closer.

    A row is skipped when it already has a compound name and its compound
    source is 'Manual' or blank. Rows without a matching entry, and every row
    when the resource is empty, are left as-is.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. Columns read: 'FID RT',
        'Compound Name', 'Compound Source'. Columns written: 'Compound Name',
        'Formula', 'Compound Source'.
    mDF : DataFrame
        Dataframe containing MS info about identified compounds (UA_UPP).
        Not used by this function; kept so the signature stays unchanged.
    gp_rsc : DataFrame
        Dataframe containing opened gasPairs resource. Columns read:
        'Average FID RT', 'Stdev FID RT', 'Species', 'Formula'.

    Returns
    -------
    fpmDF : DataFrame
        The passed dataframe, modified in place, containing FID and MS peak info
    """

    #Define error as two times the standard deviation for the FID RT in the gasPairs resource
    #The bounds are the same for every FID peak, so compute them once
    gp_error = gp_rsc['Stdev FID RT']*2
    gp_lower = gp_rsc['Average FID RT'] - gp_error
    gp_upper = gp_rsc['Average FID RT'] + gp_error

    #Loop through every row in the dataframe
    for fpmi, fpmrow in fpmDF.iterrows():

        #Skip rows that already have a name whose source is manual or blank
        #(the blank check comes first so a missing value is never compared to a string)
        source = fpmrow['Compound Source']
        if not pd.isna(fpmrow['Compound Name']) and (pd.isna(source) or source == 'Manual'):
            continue

        #Search the gasPairs resource to see if any known peaks/RT's match the FID peak
        FID_RT = fpmrow['FID RT']
        in_bounds = (FID_RT >= gp_lower) & (FID_RT <= gp_upper)

        #An empty resource gives an empty mask, so this also covers the no-entries case
        if not in_bounds.any():
            continue

        #argmax on a boolean array returns the position of the first True, i.e. the first match
        first = in_bounds.to_numpy().argmax()

        #Add the resource match info to the FIDpMS dataframe
        fpmDF.at[fpmi,'Compound Name'] = gp_rsc['Species'].iloc[first]
        fpmDF.at[fpmi,'Formula'] = gp_rsc['Formula'].iloc[first]
        fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using gas pairs provided in resources'

    return fpmDF
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""
|
|
2
|
+
|
|
3
|
+
COPYRIGHT STATEMENT:
|
|
4
|
+
|
|
5
|
+
ChromaQuant – A quantification software for complex gas chromatographic data
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2024, by Julia Hancock
|
|
8
|
+
Affiliation: Dr. Julie Elaine Rorrer
|
|
9
|
+
URL: https://www.rorrerlab.com/
|
|
10
|
+
|
|
11
|
+
License: BSD 3-Clause License
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
SUBPACKAGE FOR POSTPROCESSING AFTER MATCHING
|
|
16
|
+
|
|
17
|
+
Julia Hancock
|
|
18
|
+
Started 12/10/2024
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
""" PACKAGES """
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
""" FUNCTIONS """
|
|
26
|
+
|
|
27
|
+
#Function that performs compound type abbreviation assignment
|
|
28
|
+
def ctaAssign(importDF, contains, keyLoop, elementExclude):
    """
    Assign a compound type abbreviation (CTA) to every row of a dataframe.

    Rows whose 'Formula' is not a string are left untouched. Otherwise, a row
    whose formula contains any entry of elementExclude gets 'O'. The remaining
    rows are assigned by name: the compound name is lowercased and the first
    key in keyLoop with at least one of its substrings in the name is used,
    defaulting to 'O'. Rows with a string formula but a non-string compound
    name are left untouched.

    Parameters
    ----------
    importDF : DataFrame
        Dataframe with 'Compound Name' and 'Formula' columns. The result is
        written to its 'Compound Type Abbreviation' column.
    contains : Dict
        Maps each compound type abbreviation to the substrings that identify
        it. Names are lowercased before the search, so substrings should be
        lowercase to be found.
    keyLoop : List
        Keys of contains in the order they are tried; the first hit wins.
    elementExclude : List
        Strings searched for in the formula. This is a plain substring test,
        so an entry also matches inside a longer element symbol.

    Returns
    -------
    importDF : DataFrame
        The passed dataframe, modified in place
    """

    #Function that returns a compound type abbreviation corresponding to a lowercased compound name
    def assignType(compoundName):
        #Loop through every key (compound type abbreviation) in the requested order
        for key in keyLoop:
            #If at least one substring in the key's list is found in compoundName, use that key
            if any(substring in compoundName for substring in contains[key]):
                return key
        #Default compound type abbreviation is 'O'
        return 'O'

    #For every entry in the dataframe, assign a compound type abbreviation
    for i, row in importDF.iterrows():

        #Retrieve compound name and formula from row entry
        compoundName = row['Compound Name']
        compoundFormula = row['Formula']

        #Rows without a string formula (e.g. blank entries) are not assigned
        if not isinstance(compoundFormula,str):
            continue

        #If the formula contains excluded elements, assign 'O'
        if any(element in compoundFormula for element in elementExclude):
            importDF.at[i,'Compound Type Abbreviation'] = 'O'

        #Otherwise, if the compound name is a string, assign by name
        elif isinstance(compoundName,str):
            importDF.at[i,'Compound Type Abbreviation'] = assignType(compoundName.lower())

    return importDF
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
#Define function that loops through every row in a DataFrame and modifies rows with duplicate compounds
|
|
123
|
+
def duplicateHandle(DF):
    """
    Relabel rows that share a compound name so only one row keeps each name.

    For every compound name that appears in more than one row, the row with
    the largest 'FID Area' keeps its name. Every other row with that name is
    renamed to 'Isomer of <name>'. If the kept row's compound type
    abbreviation is 'L', the renamed rows are also set to 'B'.

    Rows named 'No Match' or 'No match', and rows with a blank name, are never
    treated as duplicates.

    Parameters
    ----------
    DF : DataFrame
        Dataframe with 'Compound Name', 'FID Area' and
        'Compound Type Abbreviation' columns. It is not modified.

    Returns
    -------
    DF_done : DataFrame
        A copy of DF with the duplicate rows relabeled
    """

    #Names that mark an unmatched peak and must not be grouped as duplicates
    unmatched_names = ('No Match', 'No match')

    #Define a set of compound names already handled
    cmp_names_handled = set()

    #Work on a copy so the argument DataFrame is left untouched
    DF_done = DF.copy()

    #For every compound name in the provided DataFrame, in row order
    for cmp_name in DF['Compound Name']:

        #Skip blank names, unmatched markers and names that were already handled
        if pd.isna(cmp_name) or cmp_name in unmatched_names or cmp_name in cmp_names_handled:
            continue

        #Add the compound name to the handled names
        cmp_names_handled.add(cmp_name)

        #Search the working DataFrame for rows with this compound name
        DF_search = DF_done[DF_done['Compound Name'] == cmp_name]

        #A single row means there is no duplicate to resolve
        if len(DF_search) < 2:
            continue

        #Get the index of the row with the largest area, which keeps its name
        max_index = DF_search['FID Area'].idxmax()

        #Get the compound type of that row
        max_type = DF_search.at[max_index,'Compound Type Abbreviation']

        #Get the indices of the remaining rows
        other_index = DF_search.index.drop(max_index)

        #Rename the remaining rows to 'Isomer of..'
        DF_done.loc[other_index,'Compound Name'] = 'Isomer of ' + cmp_name

        #If the kept row is a linear alkane, set the remaining rows to branched alkanes
        if max_type == 'L':
            DF_done.loc[other_index,'Compound Type Abbreviation'] = 'B'

    return DF_done
|
chromaquant/Match/__init__.py
CHANGED