chromaquant 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,282 @@
1
+ """
2
+
3
+ COPYRIGHT STATEMENT:
4
+
5
+ ChromaQuant – A quantification software for complex gas chromatographic data
6
+
7
+ Copyright (c) 2024, by Julia Hancock
8
+ Affiliation: Dr. Julie Elaine Rorrer
9
+ URL: https://www.rorrerlab.com/
10
+
11
+ License: BSD 3-Clause License
12
+
13
+ ---
14
+
15
+ SUBPACKAGE FOR MATCHING FID AND MS PEAKS ACCORDING TO A PASSED MODEL
16
+
17
+ Julia Hancock
18
+ Started 12/10/2024
19
+
20
+ """
21
+
22
+ """ PACKAGES """
23
+ import pandas as pd
24
+
25
+ """ FUNCTIONS """
26
+
27
#Third order polynomial kept for testing: maps an FID RT to an estimated MS RT
def fit(FID_RT):
    """
    Parameters
    ----------
    FID_RT : Float
        FID retention time to evaluate the polynomial at

    Returns
    -------
    Float
        Value of the third order polynomial at FID_RT
    """
    return 0.0252*FID_RT**3 - 0.5274*FID_RT**2 + 4.8067*FID_RT - 3.0243
29
+
30
+ #Function that estimates unknown MS RT's and matches FID and MS peaks using a provided fit
31
def matchPeaks(fpmDF,mDF,fit,peakError=0.06):
    """
    Estimates an MS RT for each eligible FID peak using a provided fit and assigns
    the MS compound whose RT lies closest to that estimate.

    Rows that already have a compound name are skipped when their compound source
    is 'Manual' or blank; every other row is (re)matched.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. The columns 'FID RT',
        'Compound Name' and 'Compound Source' are read. Modified in place.
    mDF : DataFrame
        Dataframe containing MS info about identified compounds (UA_UPP). The columns
        'Component RT', 'Compound Name', 'Formula' and 'Match Factor' are read.
    fit : Function
        Function that returns an estimated MS RT with a passed FID RT
    peakError : Float, optional
        Allowable error between estimated MS RT's and actual MS RT's. The default is 0.06.

    Returns
    -------
    fpmDF : DataFrame
        The passed dataframe, with 'MS RT', 'Compound Name', 'Formula', 'Match Factor'
        and 'Compound Source' filled in for every row that found a match
    """

    def matchOne(fpmDF,fpmiter,fit,peakError):
        """
        Parameters
        ----------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        fpmiter : List
            List containing current index and row in fpmDF of interest in form [i,row]
        fit : Function
            Function that returns an estimated MS RT with a passed FID RT
        peakError : float
            Allowable error between estimated MS RT's and actual MS RT's

        Returns
        -------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        """

        #Unpack fpmDF iterating info (the index label is cast to int)
        fpmi = int(fpmiter[0])
        fpmrow = fpmiter[1]

        #Estimate an MS RT for the row's FID RT using the fit
        est_MSRT = fit(fpmrow['FID RT'])

        #Collect every real MS RT that lies within the allowed error of the estimate (bounds inclusive)
        inWindow = mDF['Component RT'].between(est_MSRT-peakError, est_MSRT+peakError)
        mDF_match = mDF.loc[inWindow]

        #If nothing is within error, leave the row untouched
        if mDF_match.empty:
            return fpmDF

        #Measure each candidate against the ESTIMATED MS RT so the closest candidate wins.
        #The error must depend on the candidate's own RT; ties resolve to the first candidate.
        rtError = (mDF_match['Component RT'] - est_MSRT).abs()
        best = rtError.values.argmin()

        #Add the MS info to the FIDpMS dataframe
        fpmDF.at[fpmi,'MS RT'] = mDF_match['Component RT'].iloc[best]
        fpmDF.at[fpmi,'Compound Name'] = mDF_match['Compound Name'].iloc[best]
        fpmDF.at[fpmi,'Formula'] = mDF_match['Formula'].iloc[best]
        fpmDF.at[fpmi,'Match Factor'] = mDF_match['Match Factor'].iloc[best]
        fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using a linear fit of manual peak assignments'

        return fpmDF

    #Loop through every row in the dataframe
    for i, row in fpmDF.iterrows():
        #A named row whose source is manual or blank is skipped
        named = not pd.isna(row['Compound Name'])
        if named and (row['Compound Source'] == 'Manual' or pd.isna(row['Compound Source'])):
            continue

        #Blank rows and automatically assigned rows get matched
        fpmDF = matchOne(fpmDF, [i,row], fit, peakError)

    return fpmDF
123
+
124
+ #Function that matches FID and MS peaks by their retention time
125
def matchRT(fpmDF,mDF,peakError=0.06):
    """
    Matches FID peaks to MS peaks by directly comparing their retention times.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info
    mDF : DataFrame
        Dataframe containing MS info about identified compounds (UA_UPP)
    peakError : Float, optional
        Allowable error between FID RT's and actual MS RT's. The default is 0.06.

    Returns
    -------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info
    """

    def matchOne(fpmDF,fpmiter,peakError):
        """
        Parameters
        ----------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        fpmiter : List
            List containing current index and row in fpmDF of interest in form [i,row]
        peakError : float
            Allowable error between FID RT's and actual MS RT's

        Returns
        -------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        """

        #Pull the target index and the FID RT out of the iterating info
        rowLabel = int(fpmiter[0])
        fidRT = fpmiter[1]['FID RT']

        #Keep only the MS entries whose RT falls inside the inclusive error window around the FID RT
        msRT = mDF['Component RT']
        candidates = mDF.loc[msRT.between(fidRT-peakError, fidRT+peakError)]

        #No candidate within error means nothing to assign
        if len(candidates) == 0:
            return fpmDF

        #Among the candidates take the one with the smallest distance to the FID RT (first one on ties)
        distance = (fidRT - candidates['Component RT']).abs()
        pick = distance.values.argmin()

        #Copy the chosen MS entry into the FIDpMS dataframe
        fpmDF.at[rowLabel,'MS RT'] = candidates['Component RT'].iloc[pick]
        for column in ('Compound Name','Formula','Match Factor'):
            fpmDF.at[rowLabel,column] = candidates[column].iloc[pick]
        fpmDF.at[rowLabel,'Compound Source'] = 'Automatically assigned by comparing FID and MS retention times'

        return fpmDF

    #Walk through every FID peak
    for label, peak in fpmDF.iterrows():
        #Named peaks that were assigned manually, or have no recorded source, are left alone
        hasName = not pd.isna(peak['Compound Name'])
        if hasName and (peak['Compound Source'] == 'Manual' or pd.isna(peak['Compound Source'])):
            continue

        #Everything else (unnamed or automatically assigned) is matched
        fpmDF = matchOne(fpmDF, [label,peak], peakError)

    return fpmDF
210
+
211
+ #Function that performs a subset of speculative labeling, using known peaks hard-coded in a file gasPairs_FIDpMS.csv
212
def matchKnownPeaks(fpmDF,mDF,gp_rsc):
    """
    Labels FID peaks using known peaks listed in a gas pairs resource.

    A peak is labelled with the first resource entry whose 'Average FID RT' lies
    within two times its 'Stdev FID RT' of the peak's 'FID RT'. Rows that already
    have a compound name are skipped when their compound source is 'Manual' or blank.

    Parameters
    ----------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info. Modified in place.
    mDF : DataFrame
        Dataframe containing MS info about identified compounds. Not used by this
        function; kept so the signature stays unchanged.
    gp_rsc : DataFrame
        Dataframe containing opened gasPairs resource. The columns 'Species', 'Formula',
        'Average FID RT' and 'Stdev FID RT' are read. May be empty.

    Returns
    -------
    fpmDF : DataFrame
        Dataframe containing FID and MS peak info
    """

    def matchOne(fpmDF,fpmiter,gp_rsc):
        """
        Parameters
        ----------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        fpmiter : List
            List containing current index and row in fpmDF of interest in form [i,row]
        gp_rsc : DataFrame
            Dataframe containing opened gasPairs resource.

        Returns
        -------
        fpmDF : DataFrame
            Dataframe containing FID and MS peak info
        """

        #Unpack fpmDF iterating info (the index label is cast to int)
        fpmi = int(fpmiter[0])
        fidRT = fpmiter[1]['FID RT']

        #Search the gasPairs resource to see if any known peaks/RT's match the FID peak.
        #Assigning inside the loop means an empty resource simply leaves the row untouched.
        for _, gp_row in gp_rsc.iterrows():
            #Define error as two times the standard deviation for the FID RT in the resource
            gp_error = gp_row['Stdev FID RT']*2
            #Extract the FID RT from the resource
            gp_FIDRT = gp_row['Average FID RT']

            #NOTE: prefers the first match, even if the next match is closer
            if gp_FIDRT - gp_error <= fidRT <= gp_FIDRT + gp_error:
                #Add the resource match info to the FIDpMS dataframe
                fpmDF.at[fpmi,'Compound Name'] = gp_row['Species']
                fpmDF.at[fpmi,'Formula'] = gp_row['Formula']
                fpmDF.at[fpmi,'Compound Source'] = 'Automatically assigned using gas pairs provided in resources'
                break

        return fpmDF

    #Loop through every row in the dataframe
    for i, row in fpmDF.iterrows():
        #A named row whose source is manual or blank is skipped
        named = not pd.isna(row['Compound Name'])
        if named and (row['Compound Source'] == 'Manual' or pd.isna(row['Compound Source'])):
            continue

        #Blank rows and automatically assigned rows get matched
        fpmDF = matchOne(fpmDF, [i,row], gp_rsc)

    return fpmDF
@@ -0,0 +1,259 @@
1
+ """
2
+
3
+ COPYRIGHT STATEMENT:
4
+
5
+ ChromaQuant – A quantification software for complex gas chromatographic data
6
+
7
+ Copyright (c) 2024, by Julia Hancock
8
+ Affiliation: Dr. Julie Elaine Rorrer
9
+ URL: https://www.rorrerlab.com/
10
+
11
+ License: BSD 3-Clause License
12
+
13
+ ---
14
+
15
+ SUBPACKAGE FOR POSTPROCESSING AFTER MATCHING
16
+
17
+ Julia Hancock
18
+ Started 12/10/2024
19
+
20
+ """
21
+
22
+ """ PACKAGES """
23
+ import pandas as pd
24
+
25
+ """ FUNCTIONS """
26
+
27
+ #Function that performs compound type abbreviation assignment
28
def ctaAssign(importDF, contains, keyLoop, elementExclude):
    """
    Assigns a compound type abbreviation to rows of a dataframe.

    Parameters
    ----------
    importDF : DataFrame
        Dataframe with 'Compound Name' and 'Formula' columns. Modified in place:
        results are written to 'Compound Type Abbreviation'.
    contains : Dict
        Maps each compound type abbreviation to a list of substrings searched for
        in the lowercased compound name
    keyLoop : List
        Keys of contains in the order they are tried; the first key with a hit wins
    elementExclude : List
        Substrings that, when found in the formula, force the abbreviation 'O'

    Returns
    -------
    importDF : DataFrame
        The passed dataframe with abbreviations filled in
    """

    #Helper that reports whether any of the fragments occurs inside text
    def hasAny(text,fragments):
        return any(fragment in text for fragment in fragments)

    #Helper that returns the first abbreviation whose substrings hit the name, 'O' when none do
    def assignType(compoundName,contains,keyLoop):
        hits = (key for key in keyLoop if hasAny(compoundName,contains[key]))
        return next(hits,'O')

    #Go through the dataframe one entry at a time
    for label, entry in importDF.iterrows():

        #Read the name and formula of the entry
        name = entry['Compound Name']
        formula = entry['Formula']

        #Entries without a string formula are left untouched
        if not isinstance(formula,str):
            continue

        #A formula with an excluded element is always 'O'
        if hasAny(formula,elementExclude):
            importDF.at[label,'Compound Type Abbreviation'] = 'O'
            continue

        #Otherwise classify by the lowercased name, provided the name is a string
        if isinstance(name,str):
            importDF.at[label,'Compound Type Abbreviation'] = assignType(name.lower(), contains, keyLoop)

    return importDF
120
+
121
+
122
+ #Define function that loops through every row in a DataFrame and modifies rows with duplicate compounds
123
def duplicateHandle(DF):
    """
    Renames duplicate compounds so that each compound name is kept by one row only.

    For every compound name that occurs more than once, the row with the largest
    'FID Area' keeps its name and every other row is renamed to 'Isomer of <name>'.
    When the kept row has compound type abbreviation 'L', the renamed rows are set
    to 'B'. Rows named 'No Match' or 'No match', and rows with a blank name, are
    never treated as duplicates.

    Parameters
    ----------
    DF : DataFrame
        Dataframe with 'Compound Name', 'FID Area' and 'Compound Type Abbreviation'
        columns. Not modified.

    Returns
    -------
    DF_done : DataFrame
        Copy of DF with duplicates renamed
    """

    #Names that mark an unmatched peak; both spellings are skipped
    noMatchNames = ('No Match', 'No match')

    #Work on a copy so the caller's dataframe is left untouched
    DF_done = DF.copy()

    #Compound names already handled
    handled = set()

    #Go through the names as they appear in the original dataframe
    for cmp_name in DF['Compound Name']:

        #Blank names, unmatched placeholders and names already handled are skipped
        if pd.isna(cmp_name) or cmp_name in noMatchNames or cmp_name in handled:
            continue
        handled.add(cmp_name)

        #Find the rows that currently carry this compound name
        DF_search = DF_done[DF_done['Compound Name'] == cmp_name]

        #A single row is not a duplicate
        if len(DF_search) < 2:
            continue

        #The row with the largest area keeps the name
        maxIndex = DF_search['FID Area'].idxmax()
        max_type = DF_search.at[maxIndex,'Compound Type Abbreviation']

        #Every remaining row is renamed to 'Isomer of..'
        others = DF_search.index.drop(maxIndex)
        DF_done.loc[others,'Compound Name'] = 'Isomer of ' + cmp_name

        #If the kept row is a linear alkane, the renamed rows become branched alkanes
        if max_type == 'L':
            DF_done.loc[others,'Compound Type Abbreviation'] = 'B'

    return DF_done
@@ -9,4 +9,5 @@ Created 10-19-2024
9
9
 
10
10
  """
11
11
 
12
- from .AutoFpmMatch import main_AutoFpmMatch
12
+ from .AutoFpmMatch import main_AutoFpmMatch
13
+ from .matchMain import mainMatch