chromaquant 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. chromaquant/__init__.py +9 -2
  2. chromaquant/data/__init__.py +14 -0
  3. chromaquant/data/breakdown.py +430 -0
  4. chromaquant/data/dataset.py +195 -0
  5. chromaquant/data/table.py +412 -0
  6. chromaquant/data/value.py +215 -0
  7. chromaquant/formula/__init__.py +13 -0
  8. chromaquant/formula/base_formulas.py +168 -0
  9. chromaquant/formula/formula.py +507 -0
  10. chromaquant/import_local_packages.py +55 -0
  11. chromaquant/logging_and_handling.py +76 -0
  12. chromaquant/match/__init__.py +13 -0
  13. chromaquant/match/match.py +184 -0
  14. chromaquant/match/match_config.py +296 -0
  15. chromaquant/match/match_tools.py +154 -0
  16. chromaquant/{Handle → results}/__init__.py +2 -2
  17. chromaquant/results/reporting_tools.py +190 -0
  18. chromaquant/results/results.py +250 -0
  19. chromaquant/utils/__init__.py +14 -0
  20. chromaquant/utils/categories.py +127 -0
  21. chromaquant/utils/chemical_formulas.py +104 -0
  22. chromaquant/utils/dataframe_processing.py +222 -0
  23. chromaquant/utils/file_tools.py +100 -0
  24. chromaquant/utils/formula_tools.py +119 -0
  25. chromaquant-0.5.0.dist-info/METADATA +61 -0
  26. chromaquant-0.5.0.dist-info/RECORD +29 -0
  27. {chromaquant-0.3.1.dist-info → chromaquant-0.5.0.dist-info}/WHEEL +1 -1
  28. {chromaquant-0.3.1.dist-info → chromaquant-0.5.0.dist-info}/licenses/LICENSE.txt +1 -1
  29. chromaquant-0.5.0.dist-info/licenses/LICENSES_bundled.txt +251 -0
  30. chromaquant/Handle/handleDirectories.py +0 -89
  31. chromaquant/Manual/HydroUI.py +0 -418
  32. chromaquant/Manual/QuantUPP.py +0 -373
  33. chromaquant/Manual/Quantification.py +0 -1305
  34. chromaquant/Manual/__init__.py +0 -10
  35. chromaquant/Manual/duplicateMatch.py +0 -211
  36. chromaquant/Manual/fpm_match.py +0 -798
  37. chromaquant/Manual/label-type.py +0 -179
  38. chromaquant/Match/AutoFpmMatch.py +0 -1133
  39. chromaquant/Match/__init__.py +0 -12
  40. chromaquant/Quant/AutoQuantification.py +0 -1329
  41. chromaquant/Quant/__init__.py +0 -12
  42. chromaquant/__main__.py +0 -493
  43. chromaquant/properties.json +0 -4
  44. chromaquant-0.3.1.dist-info/METADATA +0 -189
  45. chromaquant-0.3.1.dist-info/RECORD +0 -22
  46. chromaquant-0.3.1.dist-info/entry_points.txt +0 -2
  47. chromaquant-0.3.1.dist-info/licenses/LICENSES_bundled.txt +0 -1035
chromaquant/Manual/QuantUPP.py
@@ -1,373 +0,0 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
-
- COPYRIGHT STATEMENT:
-
- ChromaQuant – A quantification software for complex gas chromatographic data
-
- Copyright (c) 2024, by Julia Hancock
- Affiliation: Dr. Julie Elaine Rorrer
- URL: https://www.rorrerlab.com/
-
- License: BSD 3-Clause License
-
- ---
-
- UNKNOWNS ANALYSIS POST PROCESSING
- Intended to sort through raw UA output to find best hits considering
- compound constraints.
-
- Julia Hancock
- 08/25/2023
-
- First version (v1) completion: 08/31/2023
-
- Improvement notes: -Add places to throw errors and redirect the user through the console when user-inputted data goes wrong
-                    -Add GUI
-                    -Separate functions into packages, redesign nested function trees
-                    -Check if saving data will cause an overwrite - if it does, add an additional suffix
- """
-
- """ PACKAGES """
-
- import pandas as pd
- import numpy as np
- import os
- from pathlib import Path
- import re
-
- """ PARAMETERS """
-
- #DIRECTORIES
- #Name of folder to search for files
- infolder_name = "MBPR025-033_GasMS_UA_Comp"
- #Name of folder to export results
- outfolder_name = infolder_name + "_UPP"
- #PARAMETERS
- #Limit within which two peak RTs are considered identical
- PeakRTLim = 0.005
-
- """ DIRECTORIES """
-
- #Find main file's path and its parent directory
- main_filepath = Path(__file__)
- main_directory = main_filepath.parent
- import_directory = main_directory / "Imports"
- export_directory = main_directory / "Exports"
-
- #Directory for finding relevant files
- fileDir = import_directory / infolder_name
- #Directory for exporting constrained data
- expfileDir = export_directory / outfolder_name
-
- #Create parallel lists of full paths and bare filenames for each file in the provided directory
- fileLoc = []
- fileNames = []
- for path, subdirs, files in os.walk(fileDir):
-     for name in files:
-         fileLoc.append(os.path.join(path, name))
-         fileNames.append(name)
-
- """ COMPOUND CONSTRAINTS """
- #Establish lists for two levels of element restrictions:
- #1st list (softBar) - elements that will be allowed in a compound match only if there are no
- #                     matches containing exclusively allowed elements. When the time comes, the
- #                     list will be searched for matches in order of priority
- #2nd list (noBar) - elements that will always be allowed
-
- #Class for elements to enable compound constraints
- class element:
-     def __init__(self, Symbol, Name, Priority=float("nan")):
-         #Element must always have a symbol and a name
-         self.Symbol = Symbol
-         self.Name = Name
-         #Priority is optional - a positive integer allowing more precise control over choosing
-         #compound matches. NaN never compares equal to anything (including itself), so the
-         #attribute is set only when a valid positive integer is passed.
-         if isinstance(Priority, int) and Priority > 0:
-             self.Priority = Priority
-
- #softBar list of semi-allowed elements
- #softBar = [element("O","Oxygen",1),element("N","Nitrogen",2),element("Si","Silicon",4)]
- softBar = [element("O","Oxygen",1),element("Si","Silicon",4)]
- #noBar list of allowed elements
- noBar = [element("H","Hydrogen"),element("C","Carbon")]
-
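Taken together, noBar and softBar implement a staged allow-list: a candidate formula is accepted outright if it contains only noBar elements, and otherwise the search is retried with softBar elements admitted in order of increasing priority number. A minimal, self-contained sketch of that escalation (the symbol sets and example formulas here are illustrative, not taken from the package):

import sys  # stdlib only; no package imports needed for this sketch

#Minimal sketch of the staged allow-list, independent of the classes above
no_bar_symbols = {"H", "C"}
soft_bar_priority = {"O": 1, "Si": 4}

def passes(symbols, priority=None):
    #Admit noBar symbols always, plus softBar symbols at or below the given priority
    allowed = set(no_bar_symbols)
    if priority is not None:
        allowed |= {s for s, p in soft_bar_priority.items() if p <= priority}
    return not symbols - allowed

print(passes({"C", "H"}))           # True  - pure hydrocarbon, first pass
print(passes({"C", "H", "O"}))      # False - O is blocked on the first pass
print(passes({"C", "H", "O"}, 1))   # True  - O admitted at priority 1
print(passes({"C", "H", "Si"}, 1))  # False - Si requires priority 4
print(passes({"C", "H", "Si"}, 4))  # True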
- """ FUNCTIONS """
- #Function to unpack a .csv file
- def unpackUA(filepath):
-     Df = pd.read_csv(filepath)
-     return Df
-
- #Function to add match data to a dataframe
- def concatDF(dataSlice, DFin):
-     #Assumes DFin provides the ten output columns listed below
-     #Also assumes dataSlice will contain at least these same entries
-     col = ['Component RT','Compound Name','Formula','Match Factor','Previous Best Compound Name',
-            'Previous Best Formula','Previous Best Match Factor','Previous Worst Compound Name',
-            'Previous Worst Formula','Previous Worst Match Factor']
-     listOut = [dataSlice[col[i]] for i in range(len(col))]
-     DFout = pd.concat([pd.DataFrame([listOut], columns=DFin.columns), DFin], ignore_index=True)
-
-     return DFout
-
- #Function to append the best- and worst-match-factor series to a selected match series
- def concatSeries(dataSlice, bestSlice, worstSlice):
-     #Assumes all Series have these indices: ['Component RT','Compound Name','Formula','Match Factor']
-
-     #Define dictionaries of new index names for bestSlice and worstSlice
-     bindex = {'Component RT':'Previous Best Component RT','Compound Name':'Previous Best Compound Name',
-               'Formula':'Previous Best Formula','Match Factor':'Previous Best Match Factor'}
-     windex = {'Component RT':'Previous Worst Component RT','Compound Name':'Previous Worst Compound Name',
-               'Formula':'Previous Worst Formula','Match Factor':'Previous Worst Match Factor'}
-
-     #Rename bestSlice and worstSlice indices on copies to avoid mutating the caller's data
-     bestSlice = bestSlice.copy()
-     worstSlice = worstSlice.copy()
-     bestSlice.rename(index=bindex, inplace=True)
-     worstSlice.rename(index=windex, inplace=True)
-
-     #Lists of indices from the best/worst slices to add to dataSlice
-     bindexList = ['Previous Best Compound Name','Previous Best Formula','Previous Best Match Factor']
-     windexList = ['Previous Worst Compound Name','Previous Worst Formula','Previous Worst Match Factor']
-
-     #Define the return series
-     returnSlice = pd.concat([dataSlice, bestSlice.loc[bindexList], worstSlice.loc[windexList]], axis=0)
-     return returnSlice
-
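concatSeries leans on pandas.Series.rename with an index mapping, which relabels entries without moving data, and on pd.concat along axis 0 to splice the relabeled entries onto the selected row. A standalone illustration with made-up values:

import pandas as pd

row = pd.Series({'Component RT': 1.234, 'Compound Name': 'hexane',
                 'Formula': 'C6H14', 'Match Factor': 95.2})
#Relabel so the best match's fields do not collide with the selected row's fields
best = row.rename(index={'Compound Name': 'Previous Best Compound Name',
                         'Formula': 'Previous Best Formula',
                         'Match Factor': 'Previous Best Match Factor'})
merged = pd.concat([row, best.loc[['Previous Best Compound Name',
                                   'Previous Best Formula',
                                   'Previous Best Match Factor']]], axis=0)
print(list(merged.index))
#['Component RT', 'Compound Name', 'Formula', 'Match Factor',
# 'Previous Best Compound Name', 'Previous Best Formula', 'Previous Best Match Factor']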
- #Function to group retention times, taking the median to be the value of the grouped peaks
- def groupRT(rawDF):
-
-     #Redefine for clarity
-     filterDF = rawDF.copy()
-
-     #Track the index where the current group starts, the list of its original RTs,
-     #and the running median of that list (rounded to 4 decimal places)
-     group_start = 0
-     RT_temp = [rawDF['Component RT'][0]]
-     RT_temp_median = RT_temp[0]
-
-     #For all raw retention times, group times within PeakRTLim of the running median
-     for i in range(1, len(rawDF['Component RT'])):
-         #Current retention time
-         RT_current = rawDF['Component RT'][i]
-
-         #If the current retention time is within the median plus or minus the peak limit,
-         #add it to the group and recalculate the median
-         if RT_temp_median - PeakRTLim < RT_current < RT_temp_median + PeakRTLim:
-             RT_temp.append(RT_current)
-             RT_temp_median = round(np.median(RT_temp), 4)
-
-         #Otherwise, write the group median to all of the group's rows and start a new group
-         else:
-             filterDF.loc[group_start:i-1, 'Component RT'] = RT_temp_median
-             group_start = i
-             RT_temp = [RT_current]
-             RT_temp_median = RT_current
-
-     #Write the final group's median before returning
-     filterDF.loc[group_start:, 'Component RT'] = RT_temp_median
-     return filterDF
-
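A worked illustration of the grouping rule with PeakRTLim = 0.005 and made-up retention times: each successive time joins the current group only while it falls strictly within the running median plus or minus the limit, and the median is recomputed after every addition.

import numpy as np

PeakRTLim = 0.005
times = [1.000, 1.002, 1.004, 1.020, 1.021]

groups, current = [], [times[0]]
for rt in times[1:]:
    #Join the group only while the new time stays within the running median's window
    if abs(rt - round(np.median(current), 4)) < PeakRTLim:
        current.append(rt)
    else:
        groups.append(current)
        current = [rt]
groups.append(current)
print(groups)  # [[1.0, 1.002, 1.004], [1.02, 1.021]]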
- #Function to return True if a formula contains only noBar elements
- def donoBar(formula, noBar):
-
-     #Find all elements present in the formula
-     elements = re.findall('[A-Z][a-z]?', formula)
-     #Get the list of allowed symbols from the noBar list
-     allowed_elements = [noBar[i].Symbol for i in range(len(noBar))]
-
-     #If the set difference between the lists is non-empty (the formula contains elements
-     #besides the allowed ones), return False; otherwise return True
-     return not set(elements).difference(set(allowed_elements))
-
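The '[A-Z][a-z]?' pattern used here (and in dosoftBar below) reads each capital letter, optionally followed by one lowercase letter, as an element symbol and skips digits entirely, so stoichiometric counts drop out. A quick check of what it extracts:

import re

for formula in ["C6H12O6", "C2H6OSi", "CH3Cl"]:
    print(formula, re.findall('[A-Z][a-z]?', formula))
# C6H12O6 ['C', 'H', 'O']
# C2H6OSi ['C', 'H', 'O', 'Si']
# CH3Cl ['C', 'H', 'Cl']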
- #Function to return True if a formula contains only noBar elements and softBar elements
- #at or below the given priority
- def dosoftBar(formula, noBar, softBar, priority):
-
-     #Find all elements present in the formula
-     elements = re.findall('[A-Z][a-z]?', formula)
-     #Get a dataframe of symbols and priorities from the softBar list
-     ePDF = pd.DataFrame.from_dict({"Symbol":[obj.Symbol for obj in softBar], "Priority":[obj.Priority for obj in softBar]})
-     #Get the list of symbols with the provided priority or lower, then add the noBar symbols
-     allowed_elements = ePDF.loc[ePDF['Priority'] <= priority, 'Symbol'].to_list()
-     allowed_elements.extend([noBar[i].Symbol for i in range(len(noBar))])
-     #Delete the elements dataframe
-     del ePDF
-
-     #If the set difference between the lists is non-empty (the formula contains elements
-     #besides the allowed ones), return False; otherwise return True
-     return not set(elements).difference(set(allowed_elements))
-
- #Function to choose the best matches according to compound constraints
- def constrain(filterDF, constList):
-     """
-     This function loops through the dataframe, selecting the best match out of duplicate retention time matches.
-
-     INPUTS: filterDF - the dataframe to be filtered
-             constList - a list containing constraints in the form [noBar, softBar]
-
-     OUTPUTS: constDF - a dataframe containing the best matches for each retention time
-
-     APPROACH: 1) Get a list of all retention times in the dataframe;
-               2) Loop through each retention time, getting a slice of the dataframe;
-               3) Loop through compound constraints to pick the best match in the slice;
-               4) Append the result to a new, constrained dataframe
-
-     SELECTING BEST MATCH: 1) If the first formula of the sorted slice contains only noBar elements, add it to the constrained dataframe
-                           2) Otherwise, test the next formula
-                           3) If all other formulas have elements besides noBar, go back to the first value and
-                              allow its formula if it contains only highest-priority elements
-                           4) If it contains lower-priority/blocklist elements, repeat down the slice
-                           5) If all formulas contain lower-priority elements, allow the next priority and repeat the search
-                           6) If all formulas contain elements not listed in noBar or softBar, add a "No Match" row
-     """
-
-     #Unpack constList into noBar and softBar
-     noBar, softBar = constList
-     #Get the list of priorities written in softBar, sorted ascending (highest priority first)
-     priorList = sorted(set(x.Priority for x in softBar))
-     #Get the list of all retention times
-     arrayRF = filterDF['Component RT'].unique()
-     #Create a dataframe for the outputted data
-     constDF = pd.DataFrame(columns=['Component RT','Compound Name','Formula','Match Factor','Previous Best Compound Name',
-                                     'Previous Best Formula','Previous Best Match Factor','Previous Worst Compound Name',
-                                     'Previous Worst Formula','Previous Worst Match Factor'])
-
-     #For every listed retention time, select the best match
-     for RTi in arrayRF:
-
-         #Get a slice containing all possible compounds at the given RT
-         compound_slice = filterDF.loc[filterDF["Component RT"] == RTi]
-         #Remove Unknowns from the slice
-         compound_slice = compound_slice.loc[~compound_slice["Compound Name"].str.contains("Unknown")]
-         #Sort the slice by match factor and reset the indices
-         test_slice = compound_slice.sort_values(by=['Match Factor'], ascending=False).reset_index(drop=True)
-
-         #Find the rows with the best and worst match factors, falling back to empty series if the slice is empty
-         try:
-             best_match = test_slice.iloc[0, :]
-             worst_match = test_slice.iloc[-1, :]
-         except IndexError:
-             best_match = pd.Series(dtype='float64', index=['Component RT','Compound Name','Formula','Match Factor'])
-             worst_match = pd.Series(dtype='float64', index=['Component RT','Compound Name','Formula','Match Factor'])
-
-         #Search flag and loop counter; the while loop continues until a match is either found or ruled out
-         search_tf = True
-         counted_loops = 0
-         while search_tf and counted_loops < 100:
-
-             #For every row in the slice sorted by match factor..
-             for index, row in test_slice.iterrows():
-                 #..If the formula meets the noBar criteria on the first pass, choose the row and break
-                 if donoBar(row['Formula'], noBar) and counted_loops == 0:
-                     constSeries = concatSeries(row, best_match, worst_match)
-                     constDF = concatDF(constSeries, constDF)
-                     search_tf = False
-                     break
-                 #..Otherwise, on passes 1 through len(priorList), determine whether the formula
-                 #  meets the softBar criteria at this pass's priority level
-                 elif 0 < counted_loops <= len(priorList):
-                     if dosoftBar(row['Formula'], noBar, softBar, priorList[counted_loops-1]):
-                         constSeries = concatSeries(row, best_match, worst_match)
-                         constDF = concatDF(constSeries, constDF)
-                         search_tf = False
-                         break
-                 #..Otherwise, once the pass number exceeds the number of listed priorities,
-                 #  add a row with "No Match" and a NaN formula
-                 elif counted_loops > len(priorList):
-                     constSeries = concatSeries(pd.Series({"Component RT":RTi, "Compound Name":"No Match",
-                                                           "Match Factor":float('nan'), "Formula":float('nan')}),
-                                                best_match, worst_match)
-                     constDF = concatDF(constSeries, constDF)
-                     search_tf = False
-                     break
-
-             #Count one while loop
-             counted_loops += 1
-
-     return constDF
-
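The net effect on a single retention-time slice, sorted by match factor, is that a lower-scoring pure hydrocarbon beats higher-scoring heteroatom hits on the first pass. A toy slice (names and values invented) showing the first-pass winner, with the hydrocarbon test reduced to a simple regex purely for illustration:

import pandas as pd

slice_ = pd.DataFrame({
    'Compound Name': ['silanol hit', 'ether hit', 'alkane hit'],
    'Formula':       ['C3H10OSi',    'C4H10O',    'C7H16'],
    'Match Factor':  [97.0,          96.0,        94.0],
})
#Stand-in for donoBar: True only when the formula is carbon and hydrogen alone
hydrocarbon = slice_['Formula'].str.fullmatch(r'(?:C\d*|H\d*)+')
print(slice_.loc[hydrocarbon].iloc[0]['Compound Name'])  # alkane hit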
- #Function to save dataframes to .csv files
- def outputCSV(constDF_Dict, file_directory, infilenames):
-     #Create names of exported files by adding "_UPP" to the name before .csv
-     outfilenames = [x[:x.index('.csv')] + '_UPP' + x[x.index('.csv'):] for x in infilenames]
-     #Create list of filepaths from export directory + filename.csv
-     filepathList = [file_directory / outfilenames[i] for i in range(len(outfilenames))]
-     #If export directory does not exist, create it
-     filepathList[0].parent.mkdir(parents=True, exist_ok=True)
-
-     #For every filename, save a .csv
-     for i in range(len(infilenames)):
-         constDF_Dict[infilenames[i]].to_csv(filepathList[i])
-
-     return None
-
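The export naming simply splices "_UPP" in ahead of the extension; note that a name without ".csv" would raise ValueError from str.index. For example:

name = "MBPR025-033_GasMS_UA_Comp_results.csv"  # illustrative filename
out = name[:name.index('.csv')] + '_UPP' + name[name.index('.csv'):]
print(out)  # MBPR025-033_GasMS_UA_Comp_results_UPP.csv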
- """ CODE """
-
- #Unpack all .csv files in the provided directory
- print("[MAIN] Unpacking data from provided directory...")
- UAData_raw = {}
-
- for i in range(len(fileNames)):
-     UAData_raw[fileNames[i]] = unpackUA(fileLoc[i])
-
- print("[MAIN] Data unpacked.")
-
- #Dictionaries for filtered and constrained data for each file
- filterDF_Dict = {}
- constDF_Dict = {}
-
- #For all files, run the constraint workflow
- for name in fileNames:
-     #Group retention times
-     print("[" + name + "] Grouping retention times...")
-     filterDF = groupRT(UAData_raw[name])
-     filterDF_Dict[name] = filterDF
-     #Apply compound constraints
-     print("[" + name + "] Applying compound constraints...")
-     constDF = constrain(filterDF, [noBar, softBar])
-     constDF_Dict[name] = constDF
-
- #Save results
- print("[MAIN] Saving results...")
- outputCSV(constDF_Dict, expfileDir, fileNames)
- print("[MAIN] Files saved to " + str(expfileDir))
-
- #Complete program
- print("[MAIN] Unknowns post-processing finished.")