pycompound 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
# Command-line interface for plotting a query mass spectrum against a
# reference mass spectrum, before and after preprocessing, on either
# high-resolution (HRMS) or nominal-resolution (NRMS) data.

from pycompound_fy7392.plot_spectra import generate_plots_on_HRMS_data
from pycompound_fy7392.plot_spectra import generate_plots_on_NRMS_data
import pandas as pd
import argparse
from pathlib import Path
import sys


parser = argparse.ArgumentParser()

parser.add_argument('--query_data', type=str, metavar='\b', help='CSV file of query mass spectrum/spectra to be identified. Each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.')
parser.add_argument('--reference_data', type=str, metavar='\b', help='CSV file of the reference mass spectra. Each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.')
parser.add_argument('--spectrum_ID1', type=str, metavar='\b', help='The identifier of the query spectrum to be plotted. Default: first query spectrum in query_data.')
parser.add_argument('--spectrum_ID2', type=str, metavar='\b', help='The identifier of the reference spectrum to be plotted. Default: first reference spectrum in reference_data.')
parser.add_argument('--similarity_measure', type=str, default='cosine', metavar='\b', help='Similarity measure: options are \'cosine\', \'shannon\', \'renyi\', and \'tsallis\'. Default: cosine.')
parser.add_argument('--chromatography_platform', type=str, metavar='\b', help='Chromatography platform: options are \'HRMS\' and \'NRMS\'. Mandatory argument.')
parser.add_argument('--spectrum_preprocessing_order', type=str, metavar='\b', help='The LC-MS/MS spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of LC-MS/MS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL for HRMS, FNLW for NRMS')
parser.add_argument('--high_quality_reference_library', type=str, default='False', metavar='\b', help='True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
parser.add_argument('--mz_min', type=int, default=0, metavar='\b', help='Remove all peaks with mass/charge less than mz_min in each spectrum. Default: 0')
parser.add_argument('--mz_max', type=int, default=999999999999, metavar='\b', help='Remove all peaks with mass/charge greater than mz_max in each spectrum. Default: 999999999999')
parser.add_argument('--int_min', type=float, default=0, metavar='\b', help='Remove all peaks with intensity less than int_min in each spectrum. Default: 0')
parser.add_argument('--int_max', type=float, default=999999999999, metavar='\b', help='Remove all peaks with intensity greater than int_max in each spectrum. Default: 999999999999')
parser.add_argument('--window_size_centroiding', type=float, default=0.5, metavar='\b', help='Window size parameter used in centroiding a given spectrum. Only for HRMS. Default: 0.5')
parser.add_argument('--window_size_matching', type=float, default=0.5, metavar='\b', help='Window size parameter used in matching a query spectrum and a reference library spectrum. Only for HRMS. Default: 0.5')
parser.add_argument('--noise_threshold', type=float, default=0, metavar='\b', help='Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0')
parser.add_argument('--wf_mz', type=float, default=0, metavar='\b', help='Mass/charge weight factor parameter. Default: 0.')
parser.add_argument('--wf_intensity', type=float, default=1, metavar='\b', help='Intensity weight factor parameter. Default: 1.')
parser.add_argument('--LET_threshold', type=float, default=0, metavar='\b', help='Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.')
parser.add_argument('--entropy_dimension', type=float, default=1.1, metavar='\b', help='Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1.')
parser.add_argument('--y_axis_transformation', type=str, default='normalized', metavar='\b', help='Transformation to apply to y-axis (i.e. intensity axis) of plots. Options: \'normalized\', \'none\', \'log10\', and \'sqrt\'. Default: normalized.')
parser.add_argument('--output_path', type=str, metavar='\b', help='Output PDF file containing the plots of the query and reference spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./query_spec_{query_spectrum_ID}_reference_spec_{reference_spectrum_ID}_plot.pdf in the current working directory.')

args = parser.parse_args()

# Determine the platform-specific default preprocessing order, then honor a
# user-supplied --spectrum_preprocessing_order when one was given.
# Bug fix: the argument was previously parsed but always overridden by the
# platform default, silently ignoring the user's choice.
if args.chromatography_platform == 'HRMS':
    default_preprocessing_order = 'FCNMWL'
elif args.chromatography_platform == 'NRMS':
    default_preprocessing_order = 'FNLW'
else:
    print('Error: chromatography_platform must be either \'HRMS\' or \'NRMS\'')
    sys.exit()

if args.spectrum_preprocessing_order is not None:
    spectrum_preprocessing_order = args.spectrum_preprocessing_order
else:
    spectrum_preprocessing_order = default_preprocessing_order


# Dispatch to the platform-appropriate plotting routine.  NRMS omits the
# centroiding/matching window-size parameters, which apply only to HRMS.
if args.chromatography_platform == 'HRMS':
    generate_plots_on_HRMS_data(query_data=args.query_data, reference_data=args.reference_data, spectrum_ID1=args.spectrum_ID1, spectrum_ID2=args.spectrum_ID2, similarity_measure=args.similarity_measure, spectrum_preprocessing_order=spectrum_preprocessing_order, high_quality_reference_library=args.high_quality_reference_library, mz_min=args.mz_min, mz_max=args.mz_max, int_min=args.int_min, int_max=args.int_max, window_size_centroiding=args.window_size_centroiding, window_size_matching=args.window_size_matching, noise_threshold=args.noise_threshold, wf_mz=args.wf_mz, wf_intensity=args.wf_intensity, LET_threshold=args.LET_threshold, entropy_dimension=args.entropy_dimension, y_axis_transformation=args.y_axis_transformation, output_path=args.output_path)
elif args.chromatography_platform == 'NRMS':
    generate_plots_on_NRMS_data(query_data=args.query_data, reference_data=args.reference_data, spectrum_ID1=args.spectrum_ID1, spectrum_ID2=args.spectrum_ID2, similarity_measure=args.similarity_measure, spectrum_preprocessing_order=spectrum_preprocessing_order, high_quality_reference_library=args.high_quality_reference_library, mz_min=args.mz_min, mz_max=args.mz_max, int_min=args.int_min, int_max=args.int_max, noise_threshold=args.noise_threshold, wf_mz=args.wf_mz, wf_intensity=args.wf_intensity, LET_threshold=args.LET_threshold, entropy_dimension=args.entropy_dimension, y_axis_transformation=args.y_axis_transformation, output_path=args.output_path)
@@ -0,0 +1,316 @@
1
+
2
+ # This script contains the functions used to transform spectra prior to computing similarity scores
3
+
4
+ from pycompound_fy7392.build_library import build_library_from_raw_data
5
+ import scipy.stats
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
def wf_transform(spec_mzs, spec_ints, wf_mz, wf_int):
    '''
    Apply a weight-factor transformation to a spectrum.

    Each intensity is re-weighted as (m/z)**wf_mz * intensity**wf_int.

    input:
    spec_mzs: 1d np array of mass/charge values
    spec_ints: 1d np array of intensity values (same length as spec_mzs)
    wf_mz: float weight-factor exponent applied to the mass/charge values
    wf_int: float weight-factor exponent applied to the intensity values

    output:
    1d np array of weight-factor-transformed intensities
    '''
    return np.power(spec_mzs, wf_mz) * np.power(spec_ints, wf_int)
27
+
28
+
29
def LE_transform(intensity, thresh, normalization_method):
    '''
    Low-entropy transformation of a spectrum's intensities.

    This transformation was presented by:
    Li, Y.; Kind, T.; Folz, J.; Vaniya, A.; Mehta, S. S.; Fiehn, O.
    Spectral entropy outperforms MS/MS dot product similarity for small-molecule compound identification.
    Nature Methods 2021, 18, 1524–1531

    Spectra whose Shannon entropy S satisfies 0 < S < thresh are re-weighted
    as intensity ** ((1 + S) / (1 + thresh)); other spectra are returned
    unchanged (or as zeros when the normalized intensities sum to 0).

    input:
    intensity: 1d np array
    thresh: nonnegative float
    normalization_method: either 'standard' or 'softmax'

    output:
    1d np array of transformed intensities
    '''

    # NOTE(review): normalize() divides float arrays in place (/=), so
    # `intensity` may alias the normalized values after this call — when the
    # entropy threshold is not met, the *normalized* array is what gets
    # returned. Confirm callers expect their input array to be mutated.
    intensity_tmp = normalize(intensity, method=normalization_method)
    if np.sum(intensity_tmp) > 0:
        S = scipy.stats.entropy(intensity_tmp.astype('float'))
        if S > 0 and S < thresh:
            # Entropy below threshold: sharpen the spectrum with exponent
            # w = (1+S)/(1+thresh) < 1 applied to the normalized intensities.
            w = (1 + S) / (1 + thresh)
            intensity = np.power(intensity_tmp, w)
    else:
        # Degenerate spectrum (normalized intensities sum to 0): return zeros.
        intensity = np.zeros(len(intensity))
    return intensity
54
+
55
+
56
def normalize(intensities, method='standard'):
    '''
    Normalizes a given vector to sum to 1 so that it represents a probability distribution

    input:
    intensities: 1d np array
    method: normalization method; either 'standard' or 'softmax'

    output:
    1d np array of normalized intensities

    Note: float input arrays are divided in place (/=) in the 'standard'
    branches, so the caller's array may be mutated. An all-zero-sum input or
    an unrecognized method returns the input unchanged.
    '''

    if np.sum(intensities) > 0:
        if method == 'softmax':
            if np.any(intensities > 700):
                # np.exp overflows around exp(709); fall back to standard.
                print("Warning: some intensities are too large to exponentiate. Applying standard normalization.")
                intensities /= np.sum(intensities)
            else:
                intensities2 = np.exp(intensities)
                if np.isinf(intensities2).sum() == 0:
                    # Bug fix: softmax is exp(x)/sum(exp(x)). The original
                    # divided the raw intensities by sum(exp(x)), producing a
                    # vector that does not sum to 1.
                    intensities = intensities2 / np.sum(intensities2)
        elif method == 'standard':
            intensities /= np.sum(intensities)
    return(intensities)
80
+
81
+
82
def filter_spec_lcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999, is_matched = False):
    '''
    keep points in a given spectrum in a given range of mz values and intensity values

    input:
    spec: Nx2 np array representing a mass spectrum with each row representing a peak, the first column representing mass/charge ratio, and the second column representing intensity
    mz_min: remove peaks with mass/charge value smaller than mz_min
    mz_max: remove peaks with mass/charge value larger than mz_max
    int_min: remove peaks with intensity value smaller than int_min
    int_max: remove peaks with intensity value larger than int_max
    is_matched: if True, out-of-intensity-range peaks are zeroed rather than
        dropped — presumably to preserve row alignment of matched spectra
        (TODO confirm with callers)

    output:
    Mx2 np array representing a mass spectrum with M <= N
    '''

    if is_matched == False:
        # Unmatched spectrum: drop out-of-range rows entirely.
        spec = spec[spec[:,0] >= mz_min]
        spec = spec[spec[:,0] <= mz_max]
        spec = spec[spec[:,1] >= int_min]
        spec = spec[spec[:,1] <= int_max]
    else:
        spec = spec[spec[:,0] >= mz_min]
        spec = spec[spec[:,0] <= mz_max]
        # Bug fix: zero the peaks OUTSIDE the intensity window. The original
        # comparisons were inverted (>= int_min / <= int_max), which zeroed
        # exactly the peaks that should be kept — with the default bounds it
        # erased the entire spectrum.
        spec[spec[:,1] < int_min] = 0
        spec[spec[:,1] > int_max] = 0
    return(spec)
108
+
109
+
110
def filter_spec_gcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999):
    '''
    Zero out the intensities of peaks outside the given m/z and intensity windows.

    input:
    spec: Nx2 np array with the first column being mass/charge and the second column being intensity
    mz_min: zero peaks with mass/charge value smaller than mz_min
    mz_max: zero peaks with mass/charge value larger than mz_max
    int_min: zero peaks with intensity value smaller than int_min
    int_max: zero peaks with intensity value larger than int_max

    output:
    spec: the same Nx2 np array with out-of-range peaks assigned intensity 0 (modified in place)
    '''

    out_of_range = (
        (spec[:,0] < mz_min)
        | (spec[:,0] > mz_max)
        | (spec[:,1] < int_min)
        | (spec[:,1] > int_max)
    )
    spec[out_of_range, 1] = 0
    return(spec)
130
+
131
+
132
def remove_noise(spec, nr):
    '''
    Zero out peaks with intensity less than max(intensities)*nr.

    input:
    spec: Nx2 np array representing a mass spectrum with each row representing a peak, the first column representing mass/charge ratio, and the second column representing intensity
    nr: positive float noise-removal factor (None disables noise removal)

    output:
    the same Nx2 np array with the entire row of each low-intensity peak set to 0 (modified in place)
    '''

    # Single-peak spectra (or nr=None) are returned unchanged.
    if spec.shape[0] > 1 and nr is not None:
        cutoff = np.max(spec[:,1]) * nr
        spec[spec[:,1] < cutoff] = 0
    return(spec)
149
+
150
+
151
def centroid_spectrum(spec, window_size):
    '''
    This function was presented by:
    Li, Y.; Kind, T.; Folz, J.; Vaniya, A.; Mehta, S. S.; Fiehn, O.
    Spectral entropy outperforms MS/MS dot product similarity for small-molecule compound identification.
    Nature Methods 2021, 18, 1524–1531

    Merges peaks closer than window_size on the m/z axis into a single
    intensity-weighted centroid peak.

    input:
    spec: Nx2 np array with first column being mass/charge and second column being intensity (intensities are zeroed in place as peaks are merged)
    window_size: window-size parameter

    output:
    Mx2 np array representing the centroided spectrum with M <= N, sorted by ascending m/z
    '''

    # Work on the spectrum sorted by ascending m/z.
    spec = spec[np.argsort(spec[:,0])]

    # Fast check: centroiding is only needed when two adjacent peaks are
    # within window_size of each other on the m/z axis.
    mz_array = spec[:, 0]
    if mz_array.shape[0] <= 1:
        return spec
    if np.min(mz_array[1:] - mz_array[:-1]) > window_size:
        return spec

    # Merge peaks in descending order of intensity; peaks already absorbed
    # into an earlier centroid have intensity 0 and are skipped.
    spec_new = []
    for i in np.argsort(-spec[:, 1]):
        if spec[i, 1] <= 0:
            continue

        # Find the left bound of the merge window for the current peak.
        i_left = i - 1
        while i_left >= 0 and spec[i, 0] - spec[i_left, 0] <= window_size:
            i_left -= 1
        i_left += 1

        # Find the (exclusive) right bound of the merge window.
        i_right = i + 1
        while i_right < spec.shape[0] and spec[i_right, 0] - spec[i, 0] <= window_size:
            i_right += 1

        # Merge the windowed peaks into one intensity-weighted centroid.
        intensity_sum = np.sum(spec[i_left:i_right, 1])
        intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])
        spec_new.append([intensity_weighted_sum / intensity_sum, intensity_sum])
        spec[i_left:i_right, 1] = 0

    if not spec_new:
        # Bug fix: the original indexed an empty array here (IndexError) when
        # every intensity was <= 0; return the degenerate placeholder instead.
        return np.array([[0, 0]])

    # Bug fixes: removed a redundant second argsort, and a spectrum that
    # collapses to a single centroid is now returned rather than being
    # discarded and replaced by [[0, 0]].
    spec_new = np.array(spec_new)
    return spec_new[np.argsort(spec_new[:, 0])]
218
+
219
+
220
+
221
def match_peaks_in_spectra(spec_a, spec_b, window_size):
    '''
    This function was presented by:
    Li, Y.; Kind, T.; Folz, J.; Vaniya, A.; Mehta, S. S.; Fiehn, O.
    Spectral entropy outperforms MS/MS dot product similarity for small-molecule compound identification.
    Nature Methods 2021, 18, 1524–1531

    This function matches two spectra to find common peaks in order
    to obtain two lists of intensities of the same length

    Both spec_a and spec_b are assumed to be sorted by ascending m/z — the
    two-pointer merge below depends on it (TODO confirm at call sites).

    input:
    spec_a: Nx2 np array with first column being mass/charge and second column being intensity
    spec_b: Mx2 np array with first column being mass/charge and second column being intensity
    window_size: window-size parameter; peaks within this m/z distance are considered the same peak

    output:
    Kx3 np array with first column being mass/charge, second column being matched intensities of spec_a, and third column being matched intensities of spec_b
    '''

    # Two-pointer merge over both (sorted) spectra. peak_b_int accumulates
    # the intensities of all spec_b peaks within the window of the current
    # spec_a peak until that spec_a peak is emitted.
    a = 0
    b = 0

    spec_merged = []
    peak_b_int = 0.
    while a < spec_a.shape[0] and b < spec_b.shape[0]:
        mass_delta = spec_a[a, 0] - spec_b[b, 0]

        if mass_delta < -window_size:
            # Peak only existed in spec a.
            # Emit it together with any spec_b intensity accumulated so far.
            spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
            peak_b_int = 0.
            a += 1
        elif mass_delta > window_size:
            # Peak only existed in spec b.
            spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
            b += 1
        else:
            # Peak existed in both spec.
            # Accumulate — the matched row is emitted when pointer a advances.
            peak_b_int += spec_b[b, 1]
            b += 1

    # Flush an in-progress match that was pending when either spectrum ended.
    if peak_b_int > 0.:
        spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
        peak_b_int = 0.
        a += 1

    # Append the unmatched tails of each spectrum.
    if b < spec_b.shape[0]:
        spec_merged += [[x[0], 0., x[1]] for x in spec_b[b:]]

    if a < spec_a.shape[0]:
        spec_merged += [[x[0], x[1], 0.] for x in spec_a[a:]]

    # Guarantee a non-empty Kx3 result even when both inputs were empty.
    if spec_merged:
        spec_merged = np.array(spec_merged, dtype=np.float64)
    else:
        spec_merged = np.array([[0., 0., 0.]], dtype=np.float64)
    return spec_merged
278
+
279
+
280
+
281
def convert_spec(spec, mzs):
    '''
    imputes intensities of 0 where there is no mass/charge value reported in a given spectrum

    input:
    spec: Nx2 numpy array with first column being mass/charge and second column being intensity
    mzs: list of entire span of mass/charge values considering both the query and reference libraries

    output:
    len(mzs) x 2 numpy array over the full m/z span (the original docstring said Nx2)
    '''

    # Map each m/z to the intensity of its *first* occurrence in spec
    # (setdefault preserves the original first-match semantics). A single
    # O(N) pass replaces the original O(N*M) membership scan per m/z.
    first_intensity = {}
    for mz, intensity in spec:
        first_intensity.setdefault(mz, intensity)
    ints_tmp = [first_intensity.get(mz, 0) for mz in mzs]
    out = np.transpose(np.array([mzs, ints_tmp]))
    return out
301
+
302
+
303
def get_reference_df(reference_data, likely_reference_IDs=None):
    '''
    Load the reference library into a pandas DataFrame.

    input:
    reference_data: path to the reference library; either a CSV file, or raw
        data in mgf/mzML/cdf format (first converted to a CSV alongside it)
    likely_reference_IDs: optional path to a headerless CSV whose first column
        lists the reference IDs to keep; None keeps all reference spectra

    output:
    pandas DataFrame of the reference spectra

    raises:
    ValueError if the file extension is not one of csv/mgf/mzML/cdf
    '''
    stem, _, extension = reference_data.rpartition('.')
    ext = extension.lower()
    if ext in ('mgf', 'mzml', 'cdf'):
        # Bug fix: build the CSV path from the filename stem. The original
        # sliced off a fixed 3 characters ([:-3] + 'csv'), which mangles
        # 4-character extensions such as '.mzML' into '<name>.mcsv'.
        output_path_tmp = stem + '.csv'
        build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
        df_reference = pd.read_csv(output_path_tmp)
    elif ext == 'csv':
        df_reference = pd.read_csv(reference_data)
    else:
        # Previously an unsupported extension fell through to a NameError on
        # df_reference; fail explicitly instead.
        raise ValueError(f'unsupported reference_data file extension: {extension}')
    if likely_reference_IDs is not None:
        likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None)
        df_reference = df_reference.loc[df_reference.iloc[:,0].isin(likely_reference_IDs.iloc[:,0].tolist())]
    return df_reference
316
+
@@ -0,0 +1,100 @@
1
+
2
+ ##### Similarity Score Functions #####
3
+ # Note that the input for all similarity measures are two 1-d np arrays of the same length.
4
+ # These 1-d arrays must be normalized to sum to 1 for the Shannon, Renyi, and Tsallis Entropy Similarity Measures.
5
+
6
+ import scipy.stats
7
+ import numpy as np
8
+ import sys
9
+
10
+
11
def S_cos(ints_a, ints_b):
    '''Cosine Similarity Measure; returns 0 when either spectrum sums to 0.'''
    if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
        return 0
    norm_a = np.sqrt(np.sum(np.power(ints_a, 2)))
    norm_b = np.sqrt(np.sum(np.power(ints_b, 2)))
    return np.dot(ints_a, ints_b) / (norm_a * norm_b)
17
+
18
+
19
def ent_renyi(ints, q):
    '''Renyi entropy of a probability distribution for entropy dimension q (q > 0, q != 1).'''
    power_sum = np.sum(np.power(ints, q))
    return np.log(power_sum) / (1 - q)
22
+
23
+
24
def ent_tsallis(ints, q):
    '''Tsallis entropy of a probability distribution for entropy dimension q (q > 0, q != 1).'''
    power_sum = np.sum(np.power(ints, q))
    return (power_sum - 1) / (1 - q)
27
+
28
+
29
def S_shannon(ints_a, ints_b):
    '''
    Shannon Entropy Similarity Measure

    This similarity function was presented by:
    Li, Y.; Kind, T.; Folz, J.; Vaniya, A.; Mehta, S. S.; Fiehn, O.
    Spectral entropy outperforms MS/MS dot product similarity for small-molecule compound identification.

    Note: scipy.stats.entropy normalizes its input to sum to 1, so ints_a and
    ints_b need not be pre-normalized when computing the merged entropy.
    '''
    entropy_a = scipy.stats.entropy(ints_a)
    entropy_b = scipy.stats.entropy(ints_b)
    entropy_merged = scipy.stats.entropy(ints_a + ints_b)
    numerator = 2 * entropy_merged - entropy_a - entropy_b
    return 1 - numerator / np.log(4)
43
+
44
+
45
def S_renyi(ints_a, ints_b, q):
    '''
    Renyi Entropy Similarity Measure

    * Generalizes the Shannon Entropy Similarity Measure and approaches it as q approaches 1
    * ints_a and ints_b must be normalized to sum to 1
    '''
    if q == 1:
        print('Warning: the Renyi Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)

    half_a = ints_a / 2
    half_b = ints_b / 2
    entropy_a = ent_renyi(ints_a, q)
    entropy_b = ent_renyi(ints_b, q)
    entropy_merged = ent_renyi(half_a + half_b, q)
    # Normalization constant bounding the similarity to [0, 1].
    N = (2 * np.log(np.sum(np.power(half_a, q)) + np.sum(np.power(half_b, q)))
         - np.log(np.sum(np.power(ints_a, q)))
         - np.log(np.sum(np.power(ints_b, q)))) / (1 - q)
    return 1 - (2 * entropy_merged - entropy_a - entropy_b) / N
61
+
62
+
63
def S_tsallis(ints_a, ints_b, q):
    '''
    Tsallis Entropy Similarity Measure

    * Generalizes the Shannon Entropy Similarity Measure and approaches it as q approaches 1
    * ints_a and ints_b must be normalized to sum to 1
    '''
    if q == 1:
        print('Warning: the Tsallis Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)

    half_a = ints_a / 2
    half_b = ints_b / 2
    entropy_a = ent_tsallis(ints_a, q)
    entropy_b = ent_tsallis(ints_b, q)
    entropy_merged = ent_tsallis(half_a + half_b, q)
    # Normalization constant bounding the similarity to [0, 1].
    N = np.sum(2 * np.power(half_a, q) + 2 * np.power(half_b, q)
               - np.power(ints_a, q) - np.power(ints_b, q)) / (1 - q)
    return 1 - (2 * entropy_merged - entropy_a - entropy_b) / N
79
+
80
def S_mixture(ints_a, ints_b, weights={'Cosine':0.25, 'Shannon':0.25, 'Renyi':0.25, 'Tsallis':0.25}, q=1.1):
    '''
    Mixture similarity measure: a weighted sum of any combination of the
    Cosine, Shannon, Renyi, and Tsallis similarity measures.
    '''
    if not set(weights.keys()).issubset({'Cosine', 'Shannon', 'Renyi', 'Tsallis'}):
        print('Error: the keys to the weight parameter dict of the function S_mixture must be one of the four: Cosine, Shannon, Renyi, Tsallis')
        sys.exit()

    # Dispatch table avoids the repeated-if chain of the original.
    measures = {
        'Cosine': lambda: S_cos(ints_a, ints_b),
        'Shannon': lambda: S_shannon(ints_a, ints_b),
        'Renyi': lambda: S_renyi(ints_a, ints_b, q),
        'Tsallis': lambda: S_tsallis(ints_a, ints_b, q),
    }
    return sum(weight * measures[name]() for name, weight in weights.items())
99
+
100
+