IsoSpecPy 2.3.0.dev11__cp313-cp313-win_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,294 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2015-2020 Mateusz Łącki and Michał Startek.
4
+ #
5
+ # This file is part of IsoSpec.
6
+ #
7
+ # IsoSpec is free software: you can redistribute it and/or modify
8
+ # it under the terms of the Simplified ("2-clause") BSD licence.
9
+ #
10
+ # IsoSpec is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13
+ #
14
+ # You should have received a copy of the Simplified BSD Licence
15
+ # along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
16
+ #
17
+
18
+
19
+ '''
20
+ Bunch of deprecated functions for 1.0.X compatibility.
21
+ Avoid using them: there is a considerable overhead associated
22
+ with using the old interface... The backward compatibility module
23
+ is also very rudimentary, somewhat incomplete and not very well
24
+ tested too...
25
+
26
+ The current API is implemented in __init__.py, use that instead
27
+ '''
28
+
29
# Python 2/3 compatibility shim: Python 3 has no ``xrange`` builtin,
# so fall back to ``range`` when the name is missing.
try:
    xrange
except NameError:
    xrange = range

import re
from .IsoSpecPy import IsoTotalProb, IsoThreshold

# Pre-compiled patterns used by IsoSpec.IsoFromFormula to split a formula such
# as 'C100H202' into runs of digits (atom counts) and runs of non-digits
# (element symbols).
DIGIT_PATTERN = re.compile(r"\d+")
NON_DIGIT_PATTERN = re.compile(r"\D+")
39
+
40
class IsoSpec():
    """Deprecated 1.0.X-style facade over the current IsoSpecPy generators.

    The constructor runs the requested algorithm immediately and exposes the
    results as the attributes ``masses``, ``lprobs``, ``probs``, ``confs`` and
    ``size``.  ``confs[i]`` is treated throughout this class as a nested
    sequence: one sub-sequence of isotope counts per element.
    """

    def __init__(
            self,
            _atomCounts,
            _isotopeMasses,
            _isotopeProbabilities,
            _stopCondition,
            tabSize = 1000,   # ignored
            hashSize = 1000,  # ignored
            step = 0.3,       # ignored
            trim = True,      # True not supported yet, treated as False anyway
            method = 'layered'
    ):
        """Compute the isotopic distribution for an explicitly described molecule.

        Parameters
        ----------
        _atomCounts : sequence of int
            Number of atoms of each element.
        _isotopeMasses : sequence of sequences of float
            Isotope masses, one inner sequence per element.
        _isotopeProbabilities : sequence of sequences of float
            Isotope probabilities, one inner sequence per element.
        _stopCondition : float
            Total probability to cover (for 'layered', 'ordered' and
            'layered_estimating') or the threshold (for 'threshold_absolute'
            and 'threshold_relative').
        tabSize, hashSize, step, trim
            Accepted for 1.0.X signature compatibility only; not used.
        method : str
            One of 'layered', 'ordered', 'threshold_absolute',
            'threshold_relative', 'layered_estimating'.

        Raises
        ------
        ValueError
            If ``method`` is not one of the names listed above.
        """
        isoargs = {
            "formula": None,
            "get_confs": True,
            "atomCounts": _atomCounts,
            "isotopeMasses": _isotopeMasses,
            "isotopeProbabilities": _isotopeProbabilities,
        }

        self.dimNumber = len(_atomCounts)
        self._isotopeNumbers = [len(x) for x in _isotopeMasses]
        self.allIsotopeNumber = sum(self._isotopeNumbers)
        self._atomCounts = _atomCounts
        self._isotopeMasses = _isotopeMasses
        self._isotopeProbabilities = _isotopeProbabilities
        self._stopCondition = _stopCondition

        # IsoTotalProb / IsoThreshold come from the module-level import of
        # .IsoSpecPy; importing them again through ``.__init__`` would execute
        # the package initialiser a second time under a different module name.
        dispatch = {
            'layered': lambda total_prob: IsoTotalProb(total_prob, **isoargs),
            'ordered': lambda total_prob: IsoTotalProb(total_prob, **isoargs),
            'threshold_absolute': lambda threshold: IsoThreshold(threshold, True, **isoargs),
            'threshold_relative': lambda threshold: IsoThreshold(threshold, False, **isoargs),
            'layered_estimating': lambda total_prob: IsoTotalProb(total_prob, **isoargs),
        }
        algo = dispatch.get(method)
        if algo is None:
            raise ValueError("Invalid ISO method")

        # Reference to iso needs to be held in this object: it will deallocate
        # masses/lprobs/etc arrays on C++ side if we allow GC to collect it
        # prematurely.
        self.iso = algo(_stopCondition)

        self.masses = self.iso.masses
        self.lprobs = self.iso.lprobs
        self.probs = self.iso.probs
        self.confs = self.iso.confs
        self.size = self.iso.size

        if method == 'ordered' and len(self.masses) > 0:
            # 'ordered' promises configurations sorted by decreasing
            # probability, so sort the four parallel sequences together.
            ranked = sorted(
                zip(self.masses, self.lprobs, self.probs, self.confs),
                key=lambda row: -row[1],
            )
            self.masses, self.lprobs, self.probs, self.confs = zip(*ranked)

    @staticmethod
    def IsoFromFormula(formula, cutoff, tabSize = 1000, hashSize = 1000, classId = None, method = 'threshold_relative', step = 0.25, trim = True):
        """Build an IsoSpec from a chemical formula such as 'C100H202'.

        Every element symbol must be followed by an explicit count.

        Raises
        ------
        ValueError
            If symbols and counts do not pair up, or a symbol is unknown.
        """
        # It's much easier to just parse it in python than to use the C parsing
        # function and retrieve back into Python the relevant object sizes.
        symbols = re.findall(NON_DIGIT_PATTERN, formula)
        atom_counts = [int(x) for x in re.findall(DIGIT_PATTERN, formula)]

        if len(symbols) != len(atom_counts):
            raise ValueError("Invalid formula")

        from .PeriodicTbl import symbol_to_masses, symbol_to_probs
        try:
            masses = tuple(symbol_to_masses[symbol] for symbol in symbols)
            probs = tuple(symbol_to_probs[symbol] for symbol in symbols)
        except KeyError:
            raise ValueError("Invalid formula")

        return IsoSpec(atom_counts, masses, probs, cutoff, tabSize, hashSize, step, trim, method)

    def __len__(self):
        """Return the number of computed configurations."""
        return self.size

    def getConfsRaw(self):
        """Return ``(masses, lprobs, confs)`` exactly as stored on this object."""
        return (self.masses, self.lprobs, self.confs)

    def _flatConfs(self, isoCounts, rows_no):
        """Flatten each nested configuration into a single list of counts."""
        return [
            [count for element_counts in isoCounts[row] for count in element_counts]
            for row in range(rows_no)
        ]

    def getConfs(self):
        """Return masses, log-probabilities and flattened configurations as lists."""
        masses, logProbs, isoCounts = self.getConfsRaw()
        rows_no = len(masses)
        return list(masses), list(logProbs), self._flatConfs(isoCounts, rows_no)

    def getConfsNumpy(self):
        """Return masses, log-probabilities and configurations as numpy arrays.

        Configurations are flattened the same way as in ``getConfs`` and
        returned as an integer array of shape ``(rows, allIsotopeNumber)``.
        A 3-tuple is returned even when there are no rows, so callers can
        always unpack the result.
        """
        import numpy as np
        masses, logProbs, configurations = self.getConfsRaw()
        rows_no = len(masses)
        flat = self._flatConfs(configurations, rows_no)
        masses = np.array(list(masses))
        logProbs = np.array(list(logProbs))
        configurations = np.array(flat, dtype=int).reshape((rows_no, self.allIsotopeNumber))
        return masses, logProbs, configurations

    def splitConf(self, l, offset = 0):
        """Split a flat count buffer into per-element tuples.

        ``l`` is a flat sequence holding ``allIsotopeNumber`` counts per
        configuration; ``offset`` selects which configuration to slice out.
        """
        conf = []
        idx = self.allIsotopeNumber * offset
        for isotopes_no in self._isotopeNumbers[:self.dimNumber]:
            conf.append(tuple(l[idx:idx + isotopes_no]))
            idx += isotopes_no
        return tuple(conf)

    def confStr(self, conf):
        """Render a nested configuration: counts space-separated, elements tab-separated."""
        return '\t'.join([' '.join([str(x) for x in y]) for y in conf])

    def printConfs(self):
        """Print one line per configuration: mass, log-prob, prob and counts."""
        import math  # the module never imports math at top level
        masses, logProbs, isoCounts = self.getConfsRaw()
        for i in range(len(masses)):
            # confs[i] is already split per element (see getConfs), so it is
            # rendered directly rather than re-sliced as a flat buffer.
            print(("Mass = {0}\t and log-prob = {1} and prob = {2}\t and configuration"
                   "=\t{3}").format(masses[i], logProbs[i], math.exp(logProbs[i]), self.confStr(isoCounts[i])))
178
+
179
+
180
+
181
class IsoPlot(dict):
    """Histogram of an isotopic distribution: bin start mass -> summed probability.

    Parameters
    ----------
    iso
        Object exposing ``getConfsRaw()`` that returns
        ``(masses, logProbs, confs)``; only the first two are used.
    bin_w : float
        Bin width. A mass ``m`` falls into the bin keyed by
        ``float(int(m / bin_w)) * bin_w``.
    """

    def __init__(self, iso, bin_w):
        # Imported locally: the module neither imports math / defaultdict nor
        # defines the Summator helper the previous implementation relied on.
        import math
        from collections import defaultdict

        super(IsoPlot, self).__init__()
        self.iso = iso
        self.bin_w = bin_w
        masses, logProbs, _isoCounts = iso.getConfsRaw()

        contributions = defaultdict(list)
        for i in range(len(masses)):
            bin_start = float(int(masses[i] / bin_w)) * bin_w
            contributions[bin_start].append(math.exp(logProbs[i]))

        # math.fsum gives an accurately rounded total for each bin.
        for bin_start, probs in contributions.items():
            self[bin_start] = math.fsum(probs)
191
+
192
+
193
def IsoSpecify( formula,
                cutoff,
                method= 'layered',
                output_format = 'numpy_arrays',
                trim = True,
                _step = 0.25,
                _trim = True,
                _tabSize = 1000,
                _hashSize = 1000 ):
    """
    Call IsoSpec on a formula with a given cutoff.

    This function wraps around the IsoSpec class.

    Parameters
    ----------
    formula : str
        a string of a form '< Element Tag 1 >< Count 1 > ... ',
        e.g. 'C100H202'. Using IUPAC conventions to name elements.

    cutoff : float
        The cutoff value. See description of the method argument.

    method : str
        Can take one of the following values: 'layered' (the default),
        'layered_estimating', 'threshold_absolute',
        'threshold_relative', 'ordered'.

        The threshold versions of the algorithm rely on the user
        providing a precise lower bound on the reported peak heights.
        This can be specified in absolute terms ('threshold_absolute'),
        i.e. in terms of the limiting probability of the isotopologue,
        or as a fraction of the highest peak ('threshold_relative').

        The layered versions of the algorithm compute consecutive peak
        thresholds on the fly until the sum of probabilities of the
        reported isotopologues exceeds the provided cutoff value; see
        Anal Chem. 2017 Mar 21;89(6):3272-3277,
        doi: 10.1021/acs.analchem.6b01459.

        In this compatibility layer 'layered', 'layered_estimating' and
        'ordered' are all served by the same total-probability generator
        (see IsoSpec.__init__); 'ordered' additionally sorts the
        configurations by decreasing probability.

    output_format : str
        Should the output be presented as lists ('lists'),
        or as numpy arrays ('numpy_arrays').

    trim : bool
        While using a layered method, should one discard superfluously
        obtained isotopologues, i.e. such that without them the set of
        reported isotopologues already is an optimal p-set.
        Forwarded to IsoSpec, which currently ignores it.

    _step, _trim, _tabSize, _hashSize
        Legacy tuning knobs kept for signature compatibility. ``_step``,
        ``_tabSize`` and ``_hashSize`` are forwarded to IsoSpec (which
        ignores them); ``_trim`` is unused.

    Returns
    -------
    masses
        masses of isotopologues, either a list or a numpy array.

    logProbs
        logarithms of probabilities (theoretical heights) of isotopologues,
        either a list or a numpy array.

    confs
        counts of isotopologues (extended chemical formulas that
        include counts of isotopes of elements)

    Raises
    ------
    AssertionError
        If any argument fails validation.
    """
    assert output_format in ('lists', 'numpy_arrays'), "Wrong value of output_format. Should be either 'lists' or 'numpy_arrays'."

    assert method in ('layered', 'ordered', 'threshold_absolute', 'threshold_relative', 'layered_estimating'), "Wrong value of method. Should be among 'layered', 'ordered', 'threshold_absolute', 'threshold_relative', or 'layered_estimating'."

    assert isinstance(cutoff, float), "Provided cut off ain't a float."

    assert isinstance(formula, str), "Provided formula ain't a string."

    # Forward the caller's legacy knobs instead of silently replacing them
    # with hard-coded constants.
    iso = IsoSpec.IsoFromFormula( formula,
                                  cutoff,
                                  tabSize = _tabSize,
                                  hashSize = _hashSize,
                                  classId = None,
                                  method = method,
                                  step = _step,
                                  trim = trim )

    if output_format == 'lists':
        masses, logProbs, confs = iso.getConfs()
    else:
        masses, logProbs, confs = iso.getConfsNumpy()

    return masses, logProbs, confs
293
+
294
+
@@ -0,0 +1,38 @@
1
+ from .isoFFI import isoFFI
2
+ from collections import defaultdict
3
+
4
# Python 2/3 compatibility shim: Python 3 has no ``xrange`` builtin,
# so fall back to ``range`` when the name is missing.
try:
    xrange
except NameError:
    xrange = range

# Number of entries in the isotope tables exposed by the C library.
number_of_isotopic_entries = isoFFI.clib.NUMBER_OF_ISOTOPIC_ENTRIES

# Per-symbol accumulators; the tuple-valued ones grow by one item for every
# table entry that carries the symbol.
symbol_to_masses = defaultdict(tuple)
symbol_to_massNo = defaultdict(tuple)
symbol_to_probs = defaultdict(tuple)
symbol_to_atomic_number = {}

# Walk the parallel C arrays entry by entry and group the values by symbol.
for i in xrange(number_of_isotopic_entries):
    symbol = isoFFI.ffi.string(isoFFI.clib.elem_table_symbol[i]).decode("ascii")
    symbol_to_masses[symbol] += (isoFFI.clib.elem_table_mass[i],)
    symbol_to_massNo[symbol] += (isoFFI.clib.elem_table_massNo[i],)
    symbol_to_probs[symbol] += (isoFFI.clib.elem_table_probability[i],)
    symbol_to_atomic_number[symbol] = isoFFI.clib.elem_table_atomicNo[i]

# Freeze into plain dicts so that looking up an unknown symbol raises KeyError
# instead of silently inserting an empty tuple.
# NOTE(review): symbol_to_massNo is left as a defaultdict - confirm whether
# that asymmetry is intentional.
symbol_to_masses = dict(symbol_to_masses)
symbol_to_probs = dict(symbol_to_probs)
25
+
26
# Several derivative convenience dicts...

# symbol -> single-element list holding the list of (mass, probability) pairs.
# The pairs are materialised with list(): on Python 3 a bare zip() is a
# one-shot iterator that would be empty after its first traversal.  The outer
# one-element list is kept so existing ``symbol_to_massprob[sym][0]`` access
# keeps working.
symbol_to_massprob = dict(
    (key, [list(zip(symbol_to_masses[key], symbol_to_probs[key]))])
    for key in symbol_to_probs.keys()
)
28
+
29
def crossprod(l1, l2):
    """Return the sum of pairwise products of ``l1`` and ``l2``.

    Pairs are formed with zip, so extra items of the longer input are ignored.
    """
    total = 0
    for left, right in zip(l1, l2):
        total += left * right
    return total
31
+
32
# symbol -> probability-weighted average of its isotope masses.
symbol_to_avg_mass = {
    key: crossprod(symbol_to_masses[key], symbol_to_probs[key])
    for key in symbol_to_probs
}
33
+
34
def maxprod(l1, l2):
    """Return the item of ``l1`` paired with the largest item of ``l2``.

    On ties the earliest pair wins; empty input raises ValueError (from max).
    """
    def weight(pair):
        return pair[1]

    heaviest = max(zip(l1, l2), key=weight)
    return heaviest[0]
36
+
37
# symbol -> mass of its most probable isotope.
symbol_to_monoisotopic_mass = {
    key: maxprod(symbol_to_masses[key], symbol_to_probs[key])
    for key in symbol_to_probs
}
38
+
IsoSpecPy/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ from .IsoSpecPy import *
2
import importlib.metadata

# Resolve the version from the installed distribution metadata.  When the
# package is imported without metadata (e.g. straight from a source tree),
# fall back to a placeholder instead of making the whole import fail.
try:
    __version__ = importlib.metadata.version("IsoSpecPy")
except importlib.metadata.PackageNotFoundError:
    __version__ = "unknown"
5
+
6
# Old, deprecated name kept only for compatibility with the 1.9.X API;
# it is a plain alias of IsoTotalProb.
IsoLayered = IsoTotalProb
8
+
9
# For backward compatibility with 1.0.X:
class CompatIsoWrapper(object):
    """Namespace object exposing the legacy IsoSpec, IsoSpecify and IsoPlot names."""

    def __init__(self):
        # Imported lazily, at construction time, from the legacy module.
        from .IsoSpecPyOld import IsoPlot, IsoSpec, IsoSpecify
        self.IsoSpec, self.IsoSpecify, self.IsoPlot = IsoSpec, IsoSpecify, IsoPlot


# Instance published under the old 1.0.X entry-point name.
IsoSpecPy = CompatIsoWrapper()
19
+
IsoSpecPy/__main__.py ADDED
@@ -0,0 +1,25 @@
1
from . import __version__

# Command-line entry point: ``python -m IsoSpecPy [--version|--libpath|--include]``.
if __name__ == "__main__":
    import argparse
    import sys
    from pathlib import Path

    parser = argparse.ArgumentParser(description="IsoSpecPy: Python interface to IsoSpec++ library, a fast and memory-efficient library for computing isotopic distributions.")
    parser.add_argument('--version', '-v', action='version', version=__version__)
    parser.add_argument('--libpath', action='store_true',
                        help='Print the path to the loaded C++ library and exit.')
    parser.add_argument('--include', action='store_true',
                        help='Print the include path for the headers of the C++ library and exit.')
    args = parser.parse_args()

    if args.libpath:
        try:
            from .isoFFI import IsoFFI
            ffi = IsoFFI()
        except ImportError as e:
            # Report on stderr with a non-zero status so that scripts capturing
            # stdout never mistake the error text for a library path.
            print(f"Error loading IsoSpecPy: {e}", file=sys.stderr)
            sys.exit(1)
        print(Path(ffi.libpath).resolve())
        # sys.exit rather than the ``exit`` helper, which is only injected by
        # the ``site`` module and is absent under ``python -S``.
        sys.exit(0)

    if args.include:
        print(Path(__file__).parent.resolve())
@@ -0,0 +1,131 @@
1
+ """Here we store formulas for approximating numbers of isotopologues and subisotopologues."""
2
+ from math import exp, log, lgamma, pi
3
+
4
+ from .IsoSpecPy import IsoParamsFromFormula
5
+
6
+
7
def log_multinomial_confs_cnt(n, i):
    """Return the natural logarithm of the number of multinomial configurations.

    Consider n independent trials, each ending in one of i outcomes, and count
    how often every outcome occurred.  The number of distinct count vectors is
    (n + i - 1)! / (n! * (i - 1)!); its logarithm is evaluated with lgamma.

    Args:
        n (int): The number of trials.
        i (int): The number of possible outcomes.
    """
    log_numerator = lgamma(n + i)
    log_trials_factorial = lgamma(n + 1)
    log_outcomes_factorial = lgamma(i)
    return log_numerator - log_trials_factorial - log_outcomes_factorial
21
+
22
def test_log_multinomial_confs_cnt():
    """Check log_multinomial_confs_cnt(10, 3) against a reference value."""
    expected = 4.189655
    tolerance = 10**(-5)
    assert abs(log_multinomial_confs_cnt(10,3) - expected) < tolerance
25
+
26
+
27
def multinomial_confs_cnt(n, i):
    """Return the number of configurations of the multinomial distribution.

    Computed as exp(log_multinomial_confs_cnt(n, i)), so the result is a float
    subject to floating-point rounding.

    Args:
        n (int): The number of trials.
        i (int): The number of possible outcomes.
    """
    # The previous body called ``log_subiso_cnt_simplex``, a name that is not
    # defined in this module, so every call raised NameError.
    return exp(log_multinomial_confs_cnt(n, i))
35
+
36
+
37
def log_V_simplex(n, i):
    r"""Get the natural logarithm of the volume of a simplex.

    The simplex is {(x_1, ..., x_{i-1}) : x_j >= 0, \sum_{j=1}^{i-1} x_j <= n};
    its volume is n^{i-1} / (i-1)!.

    Args:
        n (int): The number of atoms of the element.
        i (int): The number of isotopes of the element.
    """
    return (i-1)*log(n) - lgamma(i)
45
+
46
+
47
def test_log_V_simplex():
    """Check log_V_simplex(10, 3) against a reference value."""
    expected = 3.912023
    tolerance = 10**(-5)
    assert abs(log_V_simplex(10,3) - expected) < tolerance
50
+
51
+
52
def V_simplex(n, i):
    r"""Get the volume of a simplex.

    The simplex is {(x_1, ..., x_{i-1}) : x_j >= 0, \sum_{j=1}^{i-1} x_j <= n};
    the value is obtained by exponentiating log_V_simplex(n, i).

    Args:
        n (int): The number of atoms of the element.
        i (int): The number of isotopes of the element.
    """
    return exp(log_V_simplex(n,i))
60
+
61
def log_V_ellipsoid(n, R2, probs):
    """Get the natural logarithm of the volume of the ellipsoid.

    The ellipsoid is defined by x' W^{-1} x <= R2,
    where W = diag(probs[0:-1]) - probs[0:-1] * probs[0:-1]'
    and R2 is the square of the radius.
    diag(probs) is a matrix with values probs[0:-1] on the diagonal,
    and probs[0:-1] * probs[0:-1]' is a projection on probs[0:-1].
    Since probabilities are nonzero and sum to one, det W != 0.
    The expression does not depend on which probability term is omitted,
    e.g. the outcome would stay the same if we removed p[1] instead.

    Args:
        n (int): The number of atoms of the element.
        R2 (float): The square of the radius of the ellipsoid.
        probs (list): List with the natural frequencies of isotopes.
    """
    i = len(probs)
    # Sum of log-probabilities, accumulated in input order.
    log_det = 0
    for p in probs:
        log_det += log(p)
    log_scale = log(n) + log(R2) + log(pi)
    half_log_volume = (log_det + (i-1)*log_scale)/2.0
    return half_log_volume - lgamma((i+1)/2.0)
81
+
82
+
83
def test_log_V_ellipsoid():
    """Check log_V_ellipsoid on a three-isotope example against a reference value."""
    observed = log_V_ellipsoid(100, 10, [.2,.3,.5])
    assert abs(observed - 6.299206) < 10**(-5)
85
+
86
+
87
def V_ellipsoid(n, R2, probs):
    """Get the volume of the ellipsoid by exponentiating log_V_ellipsoid."""
    log_volume = log_V_ellipsoid(n, R2, probs)
    return exp(log_volume)
90
+
91
def log_subisotopologue_cnt(atoms_cnt, isotope_frequencies, ellipsoid_R2):
    """Get the natural logarithm of the approximate number of subisotopologues.

    The value is the log-count of all multinomial configurations, plus the
    log-volume of the ellipsoid, minus the log-volume of the simplex.

    Args:
        atoms_cnt (int): The number of atoms of the given element.
        isotope_frequencies (list): The natural frequencies of isotopes (sum to one).
        ellipsoid_R2 (float): The squared radius of the ellipsoid used to
            approximate the optimal P-set (passed to log_V_ellipsoid as R2).
    """
    isotopes_cnt = len(isotope_frequencies)
    log_all_confs = log_multinomial_confs_cnt(atoms_cnt, isotopes_cnt)
    log_ellipsoid = log_V_ellipsoid(atoms_cnt, ellipsoid_R2, isotope_frequencies)
    log_simplex = log_V_simplex(atoms_cnt, isotopes_cnt)
    return log_all_confs + log_ellipsoid - log_simplex
103
+
104
def test_log_subisotopologue_cnt():
    """Check log_subisotopologue_cnt on a three-isotope example against a reference value."""
    observed = log_subisotopologue_cnt(100, [.2,.3,.5], 10)
    assert abs(observed - 6.328959) < 10**(-5)
106
+
107
+
108
def subisotopologue_cnt(atoms_cnt, isotope_frequencies, ellipsoid_R2):
    """Get the approximate number of subisotopologues (exp of the log-count)."""
    log_count = log_subisotopologue_cnt(atoms_cnt, isotope_frequencies, ellipsoid_R2)
    return exp(log_count)
111
+
112
+
113
def approximate_subisotopologues(molecule, P):
    """Approximate the number of subisotopologues of every element.

    Args:
        molecule (str): A string with molecule, e.g. 'C100H202'.
        P (float): The joint probability threshold, within [0, 1].

    Returns:
        dict: Maps each entry of ``mol.elems`` to its approximate
        subisotopologue count.
    """
    from scipy.stats import chi2
    assert 0 <= P <= 1, 'That is not a probability.'
    mol = IsoParamsFromFormula(molecule)
    # Degrees of freedom: total isotope count minus one per element.
    isotopes_total = sum(len(p) for p in mol.probs)
    chi2_df = isotopes_total - len(mol.probs)
    R2 = chi2.ppf(q=P, df=chi2_df)
    counts = {}
    for n, p, e in zip(mol.atomCounts, mol.probs, mol.elems):
        counts[e] = subisotopologue_cnt(n, p, R2)
    return counts
127
+
128
+
129
# Small demo: print the approximation for two example molecules.
if __name__ == '__main__':
    for demo_formula in ('C100H202', "C100H100"):
        print(approximate_subisotopologues(demo_formula, .999))
@@ -0,0 +1,16 @@
1
+
2
+
3
+
4
+
5
class ConfsPassthrough(object):
    """Lazy, read-only sequence view that builds each item on demand.

    Parameters
    ----------
    confs_parser : callable
        Called with an index and expected to return the item at that index.
    size : int
        Number of items the view reports through ``len()``.
    """

    def __init__(self, confs_parser, size):
        self.confs_parser = confs_parser
        self.size = size

    def __len__(self):
        return self.size

    def __iter__(self):
        """Yield the items for indices 0 .. size-1 in order."""
        for position in range(self.size):
            yield self.confs_parser(position)

    def __getitem__(self, idx):
        """Return the item at ``idx``.

        Plain integer indices are bounds-checked (negative values count from
        the end) and raise IndexError when out of range, which is what the
        sequence protocol needs for iteration to terminate.  Any other index
        object is handed to the parser unchanged.
        """
        if isinstance(idx, int):
            if idx < 0:
                idx += self.size
            if not 0 <= idx < self.size:
                raise IndexError("ConfsPassthrough index out of range")
        return self.confs_parser(idx)
15
+
16
+