IsoSpecPy 2.3.0.dev11__cp313-cp313-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- IsoSpecPy/Advanced.py +32 -0
- IsoSpecPy/Distributions.py +94 -0
- IsoSpecPy/Formulas.py +71 -0
- IsoSpecPy/IsoSpec++/allocator.h +69 -0
- IsoSpecPy/IsoSpec++/btrd.h +206 -0
- IsoSpecPy/IsoSpec++/conf.h +38 -0
- IsoSpecPy/IsoSpec++/cwrapper.h +179 -0
- IsoSpecPy/IsoSpec++/dirtyAllocator.h +58 -0
- IsoSpecPy/IsoSpec++/element_tables.h +50 -0
- IsoSpecPy/IsoSpec++/fasta.h +67 -0
- IsoSpecPy/IsoSpec++/fixedEnvelopes.h +244 -0
- IsoSpecPy/IsoSpec++/isoMath.h +87 -0
- IsoSpecPy/IsoSpec++/isoSpec++.h +693 -0
- IsoSpecPy/IsoSpec++/marginalTrek++.h +573 -0
- IsoSpecPy/IsoSpec++/misc.h +204 -0
- IsoSpecPy/IsoSpec++/mman.h +67 -0
- IsoSpecPy/IsoSpec++/operators.h +150 -0
- IsoSpecPy/IsoSpec++/platform.h +111 -0
- IsoSpecPy/IsoSpec++/platform_incl.h +33 -0
- IsoSpecPy/IsoSpec++/pod_vector.h +399 -0
- IsoSpecPy/IsoSpec++/summator.h +118 -0
- IsoSpecPy/IsoSpecPy.py +840 -0
- IsoSpecPy/IsoSpecPyOld.py +294 -0
- IsoSpecPy/PeriodicTbl.py +38 -0
- IsoSpecPy/__init__.py +19 -0
- IsoSpecPy/__main__.py +25 -0
- IsoSpecPy/approximations.py +131 -0
- IsoSpecPy/confs_passthrough.py +16 -0
- IsoSpecPy/isoFFI.py +223 -0
- bin/IsoSpecCppPy.dll +0 -0
- isospecpy-2.3.0.dev11.dist-info/METADATA +43 -0
- isospecpy-2.3.0.dev11.dist-info/RECORD +35 -0
- isospecpy-2.3.0.dev11.dist-info/WHEEL +5 -0
- isospecpy-2.3.0.dev11.dist-info/licenses/LICENCE +35 -0
- lib/IsoSpecCppPy.lib +0 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# Copyright (C) 2015-2020 Mateusz Łącki and Michał Startek.
|
|
4
|
+
#
|
|
5
|
+
# This file is part of IsoSpec.
|
|
6
|
+
#
|
|
7
|
+
# IsoSpec is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the Simplified ("2-clause") BSD licence.
|
|
9
|
+
#
|
|
10
|
+
# IsoSpec is distributed in the hope that it will be useful,
|
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
13
|
+
#
|
|
14
|
+
# You should have received a copy of the Simplified BSD Licence
|
|
15
|
+
# along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
|
|
16
|
+
#
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
'''
|
|
20
|
+
Bunch of deprecated functions for 1.0.X compatibility.
|
|
21
|
+
Avoid using them: there is a considerable overhead associated
|
|
22
|
+
with using the old interface... The backward compatibility module
|
|
23
|
+
is also very rudimentary, somewhat incomplete and not very well
|
|
24
|
+
tested too...
|
|
25
|
+
|
|
26
|
+
The current API is implemented in __init__.py, use that instead
|
|
27
|
+
'''
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
xrange
|
|
31
|
+
except NameError:
|
|
32
|
+
xrange = range
|
|
33
|
+
|
|
34
|
+
import re
|
|
35
|
+
from .IsoSpecPy import IsoTotalProb, IsoThreshold
|
|
36
|
+
|
|
37
|
+
DIGIT_PATTERN = re.compile(r"\d+")
|
|
38
|
+
NON_DIGIT_PATTERN = re.compile(r"\D+")
|
|
39
|
+
|
|
40
|
+
class IsoSpec():
    """
    Deprecated 1.0.X-style front end to the current IsoSpecPy engines.

    The constructor runs the requested algorithm immediately and exposes
    the results as the attributes ``masses``, ``lprobs``, ``probs``,
    ``confs`` and ``size``.
    """

    def __init__(
                self,
                _atomCounts,
                _isotopeMasses,
                _isotopeProbabilities,
                _stopCondition,
                tabSize = 1000, # ignored
                hashSize = 1000, # ignored
                step = 0.3, # ignored
                trim = True, # True not supported yet, treated as False anyway
                method = 'layered'
                ):
        """
        Run the selected algorithm and cache its results on this object.

        Parameters
        ----------
        _atomCounts
            one atom count per element; its length becomes ``dimNumber``.
        _isotopeMasses
            one sequence of isotope masses per element.
        _isotopeProbabilities
            one sequence of isotope probabilities per element.
        _stopCondition
            handed to the chosen engine, either as the total probability
            or as the threshold, depending on ``method``.
        tabSize, hashSize, step
            accepted for signature compatibility only; ignored.
        trim
            accepted for signature compatibility only.
        method
            one of 'layered', 'ordered', 'layered_estimating',
            'threshold_absolute', 'threshold_relative'.

        Raises
        ------
        Exception
            if ``method`` is not one of the names listed above.
        """
        isoargs = {
            "formula"               : None,
            "get_confs"             : True,
            "atomCounts"            : _atomCounts,
            "isotopeMasses"         : _isotopeMasses,
            "isotopeProbabilities"  : _isotopeProbabilities,
        }

        self.dimNumber = len(_atomCounts)
        self._isotopeNumbers = [len(x) for x in _isotopeMasses]
        self.allIsotopeNumber = sum(self._isotopeNumbers)
        self._atomCounts = _atomCounts
        self._isotopeMasses = _isotopeMasses
        self._isotopeProbabilities = _isotopeProbabilities
        self._stopCondition = _stopCondition

        # IsoThreshold and IsoTotalProb come from the module-level import.
        # The former in-function `from .__init__ import IsoThreshold` loaded
        # the package's __init__.py a second time under the module name
        # "<package>.__init__", re-running its top-level code.

        # All three total-probability method names map to IsoTotalProb; the
        # boolean given to IsoThreshold separates the absolute variant
        # (True) from the relative one (False).
        try:
            algo = { 'layered' : lambda total_prob: IsoTotalProb(total_prob, **isoargs),
                     'ordered' : lambda total_prob: IsoTotalProb(total_prob, **isoargs),
                     'threshold_absolute' : lambda threshold: IsoThreshold(threshold, True, **isoargs),
                     'threshold_relative' : lambda threshold: IsoThreshold(threshold, False, **isoargs),
                     'layered_estimating' : lambda total_prob: IsoTotalProb(total_prob, **isoargs)
                   }[method]
        except KeyError:
            raise Exception("Invalid ISO method")

        # Reference to iso needs to be held in this object: it will deallocate masses/lprobs/etc arrays on C++ side if we
        # allow GC to collect it prematurely
        self.iso = algo(_stopCondition)

        self.masses = self.iso.masses
        self.lprobs = self.iso.lprobs
        self.probs  = self.iso.probs
        self.confs  = self.iso.confs
        self.size   = self.iso.size

        if method == 'ordered' and len(self.masses) > 0:
            # The 'ordered' contract is most probable first, so sort all
            # four parallel sequences by descending log-probability.
            L = sorted(zip(self.masses, self.lprobs, self.probs, self.confs), key = lambda x: -x[1])
            self.masses, self.lprobs, self.probs, self.confs = zip(*L)

    @staticmethod
    def IsoFromFormula(formula, cutoff, tabSize = 1000, hashSize = 1000, classId = None, method = 'threshold_relative', step = 0.25, trim = True):
        """
        Build an IsoSpec object from a chemical formula such as 'C100H202'.

        Every element symbol must be followed by an explicit count.
        ``classId`` is accepted but unused; the remaining arguments are
        forwarded to the constructor.

        Raises
        ------
        ValueError
            if symbols and counts do not pair up, or a symbol is missing
            from the periodic table.
        """
        # It's much easier to just parse it in python than to use the C parsing function
        # and retrieve back into Python the relevant object sizes
        symbols = re.findall(NON_DIGIT_PATTERN, formula)
        atom_counts = [int(x) for x in re.findall(DIGIT_PATTERN, formula)]

        if len(symbols) != len(atom_counts):
            raise ValueError("Invalid formula")

        from .PeriodicTbl import symbol_to_masses, symbol_to_probs
        try:
            masses = tuple(symbol_to_masses[symbol] for symbol in symbols)
            probs  = tuple(symbol_to_probs[symbol] for symbol in symbols)
        except KeyError:
            raise ValueError("Invalid formula")

        return IsoSpec(atom_counts, masses, probs, cutoff, tabSize, hashSize, step, trim, method)

    def __len__(self):
        """Return the number of configurations reported by the engine."""
        return self.size

    def getConfsRaw(self):
        """Return ``(masses, lprobs, confs)`` exactly as stored on the object."""
        return (self.masses, self.lprobs, self.confs)

    def _nestedConf(self, confs, idx):
        """
        Return configuration ``idx`` as a tuple of per-element tuples.

        Two layouts are handled: a sequence yielding one nested
        configuration per index (the layout getConfs() has always assumed),
        and a flat buffer of isotope counts (the layout splitConf() slices).
        NOTE(review): which layout the engine produces is not visible in
        this module -- confirm against IsoSpecPy.py.
        """
        conf = confs[idx]
        try:
            parts = list(conf)
        except TypeError:
            # confs[idx] is a scalar, hence confs is a flat count buffer.
            return self.splitConf(confs, idx)
        return tuple(tuple(part) for part in parts)

    def getConfs(self):
        """
        Return ``(masses, logProbs, confs)`` as plain Python lists.

        Each entry of ``confs`` is one configuration flattened into a
        single list of isotope counts.
        """
        masses, logProbs, isoCounts = self.getConfsRaw()
        rows_no = len(masses)
        confs = [
            [x for part in self._nestedConf(isoCounts, i) for x in part]
            for i in range(rows_no)
        ]
        return list(masses), list(logProbs), confs

    def getConfsNumpy(self):
        """
        Return ``(masses, logProbs, configurations)`` as numpy arrays.

        ``configurations`` has one row per configuration and one column per
        isotope count. When there are no configurations a single empty
        array is returned instead of a 3-tuple; this is kept from the
        original interface, so callers that unpack three values must
        handle the empty case themselves.
        """
        import numpy as np
        masses, logProbs, configurations = self.getConfsRaw()
        rows_no = len(masses)
        if rows_no == 0:
            return np.array([])
        # The old code computed the column count as len(configurations)//rows,
        # which is only right for a flat buffer. Flattening row by row works
        # for both layouts.
        flat_rows = [
            [x for part in self._nestedConf(configurations, i) for x in part]
            for i in range(rows_no)
        ]
        masses = np.array(list(masses))
        logProbs = np.array(list(logProbs))
        configurations = np.array(flat_rows)
        return masses, logProbs, configurations

    def splitConf(self, l, offset = 0):
        """
        Cut configuration number ``offset`` out of the flat sequence ``l``.

        ``l`` is read as consecutive records of ``allIsotopeNumber`` counts;
        the selected record is returned as a tuple of per-element tuples.
        """
        conf = []
        idx = self.allIsotopeNumber * offset
        for ison in self._isotopeNumbers[:self.dimNumber]:
            conf.append(tuple(l[idx:idx+ison]))
            idx += ison
        return tuple(conf)

    def confStr(self, conf):
        """Format a nested configuration: counts space-separated, elements tab-separated."""
        return '\t'.join([' '.join([str(x) for x in y]) for y in conf])

    def printConfs(self):
        """Print one line per configuration: mass, log-probability, probability, counts."""
        # Local import: this module never imported math, so the original
        # raised NameError here.
        import math
        masses, logProbs, isoCounts = self.getConfsRaw()
        for i in range(len(masses)):
            conf = self._nestedConf(isoCounts, i)
            print(("Mass = {0}\t and log-prob = {1} and prob = {2}\t and configuration"\
                   "=\t{3}").format(masses[i], logProbs[i], math.exp(logProbs[i]), self.confStr(conf)))
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class IsoPlot(dict):
    """
    Histogram of an isotopic distribution: bin start mass -> total probability.

    Each mass is assigned to the bin ``float(int(mass/bin_w))*bin_w`` and
    the probabilities ``exp(logProb)`` falling into one bin are summed.
    """

    def __init__(self, iso, bin_w):
        """
        Parameters
        ----------
        iso
            object providing ``getConfsRaw()`` that returns
            ``(masses, logProbs, confs)``.
        bin_w
            width of a single mass bin.
        """
        # Local imports: the module never imported these names, and the
        # `Summator` helper the original relied on is not defined anywhere
        # in it, so construction used to fail with NameError.
        import math
        from collections import defaultdict

        dict.__init__(self)
        self.iso = iso
        self.bin_w = bin_w
        masses, logProbs, _isoCounts = iso.getConfsRaw()

        binned = defaultdict(list)
        for i in range(len(masses)):
            binned[float(int(masses[i]/bin_w))*bin_w].append(math.exp(logProbs[i]))

        # math.fsum gives an accurately rounded sum of each bin.
        for key, vals in binned.items():
            self[key] = math.fsum(vals)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def IsoSpecify( formula,
                cutoff,
                method= 'layered',
                output_format = 'numpy_arrays',
                trim = True,
                _step = 0.25,
                _trim = True,
                _tabSize = 1000,
                _hashSize = 1000 ):
    """
    Call IsoSpec on a formula with a given cutoff.

    This function wraps around the IsoSpec class.

    Parameters
    ----------
    formula : char
        a string of a form '< Element Tag 1 >< Count 1 > ... ',
        e.g. 'C100H202'. Using IUPAC conventions to name elements.

    cutoff : float
        The cutoff value. See description of the method argument.

    method : char
        Can take one of the following values: 'layered' (the default),
        'layered_estimating', 'threshold_absolute',
        'threshold_relative', 'ordered'.

        The threshold versions of the algorithm rely on user
        providing a precise lower bound on the reported peak heights.
        This can be specified in absolute terms ('threshold_absolute'),
        i.e. in terms of the limiting probability of the isotopologue,
        or as a percentage of the highest peak ('threshold_relative').

        The layered versions of the algorithm rely on calculating
        consecutive values of peak thresholds on the fly.
        The ultimate goal is to reach a peak probability that assures
        that the sum of probabilities of the more probable isotopologues
        exceeds the provided cutoff value.
        The sequence of consecutive thresholds can be generated in
        two ways. 'layered_estimating' estimates
        the threshold to joint probability function by a progressive
        linear spline, check Anal Chem. 2017 Mar 21;89(6):3272-3277.
        doi: 10.1021/acs.analchem.6b01459. Epub 2017 Mar 8.
        The other way, 'layered', estimates a threshold as a 30%
        quantile of the probabilities gathered in the fringe set, i.e.
        isotopologues that are direct neighbours of the previously
        accepted layer. Finally, choosing the 'ordered' version will
        provide a loglinear version of the algorithm that relies on
        the priority queue. This version automatically sorts
        the configurations by their probability.

    trim
        while using a layered method, should one discard superfluously
        obtained isotopologues, i.e. such that without them the set of
        reported isotopologues already is an optimal p-set.

    output_format
        Should the output be presented as lists ('lists'),
        or as numpy arrays ('numpy_arrays').

    _step, _tabSize, _hashSize
        legacy tuning knobs, forwarded to IsoSpec.IsoFromFormula.

    _trim
        legacy argument; not used by this function.

    Returns
    -------
    masses
        masses of isotopologues, either a list or a numpy array.

    logProbs
        logarithms of probabilities (theoretical heights) of isotopologues,
        either a list or a numpy array.

    confs
        counts of isotopologues (extended chemical formulas that
        include counts of isotopes of elements)

    Raises
    ------
    AssertionError
        if an argument fails the checks below.
    """

    # Argument checks are kept as assertions so that callers relying on
    # AssertionError keep working.
    assert output_format in ('lists', 'numpy_arrays'), "Wrong value of output_format. Should be either 'lists' or 'numpy_arrays'."

    assert method in ('layered', 'ordered', 'threshold_absolute', 'threshold_relative', 'layered_estimating'), "Wrong value of method. Should be among 'layered', 'ordered', 'threshold_absolute', 'threshold_relative', or 'layered_estimating'."

    assert isinstance(cutoff, float), "Provided cut off ain't a float."

    assert isinstance(formula, str), "Provided formula off ain't a string."

    # Forward the caller's tuning arguments; the original passed
    # hard-coded literals here instead.
    iso = IsoSpec.IsoFromFormula(   formula,
                                    cutoff,
                                    tabSize = _tabSize,
                                    hashSize = _hashSize,
                                    classId = None,
                                    method = method,
                                    step = _step,
                                    trim = trim )

    if output_format == 'lists':
        masses, logProbs, confs = iso.getConfs()
    else:
        masses, logProbs, confs = iso.getConfsNumpy()

    return masses, logProbs, confs
|
|
293
|
+
|
|
294
|
+
|
IsoSpecPy/PeriodicTbl.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from .isoFFI import isoFFI
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
xrange
|
|
6
|
+
except NameError:
|
|
7
|
+
xrange = range
|
|
8
|
+
|
|
9
|
+
# Number of rows in the C-side isotope tables read below.
number_of_isotopic_entries = isoFFI.clib.NUMBER_OF_ISOTOPIC_ENTRIES

# Per-element tuples, accumulated row by row from the C-side tables.
symbol_to_masses = defaultdict(tuple)
symbol_to_massNo = defaultdict(tuple)
symbol_to_probs  = defaultdict(tuple)
symbol_to_atomic_number = {}

for i in xrange(number_of_isotopic_entries):
    symbol = isoFFI.ffi.string(isoFFI.clib.elem_table_symbol[i]).decode("ascii")
    symbol_to_masses[symbol] += (isoFFI.clib.elem_table_mass[i],)
    symbol_to_massNo[symbol] += (isoFFI.clib.elem_table_massNo[i],)
    symbol_to_probs[symbol]  += (isoFFI.clib.elem_table_probability[i],)
    symbol_to_atomic_number[symbol] = isoFFI.clib.elem_table_atomicNo[i]

# Freeze into plain dicts so an unknown symbol raises KeyError instead of
# silently inserting an empty tuple.
symbol_to_masses = dict(symbol_to_masses)
symbol_to_probs  = dict(symbol_to_probs)

# Several derivative convenience dicts...
# Each value is a one-element list holding the list of (mass, probability)
# pairs. The pairs are materialised with list(): on Python 3 a bare zip() is
# a one-shot iterator, so each entry was empty after its first read.
symbol_to_massprob = dict((key, [list(zip(symbol_to_masses[key], symbol_to_probs[key]))]) for key in symbol_to_probs.keys())
|
|
28
|
+
|
|
29
|
+
def crossprod(l1, l2):
    """Return the sum of pairwise products of l1 and l2 (truncated to the shorter one)."""
    products = [a*b for a, b in zip(l1, l2)]
    return sum(products)
|
|
31
|
+
|
|
32
|
+
# Average mass per element: isotope masses weighted by their probabilities.
symbol_to_avg_mass = {
    key: crossprod(symbol_to_masses[key], symbol_to_probs[key])
    for key in symbol_to_probs
}
|
|
33
|
+
|
|
34
|
+
def maxprod(l1, l2):
    """Return the item of l1 whose partner in l2 is largest (first one on ties)."""
    pairs = zip(l1, l2)
    best_item, _best_weight = max(pairs, key = lambda pair: pair[1])
    return best_item
|
|
36
|
+
|
|
37
|
+
# Per element: the mass of its most probable isotope.
symbol_to_monoisotopic_mass = {
    key: maxprod(symbol_to_masses[key], symbol_to_probs[key])
    for key in symbol_to_probs
}
|
|
38
|
+
|
IsoSpecPy/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from .IsoSpecPy import *
|
|
2
|
+
import importlib.metadata
|
|
3
|
+
|
|
4
|
+
__version__ = importlib.metadata.version("IsoSpecPy")
|
|
5
|
+
|
|
6
|
+
# Old, deprecated name, for compatibility with 1.9.X only
|
|
7
|
+
IsoLayered = IsoTotalProb
|
|
8
|
+
|
|
9
|
+
# For backward compatibility with 1.0.X:
|
|
10
|
+
class CompatIsoWrapper(object):
    """Namespace object exposing the 1.0.X names IsoSpec, IsoSpecify and IsoPlot."""

    def __init__(self):
        # The legacy module is imported when the wrapper is instantiated,
        # not when this class statement executes.
        from .IsoSpecPyOld import IsoPlot, IsoSpec, IsoSpecify
        self.IsoSpec, self.IsoSpecify, self.IsoPlot = IsoSpec, IsoSpecify, IsoPlot
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
IsoSpecPy = CompatIsoWrapper()
|
|
19
|
+
|
IsoSpecPy/__main__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from . import __version__
|
|
2
|
+
|
|
3
|
+
if __name__ == "__main__":
    # Command-line entry point: report the version, the location of the
    # loaded C++ library, or the directory holding the bundled headers.
    import argparse
    import sys
    from pathlib import Path

    parser = argparse.ArgumentParser(description="IsoSpecPy: Python interface to IsoSpec++ library, a fast and memory-efficient library for computing isotopic distributions.")
    parser.add_argument('--version', '-v', action='version', version=__version__)
    parser.add_argument('--libpath', action='store_true',
                        help='Print the path to the loaded C++ library and exit.')
    parser.add_argument('--include', action='store_true',
                        help='Print the include path for the headers of the C++ library and exit.')
    args = parser.parse_args()

    if args.libpath:
        try:
            from .isoFFI import IsoFFI
            ffi = IsoFFI()
            print(Path(ffi.libpath).resolve())
        except ImportError as e:
            # Report on stderr with a non-zero status, so a script capturing
            # stdout never mistakes the error text for a path.
            print(f"Error loading IsoSpecPy: {e}", file=sys.stderr)
            sys.exit(1)
        # NOTE(review): exiting here follows the option's help text
        # ("... and exit") -- confirm against the unmangled source.
        # sys.exit replaces the site-provided exit(), which is absent
        # under `python -S`.
        sys.exit(0)

    if args.include:
        print(Path(__file__).parent.resolve())
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Here we store formulas for approximating numbers of isotopologues and subisotopologues."""
|
|
2
|
+
from math import exp, log, lgamma, pi
|
|
3
|
+
|
|
4
|
+
from .IsoSpecPy import IsoParamsFromFormula
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def log_multinomial_confs_cnt(n, i):
    """Return the natural logarithm of the number of configurations of the multinomial distribution.

    There are n independent trials of an experiment that results in one of the i outcomes.
    For each outcome, count how many times it occurred.
    These counts then follow the multinomial distribution.
    The value computed is log((n+i-1)! / (n! * (i-1)!)), evaluated with lgamma.

    Args:
        n (int): The number of trials.
        i (int): The number of possible outcomes.
    """
    return lgamma(n + i) - \
           lgamma(n + 1) - \
           lgamma(i)

def test_log_multinomial_confs_cnt():
    """Check log_multinomial_confs_cnt against log(66) for n=10, i=3."""
    x = log_multinomial_confs_cnt(10,3)
    assert abs(x - 4.189655) < 10**(-5)


def multinomial_confs_cnt(n, i):
    """Return the number of configurations of the multinomial distribution (as a float).

    Args:
        n (int): The number of trials.
        i (int): The number of possible outcomes.
    """
    # The original called `log_subiso_cnt_simplex`, a name that is not defined
    # in this module, so every call raised NameError.
    return exp(log_multinomial_confs_cnt(n, i))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def log_V_simplex(n, i):
    r"""Get the natural logarithm of the volume of a simplex.

    The simplex is {(x_1,..,x_{i-1}): x_j >= 0, \sum_{j=1}^{i-1} x_j <= n};
    the value returned is log(n^(i-1) / (i-1)!).

    Args:
        n (int): The number of atoms of the element.
        i (int): The number of isotopes of the element.
    """
    # The docstring is raw: "\s" in a normal string literal is an invalid
    # escape sequence and triggers a SyntaxWarning on Python 3.12+.
    return (i-1)*log(n) - lgamma(i)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_log_V_simplex():
    """Check log_V_simplex for 10 atoms and 3 isotopes against a reference value."""
    expected = 3.912023
    assert abs(log_V_simplex(10, 3) - expected) < 10**(-5)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def V_simplex(n, i):
    r"""Get the volume of a simplex.

    The simplex is {(x_1,..,x_{i-1}): x_j >= 0, \sum_{j=1}^{i-1} x_j <= n};
    the value returned is exp(log_V_simplex(n, i)).

    Args:
        n (int): The number of atoms of the element.
        i (int): The number of isotopes of the element.
    """
    # The docstring is raw: "\s" in a normal string literal is an invalid
    # escape sequence and triggers a SyntaxWarning on Python 3.12+.
    return exp(log_V_simplex(n,i))
|
|
60
|
+
|
|
61
|
+
def log_V_ellipsoid(n, R2, probs):
    """Get the natural logarithm of the volume of the ellipsoid.

    The ellipsoid is defined by x' W^{-1} x <= R2,
    where W = diag(probs[0:-1]) - probs[0:-1] * probs[0:-1]'
    and R2 is the square of the radius.
    diag(probs) is a matrix with values probs[0:-1] on the diagonal,
    and probs[0:-1] * probs[0:-1]' is a projection on probs[0:-1].
    Since probabilities are nonzero and sum to one, then det W != 0.
    Also, the expression does not really depend upon the choice of the
    omitted probability term, e.g. the outcome would stay the same if we removed p[1].

    Args:
        n (int): The number of atoms of the element.
        R2 (float): The square of the radius of the ellipsoid.
        probs (list): List with the natural frequencies of isotopes.
    """
    isotopes = len(probs)
    # The determinant term uses all probabilities, not only probs[0:-1].
    log_det = sum(map(log, probs))
    log_scale = log(n) + log(R2) + log(pi)
    halved = (log_det + (isotopes-1)*log_scale)/2.0
    return halved - lgamma((isotopes+1)/2.0)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def test_log_V_ellipsoid():
    """Check log_V_ellipsoid on a three-isotope example against a reference value."""
    value = log_V_ellipsoid(100, 10, [.2,.3,.5])
    assert abs(value - 6.299206) < 10**(-5)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def V_ellipsoid(n, R2, probs):
    """Get the volume of the ellipsoid, i.e. exp of log_V_ellipsoid(n, R2, probs)."""
    log_volume = log_V_ellipsoid(n, R2, probs)
    return exp(log_volume)
|
|
90
|
+
|
|
91
|
+
def log_subisotopologue_cnt(atoms_cnt, isotope_frequencies, ellipsoid_R2):
    """Get the natural logarithm of the approximate number of subisotopologues.

    The result is log(#configurations) + log(V_ellipsoid) - log(V_simplex).

    Args:
        atoms_cnt (int): The number of atoms of the given element.
        isotope_frequencies (list): The natural frequencies of isotopes (sum to one).
        ellipsoid_R2 (float): The squared radius of the ellipsoid used to approximate the optimal P-set.
    """
    isotopes_cnt = len(isotope_frequencies)
    log_confs = log_multinomial_confs_cnt(atoms_cnt, isotopes_cnt)
    log_ellipsoid = log_V_ellipsoid(atoms_cnt, ellipsoid_R2, isotope_frequencies)
    log_simplex = log_V_simplex(atoms_cnt, isotopes_cnt)
    return log_confs + log_ellipsoid - log_simplex
|
|
103
|
+
|
|
104
|
+
def test_log_subisotopologue_cnt():
    """Check log_subisotopologue_cnt on a three-isotope example against a reference value."""
    value = log_subisotopologue_cnt(100, [.2,.3,.5], 10)
    assert abs(value - 6.328959) < 10**(-5)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def subisotopologue_cnt(atoms_cnt, isotope_frequencies, ellipsoid_R2):
    """Get the approximate number of subisotopologues (exp of log_subisotopologue_cnt)."""
    log_cnt = log_subisotopologue_cnt(atoms_cnt, isotope_frequencies, ellipsoid_R2)
    return exp(log_cnt)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def approximate_subisotopologues(molecule, P):
    """Approximate the number of subisotopologues of every element of a molecule.

    Args:
        molecule (str): A string with molecule, e.g. 'C100H202'.
        P (float): The joint probability threshold.

    Returns:
        dict: maps each element of the molecule to its approximate
        number of subisotopologues.
    """
    from scipy.stats import chi2
    assert P >= 0 and P <= 1, 'That is not a probability.'
    mol = IsoParamsFromFormula(molecule)
    # Degrees of freedom: all isotopes of the molecule, minus one per element.
    isotopes_total = sum(len(p) for p in mol.probs)
    chi2_df = isotopes_total - len(mol.probs)
    R2 = chi2.ppf(q=P, df=chi2_df)
    counts = {}
    for n, p, e in zip(mol.atomCounts, mol.probs, mol.elems):
        counts[e] = subisotopologue_cnt(n, p, R2)
    return counts
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
if __name__ == '__main__':
    # Demo run: print the approximations for two sample molecules.
    for demo_formula in ('C100H202', "C100H100"):
        print(approximate_subisotopologues(demo_formula, .999))
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ConfsPassthrough(object):
    """Sequence-like view that fetches item ``idx`` by calling ``confs_parser(idx)``."""

    def __init__(self, confs_parser, size):
        """Remember the item-producing callable and the length to report."""
        self.size = size
        self.confs_parser = confs_parser

    def __len__(self):
        """Return the length given at construction time."""
        return self.size

    def __getitem__(self, idx):
        """Forward ``idx`` unchanged to the parser and return its result."""
        parse = self.confs_parser
        return parse(idx)
|
|
15
|
+
|
|
16
|
+
|