imspy_core-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
+ """
+ Utility module for imspy_core.
+
+ Contains general-purpose utilities for sequences and mathematical functions.
+ """
+
+ from .utilities import (
+     re_index_indices, normal_pdf, gaussian, exp_distribution, exp_gaussian,
+     linear_map, NormalDistribution, ExponentialGaussianDistribution, TokenSequence,
+     is_unimod_start, is_unimod_end, tokenize_proforma_sequence,
+     get_aa_num_proforma_sequence, tokenizer_to_json, tokenizer_from_json
+ )
+ from .sequence import tokenize_unimod_sequence, remove_unimod_annotation
+
+ __all__ = [
+     're_index_indices', 'tokenize_unimod_sequence', 'remove_unimod_annotation', 'linear_map',
+     'normal_pdf', 'gaussian', 'exp_distribution', 'exp_gaussian',
+     'NormalDistribution', 'ExponentialGaussianDistribution', 'TokenSequence',
+     'is_unimod_start', 'is_unimod_end', 'tokenize_proforma_sequence',
+     'get_aa_num_proforma_sequence', 'tokenizer_to_json', 'tokenizer_from_json'
+ ]
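Everything listed in `__all__` is re-exported at the subpackage root. A minimal sketch of the resulting import surface (assuming the wheel above is installed):

```python
# Both names resolve to the same function object: the subpackage root
# re-exports everything listed in __all__ above.
import imspy_core.utility as util
from imspy_core.utility.utilities import linear_map

assert util.linear_map is linear_map
```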
@@ -0,0 +1,38 @@
+ import re
+ from typing import List
+
+
+ def remove_unimod_annotation(sequence: str) -> str:
+     """
+     Remove UNIMOD annotations from a peptide sequence.
+
+     Args:
+         sequence: A peptide sequence with UNIMOD annotations (e.g., "PEPTM[UNIMOD:35]IDE").
+
+     Returns:
+         The peptide sequence without UNIMOD annotations (e.g., "PEPTMIDE").
+     """
+     pattern = r'\[UNIMOD:\d+\]'
+     return re.sub(pattern, '', sequence)
+
+
+ def tokenize_unimod_sequence(unimod_sequence: str) -> List[str]:
+     """
+     Tokenize a sequence of modified amino acids.
+
+     Args:
+         unimod_sequence: A string representing the sequence of amino acids with modifications.
+
+     Returns:
+         A list of tokenized amino acids.
+     """
+     token_pattern = r'[A-Z](?:\[UNIMOD:\d+\])?'
+
+     # Special case: a leading [UNIMOD:1] (an N-terminal modification)
+     # is fused into the <START> token.
+     if unimod_sequence.startswith("[UNIMOD:1]"):
+         special_token = "<START>[UNIMOD:1]"
+         rest_of_string = unimod_sequence[len("[UNIMOD:1]"):]
+         other_tokens = re.findall(token_pattern, rest_of_string)
+         return [special_token] + other_tokens + ['<END>']
+     else:
+         tokens = re.findall(token_pattern, unimod_sequence)
+         return ['<START>'] + tokens + ['<END>']
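A quick illustration of the two helpers above (assuming the wheel is installed; the expected outputs follow directly from the regexes):

```python
from imspy_core.utility import remove_unimod_annotation, tokenize_unimod_sequence

seq = "[UNIMOD:1]PEPTM[UNIMOD:35]IDE"

# The leading [UNIMOD:1] is folded into the <START> token.
print(tokenize_unimod_sequence(seq))
# ['<START>[UNIMOD:1]', 'P', 'E', 'P', 'T', 'M[UNIMOD:35]', 'I', 'D', 'E', '<END>']

print(remove_unimod_annotation(seq))  # 'PEPTMIDE'
```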
@@ -0,0 +1,278 @@
+ import io
+ import json
+ import math
+ import numba
+ import numpy as np
+ from typing import List, Optional, TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     import tensorflow as tf
+
+ from numpy.typing import ArrayLike
+
+
+ @numba.jit(nopython=True)
+ def normal_pdf(x: ArrayLike, mass: float, s: float = 0.001,
+                inv_sqrt_2pi: float = 0.3989422804014327, normalize: bool = False):
+     """
+     Density of a normal distribution centered at `mass`.
+
+     Args:
+         x: Value(s) at which to evaluate the density.
+         mass: Mean of the distribution.
+         s: Standard deviation of the distribution.
+         inv_sqrt_2pi: Precomputed constant 1 / sqrt(2 * pi).
+         normalize: If True, return the unnormalized Gaussian kernel
+             (peak value 1) instead of the density.
+     """
+     a = (x - mass) / s
+     if normalize:
+         return np.exp(-0.5 * np.power(a, 2))
+     else:
+         return inv_sqrt_2pi / s * np.exp(-0.5 * np.power(a, 2))
+
+
+ @numba.jit(nopython=True)
+ def gaussian(x, μ: float = 0, σ: float = 1):
+     """
+     Gaussian probability density function.
+     :param x: Value(s) at which to evaluate the density.
+     :param μ: Mean.
+     :param σ: Standard deviation.
+     :return: Density at x.
+     """
+     A = 1 / np.sqrt(2 * np.pi * np.power(σ, 2))
+     B = np.exp(-(np.power(x - μ, 2) / (2 * np.power(σ, 2))))  # exponent is -(x-μ)²/(2σ²)
+
+     return A * B
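The exponent above is easy to get wrong, so a sanity check against `scipy.stats.norm` is cheap to keep around (scipy is already among this wheel's requirements); a minimal sketch:

```python
import numpy as np
from scipy.stats import norm

from imspy_core.utility import gaussian

x = np.linspace(-3.0, 3.0, 7)
# Both sides evaluate the standard normal density N(0, 1).
assert np.allclose(gaussian(x, 0.0, 1.0), norm.pdf(x, loc=0.0, scale=1.0))
```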
+
+
+ @numba.jit(nopython=True)
+ def exp_distribution(x, λ: float = 1):
+     """
+     Exponential distribution density.
+     :param x: Value at which to evaluate the density.
+     :param λ: Rate parameter.
+     :return: Density at x (0 for x <= 0).
+     """
+     if x > 0:
+         return λ * np.exp(-λ * x)
+     return 0.0
+
+
+ @numba.jit(nopython=True)
+ def exp_gaussian(x, μ: float = -3, σ: float = 1, λ: float = .25):
+     """
+     Exponentially modified Gaussian (EMG) density: a Gaussian convolved
+     with an exponential decay.
+     :param x: Value at which to evaluate the density.
+     :param μ: Mean of the Gaussian component.
+     :param σ: Standard deviation of the Gaussian component.
+     :param λ: Rate of the exponential component.
+     :return: Density at x.
+     """
+     A = λ / 2 * np.exp(λ / 2 * (2 * μ + λ * np.power(σ, 2) - 2 * x))
+     B = math.erfc((μ + λ * np.power(σ, 2) - x) / (np.sqrt(2) * σ))
+     return A * B
+
+
+ @numba.jit(nopython=True)
+ def linear_map(value, old_min, old_max, new_min=0.0, new_max=60.0):
+     """
+     Linear mapping from one domain to another.
+
+     Args:
+         value: The value to map.
+         old_min: Minimum of the original range.
+         old_max: Maximum of the original range.
+         new_min: Minimum of the target range (default 0.0).
+         new_max: Maximum of the target range (default 60.0).
+
+     Returns:
+         The mapped value in the new range.
+     """
+     scale = (new_max - new_min) / (old_max - old_min)
+     offset = new_min - old_min * scale
+     return value * scale + offset
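The mapping is the usual affine rescale, `value * scale + offset` with `scale = (new_max - new_min) / (old_max - old_min)`. For example:

```python
from imspy_core.utility import linear_map

# Map 25 from [0, 100] into the default [0, 60] target range:
# scale = 60 / 100 = 0.6, offset = 0, so the result is 15.0.
print(linear_map(25.0, 0.0, 100.0))  # 15.0
```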
+
+
+ class NormalDistribution:
+     """Callable wrapper around `gaussian` with fixed μ and σ."""
+
+     def __init__(self, μ: float, σ: float):
+         self.μ = μ
+         self.σ = σ
+
+     def __call__(self, x):
+         return gaussian(x, self.μ, self.σ)
+
+
+ class ExponentialGaussianDistribution:
+     """Callable wrapper around `exp_gaussian` with fixed μ, σ and λ."""
+
+     def __init__(self, μ: float = -3, σ: float = 1, λ: float = .25):
+         self.μ = μ
+         self.σ = σ
+         self.λ = λ
+
+     def __call__(self, x):
+         return exp_gaussian(x, self.μ, self.σ, self.λ)
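The two wrappers turn the densities into simple callables. One caveat worth noting: `gaussian` accepts arrays, while `exp_gaussian` relies on the scalar `math.erfc`, so the EMG wrapper is evaluated point by point in this sketch:

```python
import numpy as np

from imspy_core.utility import NormalDistribution, ExponentialGaussianDistribution

x = np.linspace(-10.0, 10.0, 5)

peak = NormalDistribution(μ=0.0, σ=1.5)
tailed = ExponentialGaussianDistribution(μ=-3.0, σ=1.0, λ=0.25)

print(peak(x))                             # symmetric profile, vectorized
print(np.array([tailed(xi) for xi in x]))  # right-tailed profile, per point
```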
+
+
+ def _from_jsons(jsons: str):
+     return json.loads(jsons)
+
+
+ class TokenSequence:
+     """A tokenized sequence that can be serialized to and from JSON."""
+
+     def __init__(self, sequence_tokenized: Optional[List[str]] = None, jsons: Optional[str] = None):
+         if jsons is not None:
+             self.sequence_tokenized = _from_jsons(jsons)
+             self._jsons = jsons
+         else:
+             self.sequence_tokenized = sequence_tokenized
+             self._jsons = self._to_jsons()
+
+     def _to_jsons(self):
+         return json.dumps(self.sequence_tokenized)
+
+     @property
+     def jsons(self):
+         return self._jsons
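`TokenSequence` round-trips through its `jsons` property; a minimal sketch:

```python
from imspy_core.utility import TokenSequence

ts = TokenSequence(sequence_tokenized=['<START>', 'P', 'E', 'P', '<END>'])
restored = TokenSequence(jsons=ts.jsons)

assert restored.sequence_tokenized == ts.sequence_tokenized
```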
+
+
+ def is_unimod_start(char: str):
+     """
+     Test whether char opens a UNIMOD bracket.
+
+     :param char: Character of a ProForma formatted aa sequence
+     :type char: str
+     :return: Whether char is the start of a UNIMOD bracket
+     :rtype: bool
+     """
+     return char in ("(", "[", "{")
+
+
+ def is_unimod_end(char: str):
+     """
+     Test whether char closes a UNIMOD bracket.
+
+     :param char: Character of a ProForma formatted aa sequence
+     :type char: str
+     :return: Whether char is the end of a UNIMOD bracket
+     :rtype: bool
+     """
+     return char in (")", "]", "}")
+
+
+ def tokenize_proforma_sequence(sequence: str):
+     """
+     Tokenize a ProForma formatted sequence string.
+
+     :param sequence: Sequence string (ProForma formatted)
+     :type sequence: str
+     :return: List of tokens
+     :rtype: List
+     """
+     sequence = sequence.upper().replace("(", "[").replace(")", "]")
+     token_list = ["<START>"]
+     in_unimod_bracket = False
+     tmp_token = ""
+
+     for aa in sequence:
+         if is_unimod_start(aa):
+             in_unimod_bracket = True
+         if in_unimod_bracket:
+             if is_unimod_end(aa):
+                 in_unimod_bracket = False
+             tmp_token += aa
+             continue
+         # Outside a bracket: flush the previous token and start a new one.
+         if tmp_token != "":
+             token_list.append(tmp_token)
+             tmp_token = ""
+         tmp_token += aa
+
+     if tmp_token != "":
+         token_list.append(tmp_token)
+
+     # Fuse a leading [UNIMOD:1] (N-terminal modification) into the <START> token.
+     if len(token_list) > 1:
+         if token_list[1].find("UNIMOD:1") != -1:
+             token_list[1] = "<START>" + token_list[1]
+             token_list = token_list[1:]
+     token_list.append("<END>")
+
+     return token_list
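Note the normalization step at the top of the function: parentheses are rewritten to square brackets before tokenizing, so both ProForma bracket styles yield the same tokens. For example:

```python
from imspy_core.utility import tokenize_proforma_sequence

# Parentheses are normalized to square brackets before tokenizing.
print(tokenize_proforma_sequence("PEPTS(UNIMOD:21)IDE"))
# ['<START>', 'P', 'E', 'P', 'T', 'S[UNIMOD:21]', 'I', 'D', 'E', '<END>']
```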
+
+
+ def get_aa_num_proforma_sequence(sequence: str):
+     """
+     Get the number of amino acids in a sequence.
+
+     :param sequence: ProForma formatted aa sequence
+     :type sequence: str
+     :return: Number of amino acids
+     :rtype: int
+     """
+     num_aa = 0
+     inside_bracket = False
+
+     for aa in sequence:
+         if is_unimod_start(aa):
+             inside_bracket = True
+         if inside_bracket:
+             if is_unimod_end(aa):
+                 inside_bracket = False
+             continue
+         num_aa += 1
+     return num_aa
+
+
+ def re_index_indices(ids):
+     """Re-index indices, i.e. replace gaps in indices with consecutive numbers.
+
+     Can be used, e.g., to re-index frame IDs from precursors for visualization.
+
+     Args:
+         ids: Indices, possibly with gaps.
+
+     Returns:
+         Zero-based indices where each value is replaced by its rank among
+         the sorted unique values, preserving input positions.
+     """
+     _, inverse = np.unique(ids, return_inverse=True)
+     return inverse
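Under the hood this is just `np.unique(..., return_inverse=True)`: gaps between IDs collapse to consecutive ranks. For example:

```python
import numpy as np

from imspy_core.utility import re_index_indices

frame_ids = np.array([17, 17, 42, 101, 42])
print(re_index_indices(frame_ids))  # [0 0 1 2 1]
```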
+
+
+ def tokenizer_to_json(tokenizer: "tf.keras.preprocessing.text.Tokenizer", path: str):
+     """
+     Save a fitted Keras tokenizer to JSON for later use.
+
+     Note:
+         Requires tensorflow to be installed.
+
+     :param tokenizer: fitted Keras tokenizer to save
+     :param path: path to save the JSON to
+     """
+     tokenizer_json = tokenizer.to_json()
+     with io.open(path, 'w', encoding='utf-8') as f:
+         f.write(json.dumps(tokenizer_json, ensure_ascii=False))
+
+
+ def tokenizer_from_json(path: str):
+     """
+     Load a pre-fitted tokenizer from a JSON file.
+
+     Note:
+         Requires tensorflow to be installed.
+
+     :param path: path to the tokenizer JSON file
+     :return: a Keras tokenizer loaded from JSON
+     """
+     try:
+         import tensorflow as tf
+     except ImportError:
+         raise ImportError(
+             "tokenizer_from_json requires tensorflow. "
+             "Install it with: pip install tensorflow"
+         )
+
+     with open(path) as f:
+         data = json.load(f)
+     return tf.keras.preprocessing.text.tokenizer_from_json(data)
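A save/load round trip for the two helpers above; a hedged sketch, assuming TensorFlow is installed separately (it is deliberately not a dependency of this wheel):

```python
from tensorflow.keras.preprocessing.text import Tokenizer

from imspy_core.utility import tokenizer_to_json, tokenizer_from_json

tokenizer = Tokenizer(char_level=True)  # character-level fits peptide strings
tokenizer.fit_on_texts(["PEPTIDE", "PEPTIDEK"])

tokenizer_to_json(tokenizer, "tokenizer.json")
restored = tokenizer_from_json("tokenizer.json")

assert restored.word_index == tokenizer.word_index
```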
@@ -0,0 +1,70 @@
+ Metadata-Version: 2.4
+ Name: imspy-core
+ Version: 0.4.0
+ Summary: Core data structures and utilities for processing timsTOF ion mobility spectrometry data.
+ License-Expression: MIT
+ Author: theGreatHerrLebert
+ Author-email: davidteschner@googlemail.com
+ Requires-Python: >=3.11,<3.14
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: imspy-connector (>=0.3.0)
+ Requires-Dist: mendeleev (>=0.7.0)
+ Requires-Dist: numba (>=0.53)
+ Requires-Dist: numpy (>=1.21)
+ Requires-Dist: opentims-bruker-bridge (>=1.1.0)
+ Requires-Dist: pandas (>=2.1)
+ Requires-Dist: pyarrow (>=13.0)
+ Requires-Dist: scipy (>=1.7.1)
+ Requires-Dist: tabulate (>=0.9.0)
+ Requires-Dist: toml (>=0.10.2)
+ Requires-Dist: tqdm (>=4.66)
+ Requires-Dist: zstd (>=1.5.6.1)
+ Description-Content-Type: text/markdown
+
+ # imspy-core
+
+ Core data structures and utilities for processing timsTOF ion mobility spectrometry data.
+
+ ## Installation
+
+ ```bash
+ pip install imspy-core
+ ```
+
+ ## Features
+
+ - **Data Structures**: MzSpectrum, TimsSpectrum, PeptideSequence, and more
+ - **Chemistry Utilities**: Elements, amino acids, UNIMOD modifications, CCS/mobility conversions
+ - **TimsTOF Readers**: Read DDA and DIA datasets from Bruker timsTOF instruments
+ - **Low Dependencies**: Only essential packages (numpy, pandas, scipy, numba)
+
+ ## Quick Start
+
+ ```python
+ from imspy_core.timstof import TimsDatasetDDA
+ from imspy_core.data import PeptideSequence
+
+ # Read a DDA dataset
+ dataset = TimsDatasetDDA("/path/to/data.d")
+ frame = dataset.get_tims_frame(1)
+ print(frame)
+
+ # Work with peptides
+ peptide = PeptideSequence("PEPTIDEK")
+ print(f"Mass: {peptide.mono_isotopic_mass}")
+ ```
+
+ ## Related Packages
+
+ - **imspy-predictors**: ML-based predictions (CCS, RT, intensity) - requires TensorFlow
+ - **imspy-search**: Database search functionality - requires sagepy, mokapot
+ - **imspy-simulation**: Simulation tools for timsTOF data
+ - **imspy-vis**: Visualization tools - requires Plotly, Matplotlib
+
+ ## License
+
+ MIT License - see LICENSE file for details.
+
@@ -0,0 +1,28 @@
+ imspy_core/__init__.py,sha256=JMx3P_qFJ3lPO61DtdpXs7I9Yz5c2Yysr9_j0l83IFE,2280
+ imspy_core/chemistry/__init__.py,sha256=HhCH2I_rHpv8rRgBBbhpRPAvOygchJN8NPhj86HEhQo,1464
+ imspy_core/chemistry/amino_acids.py,sha256=B2JdsY2DA2_gp_C79h2lxeu4Xuv4pUrA_PxDxXXQbDU,234
+ imspy_core/chemistry/constants.py,sha256=5zDApZnEqm8QJ_SVeRS2C91xnUleafpJA8RX3QZ202c,394
+ imspy_core/chemistry/elements.py,sha256=Fixh7eaEWexOPkMwGl8VsFqj9FfJpEhkeKDZJ-_crSw,277
+ imspy_core/chemistry/mobility.py,sha256=_xCVq1qH8dd9a2UqhEutKYNwkPmC0fTPIDjpJ_mcofM,2875
+ imspy_core/chemistry/sum_formula.py,sha256=OpRye_5yYXt5tqgVguf-38rHCCysZBDtmC4WCh3fGlE,991
+ imspy_core/chemistry/unimod.py,sha256=-eq5yKcNsbsAdpk-BdBdhUk1_XkTXzAPDh14gD9l2GA,162
+ imspy_core/chemistry/utility.py,sha256=gINZ8k8BymGhts5aRQ4SPG9pWRkvc6XFecqSEEuTFfQ,1252
+ imspy_core/core/__init__.py,sha256=opTLNd3KW9UBYvk0hD9T3fRSry2MuGh2wKxBBXJF6Dg,168
+ imspy_core/core/base.py,sha256=NrkKhw3FeK_An0Im3legHaD9sOwOZ_QSJT5zRqcYR2c,867
+ imspy_core/data/__init__.py,sha256=7FQKFn4H3SqBmJMZBniSIrewDE1-byGNr1CUcHW41JE,532
+ imspy_core/data/peptide.py,sha256=Fd4wt1A9yvdEvhoxuS03KH6KXPxTSm4pvpaOg3PcXL0,20252
+ imspy_core/data/spectrum.py,sha256=9eMKcbhQuQiitQXjOrHgJGnj0163jHtal4q20lz9Ivc,18789
+ imspy_core/timstof/__init__.py,sha256=BcTi9vvpwmqxzIvkqYzNCmHjhMhxBXo6351IsbRHjmo,852
+ imspy_core/timstof/collision.py,sha256=RUab0N_GimlkEvTU7oINycPXJo8qoNngUWQwEAiTtiI,1097
+ imspy_core/timstof/data.py,sha256=19BrElNm0YE1G5TL9C_Pq55u9Zs8M2JSQT0hqey_UR4,14564
+ imspy_core/timstof/dda.py,sha256=gV-Gz6DtsuGmfw70iBxHPo-F46TsdbLOrLsuucphzWI,13695
+ imspy_core/timstof/dia.py,sha256=BDz_3xPFs8uXK50VDwwtm_h4SijEwY0x4e5TS00wK1w,4809
+ imspy_core/timstof/frame.py,sha256=4EROm-dvwFmhQDONdWTa_Khlv_BTpNmmRIK8qNYA6Eg,20051
+ imspy_core/timstof/quadrupole.py,sha256=_9eeBNUyHAyB7rT5uJm9iwX79C4QZCiJL1FoRtdWmvU,7918
+ imspy_core/timstof/slice.py,sha256=ctq42lGd570xkdUQskKzk-ge8ZqNj5wMkE4wiLxorQU,18897
+ imspy_core/utility/__init__.py,sha256=1k_X2Vsqqu8RoZbSszm9CaW7ing1RSu-rCmOX2JbJ5Y,917
+ imspy_core/utility/sequence.py,sha256=SmvI3fGISdt4m0Z7v_jVR56WACRudRIrRWi_O1UP6Cc,1222
+ imspy_core/utility/utilities.py,sha256=n9dNyG5QRHZq4URdZalmpYvKRzb5BynzdrGz3uQcugM,6800
+ imspy_core-0.4.0.dist-info/METADATA,sha256=DIJn2FNz_KaKKAoBjsSrRz8XH1LnfrWH4ddLvIAmoeo,2133
+ imspy_core-0.4.0.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
+ imspy_core-0.4.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: poetry-core 2.3.1
+ Root-Is-Purelib: true
+ Tag: py3-none-any