spectrl 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spectrl/__init__.py +221 -0
- spectrl/cli.py +88 -0
- spectrl/codecs/__init__.py +78 -0
- spectrl/codecs/numpress.py +73 -0
- spectrl/codecs/raw.py +17 -0
- spectrl/cv.py +117 -0
- spectrl/header.py +346 -0
- spectrl/model.py +134 -0
- spectrl/mzml.py +173 -0
- spectrl/peaks.py +175 -0
- spectrl/proforma.py +23 -0
- spectrl/token.py +40 -0
- spectrl-0.1.0.dist-info/METADATA +216 -0
- spectrl-0.1.0.dist-info/RECORD +18 -0
- spectrl-0.1.0.dist-info/WHEEL +4 -0
- spectrl-0.1.0.dist-info/entry_points.txt +2 -0
- spectrl-0.1.0.dist-info/licenses/LICENSE +202 -0
- spectrl-0.1.0.dist-info/licenses/NOTICE +14 -0
spectrl/__init__.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""spectrl — Inline Spectrum URL Encoder.
|
|
2
|
+
|
|
3
|
+
Encodes a single mass spectrum into a compact, URL-safe token (spectrl1.…) so it can be
|
|
4
|
+
shared with no backend. The entire spectrum lives in the token.
|
|
5
|
+
|
|
6
|
+
Public API::
|
|
7
|
+
|
|
8
|
+
encode_spectrum(spec, *, lossless=False, max_len=None) -> str
|
|
9
|
+
decode_token(token) -> DecodedSpectrum
|
|
10
|
+
from_mzmlpy(spec, ref_groups=None) -> InlineSpectrum
|
|
11
|
+
top_n(spec, n) -> InlineSpectrum
|
|
12
|
+
to_fragment(token, base) -> str
|
|
13
|
+
to_query(token, base, param="d") -> str
|
|
14
|
+
to_data_uri(token) -> str
|
|
15
|
+
extract_token(url_or_uri) -> str
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import warnings
|
|
21
|
+
from urllib.parse import parse_qs, urlparse, urlunparse
|
|
22
|
+
|
|
23
|
+
from .cv import ARRAY_CHARGE, ARRAY_INTENSITY, ARRAY_MZ, ION_MOBILITY_ARRAY_TAILS
|
|
24
|
+
from .header import build_header, extract_descriptors, parse_header
|
|
25
|
+
from .model import DecodedSpectrum, InlineSpectrum, SpectrlCvParam
|
|
26
|
+
from .peaks import _validate_arrays, build_array_blobs, canonical_sort, compute_hash, decode_array_blobs, top_n
|
|
27
|
+
from .proforma import validate_interp
|
|
28
|
+
from .token import build_token, parse_token
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"encode_spectrum",
|
|
32
|
+
"decode_token",
|
|
33
|
+
"from_mzmlpy",
|
|
34
|
+
"top_n",
|
|
35
|
+
"to_fragment",
|
|
36
|
+
"to_query",
|
|
37
|
+
"to_data_uri",
|
|
38
|
+
"extract_token",
|
|
39
|
+
"InlineSpectrum",
|
|
40
|
+
"DecodedSpectrum",
|
|
41
|
+
"SpectrlCvParam",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
# Token length above which encode_spectrum() emits a UserWarning.
_SIZE_WARN = 8192  # bytes — warn past this
# Leading marker used by extract_token() to recognise a token inside a URL
# fragment or query value.
_MAGIC_PREFIX = "spectrl1."
# Prefix placed before the token by to_data_uri() and stripped by extract_token().
_DATA_URI_PREFIX = "data:application/vnd.spectrl;v=1,"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def encode_spectrum(
    spec: InlineSpectrum,
    *,
    lossless: bool = False,
    max_len: int | None = None,
) -> str:
    """Serialize an InlineSpectrum into a ``spectrl1.`` token string.

    The spectrum is canonically sorted and validated, its arrays are turned
    into blobs, and a header is built twice: once without a hash (the input
    to the hash computation) and once with the resulting hash embedded.

    Args:
        spec: The spectrum to encode.
        lossless: Forwarded to ``build_array_blobs``. True selects raw
            IEEE-754 + zlib (bit-exact); the default False selects lossy
            MS-Numpress, recommended for URL sharing.
        max_len: Optional upper bound on the token length. ``None`` disables
            the check.

    Returns:
        The encoded token.

    Raises:
        OverflowError: If ``max_len`` is given and the token is longer.
        ValueError: Propagated from the validation helpers (e.g. NaN/Inf in
            the arrays).

    Warns:
        UserWarning: If the token is longer than ``_SIZE_WARN``. The warning
            is issued before the ``max_len`` check, so both can fire.
    """
    ordered = canonical_sort(spec)
    _validate_arrays(ordered)

    if ordered.interp is not None:
        validate_interp(ordered.interp)

    blobs, descriptors = build_array_blobs(ordered, lossless=lossless)

    # Record each descriptor's position in the descriptor list.
    for seg_index, descriptor in enumerate(descriptors):
        descriptor["seg"] = seg_index

    # The hash covers the header as it looks without a hash field, plus the blobs.
    unhashed_header = build_header(ordered, descriptors, hash_str=None)
    digest = compute_hash(unhashed_header, blobs)

    # The header that actually ships carries the hash.
    final_header = build_header(ordered, descriptors, hash_str=digest)
    token = build_token(final_header, blobs)
    token_len = len(token)

    if token_len > _SIZE_WARN:
        warnings.warn(
            f"spectrl token length {token_len} bytes exceeds recommended maximum of {_SIZE_WARN} bytes. "
            "Consider using top_n() to reduce peak count, or fall back to a USI reference.",
            UserWarning,
            stacklevel=2,
        )

    if max_len is not None and token_len > max_len:
        raise OverflowError(
            f"Encoded spectrl token is {token_len} bytes, which exceeds max_len={max_len}. "
            "Use top_n(spec, n) to reduce peak count before encoding, "
            "or use a USI reference for repository-resident spectra."
        )

    return token
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def decode_token(token: str) -> DecodedSpectrum:
    """Parse a ``spectrl1.`` token back into a DecodedSpectrum.

    When the header carries a hash it is checked before any array is decoded.

    Args:
        token: A ``spectrl1.`` token string.

    Returns:
        The object produced by ``parse_header``, with ``mz``, ``intensity``,
        ``charge`` and, when an ion-mobility array is present,
        ``ion_mobility`` / ``ion_mobility_type`` filled in.

    Raises:
        ValueError: On hash mismatch; also propagated from the parsing
            helpers (e.g. bad magic/version).
    """
    header_bytes, blobs = parse_token(token)
    decoded = parse_header(header_bytes)

    # Integrity check first, so corruption is reported before array decoding.
    # The hash-less header is obtained by slicing key 9 out of the received
    # bytes rather than re-encoding, so no canonical msgpack form is required.
    stored_hash = decoded.hash
    if stored_hash is not None:
        from .header import strip_top_key

        computed = compute_hash(strip_top_key(header_bytes, 9), blobs)
        if computed != stored_hash:
            raise ValueError(
                f"spectrl token hash mismatch: stored={stored_hash!r}, computed={computed!r}. Token may be corrupted."
            )

    arrays = decode_array_blobs(extract_descriptors(header_bytes), blobs)

    decoded.mz = arrays.get(ARRAY_MZ)
    decoded.intensity = arrays.get(ARRAY_INTENSITY)
    decoded.charge = arrays.get(ARRAY_CHARGE)

    # Only the first array whose tail is a known ion-mobility tail is kept.
    mobility_tails = ION_MOBILITY_ARRAY_TAILS.values()
    for tail, values in arrays.items():
        if tail not in mobility_tails:
            continue
        decoded.ion_mobility = values
        from .cv import decode_tail

        decoded.ion_mobility_type = decode_tail(tail)
        break

    return decoded
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def from_mzmlpy(spec, ref_groups: dict | None = None) -> InlineSpectrum:
    """Turn a mzmlpy Spectrum into an InlineSpectrum.

    This is a thin public entry point; the conversion itself lives in
    ``spectrl.mzml.from_mzmlpy``.

    Args:
        spec: A mzmlpy.spectra.Spectrum.
        ref_groups: Optional dict mapping group id → mzmlpy _ParamGroup, used
            to expand referenceableParamGroupRef elements. Build it as
            ``{g.id: g for g in mzml.referenceable_param_groups}``.

    Returns:
        InlineSpectrum ready for encoding.
    """
    # The bridge module is imported at call time, not at package import time.
    from . import mzml as _mzml_bridge

    return _mzml_bridge.from_mzmlpy(spec, ref_groups=ref_groups)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ─── URL binding helpers ─────────────────────────────────────────────────────
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def to_fragment(token: str, base: str) -> str:
    """Wrap a token as a URL fragment: ``base#token``.

    The fragment is never sent to the server, avoiding length limits and access logs.

    Any fragment already present on ``base`` (including a bare trailing ``#``)
    is replaced, so the result always has the token as its only fragment and
    can be read back with ``extract_token``.

    Args:
        token: The token to attach.
        base: The URL to attach it to.

    Returns:
        ``base`` without its fragment, followed by ``#`` and the token.
    """
    # A URL has a single fragment. Appending to an existing one would give
    # 'base#old#token', whose fragment ('old#token') no longer starts with the
    # token prefix.
    without_fragment = base.split("#", 1)[0]
    return f"{without_fragment}#{token}"
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def to_query(token: str, base: str, param: str = "d") -> str:
    """Wrap a token as a URL query parameter: ``base?param=token``.

    Query parameters already present on ``base`` are preserved. An existing
    ``param`` entry is replaced so the result carries exactly one token, and
    the new entry is appended last.

    Args:
        token: The token to attach. It is inserted verbatim (not percent-encoded).
        base: The URL to attach it to.
        param: Name of the query parameter that carries the token.

    Returns:
        The URL with the token added to its query string.
    """
    parsed = urlparse(base)
    # Work on the raw 'k=v' pieces so unrelated parameters are passed through
    # byte-for-byte instead of being decoded and re-encoded.
    kept = [
        piece
        for piece in parsed.query.split("&")
        if piece and piece.split("=", 1)[0] != param
    ]
    kept.append(f"{param}={token}")
    return urlunparse(parsed._replace(query="&".join(kept)))
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def to_data_uri(token: str) -> str:
    """Wrap a token in a ``data:application/vnd.spectrl;v=1,`` URI.

    The result is ``_DATA_URI_PREFIX`` followed by the token, unmodified.
    """
    return f"{_DATA_URI_PREFIX}{token}"
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def extract_token(url_or_uri: str) -> str:
    """Pull a spectrl1 token out of a data: URI, URL fragment, or query string.

    The three locations are tried in that order:

    1. If the input starts with the spectrl data-URI prefix, everything after
       the prefix is returned as-is.
    2. If the URL fragment starts with ``spectrl1.``, the fragment is returned.
    3. Otherwise the first query-string value starting with ``spectrl1.`` is
       returned, whatever its parameter name.

    Raises:
        ValueError: If none of the locations yields a token.
    """
    if url_or_uri.startswith(_DATA_URI_PREFIX):
        prefix_len = len(_DATA_URI_PREFIX)
        return url_or_uri[prefix_len:]

    parts = urlparse(url_or_uri)

    fragment = parts.fragment
    if fragment.startswith(_MAGIC_PREFIX):
        return fragment

    # Scan every value of every query parameter for the token prefix.
    query_hits = (
        value
        for values in parse_qs(parts.query).values()
        for value in values
        if value.startswith(_MAGIC_PREFIX)
    )
    found = next(query_hits, None)
    if found is not None:
        return found

    raise ValueError(f"No spectrl1 token found in: {url_or_uri!r}")
|
spectrl/cli.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""CLI for spectrl: encode, decode, and inspect tokens."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _encode_cmd(args: argparse.Namespace) -> None:
    """Handle ``spectrl encode``: read a spectrum JSON document, print its token.

    The JSON document must have ``"mz"`` and ``"intensity"`` lists; ``"id"``
    and ``"params"`` (a list of keyword dicts for ``SpectrlCvParam``) are
    optional.

    Args:
        args: Parsed CLI arguments. Reads ``input`` (a path, or ``"-"`` for
            stdin), ``lossless`` and ``max_len``.
    """
    import numpy as np

    from . import encode_spectrum
    from .model import InlineSpectrum, SpectrlCvParam

    if args.input == "-":
        data = json.load(sys.stdin)
    else:
        # Context manager so the file handle is closed even if parsing fails.
        with open(args.input) as handle:
            data = json.load(handle)

    mz = np.array(data["mz"], dtype=np.float64)
    intensity = np.array(data["intensity"], dtype=np.float64)

    params = [SpectrlCvParam(**p) for p in data.get("params", [])]
    spec = InlineSpectrum(
        default_array_length=len(mz),
        mz=mz,
        intensity=intensity,
        id=data.get("id"),
        params=params,
    )
    token = encode_spectrum(spec, lossless=args.lossless, max_len=args.max_len)
    print(token)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _decode_cmd(args: argparse.Namespace) -> None:
    """Handle ``spectrl decode``: read a token and print the decoded spectrum as JSON.

    Args:
        args: Parsed CLI arguments. Reads ``input`` (a path to a file holding
            the token, or ``"-"`` for stdin). Surrounding whitespace is
            stripped from the token before decoding.
    """
    from . import decode_token

    if args.input == "-":
        raw_text = sys.stdin.read()
    else:
        # Context manager so the file handle is closed deterministically.
        with open(args.input) as handle:
            raw_text = handle.read()
    token = raw_text.strip()

    decoded = decode_token(token)
    out: dict = {
        "id": decoded.id,
        "default_array_length": decoded.default_array_length,
        "mz": decoded.mz.tolist() if decoded.mz is not None else None,
        "intensity": decoded.intensity.tolist() if decoded.intensity is not None else None,
        "charge": decoded.charge.tolist() if decoded.charge is not None else None,
        "hash": decoded.hash,
        "interp": decoded.interp,
    }
    print(json.dumps(out, indent=2))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _inspect_cmd(args: argparse.Namespace) -> None:
    """Handle ``spectrl inspect``: print segment sizes and the decoded header.

    Unlike ``decode`` this does not decode the arrays or verify the hash; it
    only splits the token and unpacks the msgpack header.

    Args:
        args: Parsed CLI arguments. Reads ``input`` (a path to a file holding
            the token, or ``"-"`` for stdin).
    """
    import msgpack

    from .token import parse_token

    if args.input == "-":
        raw_text = sys.stdin.read()
    else:
        # Context manager so the file handle is closed deterministically.
        with open(args.input) as handle:
            raw_text = handle.read()
    token = raw_text.strip()

    header_bytes, blobs = parse_token(token)
    # NOTE(review): the header appears to use integer map keys (decode_token
    # strips top-level key 9). msgpack >= 1.0 rejects non-str/bytes keys unless
    # strict_map_key is disabled; disabling it is harmless for str keys.
    h = msgpack.unpackb(header_bytes, raw=False, strict_map_key=False)
    print(f"Segments: 1 header + {len(blobs)} array(s)")
    print(f"Header size: {len(header_bytes)} bytes")
    for i, blob in enumerate(blobs):
        print(f"Array {i} size: {len(blob)} bytes")
    print("Header (decoded):")
    # default=str stringifies values json cannot serialize natively.
    print(json.dumps(h, indent=2, default=str))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def main(argv: list[str] | None = None) -> None:
    """Entry point of the ``spectrl`` command-line tool.

    Builds the argument parser with the ``encode``, ``decode`` and ``inspect``
    sub-commands and dispatches to the matching handler.

    Args:
        argv: Argument list to parse, without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, which is the
            behaviour of the console-script entry point; passing a list
            allows the CLI to be driven programmatically.
    """
    parser = argparse.ArgumentParser(prog="spectrl", description="mzx inline spectrum encoder/decoder")
    sub = parser.add_subparsers(dest="cmd", required=True)

    enc = sub.add_parser("encode", help="Encode a spectrum JSON to a spectrl1 token")
    enc.add_argument("input", nargs="?", default="-", help="Input JSON file or '-' for stdin")
    enc.add_argument("--lossless", action="store_true", help="Use lossless IEEE-754 + zlib encoding")
    enc.add_argument("--max-len", type=int, default=None, help="Maximum token length in bytes")
    enc.set_defaults(func=_encode_cmd)

    dec = sub.add_parser("decode", help="Decode a spectrl1 token to JSON")
    dec.add_argument("input", nargs="?", default="-", help="Token file or '-' for stdin")
    dec.set_defaults(func=_decode_cmd)

    ins = sub.add_parser("inspect", help="Inspect a spectrl1 token header as readable JSON")
    ins.add_argument("input", nargs="?", default="-", help="Token file or '-' for stdin")
    ins.set_defaults(func=_inspect_cmd)

    args = parser.parse_args(argv)
    # Each sub-parser stored its handler under ``func`` via set_defaults().
    args.func(args)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
if __name__ == "__main__":
|
|
88
|
+
main()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Codec registry keyed by compression CV accession tail integer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from ..cv import (
|
|
10
|
+
COMP_NUMLIN_ZLIB,
|
|
11
|
+
COMP_NUMPIC_ZLIB,
|
|
12
|
+
COMP_NUMSLOF_ZLIB,
|
|
13
|
+
COMP_ZLIB,
|
|
14
|
+
)
|
|
15
|
+
from .numpress import (
|
|
16
|
+
decode_numlin_zlib,
|
|
17
|
+
decode_numpic_zlib,
|
|
18
|
+
decode_numslof_zlib,
|
|
19
|
+
encode_numlin_zlib,
|
|
20
|
+
encode_numpic_zlib,
|
|
21
|
+
encode_numslof_zlib,
|
|
22
|
+
)
|
|
23
|
+
from .raw import decode_zlib_raw, encode_zlib_raw
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Codec(Protocol):
    """Structural interface of the codec objects stored in ``_REGISTRY``."""

    # Turn ``data`` into a compressed blob. ``fp`` is an optional numeric
    # parameter; some codecs in this module ignore it.
    def encode(self, data: np.ndarray, fp: float | None) -> bytes: ...
    # Turn a blob back into a numpy array.
    def decode(self, blob: bytes) -> np.ndarray: ...
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class _NumLinZlibCodec:
    """Codec adapter delegating to the MS-Numpress linear + zlib functions."""

    def encode(self, data: np.ndarray, fp: float | None) -> bytes:
        """Compress ``data`` with ``encode_numlin_zlib``, forwarding ``fp``."""
        blob = encode_numlin_zlib(data, fp)
        return blob

    def decode(self, blob: bytes) -> np.ndarray:
        """Decompress ``blob`` with ``decode_numlin_zlib``."""
        values = decode_numlin_zlib(blob)
        return values
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class _NumSlofZlibCodec:
    """Codec adapter delegating to the MS-Numpress slof + zlib functions."""

    def encode(self, data: np.ndarray, fp: float | None) -> bytes:
        """Compress ``data`` with ``encode_numslof_zlib``, forwarding ``fp``."""
        blob = encode_numslof_zlib(data, fp)
        return blob

    def decode(self, blob: bytes) -> np.ndarray:
        """Decompress ``blob`` with ``decode_numslof_zlib``."""
        values = decode_numslof_zlib(blob)
        return values
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class _NumPicZlibCodec:
    """Codec adapter delegating to the MS-Numpress pic + zlib functions."""

    def encode(self, data: np.ndarray, fp: float | None) -> bytes:
        """Compress ``data`` with ``encode_numpic_zlib``, forwarding ``fp``."""
        blob = encode_numpic_zlib(data, fp)
        return blob

    def decode(self, blob: bytes) -> np.ndarray:
        """Decompress ``blob`` with ``decode_numpic_zlib``."""
        values = decode_numpic_zlib(blob)
        return values
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class _ZlibRawCodec:
    """Codec adapter delegating to the raw float64 + zlib functions."""

    def encode(self, data: np.ndarray, fp: float | None) -> bytes:
        """Compress ``data`` with ``encode_zlib_raw``; ``fp`` is not used."""
        blob = encode_zlib_raw(data)
        return blob

    def decode(self, blob: bytes) -> np.ndarray:
        """Decompress ``blob`` with ``decode_zlib_raw``."""
        values = decode_zlib_raw(blob)
        return values
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Compression accession tail → codec instance. One shared instance per codec;
# only the four "+ zlib"/zlib compression tails imported above are registered.
_REGISTRY: dict[int, Codec] = {
    COMP_NUMLIN_ZLIB: _NumLinZlibCodec(),
    COMP_NUMSLOF_ZLIB: _NumSlofZlibCodec(),
    COMP_NUMPIC_ZLIB: _NumPicZlibCodec(),
    COMP_ZLIB: _ZlibRawCodec(),
}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_codec(comp_tail: int) -> Codec:
    """Look up the codec registered for a compression accession tail.

    Args:
        comp_tail: Integer tail of the compression CV accession.

    Returns:
        The codec instance stored in ``_REGISTRY`` for that tail.

    Raises:
        KeyError: If no codec is registered for ``comp_tail``.
    """
    # Registry values are codec instances, never None, so None means "absent".
    codec = _REGISTRY.get(comp_tail)
    if codec is None:
        raise KeyError(f"No codec registered for compression tail {comp_tail}.")
    return codec
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""MS-Numpress + zlib codec wrappers over pynumpress."""
|
|
2
|
+
|
|
3
|
+
import struct
|
|
4
|
+
import zlib
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pynumpress
|
|
8
|
+
|
|
9
|
+
DEFAULT_NUMLIN_FP = 100000.0 # ~0.1 mDa precision for m/z
|
|
10
|
+
# SLOF fp must satisfy: log(max_intensity + 1) * fp <= 65535 (uint16 max)
|
|
11
|
+
# Use 3600.0 which handles intensities up to ~8e7; clip to safe value if data is larger.
|
|
12
|
+
_SLOF_UINT16_MAX = 65535.0
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _safe_slof_fp(data: np.ndarray, desired_fp: float) -> float:
    """Cap a slof fixed point so the array's largest value fits in uint16.

    The cap is ``_SLOF_UINT16_MAX / log(max_value + 1)`` (with a tiny epsilon
    added to the denominator), where ``max_value`` is the array maximum
    clamped to at least 1.0; an empty array is treated as having maximum 1.0.

    Args:
        data: Values that are about to be slof-encoded.
        desired_fp: The fixed point the caller would like to use.

    Returns:
        ``desired_fp`` when it is within the cap, otherwise the cap.
    """
    import math

    largest = float(np.max(data)) if len(data) > 0 else 1.0
    if largest < 1.0:
        largest = 1.0

    ceiling = _SLOF_UINT16_MAX / (math.log(largest + 1) + 1e-9)
    return ceiling if ceiling < desired_fp else desired_fp
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
DEFAULT_NUMSLOF_FP = 3600.0 # handles intensities up to ~8e7; adjusted dynamically if needed
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def encode_numlin_zlib(data: np.ndarray, fp: float | None = None) -> bytes:
    """Compress an array with MS-Numpress linear prediction, then zlib.

    Args:
        data: Values to encode; cast to float64 before encoding.
        fp: Fixed point passed to ``pynumpress.encode_linear``. ``None``
            selects ``DEFAULT_NUMLIN_FP``.

    Returns:
        The zlib-compressed Numpress bytes.
    """
    fixed_point = DEFAULT_NUMLIN_FP if fp is None else fp
    as_float64 = data.astype(np.float64)
    numpress_bytes = pynumpress.encode_linear(as_float64, fixed_point).tobytes()
    return zlib.compress(numpress_bytes)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def decode_numlin_zlib(blob: bytes) -> np.ndarray:
    """Inflate a zlib blob and decode its MS-Numpress linear payload to float64.

    Payloads of 12 to 15 bytes are decoded here rather than by pynumpress:
    pynumpress 0.0.9 cannot decode a single-value linear payload (8-byte fixed
    point followed by one 4-byte integer) even though ``encode_linear`` emits
    exactly that. Handling it directly keeps single-peak spectra
    round-trippable. Every other payload size is handed to
    ``pynumpress.decode_linear``.

    Args:
        blob: zlib-compressed Numpress linear bytes.

    Returns:
        The decoded values as a float64 array.
    """
    payload = zlib.decompress(blob)
    size = len(payload)

    if size < 12 or size >= 16:
        as_uint8 = np.frombuffer(payload, dtype=np.uint8)
        return np.array(pynumpress.decode_linear(as_uint8), dtype=np.float64)

    # Single-value payload: big-endian double fixed point, then the first
    # value as an unsigned little-endian 32-bit integer scaled by it.
    (fixed_point,) = struct.unpack_from(">d", payload, 0)
    (first,) = struct.unpack_from("<I", payload, 8)
    return np.array([first / fixed_point], dtype=np.float64)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def encode_numslof_zlib(data: np.ndarray, fp: float | None = None) -> bytes:
    """Compress an array with MS-Numpress short logged float, then zlib.

    Args:
        data: Values to encode; cast to float64 before encoding.
        fp: Requested fixed point; ``None`` selects ``DEFAULT_NUMSLOF_FP``.
            The value actually used is whatever ``_safe_slof_fp`` returns for
            this array, which may be smaller than requested.

    Returns:
        The zlib-compressed Numpress bytes.
    """
    requested_fp = DEFAULT_NUMSLOF_FP if fp is None else fp
    usable_fp = _safe_slof_fp(data, requested_fp)
    numpress_bytes = pynumpress.encode_slof(data.astype(np.float64), usable_fp).tobytes()
    return zlib.compress(numpress_bytes)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def decode_numslof_zlib(blob: bytes) -> np.ndarray:
    """Inflate a zlib blob and decode its MS-Numpress slof payload to float64.

    Args:
        blob: zlib-compressed Numpress slof bytes.

    Returns:
        The decoded values as a float64 array.
    """
    payload = zlib.decompress(blob)
    as_uint8 = np.frombuffer(payload, dtype=np.uint8)
    decoded = pynumpress.decode_slof(as_uint8)
    return np.array(decoded, dtype=np.float64)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def encode_numpic_zlib(data: np.ndarray, fp: float | None = None) -> bytes:
    """Compress an array with MS-Numpress positive integer, then zlib.

    Args:
        data: Values to encode; cast to float64 before encoding.
        fp: Accepted so the signature matches the other encoders; not used.

    Returns:
        The zlib-compressed Numpress bytes.
    """
    as_float64 = data.astype(np.float64)
    numpress_bytes = pynumpress.encode_pic(as_float64).tobytes()
    return zlib.compress(numpress_bytes)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def decode_numpic_zlib(blob: bytes) -> np.ndarray:
    """Inflate a zlib blob and decode its MS-Numpress pic payload to float64.

    Args:
        blob: zlib-compressed Numpress pic bytes.

    Returns:
        The decoded values as a float64 array.
    """
    payload = zlib.decompress(blob)
    as_uint8 = np.frombuffer(payload, dtype=np.uint8)
    decoded = pynumpress.decode_pic(as_uint8)
    return np.array(decoded, dtype=np.float64)
|
spectrl/codecs/raw.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Lossless IEEE-754 little-endian + zlib codec."""
|
|
2
|
+
|
|
3
|
+
import zlib
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def encode_zlib_raw(data: np.ndarray) -> bytes:
    """Serialize an array as little-endian float64 bytes and zlib-compress them.

    Args:
        data: Values to encode; cast to little-endian float64 first.

    Returns:
        The zlib-compressed bytes.
    """
    little_endian = data.astype(np.dtype("<f8"))
    return zlib.compress(little_endian.tobytes())
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def decode_zlib_raw(blob: bytes) -> np.ndarray:
    """Inflate a zlib blob and read it as little-endian float64 values.

    Args:
        blob: zlib-compressed little-endian float64 bytes.

    Returns:
        A new float64 array (``astype`` copies out of the read-only buffer).
    """
    payload = zlib.decompress(blob)
    view = np.frombuffer(payload, dtype=np.dtype("<f8"))
    return view.astype(np.float64)
|
spectrl/cv.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""CV accession ↔ integer-tail mapping using mzmlpy's StrEnum constants.
|
|
2
|
+
|
|
3
|
+
Rules (§3.1):
|
|
4
|
+
- Accession tails default to MS: ontology.
|
|
5
|
+
- Unit tails default to UO: ontology.
|
|
6
|
+
- Any other ontology uses an explicit [ontology_id, tail] pair.
|
|
7
|
+
|
|
8
|
+
The tail for "MS:1000511" is 1000511; for "UO:0000031" is 31.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from mzmlpy.constants import (
|
|
14
|
+
BinaryDataArrayAccession,
|
|
15
|
+
BinaryDataTypeAccession,
|
|
16
|
+
CollisionDissociationTypeAccession,
|
|
17
|
+
CompressionTypeAccessions,
|
|
18
|
+
ScanPolarity,
|
|
19
|
+
SpectrumCombinationAccession,
|
|
20
|
+
SpectrumMSAccession,
|
|
21
|
+
SpectrumType,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
_DEFAULT_PARAM_ONTOLOGY = "MS"
|
|
25
|
+
_DEFAULT_UNIT_ONTOLOGY = "UO"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def accession_tail(accession: str) -> int:
    """Return the numeric part of an accession, e.g. 'MS:1000511' → 1000511.

    The accession is split on ':' and the second piece is converted with
    ``int``, so leading zeros are dropped ('UO:0000031' → 31).
    """
    pieces = accession.split(":")
    return int(pieces[1])
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def accession_ontology(accession: str) -> str:
    """Return the ontology prefix of an accession, e.g. 'MS:1000511' → 'MS'.

    This is the text before the first ':' (the whole string if there is none).
    """
    prefix, _sep, _rest = accession.partition(":")
    return prefix
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def encode_tail(accession: str) -> int:
    """Encode an accession as its integer tail.

    The ontology prefix is discarded, so the result is only unambiguous for
    accessions in the default (MS:) ontology.
    """
    tail = accession_tail(accession)
    return tail
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def encode_unit(unit_accession: str) -> int | list:
    """Encode a unit accession compactly.

    Returns:
        The bare integer tail when the accession belongs to the default unit
        ontology (UO:), otherwise an ``[ontology, tail]`` pair.
    """
    ontology = accession_ontology(unit_accession)
    number = accession_tail(unit_accession)
    if ontology != _DEFAULT_UNIT_ONTOLOGY:
        return [ontology, number]
    return number
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def decode_tail(tail: int, ontology: str = _DEFAULT_PARAM_ONTOLOGY) -> str:
    """Rebuild an accession string from an integer tail and an ontology prefix.

    The tail is zero-padded to at least seven digits, e.g. 31 → '0000031'.

    Args:
        tail: Integer tail of the accession.
        ontology: Ontology prefix; defaults to the default parameter ontology.

    Returns:
        ``"<ontology>:<zero-padded tail>"``.
    """
    padded = format(tail, "07d")
    return f"{ontology}:{padded}"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def decode_unit_tail(tail: int | list) -> str:
    """Rebuild a unit accession string from its encoded form.

    Args:
        tail: Either a bare integer (default unit ontology, UO:) or an
            ``[ontology, tail]`` list for any other ontology.

    Returns:
        The accession, with the numeric part zero-padded to at least seven
        digits.
    """
    if not isinstance(tail, list):
        return f"{_DEFAULT_UNIT_ONTOLOGY}:{tail:07d}"
    ontology = tail[0]
    number = tail[1]
    return f"{ontology}:{number:07d}"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ─── Codec compression tails (used by codecs module) ───────────────────────
|
|
65
|
+
|
|
66
|
+
# Integer tails of the compression accessions defined by mzmlpy's
# CompressionTypeAccessions enum. The *_ZLIB names are the Numpress variants
# followed by zlib; the names without the suffix are the plain Numpress variants.
COMP_NUMLIN_ZLIB = accession_tail(CompressionTypeAccessions.MS_NUMPRESS_LINEAR_PREDICTION_ZLIB)
COMP_NUMSLOF_ZLIB = accession_tail(CompressionTypeAccessions.MS_NUMPRESS_SHORT_LOGGED_FLOAT_ZLIB)
COMP_NUMPIC_ZLIB = accession_tail(CompressionTypeAccessions.MS_NUMPRESS_POSITIVE_INTEGER_ZLIB)
COMP_NUMLIN = accession_tail(CompressionTypeAccessions.MS_NUMPRESS_LINEAR_PREDICTION)
COMP_NUMSLOF = accession_tail(CompressionTypeAccessions.MS_NUMPRESS_SHORT_LOGGED_FLOAT)
COMP_NUMPIC = accession_tail(CompressionTypeAccessions.MS_NUMPRESS_POSITIVE_INTEGER)
COMP_ZLIB = accession_tail(CompressionTypeAccessions.ZLIB_COMPRESSION)
COMP_NONE = accession_tail(CompressionTypeAccessions.NO_COMPRESSION)
|
|
74
|
+
|
|
75
|
+
# ─── Data type tails ────────────────────────────────────────────────────────
|
|
76
|
+
|
|
77
|
+
TYPE_FLOAT64 = accession_tail(BinaryDataTypeAccession.FLOAT_64)
|
|
78
|
+
TYPE_FLOAT32 = accession_tail(BinaryDataTypeAccession.FLOAT_32)
|
|
79
|
+
TYPE_INT32 = accession_tail(BinaryDataTypeAccession.INT_32)
|
|
80
|
+
TYPE_INT64 = accession_tail(BinaryDataTypeAccession.INT_64)
|
|
81
|
+
|
|
82
|
+
# ─── Array type tails ───────────────────────────────────────────────────────
|
|
83
|
+
|
|
84
|
+
ARRAY_MZ = accession_tail(BinaryDataArrayAccession.MZ)
|
|
85
|
+
ARRAY_INTENSITY = accession_tail(BinaryDataArrayAccession.INTENSITY)
|
|
86
|
+
ARRAY_CHARGE = accession_tail(BinaryDataArrayAccession.CHARGE)
|
|
87
|
+
|
|
88
|
+
# Ion mobility array tails: accession → integer tail for every ion-mobility
# array type listed below. Consumers that hold a decoded tail test membership
# against the dict's values.
ION_MOBILITY_ARRAY_TAILS: dict[str, int] = {
    acc: accession_tail(acc)
    for acc in (
        BinaryDataArrayAccession.RAW_ION_MOBILITY,
        BinaryDataArrayAccession.MEAN_ION_MOBILITY_DRIFT_TIME,
        BinaryDataArrayAccession.DECONVOLUTED_ION_MOBILITY_DRIFT_TIME,
        BinaryDataArrayAccession.MEAN_INVERSE_REDUCED_ION_MOBILITY,
        BinaryDataArrayAccession.MEAN_ION_MOBILITY,
        BinaryDataArrayAccession.DECONVOLUTED_INVERSE_REDUCED_ION_MOBILITY,
        BinaryDataArrayAccession.RAW_ION_MOBILITY_DRIFT_TIME,
        BinaryDataArrayAccession.RAW_INVERSE_REDUCED_ION_MOBILITY,
        BinaryDataArrayAccession.ION_MOBILITY,
    )
}
|
|
103
|
+
|
|
104
|
+
# ─── Known accession registry for validation/tests ──────────────────────────
|
|
105
|
+
|
|
106
|
+
# Union of the string values of every member of the mzmlpy enums listed below.
# Populated once at import time; the loop variable ``_enum`` stays bound at
# module level afterwards.
ALL_MZX_ACCESSIONS: set[str] = set()
for _enum in (
    BinaryDataArrayAccession,
    BinaryDataTypeAccession,
    CompressionTypeAccessions,
    ScanPolarity,
    SpectrumCombinationAccession,
    SpectrumMSAccession,
    SpectrumType,
    CollisionDissociationTypeAccession,
):
    ALL_MZX_ACCESSIONS.update(str(v) for v in _enum)
|