opentfraw 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opentfraw-1.0.0/.github/workflows/ci.yml +101 -0
- opentfraw-1.0.0/.github/workflows/publish.yml +85 -0
- opentfraw-1.0.0/.gitignore +33 -0
- opentfraw-1.0.0/ATTRIBUTION.md +38 -0
- opentfraw-1.0.0/CHANGELOG.md +64 -0
- opentfraw-1.0.0/CORPUS.md +188 -0
- opentfraw-1.0.0/Cargo.lock +65 -0
- opentfraw-1.0.0/Cargo.toml +33 -0
- opentfraw-1.0.0/LICENSE +171 -0
- opentfraw-1.0.0/PKG-INFO +58 -0
- opentfraw-1.0.0/README.md +64 -0
- opentfraw-1.0.0/docs/.gitignore +6 -0
- opentfraw-1.0.0/docs/bun.lock +2802 -0
- opentfraw-1.0.0/docs/docs/changelog.md +23 -0
- opentfraw-1.0.0/docs/docs/format/00-overview.md +172 -0
- opentfraw-1.0.0/docs/docs/format/01-file-layout.md +139 -0
- opentfraw-1.0.0/docs/docs/format/02-sample-and-sequence.md +87 -0
- opentfraw-1.0.0/docs/docs/format/03-raw-file-info.md +143 -0
- opentfraw-1.0.0/docs/docs/format/04-run-header.md +102 -0
- opentfraw-1.0.0/docs/docs/format/05-scan-index-and-data.md +195 -0
- opentfraw-1.0.0/docs/docs/format/06-scan-event.md +167 -0
- opentfraw-1.0.0/docs/docs/format/07-scan-parameters.md +135 -0
- opentfraw-1.0.0/docs/docs/format/08-logs.md +35 -0
- opentfraw-1.0.0/docs/docs/format/09-enumerations.md +78 -0
- opentfraw-1.0.0/docs/docs/format/10-frequency-to-mz.md +56 -0
- opentfraw-1.0.0/docs/docs/format/11-references.md +39 -0
- opentfraw-1.0.0/docs/docs/guide/instrument-families.md +27 -0
- opentfraw-1.0.0/docs/docs/guide/mzml-export.md +27 -0
- opentfraw-1.0.0/docs/docs/guide/reader.md +45 -0
- opentfraw-1.0.0/docs/docs/guide/scan-data.md +27 -0
- opentfraw-1.0.0/docs/docs/install.md +70 -0
- opentfraw-1.0.0/docs/docs/intro.md +50 -0
- opentfraw-1.0.0/docs/docs/license.md +25 -0
- opentfraw-1.0.0/docs/docs/quickstart.md +57 -0
- opentfraw-1.0.0/docs/docusaurus.config.ts +132 -0
- opentfraw-1.0.0/docs/package.json +38 -0
- opentfraw-1.0.0/docs/sidebars.ts +43 -0
- opentfraw-1.0.0/docs/src/css/custom.css +206 -0
- opentfraw-1.0.0/docs/static/.nojekyll +0 -0
- opentfraw-1.0.0/docs/static/img/favicon.ico +0 -0
- opentfraw-1.0.0/docs/static/img/logo.svg +38 -0
- opentfraw-1.0.0/docs/tsconfig.json +5 -0
- opentfraw-1.0.0/docs/wrangler.jsonc +17 -0
- opentfraw-1.0.0/examples/dump.rs +336 -0
- opentfraw-1.0.0/examples/to_mzml.rs +108 -0
- opentfraw-1.0.0/pyproject.toml +24 -0
- opentfraw-1.0.0/python/.gitignore +15 -0
- opentfraw-1.0.0/python/Cargo.lock +297 -0
- opentfraw-1.0.0/python/Cargo.toml +23 -0
- opentfraw-1.0.0/python/README.md +43 -0
- opentfraw-1.0.0/python/src/lib.rs +290 -0
- opentfraw-1.0.0/scripts/fetch_corpus.py +353 -0
- opentfraw-1.0.0/scripts/sources.json +442 -0
- opentfraw-1.0.0/scripts/validate_mzml.py +239 -0
- opentfraw-1.0.0/src/audit_tag.rs +32 -0
- opentfraw-1.0.0/src/device.rs +408 -0
- opentfraw-1.0.0/src/error.rs +30 -0
- opentfraw-1.0.0/src/error_log.rs +18 -0
- opentfraw-1.0.0/src/generic_data.rs +290 -0
- opentfraw-1.0.0/src/header.rs +64 -0
- opentfraw-1.0.0/src/lib.rs +26 -0
- opentfraw-1.0.0/src/mzml.rs +1204 -0
- opentfraw-1.0.0/src/raw_file_info.rs +167 -0
- opentfraw-1.0.0/src/reader.rs +1570 -0
- opentfraw-1.0.0/src/run_header.rs +134 -0
- opentfraw-1.0.0/src/sample_info.rs +82 -0
- opentfraw-1.0.0/src/scan_data.rs +435 -0
- opentfraw-1.0.0/src/scan_event.rs +397 -0
- opentfraw-1.0.0/src/scan_filter.rs +425 -0
- opentfraw-1.0.0/src/scan_format.rs +70 -0
- opentfraw-1.0.0/src/scan_index.rs +64 -0
- opentfraw-1.0.0/src/seq_row.rs +95 -0
- opentfraw-1.0.0/src/types.rs +330 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# CI for OpenTFRaw.
|
|
2
|
+
#
|
|
3
|
+
# Jobs:
|
|
4
|
+
# build - cargo fmt --check, cargo clippy -D warnings, cargo test
|
|
5
|
+
# (Linux + macOS on GitHub-hosted runners)
|
|
6
|
+
# validate-mzml - download a small public Thermo RAW file, convert to mzML,
|
|
7
|
+
# and run scripts/validate_mzml.py to check structural
|
|
8
|
+
# conformance.
|
|
9
|
+
|
|
10
|
+
name: CI
|
|
11
|
+
|
|
12
|
+
on:
|
|
13
|
+
push:
|
|
14
|
+
branches: ["main"]
|
|
15
|
+
pull_request:
|
|
16
|
+
branches: ["main"]
|
|
17
|
+
|
|
18
|
+
env:
|
|
19
|
+
CARGO_TERM_COLOR: always
|
|
20
|
+
RUSTFLAGS: -D warnings
|
|
21
|
+
# Small LTQ FT file (~15 MB) from PRIDE PXD054004, used for the mzML
|
|
22
|
+
# validation job. URL is stable because PRIDE uses year/month archive paths.
|
|
23
|
+
PRIDE_RAW_URL: >-
|
|
24
|
+
https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/05/PXD054004/20171113_Map_NS1_1to139_4deg_50uM_001.raw
|
|
25
|
+
PRIDE_RAW_NAME: test.raw
|
|
26
|
+
|
|
27
|
+
jobs:
|
|
28
|
+
build:
|
|
29
|
+
name: Build and test (${{ matrix.os }})
|
|
30
|
+
runs-on: ${{ matrix.os }}
|
|
31
|
+
strategy:
|
|
32
|
+
fail-fast: false
|
|
33
|
+
matrix:
|
|
34
|
+
os:
|
|
35
|
+
- ubuntu-latest
|
|
36
|
+
- macos-latest
|
|
37
|
+
steps:
|
|
38
|
+
- uses: actions/checkout@v4
|
|
39
|
+
|
|
40
|
+
- name: Install Rust toolchain
|
|
41
|
+
uses: dtolnay/rust-toolchain@stable
|
|
42
|
+
with:
|
|
43
|
+
toolchain: "1.75"
|
|
44
|
+
components: rustfmt, clippy
|
|
45
|
+
|
|
46
|
+
- name: Cache cargo
|
|
47
|
+
uses: Swatinem/rust-cache@v2
|
|
48
|
+
|
|
49
|
+
- name: cargo fmt
|
|
50
|
+
run: cargo fmt --all -- --check
|
|
51
|
+
|
|
52
|
+
- name: cargo clippy
|
|
53
|
+
run: cargo clippy --all-targets -- -D warnings
|
|
54
|
+
|
|
55
|
+
- name: cargo test
|
|
56
|
+
run: cargo test --all-targets
|
|
57
|
+
|
|
58
|
+
validate-mzml:
|
|
59
|
+
name: mzML structural validation
|
|
60
|
+
runs-on: ubuntu-latest
|
|
61
|
+
steps:
|
|
62
|
+
- uses: actions/checkout@v4
|
|
63
|
+
|
|
64
|
+
- name: Install Rust toolchain
|
|
65
|
+
uses: dtolnay/rust-toolchain@stable
|
|
66
|
+
with:
|
|
67
|
+
toolchain: "1.75"
|
|
68
|
+
|
|
69
|
+
- name: Cache cargo
|
|
70
|
+
uses: Swatinem/rust-cache@v2
|
|
71
|
+
|
|
72
|
+
- name: Build to_mzml example (release)
|
|
73
|
+
run: cargo build --release --example to_mzml
|
|
74
|
+
|
|
75
|
+
- name: Download test RAW file
|
|
76
|
+
run: |
|
|
77
|
+
wget -q --user-agent="OpenTFRaw-CI/1.0" \
|
|
78
|
+
-O "$PRIDE_RAW_NAME" "$PRIDE_RAW_URL"
|
|
79
|
+
echo "Downloaded $(du -sh $PRIDE_RAW_NAME | cut -f1) RAW file"
|
|
80
|
+
|
|
81
|
+
- name: Convert RAW to mzML (centroid)
|
|
82
|
+
run: |
|
|
83
|
+
./target/release/examples/to_mzml "$PRIDE_RAW_NAME" test_centroid.mzML
|
|
84
|
+
echo "Centroid mzML: $(wc -l < test_centroid.mzML) lines"
|
|
85
|
+
|
|
86
|
+
- name: Convert RAW to indexed mzML (centroid)
|
|
87
|
+
run: |
|
|
88
|
+
./target/release/examples/to_mzml --indexed "$PRIDE_RAW_NAME" test_indexed.mzML
|
|
89
|
+
echo "Indexed mzML: $(wc -l < test_indexed.mzML) lines"
|
|
90
|
+
|
|
91
|
+
- name: Convert RAW to mzML (profile)
|
|
92
|
+
run: |
|
|
93
|
+
./target/release/examples/to_mzml --include-profile "$PRIDE_RAW_NAME" test_profile.mzML
|
|
94
|
+
echo "Profile mzML: $(wc -l < test_profile.mzML) lines"
|
|
95
|
+
|
|
96
|
+
- name: Validate mzML structural conformance
|
|
97
|
+
run: |
|
|
98
|
+
python3 scripts/validate_mzml.py \
|
|
99
|
+
test_centroid.mzML \
|
|
100
|
+
test_indexed.mzML \
|
|
101
|
+
test_profile.mzML
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
concurrency:
|
|
8
|
+
group: publish-${{ github.ref }}
|
|
9
|
+
cancel-in-progress: true
|
|
10
|
+
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
cargo-publish:
|
|
16
|
+
name: Publish to crates.io
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
- uses: dtolnay/rust-toolchain@stable
|
|
21
|
+
- run: cargo publish -p opentfraw
|
|
22
|
+
env:
|
|
23
|
+
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
|
24
|
+
|
|
25
|
+
build-wheels:
|
|
26
|
+
name: Build wheels (${{ matrix.os }} / ${{ matrix.target }})
|
|
27
|
+
runs-on: ${{ matrix.os }}
|
|
28
|
+
strategy:
|
|
29
|
+
fail-fast: false
|
|
30
|
+
matrix:
|
|
31
|
+
include:
|
|
32
|
+
- os: ubuntu-latest
|
|
33
|
+
target: x86_64
|
|
34
|
+
- os: ubuntu-latest
|
|
35
|
+
target: aarch64
|
|
36
|
+
- os: macos-latest
|
|
37
|
+
target: x86_64
|
|
38
|
+
- os: macos-latest
|
|
39
|
+
target: aarch64
|
|
40
|
+
- os: windows-latest
|
|
41
|
+
target: x86_64
|
|
42
|
+
steps:
|
|
43
|
+
- uses: actions/checkout@v4
|
|
44
|
+
- uses: actions/setup-python@v5
|
|
45
|
+
with:
|
|
46
|
+
python-version: "3.11"
|
|
47
|
+
- uses: PyO3/maturin-action@v1
|
|
48
|
+
with:
|
|
49
|
+
target: ${{ matrix.target }}
|
|
50
|
+
args: --release --out dist --find-interpreter --manifest-path python/Cargo.toml
|
|
51
|
+
manylinux: auto
|
|
52
|
+
- uses: actions/upload-artifact@v4
|
|
53
|
+
with:
|
|
54
|
+
name: wheels-${{ matrix.os }}-${{ matrix.target }}
|
|
55
|
+
path: dist/*.whl
|
|
56
|
+
|
|
57
|
+
build-sdist:
|
|
58
|
+
name: Build sdist
|
|
59
|
+
runs-on: ubuntu-latest
|
|
60
|
+
steps:
|
|
61
|
+
- uses: actions/checkout@v4
|
|
62
|
+
- uses: PyO3/maturin-action@v1
|
|
63
|
+
with:
|
|
64
|
+
command: sdist
|
|
65
|
+
args: --out dist --manifest-path python/Cargo.toml
|
|
66
|
+
- uses: actions/upload-artifact@v4
|
|
67
|
+
with:
|
|
68
|
+
name: sdist
|
|
69
|
+
path: dist/*.tar.gz
|
|
70
|
+
|
|
71
|
+
pypi-publish:
|
|
72
|
+
name: Publish to PyPI
|
|
73
|
+
needs: [build-wheels, build-sdist]
|
|
74
|
+
runs-on: ubuntu-latest
|
|
75
|
+
environment: pypi
|
|
76
|
+
permissions:
|
|
77
|
+
id-token: write
|
|
78
|
+
steps:
|
|
79
|
+
- uses: actions/download-artifact@v4
|
|
80
|
+
with:
|
|
81
|
+
path: dist
|
|
82
|
+
merge-multiple: true
|
|
83
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
84
|
+
with:
|
|
85
|
+
packages-dir: dist/
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Cargo build output
|
|
2
|
+
/target
|
|
3
|
+
|
|
4
|
+
# Roadmap (internal)
|
|
5
|
+
/ROADMAP.md
|
|
6
|
+
|
|
7
|
+
# Corpus sample files — pulled on demand via scripts/fetch_corpus.py,
|
|
8
|
+
# not redistributed through this repo (multi-GB PRIDE datasets)
|
|
9
|
+
/samples/
|
|
10
|
+
/corpus/
|
|
11
|
+
|
|
12
|
+
# Reverse-engineering scratch: hexdump analysers, ad-hoc probes, and
|
|
13
|
+
# early validation harnesses — kept out of the public repo
|
|
14
|
+
/analyze_*.py
|
|
15
|
+
/validate_*.py
|
|
16
|
+
/trace_*.py
|
|
17
|
+
/examples/*_survey.rs
|
|
18
|
+
/examples/gdh_hunt.rs
|
|
19
|
+
/examples/params_probe.rs
|
|
20
|
+
|
|
21
|
+
# Imported 3rd-party reference material
|
|
22
|
+
/unfinnigan_field_definitions.md
|
|
23
|
+
|
|
24
|
+
# Python bytecode (scripts/ uses plain Python)
|
|
25
|
+
__pycache__/
|
|
26
|
+
*.pyc
|
|
27
|
+
*.pyo
|
|
28
|
+
|
|
29
|
+
# Editor / tooling
|
|
30
|
+
.vscode/
|
|
31
|
+
.idea/
|
|
32
|
+
*.swp
|
|
33
|
+
.DS_Store
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Credits
|
|
2
|
+
|
|
3
|
+
## Prior art
|
|
4
|
+
|
|
5
|
+
### unfinnigan
|
|
6
|
+
|
|
7
|
+
Gene Selkov, 2010-2012. Perl and Python reverse-engineering of the Thermo RAW binary format.
|
|
8
|
+
The most thorough prior independent analysis of the format, covering versions 57, 62, 63, 64,
|
|
9
|
+
and 66. Field names and layout notes from unfinnigan were cross-referenced when validating
|
|
10
|
+
field offsets.
|
|
11
|
+
|
|
12
|
+
Source: https://github.com/prvst/unfinnigan
|
|
13
|
+
|
|
14
|
+
## Standards
|
|
15
|
+
|
|
16
|
+
The mzML output follows the [HUPO-PSI mzML 1.1.0 specification](https://www.psidev.info/mzML)
|
|
17
|
+
and uses CV terms from the PSI-MS ontology (psi-ms.obo):
|
|
18
|
+
|
|
19
|
+
Deutsch EW et al. "A guided tour of the Trans-Proteomic Pipeline."
|
|
20
|
+
Proteomics. 2010;10(6):1150-9. doi:10.1002/pmic.200900375
|
|
21
|
+
|
|
22
|
+
Instrument CV accessions were cross-referenced against the PSI-MS ontology instrument
|
|
23
|
+
model branch (MS:1000031).
|
|
24
|
+
|
|
25
|
+
## Validation corpus
|
|
26
|
+
|
|
27
|
+
Corpus files were downloaded from the [PRIDE Archive](https://www.ebi.ac.uk/pride/):
|
|
28
|
+
|
|
29
|
+
Perez-Riverol Y et al. "The PRIDE database and related tools and resources in 2019:
|
|
30
|
+
improving support for quantification data." Nucleic Acids Res. 2019;47(D1):D442-D450.
|
|
31
|
+
doi:10.1093/nar/gky1106
|
|
32
|
+
|
|
33
|
+
## Rust dependencies
|
|
34
|
+
|
|
35
|
+
- [thiserror](https://github.com/dtolnay/thiserror) -- derive macro for Error impls (David Tolnay, MIT/Apache-2.0)
|
|
36
|
+
- [pyo3](https://github.com/PyO3/pyo3) -- Rust/Python bindings (PyO3 contributors, MIT/Apache-2.0)
|
|
37
|
+
- [numpy](https://github.com/PyO3/rust-numpy) -- PyO3 numpy integration (PyO3 contributors, BSD-2-Clause)
|
|
38
|
+
- [maturin](https://github.com/PyO3/maturin) -- Python wheel build tool (PyO3 contributors, MIT/Apache-2.0)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [1.0.0] - 2026-05-17
|
|
9
|
+
|
|
10
|
+
First stable release. The public API of `opentfraw` is now considered
|
|
11
|
+
stable and will follow semantic versioning. Format coverage is unchanged
|
|
12
|
+
from 0.1.0 (LTQ FT, Q Exactive HF, Orbitrap Fusion Lumos, Orbitrap
|
|
13
|
+
Exploris 480, TSQ Vantage, TSQ Quantiva, TSQ Altis).
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
|
|
17
|
+
- `ATTRIBUTION.md` (replaces `CREDITS.md`): tracks third-party notices for
|
|
18
|
+
bundled data and vendored code.
|
|
19
|
+
- `publish.yml` GitHub Actions workflow: publishes the `opentfraw` crate
|
|
20
|
+
to crates.io (API token) and the Python wheel to PyPI (OIDC Trusted Publishing)
|
|
21
|
+
on every `v*` tag push.
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
|
|
25
|
+
- CI migrated from WarpBuild runners to standard GitHub-hosted runners
|
|
26
|
+
(`ubuntu-latest`, `macos-latest`, `windows-latest`).
|
|
27
|
+
- Removed the `tools/` vendor SDK tree and `corpus/mzml/` binary corpus
|
|
28
|
+
from repository history (git history rewritten; total size reduced from
|
|
29
|
+
~1.5 GB to ~660 KB).
|
|
30
|
+
- Removed "Pure-Rust" marketing language from `README.md` and related
|
|
31
|
+
documentation (Python bindings use PyO3/maturin which pulls in a C
|
|
32
|
+
compiler at build time).
|
|
33
|
+
- Renamed `CREDITS.md` to `ATTRIBUTION.md`.
|
|
34
|
+
|
|
35
|
+
## [0.1.0] - 2026-05-16
|
|
36
|
+
|
|
37
|
+
### Added
|
|
38
|
+
|
|
39
|
+
- Rust parser for the Thermo Fisher RAW mass spectrometry file
|
|
40
|
+
format, no native or system dependencies.
|
|
41
|
+
- Reader API for top-level structures: `FileHeader`, `AuditTag`,
|
|
42
|
+
`SeqRow`, `InjectionData`, `ASInfo`, `RawFileInfo`, `InstID`,
|
|
43
|
+
`RunHeader`, `SampleInfo`.
|
|
44
|
+
- Per-scan API: scan-index entries, packet headers, profile chunks,
|
|
45
|
+
centroid peaks, scan events, scan parameters (generic records).
|
|
46
|
+
- Error log and instrument log decoders.
|
|
47
|
+
- Robust instrument-model detection via byte scan.
|
|
48
|
+
- Frequency-to-m/z conversion using the per-segment calibration table.
|
|
49
|
+
- `examples/dump.rs`: dump the contents of a RAW file as plain text.
|
|
50
|
+
- `examples/to_mzml.rs`: convert a RAW file to mzML (centroid or
|
|
51
|
+
profile; optionally indexed).
|
|
52
|
+
- Validated against ProteoWizard `msconvert` mzML output for a
|
|
53
|
+
multi-instrument PRIDE corpus (LTQ FT, Q Exactive HF, Orbitrap
|
|
54
|
+
Fusion Lumos, Orbitrap Exploris 480, TSQ Vantage, TSQ Quantiva,
|
|
55
|
+
TSQ Altis).
|
|
56
|
+
- Optional Python bindings (`opentfraw-py`, not published to crates.io).
|
|
57
|
+
- Format specification under `docs/docs/format/`.
|
|
58
|
+
|
|
59
|
+
### Out of scope
|
|
60
|
+
|
|
61
|
+
- Methods file (`MethodFile`) deep parse beyond byte-level layout.
|
|
62
|
+
|
|
63
|
+
[1.0.0]: https://github.com/Sigilweaver/OpenTFRaw/releases/tag/v1.0.0
|
|
64
|
+
[0.1.0]: https://github.com/Sigilweaver/OpenTFRaw/releases/tag/v0.1.0
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# OpenTFRaw Validation Corpus
|
|
2
|
+
|
|
3
|
+
The test corpus covers every major Thermo RAW format variant the parser
|
|
4
|
+
needs to handle:
|
|
5
|
+
|
|
6
|
+
- All supported format versions (8, 47, 57, 60, 62, 63, 64, 66)
|
|
7
|
+
- Both scan-data encoding families (PacketHeader, and Flat in its two variants)
|
|
8
|
+
- Each major instrument family (ion trap, Orbitrap hybrid, Q-Orbitrap,
|
|
9
|
+
Tribrid, single-stage Orbitrap, Astral, triple quadrupole)
|
|
10
|
+
|
|
11
|
+
Current size: ~124 GB across 283 files, covering all instrument families
|
|
12
|
+
and acquisition modes. Multiple files per instrument are included to
|
|
13
|
+
exercise parameter variation across real-world datasets.
|
|
14
|
+
|
|
15
|
+
## Source: PRIDE Archive
|
|
16
|
+
|
|
17
|
+
All files come from the EBI PRIDE Archive (https://www.ebi.ac.uk/pride/),
|
|
18
|
+
a public proteomics repository hosting hundreds of thousands of Thermo RAW
|
|
19
|
+
files contributed by academic and commercial labs.
|
|
20
|
+
|
|
21
|
+
Access is via HTTPS from the PRIDE FTP mirror:
|
|
22
|
+
|
|
23
|
+
https://ftp.pride.ebi.ac.uk/pride/data/archive/YYYY/MM/\<PXD_ACCESSION\>/
|
|
24
|
+
|
|
25
|
+
PRIDE datasets are published under CC-BY or equivalent open licences.
|
|
26
|
+
|
|
27
|
+
## Source List
|
|
28
|
+
|
|
29
|
+
The file `scripts/sources.json` records which PRIDE projects and files to
|
|
30
|
+
download:
|
|
31
|
+
|
|
32
|
+
[
|
|
33
|
+
{
|
|
34
|
+
"instrument": "LCQ Classic",
|
|
35
|
+
"accession": "PXD044152",
|
|
36
|
+
"files": ["Ex250122_K50ng_60m2.raw"],
|
|
37
|
+
"count": 6
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"instrument": "Orbitrap Fusion Lumos",
|
|
41
|
+
"mode": "DIA",
|
|
42
|
+
"accession": "PXD031322",
|
|
43
|
+
"files": ["OFL001513-YLL-GPF-15K-1.raw"],
|
|
44
|
+
"count": 5
|
|
45
|
+
},
|
|
46
|
+
...
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
- `files` - specific filenames always downloaded first
|
|
50
|
+
- `count` - total target file count from this project; the fetcher
|
|
51
|
+
auto-fills from the FTP directory listing until the count is reached
|
|
52
|
+
- `mode` - distinguishes multiple entries for the same instrument
|
|
53
|
+
covering different acquisition modes (DIA, EThcD, PRM, MS3, etc.)
|
|
54
|
+
|
|
55
|
+
To add or replace an entry, edit `sources.json` directly and re-run the
|
|
56
|
+
fetcher. The manifest (`corpus/manifest.json`) records what is
|
|
57
|
+
currently on disk; the fetcher skips any key already present there.
|
|
58
|
+
|
|
59
|
+
## Running the Fetcher
|
|
60
|
+
|
|
61
|
+
python scripts/fetch_corpus.py # download missing files
|
|
62
|
+
python scripts/fetch_corpus.py --dry-run # report without downloading
|
|
63
|
+
python scripts/fetch_corpus.py --list-files PXD032800 # discover files
|
|
64
|
+
|
|
65
|
+
The script resolves each download URL through the PRIDE REST API
|
|
66
|
+
(https://www.ebi.ac.uk/pride/ws/archive/v2/files/byProject) and saves
|
|
67
|
+
files as `{accession}_{instrument_label}_{original_filename}` under
|
|
68
|
+
`corpus/`. If the API returns an empty response (an intermittent server
|
|
69
|
+
behaviour observed in 2026), the script falls back to constructing the
|
|
70
|
+
FTP URL directly from the project publication date.
|
|
71
|
+
|
|
72
|
+
To discover all available files in a PRIDE project before adding it to
|
|
73
|
+
`sources.json`:
|
|
74
|
+
|
|
75
|
+
python scripts/fetch_corpus.py --list-files PXD032800
|
|
76
|
+
|
|
77
|
+
## Provenance Record
|
|
78
|
+
|
|
79
|
+
`corpus/manifest.json` records which PRIDE project each local
|
|
80
|
+
file came from. Keys are `{accession}/{original_filename}`:
|
|
81
|
+
|
|
82
|
+
{
|
|
83
|
+
"PXD055201/20170427_CO_0673AnGS_DM_Mix1_R12R13R14_2.raw": {
|
|
84
|
+
"instrument": "LTQ Orbitrap XL",
|
|
85
|
+
"dest_filename": "PXD055201_LTQ_Orbitrap_XL_20170427_..._2.raw",
|
|
86
|
+
"size_bytes": 396954554
|
|
87
|
+
},
|
|
88
|
+
...
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
To trace any file back to its source, use the PXD accession:
|
|
92
|
+
|
|
93
|
+
https://www.ebi.ac.uk/pride/archive/projects/<PXD_ACCESSION>
|
|
94
|
+
|
|
95
|
+
## Target Instruments and Acquisition Modes
|
|
96
|
+
|
|
97
|
+
The corpus is organised in two tiers:
|
|
98
|
+
|
|
99
|
+
**Tier 1 - one file per instrument line** (covers every format version
|
|
100
|
+
and scan-data encoding path):
|
|
101
|
+
|
|
102
|
+
| Family | Instruments |
|
|
103
|
+
| ------------------------- | ------------------------------------------------------------- |
|
|
104
|
+
| Ion traps (LCQ/LTQ) | LCQ Classic, LTQ, LTQ XL, LTQ Velos, LTQ FT |
|
|
105
|
+
| LTQ Orbitrap hybrids | LTQ Orbitrap, XL, XL ETD, Velos, Velos Pro, Elite |
|
|
106
|
+
| Q-Orbitrap | Q Exactive, Plus, HF, HF-X, UHMR |
|
|
107
|
+
| Tribrid Orbitrap | Fusion, Fusion Lumos, Eclipse, Ascend |
|
|
108
|
+
| Single-stage Orbitrap | Exploris 120, 240, 480, Astral (DIA) |
|
|
109
|
+
| Triple quadrupole | TSQ Vantage, Quantiva, Altis |
|
|
110
|
+
|
|
111
|
+
**Tier 2 - additional files per instrument covering distinct modes**:
|
|
112
|
+
|
|
113
|
+
| Entry | Mode | What it exercises |
|
|
114
|
+
| -------------------------------- | ------ | ------------------------------------------------------ |
|
|
115
|
+
| Orbitrap Fusion Lumos (DIA) | DIA | Multiple isolation windows per scan cycle |
|
|
116
|
+
| Orbitrap Fusion Lumos (MS3) | MS3 | Three-stage fragmentation / XL-MS workflow |
|
|
117
|
+
| Orbitrap Fusion Lumos (EThcD) | EThcD | Supplemental activation on tribrid variable-body scans |
|
|
118
|
+
| Orbitrap Eclipse (EThcD) | EThcD | Electron-transfer + supplemental HCD, two-clause filter|
|
|
119
|
+
| Q Exactive Plus (DDA-2) | DDA | Second Q Exactive Plus vintage for regression |
|
|
120
|
+
| Orbitrap Fusion Lumos (UVPD) | UVPD | Ultraviolet photodissociation, tests Activation::Uvpd |
|
|
121
|
+
| Q Exactive HF (DIA) | DIA | Fixed-window SWATH-like DIA on Q Exactive |
|
|
122
|
+
| Orbitrap Exploris 480 (DDA-2) | DDA | Second firmware vintage for regression |
|
|
123
|
+
| TSQ Altis (SRM-2) | SRM-2 | Second SRM file from a different dataset |
|
|
124
|
+
| Q Exactive HF-X (PRM) | PRM | Parallel reaction monitoring: 42 targets, |
|
|
125
|
+
| | | 7-minute gradient, SARS-CoV-2 peptides |
|
|
126
|
+
|
|
127
|
+
### Multi-controller coverage
|
|
128
|
+
|
|
129
|
+
Several Tier 1 files carry `controller_count > 1` in their
|
|
130
|
+
`RawFileInfoPreamble`, meaning the RAW file contains a UV/analog chromatogram
|
|
131
|
+
channel alongside the MS data stream. The parser exercises the
|
|
132
|
+
multi-controller selection path (reader.rs `select_ms_run_header`) for these:
|
|
133
|
+
|
|
134
|
+
| File (Tier 1 instrument) | `controller_count` | Confirmed year |
|
|
135
|
+
| ------------------------- | :----------------: | -------------- |
|
|
136
|
+
| Orbitrap Fusion | 2 | 2016-12 |
|
|
137
|
+
| Orbitrap Fusion Lumos | 2 | 2016-03 |
|
|
138
|
+
| LTQ Orbitrap (PXD069348) | 3 | 2014-02 |
|
|
139
|
+
|
|
140
|
+
The selection heuristic — `ntrailer > 0` (v64+) or `nsegs > 0 && first_scan
|
|
141
|
+
<= last_scan` (v63) — correctly identifies the MS controller in every case.
|
|
142
|
+
|
|
143
|
+
## Limitations
|
|
144
|
+
|
|
145
|
+
- PRIDE's metadata lists declared instrument names; a few submitters
|
|
146
|
+
mislabel files. Device detection in the parser is therefore best-effort.
|
|
147
|
+
- Some instrument lines (Astral, top-down ETD workflows) have few publicly
|
|
148
|
+
available files on PRIDE. The `count` values in `sources.json` are
|
|
149
|
+
capped at the number of files actually present in the FTP directory.
|
|
150
|
+
|
|
151
|
+
## Open Issues
|
|
152
|
+
|
|
153
|
+
### DIA isolation window m/z (Orbitrap Exploris 480 and similar)
|
|
154
|
+
|
|
155
|
+
For the Exploris 480 DIA files in corpus (PXD035500), the isolation window
|
|
156
|
+
center m/z is currently absent from filter strings. Investigation findings:
|
|
157
|
+
|
|
158
|
+
- **Scan event body format**: DIA MS2 scan events use a uniform 136-byte body
|
|
159
|
+
(event size = 272 bytes total). The body[8..12] f32 field holds a value in
|
|
160
|
+
the range ~3.8-5.0, which is in instrument frequency space, not m/z. There
|
|
161
|
+
is no reaction structure (np = 0 at body[4..8]) and no m/z at any body offset.
|
|
162
|
+
|
|
163
|
+
- **Scan params**: The file has 1004 bytes/scan of scan params data starting at
|
|
164
|
+
`scan_params_addr`, but the GenericDataHeader (GDH) that describes the record
|
|
165
|
+
schema was not found anywhere in the 8 MB window between the error log and
|
|
166
|
+
scan_trailer that `find_forward` searches. As a result `scan_parameters` is
|
|
167
|
+
empty for all scans in this file and `ScanParams` accessors return `None`.
|
|
168
|
+
|
|
169
|
+
- **What is needed**: Locate the GDH for Exploris 480 scan params (it may be
|
|
170
|
+
outside the current search window, or use a different header format). Once
|
|
171
|
+
the schema is found, the calibration coefficients (conversion parameters A, B,
|
|
172
|
+
C) inside the scan params record can be used to convert frequency to m/z and
|
|
173
|
+
recover the isolation window center.
|
|
174
|
+
|
|
175
|
+
- **Workaround**: For instruments where the GDH is found correctly (Q Exactive,
|
|
176
|
+
Fusion Lumos, Eclipse), `ScanParams::isolation_width_mz()` and the
|
|
177
|
+
`monoisotopic_mz()` family already work. Eclipse DIA files (PXD038440, once
|
|
178
|
+
downloaded) will clarify whether tribrid instruments store the isolation m/z
|
|
179
|
+
in the reaction structure (np > 0) as DDA scans do, bypassing the calibration
|
|
180
|
+
problem entirely.
|
|
181
|
+
|
|
182
|
+
### Acquisition modes not yet in corpus
|
|
183
|
+
|
|
184
|
+
| Mode | Notes |
|
|
185
|
+
| ---- | ----- |
|
|
186
|
+
| Eclipse DIA | DIA on tribrid Orbitrap: needed to confirm whether tribrid instruments store isolation m/z in reaction structure (np>0) as DDA scans do. No confirmed Eclipse DIA PRIDE accession with accessible RAW files identified yet. The existing Fusion Lumos DIA files (PXD031322) show the same filter gap as Exploris 480, suggesting the isolation m/z is absent in the tribrid scan event body for DIA as well. |
|
|
187
|
+
| SPS-MS3 (TMT) | Synchronous precursor selection MS3 for isobaric quantification; differs from standard MS3 in the number of simultaneous precursor m/z in the scan event body. |
|
|
188
|
+
| ECD / IRMPD | Both enum variants implemented; no corpus files yet. |
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# This file is automatically @generated by Cargo.
|
|
2
|
+
# It is not intended for manual editing.
|
|
3
|
+
version = 4
|
|
4
|
+
|
|
5
|
+
[[package]]
|
|
6
|
+
name = "opentfraw"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"thiserror",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[[package]]
|
|
13
|
+
name = "proc-macro2"
|
|
14
|
+
version = "1.0.106"
|
|
15
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
16
|
+
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
|
|
17
|
+
dependencies = [
|
|
18
|
+
"unicode-ident",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[[package]]
|
|
22
|
+
name = "quote"
|
|
23
|
+
version = "1.0.45"
|
|
24
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
25
|
+
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
|
|
26
|
+
dependencies = [
|
|
27
|
+
"proc-macro2",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[[package]]
|
|
31
|
+
name = "syn"
|
|
32
|
+
version = "2.0.117"
|
|
33
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
34
|
+
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
|
|
35
|
+
dependencies = [
|
|
36
|
+
"proc-macro2",
|
|
37
|
+
"quote",
|
|
38
|
+
"unicode-ident",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[[package]]
|
|
42
|
+
name = "thiserror"
|
|
43
|
+
version = "2.0.18"
|
|
44
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
45
|
+
checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
|
|
46
|
+
dependencies = [
|
|
47
|
+
"thiserror-impl",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[[package]]
|
|
51
|
+
name = "thiserror-impl"
|
|
52
|
+
version = "2.0.18"
|
|
53
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
54
|
+
checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
|
|
55
|
+
dependencies = [
|
|
56
|
+
"proc-macro2",
|
|
57
|
+
"quote",
|
|
58
|
+
"syn",
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
[[package]]
|
|
62
|
+
name = "unicode-ident"
|
|
63
|
+
version = "1.0.24"
|
|
64
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
65
|
+
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "opentfraw"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
rust-version = "1.75"
|
|
6
|
+
description = "Rust parser for Thermo Fisher RAW mass spectrometry files."
|
|
7
|
+
authors = ["Nathan Riley <git@nathanriley.com>"]
|
|
8
|
+
license = "Apache-2.0"
|
|
9
|
+
repository = "https://github.com/Sigilweaver/OpenTFRaw"
|
|
10
|
+
homepage = "https://github.com/Sigilweaver/OpenTFRaw"
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
keywords = ["mass-spectrometry", "thermo", "raw", "proteomics", "orbitrap"]
|
|
13
|
+
categories = ["parser-implementations", "science"]
|
|
14
|
+
|
|
15
|
+
[lints.rust]
|
|
16
|
+
unsafe_code = "forbid"
|
|
17
|
+
|
|
18
|
+
[dependencies]
|
|
19
|
+
thiserror = "2"
|
|
20
|
+
|
|
21
|
+
[dev-dependencies]
|
|
22
|
+
|
|
23
|
+
[[example]]
|
|
24
|
+
name = "dump"
|
|
25
|
+
path = "examples/dump.rs"
|
|
26
|
+
|
|
27
|
+
[[example]]
|
|
28
|
+
name = "to_mzml"
|
|
29
|
+
path = "examples/to_mzml.rs"
|
|
30
|
+
|
|
31
|
+
# Keep the Python bindings crate out of the parent's cargo invocations.
|
|
32
|
+
[workspace]
|
|
33
|
+
exclude = ["python"]
|