opentfraw 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. opentfraw-1.0.0/.github/workflows/ci.yml +101 -0
  2. opentfraw-1.0.0/.github/workflows/publish.yml +85 -0
  3. opentfraw-1.0.0/.gitignore +33 -0
  4. opentfraw-1.0.0/ATTRIBUTION.md +38 -0
  5. opentfraw-1.0.0/CHANGELOG.md +64 -0
  6. opentfraw-1.0.0/CORPUS.md +188 -0
  7. opentfraw-1.0.0/Cargo.lock +65 -0
  8. opentfraw-1.0.0/Cargo.toml +33 -0
  9. opentfraw-1.0.0/LICENSE +171 -0
  10. opentfraw-1.0.0/PKG-INFO +58 -0
  11. opentfraw-1.0.0/README.md +64 -0
  12. opentfraw-1.0.0/docs/.gitignore +6 -0
  13. opentfraw-1.0.0/docs/bun.lock +2802 -0
  14. opentfraw-1.0.0/docs/docs/changelog.md +23 -0
  15. opentfraw-1.0.0/docs/docs/format/00-overview.md +172 -0
  16. opentfraw-1.0.0/docs/docs/format/01-file-layout.md +139 -0
  17. opentfraw-1.0.0/docs/docs/format/02-sample-and-sequence.md +87 -0
  18. opentfraw-1.0.0/docs/docs/format/03-raw-file-info.md +143 -0
  19. opentfraw-1.0.0/docs/docs/format/04-run-header.md +102 -0
  20. opentfraw-1.0.0/docs/docs/format/05-scan-index-and-data.md +195 -0
  21. opentfraw-1.0.0/docs/docs/format/06-scan-event.md +167 -0
  22. opentfraw-1.0.0/docs/docs/format/07-scan-parameters.md +135 -0
  23. opentfraw-1.0.0/docs/docs/format/08-logs.md +35 -0
  24. opentfraw-1.0.0/docs/docs/format/09-enumerations.md +78 -0
  25. opentfraw-1.0.0/docs/docs/format/10-frequency-to-mz.md +56 -0
  26. opentfraw-1.0.0/docs/docs/format/11-references.md +39 -0
  27. opentfraw-1.0.0/docs/docs/guide/instrument-families.md +27 -0
  28. opentfraw-1.0.0/docs/docs/guide/mzml-export.md +27 -0
  29. opentfraw-1.0.0/docs/docs/guide/reader.md +45 -0
  30. opentfraw-1.0.0/docs/docs/guide/scan-data.md +27 -0
  31. opentfraw-1.0.0/docs/docs/install.md +70 -0
  32. opentfraw-1.0.0/docs/docs/intro.md +50 -0
  33. opentfraw-1.0.0/docs/docs/license.md +25 -0
  34. opentfraw-1.0.0/docs/docs/quickstart.md +57 -0
  35. opentfraw-1.0.0/docs/docusaurus.config.ts +132 -0
  36. opentfraw-1.0.0/docs/package.json +38 -0
  37. opentfraw-1.0.0/docs/sidebars.ts +43 -0
  38. opentfraw-1.0.0/docs/src/css/custom.css +206 -0
  39. opentfraw-1.0.0/docs/static/.nojekyll +0 -0
  40. opentfraw-1.0.0/docs/static/img/favicon.ico +0 -0
  41. opentfraw-1.0.0/docs/static/img/logo.svg +38 -0
  42. opentfraw-1.0.0/docs/tsconfig.json +5 -0
  43. opentfraw-1.0.0/docs/wrangler.jsonc +17 -0
  44. opentfraw-1.0.0/examples/dump.rs +336 -0
  45. opentfraw-1.0.0/examples/to_mzml.rs +108 -0
  46. opentfraw-1.0.0/pyproject.toml +24 -0
  47. opentfraw-1.0.0/python/.gitignore +15 -0
  48. opentfraw-1.0.0/python/Cargo.lock +297 -0
  49. opentfraw-1.0.0/python/Cargo.toml +23 -0
  50. opentfraw-1.0.0/python/README.md +43 -0
  51. opentfraw-1.0.0/python/src/lib.rs +290 -0
  52. opentfraw-1.0.0/scripts/fetch_corpus.py +353 -0
  53. opentfraw-1.0.0/scripts/sources.json +442 -0
  54. opentfraw-1.0.0/scripts/validate_mzml.py +239 -0
  55. opentfraw-1.0.0/src/audit_tag.rs +32 -0
  56. opentfraw-1.0.0/src/device.rs +408 -0
  57. opentfraw-1.0.0/src/error.rs +30 -0
  58. opentfraw-1.0.0/src/error_log.rs +18 -0
  59. opentfraw-1.0.0/src/generic_data.rs +290 -0
  60. opentfraw-1.0.0/src/header.rs +64 -0
  61. opentfraw-1.0.0/src/lib.rs +26 -0
  62. opentfraw-1.0.0/src/mzml.rs +1204 -0
  63. opentfraw-1.0.0/src/raw_file_info.rs +167 -0
  64. opentfraw-1.0.0/src/reader.rs +1570 -0
  65. opentfraw-1.0.0/src/run_header.rs +134 -0
  66. opentfraw-1.0.0/src/sample_info.rs +82 -0
  67. opentfraw-1.0.0/src/scan_data.rs +435 -0
  68. opentfraw-1.0.0/src/scan_event.rs +397 -0
  69. opentfraw-1.0.0/src/scan_filter.rs +425 -0
  70. opentfraw-1.0.0/src/scan_format.rs +70 -0
  71. opentfraw-1.0.0/src/scan_index.rs +64 -0
  72. opentfraw-1.0.0/src/seq_row.rs +95 -0
  73. opentfraw-1.0.0/src/types.rs +330 -0
@@ -0,0 +1,101 @@
1
+ # CI for OpenTFRaw.
2
+ #
3
+ # Jobs:
4
+ # build - cargo fmt --check, cargo clippy -D warnings, cargo test
5
+ # (Linux + macOS on GitHub-hosted runners)
6
+ # validate-mzml - download a small public Thermo RAW file, convert to mzML,
7
+ # and run scripts/validate_mzml.py to check structural
8
+ # conformance.
9
+
10
+ name: CI
11
+
12
+ on:
13
+ push:
14
+ branches: ["main"]
15
+ pull_request:
16
+ branches: ["main"]
17
+
18
+ env:
19
+ CARGO_TERM_COLOR: always
20
+ RUSTFLAGS: -D warnings
21
+ # Small LTQ FT file (~15 MB) from PRIDE PXD054004, used for the mzML
22
+ # validation job. URL is stable because PRIDE uses year/month archive paths.
23
+ PRIDE_RAW_URL: >-
24
+ https://ftp.pride.ebi.ac.uk/pride/data/archive/2025/05/PXD054004/20171113_Map_NS1_1to139_4deg_50uM_001.raw
25
+ PRIDE_RAW_NAME: test.raw
26
+
27
+ jobs:
28
+ build:
29
+ name: Build and test (${{ matrix.os }})
30
+ runs-on: ${{ matrix.os }}
31
+ strategy:
32
+ fail-fast: false
33
+ matrix:
34
+ os:
35
+ - ubuntu-latest
36
+ - macos-latest
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+
40
+ - name: Install Rust toolchain
41
+ uses: dtolnay/rust-toolchain@stable
42
+ with:
43
+ toolchain: "1.75"
44
+ components: rustfmt, clippy
45
+
46
+ - name: Cache cargo
47
+ uses: Swatinem/rust-cache@v2
48
+
49
+ - name: cargo fmt
50
+ run: cargo fmt --all -- --check
51
+
52
+ - name: cargo clippy
53
+ run: cargo clippy --all-targets -- -D warnings
54
+
55
+ - name: cargo test
56
+ run: cargo test --all-targets
57
+
58
+ validate-mzml:
59
+ name: mzML structural validation
60
+ runs-on: ubuntu-latest
61
+ steps:
62
+ - uses: actions/checkout@v4
63
+
64
+ - name: Install Rust toolchain
65
+ uses: dtolnay/rust-toolchain@stable
66
+ with:
67
+ toolchain: "1.75"
68
+
69
+ - name: Cache cargo
70
+ uses: Swatinem/rust-cache@v2
71
+
72
+ - name: Build to_mzml example (release)
73
+ run: cargo build --release --example to_mzml
74
+
75
+ - name: Download test RAW file
76
+ run: |
77
+ wget -q --user-agent="OpenTFRaw-CI/1.0" \
78
+ -O "$PRIDE_RAW_NAME" "$PRIDE_RAW_URL"
79
+ echo "Downloaded $(du -sh $PRIDE_RAW_NAME | cut -f1) RAW file"
80
+
81
+ - name: Convert RAW to mzML (centroid)
82
+ run: |
83
+ ./target/release/examples/to_mzml "$PRIDE_RAW_NAME" test_centroid.mzML
84
+ echo "Centroid mzML: $(wc -l < test_centroid.mzML) lines"
85
+
86
+ - name: Convert RAW to indexed mzML (centroid)
87
+ run: |
88
+ ./target/release/examples/to_mzml --indexed "$PRIDE_RAW_NAME" test_indexed.mzML
89
+ echo "Indexed mzML: $(wc -l < test_indexed.mzML) lines"
90
+
91
+ - name: Convert RAW to mzML (profile)
92
+ run: |
93
+ ./target/release/examples/to_mzml --include-profile "$PRIDE_RAW_NAME" test_profile.mzML
94
+ echo "Profile mzML: $(wc -l < test_profile.mzML) lines"
95
+
96
+ - name: Validate mzML structural conformance
97
+ run: |
98
+ python3 scripts/validate_mzml.py \
99
+ test_centroid.mzML \
100
+ test_indexed.mzML \
101
+ test_profile.mzML
@@ -0,0 +1,85 @@
1
+ name: Publish
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ concurrency:
8
+ group: publish-${{ github.ref }}
9
+ cancel-in-progress: true
10
+
11
+ permissions:
12
+ contents: read
13
+
14
+ jobs:
15
+ cargo-publish:
16
+ name: Publish to crates.io
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - uses: dtolnay/rust-toolchain@stable
21
+ - run: cargo publish -p opentfraw
22
+ env:
23
+ CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
24
+
25
+ build-wheels:
26
+ name: Build wheels (${{ matrix.os }} / ${{ matrix.target }})
27
+ runs-on: ${{ matrix.os }}
28
+ strategy:
29
+ fail-fast: false
30
+ matrix:
31
+ include:
32
+ - os: ubuntu-latest
33
+ target: x86_64
34
+ - os: ubuntu-latest
35
+ target: aarch64
36
+ - os: macos-latest
37
+ target: x86_64
38
+ - os: macos-latest
39
+ target: aarch64
40
+ - os: windows-latest
41
+ target: x86_64
42
+ steps:
43
+ - uses: actions/checkout@v4
44
+ - uses: actions/setup-python@v5
45
+ with:
46
+ python-version: "3.11"
47
+ - uses: PyO3/maturin-action@v1
48
+ with:
49
+ target: ${{ matrix.target }}
50
+ args: --release --out dist --find-interpreter --manifest-path python/Cargo.toml
51
+ manylinux: auto
52
+ - uses: actions/upload-artifact@v4
53
+ with:
54
+ name: wheels-${{ matrix.os }}-${{ matrix.target }}
55
+ path: dist/*.whl
56
+
57
+ build-sdist:
58
+ name: Build sdist
59
+ runs-on: ubuntu-latest
60
+ steps:
61
+ - uses: actions/checkout@v4
62
+ - uses: PyO3/maturin-action@v1
63
+ with:
64
+ command: sdist
65
+ args: --out dist --manifest-path python/Cargo.toml
66
+ - uses: actions/upload-artifact@v4
67
+ with:
68
+ name: sdist
69
+ path: dist/*.tar.gz
70
+
71
+ pypi-publish:
72
+ name: Publish to PyPI
73
+ needs: [build-wheels, build-sdist]
74
+ runs-on: ubuntu-latest
75
+ environment: pypi
76
+ permissions:
77
+ id-token: write
78
+ steps:
79
+ - uses: actions/download-artifact@v4
80
+ with:
81
+ path: dist
82
+ merge-multiple: true
83
+ - uses: pypa/gh-action-pypi-publish@release/v1
84
+ with:
85
+ packages-dir: dist/
@@ -0,0 +1,33 @@
1
+ # Cargo build output
2
+ /target
3
+
4
+ # Roadmap (internal)
5
+ /ROADMAP.md
6
+
7
+ # Corpus sample files — pulled on demand via scripts/fetch_corpus.py,
8
+ # not redistributed through this repo (multi-GB PRIDE datasets)
9
+ /samples/
10
+ /corpus/
11
+
12
+ # Reverse-engineering scratch: hexdump analysers, ad-hoc probes, and
13
+ # early validation harnesses — kept out of the public repo
14
+ /analyze_*.py
15
+ /validate_*.py
16
+ /trace_*.py
17
+ /examples/*_survey.rs
18
+ /examples/gdh_hunt.rs
19
+ /examples/params_probe.rs
20
+
21
+ # Imported 3rd-party reference material
22
+ /unfinnigan_field_definitions.md
23
+
24
+ # Python bytecode (scripts/ uses plain Python)
25
+ __pycache__/
26
+ *.pyc
27
+ *.pyo
28
+
29
+ # Editor / tooling
30
+ .vscode/
31
+ .idea/
32
+ *.swp
33
+ .DS_Store
@@ -0,0 +1,38 @@
1
+ # Credits
2
+
3
+ ## Prior art
4
+
5
+ ### unfinnigan
6
+
7
+ Gene Selkov, 2010-2012. Perl and Python reverse-engineering of the Thermo RAW binary format.
8
+ The most thorough prior independent analysis of the format, covering versions 57, 62, 63, 64,
9
+ and 66. Field names and layout notes from unfinnigan were cross-referenced when validating
10
+ field offsets.
11
+
12
+ Source: https://github.com/prvst/unfinnigan
13
+
14
+ ## Standards
15
+
16
+ The mzML output follows the [HUPO-PSI mzML 1.1.0 specification](https://www.psidev.info/mzML)
17
+ and uses CV terms from the PSI-MS ontology (psi-ms.obo):
18
+
19
+ Deutsch EW et al. "A guided tour of the Trans-Proteomic Pipeline."
20
+ Proteomics. 2010;10(6):1150-9. doi:10.1002/pmic.200900375
21
+
22
+ Instrument CV accessions were cross-referenced against the PSI-MS ontology instrument
23
+ model branch (MS:1000031).
24
+
25
+ ## Validation corpus
26
+
27
+ Corpus files were downloaded from the [PRIDE Archive](https://www.ebi.ac.uk/pride/):
28
+
29
+ Perez-Riverol Y et al. "The PRIDE database and related tools and resources in 2019:
30
+ improving support for quantification data." Nucleic Acids Res. 2019;47(D1):D442-D450.
31
+ doi:10.1093/nar/gky1106
32
+
33
+ ## Rust dependencies
34
+
35
+ - [thiserror](https://github.com/dtolnay/thiserror) -- derive macro for Error impls (David Tolnay, MIT/Apache-2.0)
36
+ - [pyo3](https://github.com/PyO3/pyo3) -- Rust/Python bindings (PyO3 contributors, MIT/Apache-2.0)
37
+ - [numpy](https://github.com/PyO3/rust-numpy) -- PyO3 numpy integration (PyO3 contributors, BSD-2-Clause)
38
+ - [maturin](https://github.com/PyO3/maturin) -- Python wheel build tool (PyO3 contributors, MIT/Apache-2.0)
@@ -0,0 +1,64 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [1.0.0] - 2026-05-17
9
+
10
+ First stable release. The public API of `opentfraw` is now considered
11
+ stable and will follow semantic versioning. Format coverage is unchanged
12
+ from 0.1.0 (LTQ FT, Q Exactive HF, Orbitrap Fusion Lumos, Orbitrap
13
+ Exploris 480, TSQ Vantage, TSQ Quantiva, TSQ Altis).
14
+
15
+ ### Added
16
+
17
+ - `ATTRIBUTION.md` (replaces `CREDITS.md`): tracks third-party notices for
18
+ bundled data and vendored code.
19
+ - `publish.yml` GitHub Actions workflow: publishes the `opentfraw` crate
20
+ to crates.io and the Python wheel to PyPI via OIDC Trusted Publishing
21
+ on every `v*` tag push.
22
+
23
+ ### Changed
24
+
25
+ - CI migrated from WarpBuild runners to standard GitHub-hosted runners
26
+ (`ubuntu-latest`, `macos-latest`, `windows-latest`).
27
+ - Removed the `tools/` vendor SDK tree and `corpus/mzml/` binary corpus
28
+ from repository history (git history rewritten; total size reduced from
29
+ ~1.5 GB to ~660 KB).
30
+ - Removed "Pure-Rust" marketing language from `README.md` and related
31
+ documentation (Python bindings use PyO3/maturin which pulls in a C
32
+ compiler at build time).
33
+ - Renamed `CREDITS.md` to `ATTRIBUTION.md`.
34
+
35
+ ## [0.1.0] - 2026-05-16
36
+
37
+ ### Added
38
+
39
+ - Rust parser for the Thermo Fisher RAW mass spectrometry file
40
+ format, with no native or system dependencies.
41
+ - Reader API for top-level structures: `FileHeader`, `AuditTag`,
42
+ `SeqRow`, `InjectionData`, `ASInfo`, `RawFileInfo`, `InstID`,
43
+ `RunHeader`, `SampleInfo`.
44
+ - Per-scan API: scan-index entries, packet headers, profile chunks,
45
+ centroid peaks, scan events, scan parameters (generic records).
46
+ - Error log and instrument log decoders.
47
+ - Robust instrument-model detection via byte scan.
48
+ - Frequency-to-m/z conversion using the per-segment calibration table.
49
+ - `examples/dump.rs`: dump the contents of a RAW file as plain text.
50
+ - `examples/to_mzml.rs`: convert a RAW file to mzML (centroid or
51
+ profile; optionally indexed).
52
+ - Validated against ProteoWizard `msconvert` mzML output for a
53
+ multi-instrument PRIDE corpus (LTQ FT, Q Exactive HF, Orbitrap
54
+ Fusion Lumos, Orbitrap Exploris 480, TSQ Vantage, TSQ Quantiva,
55
+ TSQ Altis).
56
+ - Optional Python bindings (`opentfraw-py`, not published to crates.io).
57
+ - Format specification under `docs/docs/format/`.
58
+
59
+ ### Out of scope
60
+
61
+ - Methods file (`MethodFile`) deep parse beyond byte-level layout.
62
+
63
+ [1.0.0]: https://github.com/Sigilweaver/OpenTFRaw/releases/tag/v1.0.0
64
+ [0.1.0]: https://github.com/Sigilweaver/OpenTFRaw/releases/tag/v0.1.0
@@ -0,0 +1,188 @@
1
+ # OpenTFRaw Validation Corpus
2
+
3
+ The test corpus covers every major Thermo RAW format variant the parser
4
+ needs to handle:
5
+
6
+ - All supported format versions (8, 47, 57, 60, 62, 63, 64, 66)
7
+ - Both scan-data encodings (PacketHeader and the two Flat variants)
8
+ - Each major instrument family (ion trap, Orbitrap hybrid, Q-Orbitrap,
9
+ Tribrid, single-stage Orbitrap, Astral, triple quadrupole)
10
+
11
+ Current size: ~124 GB across 283 files, covering all instrument families
12
+ and acquisition modes. Multiple files per instrument are included to
13
+ exercise parameter variation across real-world datasets.
14
+
15
+ ## Source: PRIDE Archive
16
+
17
+ All files come from the EBI PRIDE Archive (https://www.ebi.ac.uk/pride/),
18
+ a public proteomics repository hosting hundreds of thousands of Thermo RAW
19
+ files contributed by academic and commercial labs.
20
+
21
+ Access is via HTTPS from the PRIDE FTP mirror:
22
+
23
+ https://ftp.pride.ebi.ac.uk/pride/data/archive/YYYY/MM/<PXD_ACCESSION>/
24
+
25
+ PRIDE datasets are published under CC-BY or equivalent open licences.
26
+
27
+ ## Source List
28
+
29
+ The file `scripts/sources.json` records which PRIDE projects and files to
30
+ download:
31
+
32
+ [
33
+ {
34
+ "instrument": "LCQ Classic",
35
+ "accession": "PXD044152",
36
+ "files": ["Ex250122_K50ng_60m2.raw"],
37
+ "count": 6
38
+ },
39
+ {
40
+ "instrument": "Orbitrap Fusion Lumos",
41
+ "mode": "DIA",
42
+ "accession": "PXD031322",
43
+ "files": ["OFL001513-YLL-GPF-15K-1.raw"],
44
+ "count": 5
45
+ },
46
+ ...
47
+ ]
48
+
49
+ - `files` - specific filenames always downloaded first
50
+ - `count` - total target file count from this project; the fetcher
51
+ auto-fills from the FTP directory listing until the count is reached
52
+ - `mode` - distinguishes multiple entries for the same instrument
53
+ covering different acquisition modes (DIA, EThcD, PRM, MS3, etc.)
54
+
55
+ To add or replace an entry, edit `sources.json` directly and re-run the
56
+ fetcher. The manifest (`corpus/manifest.json`) records what is
57
+ currently on disk; the fetcher skips any key already present there.
58
+
59
+ ## Running the Fetcher
60
+
61
+ python scripts/fetch_corpus.py # download missing files
62
+ python scripts/fetch_corpus.py --dry-run # report without downloading
63
+ python scripts/fetch_corpus.py --list-files PXD032800 # discover files
64
+
65
+ The script resolves each download URL through the PRIDE REST API
66
+ (https://www.ebi.ac.uk/pride/ws/archive/v2/files/byProject) and saves
67
+ files as `{accession}_{instrument_label}_{original_filename}` under
68
+ `corpus/`. If the API returns an empty response (an intermittent server
69
+ behaviour observed in 2026), the script falls back to constructing the
70
+ FTP URL directly from the project publication date.
71
+
72
+ To discover all available files in a PRIDE project before adding it to
73
+ `sources.json`:
74
+
75
+ python scripts/fetch_corpus.py --list-files PXD032800
76
+
77
+ ## Provenance Record
78
+
79
+ `corpus/manifest.json` records which PRIDE project each local
80
+ file came from. Keys are `{accession}/{original_filename}`:
81
+
82
+ {
83
+ "PXD055201/20170427_CO_0673AnGS_DM_Mix1_R12R13R14_2.raw": {
84
+ "instrument": "LTQ Orbitrap XL",
85
+ "dest_filename": "PXD055201_LTQ_Orbitrap_XL_20170427_..._2.raw",
86
+ "size_bytes": 396954554
87
+ },
88
+ ...
89
+ }
90
+
91
+ To trace any file back to its source, use the PXD accession:
92
+
93
+ https://www.ebi.ac.uk/pride/archive/projects/<PXD_ACCESSION>
94
+
95
+ ## Target Instruments and Acquisition Modes
96
+
97
+ The corpus is organised in two tiers:
98
+
99
+ **Tier 1 - one file per instrument line** (covers every format version
100
+ and scan-data encoding path):
101
+
102
+ | Family | Instruments |
103
+ | ------------------------- | ------------------------------------------------------------- |
104
+ | Ion traps (LCQ/LTQ) | LCQ Classic, LTQ, LTQ XL, LTQ Velos, LTQ FT |
105
+ | LTQ Orbitrap hybrids | LTQ Orbitrap, XL, XL ETD, Velos, Velos Pro, Elite |
106
+ | Q-Orbitrap | Q Exactive, Plus, HF, HF-X, UHMR |
107
+ | Tribrid Orbitrap | Fusion, Fusion Lumos, Eclipse, Ascend |
108
+ | Single-stage Orbitrap | Exploris 120, 240, 480, Astral (DIA) |
109
+ | Triple quadrupole | TSQ Vantage, Quantiva, Altis |
110
+
111
+ **Tier 2 - additional files per instrument covering distinct modes**:
112
+
113
+ | Entry | Mode | What it exercises |
114
+ | -------------------------------- | ------ | ------------------------------------------------------ |
115
+ | Orbitrap Fusion Lumos (DIA) | DIA | Multiple isolation windows per scan cycle |
116
+ | Orbitrap Fusion Lumos (MS3) | MS3 | Three-stage fragmentation / XL-MS workflow |
117
+ | Orbitrap Fusion Lumos (EThcD) | EThcD | Supplemental activation on tribrid variable-body scans |
118
+ | Orbitrap Eclipse (EThcD) | EThcD | Electron-transfer + supplemental HCD, two-clause filter|
119
+ | Q Exactive Plus (DDA-2) | DDA | Second Q Exactive Plus vintage for regression |
120
+ | Orbitrap Fusion Lumos (UVPD) | UVPD | Ultraviolet photodissociation, tests Activation::Uvpd |
121
+ | Q Exactive HF (DIA) | DIA | Fixed-window SWATH-like DIA on Q Exactive |
122
+ | Orbitrap Exploris 480 (DDA-2) | DDA | Second firmware vintage for regression |
123
+ | TSQ Altis (SRM-2) | SRM-2 | Second SRM file from a different dataset |
124
+ | Q Exactive HF-X (PRM) | PRM | Parallel reaction monitoring: 42 targets, |
125
+ | | | 7-minute gradient, SARS-CoV-2 peptides |
126
+
127
+ ### Multi-controller coverage
128
+
129
+ Several Tier 1 files carry `controller_count > 1` in their
130
+ `RawFileInfoPreamble`, meaning the RAW file contains a UV/analog chromatogram
131
+ channel alongside the MS data stream. The parser exercises the
132
+ multi-controller selection path (reader.rs `select_ms_run_header`) for these:
133
+
134
+ | File (Tier 1 instrument) | `controller_count` | Confirmed date |
135
+ | ------------------------- | :----------------: | -------------- |
136
+ | Orbitrap Fusion | 2 | 2016-12 |
137
+ | Orbitrap Fusion Lumos | 2 | 2016-03 |
138
+ | LTQ Orbitrap (PXD069348) | 3 | 2014-02 |
139
+
140
+ The selection heuristic — `ntrailer > 0` (v64+) or `nsegs > 0 && first_scan
141
+ <= last_scan` (v63) — correctly identifies the MS controller in every case.
142
+
143
+ ## Limitations
144
+
145
+ - PRIDE's metadata lists declared instrument names; a few submitters
146
+ mislabel files. Device detection in the parser is therefore best-effort.
147
+ - Some instrument lines (Astral, top-down ETD workflows) have few publicly
148
+ available files on PRIDE. The `count` values in `sources.json` are
149
+ capped at the number of files actually present in the FTP directory.
150
+
151
+ ## Open Issues
152
+
153
+ ### DIA isolation window m/z (Orbitrap Exploris 480 and similar)
154
+
155
+ For the Exploris 480 DIA files in the corpus (PXD035500), the isolation window
156
+ center m/z is currently absent from filter strings. Investigation findings:
157
+
158
+ - **Scan event body format**: DIA MS2 scan events use a uniform 136-byte body
159
+ (event size = 272 bytes total). The body[8..12] f32 field holds a value in
160
+ the range ~3.8-5.0, which is in instrument frequency space, not m/z. There
161
+ is no reaction structure (np = 0 at body[4..8]) and no m/z at any body offset.
162
+
163
+ - **Scan params**: The file has 1004 bytes/scan of scan params data starting at
164
+ `scan_params_addr`, but the GenericDataHeader (GDH) that describes the record
165
+ schema was not found anywhere in the 8 MB window between the error log and
166
+ scan_trailer that `find_forward` searches. As a result `scan_parameters` is
167
+ empty for all scans in this file and `ScanParams` accessors return `None`.
168
+
169
+ - **What is needed**: Locate the GDH for Exploris 480 scan params (it may be
170
+ outside the current search window, or use a different header format). Once
171
+ the schema is found, the calibration coefficients (conversion parameters A, B,
172
+ C) inside the scan params record can be used to convert frequency to m/z and
173
+ recover the isolation window center.
174
+
175
+ - **Workaround**: For instruments where the GDH is found correctly (Q Exactive,
176
+ Fusion Lumos, Eclipse), `ScanParams::isolation_width_mz()` and the
177
+ `monoisotopic_mz()` family already work. Eclipse DIA files (PXD038440, once
178
+ downloaded) will clarify whether tribrid instruments store the isolation m/z
179
+ in the reaction structure (np > 0) as DDA scans do, bypassing the calibration
180
+ problem entirely.
181
+
182
+ ### Acquisition modes not yet in corpus
183
+
184
+ | Mode | Notes |
185
+ | ---- | ----- |
186
+ | Eclipse DIA | DIA on tribrid Orbitrap: needed to confirm whether tribrid instruments store isolation m/z in reaction structure (np>0) as DDA scans do. No confirmed Eclipse DIA PRIDE accession with accessible RAW files identified yet. The existing Fusion Lumos DIA files (PXD031322) show the same filter gap as Exploris 480, suggesting the isolation m/z is absent in the tribrid scan event body for DIA as well. |
187
+ | SPS-MS3 (TMT) | Synchronous precursor selection MS3 for isobaric quantification; differs from standard MS3 in the number of simultaneous precursor m/z in the scan event body. |
188
+ | ECD / IRMPD | Both enum variants implemented; no corpus files yet. |
@@ -0,0 +1,65 @@
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "opentfraw"
7
+ version = "1.0.0"
8
+ dependencies = [
9
+ "thiserror",
10
+ ]
11
+
12
+ [[package]]
13
+ name = "proc-macro2"
14
+ version = "1.0.106"
15
+ source = "registry+https://github.com/rust-lang/crates.io-index"
16
+ checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
17
+ dependencies = [
18
+ "unicode-ident",
19
+ ]
20
+
21
+ [[package]]
22
+ name = "quote"
23
+ version = "1.0.45"
24
+ source = "registry+https://github.com/rust-lang/crates.io-index"
25
+ checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
26
+ dependencies = [
27
+ "proc-macro2",
28
+ ]
29
+
30
+ [[package]]
31
+ name = "syn"
32
+ version = "2.0.117"
33
+ source = "registry+https://github.com/rust-lang/crates.io-index"
34
+ checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
35
+ dependencies = [
36
+ "proc-macro2",
37
+ "quote",
38
+ "unicode-ident",
39
+ ]
40
+
41
+ [[package]]
42
+ name = "thiserror"
43
+ version = "2.0.18"
44
+ source = "registry+https://github.com/rust-lang/crates.io-index"
45
+ checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
46
+ dependencies = [
47
+ "thiserror-impl",
48
+ ]
49
+
50
+ [[package]]
51
+ name = "thiserror-impl"
52
+ version = "2.0.18"
53
+ source = "registry+https://github.com/rust-lang/crates.io-index"
54
+ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
55
+ dependencies = [
56
+ "proc-macro2",
57
+ "quote",
58
+ "syn",
59
+ ]
60
+
61
+ [[package]]
62
+ name = "unicode-ident"
63
+ version = "1.0.24"
64
+ source = "registry+https://github.com/rust-lang/crates.io-index"
65
+ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
@@ -0,0 +1,33 @@
1
+ [package]
2
+ name = "opentfraw"
3
+ version = "1.0.0"
4
+ edition = "2021"
5
+ rust-version = "1.75"
6
+ description = "Rust parser for Thermo Fisher RAW mass spectrometry files."
7
+ authors = ["Nathan Riley <git@nathanriley.com>"]
8
+ license = "Apache-2.0"
9
+ repository = "https://github.com/Sigilweaver/OpenTFRaw"
10
+ homepage = "https://github.com/Sigilweaver/OpenTFRaw"
11
+ readme = "README.md"
12
+ keywords = ["mass-spectrometry", "thermo", "raw", "proteomics", "orbitrap"]
13
+ categories = ["parser-implementations", "science"]
14
+
15
+ [lints.rust]
16
+ unsafe_code = "forbid"
17
+
18
+ [dependencies]
19
+ thiserror = "2"
20
+
21
+ [dev-dependencies]
22
+
23
+ [[example]]
24
+ name = "dump"
25
+ path = "examples/dump.rs"
26
+
27
+ [[example]]
28
+ name = "to_mzml"
29
+ path = "examples/to_mzml.rs"
30
+
31
+ # Keep the Python bindings crate out of the parent's cargo invocations.
32
+ [workspace]
33
+ exclude = ["python"]