piscem 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- piscem-0.1.0/.github/workflows/publish-py-piscem.yml +57 -0
- piscem-0.1.0/.gitignore +30 -0
- piscem-0.1.0/CLAUDE.md +246 -0
- piscem-0.1.0/Cargo.lock +3079 -0
- piscem-0.1.0/Cargo.toml +67 -0
- piscem-0.1.0/PKG-INFO +9 -0
- piscem-0.1.0/README.md +87 -0
- piscem-0.1.0/crates/py-piscem/Cargo.toml +15 -0
- piscem-0.1.0/crates/py-piscem/README.md +159 -0
- piscem-0.1.0/crates/py-piscem/piscem.pyi +116 -0
- piscem-0.1.0/crates/py-piscem/src/lib.rs +1021 -0
- piscem-0.1.0/crates/py-piscem/tests/test_piscem.py +216 -0
- piscem-0.1.0/examples/atac_mismatch_diag.rs +325 -0
- piscem-0.1.0/examples/index_stats.rs +57 -0
- piscem-0.1.0/examples/test_atac_compare.rs +33 -0
- piscem-0.1.0/implementation_plan.md +606 -0
- piscem-0.1.0/pyproject.toml +20 -0
- piscem-0.1.0/release-py.sh +114 -0
- piscem-0.1.0/src/cli/build.rs +51 -0
- piscem-0.1.0/src/cli/inspect.rs +12 -0
- piscem-0.1.0/src/cli/map_bulk.rs +294 -0
- piscem-0.1.0/src/cli/map_scatac.rs +310 -0
- piscem-0.1.0/src/cli/map_scrna.rs +304 -0
- piscem-0.1.0/src/cli/mod.rs +42 -0
- piscem-0.1.0/src/cli/parity.rs +26 -0
- piscem-0.1.0/src/cli/poison.rs +79 -0
- piscem-0.1.0/src/index/build.rs +652 -0
- piscem-0.1.0/src/index/build_poison.rs +370 -0
- piscem-0.1.0/src/index/contig_table.rs +620 -0
- piscem-0.1.0/src/index/eq_classes.rs +625 -0
- piscem-0.1.0/src/index/formats.rs +5 -0
- piscem-0.1.0/src/index/mod.rs +8 -0
- piscem-0.1.0/src/index/poison_table.rs +685 -0
- piscem-0.1.0/src/index/reference_index.rs +499 -0
- piscem-0.1.0/src/index/refinfo.rs +269 -0
- piscem-0.1.0/src/io/fastx.rs +83 -0
- piscem-0.1.0/src/io/map_info.rs +113 -0
- piscem-0.1.0/src/io/mod.rs +4 -0
- piscem-0.1.0/src/io/rad.rs +919 -0
- piscem-0.1.0/src/io/threads.rs +113 -0
- piscem-0.1.0/src/lib.rs +7 -0
- piscem-0.1.0/src/main.rs +15 -0
- piscem-0.1.0/src/mapping/binning.rs +229 -0
- piscem-0.1.0/src/mapping/cache.rs +130 -0
- piscem-0.1.0/src/mapping/chain_state.rs +527 -0
- piscem-0.1.0/src/mapping/engine.rs +732 -0
- piscem-0.1.0/src/mapping/filters.rs +505 -0
- piscem-0.1.0/src/mapping/hit_searcher.rs +1100 -0
- piscem-0.1.0/src/mapping/hits.rs +263 -0
- piscem-0.1.0/src/mapping/kmer_value.rs +70 -0
- piscem-0.1.0/src/mapping/map_fragment.rs +156 -0
- piscem-0.1.0/src/mapping/merge_pairs.rs +522 -0
- piscem-0.1.0/src/mapping/mod.rs +17 -0
- piscem-0.1.0/src/mapping/overlap.rs +320 -0
- piscem-0.1.0/src/mapping/processors.rs +1042 -0
- piscem-0.1.0/src/mapping/projected_hits.rs +411 -0
- piscem-0.1.0/src/mapping/protocols/bulk.rs +81 -0
- piscem-0.1.0/src/mapping/protocols/custom.rs +512 -0
- piscem-0.1.0/src/mapping/protocols/mod.rs +100 -0
- piscem-0.1.0/src/mapping/protocols/scatac.rs +104 -0
- piscem-0.1.0/src/mapping/protocols/scrna.rs +281 -0
- piscem-0.1.0/src/mapping/sketch_hit_simple.rs +290 -0
- piscem-0.1.0/src/mapping/streaming_query.rs +189 -0
- piscem-0.1.0/src/mapping/unitig_end_cache.rs +154 -0
- piscem-0.1.0/src/verify/index_compare.rs +114 -0
- piscem-0.1.0/src/verify/mod.rs +3 -0
- piscem-0.1.0/src/verify/parity.rs +116 -0
- piscem-0.1.0/src/verify/rad_compare.rs +1493 -0
- piscem-0.1.0/tests/parity_smoke.rs +134 -0
- piscem-0.1.0/tests/rad_parity_atac.rs +172 -0
- piscem-0.1.0/tests/rad_parity_bulk.rs +1073 -0
- piscem-0.1.0/tests/rad_parity_sc.rs +439 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
name: Publish piscem Python package
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["py-piscem-v[0-9]*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
name: Build (${{ matrix.target }})
|
|
10
|
+
runs-on: ${{ matrix.os }}
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
include:
|
|
14
|
+
- os: ubuntu-latest
|
|
15
|
+
target: x86_64
|
|
16
|
+
sdist: true
|
|
17
|
+
- os: ubuntu-latest
|
|
18
|
+
target: aarch64
|
|
19
|
+
sdist: false
|
|
20
|
+
- os: macos-latest
|
|
21
|
+
target: universal2-apple-darwin
|
|
22
|
+
sdist: false
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v4
|
|
25
|
+
|
|
26
|
+
- uses: PyO3/maturin-action@v1
|
|
27
|
+
with:
|
|
28
|
+
target: ${{ matrix.target }}
|
|
29
|
+
manylinux: auto
|
|
30
|
+
working-directory: crates/py-piscem
|
|
31
|
+
args: >-
|
|
32
|
+
--release
|
|
33
|
+
--out dist
|
|
34
|
+
${{ matrix.sdist && '--sdist' || '' }}
|
|
35
|
+
|
|
36
|
+
- uses: actions/upload-artifact@v4
|
|
37
|
+
with:
|
|
38
|
+
name: wheels-${{ matrix.target }}
|
|
39
|
+
path: crates/py-piscem/dist
|
|
40
|
+
|
|
41
|
+
publish:
|
|
42
|
+
name: Publish to PyPI
|
|
43
|
+
needs: build
|
|
44
|
+
runs-on: ubuntu-latest
|
|
45
|
+
environment:
|
|
46
|
+
name: pypi
|
|
47
|
+
url: https://pypi.org/project/piscem/
|
|
48
|
+
permissions:
|
|
49
|
+
id-token: write
|
|
50
|
+
steps:
|
|
51
|
+
- uses: actions/download-artifact@v4
|
|
52
|
+
with:
|
|
53
|
+
pattern: wheels-*
|
|
54
|
+
merge-multiple: true
|
|
55
|
+
path: dist
|
|
56
|
+
|
|
57
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
piscem-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Generated by Cargo
|
|
2
|
+
# will have compiled files and executables
|
|
3
|
+
debug
|
|
4
|
+
target
|
|
5
|
+
|
|
6
|
+
# These are backup files generated by rustfmt
|
|
7
|
+
**/*.rs.bk
|
|
8
|
+
|
|
9
|
+
# MSVC Windows builds of rustc generate these, which store debugging information
|
|
10
|
+
*.pdb
|
|
11
|
+
|
|
12
|
+
# Generated by cargo mutants
|
|
13
|
+
# Contains mutation testing data
|
|
14
|
+
**/mutants.out*/
|
|
15
|
+
|
|
16
|
+
# RustRover
|
|
17
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
18
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
19
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
20
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
21
|
+
#.idea/
|
|
22
|
+
|
|
23
|
+
.claude/settings.local.json
|
|
24
|
+
|
|
25
|
+
# Local Cargo config (contains [patch] for sshash-rs local checkout)
|
|
26
|
+
.cargo/
|
|
27
|
+
|
|
28
|
+
# sshash-rs is an independent repo; use a local checkout for development
|
|
29
|
+
# and override in .cargo/config.toml if needed
|
|
30
|
+
sshash-rs/
|
piscem-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# piscem-rs Development Context
|
|
2
|
+
|
|
3
|
+
## Project Overview
|
|
4
|
+
|
|
5
|
+
This is a Rust port of the C++ `piscem` bioinformatics tool for k-mer-based read mapping. The Rust implementation (`piscem-rs`) must produce **semantically equivalent** outputs to the C++ version (not byte-identical). It depends on `sshash-rs` (git dependency on `https://github.com/COMBINE-lab/sshash-rs.git`, branch `main`) for the compressed k-mer dictionary. For development, a local checkout at `./sshash-rs/` can be used instead; Cargo picks it up when a `[patch]` override is present in the local, gitignored `.cargo/` config.
|
|
6
|
+
|
|
7
|
+
The full implementation plan with C++ → Rust type mappings, architectural notes, and phased roadmap is in `implementation_plan.md`. Read it before starting new phases.
|
|
8
|
+
|
|
9
|
+
## Current Status
|
|
10
|
+
|
|
11
|
+
### Completed Phases
|
|
12
|
+
|
|
13
|
+
- **Phase 0**: Project bootstrap with CLI skeleton and parity harness scaffolding
|
|
14
|
+
- **Phase 1A–1E: Index data structures + build pipeline** — ContigTable (EF offsets + packed entries), RefInfo, ReferenceIndex, EqClassMap, end-to-end build from cuttlefish output
|
|
15
|
+
- **Phase 2: PoisonTable** — `AHashMap<CanonicalKmer, u64>` with fixed-seed ahash, serialization (`PPOIS01\0`), query methods
|
|
16
|
+
- **Phase 3: Mapping core** — ProjectedHits, PiscemStreamingQuery (sshash-rs wrapper + unitig-end cache), HitSearcher (PERMISSIVE/STRICT modes)
|
|
17
|
+
- **Phase 4: Mapping infrastructure** — `map_read<K, S>()` kernel, MappingCache, SketchHitInfo trait, RadWriter, Protocol trait
|
|
18
|
+
- **Phase 5: Protocol implementations + CLI** — Bulk/scRNA/scATAC mapping CLIs, ChromiumProtocol, custom geometry parser
|
|
19
|
+
- **Phase 6: Hardening** — UnitigEndCache (DashMap), overlap detection, genome binning, parity harness
|
|
20
|
+
- **Phase 7: Poison builder + CanonicalKmer** — `build-poison` CLI, CanonicalKmer newtype
|
|
21
|
+
- **Phase 8: scATAC parity** — Triple-file input, every-kmer mode, bin-based merge, 100% record parity
|
|
22
|
+
- **Phase 9: Idiomatic paraseq refactor** — Replaced custom crossbeam producer-consumer pipeline with paraseq's native `ParallelProcessor`/`PairedParallelProcessor`/`MultiParallelProcessor` traits. Eliminated intermediate `ReadPair`/`ReadTriplet` owned copies; reads processed in-place from paraseq buffers (zero-copy for single-line FASTQ). Per-thread stats flushed once via `on_thread_complete()` instead of per-chunk atomics.
|
|
23
|
+
- **Phase 10: Multi-file parallel decompression** — Switched from concatenated single-reader streams (`open_concatenated_readers` + `fastq::Reader`) to paraseq's `Collection` API (`fastx::Collection` with `CollectionType::Paired`/`Single`/`Multi`). Enables parallel decompression across multiple input file sets. Processor trait impls updated from `fastq::RefRecord` to `fastx::RefRecord`.
|
|
24
|
+
|
|
25
|
+
### Parity Status
|
|
26
|
+
|
|
27
|
+
| Mode | Dataset | Mapping Rate | Record-Level Parity |
|
|
28
|
+
|------|---------|-------------|-------------------|
|
|
29
|
+
| Bulk PE | gencode_pc_v44 (no poison) | 100% match (96.46%) | 100% (964,594/964,594) |
|
|
30
|
+
| Bulk PE | gencode_pc_v44 (with poison) | 100% match | 100% (961,505/961,505) |
|
|
31
|
+
| Bulk PE (strict) | gencode_pc_v44 | 100% match | 100% |
|
|
32
|
+
| Bulk SE | gencode_pc_v44 | 100% match | 83.65% (tie-breaking differences expected) |
|
|
33
|
+
| scRNA | SRR12623882 (Chromium V3) | 100% match | 100% |
|
|
34
|
+
| scRNA | PBMC 1k v3 (33.4M reads) | 100% match (86.64%) | 100% (28,968,858/28,968,858) |
|
|
35
|
+
| scATAC | 5M ATAC reads (hg38 k25) | 100% match (98.33%) | 100% (4,916,721/4,916,721) |
|
|
36
|
+
|
|
37
|
+
### Performance Status
|
|
38
|
+
|
|
39
|
+
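Rust is **faster than C++** across both bulk and scRNA workloads (Apple Silicon M2 Max unless a platform is listed in the table). In the tables below, Ratio is Rust time divided by C++ time, so values under 1.0x mean Rust is faster: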
|
|
40
|
+
|
|
41
|
+
**Bulk PE** (1M reads, gencode v44):
|
|
42
|
+
|
|
43
|
+
| Threads | C++ | Rust | Ratio |
|
|
44
|
+
|--------:|----:|-----:|------:|
|
|
45
|
+
| 1 | 14.3s | 14.0s | 0.98x |
|
|
46
|
+
| 4 | 3.9s | 3.8s | 0.96x |
|
|
47
|
+
| 8 | 3.3s | 2.4s | 0.71x |
|
|
48
|
+
|
|
49
|
+
**scRNA** (PBMC 1k v3, 33.4M reads, Chromium V3, gencode v44, 237K refs):
|
|
50
|
+
|
|
51
|
+
| Platform | Threads | C++ | Rust | Ratio |
|
|
52
|
+
|----------|--------:|----:|-----:|------:|
|
|
53
|
+
| Apple Silicon M2 Max | 8 | 114s | 111s | 0.97x |
|
|
54
|
+
| x86-64 Linux | 8 | 55s | 47s | 0.85x |
|
|
55
|
+
|
|
56
|
+
Mapping counts are identical: 28,968,858 / 33,436,697 (86.64%) for both implementations.
|
|
57
|
+
|
|
58
|
+
Key optimizations applied:
|
|
59
|
+
- **AHashMap for hit_map**: Replaced `nohash-hasher` (identity hash) which caused pathological SwissTable H2 collisions with sequential transcript IDs (~38% regression on scRNA with 237K refs). `AHashMap` properly distributes hash bits for SwissTable SIMD probing.
|
|
60
|
+
- **AHashSet for observed_ecs**: Replaced standard `HashSet<u64>` (SipHash) with `AHashSet<u64>` matching C++ `ankerl::unordered_dense::set` performance.
|
|
61
|
+
- **rapidhash in sshash-rs**: Replaced ahash for MPHF and minimizer hashing. ahash switches algorithm when AES-NI is available (via `target-cpu=native`), silently breaking serialized indices. rapidhash is CPU-feature independent.
|
|
62
|
+
- **Optional UnitigEndCache**: Only scATAC uses the cache; bulk and scRNA pass `None`, avoiding DashMap overhead. This was the primary source of the x86-64 performance gap.
|
|
63
|
+
- **LocatedHit**: Eliminated double `locate_with_end` Elias-Fano successor queries in dictionary lookups
|
|
64
|
+
- **from_ascii_unchecked**: Eliminated `Kmer::from_str` string round-trips (~15% of worker thread time), changed streaming query API from `&str` to `&[u8]`
|
|
65
|
+
- **Paraseq native processing**: Zero-copy read access, per-thread stat accumulation (reduced atomic contention at high thread counts)
|
|
66
|
+
- **Paraseq Collection**: Multi-file parallel decompression via `fastx::Collection` — threads distributed across reader groups when multiple file sets provided (e.g., `-1 a.fq.gz,b.fq.gz`). No regression for single-file case.
|
|
67
|
+
|
|
68
|
+
### Next Up
|
|
69
|
+
|
|
70
|
+
- Benchmark script: `/tmp/bench_piscem.sh` (reads: `test_data/sim_1M_{1,2}.fq.gz`, Rust index: `test_data/gencode_pc_v44_index_rust/`, C++ index: `test_data/gencode_pc_v44_index_cpp/`)
|
|
71
|
+
- SC perf test data: `test_data/perf_test/pbmc_1k_v3_S1_L001_R{1,2}_001.fastq.gz`, indices at `test_data/perf_test/{cpp_index,rust_index}`
|
|
72
|
+
|
|
73
|
+
## Key Design Decisions
|
|
74
|
+
|
|
75
|
+
1. **No binary compatibility with C++ index format** — Rust has its own serialization format, only semantic equivalence required
|
|
76
|
+
2. **Size efficiency matters** — serialized indices should be similar size to C++
|
|
77
|
+
3. **Default mapping strategy**: `get_raw_hits_sketch` with PERMISSIVE mode, no structural constraints initially
|
|
78
|
+
4. **C++ global mutable state → Rust struct fields**: `ref_shift`/`pos_mask` stored in `EntryEncoding`, passed by reference (not global)
|
|
79
|
+
5. **sshash-rs const-generic K**: Use `dispatch_on_k!(k, K => { ... })` at mapping entry point
|
|
80
|
+
6. **rapidhash for index hashing**: sshash-rs uses rapidhash (not ahash) for MPHF and minimizer hashing — CPU-feature independent, indices portable across `target-cpu=native`. ahash is still used in piscem-rs for ephemeral hot-path HashMaps (hit_map, observed_ecs) where portability doesn't matter.
|
|
81
|
+
7. **UnitigEndCache is scATAC-only**: Bulk and scRNA pass `None` to avoid DashMap overhead
|
|
82
|
+
8. **Succinct data structure crates**: `sux` 0.12 git main (with epserde feature), NOT `cseq` or `sucds`
|
|
83
|
+
9. **Test data**: Pre-built C++ indices expected in `test_data/` directory
|
|
84
|
+
10. **libradicl**: Use git dependency to `develop` branch for RAD comparison
|
|
85
|
+
11. **sshash-rs dependency**: Git dependency (`branch = "main"`) in Cargo.toml. Local checkout at `./sshash-rs/` is gitignored and used automatically by Cargo for development.
|
|
86
|
+
|
|
87
|
+
## Threading Architecture
|
|
88
|
+
|
|
89
|
+
The mapping pipeline uses **paraseq's `Collection` API** for parallel I/O across multiple input files:
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
paraseq Collection (Vec<fastx::Reader>, CollectionType)
|
|
93
|
+
└─ distributes threads across reader groups (auto: total_threads / num_groups)
|
|
94
|
+
└─ each group: mutex-guarded I/O → fill RecordSet → process batch
|
|
95
|
+
└─ Processor struct (one clone per thread, lazy-init state)
|
|
96
|
+
├─ process_record_pair_batch(): map reads, accumulate RAD output
|
|
97
|
+
├─ on_batch_complete(): backpatch chunk header, flush to shared file
|
|
98
|
+
└─ on_thread_complete(): flush local stats to atomics (once)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Processor pattern** (`src/mapping/processors.rs`):
|
|
102
|
+
- Processor structs hold shared `&'a` references (index, cache, output, stats) + `Option<ThreadState>`
|
|
103
|
+
- Custom `Clone` impl: copies reference pointers, sets `state: None`
|
|
104
|
+
- `ThreadState` lazily initialized on first batch via `get_or_insert_with()`
|
|
105
|
+
- `CommonThreadState<'a, K>`: HitSearcher, PiscemStreamingQuery, MappingCache×3, PoisonState, RadWriter, local counters
|
|
106
|
+
|
|
107
|
+
**Three processor types**:
|
|
108
|
+
- `BulkProcessor` — implements `PairedParallelProcessor` (PE) and `ParallelProcessor` (SE)
|
|
109
|
+
- `ScrnaProcessor` — implements `PairedParallelProcessor` with BC/UMI extraction
|
|
110
|
+
- `ScatacProcessor` — implements `MultiParallelProcessor` for triple-file (R1 + barcode + R2) input
|
|
111
|
+
|
|
112
|
+
**CLI pattern** (`src/cli/map_*.rs`):
|
|
113
|
+
```rust
|
|
114
|
+
dispatch_on_k!(k, K => {
|
|
115
|
+
let mut processor = BulkProcessor::<K>::new(index, end_cache, output, stats, strat);
|
|
116
|
+
let mut readers = Vec::new();
|
|
117
|
+
for (r1, r2) in read1_paths.iter().zip(read2_paths.iter()) {
|
|
118
|
+
readers.push(paraseq::fastx::Reader::new(open_with_decompression(r1)?)?);
|
|
119
|
+
readers.push(paraseq::fastx::Reader::new(open_with_decompression(r2)?)?);
|
|
120
|
+
}
|
|
121
|
+
let collection = Collection::new(readers, CollectionType::Paired)?;
|
|
122
|
+
collection.process_parallel_paired(&mut processor, num_threads, None)?;
|
|
123
|
+
});
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Critical Import Patterns
|
|
127
|
+
|
|
128
|
+
These caused compilation errors previously — remember them:
|
|
129
|
+
|
|
130
|
+
```rust
|
|
131
|
+
// BitFieldVec requires these trait imports for index_value/set_value/len:
|
|
132
|
+
use value_traits::slices::{SliceByValue, SliceByValueMut};
|
|
133
|
+
|
|
134
|
+
// epserde serialization:
|
|
135
|
+
use epserde::ser::Serialize;
|
|
136
|
+
use epserde::deser::Deserialize;
|
|
137
|
+
|
|
138
|
+
// Elias-Fano (sux-rs):
|
|
139
|
+
use sux::dict::elias_fano::{EliasFanoBuilder, EfSeq, EfSeqDict};
|
|
140
|
+
use sux::traits::{IndexedSeq, Succ};
|
|
141
|
+
|
|
142
|
+
// mem_size() for sux-rs types (sux 0.12 / mem_dbg 0.4):
|
|
143
|
+
use mem_dbg::{MemSize, SizeFlags};
|
|
144
|
+
// ef.mem_size(SizeFlags::default())
|
|
145
|
+
|
|
146
|
+
// paraseq parallel processing (explicit lifetime on trait impl):
|
|
147
|
+
use paraseq::parallel::{ParallelProcessor, PairedParallelProcessor, MultiParallelProcessor};
|
|
148
|
+
use paraseq::fastx::{Collection, CollectionType};
|
|
149
|
+
// Processors impl traits for fastx::RefRecord (not fastq::RefRecord) — Collection uses fastx
|
|
150
|
+
// impl<'a, 'r, const K: usize> PairedParallelProcessor<paraseq::fastx::RefRecord<'r>> for MyProcessor<'a, K>
|
|
151
|
+
// (NOT RefRecord<'_> — anonymous lifetimes in impl Trait are unstable)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Project Structure
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
piscem-rs/
|
|
158
|
+
Cargo.toml # Main crate config (sshash-lib = git dep)
|
|
159
|
+
implementation_plan.md # Detailed roadmap (READ THIS)
|
|
160
|
+
sshash-rs/ # Local sshash-rs checkout (gitignored, auto-used by Cargo)
|
|
161
|
+
src/
|
|
162
|
+
lib.rs # Modules: cli, index, io, mapping, verify
|
|
163
|
+
main.rs # Entry point
|
|
164
|
+
index/
|
|
165
|
+
mod.rs # build, build_poison, contig_table, eq_classes, poison_table, formats, reference_index, refinfo
|
|
166
|
+
build.rs # End-to-end index build pipeline from cuttlefish output
|
|
167
|
+
build_poison.rs # Edge-method poison table builder from decoy FASTA
|
|
168
|
+
contig_table.rs # EF offsets + BitFieldVec entries
|
|
169
|
+
refinfo.rs # Reference names and lengths
|
|
170
|
+
reference_index.rs # Assembles Dictionary + ContigTable + RefInfo
|
|
171
|
+
eq_classes.rs # EC map: tile → EC → (transcript_id, orientation)
|
|
172
|
+
poison_table.rs # Poison k-mer table with AHashMap<CanonicalKmer, u64>
|
|
173
|
+
formats.rs # ArtifactFormat enum
|
|
174
|
+
cli/
|
|
175
|
+
build.rs # Index build CLI
|
|
176
|
+
poison.rs # build-poison CLI (decoy scanning + save)
|
|
177
|
+
map_bulk.rs # Bulk mapping CLI — creates BulkProcessor, calls paraseq
|
|
178
|
+
map_scrna.rs # scRNA mapping CLI — creates ScrnaProcessor, calls paraseq
|
|
179
|
+
map_scatac.rs # scATAC mapping CLI — creates ScatacProcessor, calls paraseq
|
|
180
|
+
mapping/
|
|
181
|
+
processors.rs # BulkProcessor, ScrnaProcessor, ScatacProcessor (paraseq trait impls)
|
|
182
|
+
kmer_value.rs # CanonicalKmer newtype (u64-backed, upgrade path for k>31)
|
|
183
|
+
unitig_end_cache.rs # UnitigEndCache with DashMap<CanonicalKmer>, orientation-aware
|
|
184
|
+
hit_searcher.rs # HitSearcher, ReadKmerIter, PERMISSIVE/STRICT modes
|
|
185
|
+
projected_hits.rs # RefPos, ProjectedHits<'a>, decode_hit()
|
|
186
|
+
streaming_query.rs # PiscemStreamingQuery<'a, K> wrapper + unitig-end cache integration
|
|
187
|
+
hits.rs # MappingType, HitDirection, SimpleHit, SketchHitInfo trait
|
|
188
|
+
sketch_hit_simple.rs # SketchHitInfoSimple (no-constraint default)
|
|
189
|
+
chain_state.rs # SketchHitInfoChained (optional structural constraints)
|
|
190
|
+
filters.rs # PoisonState, scan_raw_hits, CanonicalKmerIter
|
|
191
|
+
cache.rs # MappingCache<S> generic mapping state
|
|
192
|
+
engine.rs # map_read<K,S>() kernel + map_read_atac<K,S>() bin-based kernel
|
|
193
|
+
merge_pairs.rs # merge_se_mappings() + merge_se_mappings_binned() bin-aware PE merge
|
|
194
|
+
map_fragment.rs # SE/PE helpers + ATAC bin-based variants
|
|
195
|
+
overlap.rs # Mate overlap detection (dovetail/regular + seed-based alignment)
|
|
196
|
+
binning.rs # BinPos cumulative per-ref binning matching C++ bin_pos
|
|
197
|
+
protocols/
|
|
198
|
+
mod.rs # Protocol trait + AlignableReads + TechSeqs
|
|
199
|
+
bulk.rs # BulkProtocol
|
|
200
|
+
scrna.rs # ChromiumProtocol (V2/V2_5p/V3/V3_5p/V4_3p) + barcode recovery
|
|
201
|
+
scatac.rs # ScatacProtocol for scATAC-seq
|
|
202
|
+
custom.rs # CustomProtocol + geometry parser (recursive descent)
|
|
203
|
+
io/
|
|
204
|
+
rad.rs # RadWriter + RAD headers/records (SC + bulk + ATAC)
|
|
205
|
+
fastx.rs # open_with_decompression(), Collection/CollectionType re-exports, MultiReader
|
|
206
|
+
threads.rs # OutputInfo (mutex RAD file), MappingStats (atomic counters), ThreadConfig
|
|
207
|
+
map_info.rs # map_info.json writer
|
|
208
|
+
verify/
|
|
209
|
+
mod.rs # Module declarations
|
|
210
|
+
index_compare.rs # Index semantic comparison (ref metadata)
|
|
211
|
+
rad_compare.rs # RAD file comparison (binary header parser)
|
|
212
|
+
parity.rs # Parity orchestration + JSON report
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Terminology Bridge
|
|
216
|
+
|
|
217
|
+
| sshash-rs term | piscem-cpp term | Meaning |
|
|
218
|
+
|---|---|---|
|
|
219
|
+
| `string_id` | `contig_id` | Unitig identifier |
|
|
220
|
+
| `kmer_id_in_string` | `kmer_id_in_contig` | K-mer position within unitig |
|
|
221
|
+
| `num_strings()` | num contigs | Number of unitigs |
|
|
222
|
+
|
|
223
|
+
## Running Tests
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
cargo test # All 179 unit tests should pass
|
|
227
|
+
cargo check # Should compile clean with no warnings
|
|
228
|
+
RUST_LOG=info cargo run # Run with logging
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Parity Tests (require test data + `--features parity-test`)
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
cargo test --features parity-test --release --test rad_parity_bulk -- --ignored --nocapture
|
|
235
|
+
cargo test --features parity-test --release --test rad_parity_sc -- --ignored --nocapture
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## C++ Reference Code
|
|
239
|
+
|
|
240
|
+
The C++ piscem codebase should be available in `piscem-cpp/` in the current directory for cross-reference. Key files:
|
|
241
|
+
- `include/reference_index.hpp` — C++ ReferenceIndex
|
|
242
|
+
- `include/basic_contig_table.hpp` — C++ contig table
|
|
243
|
+
- `include/hit_searcher.hpp` / `src/hit_searcher.cpp` — Hit collection (~1400 lines)
|
|
244
|
+
- `include/mapping/utils.hpp` — `map_read()` kernel
|
|
245
|
+
- `include/projected_hits.hpp` — Projected hits and `decode_hit()`
|
|
246
|
+
- `include/streaming_query.hpp` — piscem-level streaming query wrapper
|