piscem 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. piscem-0.1.0/.github/workflows/publish-py-piscem.yml +57 -0
  2. piscem-0.1.0/.gitignore +30 -0
  3. piscem-0.1.0/CLAUDE.md +246 -0
  4. piscem-0.1.0/Cargo.lock +3079 -0
  5. piscem-0.1.0/Cargo.toml +67 -0
  6. piscem-0.1.0/PKG-INFO +9 -0
  7. piscem-0.1.0/README.md +87 -0
  8. piscem-0.1.0/crates/py-piscem/Cargo.toml +15 -0
  9. piscem-0.1.0/crates/py-piscem/README.md +159 -0
  10. piscem-0.1.0/crates/py-piscem/piscem.pyi +116 -0
  11. piscem-0.1.0/crates/py-piscem/src/lib.rs +1021 -0
  12. piscem-0.1.0/crates/py-piscem/tests/test_piscem.py +216 -0
  13. piscem-0.1.0/examples/atac_mismatch_diag.rs +325 -0
  14. piscem-0.1.0/examples/index_stats.rs +57 -0
  15. piscem-0.1.0/examples/test_atac_compare.rs +33 -0
  16. piscem-0.1.0/implementation_plan.md +606 -0
  17. piscem-0.1.0/pyproject.toml +20 -0
  18. piscem-0.1.0/release-py.sh +114 -0
  19. piscem-0.1.0/src/cli/build.rs +51 -0
  20. piscem-0.1.0/src/cli/inspect.rs +12 -0
  21. piscem-0.1.0/src/cli/map_bulk.rs +294 -0
  22. piscem-0.1.0/src/cli/map_scatac.rs +310 -0
  23. piscem-0.1.0/src/cli/map_scrna.rs +304 -0
  24. piscem-0.1.0/src/cli/mod.rs +42 -0
  25. piscem-0.1.0/src/cli/parity.rs +26 -0
  26. piscem-0.1.0/src/cli/poison.rs +79 -0
  27. piscem-0.1.0/src/index/build.rs +652 -0
  28. piscem-0.1.0/src/index/build_poison.rs +370 -0
  29. piscem-0.1.0/src/index/contig_table.rs +620 -0
  30. piscem-0.1.0/src/index/eq_classes.rs +625 -0
  31. piscem-0.1.0/src/index/formats.rs +5 -0
  32. piscem-0.1.0/src/index/mod.rs +8 -0
  33. piscem-0.1.0/src/index/poison_table.rs +685 -0
  34. piscem-0.1.0/src/index/reference_index.rs +499 -0
  35. piscem-0.1.0/src/index/refinfo.rs +269 -0
  36. piscem-0.1.0/src/io/fastx.rs +83 -0
  37. piscem-0.1.0/src/io/map_info.rs +113 -0
  38. piscem-0.1.0/src/io/mod.rs +4 -0
  39. piscem-0.1.0/src/io/rad.rs +919 -0
  40. piscem-0.1.0/src/io/threads.rs +113 -0
  41. piscem-0.1.0/src/lib.rs +7 -0
  42. piscem-0.1.0/src/main.rs +15 -0
  43. piscem-0.1.0/src/mapping/binning.rs +229 -0
  44. piscem-0.1.0/src/mapping/cache.rs +130 -0
  45. piscem-0.1.0/src/mapping/chain_state.rs +527 -0
  46. piscem-0.1.0/src/mapping/engine.rs +732 -0
  47. piscem-0.1.0/src/mapping/filters.rs +505 -0
  48. piscem-0.1.0/src/mapping/hit_searcher.rs +1100 -0
  49. piscem-0.1.0/src/mapping/hits.rs +263 -0
  50. piscem-0.1.0/src/mapping/kmer_value.rs +70 -0
  51. piscem-0.1.0/src/mapping/map_fragment.rs +156 -0
  52. piscem-0.1.0/src/mapping/merge_pairs.rs +522 -0
  53. piscem-0.1.0/src/mapping/mod.rs +17 -0
  54. piscem-0.1.0/src/mapping/overlap.rs +320 -0
  55. piscem-0.1.0/src/mapping/processors.rs +1042 -0
  56. piscem-0.1.0/src/mapping/projected_hits.rs +411 -0
  57. piscem-0.1.0/src/mapping/protocols/bulk.rs +81 -0
  58. piscem-0.1.0/src/mapping/protocols/custom.rs +512 -0
  59. piscem-0.1.0/src/mapping/protocols/mod.rs +100 -0
  60. piscem-0.1.0/src/mapping/protocols/scatac.rs +104 -0
  61. piscem-0.1.0/src/mapping/protocols/scrna.rs +281 -0
  62. piscem-0.1.0/src/mapping/sketch_hit_simple.rs +290 -0
  63. piscem-0.1.0/src/mapping/streaming_query.rs +189 -0
  64. piscem-0.1.0/src/mapping/unitig_end_cache.rs +154 -0
  65. piscem-0.1.0/src/verify/index_compare.rs +114 -0
  66. piscem-0.1.0/src/verify/mod.rs +3 -0
  67. piscem-0.1.0/src/verify/parity.rs +116 -0
  68. piscem-0.1.0/src/verify/rad_compare.rs +1493 -0
  69. piscem-0.1.0/tests/parity_smoke.rs +134 -0
  70. piscem-0.1.0/tests/rad_parity_atac.rs +172 -0
  71. piscem-0.1.0/tests/rad_parity_bulk.rs +1073 -0
  72. piscem-0.1.0/tests/rad_parity_sc.rs +439 -0
@@ -0,0 +1,57 @@
1
+ name: Publish piscem Python package
2
+
3
+ on:
4
+ push:
5
+ tags: ["py-piscem-v[0-9]*"]
6
+
7
+ jobs:
8
+ build:
9
+ name: Build (${{ matrix.target }})
10
+ runs-on: ${{ matrix.os }}
11
+ strategy:
12
+ matrix:
13
+ include:
14
+ - os: ubuntu-latest
15
+ target: x86_64
16
+ sdist: true
17
+ - os: ubuntu-latest
18
+ target: aarch64
19
+ sdist: false
20
+ - os: macos-latest
21
+ target: universal2-apple-darwin
22
+ sdist: false
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+
26
+ - uses: PyO3/maturin-action@v1
27
+ with:
28
+ target: ${{ matrix.target }}
29
+ manylinux: auto
30
+ working-directory: crates/py-piscem
31
+ args: >-
32
+ --release
33
+ --out dist
34
+ ${{ matrix.sdist && '--sdist' || '' }}
35
+
36
+ - uses: actions/upload-artifact@v4
37
+ with:
38
+ name: wheels-${{ matrix.target }}
39
+ path: crates/py-piscem/dist
40
+
41
+ publish:
42
+ name: Publish to PyPI
43
+ needs: build
44
+ runs-on: ubuntu-latest
45
+ environment:
46
+ name: pypi
47
+ url: https://pypi.org/project/piscem/
48
+ permissions:
49
+ id-token: write
50
+ steps:
51
+ - uses: actions/download-artifact@v4
52
+ with:
53
+ pattern: wheels-*
54
+ merge-multiple: true
55
+ path: dist
56
+
57
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,30 @@
1
+ # Generated by Cargo
2
+ # will have compiled files and executables
3
+ debug
4
+ target
5
+
6
+ # These are backup files generated by rustfmt
7
+ **/*.rs.bk
8
+
9
+ # MSVC Windows builds of rustc generate these, which store debugging information
10
+ *.pdb
11
+
12
+ # Generated by cargo mutants
13
+ # Contains mutation testing data
14
+ **/mutants.out*/
15
+
16
+ # RustRover
17
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
18
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
19
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
20
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
21
+ #.idea/
22
+
23
+ .claude/settings.local.json
24
+
25
+ # Local Cargo config (contains [patch] for sshash-rs local checkout)
26
+ .cargo/
27
+
28
+ # sshash-rs is an independent repo; use a local checkout for development
29
+ # and override in .cargo/config.toml if needed
30
+ sshash-rs/
piscem-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,246 @@
1
+ # piscem-rs Development Context
2
+
3
+ ## Project Overview
4
+
5
+ This is a Rust port of the C++ `piscem` bioinformatics tool for k-mer-based read mapping. The Rust implementation (`piscem-rs`) must produce **semantically equivalent** outputs to the C++ version (not byte-identical). It depends on `sshash-rs` (git dependency on `https://github.com/COMBINE-lab/sshash-rs.git`, branch `main`) for the compressed k-mer dictionary. For development, a local checkout at `./sshash-rs/` (gitignored) can be used instead by overriding the dependency with a `[patch]` entry in the local, gitignored `.cargo/config.toml`.
6
+
7
+ The full implementation plan with C++ → Rust type mappings, architectural notes, and phased roadmap is in `implementation_plan.md`. Read it before starting new phases.
8
+
9
+ ## Current Status
10
+
11
+ ### Completed Phases
12
+
13
+ - **Phase 0: Project bootstrap** — CLI skeleton and parity harness scaffolding
14
+ - **Phase 1A–1E: Index data structures + build pipeline** — ContigTable (EF offsets + packed entries), RefInfo, ReferenceIndex, EqClassMap, end-to-end build from cuttlefish output
15
+ - **Phase 2: PoisonTable** — `AHashMap<CanonicalKmer, u64>` with fixed-seed ahash, serialization (`PPOIS01\0`), query methods
16
+ - **Phase 3: Mapping core** — ProjectedHits, PiscemStreamingQuery (sshash-rs wrapper + unitig-end cache), HitSearcher (PERMISSIVE/STRICT modes)
17
+ - **Phase 4: Mapping infrastructure** — `map_read<K, S>()` kernel, MappingCache, SketchHitInfo trait, RadWriter, Protocol trait
18
+ - **Phase 5: Protocol implementations + CLI** — Bulk/scRNA/scATAC mapping CLIs, ChromiumProtocol, custom geometry parser
19
+ - **Phase 6: Hardening** — UnitigEndCache (DashMap), overlap detection, genome binning, parity harness
20
+ - **Phase 7: Poison builder + CanonicalKmer** — `build-poison` CLI, CanonicalKmer newtype
21
+ - **Phase 8: scATAC parity** — Triple-file input, every-kmer mode, bin-based merge, 100% record parity
22
+ - **Phase 9: Idiomatic paraseq refactor** — Replaced custom crossbeam producer-consumer pipeline with paraseq's native `ParallelProcessor`/`PairedParallelProcessor`/`MultiParallelProcessor` traits. Eliminated intermediate `ReadPair`/`ReadTriplet` owned copies; reads processed in-place from paraseq buffers (zero-copy for single-line FASTQ). Per-thread stats flushed once via `on_thread_complete()` instead of per-chunk atomics.
23
+ - **Phase 10: Multi-file parallel decompression** — Switched from concatenated single-reader streams (`open_concatenated_readers` + `fastq::Reader`) to paraseq's `Collection` API (`fastx::Collection` with `CollectionType::Paired`/`Single`/`Multi`). Enables parallel decompression across multiple input file sets. Processor trait impls updated from `fastq::RefRecord` to `fastx::RefRecord`.
24
+
25
+ ### Parity Status
26
+
27
+ | Mode | Dataset | Mapping Rate | Record-Level Parity |
28
+ |------|---------|-------------|-------------------|
29
+ | Bulk PE | gencode_pc_v44 (no poison) | 100% match (96.46%) | 100% (964,594/964,594) |
30
+ | Bulk PE | gencode_pc_v44 (with poison) | 100% match | 100% (961,505/961,505) |
31
+ | Bulk PE (strict) | gencode_pc_v44 | 100% match | 100% |
32
+ | Bulk SE | gencode_pc_v44 | 100% match | 83.65% (tie-breaking differences expected) |
33
+ | scRNA | SRR12623882 (Chromium V3) | 100% match | 100% |
34
+ | scRNA | PBMC 1k v3 (33.4M reads) | 100% match (86.64%) | 100% (28,968,858/28,968,858) |
35
+ | scATAC | 5M ATAC reads (hg38 k25) | 100% match (98.33%) | 100% (4,916,721/4,916,721) |
36
+
37
+ ### Performance Status
38
+
39
+ Rust is **faster than C++** across both bulk and scRNA workloads (bulk measured on Apple Silicon M2 Max; scRNA measured on the platforms listed in its table):
40
+
41
+ **Bulk PE** (1M reads, gencode v44):
42
+
43
+ | Threads | C++ | Rust | Ratio |
44
+ |--------:|----:|-----:|------:|
45
+ | 1 | 14.3s | 14.0s | 0.98x |
46
+ | 4 | 3.9s | 3.8s | 0.96x |
47
+ | 8 | 3.3s | 2.4s | 0.71x |
48
+
49
+ **scRNA** (PBMC 1k v3, 33.4M reads, Chromium V3, gencode v44, 237K refs):
50
+
51
+ | Platform | Threads | C++ | Rust | Ratio |
52
+ |----------|--------:|----:|-----:|------:|
53
+ | Apple Silicon M2 Max | 8 | 114s | 111s | 0.97x |
54
+ | x86-64 Linux | 8 | 55s | 47s | 0.85x |
55
+
56
+ Mapping counts are identical: 28,968,858 / 33,436,697 (86.64%) for both implementations.
57
+
58
+ Key optimizations applied:
59
+ - **AHashMap for hit_map**: Replaced `nohash-hasher` (identity hash), which caused pathological SwissTable H2 collisions with sequential transcript IDs (~38% regression on scRNA with 237K refs). `AHashMap` properly distributes hash bits for SwissTable SIMD probing.
60
+ - **AHashSet for observed_ecs**: Replaced standard `HashSet<u64>` (SipHash) with `AHashSet<u64>` matching C++ `ankerl::unordered_dense::set` performance.
61
+ - **rapidhash in sshash-rs**: Replaced ahash for MPHF and minimizer hashing. ahash switches algorithm when AES-NI is available (via `target-cpu=native`), silently breaking serialized indices. rapidhash is CPU-feature independent.
62
+ - **Optional UnitigEndCache**: Only scATAC uses the cache; bulk and scRNA pass `None`, avoiding DashMap overhead. This was the primary source of the x86-64 performance gap.
63
+ - **LocatedHit**: Eliminated double `locate_with_end` Elias-Fano successor queries in dictionary lookups
64
+ - **from_ascii_unchecked**: Eliminated `Kmer::from_str` string round-trips (~15% of worker thread time), changed streaming query API from `&str` to `&[u8]`
65
+ - **Paraseq native processing**: Zero-copy read access, per-thread stat accumulation (reduced atomic contention at high thread counts)
66
+ - **Paraseq Collection**: Multi-file parallel decompression via `fastx::Collection` — threads distributed across reader groups when multiple file sets provided (e.g., `-1 a.fq.gz,b.fq.gz`). No regression for single-file case.
67
+
68
+ ### Next Up
69
+
70
+ - Benchmark script: `/tmp/bench_piscem.sh` (reads: `test_data/sim_1M_{1,2}.fq.gz`, Rust index: `test_data/gencode_pc_v44_index_rust/`, C++ index: `test_data/gencode_pc_v44_index_cpp/`)
71
+ - SC perf test data: `test_data/perf_test/pbmc_1k_v3_S1_L001_R{1,2}_001.fastq.gz`, indices at `test_data/perf_test/{cpp_index,rust_index}`
72
+
73
+ ## Key Design Decisions
74
+
75
+ 1. **No binary compatibility with C++ index format** — Rust has its own serialization format, only semantic equivalence required
76
+ 2. **Size efficiency matters** — serialized indices should be similar size to C++
77
+ 3. **Default mapping strategy**: `get_raw_hits_sketch` with PERMISSIVE mode, no structural constraints initially
78
+ 4. **C++ global mutable state → Rust struct fields**: `ref_shift`/`pos_mask` stored in `EntryEncoding`, passed by reference (not global)
79
+ 5. **sshash-rs const-generic K**: Use `dispatch_on_k!(k, K => { ... })` at mapping entry point
80
+ 6. **rapidhash for index hashing**: sshash-rs uses rapidhash (not ahash) for MPHF and minimizer hashing — CPU-feature independent, indices portable across `target-cpu=native`. ahash is still used in piscem-rs for ephemeral hot-path HashMaps (hit_map, observed_ecs) where portability doesn't matter.
81
+ 7. **UnitigEndCache is scATAC-only**: Bulk and scRNA pass `None` to avoid DashMap overhead
82
+ 8. **Succinct data structure crates**: `sux` 0.12 git main (with epserde feature), NOT `cseq` or `sucds`
83
+ 9. **Test data**: Pre-built C++ indices expected in `test_data/` directory
84
+ 10. **libradicl**: Use git dependency to `develop` branch for RAD comparison
85
+ 11. **sshash-rs dependency**: Git dependency (`branch = "main"`) in Cargo.toml. Local checkout at `./sshash-rs/` is gitignored and used automatically by Cargo for development.
86
+
87
+ ## Threading Architecture
88
+
89
+ The mapping pipeline uses **paraseq's `Collection` API** for parallel I/O across multiple input files:
90
+
91
+ ```
92
+ paraseq Collection (Vec<fastx::Reader>, CollectionType)
93
+ └─ distributes threads across reader groups (auto: total_threads / num_groups)
94
+ └─ each group: mutex-guarded I/O → fill RecordSet → process batch
95
+ └─ Processor struct (one clone per thread, lazy-init state)
96
+ ├─ process_record_pair_batch(): map reads, accumulate RAD output
97
+ ├─ on_batch_complete(): backpatch chunk header, flush to shared file
98
+ └─ on_thread_complete(): flush local stats to atomics (once)
99
+ ```
100
+
101
+ **Processor pattern** (`src/mapping/processors.rs`):
102
+ - Processor structs hold shared `&'a` references (index, cache, output, stats) + `Option<ThreadState>`
103
+ - Custom `Clone` impl: copies reference pointers, sets `state: None`
104
+ - `ThreadState` lazily initialized on first batch via `get_or_insert_with()`
105
+ - `CommonThreadState<'a, K>`: HitSearcher, PiscemStreamingQuery, MappingCache×3, PoisonState, RadWriter, local counters
106
+
107
+ **Three processor types**:
108
+ - `BulkProcessor` — implements `PairedParallelProcessor` (PE) and `ParallelProcessor` (SE)
109
+ - `ScrnaProcessor` — implements `PairedParallelProcessor` with BC/UMI extraction
110
+ - `ScatacProcessor` — implements `MultiParallelProcessor` for triple-file (R1 + barcode + R2) input
111
+
112
+ **CLI pattern** (`src/cli/map_*.rs`):
113
+ ```rust
114
+ dispatch_on_k!(k, K => {
115
+ let mut processor = BulkProcessor::<K>::new(index, end_cache, output, stats, strat);
116
+ let mut readers = Vec::new();
117
+ for (r1, r2) in read1_paths.iter().zip(read2_paths.iter()) {
118
+ readers.push(paraseq::fastx::Reader::new(open_with_decompression(r1)?)?);
119
+ readers.push(paraseq::fastx::Reader::new(open_with_decompression(r2)?)?);
120
+ }
121
+ let collection = Collection::new(readers, CollectionType::Paired)?;
122
+ collection.process_parallel_paired(&mut processor, num_threads, None)?;
123
+ });
124
+ ```
125
+
126
+ ## Critical Import Patterns
127
+
128
+ These caused compilation errors previously — remember them:
129
+
130
+ ```rust
131
+ // BitFieldVec requires these trait imports for index_value/set_value/len:
132
+ use value_traits::slices::{SliceByValue, SliceByValueMut};
133
+
134
+ // epserde serialization:
135
+ use epserde::ser::Serialize;
136
+ use epserde::deser::Deserialize;
137
+
138
+ // Elias-Fano (sux-rs):
139
+ use sux::dict::elias_fano::{EliasFanoBuilder, EfSeq, EfSeqDict};
140
+ use sux::traits::{IndexedSeq, Succ};
141
+
142
+ // mem_size() for sux-rs types (sux 0.12 / mem_dbg 0.4):
143
+ use mem_dbg::{MemSize, SizeFlags};
144
+ // ef.mem_size(SizeFlags::default())
145
+
146
+ // paraseq parallel processing (explicit lifetime on trait impl):
147
+ use paraseq::parallel::{ParallelProcessor, PairedParallelProcessor, MultiParallelProcessor};
148
+ use paraseq::fastx::{Collection, CollectionType};
149
+ // Processors impl traits for fastx::RefRecord (not fastq::RefRecord) — Collection uses fastx
150
+ // impl<'a, 'r, const K: usize> PairedParallelProcessor<paraseq::fastx::RefRecord<'r>> for MyProcessor<'a, K>
151
+ // (NOT RefRecord<'_> — anonymous lifetimes in impl Trait are unstable)
152
+ ```
153
+
154
+ ## Project Structure
155
+
156
+ ```
157
+ piscem-rs/
158
+ Cargo.toml # Main crate config (sshash-lib = git dep)
159
+ implementation_plan.md # Detailed roadmap (READ THIS)
160
+ sshash-rs/ # Local sshash-rs checkout (gitignored, auto-used by Cargo)
161
+ src/
162
+ lib.rs # Modules: cli, index, io, mapping, verify
163
+ main.rs # Entry point
164
+ index/
165
+ mod.rs # build, build_poison, contig_table, eq_classes, poison_table, formats, reference_index, refinfo
166
+ build.rs # End-to-end index build pipeline from cuttlefish output
167
+ build_poison.rs # Edge-method poison table builder from decoy FASTA
168
+ contig_table.rs # EF offsets + BitFieldVec entries
169
+ refinfo.rs # Reference names and lengths
170
+ reference_index.rs # Assembles Dictionary + ContigTable + RefInfo
171
+ eq_classes.rs # EC map: tile → EC → (transcript_id, orientation)
172
+ poison_table.rs # Poison k-mer table with AHashMap<CanonicalKmer, u64>
173
+ formats.rs # ArtifactFormat enum
174
+ cli/
175
+ build.rs # Index build CLI
176
+ poison.rs # build-poison CLI (decoy scanning + save)
177
+ map_bulk.rs # Bulk mapping CLI — creates BulkProcessor, calls paraseq
178
+ map_scrna.rs # scRNA mapping CLI — creates ScrnaProcessor, calls paraseq
179
+ map_scatac.rs # scATAC mapping CLI — creates ScatacProcessor, calls paraseq
180
+ mapping/
181
+ processors.rs # BulkProcessor, ScrnaProcessor, ScatacProcessor (paraseq trait impls)
182
+ kmer_value.rs # CanonicalKmer newtype (u64-backed, upgrade path for k>31)
183
+ unitig_end_cache.rs # UnitigEndCache with DashMap<CanonicalKmer>, orientation-aware
184
+ hit_searcher.rs # HitSearcher, ReadKmerIter, PERMISSIVE/STRICT modes
185
+ projected_hits.rs # RefPos, ProjectedHits<'a>, decode_hit()
186
+ streaming_query.rs # PiscemStreamingQuery<'a, K> wrapper + unitig-end cache integration
187
+ hits.rs # MappingType, HitDirection, SimpleHit, SketchHitInfo trait
188
+ sketch_hit_simple.rs # SketchHitInfoSimple (no-constraint default)
189
+ chain_state.rs # SketchHitInfoChained (optional structural constraints)
190
+ filters.rs # PoisonState, scan_raw_hits, CanonicalKmerIter
191
+ cache.rs # MappingCache<S> generic mapping state
192
+ engine.rs # map_read<K,S>() kernel + map_read_atac<K,S>() bin-based kernel
193
+ merge_pairs.rs # merge_se_mappings() + merge_se_mappings_binned() bin-aware PE merge
194
+ map_fragment.rs # SE/PE helpers + ATAC bin-based variants
195
+ overlap.rs # Mate overlap detection (dovetail/regular + seed-based alignment)
196
+ binning.rs # BinPos cumulative per-ref binning matching C++ bin_pos
197
+ protocols/
198
+ mod.rs # Protocol trait + AlignableReads + TechSeqs
199
+ bulk.rs # BulkProtocol
200
+ scrna.rs # ChromiumProtocol (V2/V2_5p/V3/V3_5p/V4_3p) + barcode recovery
201
+ scatac.rs # ScatacProtocol for scATAC-seq
202
+ custom.rs # CustomProtocol + geometry parser (recursive descent)
203
+ io/
204
+ rad.rs # RadWriter + RAD headers/records (SC + bulk + ATAC)
205
+ fastx.rs # open_with_decompression(), Collection/CollectionType re-exports, MultiReader
206
+ threads.rs # OutputInfo (mutex RAD file), MappingStats (atomic counters), ThreadConfig
207
+ map_info.rs # map_info.json writer
208
+ verify/
209
+ mod.rs # Module declarations
210
+ index_compare.rs # Index semantic comparison (ref metadata)
211
+ rad_compare.rs # RAD file comparison (binary header parser)
212
+ parity.rs # Parity orchestration + JSON report
213
+ ```
214
+
215
+ ## Terminology Bridge
216
+
217
+ | sshash-rs term | piscem-cpp term | Meaning |
218
+ |---|---|---|
219
+ | `string_id` | `contig_id` | Unitig identifier |
220
+ | `kmer_id_in_string` | `kmer_id_in_contig` | K-mer position within unitig |
221
+ | `num_strings()` | num contigs | Number of unitigs |
222
+
223
+ ## Running Tests
224
+
225
+ ```bash
226
+ cargo test # All 179 unit tests should pass
227
+ cargo check # Should compile clean with no warnings
228
+ RUST_LOG=info cargo run # Run with logging
229
+ ```
230
+
231
+ ### Parity Tests (require test data + `--features parity-test`)
232
+
233
+ ```bash
234
+ cargo test --features parity-test --release --test rad_parity_bulk -- --ignored --nocapture
235
+ cargo test --features parity-test --release --test rad_parity_sc -- --ignored --nocapture
236
+ ```
237
+
238
+ ## C++ Reference Code
239
+
240
+ The C++ piscem codebase should be available in `piscem-cpp/` in the current directory for cross-reference. Key files:
241
+ - `include/reference_index.hpp` — C++ ReferenceIndex
242
+ - `include/basic_contig_table.hpp` — C++ contig table
243
+ - `include/hit_searcher.hpp` / `src/hit_searcher.cpp` — Hit collection (~1400 lines)
244
+ - `include/mapping/utils.hpp` — `map_read()` kernel
245
+ - `include/projected_hits.hpp` — Projected hits and `decode_hit()`
246
+ - `include/streaming_query.hpp` — piscem-level streaming query wrapper