@sjcrh/proteinpaint-rust 2.186.0 → 2.189.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -84,10 +84,6 @@ path="src/cluster.rs"
84
84
  name="gdcmaf"
85
85
  path="src/gdcmaf.rs"
86
86
 
87
- [[bin]]
88
- name="topGeneByExpressionVariance"
89
- path="src/topGeneByExpressionVariance.rs"
90
-
91
87
  [[bin]]
92
88
  name="wilcoxon"
93
89
  path="src/wilcoxon.rs"
@@ -142,4 +138,4 @@ path="src/dmrcate.rs"
142
138
 
143
139
  [[bin]]
144
140
  name="volcano"
145
- path="src/volcano.rs"
141
+ path="src/volcano.rs"
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.186.0",
2
+ "version": "2.189.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "type": "module",
5
5
  "description": "Rust-based utilities for proteinpaint",
package/src/volcano.rs CHANGED
@@ -8,11 +8,11 @@
8
8
 
9
9
  use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64};
10
10
  use plotters::prelude::*;
11
- use plotters::style::ShapeStyle;
12
11
  use serde::{Deserialize, Serialize};
13
12
  use serde_json::Value;
14
13
  use std::error::Error;
15
14
  use std::io::{self, Read};
15
+ use tiny_skia::{Paint, PathBuilder, Pixmap, Stroke, Transform};
16
16
 
17
17
  #[derive(Deserialize)]
18
18
  struct Input {
@@ -36,6 +36,14 @@ struct Input {
36
36
  /// only the overlay list is truncated to the most-significant N.
37
37
  #[serde(default)]
38
38
  max_interactive_dots: Option<usize>,
39
+ /// Hi-DPI scale factor (e.g. 2.0 on retina). Defaults to 1.0 when absent
40
+ /// so existing callers don't change behavior. The PNG is rasterized at
41
+ /// `(pixel_* + pad) * dpr` device pixels and is rendered at the CSS-space
42
+ /// dimensions reported in `plot_extent.pixel_*` — the browser uses the
43
+ /// extra resolution for sharpness on hi-DPI displays. Mirror of
44
+ /// manhattan_plot.rs's `device_pixel_ratio` handling.
45
+ #[serde(default)]
46
+ device_pixel_ratio: Option<f64>,
39
47
  }
40
48
 
41
49
  #[derive(Serialize)]
@@ -189,14 +197,23 @@ fn main() -> Result<(), Box<dyn Error>> {
189
197
  let x_max = x_max_unpadded + x_pad_data;
190
198
  let y_min = y_min_unpadded - y_pad_data;
191
199
  let y_max = y_max_unpadded + y_pad_data;
192
- let mut buffer = vec![0u8; (w as usize) * (h as usize) * 3];
193
- // Per-point pixel coords as plotters actually rasterizes them. Returned to
194
- // the client so the SVG overlay rings sit exactly on top of the PNG dots
195
- // instead of being recomputed from data coords (which loses sub-pixel
196
- // precision under plotters' integer truncation).
197
- let mut all_pixel_coords: Vec<(f64, f64)> = Vec::with_capacity(points.len());
200
+
201
+ // Hi-DPR scaling. The buffer/chart are sized in device pixels (CSS * dpr)
202
+ // and the drawn radius/stroke are scaled the same way, so the PNG is
203
+ // sharper on retina. backend_coord returns device-pixel coords; we divide
204
+ // by dpr below to keep `pixel_x/pixel_y` in CSS-space (which is what the
205
+ // SVG overlay coordinate system uses). Mirror of manhattan_plot.rs.
206
+ let dpr = input.device_pixel_ratio.unwrap_or(1.0).max(1.0);
207
+ let w_hd = ((w as f64) * dpr) as u32;
208
+ let h_hd = ((h as f64) * dpr) as u32;
209
+
210
+ let mut buffer = vec![0u8; (w_hd as usize) * (h_hd as usize) * 3];
211
+ // Per-point pixel coords as plotters' chart maps them. Captured in device
212
+ // pixels so tiny-skia can draw the AA rings exactly at those positions;
213
+ // we keep a CSS-space copy below for the SVG overlay.
214
+ let mut pixel_coords_hd: Vec<(f64, f64)> = Vec::with_capacity(points.len());
198
215
  {
199
- let backend = BitMapBackend::with_buffer(&mut buffer, (w, h));
216
+ let backend = BitMapBackend::with_buffer(&mut buffer, (w_hd, h_hd));
200
217
  let root = backend.into_drawing_area();
201
218
  root.fill(&WHITE)?;
202
219
 
@@ -213,44 +230,77 @@ fn main() -> Result<(), Box<dyn Error>> {
213
230
 
214
231
  // Threshold guide lines are drawn by the SVG overlay on the client, not
215
232
  // here — double-drawing them would add stray lines offset by axis padding.
233
+ // The dots themselves are drawn below with tiny-skia for true AA; here
234
+ // plotters just gives us a white-background buffer and the data-to-pixel
235
+ // mapping. Mirror of manhattan_plot.rs.
216
236
 
217
- // Resolve colors once. Up/down fall back to `color_sig` when absent.
218
- let color_sig = rgb(&input.color_significant, (214, 39, 40));
219
- let color_non = rgb(&input.color_nonsignificant, (0, 0, 0));
220
- let resolve = |o: &Option<String>| o.as_deref().map(|s| rgb(s, (214, 39, 40))).unwrap_or(color_sig);
221
- let color_up = resolve(&input.color_significant_up);
222
- let color_down = resolve(&input.color_significant_down);
223
-
224
- // Stroke-only rings at full opacity so each ring is the exact configured
225
- // group color — matching the hue the SVG overlay uses.
226
- let ring = |c: RGBColor| ShapeStyle {
227
- color: c.into(),
228
- filled: false,
229
- stroke_width: 1,
230
- };
231
-
232
- // Draw non-significant first so significant rings overlay on top.
233
- chart.draw_series(
234
- points
235
- .iter()
236
- .filter(|p| !p.significant)
237
- .map(|p| Circle::new((p.fc, p.y), radius_px, ring(color_non))),
238
- )?;
239
- chart.draw_series(points.iter().filter(|p| p.significant).map(|p| {
240
- let c = if p.fc > 0.0 { color_up } else { color_down };
241
- Circle::new((p.fc, p.y), radius_px, ring(c))
242
- }))?;
243
-
244
- // Mirror manhattan_plot.rs: capture the exact pixel coords plotters
245
- // used for each point so the client overlay can land on them precisely.
246
237
  for p in points.iter() {
247
238
  let (px, py) = chart.backend_coord(&(p.fc, p.y));
248
- all_pixel_coords.push((px as f64, py as f64));
239
+ pixel_coords_hd.push((px as f64, py as f64));
249
240
  }
250
241
 
251
242
  root.present()?;
252
243
  }
253
244
 
245
+ // Convert plotters' RGB buffer to a tiny-skia RGBA pixmap, then stroke the
246
+ // dots on top with anti-aliasing — gives crisp rings even when the user
247
+ // zooms in past native DPR. Plotters' BitMapBackend has no AA on shapes,
248
+ // which is why ring edges looked chunky before this rewrite.
249
+ let mut pixmap = Pixmap::new(w_hd, h_hd).ok_or("failed to create pixmap")?;
250
+ {
251
+ let data = pixmap.data_mut();
252
+ for (src, dst) in buffer.chunks_exact(3).zip(data.chunks_exact_mut(4)) {
253
+ dst[..3].copy_from_slice(src);
254
+ dst[3] = 255;
255
+ }
256
+ }
257
+
258
+ // Resolve colors once. Up/down fall back to `color_sig` when absent.
259
+ let color_sig = rgb(&input.color_significant, (214, 39, 40));
260
+ let color_non = rgb(&input.color_nonsignificant, (0, 0, 0));
261
+ let resolve = |o: &Option<String>| o.as_deref().map(|s| rgb(s, (214, 39, 40))).unwrap_or(color_sig);
262
+ let color_up = resolve(&input.color_significant_up);
263
+ let color_down = resolve(&input.color_significant_down);
264
+
265
+ let radius_hd_f = radius_px as f32 * dpr as f32;
266
+ // 1 CSS-pixel-wide stroke at hi-DPR. The stroke straddles the path, so the
267
+ // visible ring thickness is `stroke_width` device px ≈ 1 CSS px.
268
+ let mut stroke = Stroke::default();
269
+ stroke.width = dpr as f32;
270
+ let mut paint = Paint::default();
271
+ paint.anti_alias = true;
272
+
273
+ let stroke_ring = |pixmap: &mut Pixmap, paint: &mut Paint, color: RGBColor, px: f32, py: f32| {
274
+ paint.set_color_rgba8(color.0, color.1, color.2, 255);
275
+ let mut pb = PathBuilder::new();
276
+ pb.push_circle(px, py, radius_hd_f);
277
+ if let Some(path) = pb.finish() {
278
+ pixmap.stroke_path(&path, paint, &stroke, Transform::identity(), None);
279
+ }
280
+ };
281
+
282
+ // Draw non-significant first so significant rings overlay on top.
283
+ for (i, p) in points.iter().enumerate() {
284
+ if p.significant {
285
+ continue;
286
+ }
287
+ let (px, py) = pixel_coords_hd[i];
288
+ stroke_ring(&mut pixmap, &mut paint, color_non, px as f32, py as f32);
289
+ }
290
+ for (i, p) in points.iter().enumerate() {
291
+ if !p.significant {
292
+ continue;
293
+ }
294
+ let (px, py) = pixel_coords_hd[i];
295
+ let c = if p.fc > 0.0 { color_up } else { color_down };
296
+ stroke_ring(&mut pixmap, &mut paint, c, px as f32, py as f32);
297
+ }
298
+
299
+ // CSS-space coords for the SVG overlay — divide the device-pixel positions
300
+ // by dpr. The overlay does not know about hi-DPR; the PNG sizing handles
301
+ // sharpness for us.
302
+ let all_pixel_coords: Vec<(f64, f64)> = pixel_coords_hd.iter().map(|(x, y)| (x / dpr, y / dpr)).collect();
303
+
254
304
  // Build the interactive `dots` list: threshold-passers sorted asc by the
255
305
  // chosen p-value column, optionally capped at `max_interactive_dots`.
256
306
  let mut sig_points: Vec<&Point> = points.iter().filter(|p| p.significant).collect();
@@ -273,7 +323,7 @@ fn main() -> Result<(), Box<dyn Error>> {
273
323
  .collect();
274
324
 
275
325
  let output = Output {
276
- png: BASE64.encode(&encode_rgb_to_png(&buffer, w, h)?),
326
+ png: BASE64.encode(&pixmap.encode_png()?),
277
327
  plot_extent: PlotExtent {
278
328
  x_min,
279
329
  x_max,
@@ -302,13 +352,3 @@ fn main() -> Result<(), Box<dyn Error>> {
302
352
  println!("{}", serde_json::to_string(&output)?);
303
353
  Ok(())
304
354
  }
305
-
306
- /// Convert a plotters RGB buffer (3 bytes/px) to a PNG via tiny-skia (4 bytes/px).
307
- fn encode_rgb_to_png(rgb: &[u8], w: u32, h: u32) -> Result<Vec<u8>, Box<dyn Error>> {
308
- let mut pixmap = tiny_skia::Pixmap::new(w, h).ok_or("failed to create pixmap")?;
309
- for (src, dst) in rgb.chunks_exact(3).zip(pixmap.data_mut().chunks_exact_mut(4)) {
310
- dst[..3].copy_from_slice(src);
311
- dst[3] = 255;
312
- }
313
- Ok(pixmap.encode_png()?)
314
- }
@@ -1,731 +0,0 @@
1
- /*
2
- This script selects the top most variant genes by calculating the variance/interquartile region for each gene.
3
- Added support for HDF5 input files alongside the existing text file support.
4
-
5
- Various JSON parameters:
6
- samples: Enter the sample ID(s) separated by comma
7
- input_file: Path to input file (either text or HDF5 format)
8
- filter_extreme_values: boolean (true/false). When true, this filter according to logic filterbyExpr in edgeR. This basically removes genes that have very low gene counts.
9
- num_genes: The top num_genes (for e.g 10) that need to be reported in the output.
10
- rank_type: var/iqr . This parameter decides whether to sort genes using variance or interquartile region. There is an article which states that its better to use interquartile region than variance for selecting genes for clustering https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
11
- newformat?: bool. Used to support new format HDF5
12
-
13
- Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' && time echo $json | target/release/gene_variance
14
-
15
- Usage for new format HDF5
16
- echo '{"samples":"sample1,sample2,sample3","newformat":true,"min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' | ./target/release/topGeneByExpressionVariance
17
- */
18
- #![allow(non_snake_case)]
19
- use bgzip::BGZFReader;
20
- use json;
21
- use nalgebra::DMatrix;
22
- use nalgebra::base::Matrix;
23
- use nalgebra::base::VecStorage;
24
- use nalgebra::base::dimension::Dyn;
25
- use serde::{Deserialize, Serialize};
26
- use serde_json;
27
- use statrs::statistics::Data;
28
- use statrs::statistics::Median;
29
- use statrs::statistics::OrderStatistics;
30
- use statrs::statistics::Statistics;
31
- use std::cmp::Ordering;
32
- use std::fs;
33
- use std::io;
34
- use std::io::Read;
35
- use std::str::FromStr;
36
- // use std::time::Instant;
37
- use hdf5::types::VarLenUnicode;
38
- use hdf5::{File, Result};
39
- use ndarray::Dim;
40
-
41
- /// Read expression data from a dense HDF5 file for a list of samples
42
- ///
43
- /// This function extracts expression data from a dense format HDF5 file for
44
- /// the specified samples and returns it in the format expected by the
45
- /// gene variance calculation code.
46
- ///
47
- /// # Arguments
48
- ///
49
- /// * `filename` - Path to the HDF5 file
50
- /// * `sample_list` - List of sample IDs to extract data for
51
- ///
52
- /// # Returns
53
- ///
54
- /// A Result containing either:
55
- /// - A tuple with expression matrix and gene symbols list on success, or
56
- /// - An error with details formatted as JSON
57
- fn input_data_hdf5(
58
- filename: &String,
59
- sample_list: &Vec<&str>,
60
- ) -> Result<(Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>)> {
61
- // Open the HDF5 file
62
- let file = match File::open(filename) {
63
- Ok(f) => f,
64
- Err(err) => {
65
- return Err(hdf5::Error::Internal(format!("Failed to open HDF5 file: {}", err)));
66
- }
67
- };
68
-
69
- // Read gene symbols dataset
70
- let genes_dataset = match file.dataset("item") {
71
- Ok(ds) => ds,
72
- Err(err) => {
73
- return Err(hdf5::Error::Internal(format!(
74
- "Failed to open gene_names dataset: {}",
75
- err
76
- )));
77
- }
78
- };
79
-
80
- // Read genes as VarLenAscii
81
- let genes_varlen = match genes_dataset.read_1d::<VarLenUnicode>() {
82
- Ok(g) => g,
83
- Err(err) => {
84
- return Err(hdf5::Error::Internal(format!("Failed to read gene symbols: {}", err)));
85
- }
86
- };
87
-
88
- // Convert to Vec<String> for easier handling
89
- let gene_names: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
90
- let num_genes = gene_names.len();
91
-
92
- // Read sample names
93
- let samples_dataset = match file.dataset("samples") {
94
- Ok(ds) => ds,
95
- Err(err) => {
96
- println!(
97
- "{}",
98
- serde_json::json!({
99
- "status": "error",
100
- "message": format!("Failed to open samples dataset: {}", err),
101
- "file_path": filename
102
- })
103
- );
104
- return Err(hdf5::Error::Internal(format!(
105
- "Failed to open samples dataset: {}",
106
- err
107
- )));
108
- }
109
- };
110
-
111
- // Read samples as VarLenAscii
112
- let samples_varlen = match samples_dataset.read_1d::<VarLenUnicode>() {
113
- Ok(s) => s,
114
- Err(err) => {
115
- // eprintln!("Failed to read sample names: {}", err);
116
- println!(
117
- "{}",
118
- serde_json::json!({
119
- "status": "error",
120
- "message": format!("Failed to read sample names: {}", err),
121
- "file_path": filename
122
- })
123
- );
124
- return Err(hdf5::Error::Internal(format!("Failed to read sample names: {}", err)));
125
- }
126
- };
127
-
128
- // Convert to Vec<String> for easier handling
129
- let all_samples: Vec<String> = samples_varlen.iter().map(|s| s.to_string()).collect();
130
-
131
- // Find indices of requested samples
132
- let mut column_indices: Vec<usize> = Vec::with_capacity(sample_list.len());
133
- for sample in sample_list {
134
- if let Some(index) = all_samples.iter().position(|s| s == sample) {
135
- column_indices.push(index);
136
- } else {
137
- return Err(hdf5::Error::Internal(format!(
138
- "Sample '{}' not found in the dataset",
139
- sample
140
- )));
141
- }
142
- }
143
-
144
- // Read the counts dataset
145
- let counts_dataset = match file.dataset("matrix") {
146
- Ok(ds) => ds,
147
- Err(err) => {
148
- return Err(hdf5::Error::Internal(format!("Failed to open counts dataset: {}", err)));
149
- }
150
- };
151
-
152
- // Get dataset dimensions for validation
153
- let dataset_shape = counts_dataset.shape();
154
- if dataset_shape.len() != 2 {
155
- return Err(hdf5::Error::Internal("Expected a 2D dataset for counts".to_string()));
156
- };
157
-
158
- // Check dimensions match expected values
159
- if dataset_shape[0] != num_genes {
160
- return Err(hdf5::Error::Internal(format!(
161
- "Counts dataset first dimension ({}) doesn't match number of genes ({})",
162
- dataset_shape[0], num_genes
163
- )));
164
- };
165
-
166
- if dataset_shape[1] != all_samples.len() {
167
- return Err(hdf5::Error::Internal(format!(
168
- "Counts dataset second dimension ({}) doesn't match number of samples ({})",
169
- dataset_shape[1],
170
- all_samples.len()
171
- )));
172
- };
173
-
174
- // Read the counts dataset
175
- let all_counts = match counts_dataset.read::<f64, Dim<[usize; 2]>>() {
176
- Ok(data) => data,
177
- Err(err) => {
178
- return Err(hdf5::Error::Internal(format!(
179
- "Failed to read expression data: {}",
180
- err
181
- )));
182
- }
183
- };
184
-
185
- let mut input_vector: Vec<f64> = Vec::with_capacity(num_genes * sample_list.len());
186
-
187
- for gene_idx in 0..num_genes {
188
- for &col_idx in &column_indices {
189
- input_vector.push(all_counts[[gene_idx, col_idx]]);
190
- }
191
- }
192
-
193
- // Create matrix from the extracted data
194
- let dm = DMatrix::from_row_slice(num_genes, sample_list.len(), &input_vector);
195
-
196
- Ok((dm, gene_names))
197
- }
198
-
199
- // The original input_data function for text files is kept as is
200
- fn input_data(
201
- filename: &String,
202
- sample_list: &Vec<&str>,
203
- ) -> (Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>) {
204
- // Build the CSV reader and iterate over each record.
205
- let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
206
- let mut num_lines: usize = 0;
207
- let mut gene_names: Vec<String> = Vec::with_capacity(500);
208
-
209
- let mut buffer = String::new();
210
- reader.read_to_string(&mut buffer).unwrap();
211
-
212
- let lines = buffer.split("\n");
213
- let mut first = true;
214
- let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
215
- let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
216
- for line in lines {
217
- if first == true {
218
- first = false;
219
- let columns: Vec<&str> = line.split("\t").collect();
220
- // Finding column numbers corresponding to each sample given in the input list
221
- for item in sample_list {
222
- if let Some(index) = columns.iter().position(|num| num == item) {
223
- column_numbers.push(index)
224
- } else {
225
- panic!("Sample {} not found:", item)
226
- }
227
- }
228
- } else {
229
- let line2: Vec<&str> = line.split("\t").collect();
230
- if line2.len() == 1 {
231
- break; // end of file
232
- } else {
233
- num_lines += 1;
234
- //println!("line2:{:?}", line2);
235
- gene_names.push(line2[3].to_string());
236
- for i in &column_numbers {
237
- let field = line2[*i];
238
- let num = FromStr::from_str(field);
239
- match num {
240
- Ok(n) => {
241
- //println!("n:{}", n);
242
- input_vector.push(n);
243
- }
244
- Err(_n) => {
245
- panic!(
246
- "Number {} in line {} and column {} is not a decimal number",
247
- field,
248
- num_lines + 1,
249
- i + 1
250
- );
251
- }
252
- }
253
- }
254
- }
255
- }
256
- }
257
-
258
- //println!("case_indexes:{:?}", case_indexes);
259
- //println!("control_indexes:{:?}", control_indexes);
260
-
261
- let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
262
- //println!("dm:{:?}", dm);
263
- (dm, gene_names)
264
- }
265
-
266
- #[allow(dead_code)]
267
- #[derive(Debug, Serialize, Deserialize)]
268
- struct GeneInfo {
269
- gene_symbol: String,
270
- rank_type: f64,
271
- }
272
-
273
- fn calculate_variance(
274
- input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
275
- gene_names: Vec<String>,
276
- mut min_sample_size: f64,
277
- filter_extreme_values: bool,
278
- rank_type: String,
279
- min_count_option: Option<f64>,
280
- min_total_count_option: Option<f64>,
281
- ) -> Vec<GeneInfo> {
282
- let mut min_count: f64 = 10.0;
283
- match min_count_option {
284
- Some(x) => min_count = x,
285
- None => {}
286
- }
287
- let mut min_total_count: f64 = 15.0;
288
- match min_total_count_option {
289
- Some(x) => min_total_count = x,
290
- None => {}
291
- }
292
- //const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
293
- //const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
294
- const LARGE_N: f64 = 10.0; // Value of constant from R implementation
295
- const MIN_PROP: f64 = 0.7; // Value of constant from R implementation
296
-
297
- if min_sample_size == 0.0 {
298
- panic!("Only one condition present in groups");
299
- }
300
-
301
- if min_sample_size > LARGE_N {
302
- min_sample_size = LARGE_N + (min_sample_size - LARGE_N) * MIN_PROP;
303
- }
304
-
305
- // Per-sample library sizes as nansum — a single NaN gene doesn't
306
- // poison the whole sample's total.
307
- let mut lib_sizes = Vec::<f64>::with_capacity(input_matrix.ncols());
308
- for col in 0..input_matrix.ncols() {
309
- let mut s = 0.0_f64;
310
- for row in 0..input_matrix.nrows() {
311
- let v = input_matrix[(row, col)];
312
- if v.is_finite() {
313
- s += v;
314
- }
315
- }
316
- lib_sizes.push(s);
317
- }
318
-
319
- let median_lib_size = Data::new(lib_sizes.clone()).median();
320
- let cpm_cutoff = (min_count / median_lib_size) * 1000000.0;
321
- //println!("cpm_cutoff:{}", cpm_cutoff);
322
- let cpm_matrix = cpm(&input_matrix, &lib_sizes);
323
- const TOL: f64 = 1e-14; // Value of constant from R implementation
324
-
325
- let mut gene_infos = Vec::<GeneInfo>::new();
326
- for row in 0..input_matrix.nrows() {
327
- let mut trues = 0.0;
328
- // CPM filter (NaN-safe)
329
- for col in 0..cpm_matrix.ncols() {
330
- let v = cpm_matrix[(row, col)];
331
- if v.is_finite() && v >= cpm_cutoff {
332
- trues += 1.0;
333
- }
334
- }
335
- let mut keep_cpm_bool = false;
336
- if trues >= min_sample_size - TOL {
337
- keep_cpm_bool = true;
338
- //keep_cpm.push(keep_cpm_bool);
339
- //positive_cpm += 1;
340
- }
341
-
342
- let mut row_sum_finite = 0.0_f64;
343
- for col in 0..input_matrix.ncols() {
344
- let v = input_matrix[(row, col)];
345
- if v.is_finite() {
346
- row_sum_finite += v;
347
- }
348
- }
349
- let mut keep_total_bool = false;
350
- if row_sum_finite >= min_total_count - TOL {
351
- keep_total_bool = true;
352
- }
353
-
354
- let mut gene_counts: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
355
- for col in 0..input_matrix.ncols() {
356
- let v = input_matrix[(row, col)];
357
- if v.is_finite() {
358
- gene_counts.push(v);
359
- }
360
- }
361
-
362
- // Skip genes with too few observations to produce a stable statistic
363
- let min_required = if rank_type == "var" { 2 } else { 4 };
364
- if gene_counts.len() < min_required {
365
- continue;
366
- }
367
-
368
- if rank_type == "var" {
369
- // Calculating variance
370
- if gene_counts.clone().variance().is_nan() == true {
371
- } else if filter_extreme_values == true && keep_cpm_bool == true && keep_total_bool == true {
372
- gene_infos.push(GeneInfo {
373
- rank_type: gene_counts.variance(),
374
- gene_symbol: gene_names[row].clone(),
375
- });
376
- } else if filter_extreme_values == false {
377
- gene_infos.push(GeneInfo {
378
- rank_type: gene_counts.variance(),
379
- gene_symbol: gene_names[row].clone(),
380
- });
381
- }
382
- } else {
383
- // Calculating interquartile region
384
- let mut gene_counts_data = Data::new(gene_counts);
385
- if gene_counts_data.clone().interquartile_range().is_nan() == true {
386
- } else if filter_extreme_values == true && keep_cpm_bool == true && keep_total_bool == true {
387
- gene_infos.push(GeneInfo {
388
- rank_type: gene_counts_data.interquartile_range(),
389
- gene_symbol: gene_names[row].clone(),
390
- });
391
- } else if filter_extreme_values == false {
392
- gene_infos.push(GeneInfo {
393
- rank_type: gene_counts_data.interquartile_range(),
394
- gene_symbol: gene_names[row].clone(),
395
- });
396
- }
397
- }
398
- }
399
- gene_infos
400
- .as_mut_slice()
401
- .sort_by(|a, b| (a.rank_type).partial_cmp(&b.rank_type).unwrap_or(Ordering::Equal));
402
- gene_infos
403
- }
404
-
405
- fn cpm(
406
- input_matrix: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
407
- col_sums: &[f64],
408
- ) -> Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>> {
409
- let mut output_matrix = DMatrix::from_element(input_matrix.nrows(), input_matrix.ncols(), 0.0);
410
-
411
- for col in 0..input_matrix.ncols() {
412
- let norm = col_sums[col];
413
- for row in 0..input_matrix.nrows() {
414
- let v = input_matrix[(row, col)];
415
- output_matrix[(row, col)] = if v.is_finite() && norm > 0.0 {
416
- v * 1_000_000.0 / norm
417
- } else {
418
- f64::NAN
419
- };
420
- }
421
- }
422
- output_matrix
423
- }
424
-
425
- fn main() {
426
- // println!("Starting gene variance calculation...");
427
- let mut input = String::new();
428
- match io::stdin().read_line(&mut input) {
429
- // Accepting the piped input from nodejs (or command line from testing)
430
- Ok(_bytes_read) => {
431
- // eprintln!("Read {} bytes from stdin", bytes_read);
432
- // println!("{} bytes read", bytes_read);
433
- // println!("{}", input);
434
- let input_json = json::parse(&input);
435
- match input_json {
436
- Ok(json_string) => {
437
- // println!("Successfully parsed JSON input");
438
- // let now = Instant::now();
439
- let samples_string_result = &json_string["samples"].to_owned();
440
- let samples_string;
441
- match samples_string_result.as_str() {
442
- Some(x) => {
443
- samples_string = x.to_string();
444
- // println!("Samples: {}", samples_string);
445
- }
446
- None => {
447
- // eprintln!("ERROR: Samples not provided in JSON");
448
- println!(
449
- "{}",
450
- serde_json::json!({
451
- "status": "error",
452
- "message": "Samples not provided"
453
- })
454
- );
455
- return;
456
- }
457
- }
458
-
459
- let file_name_result = &json_string["input_file"];
460
- let file_name;
461
-
462
- match file_name_result.as_str() {
463
- Some(x) => {
464
- file_name = x.to_string();
465
- // eprintln!("Input file: {}", file_name);
466
- // Return file name as JSON for debugging
467
- // println!(
468
- // "{}",
469
- // serde_json::json!({"status": "success", "file_name": file_name})
470
- // );
471
- }
472
- None => {
473
- // eprintln!("ERROR: File name missing in JSON");
474
- // println!(
475
- // "{}",
476
- // serde_json::json!({
477
- // "status": "error",
478
- // "message": "File name is missing"
479
- // })
480
- // );
481
- return;
482
- }
483
- }
484
-
485
- // Determine file type based on extension
486
- let file_type: String;
487
- if file_name.to_lowercase().ends_with(".h5") {
488
- file_type = "hdf5".to_string();
489
- // eprintln!("Detected HDF5 file format based on .h5 extension");
490
- } else {
491
- file_type = "text".to_string();
492
- // eprintln!("Using default text file format (no .h5 extension found)");
493
- }
494
-
495
- // Determine if the H5 file is new format
496
- //let new_format: bool = match &json_string {
497
- // json::JsonValue::Object(ref obj) => {
498
- // obj.get("newformat").and_then(|v| v.as_bool()).map_or(false, |b| b)
499
- // }
500
- // _ => false,
501
- //};
502
-
503
- let rank_type = &json_string["rank_type"] // Value provide must be either "var" or "iqr"
504
- .to_owned()
505
- .as_str()
506
- .unwrap_or("var")
507
- .to_string();
508
- // eprintln!("Rank type: {}", rank_type);
509
- if rank_type != "var" && rank_type != "iqr" {
510
- // Check if any unknown method has been provided
511
- // eprintln!("ERROR: Unknown rank method: {}", rank_type);
512
- // println!(
513
- // "{}",
514
- // serde_json::json!({
515
- // "status": "error",
516
- // "message": format!("Unknown rank method: {}. Must be 'var' or 'iqr'", rank_type)
517
- // })
518
- // );
519
- return;
520
- }
521
- let filter_extreme_values_result = &json_string["filter_extreme_values"];
522
-
523
- let filter_extreme_values;
524
- match filter_extreme_values_result.as_bool() {
525
- Some(x) => {
526
- filter_extreme_values = x;
527
- // eprintln!("Filter extreme values: {}", filter_extreme_values);
528
- }
529
- None => {
530
- filter_extreme_values = true; // If filter_extreme_values field is missing, set it to true by default
531
- // eprintln!(
532
- // "Filter extreme values not specified, defaulting to: {}",
533
- // filter_extreme_values
534
- // );
535
- }
536
- }
537
-
538
- let num_genes_result = &json_string["num_genes"];
539
- let num_genes;
540
- match num_genes_result.as_usize() {
541
- Some(x) => {
542
- num_genes = x;
543
- // eprintln!("Number of genes requested: {}", num_genes);
544
- }
545
- None => {
546
- // eprintln!("ERROR: Number of genes to be given is missing");
547
- println!(
548
- "{}",
549
- serde_json::json!({
550
- "status": "error",
551
- "message": "Number of genes to be given is missing"
552
- })
553
- );
554
- return;
555
- }
556
- }
557
-
558
- let min_count_result = &json_string["min_count"];
559
- let mut min_count: Option<f64> = None;
560
- match min_count_result.as_f64() {
561
- Some(x) => {
562
- min_count = Some(x);
563
- // eprintln!("Min count: {}", x);
564
- }
565
- None => {
566
- // eprintln!("Min count not specified, will use default");
567
- }
568
- }
569
-
570
- let min_total_count_result = &json_string["min_total_count"];
571
- let mut min_total_count: Option<f64> = None;
572
- match min_total_count_result.as_f64() {
573
- Some(x) => {
574
- min_total_count = Some(x);
575
- // eprintln!("Min total count: {}", x);
576
- }
577
- None => {
578
- // eprintln!("Min total count not specified, will use default");
579
- }
580
- }
581
-
582
- let samples_list: Vec<&str> = samples_string.split(",").collect();
583
- // eprintln!("Number of samples in list: {}", samples_list.len());
584
-
585
- // Choose the appropriate input function based on file type
586
- // eprintln!("Reading data from {} file: {}", file_type, file_name);
587
- let (input_matrix, gene_names) = if file_type == "hdf5" {
588
- // eprintln!("Using HDF5 reader function...");
589
- match input_data_hdf5(&file_name, &samples_list) {
590
- Ok(result) => result,
591
- Err(err) => {
592
- eprintln!("ERROR in HDF5 reader: {:?}", err);
593
- return;
594
- }
595
- }
596
- } else {
597
- // For original text-based implementation, we wrap it in a try-catch block
598
- // to handle panics in a more structured way
599
- // eprintln!("Using text file reader function...");
600
- match std::panic::catch_unwind(|| input_data(&file_name, &samples_list)) {
601
- Ok(result) => {
602
- // eprintln!("Successfully read text file data");
603
- result
604
- }
605
- Err(err) => {
606
- eprintln!("ERROR in text file reader: {:?}", err);
607
- println!(
608
- "{}",
609
- serde_json::json!({
610
- "status": "error",
611
- "message": "Failed to read text file data",
612
- "file_path": file_name
613
- })
614
- );
615
- return;
616
- }
617
- }
618
- };
619
-
620
- // eprintln!(
621
- // "Matrix dimensions: {}x{}",
622
- // input_matrix.nrows(),
623
- // input_matrix.ncols()
624
- // );
625
- // eprintln!("Number of gene symbols: {}", gene_names.len());
626
- if !gene_names.is_empty() {
627
- // eprintln!(
628
- // "First few gene symbols: {:?}",
629
- // &gene_names.iter().take(5).collect::<Vec<_>>()
630
- // );
631
- }
632
-
633
- // Wrap the variance calculation in a try-catch to capture any panics
634
- // eprintln!(
635
- // "Calculating variance with {} samples, filter={}, rank_type={}",
636
- // samples_list.len(),
637
- // filter_extreme_values,
638
- // rank_type
639
- // );
640
- let gene_infos = match std::panic::catch_unwind(|| {
641
- calculate_variance(
642
- input_matrix,
643
- gene_names,
644
- samples_list.len() as f64,
645
- filter_extreme_values,
646
- rank_type.to_string(),
647
- min_count,
648
- min_total_count,
649
- )
650
- }) {
651
- Ok(result) => {
652
- // eprintln!(
653
- // "Successfully calculated variance for {} genes",
654
- // result.len()
655
- // );
656
- result
657
- }
658
- Err(err) => {
659
- eprintln!("ERROR in variance calculation: {:?}", err);
660
- println!(
661
- "{}",
662
- serde_json::json!({
663
- "status": "error",
664
- "message": "Error calculating gene variance",
665
- "file_path": file_name
666
- })
667
- );
668
- return;
669
- }
670
- };
671
-
672
- // Check if we have enough genes for the requested output
673
- if gene_infos.len() < num_genes {
674
- // eprintln!(
675
- // "WARNING: Only {} genes found, but {} were requested",
676
- // gene_infos.len(),
677
- // num_genes
678
- // );
679
- }
680
-
681
- let actual_num_genes = std::cmp::min(num_genes, gene_infos.len());
682
- // eprintln!("Returning top {} genes", actual_num_genes);
683
-
684
- // Printing the top "num_genes" genes to JSON
685
- let mut output_string = "[".to_string();
686
- for j in 0..actual_num_genes {
687
- let i = gene_infos.len() - j - 1;
688
- output_string += &serde_json::to_string(&gene_infos[i]).unwrap();
689
- if i > gene_infos.len() - actual_num_genes {
690
- output_string += &",".to_string();
691
- }
692
- }
693
- output_string += &"]".to_string();
694
-
695
- // Debug the first few characters of the output
696
- if output_string.len() > 100 {
697
- // eprintln!("Output JSON starts with: {}", &output_string[0..100]);
698
- } else {
699
- // eprintln!("Output JSON: {}", output_string);
700
- }
701
-
702
- println!("output_json:{}", output_string);
703
- // let elapsed = now.elapsed();
704
- // eprintln!("Completed in: {:?}", elapsed);
705
- // println!("Time for calculating variances:{:?}", elapsed);
706
- }
707
- Err(error) => {
708
- eprintln!("ERROR: JSON parsing error: {}", error);
709
- println!(
710
- "{}",
711
- serde_json::json!({
712
- "status": "error",
713
- "message": format!("Incorrect json: {}", error)
714
- })
715
- );
716
- }
717
- }
718
- }
719
- Err(error) => {
720
- eprintln!("ERROR: Failed to read from stdin: {}", error);
721
- println!(
722
- "{}",
723
- serde_json::json!({
724
- "status": "error",
725
- "message": format!("Piping error: {}", error)
726
- })
727
- );
728
- }
729
- }
730
- // println!("Program execution complete");
731
- }