@sjcrh/proteinpaint-rust 2.31.0 → 2.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -72,6 +72,10 @@ path="src/cluster.rs"
72
72
  name="gdcmaf"
73
73
  path="src/gdcmaf.rs"
74
74
 
75
+ [[bin]]
76
+ name="gene_variance"
77
+ path="src/gene_variance.rs"
78
+
75
79
  #[[bin]]
76
80
  #name="wilcoxon"
77
81
  #path="src/wilcoxon.rs"
package/index.js CHANGED
@@ -1,8 +1,9 @@
1
1
  const path = require('path'),
2
2
  spawn = require('child_process').spawn,
3
- Readable = require('stream').Readable
3
+ Readable = require('stream').Readable,
4
+ Transform = require('stream').Transform
4
5
 
5
- exports.run_rust = function(binfile, input_data) {
6
+ exports.run_rust = function (binfile, input_data) {
6
7
  return new Promise((resolve, reject) => {
7
8
  const binpath = path.join(__dirname, '/target/release/', binfile)
8
9
  const ps = spawn(binpath)
@@ -41,3 +42,31 @@ exports.run_rust = function(binfile, input_data) {
41
42
  })
42
43
  })
43
44
  }
45
+
46
+ exports.run_rust_stream = function (binfile, input_data) {
47
+ const binpath = path.join(__dirname, '/target/release/', binfile)
48
+ const ps = spawn(binpath)
49
+ try {
50
+ Readable.from(input_data).pipe(ps.stdin)
51
+ } catch (error) {
52
+ ps.kill()
53
+ let errmsg = error
54
+ if (stderr.length) errmsg += `killed run_rust('${binfile}'), stderr: ${stderr.join('').trim()}`
55
+ reject(errmsg)
56
+ }
57
+
58
+ const childStream = new Transform({
59
+ transform(chunk, encoding, callback) {
60
+ this.push(chunk)
61
+ callback()
62
+ }
63
+ })
64
+ ps.stdout.pipe(childStream)
65
+ childStream.on('error', err => {
66
+ reject(err)
67
+ })
68
+ childStream.on('close', code => {
69
+ childStream.end()
70
+ })
71
+ return childStream
72
+ }
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.31.0",
2
+ "version": "2.34.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
package/src/gdcmaf.rs CHANGED
@@ -4,8 +4,7 @@ use flate2::Compression;
4
4
  use serde_json::Value;
5
5
  use std::path::Path;
6
6
  use futures::StreamExt;
7
- use std::io;
8
- use std::io::{Read,Write};
7
+ use std::io::{self,Read,Write};
9
8
  use std::sync::mpsc;
10
9
 
11
10
 
@@ -45,8 +44,7 @@ fn gen_vec(d:String) -> (Vec<String>,Vec<Vec<u8>>) {
45
44
  }
46
45
  };
47
46
  maf_out_lst.push("\n".to_string());
48
- let maf_compress_data = gen_gzip_vec(maf_out_lst);
49
- maf_bit.push(maf_compress_data);
47
+ maf_bit.push(maf_out_lst.join("\t").as_bytes().to_vec());
50
48
  lst_chrom_pos.push(chrom+"\t"+&pos);
51
49
  }
52
50
  };
@@ -62,14 +60,6 @@ fn get_sorted_indices(lst: &Vec<String>) -> Vec<usize>{
62
60
  indices
63
61
  }
64
62
 
65
- // convert vector (maf row) to GZIP encoded format
66
- fn gen_gzip_vec(s:Vec<String>) -> Vec<u8> {
67
- let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
68
- let _ = encoder.write_all(s.join("\t").as_bytes());
69
- let compress_data = encoder.finish().unwrap();
70
- compress_data
71
- }
72
-
73
63
  // GDC MAF columns (96)
74
64
  const MAF_COL: [&str;96] = ["Hugo_Symbol", "Entrez_Gene_Id", "Center", "NCBI_Build", "Chromosome",
75
65
  "Start_Position", "End_Position", "Strand", "Variant_Classification",
@@ -112,18 +102,15 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
112
102
  url.into_iter().map(|url|{
113
103
  let txt = tx.clone();
114
104
  async move {
115
- match reqwest::get(&url).await{
116
- Ok(resp) => {
117
- let content = resp.bytes().await.unwrap();
118
- let mut decoder = GzDecoder::new(&content[..]);
119
- let mut decompressed_content = Vec::new();
120
- if let Ok(_) = decoder.read_to_end(&mut decompressed_content) {
121
- let text = String::from_utf8_lossy(&decompressed_content);
122
- let (lst_chrom_pos,maf_bit) = gen_vec(text.to_string());
123
- txt.send((lst_chrom_pos,maf_bit)).unwrap();
124
- }
105
+ if let Ok(resp) = reqwest::get(&url).await {
106
+ let content = resp.bytes().await.unwrap();
107
+ let mut decoder = GzDecoder::new(&content[..]);
108
+ let mut decompressed_content = Vec::new();
109
+ if let Ok(_) = decoder.read_to_end(&mut decompressed_content) {
110
+ let text = String::from_utf8_lossy(&decompressed_content);
111
+ let (lst_chrom_pos,maf_bit) = gen_vec(text.to_string());
112
+ txt.send((lst_chrom_pos,maf_bit)).unwrap();
125
113
  }
126
- Err(_) => println!("ERROR downloading {}", url),
127
114
  }
128
115
  }
129
116
  })
@@ -145,17 +132,11 @@ async fn main() -> Result<(),Box<dyn std::error::Error>> {
145
132
  // output
146
133
  // maf_out_bit: A vector of GZIPPED maf
147
134
  // compress_header: output header
148
- let mut maf_out_bit: Vec<u8> = Vec::new();
149
- let compress_header = gen_gzip_vec(MAF_COL.iter().map(|s| s.to_string()).collect());
150
- maf_out_bit.extend(compress_header);
151
- let compress_header_line_break = gen_gzip_vec(["\n".to_string()].to_vec());
152
- maf_out_bit.extend(compress_header_line_break);
135
+ let mut encoder = GzEncoder::new(io::stdout(), Compression::default());
136
+ let _ = encoder.write_all(&MAF_COL.join("\t").as_bytes().to_vec()).expect("Failed to write header");
137
+ let _ = encoder.write_all(b"\n").expect("Failed to write newline");
153
138
  for i in idx_sorted.iter() {
154
- maf_out_bit.extend(&maf_bit[*i]);
139
+ let _ = encoder.write_all(&maf_bit[*i]).expect("Failed to write file");
155
140
  };
156
-
157
- // standard output
158
- println!("{:?}",maf_out_bit);
159
- std::io::stdout().flush().expect("Failed to flush stdout");
160
141
  Ok(())
161
142
  }
@@ -0,0 +1,354 @@
1
+ /*
2
+ This script selects the most variable genes by calculating the variance or the interquartile range (IQR) of each gene.
3
+
4
+ Various JSON parameters:
5
+ samples: Enter the sample ID(s) separated by comma
6
+ input_file: Path to input file
7
+ filter_extreme_values: boolean (true/false). When true, genes are filtered following the logic of filterByExpr in edgeR, which removes genes that have very low counts.
8
+ num_genes: The top num_genes (for e.g 10) that need to be reported in the output.
9
+ param: var/iqr. This parameter decides whether genes are sorted by variance or by interquartile range. There is an article stating that it is better to use the interquartile range than the variance when selecting genes for clustering: https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
10
+
11
+ Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "param":"var"}' && time echo $json | target/release/gene_variance
12
+ */
13
+ #![allow(non_snake_case)]
14
+ use json;
15
+ use nalgebra::base::dimension::Dyn;
16
+ use nalgebra::base::Matrix;
17
+ use nalgebra::base::VecStorage;
18
+ use nalgebra::DMatrix;
19
+ use serde::{Deserialize, Serialize};
20
+ use serde_json;
21
+ use statrs::statistics::Data;
22
+ use statrs::statistics::Median;
23
+ use statrs::statistics::OrderStatistics;
24
+ use statrs::statistics::Statistics;
25
+ use std::cmp::Ordering;
26
+ use std::io;
27
+ use std::path::Path;
28
+ use std::str::FromStr;
29
+ use std::time::Instant;
30
+
31
+ fn input_data(
32
+ filename: &String,
33
+ sample_list: &Vec<&str>,
34
+ ) -> (
35
+ Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
36
+ Vec<String>,
37
+ Vec<String>,
38
+ ) {
39
+ // Build the CSV reader and iterate over each record.
40
+ let path = Path::new(filename);
41
+ let mut rdr = csv::Reader::from_path(path).unwrap();
42
+ let mut num_lines: usize = 0;
43
+ let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
44
+ let mut gene_names: Vec<String> = Vec::with_capacity(65000);
45
+ let mut gene_symbols: Vec<String> = Vec::with_capacity(65000);
46
+ let mut num_columns: usize = 0;
47
+
48
+ // Check headers for samples
49
+ let header_line = rdr.headers().unwrap();
50
+ let mut headers: Vec<&str> = Vec::with_capacity(1500);
51
+ for field in header_line.iter() {
52
+ headers = field.split('\t').collect::<Vec<&str>>();
53
+ }
54
+ //println!("headers:{:?}", headers);
55
+ let mut sample_indexes_original: Vec<usize> = Vec::with_capacity(sample_list.len());
56
+ let gene_name_index = headers.iter().position(|r| r == &"geneID");
57
+ let gene_symbol_index = headers.iter().position(|r| r == &"geneSymbol");
58
+ //let mut case_samples_not_found: Vec<&str> = Vec::with_capacity(sample_list.len());
59
+ //let mut control_samples_not_found: Vec<&str> = Vec::with_capacity(control_list.len());
60
+
61
+ for item in sample_list {
62
+ //println!("item:{}", item);
63
+ let index = headers.iter().position(|r| r == item);
64
+ match index {
65
+ Some(n) => sample_indexes_original.push(n),
66
+ None => {
67
+ //panic!("Case sample not found:{}", item);
68
+ //case_samples_not_found.push(item);
69
+ }
70
+ }
71
+ }
72
+
73
+ //println!("case_indexes_original:{:?}", case_indexes_original);
74
+
75
+ let mut samples_indexes: Vec<usize> = Vec::with_capacity(sample_list.len());
76
+ for result in rdr.records() {
77
+ // The iterator yields Result<StringRecord, Error>, so we check the
78
+ // error here.
79
+ let record = result.unwrap();
80
+ //println!("record:{:?}", record);
81
+ let mut index = 0;
82
+ for field in record[0].split('\t').collect::<Vec<&str>>() {
83
+ if index == gene_name_index.unwrap() {
84
+ gene_names.push(field.to_string());
85
+ } else if index == gene_symbol_index.unwrap() {
86
+ gene_symbols.push(field.to_string());
87
+ } else if sample_indexes_original.contains(&index) {
88
+ let num = FromStr::from_str(field);
89
+ match num {
90
+ Ok(n) => {
91
+ //println!("n:{}", n);
92
+ input_vector.push(n);
93
+ if num_lines == 0 {
94
+ samples_indexes.push(num_columns);
95
+ num_columns += 1;
96
+ }
97
+ }
98
+ Err(_n) => {
99
+ panic!(
100
+ "Number {} in line {} and column {} is not a decimal number",
101
+ field,
102
+ num_lines + 1,
103
+ index + 1
104
+ );
105
+ }
106
+ }
107
+ }
108
+ index += 1;
109
+ }
110
+ num_lines += 1;
111
+ }
112
+ //println!("case_indexes:{:?}", case_indexes);
113
+ //println!("control_indexes:{:?}", control_indexes);
114
+
115
+ let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
116
+ //println!("dm:{:?}", dm);
117
+ (dm, gene_names, gene_symbols)
118
+ }
119
+
120
/// One gene together with the statistic it is ranked by.
///
/// Instances are serialized with serde to build the JSON printed by `main`.
#[allow(dead_code)]
#[derive(Debug, Serialize, Deserialize)]
struct GeneInfo {
    /// Value read from the `geneID` column of the input file.
    gene_name: String,
    /// Value read from the `geneSymbol` column of the input file.
    gene_symbol: String,
    /// Variance or interquartile range of the gene's values, depending on the
    /// method requested by the caller.
    param: f64,
}
127
+
128
+ fn calculate_variance(
129
+ input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
130
+ gene_names: Vec<String>,
131
+ gene_symbols: Vec<String>,
132
+ mut min_sample_size: f64,
133
+ filter_extreme_values: bool,
134
+ param: String,
135
+ ) -> Vec<GeneInfo> {
136
+ const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
137
+ const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
138
+ const LARGE_N: f64 = 10.0; // Value of constant from R implementation
139
+ const MIN_PROP: f64 = 0.7; // Value of constant from R implementation
140
+
141
+ if min_sample_size == 0.0 {
142
+ panic!("Only one condition present in groups");
143
+ }
144
+
145
+ if min_sample_size > LARGE_N {
146
+ min_sample_size = LARGE_N + (min_sample_size - LARGE_N) * MIN_PROP;
147
+ }
148
+
149
+ let mut lib_sizes = Vec::<f64>::new();
150
+ let lib_sizes_vector = input_matrix.row_sum();
151
+ //println!("lib_sizes_vector:{:?}", lib_sizes_vector);
152
+ for i in 0..lib_sizes_vector.ncols() {
153
+ lib_sizes.push(lib_sizes_vector[(0, i)].into());
154
+ }
155
+ //println!("lib_sizes:{:?}", lib_sizes);
156
+ //println!("min_sample_size:{}", min_sample_size);
157
+ let median_lib_size = Data::new(lib_sizes.clone()).median();
158
+ let cpm_cutoff = (MIN_COUNT / median_lib_size) * 1000000.0;
159
+ //println!("cpm_cutoff:{}", cpm_cutoff);
160
+ let cpm_matrix = cpm(&input_matrix);
161
+ const TOL: f64 = 1e-14; // Value of constant from R implementation
162
+
163
+ let mut gene_infos = Vec::<GeneInfo>::new();
164
+ let row_sums = input_matrix.column_sum();
165
+ for row in 0..input_matrix.nrows() {
166
+ let mut trues = 0.0;
167
+ for col in 0..cpm_matrix.ncols() {
168
+ if cpm_matrix[(row, col)] >= cpm_cutoff {
169
+ trues += 1.0;
170
+ }
171
+ }
172
+ let mut keep_cpm_bool = false;
173
+ if trues >= min_sample_size - TOL {
174
+ keep_cpm_bool = true;
175
+ //keep_cpm.push(keep_cpm_bool);
176
+ //positive_cpm += 1;
177
+ }
178
+
179
+ let mut keep_total_bool = false;
180
+ if row_sums[(row, 0)] as f64 >= MIN_TOTAL_COUNT - TOL {
181
+ keep_total_bool = true;
182
+ //keep_total.push(keep_total_bool);
183
+ //positive_total += 1;
184
+ }
185
+
186
+ let mut gene_counts: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
187
+ for col in 0..input_matrix.ncols() {
188
+ gene_counts.push(input_matrix[(row, col)]);
189
+ }
190
+ if param == "var" {
191
+ // Calculating variance
192
+ if gene_counts.clone().variance().is_nan() == true {
193
+ } else if filter_extreme_values == true
194
+ && keep_cpm_bool == true
195
+ && keep_total_bool == true
196
+ {
197
+ gene_infos.push(GeneInfo {
198
+ param: gene_counts.variance(),
199
+ gene_name: gene_names[row].clone(),
200
+ gene_symbol: gene_symbols[row].clone(),
201
+ });
202
+ } else if filter_extreme_values == false {
203
+ gene_infos.push(GeneInfo {
204
+ param: gene_counts.variance(),
205
+ gene_name: gene_names[row].clone(),
206
+ gene_symbol: gene_symbols[row].clone(),
207
+ });
208
+ }
209
+ } else {
210
+ // Calculating interquartile region
211
+ let mut gene_counts_data = Data::new(gene_counts);
212
+ if gene_counts_data.clone().interquartile_range().is_nan() == true {
213
+ } else if filter_extreme_values == true
214
+ && keep_cpm_bool == true
215
+ && keep_total_bool == true
216
+ {
217
+ gene_infos.push(GeneInfo {
218
+ param: gene_counts_data.interquartile_range(),
219
+ gene_name: gene_names[row].clone(),
220
+ gene_symbol: gene_symbols[row].clone(),
221
+ });
222
+ } else if filter_extreme_values == false {
223
+ gene_infos.push(GeneInfo {
224
+ param: gene_counts_data.interquartile_range(),
225
+ gene_name: gene_names[row].clone(),
226
+ gene_symbol: gene_symbols[row].clone(),
227
+ });
228
+ }
229
+ }
230
+ }
231
+ gene_infos
232
+ .as_mut_slice()
233
+ .sort_by(|a, b| (a.param).partial_cmp(&b.param).unwrap_or(Ordering::Equal));
234
+ gene_infos
235
+ }
236
+
237
+ fn cpm(
238
+ input_matrix: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
239
+ ) -> Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>> {
240
+ //let mut blank = Vec::<f64>::new();
241
+ let mut blank = Vec::with_capacity(input_matrix.nrows() * input_matrix.ncols());
242
+ for _i in 0..input_matrix.nrows() * input_matrix.ncols() {
243
+ blank.push(0.0);
244
+ }
245
+ let mut output_matrix = DMatrix::from_vec(input_matrix.nrows(), input_matrix.ncols(), blank);
246
+ let column_sums = input_matrix.row_sum();
247
+ for col in 0..input_matrix.ncols() {
248
+ let norm_factor = column_sums[(0, col)];
249
+ for row in 0..input_matrix.nrows() {
250
+ output_matrix[(row, col)] =
251
+ (input_matrix[(row, col)] as f64 * 1000000.0) / norm_factor as f64;
252
+ }
253
+ }
254
+ //println!("output_matrix:{:?}", output_matrix);
255
+ output_matrix
256
+ }
257
+
258
+ fn main() {
259
+ let mut input = String::new();
260
+ match io::stdin().read_line(&mut input) {
261
+ // Accepting the piped input from nodejs (or command line from testing)
262
+ Ok(_bytes_read) => {
263
+ //println!("{} bytes read", bytes_read);
264
+ //println!("{}", input);
265
+ let input_json = json::parse(&input);
266
+ match input_json {
267
+ Ok(json_string) => {
268
+ let now = Instant::now();
269
+ let samples_string_result = &json_string["samples"].to_owned();
270
+ let samples_string;
271
+ match samples_string_result.as_str() {
272
+ Some(x) => {
273
+ samples_string = x.to_string();
274
+ }
275
+ None => {
276
+ panic!("Samples not provided");
277
+ }
278
+ }
279
+
280
+ let file_name_result = &json_string["input_file"];
281
+ let file_name;
282
+ match file_name_result.as_str() {
283
+ Some(x) => {
284
+ file_name = x.to_string();
285
+ }
286
+ None => {
287
+ panic!("File name is missing");
288
+ }
289
+ }
290
+
291
+ let param = &json_string["param"] // Value provide must be either "var" or "iqr"
292
+ .to_owned()
293
+ .as_str()
294
+ .unwrap()
295
+ .to_string();
296
+ if param != "var" && param != "iqr" {
297
+ // Check if any unknown method has been provided
298
+ panic!("Unknown method:{}", param);
299
+ }
300
+ let filter_extreme_values_result = &json_string["filter_extreme_values"];
301
+
302
+ let filter_extreme_values;
303
+ match filter_extreme_values_result.as_bool() {
304
+ Some(x) => {
305
+ filter_extreme_values = x;
306
+ }
307
+ None => {
308
+ filter_extreme_values = true; // If filter_extreme_values field is missing, set it to true by default
309
+ }
310
+ }
311
+
312
+ let num_genes_result = &json_string["num_genes"];
313
+ let num_genes;
314
+ match num_genes_result.as_usize() {
315
+ Some(x) => {
316
+ num_genes = x;
317
+ }
318
+ None => {
319
+ panic!("Number of genes to be given is missing")
320
+ }
321
+ }
322
+
323
+ let samples_list: Vec<&str> = samples_string.split(",").collect();
324
+ let (input_matrix, gene_names, gene_symbols) =
325
+ input_data(&file_name, &samples_list);
326
+ let gene_infos = calculate_variance(
327
+ input_matrix,
328
+ gene_names,
329
+ gene_symbols,
330
+ samples_list.len() as f64,
331
+ filter_extreme_values,
332
+ param.to_string(),
333
+ );
334
+ //println!("gene_infos:{:?}", gene_infos);
335
+
336
+ // Printing the top "num_genes" genes to JSON
337
+ let mut output_string = "[".to_string();
338
+ for j in 0..num_genes {
339
+ let i = gene_infos.len() - j - 1;
340
+ output_string += &serde_json::to_string(&gene_infos[i]).unwrap();
341
+ if i > gene_infos.len() - num_genes {
342
+ output_string += &",".to_string();
343
+ }
344
+ }
345
+ output_string += &"]".to_string();
346
+ println!("{}", output_string);
347
+ println!("Time for calculating variances:{:?}", now.elapsed());
348
+ }
349
+ Err(error) => println!("Incorrect json: {}", error),
350
+ }
351
+ }
352
+ Err(error) => println!("Piping error: {}", error),
353
+ }
354
+ }