@sjcrh/proteinpaint-rust 2.61.1 → 2.73.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.61.1",
2
+ "version": "2.73.0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
@@ -38,5 +38,5 @@
38
38
  "devDependencies": {
39
39
  "tape": "^5.2.2"
40
40
  },
41
- "pp_release_tag": "v2.61.1"
41
+ "pp_release_tag": "v2.73.0"
42
42
  }
@@ -6,9 +6,9 @@ Various JSON parameters:
6
6
  input_file: Path to input file
7
7
  filter_extreme_values: boolean (true/false). When true, genes are filtered according to the filterByExpr logic in edgeR, which removes genes that have very low counts.
8
8
  num_genes: The number of top-ranked genes (e.g. 10) to be reported in the output.
9
- param: var/iqr . This parameter decides whether to sort genes using variance or interquartile region. There is an article which states that its better to use interquartile region than variance for selecting genes for clustering https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
9
+ rank_type: var/iqr. This parameter decides whether to sort genes using variance or interquartile range. There is an article which states that it's better to use interquartile range than variance for selecting genes for clustering https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
10
10
 
11
- Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "param":"var"}' && time echo $json | target/release/gene_variance
11
+ Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","min_count":30,"min_total_count":20,"input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' && time echo $json | target/release/gene_variance
12
12
  */
13
13
  #![allow(non_snake_case)]
14
14
  use bgzip::BGZFReader;
@@ -103,7 +103,7 @@ fn input_data(
103
103
  #[derive(Debug, Serialize, Deserialize)]
104
104
  struct GeneInfo {
105
105
  gene_symbol: String,
106
- param: f64,
106
+ rank_type: f64,
107
107
  }
108
108
 
109
109
  fn calculate_variance(
@@ -111,10 +111,22 @@ fn calculate_variance(
111
111
  gene_symbols: Vec<String>,
112
112
  mut min_sample_size: f64,
113
113
  filter_extreme_values: bool,
114
- param: String,
114
+ rank_type: String,
115
+ min_count_option: Option<f64>,
116
+ min_total_count_option: Option<f64>,
115
117
  ) -> Vec<GeneInfo> {
116
- const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
117
- const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
118
+ let mut min_count: f64 = 10.0;
119
+ match min_count_option {
120
+ Some(x) => min_count = x,
121
+ None => {}
122
+ }
123
+ let mut min_total_count: f64 = 15.0;
124
+ match min_total_count_option {
125
+ Some(x) => min_total_count = x,
126
+ None => {}
127
+ }
128
+ //const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
129
+ //const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
118
130
  const LARGE_N: f64 = 10.0; // Value of constant from R implementation
119
131
  const MIN_PROP: f64 = 0.7; // Value of constant from R implementation
120
132
 
@@ -135,7 +147,7 @@ fn calculate_variance(
135
147
  //println!("lib_sizes:{:?}", lib_sizes);
136
148
  //println!("min_sample_size:{}", min_sample_size);
137
149
  let median_lib_size = Data::new(lib_sizes.clone()).median();
138
- let cpm_cutoff = (MIN_COUNT / median_lib_size) * 1000000.0;
150
+ let cpm_cutoff = (min_count / median_lib_size) * 1000000.0;
139
151
  //println!("cpm_cutoff:{}", cpm_cutoff);
140
152
  let cpm_matrix = cpm(&input_matrix);
141
153
  const TOL: f64 = 1e-14; // Value of constant from R implementation
@@ -157,7 +169,7 @@ fn calculate_variance(
157
169
  }
158
170
 
159
171
  let mut keep_total_bool = false;
160
- if row_sums[(row, 0)] as f64 >= MIN_TOTAL_COUNT - TOL {
172
+ if row_sums[(row, 0)] as f64 >= min_total_count - TOL {
161
173
  keep_total_bool = true;
162
174
  //keep_total.push(keep_total_bool);
163
175
  //positive_total += 1;
@@ -167,7 +179,7 @@ fn calculate_variance(
167
179
  for col in 0..input_matrix.ncols() {
168
180
  gene_counts.push(input_matrix[(row, col)]);
169
181
  }
170
- if param == "var" {
182
+ if rank_type == "var" {
171
183
  // Calculating variance
172
184
  if gene_counts.clone().variance().is_nan() == true {
173
185
  } else if filter_extreme_values == true
@@ -175,12 +187,12 @@ fn calculate_variance(
175
187
  && keep_total_bool == true
176
188
  {
177
189
  gene_infos.push(GeneInfo {
178
- param: gene_counts.variance(),
190
+ rank_type: gene_counts.variance(),
179
191
  gene_symbol: gene_symbols[row].clone(),
180
192
  });
181
193
  } else if filter_extreme_values == false {
182
194
  gene_infos.push(GeneInfo {
183
- param: gene_counts.variance(),
195
+ rank_type: gene_counts.variance(),
184
196
  gene_symbol: gene_symbols[row].clone(),
185
197
  });
186
198
  }
@@ -193,20 +205,22 @@ fn calculate_variance(
193
205
  && keep_total_bool == true
194
206
  {
195
207
  gene_infos.push(GeneInfo {
196
- param: gene_counts_data.interquartile_range(),
208
+ rank_type: gene_counts_data.interquartile_range(),
197
209
  gene_symbol: gene_symbols[row].clone(),
198
210
  });
199
211
  } else if filter_extreme_values == false {
200
212
  gene_infos.push(GeneInfo {
201
- param: gene_counts_data.interquartile_range(),
213
+ rank_type: gene_counts_data.interquartile_range(),
202
214
  gene_symbol: gene_symbols[row].clone(),
203
215
  });
204
216
  }
205
217
  }
206
218
  }
207
- gene_infos
208
- .as_mut_slice()
209
- .sort_by(|a, b| (a.param).partial_cmp(&b.param).unwrap_or(Ordering::Equal));
219
+ gene_infos.as_mut_slice().sort_by(|a, b| {
220
+ (a.rank_type)
221
+ .partial_cmp(&b.rank_type)
222
+ .unwrap_or(Ordering::Equal)
223
+ });
210
224
  gene_infos
211
225
  }
212
226
 
@@ -264,14 +278,14 @@ fn main() {
264
278
  }
265
279
  }
266
280
 
267
- let param = &json_string["param"] // Value provide must be either "var" or "iqr"
281
+ let rank_type = &json_string["rank_type"] // Value provide must be either "var" or "iqr"
268
282
  .to_owned()
269
283
  .as_str()
270
284
  .unwrap()
271
285
  .to_string();
272
- if param != "var" && param != "iqr" {
286
+ if rank_type != "var" && rank_type != "iqr" {
273
287
  // Check if any unknown method has been provided
274
- panic!("Unknown method:{}", param);
288
+ panic!("Unknown method:{}", rank_type);
275
289
  }
276
290
  let filter_extreme_values_result = &json_string["filter_extreme_values"];
277
291
 
@@ -296,6 +310,20 @@ fn main() {
296
310
  }
297
311
  }
298
312
 
313
+ let min_count_result = &json_string["min_count"];
314
+ let mut min_count: Option<f64> = None;
315
+ match min_count_result.as_f64() {
316
+ Some(x) => min_count = Some(x),
317
+ None => {}
318
+ }
319
+
320
+ let min_total_count_result = &json_string["min_total_count"];
321
+ let mut min_total_count: Option<f64> = None;
322
+ match min_total_count_result.as_f64() {
323
+ Some(x) => min_total_count = Some(x),
324
+ None => {}
325
+ }
326
+
299
327
  let samples_list: Vec<&str> = samples_string.split(",").collect();
300
328
  let (input_matrix, gene_symbols) = input_data(&file_name, &samples_list);
301
329
  let gene_infos = calculate_variance(
@@ -303,7 +331,9 @@ fn main() {
303
331
  gene_symbols,
304
332
  samples_list.len() as f64,
305
333
  filter_extreme_values,
306
- param.to_string(),
334
+ rank_type.to_string(),
335
+ min_count,
336
+ min_total_count,
307
337
  );
308
338
  //println!("gene_infos:{:?}", gene_infos);
309
339