@sjcrh/proteinpaint-rust 2.61.1 → 2.73.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.
|
|
2
|
+
"version": "2.73.0",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.
|
|
41
|
+
"pp_release_tag": "v2.73.0"
|
|
42
42
|
}
|
|
@@ -6,9 +6,9 @@ Various JSON parameters:
|
|
|
6
6
|
input_file: Path to input file
|
|
7
7
|
filter_extreme_values: boolean (true/false). When true, genes are filtered according to the filterByExpr logic in edgeR. This basically removes genes that have very low gene counts.
|
|
8
8
|
num_genes: The number of top-ranked genes (e.g. 10) that need to be reported in the output.
|
|
9
|
-
|
|
9
|
+
rank_type: var/iqr. This parameter decides whether to sort genes using variance or interquartile range. There is an article which states that it's better to use interquartile range than variance for selecting genes for clustering https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
|
|
10
10
|
|
|
11
|
-
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "
|
|
11
|
+
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","min_count":30,"min_total_count":20,"input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' && time echo $json | target/release/gene_variance
|
|
12
12
|
*/
|
|
13
13
|
#![allow(non_snake_case)]
|
|
14
14
|
use bgzip::BGZFReader;
|
|
@@ -103,7 +103,7 @@ fn input_data(
|
|
|
103
103
|
#[derive(Debug, Serialize, Deserialize)]
|
|
104
104
|
struct GeneInfo {
|
|
105
105
|
gene_symbol: String,
|
|
106
|
-
|
|
106
|
+
rank_type: f64,
|
|
107
107
|
}
|
|
108
108
|
|
|
109
109
|
fn calculate_variance(
|
|
@@ -111,10 +111,22 @@ fn calculate_variance(
|
|
|
111
111
|
gene_symbols: Vec<String>,
|
|
112
112
|
mut min_sample_size: f64,
|
|
113
113
|
filter_extreme_values: bool,
|
|
114
|
-
|
|
114
|
+
rank_type: String,
|
|
115
|
+
min_count_option: Option<f64>,
|
|
116
|
+
min_total_count_option: Option<f64>,
|
|
115
117
|
) -> Vec<GeneInfo> {
|
|
116
|
-
|
|
117
|
-
|
|
118
|
+
let mut min_count: f64 = 10.0;
|
|
119
|
+
match min_count_option {
|
|
120
|
+
Some(x) => min_count = x,
|
|
121
|
+
None => {}
|
|
122
|
+
}
|
|
123
|
+
let mut min_total_count: f64 = 15.0;
|
|
124
|
+
match min_total_count_option {
|
|
125
|
+
Some(x) => min_total_count = x,
|
|
126
|
+
None => {}
|
|
127
|
+
}
|
|
128
|
+
//const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
|
|
129
|
+
//const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
|
|
118
130
|
const LARGE_N: f64 = 10.0; // Value of constant from R implementation
|
|
119
131
|
const MIN_PROP: f64 = 0.7; // Value of constant from R implementation
|
|
120
132
|
|
|
@@ -135,7 +147,7 @@ fn calculate_variance(
|
|
|
135
147
|
//println!("lib_sizes:{:?}", lib_sizes);
|
|
136
148
|
//println!("min_sample_size:{}", min_sample_size);
|
|
137
149
|
let median_lib_size = Data::new(lib_sizes.clone()).median();
|
|
138
|
-
let cpm_cutoff = (
|
|
150
|
+
let cpm_cutoff = (min_count / median_lib_size) * 1000000.0;
|
|
139
151
|
//println!("cpm_cutoff:{}", cpm_cutoff);
|
|
140
152
|
let cpm_matrix = cpm(&input_matrix);
|
|
141
153
|
const TOL: f64 = 1e-14; // Value of constant from R implementation
|
|
@@ -157,7 +169,7 @@ fn calculate_variance(
|
|
|
157
169
|
}
|
|
158
170
|
|
|
159
171
|
let mut keep_total_bool = false;
|
|
160
|
-
if row_sums[(row, 0)] as f64 >=
|
|
172
|
+
if row_sums[(row, 0)] as f64 >= min_total_count - TOL {
|
|
161
173
|
keep_total_bool = true;
|
|
162
174
|
//keep_total.push(keep_total_bool);
|
|
163
175
|
//positive_total += 1;
|
|
@@ -167,7 +179,7 @@ fn calculate_variance(
|
|
|
167
179
|
for col in 0..input_matrix.ncols() {
|
|
168
180
|
gene_counts.push(input_matrix[(row, col)]);
|
|
169
181
|
}
|
|
170
|
-
if
|
|
182
|
+
if rank_type == "var" {
|
|
171
183
|
// Calculating variance
|
|
172
184
|
if gene_counts.clone().variance().is_nan() == true {
|
|
173
185
|
} else if filter_extreme_values == true
|
|
@@ -175,12 +187,12 @@ fn calculate_variance(
|
|
|
175
187
|
&& keep_total_bool == true
|
|
176
188
|
{
|
|
177
189
|
gene_infos.push(GeneInfo {
|
|
178
|
-
|
|
190
|
+
rank_type: gene_counts.variance(),
|
|
179
191
|
gene_symbol: gene_symbols[row].clone(),
|
|
180
192
|
});
|
|
181
193
|
} else if filter_extreme_values == false {
|
|
182
194
|
gene_infos.push(GeneInfo {
|
|
183
|
-
|
|
195
|
+
rank_type: gene_counts.variance(),
|
|
184
196
|
gene_symbol: gene_symbols[row].clone(),
|
|
185
197
|
});
|
|
186
198
|
}
|
|
@@ -193,20 +205,22 @@ fn calculate_variance(
|
|
|
193
205
|
&& keep_total_bool == true
|
|
194
206
|
{
|
|
195
207
|
gene_infos.push(GeneInfo {
|
|
196
|
-
|
|
208
|
+
rank_type: gene_counts_data.interquartile_range(),
|
|
197
209
|
gene_symbol: gene_symbols[row].clone(),
|
|
198
210
|
});
|
|
199
211
|
} else if filter_extreme_values == false {
|
|
200
212
|
gene_infos.push(GeneInfo {
|
|
201
|
-
|
|
213
|
+
rank_type: gene_counts_data.interquartile_range(),
|
|
202
214
|
gene_symbol: gene_symbols[row].clone(),
|
|
203
215
|
});
|
|
204
216
|
}
|
|
205
217
|
}
|
|
206
218
|
}
|
|
207
|
-
gene_infos
|
|
208
|
-
.
|
|
209
|
-
|
|
219
|
+
gene_infos.as_mut_slice().sort_by(|a, b| {
|
|
220
|
+
(a.rank_type)
|
|
221
|
+
.partial_cmp(&b.rank_type)
|
|
222
|
+
.unwrap_or(Ordering::Equal)
|
|
223
|
+
});
|
|
210
224
|
gene_infos
|
|
211
225
|
}
|
|
212
226
|
|
|
@@ -264,14 +278,14 @@ fn main() {
|
|
|
264
278
|
}
|
|
265
279
|
}
|
|
266
280
|
|
|
267
|
-
let
|
|
281
|
+
let rank_type = &json_string["rank_type"] // Value provide must be either "var" or "iqr"
|
|
268
282
|
.to_owned()
|
|
269
283
|
.as_str()
|
|
270
284
|
.unwrap()
|
|
271
285
|
.to_string();
|
|
272
|
-
if
|
|
286
|
+
if rank_type != "var" && rank_type != "iqr" {
|
|
273
287
|
// Check if any unknown method has been provided
|
|
274
|
-
panic!("Unknown method:{}",
|
|
288
|
+
panic!("Unknown method:{}", rank_type);
|
|
275
289
|
}
|
|
276
290
|
let filter_extreme_values_result = &json_string["filter_extreme_values"];
|
|
277
291
|
|
|
@@ -296,6 +310,20 @@ fn main() {
|
|
|
296
310
|
}
|
|
297
311
|
}
|
|
298
312
|
|
|
313
|
+
let min_count_result = &json_string["min_count"];
|
|
314
|
+
let mut min_count: Option<f64> = None;
|
|
315
|
+
match min_count_result.as_f64() {
|
|
316
|
+
Some(x) => min_count = Some(x),
|
|
317
|
+
None => {}
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
let min_total_count_result = &json_string["min_total_count"];
|
|
321
|
+
let mut min_total_count: Option<f64> = None;
|
|
322
|
+
match min_total_count_result.as_f64() {
|
|
323
|
+
Some(x) => min_total_count = Some(x),
|
|
324
|
+
None => {}
|
|
325
|
+
}
|
|
326
|
+
|
|
299
327
|
let samples_list: Vec<&str> = samples_string.split(",").collect();
|
|
300
328
|
let (input_matrix, gene_symbols) = input_data(&file_name, &samples_list);
|
|
301
329
|
let gene_infos = calculate_variance(
|
|
@@ -303,7 +331,9 @@ fn main() {
|
|
|
303
331
|
gene_symbols,
|
|
304
332
|
samples_list.len() as f64,
|
|
305
333
|
filter_extreme_values,
|
|
306
|
-
|
|
334
|
+
rank_type.to_string(),
|
|
335
|
+
min_count,
|
|
336
|
+
min_total_count,
|
|
307
337
|
);
|
|
308
338
|
//println!("gene_infos:{:?}", gene_infos);
|
|
309
339
|
|