@sjcrh/proteinpaint-rust 2.186.0 → 2.188.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +1 -5
- package/package.json +1 -1
- package/src/topGeneByExpressionVariance.rs +0 -731
package/Cargo.toml
CHANGED
|
@@ -84,10 +84,6 @@ path="src/cluster.rs"
|
|
|
84
84
|
name="gdcmaf"
|
|
85
85
|
path="src/gdcmaf.rs"
|
|
86
86
|
|
|
87
|
-
[[bin]]
|
|
88
|
-
name="topGeneByExpressionVariance"
|
|
89
|
-
path="src/topGeneByExpressionVariance.rs"
|
|
90
|
-
|
|
91
87
|
[[bin]]
|
|
92
88
|
name="wilcoxon"
|
|
93
89
|
path="src/wilcoxon.rs"
|
|
@@ -142,4 +138,4 @@ path="src/dmrcate.rs"
|
|
|
142
138
|
|
|
143
139
|
[[bin]]
|
|
144
140
|
name="volcano"
|
|
145
|
-
path="src/volcano.rs"
|
|
141
|
+
path="src/volcano.rs"
|
package/package.json
CHANGED
|
@@ -1,731 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
This script selects the top most variant genes by calculating the variance/interquartile region for each gene.
|
|
3
|
-
Added support for HDF5 input files alongside the existing text file support.
|
|
4
|
-
|
|
5
|
-
Various JSON parameters:
|
|
6
|
-
samples: Enter the sample ID(s) separated by comma
|
|
7
|
-
input_file: Path to input file (either text or HDF5 format)
|
|
8
|
-
filter_extreme_values: boolean (true/false). When true, this filter according to logic filterbyExpr in edgeR. This basically removes genes that have very low gene counts.
|
|
9
|
-
num_genes: The top num_genes (for e.g 10) that need to be reported in the output.
|
|
10
|
-
rank_type: var/iqr . This parameter decides whether to sort genes using variance or interquartile region. There is an article which states that its better to use interquartile region than variance for selecting genes for clustering https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
|
|
11
|
-
newformat?: bool. Used to support new format HDF5
|
|
12
|
-
|
|
13
|
-
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' && time echo $json | target/release/gene_variance
|
|
14
|
-
|
|
15
|
-
Usage for new format HDF5
|
|
16
|
-
echo '{"samples":"sample1,sample2,sample3","newformat":true,"min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' | ./target/release/topGeneByExpressionVariance
|
|
17
|
-
*/
|
|
18
|
-
#![allow(non_snake_case)]
|
|
19
|
-
use bgzip::BGZFReader;
|
|
20
|
-
use json;
|
|
21
|
-
use nalgebra::DMatrix;
|
|
22
|
-
use nalgebra::base::Matrix;
|
|
23
|
-
use nalgebra::base::VecStorage;
|
|
24
|
-
use nalgebra::base::dimension::Dyn;
|
|
25
|
-
use serde::{Deserialize, Serialize};
|
|
26
|
-
use serde_json;
|
|
27
|
-
use statrs::statistics::Data;
|
|
28
|
-
use statrs::statistics::Median;
|
|
29
|
-
use statrs::statistics::OrderStatistics;
|
|
30
|
-
use statrs::statistics::Statistics;
|
|
31
|
-
use std::cmp::Ordering;
|
|
32
|
-
use std::fs;
|
|
33
|
-
use std::io;
|
|
34
|
-
use std::io::Read;
|
|
35
|
-
use std::str::FromStr;
|
|
36
|
-
// use std::time::Instant;
|
|
37
|
-
use hdf5::types::VarLenUnicode;
|
|
38
|
-
use hdf5::{File, Result};
|
|
39
|
-
use ndarray::Dim;
|
|
40
|
-
|
|
41
|
-
/// Read expression data from a dense HDF5 file for a list of samples
|
|
42
|
-
///
|
|
43
|
-
/// This function extracts expression data from a dense format HDF5 file for
|
|
44
|
-
/// the specified samples and returns it in the format expected by the
|
|
45
|
-
/// gene variance calculation code.
|
|
46
|
-
///
|
|
47
|
-
/// # Arguments
|
|
48
|
-
///
|
|
49
|
-
/// * `filename` - Path to the HDF5 file
|
|
50
|
-
/// * `sample_list` - List of sample IDs to extract data for
|
|
51
|
-
///
|
|
52
|
-
/// # Returns
|
|
53
|
-
///
|
|
54
|
-
/// A Result containing either:
|
|
55
|
-
/// - A tuple with expression matrix and gene symbols list on success, or
|
|
56
|
-
/// - An error with details formatted as JSON
|
|
57
|
-
fn input_data_hdf5(
|
|
58
|
-
filename: &String,
|
|
59
|
-
sample_list: &Vec<&str>,
|
|
60
|
-
) -> Result<(Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>)> {
|
|
61
|
-
// Open the HDF5 file
|
|
62
|
-
let file = match File::open(filename) {
|
|
63
|
-
Ok(f) => f,
|
|
64
|
-
Err(err) => {
|
|
65
|
-
return Err(hdf5::Error::Internal(format!("Failed to open HDF5 file: {}", err)));
|
|
66
|
-
}
|
|
67
|
-
};
|
|
68
|
-
|
|
69
|
-
// Read gene symbols dataset
|
|
70
|
-
let genes_dataset = match file.dataset("item") {
|
|
71
|
-
Ok(ds) => ds,
|
|
72
|
-
Err(err) => {
|
|
73
|
-
return Err(hdf5::Error::Internal(format!(
|
|
74
|
-
"Failed to open gene_names dataset: {}",
|
|
75
|
-
err
|
|
76
|
-
)));
|
|
77
|
-
}
|
|
78
|
-
};
|
|
79
|
-
|
|
80
|
-
// Read genes as VarLenAscii
|
|
81
|
-
let genes_varlen = match genes_dataset.read_1d::<VarLenUnicode>() {
|
|
82
|
-
Ok(g) => g,
|
|
83
|
-
Err(err) => {
|
|
84
|
-
return Err(hdf5::Error::Internal(format!("Failed to read gene symbols: {}", err)));
|
|
85
|
-
}
|
|
86
|
-
};
|
|
87
|
-
|
|
88
|
-
// Convert to Vec<String> for easier handling
|
|
89
|
-
let gene_names: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
|
|
90
|
-
let num_genes = gene_names.len();
|
|
91
|
-
|
|
92
|
-
// Read sample names
|
|
93
|
-
let samples_dataset = match file.dataset("samples") {
|
|
94
|
-
Ok(ds) => ds,
|
|
95
|
-
Err(err) => {
|
|
96
|
-
println!(
|
|
97
|
-
"{}",
|
|
98
|
-
serde_json::json!({
|
|
99
|
-
"status": "error",
|
|
100
|
-
"message": format!("Failed to open samples dataset: {}", err),
|
|
101
|
-
"file_path": filename
|
|
102
|
-
})
|
|
103
|
-
);
|
|
104
|
-
return Err(hdf5::Error::Internal(format!(
|
|
105
|
-
"Failed to open samples dataset: {}",
|
|
106
|
-
err
|
|
107
|
-
)));
|
|
108
|
-
}
|
|
109
|
-
};
|
|
110
|
-
|
|
111
|
-
// Read samples as VarLenAscii
|
|
112
|
-
let samples_varlen = match samples_dataset.read_1d::<VarLenUnicode>() {
|
|
113
|
-
Ok(s) => s,
|
|
114
|
-
Err(err) => {
|
|
115
|
-
// eprintln!("Failed to read sample names: {}", err);
|
|
116
|
-
println!(
|
|
117
|
-
"{}",
|
|
118
|
-
serde_json::json!({
|
|
119
|
-
"status": "error",
|
|
120
|
-
"message": format!("Failed to read sample names: {}", err),
|
|
121
|
-
"file_path": filename
|
|
122
|
-
})
|
|
123
|
-
);
|
|
124
|
-
return Err(hdf5::Error::Internal(format!("Failed to read sample names: {}", err)));
|
|
125
|
-
}
|
|
126
|
-
};
|
|
127
|
-
|
|
128
|
-
// Convert to Vec<String> for easier handling
|
|
129
|
-
let all_samples: Vec<String> = samples_varlen.iter().map(|s| s.to_string()).collect();
|
|
130
|
-
|
|
131
|
-
// Find indices of requested samples
|
|
132
|
-
let mut column_indices: Vec<usize> = Vec::with_capacity(sample_list.len());
|
|
133
|
-
for sample in sample_list {
|
|
134
|
-
if let Some(index) = all_samples.iter().position(|s| s == sample) {
|
|
135
|
-
column_indices.push(index);
|
|
136
|
-
} else {
|
|
137
|
-
return Err(hdf5::Error::Internal(format!(
|
|
138
|
-
"Sample '{}' not found in the dataset",
|
|
139
|
-
sample
|
|
140
|
-
)));
|
|
141
|
-
}
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
// Read the counts dataset
|
|
145
|
-
let counts_dataset = match file.dataset("matrix") {
|
|
146
|
-
Ok(ds) => ds,
|
|
147
|
-
Err(err) => {
|
|
148
|
-
return Err(hdf5::Error::Internal(format!("Failed to open counts dataset: {}", err)));
|
|
149
|
-
}
|
|
150
|
-
};
|
|
151
|
-
|
|
152
|
-
// Get dataset dimensions for validation
|
|
153
|
-
let dataset_shape = counts_dataset.shape();
|
|
154
|
-
if dataset_shape.len() != 2 {
|
|
155
|
-
return Err(hdf5::Error::Internal("Expected a 2D dataset for counts".to_string()));
|
|
156
|
-
};
|
|
157
|
-
|
|
158
|
-
// Check dimensions match expected values
|
|
159
|
-
if dataset_shape[0] != num_genes {
|
|
160
|
-
return Err(hdf5::Error::Internal(format!(
|
|
161
|
-
"Counts dataset first dimension ({}) doesn't match number of genes ({})",
|
|
162
|
-
dataset_shape[0], num_genes
|
|
163
|
-
)));
|
|
164
|
-
};
|
|
165
|
-
|
|
166
|
-
if dataset_shape[1] != all_samples.len() {
|
|
167
|
-
return Err(hdf5::Error::Internal(format!(
|
|
168
|
-
"Counts dataset second dimension ({}) doesn't match number of samples ({})",
|
|
169
|
-
dataset_shape[1],
|
|
170
|
-
all_samples.len()
|
|
171
|
-
)));
|
|
172
|
-
};
|
|
173
|
-
|
|
174
|
-
// Read the counts dataset
|
|
175
|
-
let all_counts = match counts_dataset.read::<f64, Dim<[usize; 2]>>() {
|
|
176
|
-
Ok(data) => data,
|
|
177
|
-
Err(err) => {
|
|
178
|
-
return Err(hdf5::Error::Internal(format!(
|
|
179
|
-
"Failed to read expression data: {}",
|
|
180
|
-
err
|
|
181
|
-
)));
|
|
182
|
-
}
|
|
183
|
-
};
|
|
184
|
-
|
|
185
|
-
let mut input_vector: Vec<f64> = Vec::with_capacity(num_genes * sample_list.len());
|
|
186
|
-
|
|
187
|
-
for gene_idx in 0..num_genes {
|
|
188
|
-
for &col_idx in &column_indices {
|
|
189
|
-
input_vector.push(all_counts[[gene_idx, col_idx]]);
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
// Create matrix from the extracted data
|
|
194
|
-
let dm = DMatrix::from_row_slice(num_genes, sample_list.len(), &input_vector);
|
|
195
|
-
|
|
196
|
-
Ok((dm, gene_names))
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
// The original input_data function for text files is kept as is
|
|
200
|
-
fn input_data(
|
|
201
|
-
filename: &String,
|
|
202
|
-
sample_list: &Vec<&str>,
|
|
203
|
-
) -> (Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>, Vec<String>) {
|
|
204
|
-
// Build the CSV reader and iterate over each record.
|
|
205
|
-
let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
|
|
206
|
-
let mut num_lines: usize = 0;
|
|
207
|
-
let mut gene_names: Vec<String> = Vec::with_capacity(500);
|
|
208
|
-
|
|
209
|
-
let mut buffer = String::new();
|
|
210
|
-
reader.read_to_string(&mut buffer).unwrap();
|
|
211
|
-
|
|
212
|
-
let lines = buffer.split("\n");
|
|
213
|
-
let mut first = true;
|
|
214
|
-
let mut input_vector: Vec<f64> = Vec::with_capacity(1000 * 500);
|
|
215
|
-
let mut column_numbers: Vec<usize> = Vec::with_capacity(300);
|
|
216
|
-
for line in lines {
|
|
217
|
-
if first == true {
|
|
218
|
-
first = false;
|
|
219
|
-
let columns: Vec<&str> = line.split("\t").collect();
|
|
220
|
-
// Finding column numbers corresponding to each sample given in the input list
|
|
221
|
-
for item in sample_list {
|
|
222
|
-
if let Some(index) = columns.iter().position(|num| num == item) {
|
|
223
|
-
column_numbers.push(index)
|
|
224
|
-
} else {
|
|
225
|
-
panic!("Sample {} not found:", item)
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
} else {
|
|
229
|
-
let line2: Vec<&str> = line.split("\t").collect();
|
|
230
|
-
if line2.len() == 1 {
|
|
231
|
-
break; // end of file
|
|
232
|
-
} else {
|
|
233
|
-
num_lines += 1;
|
|
234
|
-
//println!("line2:{:?}", line2);
|
|
235
|
-
gene_names.push(line2[3].to_string());
|
|
236
|
-
for i in &column_numbers {
|
|
237
|
-
let field = line2[*i];
|
|
238
|
-
let num = FromStr::from_str(field);
|
|
239
|
-
match num {
|
|
240
|
-
Ok(n) => {
|
|
241
|
-
//println!("n:{}", n);
|
|
242
|
-
input_vector.push(n);
|
|
243
|
-
}
|
|
244
|
-
Err(_n) => {
|
|
245
|
-
panic!(
|
|
246
|
-
"Number {} in line {} and column {} is not a decimal number",
|
|
247
|
-
field,
|
|
248
|
-
num_lines + 1,
|
|
249
|
-
i + 1
|
|
250
|
-
);
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
}
|
|
255
|
-
}
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
//println!("case_indexes:{:?}", case_indexes);
|
|
259
|
-
//println!("control_indexes:{:?}", control_indexes);
|
|
260
|
-
|
|
261
|
-
let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
|
|
262
|
-
//println!("dm:{:?}", dm);
|
|
263
|
-
(dm, gene_names)
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
#[allow(dead_code)]
|
|
267
|
-
#[derive(Debug, Serialize, Deserialize)]
|
|
268
|
-
struct GeneInfo {
|
|
269
|
-
gene_symbol: String,
|
|
270
|
-
rank_type: f64,
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
fn calculate_variance(
|
|
274
|
-
input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
275
|
-
gene_names: Vec<String>,
|
|
276
|
-
mut min_sample_size: f64,
|
|
277
|
-
filter_extreme_values: bool,
|
|
278
|
-
rank_type: String,
|
|
279
|
-
min_count_option: Option<f64>,
|
|
280
|
-
min_total_count_option: Option<f64>,
|
|
281
|
-
) -> Vec<GeneInfo> {
|
|
282
|
-
let mut min_count: f64 = 10.0;
|
|
283
|
-
match min_count_option {
|
|
284
|
-
Some(x) => min_count = x,
|
|
285
|
-
None => {}
|
|
286
|
-
}
|
|
287
|
-
let mut min_total_count: f64 = 15.0;
|
|
288
|
-
match min_total_count_option {
|
|
289
|
-
Some(x) => min_total_count = x,
|
|
290
|
-
None => {}
|
|
291
|
-
}
|
|
292
|
-
//const MIN_COUNT: f64 = 10.0; // Value of constant from R implementation
|
|
293
|
-
//const MIN_TOTAL_COUNT: f64 = 15.0; // Value of constant from R implementation
|
|
294
|
-
const LARGE_N: f64 = 10.0; // Value of constant from R implementation
|
|
295
|
-
const MIN_PROP: f64 = 0.7; // Value of constant from R implementation
|
|
296
|
-
|
|
297
|
-
if min_sample_size == 0.0 {
|
|
298
|
-
panic!("Only one condition present in groups");
|
|
299
|
-
}
|
|
300
|
-
|
|
301
|
-
if min_sample_size > LARGE_N {
|
|
302
|
-
min_sample_size = LARGE_N + (min_sample_size - LARGE_N) * MIN_PROP;
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
// Per-sample library sizes as nansum — a single NaN gene doesn't
|
|
306
|
-
// poison the whole sample's total.
|
|
307
|
-
let mut lib_sizes = Vec::<f64>::with_capacity(input_matrix.ncols());
|
|
308
|
-
for col in 0..input_matrix.ncols() {
|
|
309
|
-
let mut s = 0.0_f64;
|
|
310
|
-
for row in 0..input_matrix.nrows() {
|
|
311
|
-
let v = input_matrix[(row, col)];
|
|
312
|
-
if v.is_finite() {
|
|
313
|
-
s += v;
|
|
314
|
-
}
|
|
315
|
-
}
|
|
316
|
-
lib_sizes.push(s);
|
|
317
|
-
}
|
|
318
|
-
|
|
319
|
-
let median_lib_size = Data::new(lib_sizes.clone()).median();
|
|
320
|
-
let cpm_cutoff = (min_count / median_lib_size) * 1000000.0;
|
|
321
|
-
//println!("cpm_cutoff:{}", cpm_cutoff);
|
|
322
|
-
let cpm_matrix = cpm(&input_matrix, &lib_sizes);
|
|
323
|
-
const TOL: f64 = 1e-14; // Value of constant from R implementation
|
|
324
|
-
|
|
325
|
-
let mut gene_infos = Vec::<GeneInfo>::new();
|
|
326
|
-
for row in 0..input_matrix.nrows() {
|
|
327
|
-
let mut trues = 0.0;
|
|
328
|
-
// CPM filter (NaN-safe)
|
|
329
|
-
for col in 0..cpm_matrix.ncols() {
|
|
330
|
-
let v = cpm_matrix[(row, col)];
|
|
331
|
-
if v.is_finite() && v >= cpm_cutoff {
|
|
332
|
-
trues += 1.0;
|
|
333
|
-
}
|
|
334
|
-
}
|
|
335
|
-
let mut keep_cpm_bool = false;
|
|
336
|
-
if trues >= min_sample_size - TOL {
|
|
337
|
-
keep_cpm_bool = true;
|
|
338
|
-
//keep_cpm.push(keep_cpm_bool);
|
|
339
|
-
//positive_cpm += 1;
|
|
340
|
-
}
|
|
341
|
-
|
|
342
|
-
let mut row_sum_finite = 0.0_f64;
|
|
343
|
-
for col in 0..input_matrix.ncols() {
|
|
344
|
-
let v = input_matrix[(row, col)];
|
|
345
|
-
if v.is_finite() {
|
|
346
|
-
row_sum_finite += v;
|
|
347
|
-
}
|
|
348
|
-
}
|
|
349
|
-
let mut keep_total_bool = false;
|
|
350
|
-
if row_sum_finite >= min_total_count - TOL {
|
|
351
|
-
keep_total_bool = true;
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
let mut gene_counts: Vec<f64> = Vec::with_capacity(input_matrix.ncols());
|
|
355
|
-
for col in 0..input_matrix.ncols() {
|
|
356
|
-
let v = input_matrix[(row, col)];
|
|
357
|
-
if v.is_finite() {
|
|
358
|
-
gene_counts.push(v);
|
|
359
|
-
}
|
|
360
|
-
}
|
|
361
|
-
|
|
362
|
-
// Skip genes with too few observations to produce a stable statistic
|
|
363
|
-
let min_required = if rank_type == "var" { 2 } else { 4 };
|
|
364
|
-
if gene_counts.len() < min_required {
|
|
365
|
-
continue;
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
if rank_type == "var" {
|
|
369
|
-
// Calculating variance
|
|
370
|
-
if gene_counts.clone().variance().is_nan() == true {
|
|
371
|
-
} else if filter_extreme_values == true && keep_cpm_bool == true && keep_total_bool == true {
|
|
372
|
-
gene_infos.push(GeneInfo {
|
|
373
|
-
rank_type: gene_counts.variance(),
|
|
374
|
-
gene_symbol: gene_names[row].clone(),
|
|
375
|
-
});
|
|
376
|
-
} else if filter_extreme_values == false {
|
|
377
|
-
gene_infos.push(GeneInfo {
|
|
378
|
-
rank_type: gene_counts.variance(),
|
|
379
|
-
gene_symbol: gene_names[row].clone(),
|
|
380
|
-
});
|
|
381
|
-
}
|
|
382
|
-
} else {
|
|
383
|
-
// Calculating interquartile region
|
|
384
|
-
let mut gene_counts_data = Data::new(gene_counts);
|
|
385
|
-
if gene_counts_data.clone().interquartile_range().is_nan() == true {
|
|
386
|
-
} else if filter_extreme_values == true && keep_cpm_bool == true && keep_total_bool == true {
|
|
387
|
-
gene_infos.push(GeneInfo {
|
|
388
|
-
rank_type: gene_counts_data.interquartile_range(),
|
|
389
|
-
gene_symbol: gene_names[row].clone(),
|
|
390
|
-
});
|
|
391
|
-
} else if filter_extreme_values == false {
|
|
392
|
-
gene_infos.push(GeneInfo {
|
|
393
|
-
rank_type: gene_counts_data.interquartile_range(),
|
|
394
|
-
gene_symbol: gene_names[row].clone(),
|
|
395
|
-
});
|
|
396
|
-
}
|
|
397
|
-
}
|
|
398
|
-
}
|
|
399
|
-
gene_infos
|
|
400
|
-
.as_mut_slice()
|
|
401
|
-
.sort_by(|a, b| (a.rank_type).partial_cmp(&b.rank_type).unwrap_or(Ordering::Equal));
|
|
402
|
-
gene_infos
|
|
403
|
-
}
|
|
404
|
-
|
|
405
|
-
fn cpm(
|
|
406
|
-
input_matrix: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
407
|
-
col_sums: &[f64],
|
|
408
|
-
) -> Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>> {
|
|
409
|
-
let mut output_matrix = DMatrix::from_element(input_matrix.nrows(), input_matrix.ncols(), 0.0);
|
|
410
|
-
|
|
411
|
-
for col in 0..input_matrix.ncols() {
|
|
412
|
-
let norm = col_sums[col];
|
|
413
|
-
for row in 0..input_matrix.nrows() {
|
|
414
|
-
let v = input_matrix[(row, col)];
|
|
415
|
-
output_matrix[(row, col)] = if v.is_finite() && norm > 0.0 {
|
|
416
|
-
v * 1_000_000.0 / norm
|
|
417
|
-
} else {
|
|
418
|
-
f64::NAN
|
|
419
|
-
};
|
|
420
|
-
}
|
|
421
|
-
}
|
|
422
|
-
output_matrix
|
|
423
|
-
}
|
|
424
|
-
|
|
425
|
-
fn main() {
|
|
426
|
-
// println!("Starting gene variance calculation...");
|
|
427
|
-
let mut input = String::new();
|
|
428
|
-
match io::stdin().read_line(&mut input) {
|
|
429
|
-
// Accepting the piped input from nodejs (or command line from testing)
|
|
430
|
-
Ok(_bytes_read) => {
|
|
431
|
-
// eprintln!("Read {} bytes from stdin", bytes_read);
|
|
432
|
-
// println!("{} bytes read", bytes_read);
|
|
433
|
-
// println!("{}", input);
|
|
434
|
-
let input_json = json::parse(&input);
|
|
435
|
-
match input_json {
|
|
436
|
-
Ok(json_string) => {
|
|
437
|
-
// println!("Successfully parsed JSON input");
|
|
438
|
-
// let now = Instant::now();
|
|
439
|
-
let samples_string_result = &json_string["samples"].to_owned();
|
|
440
|
-
let samples_string;
|
|
441
|
-
match samples_string_result.as_str() {
|
|
442
|
-
Some(x) => {
|
|
443
|
-
samples_string = x.to_string();
|
|
444
|
-
// println!("Samples: {}", samples_string);
|
|
445
|
-
}
|
|
446
|
-
None => {
|
|
447
|
-
// eprintln!("ERROR: Samples not provided in JSON");
|
|
448
|
-
println!(
|
|
449
|
-
"{}",
|
|
450
|
-
serde_json::json!({
|
|
451
|
-
"status": "error",
|
|
452
|
-
"message": "Samples not provided"
|
|
453
|
-
})
|
|
454
|
-
);
|
|
455
|
-
return;
|
|
456
|
-
}
|
|
457
|
-
}
|
|
458
|
-
|
|
459
|
-
let file_name_result = &json_string["input_file"];
|
|
460
|
-
let file_name;
|
|
461
|
-
|
|
462
|
-
match file_name_result.as_str() {
|
|
463
|
-
Some(x) => {
|
|
464
|
-
file_name = x.to_string();
|
|
465
|
-
// eprintln!("Input file: {}", file_name);
|
|
466
|
-
// Return file name as JSON for debugging
|
|
467
|
-
// println!(
|
|
468
|
-
// "{}",
|
|
469
|
-
// serde_json::json!({"status": "success", "file_name": file_name})
|
|
470
|
-
// );
|
|
471
|
-
}
|
|
472
|
-
None => {
|
|
473
|
-
// eprintln!("ERROR: File name missing in JSON");
|
|
474
|
-
// println!(
|
|
475
|
-
// "{}",
|
|
476
|
-
// serde_json::json!({
|
|
477
|
-
// "status": "error",
|
|
478
|
-
// "message": "File name is missing"
|
|
479
|
-
// })
|
|
480
|
-
// );
|
|
481
|
-
return;
|
|
482
|
-
}
|
|
483
|
-
}
|
|
484
|
-
|
|
485
|
-
// Determine file type based on extension
|
|
486
|
-
let file_type: String;
|
|
487
|
-
if file_name.to_lowercase().ends_with(".h5") {
|
|
488
|
-
file_type = "hdf5".to_string();
|
|
489
|
-
// eprintln!("Detected HDF5 file format based on .h5 extension");
|
|
490
|
-
} else {
|
|
491
|
-
file_type = "text".to_string();
|
|
492
|
-
// eprintln!("Using default text file format (no .h5 extension found)");
|
|
493
|
-
}
|
|
494
|
-
|
|
495
|
-
// Determine if the H5 file is new format
|
|
496
|
-
//let new_format: bool = match &json_string {
|
|
497
|
-
// json::JsonValue::Object(ref obj) => {
|
|
498
|
-
// obj.get("newformat").and_then(|v| v.as_bool()).map_or(false, |b| b)
|
|
499
|
-
// }
|
|
500
|
-
// _ => false,
|
|
501
|
-
//};
|
|
502
|
-
|
|
503
|
-
let rank_type = &json_string["rank_type"] // Value provide must be either "var" or "iqr"
|
|
504
|
-
.to_owned()
|
|
505
|
-
.as_str()
|
|
506
|
-
.unwrap_or("var")
|
|
507
|
-
.to_string();
|
|
508
|
-
// eprintln!("Rank type: {}", rank_type);
|
|
509
|
-
if rank_type != "var" && rank_type != "iqr" {
|
|
510
|
-
// Check if any unknown method has been provided
|
|
511
|
-
// eprintln!("ERROR: Unknown rank method: {}", rank_type);
|
|
512
|
-
// println!(
|
|
513
|
-
// "{}",
|
|
514
|
-
// serde_json::json!({
|
|
515
|
-
// "status": "error",
|
|
516
|
-
// "message": format!("Unknown rank method: {}. Must be 'var' or 'iqr'", rank_type)
|
|
517
|
-
// })
|
|
518
|
-
// );
|
|
519
|
-
return;
|
|
520
|
-
}
|
|
521
|
-
let filter_extreme_values_result = &json_string["filter_extreme_values"];
|
|
522
|
-
|
|
523
|
-
let filter_extreme_values;
|
|
524
|
-
match filter_extreme_values_result.as_bool() {
|
|
525
|
-
Some(x) => {
|
|
526
|
-
filter_extreme_values = x;
|
|
527
|
-
// eprintln!("Filter extreme values: {}", filter_extreme_values);
|
|
528
|
-
}
|
|
529
|
-
None => {
|
|
530
|
-
filter_extreme_values = true; // If filter_extreme_values field is missing, set it to true by default
|
|
531
|
-
// eprintln!(
|
|
532
|
-
// "Filter extreme values not specified, defaulting to: {}",
|
|
533
|
-
// filter_extreme_values
|
|
534
|
-
// );
|
|
535
|
-
}
|
|
536
|
-
}
|
|
537
|
-
|
|
538
|
-
let num_genes_result = &json_string["num_genes"];
|
|
539
|
-
let num_genes;
|
|
540
|
-
match num_genes_result.as_usize() {
|
|
541
|
-
Some(x) => {
|
|
542
|
-
num_genes = x;
|
|
543
|
-
// eprintln!("Number of genes requested: {}", num_genes);
|
|
544
|
-
}
|
|
545
|
-
None => {
|
|
546
|
-
// eprintln!("ERROR: Number of genes to be given is missing");
|
|
547
|
-
println!(
|
|
548
|
-
"{}",
|
|
549
|
-
serde_json::json!({
|
|
550
|
-
"status": "error",
|
|
551
|
-
"message": "Number of genes to be given is missing"
|
|
552
|
-
})
|
|
553
|
-
);
|
|
554
|
-
return;
|
|
555
|
-
}
|
|
556
|
-
}
|
|
557
|
-
|
|
558
|
-
let min_count_result = &json_string["min_count"];
|
|
559
|
-
let mut min_count: Option<f64> = None;
|
|
560
|
-
match min_count_result.as_f64() {
|
|
561
|
-
Some(x) => {
|
|
562
|
-
min_count = Some(x);
|
|
563
|
-
// eprintln!("Min count: {}", x);
|
|
564
|
-
}
|
|
565
|
-
None => {
|
|
566
|
-
// eprintln!("Min count not specified, will use default");
|
|
567
|
-
}
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
let min_total_count_result = &json_string["min_total_count"];
|
|
571
|
-
let mut min_total_count: Option<f64> = None;
|
|
572
|
-
match min_total_count_result.as_f64() {
|
|
573
|
-
Some(x) => {
|
|
574
|
-
min_total_count = Some(x);
|
|
575
|
-
// eprintln!("Min total count: {}", x);
|
|
576
|
-
}
|
|
577
|
-
None => {
|
|
578
|
-
// eprintln!("Min total count not specified, will use default");
|
|
579
|
-
}
|
|
580
|
-
}
|
|
581
|
-
|
|
582
|
-
let samples_list: Vec<&str> = samples_string.split(",").collect();
|
|
583
|
-
// eprintln!("Number of samples in list: {}", samples_list.len());
|
|
584
|
-
|
|
585
|
-
// Choose the appropriate input function based on file type
|
|
586
|
-
// eprintln!("Reading data from {} file: {}", file_type, file_name);
|
|
587
|
-
let (input_matrix, gene_names) = if file_type == "hdf5" {
|
|
588
|
-
// eprintln!("Using HDF5 reader function...");
|
|
589
|
-
match input_data_hdf5(&file_name, &samples_list) {
|
|
590
|
-
Ok(result) => result,
|
|
591
|
-
Err(err) => {
|
|
592
|
-
eprintln!("ERROR in HDF5 reader: {:?}", err);
|
|
593
|
-
return;
|
|
594
|
-
}
|
|
595
|
-
}
|
|
596
|
-
} else {
|
|
597
|
-
// For original text-based implementation, we wrap it in a try-catch block
|
|
598
|
-
// to handle panics in a more structured way
|
|
599
|
-
// eprintln!("Using text file reader function...");
|
|
600
|
-
match std::panic::catch_unwind(|| input_data(&file_name, &samples_list)) {
|
|
601
|
-
Ok(result) => {
|
|
602
|
-
// eprintln!("Successfully read text file data");
|
|
603
|
-
result
|
|
604
|
-
}
|
|
605
|
-
Err(err) => {
|
|
606
|
-
eprintln!("ERROR in text file reader: {:?}", err);
|
|
607
|
-
println!(
|
|
608
|
-
"{}",
|
|
609
|
-
serde_json::json!({
|
|
610
|
-
"status": "error",
|
|
611
|
-
"message": "Failed to read text file data",
|
|
612
|
-
"file_path": file_name
|
|
613
|
-
})
|
|
614
|
-
);
|
|
615
|
-
return;
|
|
616
|
-
}
|
|
617
|
-
}
|
|
618
|
-
};
|
|
619
|
-
|
|
620
|
-
// eprintln!(
|
|
621
|
-
// "Matrix dimensions: {}x{}",
|
|
622
|
-
// input_matrix.nrows(),
|
|
623
|
-
// input_matrix.ncols()
|
|
624
|
-
// );
|
|
625
|
-
// eprintln!("Number of gene symbols: {}", gene_names.len());
|
|
626
|
-
if !gene_names.is_empty() {
|
|
627
|
-
// eprintln!(
|
|
628
|
-
// "First few gene symbols: {:?}",
|
|
629
|
-
// &gene_names.iter().take(5).collect::<Vec<_>>()
|
|
630
|
-
// );
|
|
631
|
-
}
|
|
632
|
-
|
|
633
|
-
// Wrap the variance calculation in a try-catch to capture any panics
|
|
634
|
-
// eprintln!(
|
|
635
|
-
// "Calculating variance with {} samples, filter={}, rank_type={}",
|
|
636
|
-
// samples_list.len(),
|
|
637
|
-
// filter_extreme_values,
|
|
638
|
-
// rank_type
|
|
639
|
-
// );
|
|
640
|
-
let gene_infos = match std::panic::catch_unwind(|| {
|
|
641
|
-
calculate_variance(
|
|
642
|
-
input_matrix,
|
|
643
|
-
gene_names,
|
|
644
|
-
samples_list.len() as f64,
|
|
645
|
-
filter_extreme_values,
|
|
646
|
-
rank_type.to_string(),
|
|
647
|
-
min_count,
|
|
648
|
-
min_total_count,
|
|
649
|
-
)
|
|
650
|
-
}) {
|
|
651
|
-
Ok(result) => {
|
|
652
|
-
// eprintln!(
|
|
653
|
-
// "Successfully calculated variance for {} genes",
|
|
654
|
-
// result.len()
|
|
655
|
-
// );
|
|
656
|
-
result
|
|
657
|
-
}
|
|
658
|
-
Err(err) => {
|
|
659
|
-
eprintln!("ERROR in variance calculation: {:?}", err);
|
|
660
|
-
println!(
|
|
661
|
-
"{}",
|
|
662
|
-
serde_json::json!({
|
|
663
|
-
"status": "error",
|
|
664
|
-
"message": "Error calculating gene variance",
|
|
665
|
-
"file_path": file_name
|
|
666
|
-
})
|
|
667
|
-
);
|
|
668
|
-
return;
|
|
669
|
-
}
|
|
670
|
-
};
|
|
671
|
-
|
|
672
|
-
// Check if we have enough genes for the requested output
|
|
673
|
-
if gene_infos.len() < num_genes {
|
|
674
|
-
// eprintln!(
|
|
675
|
-
// "WARNING: Only {} genes found, but {} were requested",
|
|
676
|
-
// gene_infos.len(),
|
|
677
|
-
// num_genes
|
|
678
|
-
// );
|
|
679
|
-
}
|
|
680
|
-
|
|
681
|
-
let actual_num_genes = std::cmp::min(num_genes, gene_infos.len());
|
|
682
|
-
// eprintln!("Returning top {} genes", actual_num_genes);
|
|
683
|
-
|
|
684
|
-
// Printing the top "num_genes" genes to JSON
|
|
685
|
-
let mut output_string = "[".to_string();
|
|
686
|
-
for j in 0..actual_num_genes {
|
|
687
|
-
let i = gene_infos.len() - j - 1;
|
|
688
|
-
output_string += &serde_json::to_string(&gene_infos[i]).unwrap();
|
|
689
|
-
if i > gene_infos.len() - actual_num_genes {
|
|
690
|
-
output_string += &",".to_string();
|
|
691
|
-
}
|
|
692
|
-
}
|
|
693
|
-
output_string += &"]".to_string();
|
|
694
|
-
|
|
695
|
-
// Debug the first few characters of the output
|
|
696
|
-
if output_string.len() > 100 {
|
|
697
|
-
// eprintln!("Output JSON starts with: {}", &output_string[0..100]);
|
|
698
|
-
} else {
|
|
699
|
-
// eprintln!("Output JSON: {}", output_string);
|
|
700
|
-
}
|
|
701
|
-
|
|
702
|
-
println!("output_json:{}", output_string);
|
|
703
|
-
// let elapsed = now.elapsed();
|
|
704
|
-
// eprintln!("Completed in: {:?}", elapsed);
|
|
705
|
-
// println!("Time for calculating variances:{:?}", elapsed);
|
|
706
|
-
}
|
|
707
|
-
Err(error) => {
|
|
708
|
-
eprintln!("ERROR: JSON parsing error: {}", error);
|
|
709
|
-
println!(
|
|
710
|
-
"{}",
|
|
711
|
-
serde_json::json!({
|
|
712
|
-
"status": "error",
|
|
713
|
-
"message": format!("Incorrect json: {}", error)
|
|
714
|
-
})
|
|
715
|
-
);
|
|
716
|
-
}
|
|
717
|
-
}
|
|
718
|
-
}
|
|
719
|
-
Err(error) => {
|
|
720
|
-
eprintln!("ERROR: Failed to read from stdin: {}", error);
|
|
721
|
-
println!(
|
|
722
|
-
"{}",
|
|
723
|
-
serde_json::json!({
|
|
724
|
-
"status": "error",
|
|
725
|
-
"message": format!("Piping error: {}", error)
|
|
726
|
-
})
|
|
727
|
-
);
|
|
728
|
-
}
|
|
729
|
-
}
|
|
730
|
-
// println!("Program execution complete");
|
|
731
|
-
}
|