@sjcrh/proteinpaint-rust 2.114.0 → 2.116.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/topGeneByExpressionVariance.rs +513 -31
package/package.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "2.
|
|
2
|
+
"version": "2.116.0",
|
|
3
3
|
"name": "@sjcrh/proteinpaint-rust",
|
|
4
4
|
"description": "Rust-based utilities for proteinpaint",
|
|
5
5
|
"main": "index.js",
|
|
@@ -38,5 +38,5 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"tape": "^5.2.2"
|
|
40
40
|
},
|
|
41
|
-
"pp_release_tag": "v2.
|
|
41
|
+
"pp_release_tag": "v2.116.0"
|
|
42
42
|
}
|
|
@@ -1,22 +1,23 @@
|
|
|
1
1
|
/*
|
|
2
2
|
This script selects the most variable genes by calculating the variance or interquartile range (IQR) for each gene.
|
|
3
|
+
Added support for HDF5 input files alongside the existing text file support.
|
|
3
4
|
|
|
4
5
|
Various JSON parameters:
|
|
5
6
|
samples: Enter the sample ID(s) separated by comma
|
|
6
|
-
input_file: Path to input file
|
|
7
|
+
input_file: Path to input file (either text or HDF5 format)
|
|
7
8
|
filter_extreme_values: boolean (true/false). When true, genes are filtered following the logic of filterByExpr in edgeR. This removes genes that have very low counts.
|
|
8
9
|
num_genes: The number of top-ranked genes (e.g. 10) to report in the output.
|
|
9
10
|
rank_type: var/iqr. This parameter decides whether to rank genes by variance or by interquartile range. There is an article which states that it's better to use the interquartile range than the variance when selecting genes for clustering: https://www.frontiersin.org/articles/10.3389/fgene.2021.632620/full
|
|
10
11
|
|
|
11
|
-
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","min_count":30,"min_total_count":20,"input_file":"/path/to/input/file","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' && time echo $json | target/release/gene_variance
|
|
12
|
+
Example syntax: cd .. && cargo build --release && json='{"samples":"sample1,sample2,sample3","min_count":30,"min_total_count":20,"input_file":"/path/to/input/file.h5","filter_extreme_values":true,"num_genes":100, "rank_type":"var"}' && time echo $json | target/release/gene_variance
|
|
12
13
|
*/
|
|
13
14
|
#![allow(non_snake_case)]
|
|
14
15
|
use bgzip::BGZFReader;
|
|
15
16
|
use json;
|
|
16
|
-
use nalgebra::
|
|
17
|
+
use nalgebra::DMatrix;
|
|
17
18
|
use nalgebra::base::Matrix;
|
|
18
19
|
use nalgebra::base::VecStorage;
|
|
19
|
-
use nalgebra::
|
|
20
|
+
use nalgebra::base::dimension::Dyn;
|
|
20
21
|
use serde::{Deserialize, Serialize};
|
|
21
22
|
use serde_json;
|
|
22
23
|
use statrs::statistics::Data;
|
|
@@ -28,8 +29,297 @@ use std::fs;
|
|
|
28
29
|
use std::io;
|
|
29
30
|
use std::io::Read;
|
|
30
31
|
use std::str::FromStr;
|
|
31
|
-
use std::time::Instant;
|
|
32
|
+
// use std::time::Instant;
|
|
33
|
+
use hdf5::types::VarLenAscii;
|
|
34
|
+
use hdf5::{File, Result};
|
|
35
|
+
use ndarray::Dim;
|
|
36
|
+
|
|
37
|
+
/// Read expression data from a dense HDF5 file for a list of samples
|
|
38
|
+
///
|
|
39
|
+
/// This function extracts expression data from a dense format HDF5 file for
|
|
40
|
+
/// the specified samples and returns it in the format expected by the
|
|
41
|
+
/// gene variance calculation code.
|
|
42
|
+
///
|
|
43
|
+
/// # Arguments
|
|
44
|
+
///
|
|
45
|
+
/// * `filename` - Path to the HDF5 file
|
|
46
|
+
/// * `sample_list` - List of sample IDs to extract data for
|
|
47
|
+
///
|
|
48
|
+
/// # Returns
|
|
49
|
+
///
|
|
50
|
+
/// A Result containing either:
|
|
51
|
+
/// - A tuple with expression matrix and gene symbols list on success, or
|
|
52
|
+
/// - An error with details formatted as JSON
|
|
53
|
+
fn input_data_hdf5(
|
|
54
|
+
filename: &String,
|
|
55
|
+
sample_list: &Vec<&str>,
|
|
56
|
+
) -> Result<(
|
|
57
|
+
Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
|
|
58
|
+
Vec<String>,
|
|
59
|
+
)> {
|
|
60
|
+
// let now = Instant::now();
|
|
61
|
+
// eprintln!("Reading HDF5 file: {}", filename);
|
|
62
|
+
|
|
63
|
+
// Open the HDF5 file
|
|
64
|
+
let file = match File::open(filename) {
|
|
65
|
+
Ok(f) => f,
|
|
66
|
+
Err(err) => {
|
|
67
|
+
eprintln!("Failed to open HDF5 file: {}", err);
|
|
68
|
+
println!(
|
|
69
|
+
"{}",
|
|
70
|
+
serde_json::json!({
|
|
71
|
+
"status": "error",
|
|
72
|
+
"message": format!("Failed to open HDF5 file: {}", err),
|
|
73
|
+
"file_path": filename
|
|
74
|
+
})
|
|
75
|
+
);
|
|
76
|
+
return Err(hdf5::Error::Internal(format!(
|
|
77
|
+
"Failed to open HDF5 file: {}",
|
|
78
|
+
err
|
|
79
|
+
)));
|
|
80
|
+
}
|
|
81
|
+
};
|
|
82
|
+
|
|
83
|
+
// Read gene symbols dataset
|
|
84
|
+
let genes_dataset = match file.dataset("gene_symbols") {
|
|
85
|
+
Ok(ds) => ds,
|
|
86
|
+
Err(err) => {
|
|
87
|
+
eprintln!("Failed to open gene_symbols dataset: {}", err);
|
|
88
|
+
println!(
|
|
89
|
+
"{}",
|
|
90
|
+
serde_json::json!({
|
|
91
|
+
"status": "error",
|
|
92
|
+
"message": format!("Failed to open gene_symbols dataset: {}", err),
|
|
93
|
+
"file_path": filename
|
|
94
|
+
})
|
|
95
|
+
);
|
|
96
|
+
return Err(hdf5::Error::Internal(format!(
|
|
97
|
+
"Failed to open gene_symbols dataset: {}",
|
|
98
|
+
err
|
|
99
|
+
)));
|
|
100
|
+
}
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
// Read genes as VarLenAscii
|
|
104
|
+
let genes_varlen = match genes_dataset.read_1d::<VarLenAscii>() {
|
|
105
|
+
Ok(g) => g,
|
|
106
|
+
Err(err) => {
|
|
107
|
+
eprintln!("Failed to read gene symbols: {}", err);
|
|
108
|
+
println!(
|
|
109
|
+
"{}",
|
|
110
|
+
serde_json::json!({
|
|
111
|
+
"status": "error",
|
|
112
|
+
"message": format!("Failed to read gene symbols: {}", err),
|
|
113
|
+
"file_path": filename
|
|
114
|
+
})
|
|
115
|
+
);
|
|
116
|
+
return Err(hdf5::Error::Internal(format!(
|
|
117
|
+
"Failed to read gene symbols: {}",
|
|
118
|
+
err
|
|
119
|
+
)));
|
|
120
|
+
}
|
|
121
|
+
};
|
|
122
|
+
|
|
123
|
+
// Convert to Vec<String> for easier handling
|
|
124
|
+
let gene_symbols: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
|
|
125
|
+
let num_genes = gene_symbols.len();
|
|
126
|
+
// eprintln!("Found {} gene symbols", num_genes);
|
|
127
|
+
|
|
128
|
+
// Read sample names
|
|
129
|
+
let samples_dataset = match file.dataset("samples") {
|
|
130
|
+
Ok(ds) => ds,
|
|
131
|
+
Err(err) => {
|
|
132
|
+
eprintln!("Failed to open samples dataset: {}", err);
|
|
133
|
+
println!(
|
|
134
|
+
"{}",
|
|
135
|
+
serde_json::json!({
|
|
136
|
+
"status": "error",
|
|
137
|
+
"message": format!("Failed to open samples dataset: {}", err),
|
|
138
|
+
"file_path": filename
|
|
139
|
+
})
|
|
140
|
+
);
|
|
141
|
+
return Err(hdf5::Error::Internal(format!(
|
|
142
|
+
"Failed to open samples dataset: {}",
|
|
143
|
+
err
|
|
144
|
+
)));
|
|
145
|
+
}
|
|
146
|
+
};
|
|
147
|
+
|
|
148
|
+
// Read samples as VarLenAscii
|
|
149
|
+
let samples_varlen = match samples_dataset.read_1d::<VarLenAscii>() {
|
|
150
|
+
Ok(s) => s,
|
|
151
|
+
Err(err) => {
|
|
152
|
+
eprintln!("Failed to read sample names: {}", err);
|
|
153
|
+
println!(
|
|
154
|
+
"{}",
|
|
155
|
+
serde_json::json!({
|
|
156
|
+
"status": "error",
|
|
157
|
+
"message": format!("Failed to read sample names: {}", err),
|
|
158
|
+
"file_path": filename
|
|
159
|
+
})
|
|
160
|
+
);
|
|
161
|
+
return Err(hdf5::Error::Internal(format!(
|
|
162
|
+
"Failed to read sample names: {}",
|
|
163
|
+
err
|
|
164
|
+
)));
|
|
165
|
+
}
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
// Convert to Vec<String> for easier handling
|
|
169
|
+
let all_samples: Vec<String> = samples_varlen.iter().map(|s| s.to_string()).collect();
|
|
170
|
+
// eprintln!("Found {} total samples", all_samples.len());
|
|
171
|
+
|
|
172
|
+
// Find indices of requested samples
|
|
173
|
+
let mut column_indices: Vec<usize> = Vec::with_capacity(sample_list.len());
|
|
174
|
+
for sample in sample_list {
|
|
175
|
+
if let Some(index) = all_samples.iter().position(|s| s == sample) {
|
|
176
|
+
column_indices.push(index);
|
|
177
|
+
} else {
|
|
178
|
+
eprintln!("Sample {} not found in the dataset", sample);
|
|
179
|
+
println!(
|
|
180
|
+
"{}",
|
|
181
|
+
serde_json::json!({
|
|
182
|
+
"status": "error",
|
|
183
|
+
"message": format!("Sample '{}' not found in the dataset", sample),
|
|
184
|
+
"file_path": filename,
|
|
185
|
+
"available_samples": all_samples
|
|
186
|
+
})
|
|
187
|
+
);
|
|
188
|
+
return Err(hdf5::Error::Internal(format!(
|
|
189
|
+
"Sample '{}' not found in the dataset",
|
|
190
|
+
sample
|
|
191
|
+
)));
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// Read the counts dataset
|
|
196
|
+
let counts_dataset = match file.dataset("counts") {
|
|
197
|
+
Ok(ds) => ds,
|
|
198
|
+
Err(err) => {
|
|
199
|
+
eprintln!("Failed to open counts dataset: {}", err);
|
|
200
|
+
println!(
|
|
201
|
+
"{}",
|
|
202
|
+
serde_json::json!({
|
|
203
|
+
"status": "error",
|
|
204
|
+
"message": format!("Failed to open counts dataset: {}", err),
|
|
205
|
+
"file_path": filename
|
|
206
|
+
})
|
|
207
|
+
);
|
|
208
|
+
return Err(hdf5::Error::Internal(format!(
|
|
209
|
+
"Failed to open counts dataset: {}",
|
|
210
|
+
err
|
|
211
|
+
)));
|
|
212
|
+
}
|
|
213
|
+
};
|
|
214
|
+
|
|
215
|
+
// Get dataset dimensions for validation
|
|
216
|
+
let dataset_shape = counts_dataset.shape();
|
|
217
|
+
if dataset_shape.len() != 2 {
|
|
218
|
+
eprintln!("Counts dataset does not have the expected 2D shape");
|
|
219
|
+
println!(
|
|
220
|
+
"{}",
|
|
221
|
+
serde_json::json!({
|
|
222
|
+
"status": "error",
|
|
223
|
+
"message": "Expected a 2D dataset for counts",
|
|
224
|
+
"file_path": filename,
|
|
225
|
+
"actual_shape": dataset_shape
|
|
226
|
+
})
|
|
227
|
+
);
|
|
228
|
+
return Err(hdf5::Error::Internal(
|
|
229
|
+
"Expected a 2D dataset for counts".to_string(),
|
|
230
|
+
));
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
// Check dimensions match expected values
|
|
234
|
+
if dataset_shape[0] != num_genes {
|
|
235
|
+
eprintln!(
|
|
236
|
+
"Counts dataset first dimension ({}) doesn't match number of genes ({})",
|
|
237
|
+
dataset_shape[0], num_genes
|
|
238
|
+
);
|
|
239
|
+
println!(
|
|
240
|
+
"{}",
|
|
241
|
+
serde_json::json!({
|
|
242
|
+
"status": "error",
|
|
243
|
+
"message": format!("Counts dataset first dimension ({}) doesn't match number of genes ({})",
|
|
244
|
+
dataset_shape[0], num_genes),
|
|
245
|
+
"file_path": filename
|
|
246
|
+
})
|
|
247
|
+
);
|
|
248
|
+
return Err(hdf5::Error::Internal(format!(
|
|
249
|
+
"Counts dataset first dimension ({}) doesn't match number of genes ({})",
|
|
250
|
+
dataset_shape[0], num_genes
|
|
251
|
+
)));
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
if dataset_shape[1] != all_samples.len() {
|
|
255
|
+
eprintln!(
|
|
256
|
+
"Counts dataset second dimension ({}) doesn't match number of samples ({})",
|
|
257
|
+
dataset_shape[1],
|
|
258
|
+
all_samples.len()
|
|
259
|
+
);
|
|
260
|
+
println!(
|
|
261
|
+
"{}",
|
|
262
|
+
serde_json::json!({
|
|
263
|
+
"status": "error",
|
|
264
|
+
"message": format!("Counts dataset second dimension ({}) doesn't match number of samples ({})",
|
|
265
|
+
dataset_shape[1], all_samples.len()),
|
|
266
|
+
"file_path": filename
|
|
267
|
+
})
|
|
268
|
+
);
|
|
269
|
+
return Err(hdf5::Error::Internal(format!(
|
|
270
|
+
"Counts dataset second dimension ({}) doesn't match number of samples ({})",
|
|
271
|
+
dataset_shape[1],
|
|
272
|
+
all_samples.len()
|
|
273
|
+
)));
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// Read the counts dataset
|
|
277
|
+
let all_counts = match counts_dataset.read::<f64, Dim<[usize; 2]>>() {
|
|
278
|
+
Ok(data) => data,
|
|
279
|
+
Err(err) => {
|
|
280
|
+
eprintln!("Failed to read expression data: {}", err);
|
|
281
|
+
println!(
|
|
282
|
+
"{}",
|
|
283
|
+
serde_json::json!({
|
|
284
|
+
"status": "error",
|
|
285
|
+
"message": format!("Failed to read expression data: {}", err),
|
|
286
|
+
"file_path": filename
|
|
287
|
+
})
|
|
288
|
+
);
|
|
289
|
+
return Err(hdf5::Error::Internal(format!(
|
|
290
|
+
"Failed to read expression data: {}",
|
|
291
|
+
err
|
|
292
|
+
)));
|
|
293
|
+
}
|
|
294
|
+
};
|
|
295
|
+
|
|
296
|
+
// Extract only the columns corresponding to the requested samples
|
|
297
|
+
// eprintln!(
|
|
298
|
+
// "Extracting data for {} requested samples",
|
|
299
|
+
// sample_list.len()
|
|
300
|
+
// );
|
|
301
|
+
let mut input_vector: Vec<f64> = Vec::with_capacity(num_genes * sample_list.len());
|
|
302
|
+
|
|
303
|
+
for gene_idx in 0..num_genes {
|
|
304
|
+
for &col_idx in &column_indices {
|
|
305
|
+
input_vector.push(all_counts[[gene_idx, col_idx]]);
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
// Create matrix from the extracted data
|
|
310
|
+
let dm = DMatrix::from_row_slice(num_genes, sample_list.len(), &input_vector);
|
|
311
|
+
|
|
312
|
+
// eprintln!("Time for reading HDF5 data: {:?}", now.elapsed());
|
|
313
|
+
// eprintln!(
|
|
314
|
+
// "Successfully extracted expression data matrix of size {}x{}",
|
|
315
|
+
// dm.nrows(),
|
|
316
|
+
// dm.ncols()
|
|
317
|
+
// );
|
|
318
|
+
|
|
319
|
+
Ok((dm, gene_symbols))
|
|
320
|
+
}
|
|
32
321
|
|
|
322
|
+
// The original input_data function for text files is kept as is
|
|
33
323
|
fn input_data(
|
|
34
324
|
filename: &String,
|
|
35
325
|
sample_list: &Vec<&str>,
|
|
@@ -246,46 +536,92 @@ fn cpm(
|
|
|
246
536
|
}
|
|
247
537
|
|
|
248
538
|
fn main() {
|
|
539
|
+
// eprintln!("Starting gene variance calculation...");
|
|
249
540
|
let mut input = String::new();
|
|
250
541
|
match io::stdin().read_line(&mut input) {
|
|
251
542
|
// Accepting the piped input from nodejs (or command line from testing)
|
|
252
543
|
Ok(_bytes_read) => {
|
|
544
|
+
// eprintln!("Read {} bytes from stdin", _bytes_read);
|
|
253
545
|
//println!("{} bytes read", bytes_read);
|
|
254
546
|
//println!("{}", input);
|
|
255
547
|
let input_json = json::parse(&input);
|
|
256
548
|
match input_json {
|
|
257
549
|
Ok(json_string) => {
|
|
258
|
-
|
|
550
|
+
// eprintln!("Successfully parsed JSON input");
|
|
551
|
+
// let now = Instant::now();
|
|
259
552
|
let samples_string_result = &json_string["samples"].to_owned();
|
|
260
553
|
let samples_string;
|
|
261
554
|
match samples_string_result.as_str() {
|
|
262
555
|
Some(x) => {
|
|
263
556
|
samples_string = x.to_string();
|
|
557
|
+
// eprintln!("Samples: {}", samples_string);
|
|
264
558
|
}
|
|
265
559
|
None => {
|
|
266
|
-
|
|
560
|
+
eprintln!("ERROR: Samples not provided in JSON");
|
|
561
|
+
println!(
|
|
562
|
+
"{}",
|
|
563
|
+
serde_json::json!({
|
|
564
|
+
"status": "error",
|
|
565
|
+
"message": "Samples not provided"
|
|
566
|
+
})
|
|
567
|
+
);
|
|
568
|
+
return;
|
|
267
569
|
}
|
|
268
570
|
}
|
|
269
571
|
|
|
270
572
|
let file_name_result = &json_string["input_file"];
|
|
271
573
|
let file_name;
|
|
574
|
+
|
|
272
575
|
match file_name_result.as_str() {
|
|
273
576
|
Some(x) => {
|
|
274
577
|
file_name = x.to_string();
|
|
578
|
+
// eprintln!("Input file: {}", file_name);
|
|
579
|
+
// Return file name as JSON for debugging
|
|
580
|
+
// println!(
|
|
581
|
+
// "{}",
|
|
582
|
+
// serde_json::json!({"status": "success", "file_name": file_name})
|
|
583
|
+
// );
|
|
275
584
|
}
|
|
276
585
|
None => {
|
|
277
|
-
|
|
586
|
+
eprintln!("ERROR: File name missing in JSON");
|
|
587
|
+
println!(
|
|
588
|
+
"{}",
|
|
589
|
+
serde_json::json!({
|
|
590
|
+
"status": "error",
|
|
591
|
+
"message": "File name is missing"
|
|
592
|
+
})
|
|
593
|
+
);
|
|
594
|
+
return;
|
|
278
595
|
}
|
|
279
596
|
}
|
|
280
597
|
|
|
598
|
+
// Determine file type based on extension
|
|
599
|
+
let file_type: String;
|
|
600
|
+
if file_name.to_lowercase().ends_with(".h5") {
|
|
601
|
+
file_type = "hdf5".to_string();
|
|
602
|
+
// eprintln!("Detected HDF5 file format based on .h5 extension");
|
|
603
|
+
} else {
|
|
604
|
+
file_type = "text".to_string();
|
|
605
|
+
// eprintln!("Using default text file format (no .h5 extension found)");
|
|
606
|
+
}
|
|
607
|
+
|
|
281
608
|
let rank_type = &json_string["rank_type"] // Value provide must be either "var" or "iqr"
|
|
282
609
|
.to_owned()
|
|
283
610
|
.as_str()
|
|
284
|
-
.
|
|
611
|
+
.unwrap_or("var")
|
|
285
612
|
.to_string();
|
|
613
|
+
// eprintln!("Rank type: {}", rank_type);
|
|
286
614
|
if rank_type != "var" && rank_type != "iqr" {
|
|
287
615
|
// Check if any unknown method has been provided
|
|
288
|
-
|
|
616
|
+
eprintln!("ERROR: Unknown rank method: {}", rank_type);
|
|
617
|
+
println!(
|
|
618
|
+
"{}",
|
|
619
|
+
serde_json::json!({
|
|
620
|
+
"status": "error",
|
|
621
|
+
"message": format!("Unknown rank method: {}. Must be 'var' or 'iqr'", rank_type)
|
|
622
|
+
})
|
|
623
|
+
);
|
|
624
|
+
return;
|
|
289
625
|
}
|
|
290
626
|
let filter_extreme_values_result = &json_string["filter_extreme_values"];
|
|
291
627
|
|
|
@@ -293,9 +629,14 @@ fn main() {
|
|
|
293
629
|
match filter_extreme_values_result.as_bool() {
|
|
294
630
|
Some(x) => {
|
|
295
631
|
filter_extreme_values = x;
|
|
632
|
+
eprintln!("Filter extreme values: {}", filter_extreme_values);
|
|
296
633
|
}
|
|
297
634
|
None => {
|
|
298
635
|
filter_extreme_values = true; // If filter_extreme_values field is missing, set it to true by default
|
|
636
|
+
// eprintln!(
|
|
637
|
+
// "Filter extreme values not specified, defaulting to: {}",
|
|
638
|
+
// filter_extreme_values
|
|
639
|
+
// );
|
|
299
640
|
}
|
|
300
641
|
}
|
|
301
642
|
|
|
@@ -304,55 +645,196 @@ fn main() {
|
|
|
304
645
|
match num_genes_result.as_usize() {
|
|
305
646
|
Some(x) => {
|
|
306
647
|
num_genes = x;
|
|
648
|
+
// eprintln!("Number of genes requested: {}", num_genes);
|
|
307
649
|
}
|
|
308
650
|
None => {
|
|
309
|
-
|
|
651
|
+
eprintln!("ERROR: Number of genes to be given is missing");
|
|
652
|
+
println!(
|
|
653
|
+
"{}",
|
|
654
|
+
serde_json::json!({
|
|
655
|
+
"status": "error",
|
|
656
|
+
"message": "Number of genes to be given is missing"
|
|
657
|
+
})
|
|
658
|
+
);
|
|
659
|
+
return;
|
|
310
660
|
}
|
|
311
661
|
}
|
|
312
662
|
|
|
313
663
|
let min_count_result = &json_string["min_count"];
|
|
314
664
|
let mut min_count: Option<f64> = None;
|
|
315
665
|
match min_count_result.as_f64() {
|
|
316
|
-
Some(x) =>
|
|
317
|
-
|
|
666
|
+
Some(x) => {
|
|
667
|
+
min_count = Some(x);
|
|
668
|
+
// eprintln!("Min count: {}", x);
|
|
669
|
+
}
|
|
670
|
+
None => {
|
|
671
|
+
eprintln!("Min count not specified, will use default");
|
|
672
|
+
}
|
|
318
673
|
}
|
|
319
674
|
|
|
320
675
|
let min_total_count_result = &json_string["min_total_count"];
|
|
321
676
|
let mut min_total_count: Option<f64> = None;
|
|
322
677
|
match min_total_count_result.as_f64() {
|
|
323
|
-
Some(x) =>
|
|
324
|
-
|
|
678
|
+
Some(x) => {
|
|
679
|
+
min_total_count = Some(x);
|
|
680
|
+
// eprintln!("Min total count: {}", x);
|
|
681
|
+
}
|
|
682
|
+
None => {
|
|
683
|
+
eprintln!("Min total count not specified, will use default");
|
|
684
|
+
}
|
|
325
685
|
}
|
|
326
686
|
|
|
327
687
|
let samples_list: Vec<&str> = samples_string.split(",").collect();
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
688
|
+
// eprintln!("Number of samples in list: {}", samples_list.len());
|
|
689
|
+
|
|
690
|
+
// Choose the appropriate input function based on file type
|
|
691
|
+
// eprintln!("Reading data from {} file: {}", file_type, file_name);
|
|
692
|
+
let (input_matrix, gene_symbols) = if file_type == "hdf5" {
|
|
693
|
+
// eprintln!("Using HDF5 reader function...");
|
|
694
|
+
match input_data_hdf5(&file_name, &samples_list) {
|
|
695
|
+
Ok(result) => {
|
|
696
|
+
// eprintln!("Successfully read HDF5 data");
|
|
697
|
+
result
|
|
698
|
+
}
|
|
699
|
+
Err(err) => {
|
|
700
|
+
eprintln!("ERROR in HDF5 reader: {:?}", err);
|
|
701
|
+
// Error has already been printed to stdout in JSON format by the function
|
|
702
|
+
return;
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
} else {
|
|
706
|
+
// For original text-based implementation, we wrap it in a try-catch block
|
|
707
|
+
// to handle panics in a more structured way
|
|
708
|
+
// eprintln!("Using text file reader function...");
|
|
709
|
+
match std::panic::catch_unwind(|| input_data(&file_name, &samples_list)) {
|
|
710
|
+
Ok(result) => {
|
|
711
|
+
// eprintln!("Successfully read text file data");
|
|
712
|
+
result
|
|
713
|
+
}
|
|
714
|
+
Err(err) => {
|
|
715
|
+
eprintln!("ERROR in text file reader: {:?}", err);
|
|
716
|
+
println!(
|
|
717
|
+
"{}",
|
|
718
|
+
serde_json::json!({
|
|
719
|
+
"status": "error",
|
|
720
|
+
"message": "Failed to read text file data",
|
|
721
|
+
"file_path": file_name
|
|
722
|
+
})
|
|
723
|
+
);
|
|
724
|
+
return;
|
|
725
|
+
}
|
|
726
|
+
}
|
|
727
|
+
};
|
|
728
|
+
|
|
729
|
+
// eprintln!(
|
|
730
|
+
// "Matrix dimensions: {}x{}",
|
|
731
|
+
// input_matrix.nrows(),
|
|
732
|
+
// input_matrix.ncols()
|
|
733
|
+
// );
|
|
734
|
+
// eprintln!("Number of gene symbols: {}", gene_symbols.len());
|
|
735
|
+
if !gene_symbols.is_empty() {
|
|
736
|
+
// eprintln!(
|
|
737
|
+
// "First few gene symbols: {:?}",
|
|
738
|
+
// &gene_symbols.iter().take(5).collect::<Vec<_>>()
|
|
739
|
+
// );
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
// Wrap the variance calculation in a try-catch to capture any panics
|
|
743
|
+
// eprintln!(
|
|
744
|
+
// "Calculating variance with {} samples, filter={}, rank_type={}",
|
|
745
|
+
// samples_list.len(),
|
|
746
|
+
// filter_extreme_values,
|
|
747
|
+
// rank_type
|
|
748
|
+
// );
|
|
749
|
+
let gene_infos = match std::panic::catch_unwind(|| {
|
|
750
|
+
calculate_variance(
|
|
751
|
+
input_matrix,
|
|
752
|
+
gene_symbols,
|
|
753
|
+
samples_list.len() as f64,
|
|
754
|
+
filter_extreme_values,
|
|
755
|
+
rank_type.to_string(),
|
|
756
|
+
min_count,
|
|
757
|
+
min_total_count,
|
|
758
|
+
)
|
|
759
|
+
}) {
|
|
760
|
+
Ok(result) => {
|
|
761
|
+
// eprintln!(
|
|
762
|
+
// "Successfully calculated variance for {} genes",
|
|
763
|
+
// result.len()
|
|
764
|
+
// );
|
|
765
|
+
result
|
|
766
|
+
}
|
|
767
|
+
Err(err) => {
|
|
768
|
+
eprintln!("ERROR in variance calculation: {:?}", err);
|
|
769
|
+
println!(
|
|
770
|
+
"{}",
|
|
771
|
+
serde_json::json!({
|
|
772
|
+
"status": "error",
|
|
773
|
+
"message": "Error calculating gene variance",
|
|
774
|
+
"file_path": file_name
|
|
775
|
+
})
|
|
776
|
+
);
|
|
777
|
+
return;
|
|
778
|
+
}
|
|
779
|
+
};
|
|
780
|
+
|
|
781
|
+
// Check if we have enough genes for the requested output
|
|
782
|
+
if gene_infos.len() < num_genes {
|
|
783
|
+
eprintln!(
|
|
784
|
+
"WARNING: Only {} genes found, but {} were requested",
|
|
785
|
+
gene_infos.len(),
|
|
786
|
+
num_genes
|
|
787
|
+
);
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
let actual_num_genes = std::cmp::min(num_genes, gene_infos.len());
|
|
791
|
+
// eprintln!("Returning top {} genes", actual_num_genes);
|
|
339
792
|
|
|
340
793
|
// Printing the top "num_genes" genes to JSON
|
|
341
794
|
let mut output_string = "[".to_string();
|
|
342
|
-
for j in 0..
|
|
795
|
+
for j in 0..actual_num_genes {
|
|
343
796
|
let i = gene_infos.len() - j - 1;
|
|
344
797
|
output_string += &serde_json::to_string(&gene_infos[i]).unwrap();
|
|
345
|
-
if i > gene_infos.len() -
|
|
798
|
+
if i > gene_infos.len() - actual_num_genes {
|
|
346
799
|
output_string += &",".to_string();
|
|
347
800
|
}
|
|
348
801
|
}
|
|
349
802
|
output_string += &"]".to_string();
|
|
803
|
+
|
|
804
|
+
// Debug the first few characters of the output
|
|
805
|
+
if output_string.len() > 100 {
|
|
806
|
+
// eprintln!("Output JSON starts with: {}", &output_string[0..100]);
|
|
807
|
+
} else {
|
|
808
|
+
// eprintln!("Output JSON: {}", output_string);
|
|
809
|
+
}
|
|
810
|
+
|
|
350
811
|
println!("output_json:{}", output_string);
|
|
351
|
-
|
|
812
|
+
// let elapsed = now.elapsed();
|
|
813
|
+
// eprintln!("Completed in: {:?}", elapsed);
|
|
814
|
+
// println!("Time for calculating variances:{:?}", elapsed);
|
|
815
|
+
}
|
|
816
|
+
Err(error) => {
|
|
817
|
+
eprintln!("ERROR: JSON parsing error: {}", error);
|
|
818
|
+
println!(
|
|
819
|
+
"{}",
|
|
820
|
+
serde_json::json!({
|
|
821
|
+
"status": "error",
|
|
822
|
+
"message": format!("Incorrect json: {}", error)
|
|
823
|
+
})
|
|
824
|
+
);
|
|
352
825
|
}
|
|
353
|
-
Err(error) => println!("Incorrect json: {}", error),
|
|
354
826
|
}
|
|
355
827
|
}
|
|
356
|
-
Err(error) =>
|
|
828
|
+
Err(error) => {
|
|
829
|
+
eprintln!("ERROR: Failed to read from stdin: {}", error);
|
|
830
|
+
println!(
|
|
831
|
+
"{}",
|
|
832
|
+
serde_json::json!({
|
|
833
|
+
"status": "error",
|
|
834
|
+
"message": format!("Piping error: {}", error)
|
|
835
|
+
})
|
|
836
|
+
);
|
|
837
|
+
}
|
|
357
838
|
}
|
|
839
|
+
// println!("Program execution complete");
|
|
358
840
|
}
|