@sjcrh/proteinpaint-rust 2.117.0 → 2.119.0-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.117.0",
2
+ "version": "2.119.0-0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "description": "Rust-based utilities for proteinpaint",
5
5
  "main": "index.js",
@@ -38,5 +38,5 @@
38
38
  "devDependencies": {
39
39
  "tape": "^5.2.2"
40
40
  },
41
- "pp_release_tag": "v2.117.0"
41
+ "pp_release_tag": "v2.119.0-0"
42
42
  }
package/src/DEanalysis.rs CHANGED
@@ -86,6 +86,18 @@ fn input_data_from_HDF5(
86
86
  //println!("num_samples bulk:{}", num_samples);
87
87
  //println!("num_genes bulk:{}", num_genes);
88
88
 
89
+ //let now_gene_ids = Instant::now();
90
+ let ds_gene_ids = file.dataset("gene_ids").unwrap();
91
+ //println!("ds_gene_ids:{:?}", ds_gene_ids);
92
+ let gene_ids = ds_gene_ids
93
+ .read::<VarLenAscii, Dim<[usize; 1]>>()
94
+ .unwrap();
95
+ //println!("\tgene_ids = {:?}", gene_ids);
96
+ //println!("\tgene_ids.shape() = {:?}", gene_ids.shape());
97
+ //println!("\tgene_ids.strides() = {:?}", gene_ids.strides());
98
+ //println!("\tgene_ids.ndim() = {:?}", gene_ids.ndim());
99
+ //println!("Time for parsing gene ids:{:?}", now_gene_ids.elapsed());
100
+
89
101
  //let now_gene_names = Instant::now();
90
102
  let ds_gene_names = file.dataset("gene_names").unwrap();
91
103
  //println!("ds_gene_names:{:?}", ds_gene_names);
@@ -96,28 +108,16 @@ fn input_data_from_HDF5(
96
108
  //println!("\tgene_names.shape() = {:?}", gene_names.shape());
97
109
  //println!("\tgene_names.strides() = {:?}", gene_names.strides());
98
110
  //println!("\tgene_names.ndim() = {:?}", gene_names.ndim());
99
- //println!("Time for parsing gene names:{:?}", now_gene_names.elapsed());
100
-
101
- //let now_gene_symbols = Instant::now();
102
- let ds_gene_symbols = file.dataset("gene_symbols").unwrap();
103
- //println!("ds_gene_symbols:{:?}", ds_gene_symbols);
104
- let gene_symbols = ds_gene_symbols
105
- .read::<VarLenAscii, Dim<[usize; 1]>>()
106
- .unwrap();
107
- //println!("\tgene_symbols = {:?}", gene_symbols);
108
- //println!("\tgene_symbols.shape() = {:?}", gene_symbols.shape());
109
- //println!("\tgene_symbols.strides() = {:?}", gene_symbols.strides());
110
- //println!("\tgene_symbols.ndim() = {:?}", gene_symbols.ndim());
111
111
  //println!(
112
112
  // "Time for parsing gene symbols:{:?}",
113
- // now_gene_symbols.elapsed()
113
+ // now_gene_names.elapsed()
114
114
  //);
115
115
 
116
+ let mut gene_ids_string: Vec<String> = Vec::with_capacity(gene_ids.len());
116
117
  let mut gene_names_string: Vec<String> = Vec::with_capacity(gene_names.len());
117
- let mut gene_symbols_string: Vec<String> = Vec::with_capacity(gene_symbols.len());
118
- for i in 0..gene_names.len() {
118
+ for i in 0..gene_ids.len() {
119
+ gene_ids_string.push(gene_ids[i].to_string());
119
120
  gene_names_string.push(gene_names[i].to_string());
120
- gene_symbols_string.push(gene_symbols[i].to_string());
121
121
  }
122
122
 
123
123
  //let now_samples = Instant::now();
@@ -154,7 +154,7 @@ fn input_data_from_HDF5(
154
154
  }
155
155
 
156
156
  let sample_array: Array2<f64> = ds_counts
157
- .read_slice_2d((0..gene_names.len(), sample_index..sample_index + 1))
157
+ .read_slice_2d((0..gene_ids.len(), sample_index..sample_index + 1))
158
158
  .unwrap();
159
159
  //println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
160
160
  input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
@@ -183,7 +183,7 @@ fn input_data_from_HDF5(
183
183
  //let data_counts: Array1<_> = ds_counts.read::<f64, Dim<[usize; 1]>>().unwrap();
184
184
  //println!("Data_counts: {:?}", data_counts);
185
185
  let sample_array: Array2<f64> = ds_counts
186
- .read_slice_2d((0..gene_names.len(), sample_index..sample_index + 1))
186
+ .read_slice_2d((0..gene_ids.len(), sample_index..sample_index + 1))
187
187
  .unwrap();
188
188
  //println!("Length of gene array:{:?}", sample_array.len()); // Please check the result
189
189
  input_vector.append(&mut sample_array.as_slice().unwrap().to_vec());
@@ -196,19 +196,19 @@ fn input_data_from_HDF5(
196
196
  // "case + control length:{}",
197
197
  // case_list.len() + control_list.len()
198
198
  //);
199
- //println!("gene_names length:{}", gene_names.len());
199
+ //println!("gene_ids length:{}", gene_ids.len());
200
200
  //println!("input_vector length:{}", input_vector.len());
201
201
  let dm = DMatrix::from_row_slice(
202
202
  case_list.len() + control_list.len(),
203
- gene_names.len(),
203
+ gene_ids.len(),
204
204
  &input_vector,
205
205
  );
206
206
  (
207
207
  dm.transpose(), // Transposing the matrix
208
208
  case_indexes,
209
209
  control_indexes,
210
+ gene_ids_string,
210
211
  gene_names_string,
211
- gene_symbols_string,
212
212
  )
213
213
  }
214
214
 
@@ -227,8 +227,8 @@ fn input_data_from_text(
227
227
  let mut file = File::open(filename).unwrap();
228
228
  let mut num_lines: usize = 0;
229
229
  let mut input_vector: Vec<f64> = Vec::with_capacity(500 * 65000);
230
+ let mut gene_ids: Vec<String> = Vec::with_capacity(65000);
230
231
  let mut gene_names: Vec<String> = Vec::with_capacity(65000);
231
- let mut gene_symbols: Vec<String> = Vec::with_capacity(65000);
232
232
  let mut num_columns: usize = 0;
233
233
 
234
234
  // Check headers for samples
@@ -289,9 +289,9 @@ fn input_data_from_text(
289
289
  let mut index = 0;
290
290
  for field in line.split('\t').collect::<Vec<&str>>() {
291
291
  if index == gene_name_index.unwrap() {
292
- gene_names.push(field.to_string());
292
+ gene_ids.push(field.to_string());
293
293
  } else if index == gene_symbol_index.unwrap() {
294
- gene_symbols.push(field.to_string());
294
+ gene_names.push(field.to_string());
295
295
  } else if binary_search(&case_indexes_original, index) != -1 {
296
296
  let num = FromStr::from_str(field);
297
297
  match num {
@@ -475,8 +475,8 @@ fn input_data_from_text(
475
475
  input_vector.append(&mut *input_vector_temp.lock().unwrap());
476
476
  case_indexes.append(&mut *case_indexes_temp.lock().unwrap());
477
477
  control_indexes.append(&mut *control_indexes_temp.lock().unwrap());
478
- gene_names.append(&mut *genes_names_temp.lock().unwrap());
479
- gene_symbols.append(&mut *genes_symbols_temp.lock().unwrap());
478
+ gene_ids.append(&mut *genes_names_temp.lock().unwrap());
479
+ gene_names.append(&mut *genes_symbols_temp.lock().unwrap());
480
480
 
481
481
  num_lines += *num_lines_temp.lock().unwrap();
482
482
  num_columns += *num_columns_temp.lock().unwrap();
@@ -490,7 +490,7 @@ fn input_data_from_text(
490
490
  //println!("Time for inputting data:{:?}", input_time.elapsed());
491
491
  let dm = DMatrix::from_row_slice(num_lines, num_columns, &input_vector);
492
492
  //println!("dm:{:?}", dm);
493
- (dm, case_indexes, control_indexes, gene_names, gene_symbols)
493
+ (dm, case_indexes, control_indexes, gene_ids, gene_names)
494
494
  }
495
495
 
496
496
  #[allow(dead_code)]
@@ -625,16 +625,16 @@ fn main() {
625
625
  input_matrix,
626
626
  case_indexes,
627
627
  control_indexes,
628
+ gene_ids,
628
629
  gene_names,
629
- gene_symbols,
630
630
  );
631
631
  if storage_type == "text" {
632
632
  (
633
633
  input_matrix,
634
634
  case_indexes,
635
635
  control_indexes,
636
+ gene_ids,
636
637
  gene_names,
637
- gene_symbols,
638
638
  ) = input_data_from_text(file_name, &case_list, &control_list);
639
639
  } else {
640
640
  // Parsing data from a HDF5 file
@@ -642,8 +642,8 @@ fn main() {
642
642
  input_matrix,
643
643
  case_indexes,
644
644
  control_indexes,
645
+ gene_ids,
645
646
  gene_names,
646
- gene_symbols,
647
647
  ) = input_data_from_HDF5(file_name, &case_list, &control_list);
648
648
  }
649
649
  //let filtering_time = Instant::now();
@@ -651,19 +651,27 @@ fn main() {
651
651
  filtered_matrix,
652
652
  lib_sizes,
653
653
  filtered_genes,
654
- filtered_gene_symbols,
654
+ filtered_gene_names,
655
655
  ) = filter_by_expr(
656
656
  min_count,
657
657
  min_total_count,
658
658
  &input_matrix,
659
659
  case_indexes.len(),
660
660
  control_indexes.len(),
661
+ gene_ids,
661
662
  gene_names,
662
- gene_symbols,
663
663
  );
664
664
  //println!("filtering time:{:?}", filtering_time.elapsed());
665
665
  //println!("filtered_matrix_rows:{:?}", filtered_matrix.nrows());
666
666
  //println!("filtered_matrix_cols:{:?}", filtered_matrix.ncols());
667
+ if filtered_matrix.nrows() == 0 {
668
+ // It's possible that no genes are left in the matrix after filtering; in that case the Rust code must exit gracefully with an error.
669
+ panic!("Number of genes after filtering = 0, cannot proceed any further")
670
+ }
671
+ if filtered_matrix.ncols() == 0 {
672
+ // It's possible that no samples are left in the matrix after filtering; in that case the Rust code must exit gracefully with an error.
673
+ panic!("Number of samples after filtering = 0, cannot proceed any further")
674
+ }
667
675
  //let cpm_normalization_time = Instant::now();
668
676
  let mut normalized_matrix = cpm(&filtered_matrix);
669
677
  //println!(
@@ -747,7 +755,7 @@ fn main() {
747
755
  p_values.push(PValueIndexes {
748
756
  index: i,
749
757
  gene_name: filtered_genes[i].to_owned(),
750
- gene_symbol: filtered_gene_symbols[i].to_owned(),
758
+ gene_symbol: filtered_gene_names[i].to_owned(),
751
759
  fold_change: (treated_mean.unwrap()
752
760
  / control_mean.unwrap())
753
761
  .log2(),
@@ -759,8 +767,8 @@ fn main() {
759
767
  // Multithreaded implementation of calculating wilcoxon p-values
760
768
  let normalized_matrix_temp = Arc::new(normalized_matrix);
761
769
  let filtered_genes_temp = Arc::new(filtered_genes);
762
- let filtered_gene_symbols_temp =
763
- Arc::new(filtered_gene_symbols);
770
+ let filtered_gene_names_temp =
771
+ Arc::new(filtered_gene_names);
764
772
  let case_indexes_temp = Arc::new(case_indexes);
765
773
  let control_indexes_temp = Arc::new(control_indexes);
766
774
  let p_values_temp =
@@ -774,8 +782,8 @@ fn main() {
774
782
  Arc::clone(&control_indexes_temp);
775
783
  let p_values_temp = Arc::clone(&p_values_temp);
776
784
  let filtered_genes_temp = Arc::clone(&filtered_genes_temp);
777
- let filtered_gene_symbols_temp =
778
- Arc::clone(&filtered_gene_symbols_temp);
785
+ let filtered_gene_names_temp =
786
+ Arc::clone(&filtered_gene_names_temp);
779
787
  let handle = thread::spawn(move || {
780
788
  let mut p_values_thread: Vec<PValueIndexes> =
781
789
  Vec::with_capacity(
@@ -835,7 +843,7 @@ fn main() {
835
843
  index: i,
836
844
  gene_name: filtered_genes_temp[i]
837
845
  .to_owned(),
838
- gene_symbol: filtered_gene_symbols_temp
846
+ gene_symbol: filtered_gene_names_temp
839
847
  [i]
840
848
  .to_owned(),
841
849
  fold_change: (treated_mean.unwrap()
@@ -1214,7 +1222,6 @@ fn calc_factor_quantile(
1214
1222
  for i in 0..input_matrix.nrows() {
1215
1223
  row_vec.push(input_matrix[(i, j)] as f64);
1216
1224
  }
1217
- //println!("row_vec:{:?}", row_vec);
1218
1225
  let quan = calc_quantile(row_vec, P);
1219
1226
  //println!("quan:{}", quan);
1220
1227
  let num = quan / lib_sizes[j];
@@ -1245,8 +1252,8 @@ fn filter_by_expr(
1245
1252
  raw_data: &Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
1246
1253
  num_diseased: usize,
1247
1254
  num_control: usize,
1255
+ gene_ids: Vec<String>,
1248
1256
  gene_names: Vec<String>,
1249
- gene_symbols: Vec<String>,
1250
1257
  ) -> (
1251
1258
  Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
1252
1259
  Vec<f64>,
@@ -1338,14 +1345,14 @@ fn filter_by_expr(
1338
1345
  }
1339
1346
  let mut filtered_matrix = DMatrix::from_vec(positives.len(), num_diseased + num_control, blank);
1340
1347
  let mut filtered_genes: Vec<String> = Vec::with_capacity(positives.len());
1341
- let mut filtered_gene_symbols: Vec<String> = Vec::with_capacity(positives.len());
1348
+ let mut filtered_gene_names: Vec<String> = Vec::with_capacity(positives.len());
1342
1349
  let mut i = 0;
1343
1350
  //println!("filtered_matrix rows:{}", filtered_matrix.nrows());
1344
1351
  //println!("filtered_matrix cols:{}", filtered_matrix.ncols());
1345
1352
  for index in positives {
1346
1353
  let row = raw_data.row(index);
1347
- filtered_genes.push(gene_names[index].to_owned());
1348
- filtered_gene_symbols.push(gene_symbols[index].to_owned());
1354
+ filtered_genes.push(gene_ids[index].to_owned());
1355
+ filtered_gene_names.push(gene_names[index].to_owned());
1349
1356
  let mut j = 0;
1350
1357
  for item in &row {
1351
1358
  //println!("index:{}", index);
@@ -1369,7 +1376,7 @@ fn filter_by_expr(
1369
1376
  filtered_matrix,
1370
1377
  modified_lib_sizes,
1371
1378
  filtered_genes,
1372
- filtered_gene_symbols,
1379
+ filtered_gene_names,
1373
1380
  )
1374
1381
  }
1375
1382
 
@@ -81,20 +81,20 @@ fn input_data_hdf5(
81
81
  };
82
82
 
83
83
  // Read gene names dataset
84
- let genes_dataset = match file.dataset("gene_symbols") {
84
+ let genes_dataset = match file.dataset("gene_names") {
85
85
  Ok(ds) => ds,
86
86
  Err(err) => {
87
- // eprintln!("Failed to open gene_symbols dataset: {}", err);
87
+ // eprintln!("Failed to open gene_names dataset: {}", err);
88
88
  // println!(
89
89
  // "{}",
90
90
  // serde_json::json!({
91
91
  // "status": "error",
92
- // "message": format!("Failed to open gene_symbols dataset: {}", err),
92
+ // "message": format!("Failed to open gene_names dataset: {}", err),
93
93
  // "file_path": filename
94
94
  // })
95
95
  // );
96
96
  return Err(hdf5::Error::Internal(format!(
97
- "Failed to open gene_symbols dataset: {}",
97
+ "Failed to open gene_names dataset: {}",
98
98
  err
99
99
  )));
100
100
  }
@@ -121,8 +121,8 @@ fn input_data_hdf5(
121
121
  };
122
122
 
123
123
  // Convert to Vec<String> for easier handling
124
- let gene_symbols: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
125
- let num_genes = gene_symbols.len();
124
+ let gene_names: Vec<String> = genes_varlen.iter().map(|g| g.to_string()).collect();
125
+ let num_genes = gene_names.len();
126
126
  // eprintln!("Found {} gene symbols", num_genes);
127
127
 
128
128
  // Read sample names
@@ -316,7 +316,7 @@ fn input_data_hdf5(
316
316
  // dm.ncols()
317
317
  // );
318
318
 
319
- Ok((dm, gene_symbols))
319
+ Ok((dm, gene_names))
320
320
  }
321
321
 
322
322
  // The original input_data function for text files is kept as is
@@ -330,7 +330,7 @@ fn input_data(
330
330
  // Build the CSV reader and iterate over each record.
331
331
  let mut reader = BGZFReader::new(fs::File::open(filename).unwrap()).unwrap();
332
332
  let mut num_lines: usize = 0;
333
- let mut gene_symbols: Vec<String> = Vec::with_capacity(500);
333
+ let mut gene_names: Vec<String> = Vec::with_capacity(500);
334
334
 
335
335
  let mut buffer = String::new();
336
336
  reader.read_to_string(&mut buffer).unwrap();
@@ -358,7 +358,7 @@ fn input_data(
358
358
  } else {
359
359
  num_lines += 1;
360
360
  //println!("line2:{:?}", line2);
361
- gene_symbols.push(line2[3].to_string());
361
+ gene_names.push(line2[3].to_string());
362
362
  for i in &column_numbers {
363
363
  let field = line2[*i];
364
364
  let num = FromStr::from_str(field);
@@ -386,7 +386,7 @@ fn input_data(
386
386
 
387
387
  let dm = DMatrix::from_row_slice(num_lines, sample_list.len(), &input_vector);
388
388
  //println!("dm:{:?}", dm);
389
- (dm, gene_symbols)
389
+ (dm, gene_names)
390
390
  }
391
391
 
392
392
  #[allow(dead_code)]
@@ -398,7 +398,7 @@ struct GeneInfo {
398
398
 
399
399
  fn calculate_variance(
400
400
  input_matrix: Matrix<f64, Dyn, Dyn, VecStorage<f64, Dyn, Dyn>>,
401
- gene_symbols: Vec<String>,
401
+ gene_names: Vec<String>,
402
402
  mut min_sample_size: f64,
403
403
  filter_extreme_values: bool,
404
404
  rank_type: String,
@@ -478,12 +478,12 @@ fn calculate_variance(
478
478
  {
479
479
  gene_infos.push(GeneInfo {
480
480
  rank_type: gene_counts.variance(),
481
- gene_symbol: gene_symbols[row].clone(),
481
+ gene_symbol: gene_names[row].clone(),
482
482
  });
483
483
  } else if filter_extreme_values == false {
484
484
  gene_infos.push(GeneInfo {
485
485
  rank_type: gene_counts.variance(),
486
- gene_symbol: gene_symbols[row].clone(),
486
+ gene_symbol: gene_names[row].clone(),
487
487
  });
488
488
  }
489
489
  } else {
@@ -496,12 +496,12 @@ fn calculate_variance(
496
496
  {
497
497
  gene_infos.push(GeneInfo {
498
498
  rank_type: gene_counts_data.interquartile_range(),
499
- gene_symbol: gene_symbols[row].clone(),
499
+ gene_symbol: gene_names[row].clone(),
500
500
  });
501
501
  } else if filter_extreme_values == false {
502
502
  gene_infos.push(GeneInfo {
503
503
  rank_type: gene_counts_data.interquartile_range(),
504
- gene_symbol: gene_symbols[row].clone(),
504
+ gene_symbol: gene_names[row].clone(),
505
505
  });
506
506
  }
507
507
  }
@@ -689,7 +689,7 @@ fn main() {
689
689
 
690
690
  // Choose the appropriate input function based on file type
691
691
  // eprintln!("Reading data from {} file: {}", file_type, file_name);
692
- let (input_matrix, gene_symbols) = if file_type == "hdf5" {
692
+ let (input_matrix, gene_names) = if file_type == "hdf5" {
693
693
  // eprintln!("Using HDF5 reader function...");
694
694
  match input_data_hdf5(&file_name, &samples_list) {
695
695
  Ok(result) => {
@@ -731,11 +731,11 @@ fn main() {
731
731
  // input_matrix.nrows(),
732
732
  // input_matrix.ncols()
733
733
  // );
734
- // eprintln!("Number of gene symbols: {}", gene_symbols.len());
735
- if !gene_symbols.is_empty() {
734
+ // eprintln!("Number of gene symbols: {}", gene_names.len());
735
+ if !gene_names.is_empty() {
736
736
  // eprintln!(
737
737
  // "First few gene symbols: {:?}",
738
- // &gene_symbols.iter().take(5).collect::<Vec<_>>()
738
+ // &gene_names.iter().take(5).collect::<Vec<_>>()
739
739
  // );
740
740
  }
741
741
 
@@ -749,7 +749,7 @@ fn main() {
749
749
  let gene_infos = match std::panic::catch_unwind(|| {
750
750
  calculate_variance(
751
751
  input_matrix,
752
- gene_symbols,
752
+ gene_names,
753
753
  samples_list.len() as f64,
754
754
  filter_extreme_values,
755
755
  rank_type.to_string(),