@sjcrh/proteinpaint-rust 2.137.2-0 → 2.138.3-7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -111,3 +111,8 @@ path="src/gdcGRIN2.rs"
  [[bin]]
  name="cerno"
  path="src/cerno.rs"
+
+ [[bin]]
+ name="readH5"
+ path="src/readH5.rs"
+
package/package.json CHANGED
@@ -1,5 +1,5 @@
  {
- "version": "2.137.2-0",
+ "version": "2.138.3-7",
  "name": "@sjcrh/proteinpaint-rust",
  "type": "module",
  "description": "Rust-based utilities for proteinpaint",
package/src/readH5.rs ADDED
@@ -0,0 +1,617 @@
+ // readH5.rs - validate/read HDF5 file
+ //
+ // READ:
+ // Extracts rows of the matrix from HDF5 files.
+ // The matrix dataset name is hardcoded as "matrix"; row_dataset ("samples") and
+ // col_dataset ("item") hold the axis labels.
+ // row_dataset and col_dataset are stored as VarLenUnicode.
+ // Supports f32, f64, i32 and i64 matrix datatypes.
+ //
+ // Features:
+ // - Hardcoded "matrix" dataset
+ // - Supports f32, f64, i32 and i64 matrix datatypes
+ // - Parallel processing with a thread count capped at 4
+ // - JSON output with timing metrics
+
+ // Usage
+ // echo '{"query":["HALLMARK_ADIPOGENESIS", "HALLMARK_ANGIOGENESIS"],"hdf5_file":"matrix.h5"}' | ./target/release/readH5
+ //
+ // VALIDATE:
+ // Output: JSON validation status plus {"samples":[...]}
+ // Usage
+ // echo '{"validate":true,"hdf5_file":"matrix.h5"}' | ./target/release/readH5
+
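+ // Illustrative output shapes, sketched from the code below (field values are
+ // made-up examples, not output from a real file):
+ //
+ // validate -> {"status":"success","message":"HDF5 matrix file loaded successfully",
+ //              "file_path":"matrix.h5","format":"matrix",
+ //              "matrix_dimensions":{"num_rows":3,"num_columns":2},"samples":["s1","s2"]}
+ // query    -> {"query_output":{...},"timings":{...},"total_time_ms":12}
+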
+ use hdf5::types::VarLenUnicode;
+ use hdf5::{File, Result, Selection};
+ use json::JsonValue;
+ use rayon::prelude::*;
+ use serde_json::{Map, Value, json};
+ use std::io;
+ use std::sync::Arc;
+ use std::time::Instant;
+
+ /// Creates an error JSON response
+ fn error_response(message: impl Into<String>) -> Value {
+     json!({
+         "status": "error",
+         "message": message.into()
+     })
+ }
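+
+ // For example, error_response("HDF5 filename not provided") serializes to
+ // {"status":"error","message":"HDF5 filename not provided"}.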
+
+ /// h5 file validation.
+ /// Detects whether the HDF5 file contains a valid matrix dataset.
+ pub fn detect_hdf5_format(
+     hdf5_filename: &str,
+     matrix_name: &str,
+     row_dataset: &str,
+     col_dataset: &str,
+ ) -> Result<&'static str> {
+     let file = File::open(hdf5_filename)?;
+
+     // Check for matrix dataset (must be 2D)
+     let matrix_ok = file
+         .dataset(matrix_name)
+         .map(|dataset| dataset.shape().len() == 2)
+         .unwrap_or(false);
+
+     // Check for row dataset (must exist and be readable as VarLenUnicode)
+     let row_ok = file
+         .dataset(row_dataset)
+         .and_then(|ds| ds.read_1d::<hdf5::types::VarLenUnicode>())
+         .is_ok();
+
+     // Check for column dataset (must exist and be readable as VarLenUnicode)
+     let col_ok = file
+         .dataset(col_dataset)
+         .and_then(|ds| ds.read_1d::<hdf5::types::VarLenUnicode>())
+         .is_ok();
+
+     if matrix_ok && row_ok && col_ok {
+         Ok("matrix")
+     } else {
+         Ok("unknown")
+     }
+ }
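+
+ // Minimal usage sketch; the dataset names mirror the hardcoded values that
+ // validate_hdf5_file passes in below, and "matrix.h5" is a hypothetical path:
+ //
+ //     let format = detect_hdf5_format("matrix.h5", "matrix", "samples", "item")?;
+ //     assert!(format == "matrix" || format == "unknown");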
+
+ pub fn validate_hdf5_file(hdf5_filename: String) -> Result<()> {
+     let file = File::open(&hdf5_filename)?;
+     let matrix_name = "matrix";
+     let row_dataset = "samples";
+     let col_dataset = "item";
+     let file_format = detect_hdf5_format(&hdf5_filename, matrix_name, row_dataset, col_dataset)?;
+
+     let output = match file_format {
+         "matrix" => {
+             let dataset = file.dataset(matrix_name)?;
+             let matrix_shape = dataset.shape();
+             let datatype = dataset.dtype()?;
+
+             // Read row_dataset as VarLenUnicode
+             let row_dataset_data = file.dataset(row_dataset)?;
+             let row_data: Vec<String> = row_dataset_data
+                 .read_1d::<VarLenUnicode>()?
+                 .iter()
+                 .map(|s| s.to_string())
+                 .collect();
+
+             // Validate matrix data
+             let matrix_valid = if matrix_shape.len() == 2 && matrix_shape[0] > 0 && matrix_shape[1] > 0 {
+                 // Create a selection for a 1x1 slice at (0,0)
+                 let selection = Selection::from((0..1, 0..1));
+
+                 if datatype.is::<f64>() {
+                     dataset.read_slice_2d::<f64, _>(selection).is_ok()
+                 } else if datatype.is::<f32>() {
+                     dataset.read_slice_2d::<f32, _>(selection).is_ok()
+                 } else if datatype.is::<i32>() {
+                     dataset.read_slice_2d::<i32, _>(selection).is_ok()
+                 } else if datatype.is::<i64>() {
+                     dataset.read_slice_2d::<i64, _>(selection).is_ok()
+                 } else {
+                     false
+                 }
+             } else {
+                 false
+             };
+
+             json!({
+                 "status": if matrix_valid { "success" } else { "failure" },
+                 "message": if matrix_valid {
+                     "HDF5 matrix file loaded successfully"
+                 } else {
+                     "Invalid matrix structure"
+                 },
+                 "file_path": hdf5_filename,
+                 "format": "matrix",
+                 "matrix_dimensions": {
+                     "num_rows": matrix_shape.get(0).unwrap_or(&0),
+                     "num_columns": matrix_shape.get(1).unwrap_or(&0)
+                 },
+                 row_dataset.to_string(): row_data
+             })
+         }
+         _ => {
+             json!({
+                 "status": "failure",
+                 "message": format!(
+                     "Missing or invalid required datasets: matrix='{}', row_dataset='{}', col_dataset='{}'",
+                     matrix_name, row_dataset, col_dataset
+                 ),
+                 "file_path": hdf5_filename,
+                 "format": "unknown",
+                 "matrix_dimensions": {
+                     "num_rows": 0,
+                     "num_columns": 0
+                 }
+             })
+         }
+     };
+     println!("{}", output);
+     Ok(())
+ }
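+
+ // validate_hdf5_file is driven from main() via stdin, e.g.:
+ //     echo '{"validate":true,"hdf5_file":"matrix.h5"}' | ./target/release/readH5
+ // When the required datasets are missing, it prints the "unknown"-format failure
+ // JSON above rather than returning an Err, so callers still get parseable output.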
+
+ // Reading the h5 file
+
+ // Trait for converting types to f64, allowing lossy conversions
+ trait ToF64 {
+     fn to_f64(&self) -> f64;
+ }
+
+ impl ToF64 for f32 {
+     fn to_f64(&self) -> f64 {
+         *self as f64
+     }
+ }
+
+ impl ToF64 for f64 {
+     fn to_f64(&self) -> f64 {
+         *self
+     }
+ }
+
+ impl ToF64 for i32 {
+     fn to_f64(&self) -> f64 {
+         *self as f64
+     }
+ }
+
+ impl ToF64 for i64 {
+     fn to_f64(&self) -> f64 {
+         if self.abs() > (1 << 53) {
+             eprintln!("Warning: i64 value {} may lose precision when converted to f64", self);
+         }
+         *self as f64
+     }
+ }
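+
+ // Note: 1 << 53 = 9_007_199_254_740_992 (2^53) is the largest magnitude up to
+ // which every integer is exactly representable in an f64, hence the warning
+ // threshold above.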
+
+ // Process matrix data for a given type
+ fn process_data<T: ToF64 + Copy>(data: &[T], row_data: &[String]) -> Map<String, Value> {
+     let mut row_data_map = Map::new();
+     for (i, row) in row_data.iter().enumerate() {
+         if i < data.len() {
+             let value = data[i].to_f64();
+             row_data_map.insert(
+                 row.replace("\\", ""),
+                 if value.is_finite() {
+                     Value::from(value)
+                 } else {
+                     Value::Null
+                 },
+             );
+         }
+     }
+     row_data_map
+ }
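+
+ // For illustration (hypothetical labels and values): given data = [1.0_f64, f64::NAN]
+ // and row_data = ["s1", "s2"], process_data returns a map that serializes to
+ // {"s1":1.0,"s2":null}; backslashes in labels are stripped from the keys.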
+
+ // Data query
+ // Supports f32, f64, i32 and i64 datatypes for the matrix dataset.
+ // Uses the hardcoded "matrix", "samples" and "item" datasets.
+ // "samples" and "item" datasets are read as VarLenUnicode.
+ //
+ // # Arguments
+ // * `hdf5_filename` - Path to the HDF5 file
+ // * `qry` - Query (non-empty array)
+ //
+ // # Returns
+ // Prints a JSON object with matrix data for the queried items to stdout
+ fn query_dataset(hdf5_filename: String, qry: Vec<String>) -> Result<()> {
+     let overall_start_time = Instant::now();
+     let mut timings = Map::new();
+     timings.insert("query_count".to_string(), Value::from(qry.len()));
+
+     let file = match File::open(&hdf5_filename) {
+         Ok(f) => f,
+         Err(err) => {
+             println!("{}", error_response(format!("Failed to open HDF5 file: {}", err)));
+             return Ok(());
+         }
+     };
+
+     let col_dataset_name = String::from("item");
+     let col_dataset = match file.dataset(&col_dataset_name) {
+         Ok(ds) => ds,
+         Err(err) => {
+             println!(
+                 "{}",
+                 error_response(format!("Failed to open {} dataset: {}", col_dataset_name, err))
+             );
+             return Ok(());
+         }
+     };
+
+     let col_dataset_varlen = match col_dataset.read_1d::<VarLenUnicode>() {
+         Ok(g) => g,
+         Err(err) => {
+             println!(
+                 "{}",
+                 error_response(format!("Failed to read {}: {}", col_dataset_name, err))
+             );
+             return Ok(());
+         }
+     };
+     let col_data: Vec<String> = col_dataset_varlen.iter().map(|g| g.to_string()).collect();
+
+     let hashmap_start_time = Instant::now();
+     let col_data_to_index: std::collections::HashMap<String, usize> =
+         col_data.iter().enumerate().map(|(i, g)| (g.clone(), i)).collect();
+     timings.insert(
+         "build_hashmap_ms".to_string(),
+         Value::from(hashmap_start_time.elapsed().as_millis() as u64),
+     );
+
+     let row_dataset_name = String::from("samples");
+     let row_dataset = match file.dataset(&row_dataset_name) {
+         Ok(ds) => ds,
+         Err(err) => {
+             println!(
+                 "{}",
+                 error_response(format!("Failed to open {} dataset: {}", row_dataset_name, err))
+             );
+             return Ok(());
+         }
+     };
+     let row_dataset_varlen = match row_dataset.read_1d::<VarLenUnicode>() {
+         Ok(s) => s,
+         Err(err) => {
+             println!(
+                 "{}",
+                 error_response(format!("Failed to read {}: {}", row_dataset_name, err))
+             );
+             return Ok(());
+         }
+     };
+     let row_data: Vec<String> = row_dataset_varlen.iter().map(|s| s.to_string()).collect();
+
+     let matrix_dataset = match file.dataset("matrix") {
+         Ok(ds) => ds,
+         Err(err) => {
+             println!("{}", error_response(format!("Failed to open matrix dataset: {}", err)));
+             return Ok(());
+         }
+     };
+
+     let datatype = match matrix_dataset.dtype() {
+         Ok(dt) => dt,
+         Err(err) => {
+             println!("{}", error_response(format!("Failed to read matrix datatype: {}", err)));
+             return Ok(());
+         }
+     };
+
+     let col_data_map = Arc::new(std::sync::Mutex::new(Map::new()));
+     let thread_count = std::cmp::min(4, qry.len());
+     timings.insert("thread_count".to_string(), Value::from(thread_count));
+
+     let results: Vec<(String, Value)> = match rayon::ThreadPoolBuilder::new()
+         .num_threads(thread_count)
+         .build()
+     {
+         Ok(pool) => pool.install(|| {
+             qry.par_iter()
+                 .map(|query| {
+                     let query_start_time = Instant::now();
+                     let result = match col_data_to_index.get(query) {
+                         Some(&index) => {
+                             if index >= matrix_dataset.shape()[0] {
+                                 let mut error_map = Map::new();
+                                 error_map.insert(
+                                     "error".to_string(),
+                                     Value::String("Query index out of bounds".to_string()),
+                                 );
+                                 (query.clone(), Value::Object(error_map))
+                             } else {
+                                 let selection = Selection::from((index..index + 1, ..));
+                                 if datatype.is::<f64>() {
+                                     match matrix_dataset.read_slice_2d::<f64, _>(selection) {
+                                         Ok(data) => (
+                                             query.clone(),
+                                             json!({
+                                                 "dataId": query,
+                                                 row_dataset_name.clone(): process_data(data.as_slice().unwrap(), &row_data)
+                                             }),
+                                         ),
+                                         Err(err) => {
+                                             let mut error_map = Map::new();
+                                             error_map.insert(
+                                                 "error".to_string(),
+                                                 Value::String(format!("Failed to read f64 matrix values: {}", err)),
+                                             );
+                                             (query.clone(), Value::Object(error_map))
+                                         }
+                                     }
+                                 } else if datatype.is::<f32>() {
+                                     match matrix_dataset.read_slice_2d::<f32, _>(selection) {
+                                         Ok(data) => (
+                                             query.clone(),
+                                             json!({
+                                                 "dataId": query,
+                                                 row_dataset_name.clone(): process_data(data.as_slice().unwrap(), &row_data)
+                                             }),
+                                         ),
+                                         Err(err) => {
+                                             let mut error_map = Map::new();
+                                             error_map.insert(
+                                                 "error".to_string(),
+                                                 Value::String(format!("Failed to read f32 matrix values: {}", err)),
+                                             );
+                                             (query.clone(), Value::Object(error_map))
+                                         }
+                                     }
+                                 } else if datatype.is::<i32>() {
+                                     match matrix_dataset.read_slice_2d::<i32, _>(selection) {
+                                         Ok(data) => (
+                                             query.clone(),
+                                             json!({
+                                                 "dataId": query,
+                                                 row_dataset_name.clone(): process_data(data.as_slice().unwrap(), &row_data)
+                                             }),
+                                         ),
+                                         Err(err) => {
+                                             let mut error_map = Map::new();
+                                             error_map.insert(
+                                                 "error".to_string(),
+                                                 Value::String(format!("Failed to read i32 matrix values: {}", err)),
+                                             );
+                                             (query.clone(), Value::Object(error_map))
+                                         }
+                                     }
+                                 } else if datatype.is::<i64>() {
+                                     match matrix_dataset.read_slice_2d::<i64, _>(selection) {
+                                         Ok(data) => (
+                                             query.clone(),
+                                             json!({
+                                                 "dataId": query,
+                                                 row_dataset_name.clone(): process_data(data.as_slice().unwrap(), &row_data)
+                                             }),
+                                         ),
+                                         Err(err) => {
+                                             let mut error_map = Map::new();
+                                             error_map.insert(
+                                                 "error".to_string(),
+                                                 Value::String(format!("Failed to read i64 matrix values: {}", err)),
+                                             );
+                                             (query.clone(), Value::Object(error_map))
+                                         }
+                                     }
+                                 } else {
+                                     let mut error_map = Map::new();
+                                     error_map.insert(
+                                         "error".to_string(),
+                                         Value::String("Unsupported matrix datatype (expected f64, f32, i64 or i32)".to_string()),
+                                     );
+                                     (query.clone(), Value::Object(error_map))
+                                 }
+                             }
+                         }
+                         None => {
+                             let mut error_map = Map::new();
+                             error_map.insert(
+                                 "error".to_string(),
+                                 Value::String(format!("Query '{}' not found in {} dataset", query, col_dataset_name)),
+                             );
+                             (query.clone(), Value::Object(error_map))
+                         }
+                     };
+                     let elapsed_time = query_start_time.elapsed().as_millis() as u64;
+                     let mut query_timings = col_data_map.lock().unwrap();
+                     query_timings.insert(format!("{}_ms", query), Value::from(elapsed_time));
+                     result
+                 })
+                 .collect()
+         }),
+         Err(err) => {
+             timings.insert(
+                 "thread_pool_error".to_string(),
+                 Value::String(format!("Failed to create thread pool: {}", err)),
+             );
+             // Sequential fallback: same per-query logic without the rayon pool
+             qry.iter()
+                 .map(|query| {
+                     let query_start_time = Instant::now();
+                     let result = match col_data_to_index.get(query) {
+                         Some(&index) => {
+                             if index >= matrix_dataset.shape()[0] {
+                                 let mut error_map = Map::new();
+                                 error_map.insert(
+                                     "error".to_string(),
+                                     Value::String("Query index out of bounds".to_string()),
+                                 );
+                                 (query.clone(), Value::Object(error_map))
+                             } else {
+                                 let selection = Selection::from((index..index + 1, ..));
+                                 if datatype.is::<f64>() {
+                                     match matrix_dataset.read_slice_2d::<f64, _>(selection) {
+                                         Ok(data) => (
+                                             query.clone(),
+                                             json!({
+                                                 "dataId": query,
+                                                 row_dataset_name.clone(): process_data(data.as_slice().unwrap(), &row_data)
+                                             }),
+                                         ),
+                                         Err(err) => {
+                                             let mut error_map = Map::new();
+                                             error_map.insert(
+                                                 "error".to_string(),
+                                                 Value::String(format!("Failed to read f64 matrix values: {}", err)),
+                                             );
+                                             (query.clone(), Value::Object(error_map))
+                                         }
+                                     }
+                                 } else if datatype.is::<f32>() {
+                                     match matrix_dataset.read_slice_2d::<f32, _>(selection) {
+                                         Ok(data) => (
+                                             query.clone(),
+                                             json!({
+                                                 "dataId": query,
+                                                 row_dataset_name.clone(): process_data(data.as_slice().unwrap(), &row_data)
+                                             }),
+                                         ),
+                                         Err(err) => {
+                                             let mut error_map = Map::new();
+                                             error_map.insert(
+                                                 "error".to_string(),
+                                                 Value::String(format!("Failed to read f32 matrix values: {}", err)),
+                                             );
+                                             (query.clone(), Value::Object(error_map))
+                                         }
+                                     }
+                                 } else if datatype.is::<i32>() {
+                                     match matrix_dataset.read_slice_2d::<i32, _>(selection) {
+                                         Ok(data) => (
+                                             query.clone(),
+                                             json!({
+                                                 "dataId": query,
+                                                 row_dataset_name.clone(): process_data(data.as_slice().unwrap(), &row_data)
+                                             }),
+                                         ),
+                                         Err(err) => {
+                                             let mut error_map = Map::new();
+                                             error_map.insert(
+                                                 "error".to_string(),
+                                                 Value::String(format!("Failed to read i32 matrix values: {}", err)),
+                                             );
+                                             (query.clone(), Value::Object(error_map))
+                                         }
+                                     }
+                                 } else if datatype.is::<i64>() {
+                                     match matrix_dataset.read_slice_2d::<i64, _>(selection) {
+                                         Ok(data) => (
+                                             query.clone(),
+                                             json!({
+                                                 "dataId": query,
+                                                 row_dataset_name.clone(): process_data(data.as_slice().unwrap(), &row_data)
+                                             }),
+                                         ),
+                                         Err(err) => {
+                                             let mut error_map = Map::new();
+                                             error_map.insert(
+                                                 "error".to_string(),
+                                                 Value::String(format!("Failed to read i64 matrix values: {}", err)),
+                                             );
+                                             (query.clone(), Value::Object(error_map))
+                                         }
+                                     }
+                                 } else {
+                                     let mut error_map = Map::new();
+                                     error_map.insert(
+                                         "error".to_string(),
+                                         Value::String("Unsupported matrix datatype (expected f64, f32, i64 or i32)".to_string()),
+                                     );
+                                     (query.clone(), Value::Object(error_map))
+                                 }
+                             }
+                         }
+                         None => {
+                             let mut error_map = Map::new();
+                             error_map.insert(
+                                 "error".to_string(),
+                                 Value::String(format!("Query '{}' not found in {} dataset", query, col_dataset_name)),
+                             );
+                             (query.clone(), Value::Object(error_map))
+                         }
+                     };
+                     let elapsed_time = query_start_time.elapsed().as_millis() as u64;
+                     let mut query_timings = col_data_map.lock().unwrap();
+                     query_timings.insert(format!("{}_ms", query), Value::from(elapsed_time));
+                     result
+                 })
+                 .collect()
+         }
+     };
+
+     let mut col_data_map = col_data_map.lock().unwrap();
+     for (query, query_data) in results {
+         col_data_map.insert(query, query_data);
+     }
+
+     let output_json = json!({
+         "query_output": *col_data_map,
+         "timings": timings,
+         "total_time_ms": overall_start_time.elapsed().as_millis() as u64
+     });
+     println!("{}", output_json);
+     Ok(())
+ }
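+
+ // A sketch of the printed result for one found and one missing item (values are
+ // hypothetical). Note that the per-query "*_ms" timings are inserted into the
+ // same map as the results, so they appear under "query_output":
+ // {"query_output":{"A":{"dataId":"A","samples":{"s1":0.5}},"A_ms":3,
+ //                  "B":{"error":"Query 'B' not found in item dataset"},"B_ms":0},
+ //  "timings":{"query_count":2,"build_hashmap_ms":1,"thread_count":2},
+ //  "total_time_ms":5}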
+
+ /// Main function to handle both validation and read of the h5 file
+ fn main() -> Result<()> {
+     let mut input = String::new();
+     match io::stdin().read_line(&mut input) {
+         Ok(_bytes_read) => {
+             let input_json = match json::parse(&input) {
+                 Ok(json) => json,
+                 Err(_err) => {
+                     panic!("Invalid JSON input");
+                 }
+             };
+
+             // Extract HDF5 filename
+             let hdf5_filename = match input_json["hdf5_file"].as_str() {
+                 Some(x) => x.to_string(),
+                 None => {
+                     println!("{}", error_response("HDF5 filename not provided"));
+                     return Ok(());
+                 }
+             };
+
+             // h5 file validation
+             if input_json.has_key("validate") {
+                 let v: bool = match input_json["validate"].as_bool() {
+                     Some(x) => x,
+                     None => false,
+                 };
+                 if !v {
+                     println!("{}", error_response("The value of validate is invalid"));
+                     return Ok(());
+                 }
+                 let _ = validate_hdf5_file(hdf5_filename);
+             } else if input_json.has_key("query") {
+                 let qry: Vec<String> = match &input_json["query"] {
+                     JsonValue::Array(arr) => arr.iter().filter_map(|v| v.as_str().map(|s| s.to_string())).collect(),
+                     _ => vec![],
+                 };
+                 if !qry.is_empty() {
+                     query_dataset(hdf5_filename, qry)?;
+                 } else {
+                     println!("{}", error_response("query is empty"));
+                 };
+             } else {
+                 println!(
+                     "{}",
+                     error_response("validate or query has to be provided in input JSON.")
+                 );
+             }
+         }
+         Err(error) => {
+             println!("{}", error_response(format!("Error reading input: {}", error)));
+         }
+     }
+     Ok(())
+ }