@sjcrh/proteinpaint-rust 2.111.0 → 2.114.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,222 @@
1
+ // syntax:
2
+ // echo '{"hdf5_file":"/path/to/my/local/file.h5"}' | ./target/release/validateHDF5
3
+
4
+ use hdf5::types::VarLenAscii;
5
+ use hdf5::{File, Result};
6
+ use ndarray::Array1;
7
+ use ndarray::Dim;
8
+ use std::io;
9
+ use serde_json;
10
+
11
+ /// Detects the format of the HDF5 file
12
+ pub fn detect_hdf5_format(hdf5_filename: &str) -> Result<&'static str> {
13
+ let file = File::open(hdf5_filename)?;
14
+
15
+ // Check for dense format (has counts, gene_names, and samples datasets)
16
+ let has_counts = file.dataset("counts").is_ok();
17
+ let has_gene_names = file.dataset("gene_names").is_ok();
18
+ let has_samples = file.dataset("samples").is_ok();
19
+
20
+ // Check for sparse matrix format (has data group and sample_names)
21
+ let has_data_group = file.group("data").is_ok();
22
+ let has_sample_names = file.dataset("sample_names").is_ok();
23
+
24
+ if has_counts && has_gene_names && has_samples {
25
+ // eprintln!("Dense format detected");
26
+ Ok("dense")
27
+ } else if has_data_group && has_sample_names {
28
+ // eprintln!("Sparse format detected");
29
+ Ok("sparse")
30
+ } else {
31
+ // eprintln!("Unknown format detected");
32
+ Ok("unknown")
33
+ }
34
+ }
35
+
36
+ /// Validates and loads the HDF5 file
37
+ pub fn validate_hdf5_file(hdf5_filename: String) -> Result<()> {
38
+ // Open the HDF5 file
39
+ let file = File::open(&hdf5_filename)?;
40
+
41
+ // Detect file format
42
+ let file_format = detect_hdf5_format(&hdf5_filename)?;
43
+
44
+ // Get basic information about the file depending on format
45
+ let output = match file_format {
46
+ "dense" => {
47
+ // For dense format, get dimensions from the counts dataset
48
+ let ds_counts = file.dataset("counts")?;
49
+ let data_shape = ds_counts.shape();
50
+
51
+ // Read sample names using VarLenAscii
52
+ let mut sample_names: Vec<String> = Vec::new();
53
+ if let Ok(ds_samples) = file.dataset("samples") {
54
+ if let Ok(samples) = ds_samples.read_1d::<VarLenAscii>() {
55
+ for sample in samples.iter() {
56
+ sample_names.push(sample.to_string());
57
+ }
58
+ } else {
59
+ eprintln!("Error reading samples as VarLenAscii");
60
+ }
61
+ }
62
+
63
+ // Read gene names using VarLenAscii
64
+ let mut gene_names: Vec<String> = Vec::new();
65
+ if let Ok(ds_genes) = file.dataset("gene_ids") {
66
+ if let Ok(genes) = ds_genes.read_1d::<VarLenAscii>() {
67
+ for gene in genes.iter() {
68
+ gene_names.push(gene.to_string());
69
+ }
70
+ } else {
71
+ eprintln!("Error reading gene_ids as VarLenAscii");
72
+ }
73
+ } else {
74
+ eprintln!("Could not find 'gene_ids' dataset");
75
+ }
76
+
77
+ // Create JSON with both sample names and gene names
78
+ serde_json::json!({
79
+ "status": "success",
80
+ "message": "HDF5 file loaded successfully",
81
+ "file_path": hdf5_filename,
82
+ "format": "dense",
83
+ "sampleNames": sample_names,
84
+ "matrix_dimensions": {
85
+ "num_genes": data_shape[0],
86
+ "num_samples": data_shape[1]
87
+ }
88
+ })
89
+ }
90
+ "sparse" => {
91
+ // For sparse format, get dimensions from the data/dim dataset
92
+ let ds_dim = file.dataset("data/dim")?;
93
+ let data_dim: Array1<usize> = ds_dim.read::<usize, Dim<[usize; 1]>>()?;
94
+ let num_samples = data_dim[0];
95
+ let num_genes = data_dim[1];
96
+
97
+ // Read sample names using VarLenAscii
98
+ let mut sample_names: Vec<String> = Vec::new();
99
+ if let Ok(ds_samples) = file.dataset("sample_names") {
100
+ if let Ok(samples) = ds_samples.read_1d::<VarLenAscii>() {
101
+ for sample in samples.iter() {
102
+ sample_names.push(sample.to_string());
103
+ }
104
+ } else {
105
+ eprintln!("Error reading sample_names as VarLenAscii");
106
+ }
107
+ }
108
+
109
+ // Read gene names using VarLenAscii
110
+ let mut gene_names: Vec<String> = Vec::new();
111
+ if let Ok(ds_genes) = file.dataset("gene_names") {
112
+ if let Ok(genes) = ds_genes.read_1d::<VarLenAscii>() {
113
+ for gene in genes.iter() {
114
+ gene_names.push(gene.to_string());
115
+ }
116
+ } else {
117
+ eprintln!("Error reading gene_names as VarLenAscii");
118
+ }
119
+ } else {
120
+ eprintln!("Could not find 'gene_names' dataset, trying alternatives");
121
+ }
122
+
123
+ // Create JSON with the same structure as dense format
124
+ serde_json::json!({
125
+ "status": "success",
126
+ "message": "HDF5 file loaded successfully",
127
+ "file_path": hdf5_filename,
128
+ "format": "sparse",
129
+ "sampleNames": sample_names,
130
+ "matrix_dimensions": {
131
+ "num_genes": num_genes,
132
+ "num_samples": num_samples
133
+ }
134
+ })
135
+ }
136
+ _ => {
137
+ // For unknown format
138
+ serde_json::json!({
139
+ "status": "failure",
140
+ "message": "Unknown file format cannot be loaded successfully",
141
+ "file_path": hdf5_filename,
142
+ "format": "unknown",
143
+ "sampleNames": [],
144
+ "geneNames": []
145
+ })
146
+ }
147
+ };
148
+
149
+ // Print the output
150
+ println!("{}", output);
151
+
152
+ Ok(())
153
+ }
154
+
155
+ /// Main function to handle the validation process
156
+ fn main() -> Result<()> {
157
+ let mut input = String::new();
158
+ match io::stdin().read_line(&mut input) {
159
+ Ok(_bytes_read) => {
160
+ let input_json = json::parse(&input);
161
+ match input_json {
162
+ Ok(json_string) => {
163
+ // Extract HDF5 filename
164
+ let hdf5_filename = match json_string["hdf5_file"].as_str() {
165
+ Some(x) => x.to_string(),
166
+ None => {
167
+ eprintln!("HDF5 filename not provided");
168
+ println!(
169
+ "{}",
170
+ serde_json::json!({
171
+ "status": "error",
172
+ "message": "HDF5 filename not provided"
173
+ })
174
+ );
175
+ return Ok(());
176
+ }
177
+ };
178
+
179
+ // Log the start of validation
180
+ // let start_time = Instant::now();
181
+ // eprintln!("Starting validation of file: {}", hdf5_filename);
182
+
183
+ // Run the validation
184
+ if let Err(err) = validate_hdf5_file(hdf5_filename.clone()) {
185
+ eprintln!("Error validating HDF5 file: {:?}", err);
186
+ println!(
187
+ "{}",
188
+ serde_json::json!({
189
+ "status": "error",
190
+ "message": format!("Error validating HDF5 file: {}", err)
191
+ })
192
+ );
193
+ }
194
+
195
+ // Log completion time
196
+ // eprintln!("Validation completed in: {:?}", start_time.elapsed());
197
+ }
198
+ Err(error) => {
199
+ eprintln!("Incorrect JSON: {}", error);
200
+ println!(
201
+ "{}",
202
+ serde_json::json!({
203
+ "status": "error",
204
+ "message": format!("Invalid JSON input: {}", error)
205
+ })
206
+ );
207
+ }
208
+ }
209
+ }
210
+ Err(error) => {
211
+ eprintln!("Piping error: {}", error);
212
+ println!(
213
+ "{}",
214
+ serde_json::json!({
215
+ "status": "error",
216
+ "message": format!("Error reading input: {}", error)
217
+ })
218
+ );
219
+ }
220
+ }
221
+ Ok(())
222
+ }