@sjcrh/proteinpaint-rust 2.170.0 → 2.171.0-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/manhattan_plot.rs +207 -18
package/package.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "2.170.0",
2
+ "version": "2.171.0-0",
3
3
  "name": "@sjcrh/proteinpaint-rust",
4
4
  "type": "module",
5
5
  "description": "Rust-based utilities for proteinpaint",
@@ -3,7 +3,7 @@ use plotters::prelude::*;
3
3
  use plotters::style::ShapeStyle;
4
4
  use serde::{Deserialize, Serialize};
5
5
  use serde_json;
6
- use std::collections::HashMap;
6
+ use std::collections::{HashMap, HashSet};
7
7
  use std::convert::TryInto;
8
8
  use std::error::Error;
9
9
  use std::fs::File;
@@ -22,7 +22,10 @@ struct Input {
22
22
  plot_height: u64,
23
23
  device_pixel_ratio: f64,
24
24
  png_dot_radius: u64,
25
- log_cutoff: f64,
25
+ max_capped_points: u64,
26
+ hard_cap: f64,
27
+ bin_size: f64,
28
+ q_value_threshold: f64,
26
29
  }
27
30
 
28
31
  // chromosome info
@@ -59,6 +62,8 @@ struct InteractiveData {
59
62
  y_min: f64,
60
63
  y_max: f64,
61
64
  device_pixel_ratio: f64,
65
+ default_log_cutoff: f64,
66
+ has_capped_points: bool,
62
67
  }
63
68
 
64
69
  #[derive(Serialize)]
@@ -79,6 +84,134 @@ fn hex_to_rgb(hex: &str) -> Option<(u8, u8, u8)> {
79
84
  Some((r, g, b))
80
85
  }
81
86
 
87
// Helper function to calculate the default log cutoff from the GRIN2 data.
//
// The cutoff is the mean of the -log10 q-values that are strictly below `hard_cap`,
// skipping every index listed in `exclude_indices` (e.g. the 0.0 placeholders stored
// for zero q-values, which would otherwise drag the mean down).
//
// The mean is raised to a floor of 40 so the default cutoff is never very low, and is
// then limited to `hard_cap` so the default cutoff can never exceed the hard cap
// (this only matters when `hard_cap` itself is below the floor).
//
// Returns `hard_cap` when no value qualifies (empty input, every value at/above
// `hard_cap`, or every value excluded).
fn get_log_cutoff(ys: &[f64], hard_cap: f64, exclude_indices: &HashSet<usize>) -> f64 {
    // Lowest default cutoff we are willing to return (on the -log10 scale).
    const MIN_LOG_CUTOFF: f64 = 40.0;

    // Single pass: accumulate sum and count of the qualifying values without
    // materialising an intermediate vector.
    let (sum, count) = ys
        .iter()
        .enumerate()
        .filter(|&(i, &y)| y < hard_cap && !exclude_indices.contains(&i))
        .fold((0.0_f64, 0usize), |(sum, count), (_, &y)| (sum + y, count + 1));

    // Nothing below the hard cap (or everything excluded): default to hard_cap
    if count == 0 {
        return hard_cap;
    }

    let mean = sum / count as f64;
    mean.max(MIN_LOG_CUTOFF).min(hard_cap)
}
111
+
112
/// Calculates a dynamic y-axis cap for Manhattan plots to handle outliers gracefully.
///
/// # Problem
/// Manhattan plots often have a few extreme outliers (very significant p-values) that
/// compress the visual range for the majority of points. This function picks a y-axis
/// cap that shows most data at true scale and clamps only a small number of extreme
/// outliers.
///
/// # Algorithm
/// 1. **No outliers**: if no value exceeds `default_cap`, return the maximum value
///    (no capping needed).
/// 2. **Histogram binning**: values above `default_cap` are counted in
///    `floor((hard_cap - default_cap) / bin_size)` bins (at least one) of width
///    `bin_size`. Values above `hard_cap` are counted in the last bin, and so are
///    values at or just below `hard_cap` that fall past the last whole bin
///    (for example a value exactly equal to `hard_cap`).
/// 3. **Walk up**: starting from the lowest bin, stop at the first bin for which the
///    number of points in that bin or any higher bin is at most `max_capped_points`;
///    the candidate cap is that bin's upper bound.
/// 4. **Preserve visible outliers**: if the highest value in `(default_cap, hard_cap]`
///    lies above the candidate, the cap becomes that value plus `bin_size` so the
///    point renders at its true position. The result is then limited to `hard_cap`.
///
/// # Parameters
/// - `ys`: All y-values (-log10 q-values) in the plot
/// - `max_capped_points`: Maximum points allowed to be clamped to the cap (e.g., 5)
/// - `default_cap`: Starting threshold; points at or below this are never capped
///   (e.g., the value produced by `get_log_cutoff`)
/// - `hard_cap`: Absolute maximum y-axis value; points above are always clamped (e.g., 200)
/// - `bin_size`: Histogram bin width on -log10 scale (e.g., 10)
///
/// # Returns
/// - The maximum of `ys` when nothing exceeds `default_cap`
///   (`f64::NEG_INFINITY` for an empty slice).
/// - Otherwise a cap that is never greater than `hard_cap`.
/// - `hard_cap` when some value exceeds `default_cap` but `bin_size` is not a
///   finite positive number, since no histogram can be built in that case.
///
/// # Example
/// With `default_cap=40`, `hard_cap=200`, `bin_size=10`, `max_capped_points=5`:
/// - If 7 points are above 40, with two at 83 and 183 and five at/above 200:
///   Returns 200, so the points at 83 and 183 display at their true positions while
///   the 5 extreme outliers are clamped to 200
fn calculate_dynamic_y_cap(
    ys: &[f64],
    max_capped_points: usize,
    default_cap: f64,
    hard_cap: f64,
    bin_size: f64,
) -> f64 {
    // A zero, negative, NaN or infinite bin width cannot partition the range.
    // (A zero width would otherwise request usize::MAX bins and abort on allocation.)
    let bin_size_is_valid = bin_size.is_finite() && bin_size > 0.0;

    // Float-to-usize `as` casts saturate (negative/NaN -> 0), so `.max(1)` guarantees
    // at least one bin even when default_cap >= hard_cap.
    let num_bins = if bin_size_is_valid {
        (((hard_cap - default_cap) / bin_size) as usize).max(1)
    } else {
        1
    };
    let last_bin = num_bins - 1;

    let mut histogram = vec![0usize; num_bins];
    let mut max_y = f64::NEG_INFINITY;
    let mut max_y_below_hard_cap = f64::NEG_INFINITY; // Highest value that's not hard-capped
    let mut points_above_default = 0usize;

    // Single pass: find max and build histogram simultaneously
    for &y in ys {
        if y > max_y {
            max_y = y;
        }
        if y > default_cap {
            points_above_default += 1;
            let bin_idx = if y > hard_cap {
                last_bin
            } else {
                // Track the max y that's at or below the hard cap
                if y > max_y_below_hard_cap {
                    max_y_below_hard_cap = y;
                }
                // Clamp to the last bin: y == hard_cap, or a range that is not a whole
                // multiple of bin_size, would otherwise index one past the end.
                (((y - default_cap) / bin_size) as usize).min(last_bin)
            };
            histogram[bin_idx] += 1;
        }
    }

    // Case 1: No points exceed default cap - use actual max
    if max_y <= default_cap {
        return max_y;
    }

    // Without a usable bin width the walk below is meaningless; fall back to the hard cap.
    if !bin_size_is_valid {
        return hard_cap;
    }

    // Walk up from default_cap to hard_cap. At bin `i`, `points_above` is the number of
    // points counted in bin `i` or any higher bin.
    let mut points_above = points_above_default;

    for (i, &count) in histogram.iter().enumerate() {
        if points_above <= max_capped_points {
            // Found acceptable number of capped points
            let bin_upper_bound = default_cap + ((i + 1) as f64) * bin_size;

            // If a visible outlier (<= hard_cap) sits above this bin, extend the cap past
            // it so it renders at its true position; otherwise use the bin boundary.
            let cap = if max_y_below_hard_cap > bin_upper_bound {
                max_y_below_hard_cap + bin_size
            } else {
                bin_upper_bound
            };

            // Never exceed the hard cap
            return cap.min(hard_cap);
        }
        points_above -= count;
    }

    // Too many points remain in every bin: fall back to the hard cap
    hard_cap
}
214
+
82
215
  // Function to Build cumulative chromosome map
83
216
  fn cumulative_chrom(
84
217
  chrom_size: &HashMap<String, u64>,
@@ -121,8 +254,18 @@ fn cumulative_chrom(
121
254
  fn grin2_file_read(
122
255
  grin2_file: &str,
123
256
  chrom_data: &HashMap<String, ChromInfo>,
124
- log_cutoff: f64,
125
- ) -> Result<(Vec<u64>, Vec<f64>, Vec<String>, Vec<PointDetail>, Vec<usize>), Box<dyn Error>> {
257
+ q_value_threshold: f64,
258
+ ) -> Result<
259
+ (
260
+ Vec<u64>,
261
+ Vec<f64>,
262
+ Vec<String>,
263
+ Vec<PointDetail>,
264
+ Vec<usize>,
265
+ Vec<usize>,
266
+ ),
267
+ Box<dyn Error>,
268
+ > {
126
269
  // Default colours
127
270
  let mut colors: HashMap<String, String> = HashMap::new();
128
271
  colors.insert("gain".into(), "#FF4444".into());
@@ -136,6 +279,7 @@ fn grin2_file_read(
136
279
  let mut colors_vec = Vec::new();
137
280
  let mut point_details = Vec::new();
138
281
  let mut sig_indices: Vec<usize> = Vec::new();
282
+ let mut zero_q_indices: Vec<usize> = Vec::new();
139
283
 
140
284
  let grin2_file = File::open(grin2_file).expect("Failed to open grin2_result_file");
141
285
  let mut reader = BufReader::new(grin2_file);
@@ -224,9 +368,11 @@ fn grin2_file_read(
224
368
  _ => continue,
225
369
  };
226
370
 
227
- // Use log_cutoff for zero q-values to avoid -inf. These will be capped later in plotting at log_cutoff
371
+ // Use a placeholder for zero q-values - these will be updated later
372
+ // after we calculate the dynamic y_cap from the full dataset
228
373
  let neg_log10_q = if original_q_val == 0.0 {
229
- log_cutoff
374
+ zero_q_indices.push(mut_num);
375
+ 0.0 // Placeholder - will be set to y_cap later in plot_grin2_manhattan
230
376
  } else {
231
377
  -original_q_val.log10()
232
378
  };
@@ -242,7 +388,7 @@ fn grin2_file_read(
242
388
 
243
389
  // only add significant points for interactivity
244
390
  // We check against the original q-value here so we send back the correct values instead of the 1e-300 used for log transform
245
- if original_q_val <= 0.05 {
391
+ if original_q_val <= q_value_threshold {
246
392
  point_details.push(PointDetail {
247
393
  x: x_pos,
248
394
  y: neg_log10_q,
@@ -264,7 +410,7 @@ fn grin2_file_read(
264
410
  }
265
411
  }
266
412
 
267
- Ok((xs, ys, colors_vec, point_details, sig_indices))
413
+ Ok((xs, ys, colors_vec, point_details, sig_indices, zero_q_indices))
268
414
  }
269
415
 
270
416
  // Function to create the GRIN2 Manhattan plot
@@ -275,7 +421,10 @@ fn plot_grin2_manhattan(
275
421
  plot_height: u64,
276
422
  device_pixel_ratio: f64,
277
423
  png_dot_radius: u64,
278
- log_cutoff: f64,
424
+ bin_size: f64,
425
+ max_capped_points: u64,
426
+ hard_cap: f64,
427
+ q_value_threshold: f64,
279
428
  ) -> Result<(String, InteractiveData), Box<dyn Error>> {
280
429
  // ------------------------------------------------
281
430
  // 1. Build cumulative chromosome map
@@ -303,23 +452,55 @@ fn plot_grin2_manhattan(
303
452
  let mut colors_vec = Vec::new();
304
453
  let mut point_details = Vec::new();
305
454
  let mut sig_indices = Vec::new();
455
+ let mut zero_q_indices: Vec<usize> = Vec::new();
306
456
 
307
- if let Ok((x, y, c, pd, si)) = grin2_file_read(&grin2_result_file, &chrom_data, log_cutoff) {
457
+ if let Ok((x, y, c, pd, si, zq)) = grin2_file_read(&grin2_result_file, &chrom_data, q_value_threshold) {
308
458
  xs = x;
309
459
  ys = y;
310
460
  colors_vec = c;
311
461
  point_details = pd;
312
462
  sig_indices = si;
463
+ zero_q_indices = zq;
313
464
  }
314
465
 
315
466
  // ------------------------------------------------
316
- // 3. Y-axis scaling (cap at 40)
467
+ // 3. Calculate log_cutoff from data and update zero q-values
468
+ // ------------------------------------------------
469
+ // Convert zero_q_indices to HashSet for O(1) lookup when excluding placeholders
470
+ let zero_q_set: HashSet<usize> = zero_q_indices.iter().cloned().collect();
471
+ let log_cutoff = get_log_cutoff(&ys, hard_cap, &zero_q_set);
472
+
473
+ // ------------------------------------------------
474
+ // 4. Y-axis capping with dynamic cap
317
475
  // ------------------------------------------------
318
476
  let y_padding = png_dot_radius as f64;
319
477
  let y_min = 0.0 - y_padding;
320
- let y_cap = log_cutoff; // typically 40.0. Use the passed log_cutoff value that user will be able to modify in the future
321
- let y_max = if !ys.is_empty() {
478
+
479
+ // Dynamic y-cap calculation:
480
+ // - log_cutoff: the baseline cap (calculated from data mean)
481
+ // - max_capped_points: maximum number of points allowed above cap before raising it
482
+ // - hard_cap: absolute maximum cap regardless of data distribution
483
+ // - bin_size: size of bins for histogram approach
484
+ let max_capped_points = max_capped_points as usize;
485
+
486
+ let y_cap = calculate_dynamic_y_cap(&ys, max_capped_points, log_cutoff, hard_cap, bin_size);
487
+
488
+ let (y_max, has_capped_points) = if !ys.is_empty() {
322
489
  let max_y = ys.iter().cloned().fold(f64::MIN, f64::max);
490
+
491
+ // has_capped_points is true if any points exceed the default cap (log_cutoff)
492
+ let has_capped = max_y > log_cutoff;
493
+
494
+ // Set q=0 points (currently placeholders at 0.0) to y_cap so they appear at the top
495
+ for &idx in &zero_q_indices {
496
+ ys[idx] = y_cap;
497
+ }
498
+ for p in point_details.iter_mut() {
499
+ if p.q_value == 0.0 {
500
+ p.y = y_cap;
501
+ }
502
+ }
503
+
323
504
  if max_y > y_cap {
324
505
  // Clamp values above the cap
325
506
  for y in ys.iter_mut() {
@@ -332,12 +513,12 @@ fn plot_grin2_manhattan(
332
513
  p.y = y_cap;
333
514
  }
334
515
  }
335
- y_cap + 0.35 + y_padding
516
+ (y_cap + 0.35 + y_padding, has_capped)
336
517
  } else {
337
- max_y + 0.35 + y_padding
518
+ (max_y + 0.35 + y_padding, has_capped)
338
519
  }
339
520
  } else {
340
- 1.0 + y_padding
521
+ (1.0 + y_padding, false)
341
522
  };
342
523
 
343
524
  // ------------------------------------------------
@@ -480,6 +661,8 @@ fn plot_grin2_manhattan(
480
661
  y_min,
481
662
  y_max,
482
663
  device_pixel_ratio: dpr,
664
+ default_log_cutoff: log_cutoff,
665
+ has_capped_points,
483
666
  };
484
667
  Ok((png_data, interactive_data))
485
668
  }
@@ -506,7 +689,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
506
689
  let plot_height = &input_json.plot_height;
507
690
  let device_pixel_ratio = &input_json.device_pixel_ratio;
508
691
  let png_dot_radius = &input_json.png_dot_radius;
509
- let log_cutoff = &input_json.log_cutoff;
692
+ let max_capped_points = &input_json.max_capped_points;
693
+ let hard_cap = &input_json.hard_cap;
694
+ let bin_size = &input_json.bin_size;
695
+ let q_value_threshold = &input_json.q_value_threshold;
510
696
  if let Ok((base64_string, plot_data)) = plot_grin2_manhattan(
511
697
  grin2_file.clone(),
512
698
  chrom_size.clone(),
@@ -514,7 +700,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
514
700
  plot_height.clone(),
515
701
  device_pixel_ratio.clone(),
516
702
  png_dot_radius.clone(),
517
- log_cutoff.clone(),
703
+ bin_size.clone(),
704
+ max_capped_points.clone(),
705
+ hard_cap.clone(),
706
+ q_value_threshold.clone(),
518
707
  ) {
519
708
  let output = Output {
520
709
  png: base64_string,