@sjcrh/proteinpaint-rust 2.170.0 → 2.170.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/manhattan_plot.rs +207 -18
package/package.json
CHANGED
package/src/manhattan_plot.rs
CHANGED
|
@@ -3,7 +3,7 @@ use plotters::prelude::*;
|
|
|
3
3
|
use plotters::style::ShapeStyle;
|
|
4
4
|
use serde::{Deserialize, Serialize};
|
|
5
5
|
use serde_json;
|
|
6
|
-
use std::collections::HashMap;
|
|
6
|
+
use std::collections::{HashMap, HashSet};
|
|
7
7
|
use std::convert::TryInto;
|
|
8
8
|
use std::error::Error;
|
|
9
9
|
use std::fs::File;
|
|
@@ -22,7 +22,10 @@ struct Input {
|
|
|
22
22
|
plot_height: u64,
|
|
23
23
|
device_pixel_ratio: f64,
|
|
24
24
|
png_dot_radius: u64,
|
|
25
|
-
|
|
25
|
+
max_capped_points: u64,
|
|
26
|
+
hard_cap: f64,
|
|
27
|
+
bin_size: f64,
|
|
28
|
+
q_value_threshold: f64,
|
|
26
29
|
}
|
|
27
30
|
|
|
28
31
|
// chromosome info
|
|
@@ -59,6 +62,8 @@ struct InteractiveData {
|
|
|
59
62
|
y_min: f64,
|
|
60
63
|
y_max: f64,
|
|
61
64
|
device_pixel_ratio: f64,
|
|
65
|
+
default_log_cutoff: f64,
|
|
66
|
+
has_capped_points: bool,
|
|
62
67
|
}
|
|
63
68
|
|
|
64
69
|
#[derive(Serialize)]
|
|
@@ -79,6 +84,134 @@ fn hex_to_rgb(hex: &str) -> Option<(u8, u8, u8)> {
|
|
|
79
84
|
Some((r, g, b))
|
|
80
85
|
}
|
|
81
86
|
|
|
87
|
+
/// Computes the default -log10(q) cutoff from the y-values read from the GRIN2 file.
///
/// The cutoff is the arithmetic mean of every value that is strictly below
/// `hard_cap` and whose index is not listed in `exclude_indices`, with a floor
/// of 40. Per the original author's note, a cutoff that is too low can cause an
/// error when the histogram bins are set up in the dynamic y-cap calculation.
///
/// # Parameters
/// - `ys`: the -log10 q-values.
/// - `hard_cap`: values at or above this are left out of the mean.
/// - `exclude_indices`: positions in `ys` to skip (e.g. the 0.0 placeholders
///   stored for zero q-values), so they do not drag the mean down.
///
/// # Returns
/// `hard_cap` when no value survives the filtering; otherwise the larger of
/// the mean and 40.
fn get_log_cutoff(ys: &[f64], hard_cap: f64, exclude_indices: &HashSet<usize>) -> f64 {
    // Accumulate the running total and the number of contributing values in one pass.
    let (total, kept) = ys
        .iter()
        .enumerate()
        .filter(|(idx, value)| !exclude_indices.contains(idx) && **value < hard_cap)
        .fold((0.0_f64, 0usize), |(acc, n), (_, value)| (acc + value, n + 1));

    // Everything was at/above the hard cap or excluded: fall back to the hard cap.
    if kept == 0 {
        return hard_cap;
    }

    (total / kept as f64).max(40.0)
}
|
|
111
|
+
|
|
112
|
+
/// Calculates a dynamic y-axis cap for Manhattan plots to handle outliers gracefully.
///
/// # Problem
/// Manhattan plots often have a few extreme outliers (very significant p-values) that
/// compress the visual range for the majority of points. This function finds a
/// y-axis cap that:
/// - Shows most data at true scale
/// - Caps only a small number of extreme outliers
/// - Keeps visible outliers (at or below the hard cap) at their true positions
///
/// # Algorithm
/// 1. **Histogram binning**: Partition the range `(default_cap, hard_cap]` into
///    fixed-size bins. Values above `hard_cap` are counted in the last bin, and any
///    value whose computed bin index would fall past the end (for example a value
///    exactly equal to `hard_cap`) is also counted in the last bin.
/// 2. **No outliers**: If `max_y <= default_cap`, return `max_y` (no capping needed).
/// 3. **Walk up**: Starting from the lowest bin, stop at the first bin where at most
///    `max_capped_points` points remain above it.
/// 4. **Preserve visible outliers**: If the highest value that is at or below
///    `hard_cap` lies above the chosen bin boundary, extend the cap to that value
///    plus one `bin_size`, never exceeding `hard_cap`.
///
/// # Parameters
/// - `ys`: All y-values (-log10 q-values) in the plot
/// - `max_capped_points`: Maximum points allowed to be clamped to the cap (e.g., 5)
/// - `default_cap`: Starting threshold; points at or below this are never capped
/// - `hard_cap`: Absolute maximum y-axis value; points above are always clamped (e.g., 200)
/// - `bin_size`: Histogram bin width on -log10 scale (e.g., 10)
///
/// # Returns
/// `max_y` when no value exceeds `default_cap` (this is `f64::NEG_INFINITY` for an
/// empty `ys`); otherwise a cap that is at most `hard_cap`.
///
/// # Example
/// With `default_cap=40`, `hard_cap=200`, `bin_size=10`, `max_capped_points=5`:
/// - If 7 points are above 40, with two at 83 and 183 and five at/above 200:
///   Returns 200, so the points at 83 and 183 display at their true positions while
///   the 5 extreme outliers are clamped to 200
fn calculate_dynamic_y_cap(
    ys: &[f64],
    max_capped_points: usize,
    default_cap: f64,
    hard_cap: f64,
    bin_size: f64,
) -> f64 {
    // The histogram needs at least one bin. A zero `bin_size` makes the quotient
    // infinite, which would otherwise saturate to usize::MAX and abort on allocation,
    // so any non-finite or sub-1 quotient collapses to a single bin.
    let raw_bins = (hard_cap - default_cap) / bin_size;
    let num_bins = if raw_bins.is_finite() && raw_bins >= 1.0 {
        raw_bins as usize
    } else {
        1
    };
    let last_bin = num_bins - 1;

    let mut histogram = vec![0usize; num_bins];
    let mut max_y = f64::NEG_INFINITY;
    let mut max_y_below_hard_cap = f64::NEG_INFINITY; // Highest value that is not hard-capped
    let mut points_above_default = 0usize;

    // Single pass: find the maxima and build the histogram simultaneously
    for &y in ys {
        if y > max_y {
            max_y = y;
        }
        if y > default_cap {
            points_above_default += 1;
            if y > hard_cap {
                histogram[last_bin] += 1;
            } else {
                // Track the max y that's at or below the hard cap
                if y > max_y_below_hard_cap {
                    max_y_below_hard_cap = y;
                }
                // Clamp the index: a value equal to `hard_cap` (or one sitting in the
                // truncated remainder of the range) computes an index of `num_bins`
                // or more, which would be out of bounds.
                let bin_idx = (((y - default_cap) / bin_size) as usize).min(last_bin);
                histogram[bin_idx] += 1;
            }
        }
    }

    // Case 1: No points exceed default cap - use actual max
    if max_y <= default_cap {
        return max_y;
    }

    // Walk up from default_cap to hard_cap. `points_above` never underflows because
    // the histogram counts sum to `points_above_default`.
    let mut points_above = points_above_default;

    for (i, &count) in histogram.iter().enumerate() {
        if points_above <= max_capped_points {
            // Found an acceptable number of capped points
            let bin_upper_bound = default_cap + ((i + 1) as f64) * bin_size;

            // The cap should be:
            // 1. Above max_y_below_hard_cap (so those points render at true position)
            // 2. At most hard_cap
            // 3. The bin boundary when no visible outlier lies above it
            let cap = if max_y_below_hard_cap > bin_upper_bound {
                // There's a visible outlier above this bin - extend cap to show it
                (max_y_below_hard_cap + bin_size).min(hard_cap)
            } else {
                bin_upper_bound.min(hard_cap)
            };

            return cap;
        }
        points_above -= count;
    }

    // Too many points remain above every bin: fall back to the hard cap
    hard_cap
}
|
|
214
|
+
|
|
82
215
|
// Function to Build cumulative chromosome map
|
|
83
216
|
fn cumulative_chrom(
|
|
84
217
|
chrom_size: &HashMap<String, u64>,
|
|
@@ -121,8 +254,18 @@ fn cumulative_chrom(
|
|
|
121
254
|
fn grin2_file_read(
|
|
122
255
|
grin2_file: &str,
|
|
123
256
|
chrom_data: &HashMap<String, ChromInfo>,
|
|
124
|
-
|
|
125
|
-
) -> Result<
|
|
257
|
+
q_value_threshold: f64,
|
|
258
|
+
) -> Result<
|
|
259
|
+
(
|
|
260
|
+
Vec<u64>,
|
|
261
|
+
Vec<f64>,
|
|
262
|
+
Vec<String>,
|
|
263
|
+
Vec<PointDetail>,
|
|
264
|
+
Vec<usize>,
|
|
265
|
+
Vec<usize>,
|
|
266
|
+
),
|
|
267
|
+
Box<dyn Error>,
|
|
268
|
+
> {
|
|
126
269
|
// Default colours
|
|
127
270
|
let mut colors: HashMap<String, String> = HashMap::new();
|
|
128
271
|
colors.insert("gain".into(), "#FF4444".into());
|
|
@@ -136,6 +279,7 @@ fn grin2_file_read(
|
|
|
136
279
|
let mut colors_vec = Vec::new();
|
|
137
280
|
let mut point_details = Vec::new();
|
|
138
281
|
let mut sig_indices: Vec<usize> = Vec::new();
|
|
282
|
+
let mut zero_q_indices: Vec<usize> = Vec::new();
|
|
139
283
|
|
|
140
284
|
let grin2_file = File::open(grin2_file).expect("Failed to open grin2_result_file");
|
|
141
285
|
let mut reader = BufReader::new(grin2_file);
|
|
@@ -224,9 +368,11 @@ fn grin2_file_read(
|
|
|
224
368
|
_ => continue,
|
|
225
369
|
};
|
|
226
370
|
|
|
227
|
-
// Use
|
|
371
|
+
// Use a placeholder for zero q-values - these will be updated later
|
|
372
|
+
// after we calculate the dynamic y_cap from the full dataset
|
|
228
373
|
let neg_log10_q = if original_q_val == 0.0 {
|
|
229
|
-
|
|
374
|
+
zero_q_indices.push(mut_num);
|
|
375
|
+
0.0 // Placeholder - will be set to y_cap later in plot_grin2_manhattan
|
|
230
376
|
} else {
|
|
231
377
|
-original_q_val.log10()
|
|
232
378
|
};
|
|
@@ -242,7 +388,7 @@ fn grin2_file_read(
|
|
|
242
388
|
|
|
243
389
|
// only add significant points for interactivity
|
|
244
390
|
// We check against the original q-value here so we send back the correct values instead of the 0.0 placeholder stored for zero q-values
|
|
245
|
-
if original_q_val <=
|
|
391
|
+
if original_q_val <= q_value_threshold {
|
|
246
392
|
point_details.push(PointDetail {
|
|
247
393
|
x: x_pos,
|
|
248
394
|
y: neg_log10_q,
|
|
@@ -264,7 +410,7 @@ fn grin2_file_read(
|
|
|
264
410
|
}
|
|
265
411
|
}
|
|
266
412
|
|
|
267
|
-
Ok((xs, ys, colors_vec, point_details, sig_indices))
|
|
413
|
+
Ok((xs, ys, colors_vec, point_details, sig_indices, zero_q_indices))
|
|
268
414
|
}
|
|
269
415
|
|
|
270
416
|
// Function to create the GRIN2 Manhattan plot
|
|
@@ -275,7 +421,10 @@ fn plot_grin2_manhattan(
|
|
|
275
421
|
plot_height: u64,
|
|
276
422
|
device_pixel_ratio: f64,
|
|
277
423
|
png_dot_radius: u64,
|
|
278
|
-
|
|
424
|
+
bin_size: f64,
|
|
425
|
+
max_capped_points: u64,
|
|
426
|
+
hard_cap: f64,
|
|
427
|
+
q_value_threshold: f64,
|
|
279
428
|
) -> Result<(String, InteractiveData), Box<dyn Error>> {
|
|
280
429
|
// ------------------------------------------------
|
|
281
430
|
// 1. Build cumulative chromosome map
|
|
@@ -303,23 +452,55 @@ fn plot_grin2_manhattan(
|
|
|
303
452
|
let mut colors_vec = Vec::new();
|
|
304
453
|
let mut point_details = Vec::new();
|
|
305
454
|
let mut sig_indices = Vec::new();
|
|
455
|
+
let mut zero_q_indices: Vec<usize> = Vec::new();
|
|
306
456
|
|
|
307
|
-
if let Ok((x, y, c, pd, si)) = grin2_file_read(&grin2_result_file, &chrom_data,
|
|
457
|
+
if let Ok((x, y, c, pd, si, zq)) = grin2_file_read(&grin2_result_file, &chrom_data, q_value_threshold) {
|
|
308
458
|
xs = x;
|
|
309
459
|
ys = y;
|
|
310
460
|
colors_vec = c;
|
|
311
461
|
point_details = pd;
|
|
312
462
|
sig_indices = si;
|
|
463
|
+
zero_q_indices = zq;
|
|
313
464
|
}
|
|
314
465
|
|
|
315
466
|
// ------------------------------------------------
|
|
316
|
-
// 3.
|
|
467
|
+
// 3. Calculate log_cutoff from data and update zero q-values
|
|
468
|
+
// ------------------------------------------------
|
|
469
|
+
// Convert zero_q_indices to HashSet for O(1) lookup when excluding placeholders
|
|
470
|
+
let zero_q_set: HashSet<usize> = zero_q_indices.iter().cloned().collect();
|
|
471
|
+
let log_cutoff = get_log_cutoff(&ys, hard_cap, &zero_q_set);
|
|
472
|
+
|
|
473
|
+
// ------------------------------------------------
|
|
474
|
+
// 4. Y-axis capping with dynamic cap
|
|
317
475
|
// ------------------------------------------------
|
|
318
476
|
let y_padding = png_dot_radius as f64;
|
|
319
477
|
let y_min = 0.0 - y_padding;
|
|
320
|
-
|
|
321
|
-
|
|
478
|
+
|
|
479
|
+
// Dynamic y-cap calculation:
|
|
480
|
+
// - log_cutoff: the baseline cap (calculated from data mean)
|
|
481
|
+
// - max_capped_points: maximum number of points allowed above cap before raising it
|
|
482
|
+
// - hard_cap: absolute maximum cap regardless of data distribution
|
|
483
|
+
// - bin_size: size of bins for histogram approach
|
|
484
|
+
let max_capped_points = max_capped_points as usize;
|
|
485
|
+
|
|
486
|
+
let y_cap = calculate_dynamic_y_cap(&ys, max_capped_points, log_cutoff, hard_cap, bin_size);
|
|
487
|
+
|
|
488
|
+
let (y_max, has_capped_points) = if !ys.is_empty() {
|
|
322
489
|
let max_y = ys.iter().cloned().fold(f64::MIN, f64::max);
|
|
490
|
+
|
|
491
|
+
// has_capped_points is true if any points exceed the default cap (log_cutoff)
|
|
492
|
+
let has_capped = max_y > log_cutoff;
|
|
493
|
+
|
|
494
|
+
// Set q=0 points (currently placeholders at 0.0) to y_cap so they appear at the top
|
|
495
|
+
for &idx in &zero_q_indices {
|
|
496
|
+
ys[idx] = y_cap;
|
|
497
|
+
}
|
|
498
|
+
for p in point_details.iter_mut() {
|
|
499
|
+
if p.q_value == 0.0 {
|
|
500
|
+
p.y = y_cap;
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
|
|
323
504
|
if max_y > y_cap {
|
|
324
505
|
// Clamp values above the cap
|
|
325
506
|
for y in ys.iter_mut() {
|
|
@@ -332,12 +513,12 @@ fn plot_grin2_manhattan(
|
|
|
332
513
|
p.y = y_cap;
|
|
333
514
|
}
|
|
334
515
|
}
|
|
335
|
-
y_cap + 0.35 + y_padding
|
|
516
|
+
(y_cap + 0.35 + y_padding, has_capped)
|
|
336
517
|
} else {
|
|
337
|
-
max_y + 0.35 + y_padding
|
|
518
|
+
(max_y + 0.35 + y_padding, has_capped)
|
|
338
519
|
}
|
|
339
520
|
} else {
|
|
340
|
-
1.0 + y_padding
|
|
521
|
+
(1.0 + y_padding, false)
|
|
341
522
|
};
|
|
342
523
|
|
|
343
524
|
// ------------------------------------------------
|
|
@@ -480,6 +661,8 @@ fn plot_grin2_manhattan(
|
|
|
480
661
|
y_min,
|
|
481
662
|
y_max,
|
|
482
663
|
device_pixel_ratio: dpr,
|
|
664
|
+
default_log_cutoff: log_cutoff,
|
|
665
|
+
has_capped_points,
|
|
483
666
|
};
|
|
484
667
|
Ok((png_data, interactive_data))
|
|
485
668
|
}
|
|
@@ -506,7 +689,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
506
689
|
let plot_height = &input_json.plot_height;
|
|
507
690
|
let device_pixel_ratio = &input_json.device_pixel_ratio;
|
|
508
691
|
let png_dot_radius = &input_json.png_dot_radius;
|
|
509
|
-
let
|
|
692
|
+
let max_capped_points = &input_json.max_capped_points;
|
|
693
|
+
let hard_cap = &input_json.hard_cap;
|
|
694
|
+
let bin_size = &input_json.bin_size;
|
|
695
|
+
let q_value_threshold = &input_json.q_value_threshold;
|
|
510
696
|
if let Ok((base64_string, plot_data)) = plot_grin2_manhattan(
|
|
511
697
|
grin2_file.clone(),
|
|
512
698
|
chrom_size.clone(),
|
|
@@ -514,7 +700,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
514
700
|
plot_height.clone(),
|
|
515
701
|
device_pixel_ratio.clone(),
|
|
516
702
|
png_dot_radius.clone(),
|
|
517
|
-
|
|
703
|
+
bin_size.clone(),
|
|
704
|
+
max_capped_points.clone(),
|
|
705
|
+
hard_cap.clone(),
|
|
706
|
+
q_value_threshold.clone(),
|
|
518
707
|
) {
|
|
519
708
|
let output = Output {
|
|
520
709
|
png: base64_string,
|