@sjcrh/proteinpaint-server 2.27.1 → 2.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -69,12 +69,12 @@ to parse line as gene file, require following:
69
69
 
70
70
  */
71
71
 
72
- const checkReadingFrame = require('./checkReadingFrame')
72
+ import checkReadingFrame from './checkReadingFrame'
73
73
 
74
74
  //a valid exonFrames field can only contain members of validFrames, names -1, 0, 1, or 2
75
75
  const validFrames = new Set(['-1', '0', '1', '2'])
76
76
 
77
- exports.parseBedLine = function parseBedLine(l, enst2desc) {
77
+ export function parseBedLine(l, enst2desc) {
78
78
  const chr = l[0],
79
79
  chromstart = Number(l[2 - 1]),
80
80
  chromstop = l[3 - 1],
@@ -264,7 +264,7 @@ exports.parseBedLine = function parseBedLine(l, enst2desc) {
264
264
  }
265
265
  if (!tmp3.some(i => !validFrames.has(i))) {
266
266
  /* all fields are valid frames, reject values that are not -1, 0, 1, or 2 */
267
- checkReadingFrame.default(obj, exonframes)
267
+ checkReadingFrame(obj, exonframes)
268
268
  }
269
269
  return obj
270
270
  }
@@ -16,7 +16,7 @@ str:
16
16
  if the first coding exon has a frame of 1/2 but not 0, the "startCodonFrame" attribute will be added to obj
17
17
  so that it can be properly translatec
18
18
  */
19
- exports.default = (obj, str) => {
19
+ export default function (obj, str) {
20
20
  if (!obj.codingstart) {
21
21
  // not coding
22
22
  return
package/src/lines2R.js CHANGED
@@ -9,13 +9,13 @@ Arguments:
9
9
  Given an R script and a JavaScript array of input data lines, the data lines are streamed into the standard input of the R script. The standard output of the R script is then returned as a JavaScript array of output data lines.
10
10
  */
11
11
 
12
- const path = require('path')
13
- const fs = require('fs')
14
- const spawn = require('child_process').spawn
15
- const Readable = require('stream').Readable
16
- const serverconfig = require('./serverconfig')
12
+ import fs from 'fs'
13
+ import path from 'path'
14
+ import serverconfig from './serverconfig'
15
+ import { spawn } from 'child_process'
16
+ import { Readable } from 'stream'
17
17
 
18
- module.exports = async function lines2R(Rscript, lines, args = []) {
18
+ export default async function lines2R(Rscript, lines, args = []) {
19
19
  try {
20
20
  await fs.promises.stat(Rscript)
21
21
  } catch (e) {
@@ -55,10 +55,7 @@ module.exports = async function lines2R(Rscript, lines, args = []) {
55
55
  const errmsg = `R process emitted standard error\nR stderr: ${err}`
56
56
  reject(errmsg)
57
57
  }
58
- const out = stdout
59
- .join('')
60
- .trim()
61
- .split('\n')
58
+ const out = stdout.join('').trim().split('\n')
62
59
  resolve(out)
63
60
  })
64
61
  })
@@ -4,7 +4,7 @@ f{}
4
4
  returns a GDC filter object
5
5
  TODO support nested filter
6
6
  */
7
- exports.filter2GDCfilter = f => {
7
+ export function filter2GDCfilter(f) {
8
8
  // gdc filter
9
9
  const obj = {
10
10
  op: 'and',
@@ -52,6 +52,8 @@ if (!serverconfig.bigBedInfo) serverconfig.bigBedInfo = 'bigBedInfo'
52
52
  if (!serverconfig.bigBedNamedItems) serverconfig.bigBedNamedItems = 'bigBedNamedItems'
53
53
  if (!serverconfig.clustalo) serverconfig.clustalo = 'clustalo'
54
54
  if (!serverconfig.Rscript) serverconfig.Rscript = 'Rscript'
55
+ if (!serverconfig.gfServer) serverconfig.gfServer = 'gfServer'
56
+ if (!serverconfig.gfClient) serverconfig.gfClient = 'gfClient'
55
57
 
56
58
  /******************
57
59
  APPLY OVERRIDES
package/utils/burden.R CHANGED
@@ -6,17 +6,23 @@ rm(list=ls())
6
6
  suppressPackageStartupMessages(library(dplyr)) ### Qi changed to load plyr first, due to R message: If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
7
7
  suppressPackageStartupMessages(library(survival))
8
8
  library(jsonlite)
9
+ library(parallel)
9
10
 
10
11
  options(warn=-1)
11
12
 
12
13
  # Input from lines2R
13
14
  args <- commandArgs(trailingOnly = T)
14
- if (length(args) != 4) stop("Usage: Rscript burden.R in.json > results")
15
+ if (length(args) != 4) stop("Usage: Rscript burden.R in.json fitsData survData sampleData > results")
15
16
  infile <- args[1]
16
17
  fitsData <- args[2]
17
18
  survData <- args[3]
18
19
  sampleData <- args[4]
19
20
 
21
+ chc_nums <- c(1:32)[-c(2,5,14,20,23,26)] # CHCs. 6 out of 32 CHCs not used.
22
+ availCores <- detectCores()
23
+ if (is.na(availCores)) stop("cannot detect number of available cores")
24
+ cores <- ifelse(length(chc_nums) < availCores, length(chc_nums), availCores)
25
+
20
26
  #####################
21
27
  # Functions for our method
22
28
  # Ref: https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
@@ -130,12 +136,11 @@ newdata_chc_sampled$hdmtxdose=input$hdmtx # hdmtxval
130
136
  # 10="Retinoblastoma"
131
137
  # 11="Germ cell tumor";
132
138
 
133
-
134
- for(j in c(1:32)[-c(2,5,14,20,23,26)]){ ## CHCs. 6 out of 32 CHCs not used.
135
- tmp_Nj = predict(cphfits2[[j]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected')
136
- newdata_chc_sampled = data.frame(newdata_chc_sampled,tmp_Nj)
139
+ results <- mclapply(X = chc_nums, FUN = function(chc_num) predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected'), mc.cores = cores)
140
+ for(n in 1:length(results)){
141
+ newdata_chc_sampled = data.frame(newdata_chc_sampled,results[[n]])
137
142
  }
138
- names(newdata_chc_sampled)[25:50]=paste0("est_chc",c(1:32)[-c(2,5,14,20,23,26)])
143
+ names(newdata_chc_sampled)[25:50]=paste0("est_chc",chc_nums)
139
144
  newdata_chc_sampled = newdata_chc_sampled %>%
140
145
  mutate(sumN_tmp = rowSums(dplyr::select(.,starts_with("est_chc"))))%>%
141
146
  group_by(mrn) %>%
@@ -225,106 +230,128 @@ newdata_chc_sampled1=newdata_chc_sampled ## do this so each run on chc_num loops
225
230
  ##########################################################################
226
231
  person_burden=NULL
227
232
 
228
- for(chc_num in c(1:32)[-c(2,5,14,20,23,26)]){ #### Edgar, you may make this in separate runs to save time.
229
- # print(chc_num)
230
- newdata_chc_sampled=newdata_chc_sampled1
231
-
232
- # linear predictor
233
- newdata_chc_sampled$exp_lp = predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='risk',reference="zero")
234
-
235
- # Baseline nelson-aalan est
236
- # https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
237
- j=chc_num
238
- base = basehaz(cphfits2[[chc_num]],centered = F) # this is a cumulative hazard, so need to convert it into non-cumulative version
239
- #centered, if TRUE return data from a predicted survival curve at the mean values of the covariates fit$mean, if FALSE return a prediction for all covariates equal to zero.
240
- #request the hazard for that covariate combination from the survfit() function that is called by basehaz(). https://stats.stackexchange.com/questions/565210/about-getting-baseline-survival-probability-for-a-piecewise-cox-model-with-inter
241
-
242
-
243
- ### Max time in the data is 70.42. We need to estimate up to 90.
244
- #Yutaka: I think we should smooth the cumulative hazard and then take the derivative to get the hazard.
245
- #One thread I found on Web is: "As an approximation you can smooth the fitted baseline cumulative hazard (e.g. by package pspline) and ask for its derivative." Can you try using smooth.spline and smooth the cumulative hazard and then get the derivative? https://cran.r-project.org/web/packages/pspline/pspline.pdf
246
-
247
-
248
- #### Qi added: base is for different DX. Now we run within each pr, so neeed cumulaive hazrd for that pr only
249
- base=base[base$strata==paste("primary=",pr,sep=""),] #cumulative hazard
250
- base=base[base$time<=agecut,] ### shouldn't we use the same age cutoff as the survival function splines? Yes, do so.
251
-
252
- ##### study the smooth parameter. I think spar=1 is the best one to use (most smoothest)
253
- cumHspline=smooth.spline(base$time,base$hazard,spar=1)
254
- predcumhz=predict(cumHspline,seq(0,95,1)) ### predicted cumulative hazard
255
-
256
-
257
- ##### In order to use the above way to get dN0, do Daisuke's original way using cumhz difference. But the difference is that: we fit cumhz with smooth.spline and can extend it to 90 years old.
258
- base=data.frame(time=predcumhz$x,hazard=predcumhz$y) ##Daisuke used the cumHz, here we smoothed it and then use it.
259
- #### fitted values had <0 values in age 0-8 or so. change to 0 cumulative hazard.
260
- base$hazard[base$hazard<0]=0
261
- base2 = base %>%
262
- mutate(hazard2 = hazard - c(0,hazard[-length(hazard)])) %>%
263
- ungroup() %>% as.data.frame()
264
-
265
- base2 = base2 %>%
266
- mutate(time_cat = cut(time,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)) %>%
267
- ungroup()
268
-
269
- base3 = base2 %>%
270
- group_by(time_cat) %>%
271
- dplyr::summarize(dN0 = sum(hazard2)) %>%
272
- filter(!is.na(time_cat))
273
-
274
- ###############
275
- # BCCT
276
- ###############
277
- newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)
278
-
279
- #newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,90,5),right = FALSE, include.lowest = TRUE) this won't work, because the input donors file had "t.startage" up to 55 only
280
-
281
- newdata_chc_sampled = newdata_chc_sampled %>%
282
- left_join(base3,by="time_cat")
283
- newdata_chc_sampled$dN0 = ifelse(is.na(newdata_chc_sampled$dN0),0,newdata_chc_sampled$dN0)
284
-
285
- BCCT = newdata_chc_sampled %>%
286
- group_by(mrn) %>%
287
- mutate(BCCT_tmp = exp_lp*survprob*dN0) %>%
288
- mutate(BCCT = cumsum(BCCT_tmp)) %>%
289
- filter(t.startage>=20) %>%
290
- ungroup() %>%
291
- as.data.frame()
292
-
293
- for_web_BCCT = as.data.frame(tidyr::pivot_wider(BCCT,id_cols = mrn, names_from=time_cat,values_from=BCCT))
294
- for_web_BCCT =for_web_BCCT[,-1]
295
-
296
- #### for non-recurrent ones, maximum burden is 1 if the grouped conditions had only 1 condition. (11, 19,29) had only 1 conditons non-recurrent. (15,17,25) had 2 conditons. Take 25 as an example, it had obesity/underweight where underweight was so rare. So max 1 is still good.
297
- #### non-recurrent CHCs are 11, 15, 17, 19, 25, 29. ==I think making it maximum 1 is not good always, becuase these are grouped conditions. For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
298
- ncoltmp=75 ## from 20 to 94
299
- if(chc_num %in% c(11, 15, 17, 19, 25, 29)){
300
- for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,1))
301
- for_web_BCCT=as.data.frame(for_web_BCCT2)
302
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
303
- }
304
- #For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
305
- if(chc_num %in% c(10)){
306
- for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,3))
307
- for_web_BCCT=as.data.frame(for_web_BCCT2)
308
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
309
- }
310
- ##### if female condition 6, then it is 0 for males.
311
- if(chc_num %in% c(6) & sexval==1){
312
- for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
313
- for_web_BCCT=as.data.frame(for_web_BCCT2)
314
- colnames(for_web_BCCT)=colnames(person_burden[1:75])
315
- }
316
- ##### if male condition 7, then it is 0 for females.d
317
- if(chc_num %in% c(7) & sexval==0){
318
- for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
319
- for_web_BCCT=as.data.frame(for_web_BCCT2)
320
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
233
+ get_estimate <- function(chc_num) { #### Edgar, you may make this in separate runs to save time.
234
+ # print(chc_num)
235
+ newdata_chc_sampled=newdata_chc_sampled1
236
+
237
+ # linear predictor
238
+ newdata_chc_sampled$exp_lp = predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='risk',reference="zero")
239
+
240
+ # Baseline nelson-aalan est
241
+ # https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
242
+ j=chc_num
243
+ base = basehaz(cphfits2[[chc_num]],centered = F) # this is a cumulative hazard, so need to convert it into non-cumulative version
244
+ #centered, if TRUE return data from a predicted survival curve at the mean values of the covariates fit$mean, if FALSE return a prediction for all covariates equal to zero.
245
+ #request the hazard for that covariate combination from the survfit() function that is called by basehaz(). https://stats.stackexchange.com/questions/565210/about-getting-baseline-survival-probability-for-a-piecewise-cox-model-with-inter
246
+
247
+
248
+ ### Max time in the data is 70.42. We need to estimate up to 90.
249
+ #Yutaka: I think we should smooth the cumulative hazard and then take the derivative to get the hazard.
250
+ #One thread I found on Web is: "As an approximation you can smooth the fitted baseline cumulative hazard (e.g. by package pspline) and ask for its derivative." Can you try using smooth.spline and smooth the cumulative hazard and then get the derivative? https://cran.r-project.org/web/packages/pspline/pspline.pdf
251
+
252
+
253
+ #### Qi added: base is for different DX. Now we run within each pr, so neeed cumulaive hazrd for that pr only
254
+ base=base[base$strata==paste("primary=",pr,sep=""),] #cumulative hazard
255
+ base=base[base$time<=agecut,] ### shouldn't we use the same age cutoff as the survival function splines? Yes, do so.
256
+
257
+ ##### study the smooth parameter. I think spar=1 is the best one to use (most smoothest)
258
+ cumHspline=smooth.spline(base$time,base$hazard,spar=1)
259
+ predcumhz=predict(cumHspline,seq(0,95,1)) ### predicted cumulative hazard
260
+
261
+
262
+ ##### In order to use the above way to get dN0, do Daisuke's original way using cumhz difference. But the difference is that: we fit cumhz with smooth.spline and can extend it to 90 years old.
263
+ base=data.frame(time=predcumhz$x,hazard=predcumhz$y) ##Daisuke used the cumHz, here we smoothed it and then use it.
264
+ #### fitted values had <0 values in age 0-8 or so. change to 0 cumulative hazard.
265
+ base$hazard[base$hazard<0]=0
266
+ base2 = base %>%
267
+ mutate(hazard2 = hazard - c(0,hazard[-length(hazard)])) %>%
268
+ ungroup() %>% as.data.frame()
269
+
270
+ base2 = base2 %>%
271
+ mutate(time_cat = cut(time,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)) %>%
272
+ ungroup()
273
+
274
+ base3 = base2 %>%
275
+ group_by(time_cat) %>%
276
+ dplyr::summarize(dN0 = sum(hazard2)) %>%
277
+ filter(!is.na(time_cat))
278
+
279
+ ###############
280
+ # BCCT
281
+ ###############
282
+ newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)
283
+
284
+ #newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,90,5),right = FALSE, include.lowest = TRUE) this won't work, because the input donors file had "t.startage" up to 55 only
285
+
286
+ newdata_chc_sampled = newdata_chc_sampled %>%
287
+ left_join(base3,by="time_cat")
288
+ newdata_chc_sampled$dN0 = ifelse(is.na(newdata_chc_sampled$dN0),0,newdata_chc_sampled$dN0)
289
+
290
+ BCCT = newdata_chc_sampled %>%
291
+ group_by(mrn) %>%
292
+ mutate(BCCT_tmp = exp_lp*survprob*dN0) %>%
293
+ mutate(BCCT = cumsum(BCCT_tmp)) %>%
294
+ filter(t.startage>=20) %>%
295
+ ungroup() %>%
296
+ as.data.frame()
297
+
298
+ for_web_BCCT = as.data.frame(tidyr::pivot_wider(BCCT,id_cols = mrn, names_from=time_cat,values_from=BCCT))
299
+ for_web_BCCT =for_web_BCCT[,-1]
300
+
301
+ #### for non-recurrent ones, maximum burden is 1 if the grouped conditions had only 1 condition. (11, 19,29) had only 1 conditons non-recurrent. (15,17,25) had 2 conditons. Take 25 as an example, it had obesity/underweight where underweight was so rare. So max 1 is still good.
302
+ #### non-recurrent CHCs are 11, 15, 17, 19, 25, 29. ==I think making it maximum 1 is not good always, becuase these are grouped conditions. For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
303
+ ncoltmp=75 ## from 20 to 94
304
+ if(chc_num %in% c(11, 15, 17, 19, 25, 29)){
305
+ for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,1))
306
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
307
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
308
+ }
309
+ #For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
310
+ if(chc_num %in% c(10)){
311
+ for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,3))
312
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
313
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
314
+ }
315
+ ##### if female condition 6, then it is 0 for males.
316
+ if(chc_num %in% c(6) & sexval==1){
317
+ for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
318
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
319
+ colnames(for_web_BCCT)=colnames(person_burden[1:75])
320
+ }
321
+ ##### if male condition 7, then it is 0 for females.d
322
+ if(chc_num %in% c(7) & sexval==0){
323
+ for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
324
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
325
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
326
+ }
327
+
328
+ for_web_BCCT$chc=chc_num
329
+
330
+ return(for_web_BCCT)
321
331
  }
322
332
 
323
- for_web_BCCT$chc=chc_num
324
-
325
- person_burden=rbind(person_burden,for_web_BCCT)
326
-
327
- } #end of chc_num loop
333
+ # this serial loop works
334
+ # for(chc_num in chc_nums) {
335
+ # person_burden=rbind(person_burden, get_estimate(chc_num))
336
+ # }
337
+
338
+ # get estimates
339
+ # parallelize across chc_nums
340
+ results <- mclapply(X = chc_nums, FUN = get_estimate, mc.cores = cores)
341
+
342
+ # combine rows into person_burden data frame
343
+ for (n in 1:length(results)) {
344
+ row <- results[[n]]
345
+ if (!identical(names(row), names(results[[1]]))) {
346
+ # some rows may have empty column names because they
347
+ # used the columns names from the person_burden table, which
348
+ # is NULL when get_estimate() is run in parallel (see the
349
+ # if() statements in get_estimate())
350
+ # in this situation, use the column names from the first row
351
+ names(row) <- names(results[[1]])
352
+ }
353
+ person_burden <- rbind(person_burden, row)
354
+ }
328
355
 
329
356
  # person_burden[,30:31]
330
357
  # sum(person_burden[,31]) ## total burden at 50 years old. 8.971574 for this example.
package/utils/fastclust.R CHANGED
@@ -49,6 +49,13 @@ if (length(input$valueIsTransformed) == 0 || input$valueIsTransformed == FALSE)
49
49
  normalized_matrix <- input$matrix
50
50
  }
51
51
 
52
+ rownames(normalized_matrix) <- input$row_names
53
+ colnames(normalized_matrix) <- input$col_names
54
+ normalized_matrix <- na.omit(normalized_matrix) # Removes rows with NA values
55
+
56
+ #print ("normalized_matrix")
57
+ #print (dim(normalized_matrix))
58
+
52
59
  # For columns (i.e samples)
53
60
  RowDist <- dist(normalized_matrix, method = "euclidean") # Transposing the matrix
54
61
 
@@ -103,14 +110,16 @@ print ("Done")
103
110
  # Sorting the matrix
104
111
 
105
112
  SortedMatrix <- normalized_matrix[RowDend$order, ColumnDend$order]
106
- SortedRowNames <- input$row_names[RowDend$order]
107
- SortedColumnNames <- input$col_names[ColumnDend$order]
108
-
109
- m <- matrix(SortedMatrix,length(SortedRowNames),length(SortedColumnNames))
110
- colnames(m) <- SortedColumnNames
111
- rownames(m) <- SortedRowNames
112
- cat("rownames",RowDend$order,"\n",sep="\t")
113
- cat("colnames",ColumnDend$order,"\n",sep="\t")
113
+ SortedRowNames <- rownames(normalized_matrix)[RowDend$order]
114
+ SortedColumnNames <- colnames(normalized_matrix)[ColumnDend$order]
115
+
116
+ #m <- matrix(SortedMatrix,length(SortedRowNames),length(SortedColumnNames))
117
+ #colnames(m) <- SortedColumnNames
118
+ #rownames(m) <- SortedRowNames
119
+ cat("rowindexes",RowDend$order,"\n",sep="\t") # Prints out row indices
120
+ cat("colindexes",ColumnDend$order,"\n",sep="\t") # Prints out column indicies
121
+ cat("rownames",SortedRowNames,"\n",sep="\t") # Prints out row names
122
+ cat("colnames",SortedColumnNames,"\n",sep="\t") # Prints out column names
114
123
  cat ("OutputMatrix",normalized_matrix,"\n",sep="\t") # This outputs the 2D array in 1D column-wise. This is later converted to 2D array in nodejs.
115
124
 
116
125