npm - @sjcrh/proteinpaint-server - Versions diffs - 2.27.1 → 2.28.0 - Mend

@sjcrh/proteinpaint-server 2.27.1 → 2.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/cards/hic.json +55 -13
package/package.json +2 -2
package/routes/termdb.getCategories.ts +80 -0
package/routes/termdb.violin.ts +1 -1
package/server.js +1 -1
package/src/bedj.parseBed.js +3 -3
package/src/checkReadingFrame.js +1 -1
package/src/lines2R.js +7 -10
package/src/mds3.gdc.filter.js +1 -1
package/src/serverconfig.js +2 -0
package/utils/burden.R +131 -104
package/utils/fastclust.R +17 -8

package/src/bedj.parseBed.js CHANGED Viewed

@@ -69,12 +69,12 @@ to parse line as gene file, require following:
 */
-const checkReadingFrame = require('./checkReadingFrame')
+import checkReadingFrame from './checkReadingFrame'
 //a valid exonFrames field can only contain members of validFrames, names -1, 0, 1, or 2
 const validFrames = new Set(['-1', '0', '1', '2'])
-exports.parseBedLine = function parseBedLine(l, enst2desc) {
+export function parseBedLine(l, enst2desc) {
 	const chr = l[0],
 		chromstart = Number(l[2 - 1]),
 		chromstop = l[3 - 1],
@@ -264,7 +264,7 @@ exports.parseBedLine = function parseBedLine(l, enst2desc) {
 	}
 	if (!tmp3.some(i => !validFrames.has(i))) {
 		/* all fields are valid frames, reject values that are not -1, 0, 1, or 2 */
-		checkReadingFrame.default(obj, exonframes)
+		checkReadingFrame(obj, exonframes)
 	}
 	return obj
 }

package/src/checkReadingFrame.js CHANGED Viewed

@@ -16,7 +16,7 @@ str:
 if the first coding exon has a frame of 1/2 but not 0, the "startCodonFrame" attribute will be added to obj
 so that it can be properly translatec
 */
-exports.default = (obj, str) => {
+export default function (obj, str) {
 	if (!obj.codingstart) {
 		// not coding
 		return

package/src/lines2R.js CHANGED Viewed

@@ -9,13 +9,13 @@ Arguments:
 Given an R script and a JavaScript array of input data lines, the data lines are streamed into the standard input of the R script. The standard output of the R script is then returned as a JavaScript array of output data lines.
 */
-const path = require('path')
-const fs = require('fs')
-const spawn = require('child_process').spawn
-const Readable = require('stream').Readable
-const serverconfig = require('./serverconfig')
+import fs from 'fs'
+import path from 'path'
+import serverconfig from './serverconfig'
+import { spawn } from 'child_process'
+import { Readable } from 'stream'
-module.exports = async function lines2R(Rscript, lines, args = []) {
+export default async function lines2R(Rscript, lines, args = []) {
 	try {
 		await fs.promises.stat(Rscript)
 	} catch (e) {
@@ -55,10 +55,7 @@ module.exports = async function lines2R(Rscript, lines, args = []) {
 				const errmsg = `R process emitted standard error\nR stderr: ${err}`
 				reject(errmsg)
 			}
-			const out = stdout
-				.join('')
-				.trim()
-				.split('\n')
+			const out = stdout.join('').trim().split('\n')
 			resolve(out)
 		})
 	})

package/src/mds3.gdc.filter.js CHANGED Viewed

@@ -4,7 +4,7 @@ f{}
 returns a GDC filter object
 TODO support nested filter
 */
-exports.filter2GDCfilter = f => {
+export function filter2GDCfilter(f) {
 	// gdc filter
 	const obj = {
 		op: 'and',

package/src/serverconfig.js CHANGED Viewed

@@ -52,6 +52,8 @@ if (!serverconfig.bigBedInfo) serverconfig.bigBedInfo = 'bigBedInfo'
 if (!serverconfig.bigBedNamedItems) serverconfig.bigBedNamedItems = 'bigBedNamedItems'
 if (!serverconfig.clustalo) serverconfig.clustalo = 'clustalo'
 if (!serverconfig.Rscript) serverconfig.Rscript = 'Rscript'
+if (!serverconfig.gfServer) serverconfig.gfServer = 'gfServer'
+if (!serverconfig.gfClient) serverconfig.gfClient = 'gfClient'
 /******************
 	APPLY OVERRIDES

package/utils/burden.R CHANGED Viewed

@@ -6,17 +6,23 @@ rm(list=ls())
 suppressPackageStartupMessages(library(dplyr))  ### Qi changed to load plyr first, due to R message: If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
 suppressPackageStartupMessages(library(survival))
 library(jsonlite)
+library(parallel)
 options(warn=-1)
 # Input from lines2R
 args <- commandArgs(trailingOnly = T)
-if (length(args) != 4) stop("Usage: Rscript burden.R in.json > results")
+if (length(args) != 4) stop("Usage: Rscript burden.R in.json fitsData survData sampleData > results")
 infile <- args[1]
 fitsData <- args[2]
 survData <- args[3]
 sampleData <- args[4]
+chc_nums <- c(1:32)[-c(2,5,14,20,23,26)] # CHCs. 6 out of 32 CHCs not used.
+availCores <- detectCores()
+if (is.na(availCores)) stop("cannot detect number of available cores")
+cores <- ifelse(length(chc_nums) < availCores, length(chc_nums), availCores)
 #####################
 # Functions for our method
 # Ref: https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
@@ -130,12 +136,11 @@ newdata_chc_sampled$hdmtxdose=input$hdmtx # hdmtxval
 #	10="Retinoblastoma"
 #	11="Germ cell tumor";
-for(j in c(1:32)[-c(2,5,14,20,23,26)]){ ## CHCs. 6 out of 32 CHCs not used.
-	tmp_Nj = predict(cphfits2[[j]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected')
-	newdata_chc_sampled = data.frame(newdata_chc_sampled,tmp_Nj)
+results <- mclapply(X = chc_nums, FUN = function(chc_num) predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected'), mc.cores = cores)
+for(n in 1:length(results)){
+	newdata_chc_sampled = data.frame(newdata_chc_sampled,results[[n]])
 }
-names(newdata_chc_sampled)[25:50]=paste0("est_chc",c(1:32)[-c(2,5,14,20,23,26)])
+names(newdata_chc_sampled)[25:50]=paste0("est_chc",chc_nums)
 newdata_chc_sampled = newdata_chc_sampled %>%
 	mutate(sumN_tmp = rowSums(dplyr::select(.,starts_with("est_chc"))))%>%
 	group_by(mrn) %>%
@@ -225,106 +230,128 @@ newdata_chc_sampled1=newdata_chc_sampled ## do this so each run on chc_num loops
 ##########################################################################
 person_burden=NULL
-for(chc_num in c(1:32)[-c(2,5,14,20,23,26)]){  #### Edgar, you may make this in separate runs to save time.
-# print(chc_num)
-newdata_chc_sampled=newdata_chc_sampled1
-# linear predictor
-newdata_chc_sampled$exp_lp = predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='risk',reference="zero")
-# Baseline nelson-aalan est
-# https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
-j=chc_num
-base = basehaz(cphfits2[[chc_num]],centered = F) # this is a cumulative hazard, so need to convert it into non-cumulative version
-#centered,	if TRUE return data from a predicted survival curve at the mean values of the covariates fit$mean, if FALSE return a prediction for all covariates equal to zero.
-#request the hazard for that covariate combination from the survfit() function that is called by basehaz(). https://stats.stackexchange.com/questions/565210/about-getting-baseline-survival-probability-for-a-piecewise-cox-model-with-inter
-### Max time in the data is 70.42. We need to estimate up to 90.
-#Yutaka: I think we should smooth the cumulative hazard and then take the derivative to get the hazard.
-#One thread I found on Web is: "As an approximation you can smooth the fitted baseline cumulative hazard (e.g. by package pspline) and ask for its derivative."  Can you try using smooth.spline and smooth the cumulative hazard and then get the derivative?  https://cran.r-project.org/web/packages/pspline/pspline.pdf
-#### Qi added: base is for different DX. Now we run within each pr, so neeed cumulaive hazrd for that pr only
-base=base[base$strata==paste("primary=",pr,sep=""),] #cumulative hazard
-base=base[base$time<=agecut,]  ### shouldn't we use the same age cutoff as the survival function splines? Yes, do so.
-##### study the smooth parameter. I think spar=1 is the best one to use (most smoothest)
-cumHspline=smooth.spline(base$time,base$hazard,spar=1)
-predcumhz=predict(cumHspline,seq(0,95,1))  ### predicted cumulative hazard
-##### In order to use the above way to get dN0, do Daisuke's original way using cumhz difference. But the  difference is that: we fit cumhz with smooth.spline and can extend it to 90 years old.
-base=data.frame(time=predcumhz$x,hazard=predcumhz$y)  ##Daisuke used the cumHz, here we smoothed it and then use it.
-#### fitted values had <0 values in age 0-8 or so. change to 0 cumulative hazard.
-base$hazard[base$hazard<0]=0
-base2 = base %>%
-	mutate(hazard2 = hazard - c(0,hazard[-length(hazard)])) %>%
-	ungroup() %>% as.data.frame()
-base2 = base2 %>%
-	mutate(time_cat = cut(time,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)) %>%
-	ungroup()
-base3 = base2 %>%
-	group_by(time_cat) %>%
-	dplyr::summarize(dN0 = sum(hazard2)) %>%
-	filter(!is.na(time_cat))
-###############
-# BCCT
-###############
-newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)
-#newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,90,5),right = FALSE, include.lowest = TRUE) this won't work, because the input donors file had "t.startage" up to 55 only
-newdata_chc_sampled = newdata_chc_sampled %>%
-	left_join(base3,by="time_cat")
-newdata_chc_sampled$dN0 = ifelse(is.na(newdata_chc_sampled$dN0),0,newdata_chc_sampled$dN0)
-BCCT = newdata_chc_sampled %>%
-	group_by(mrn) %>%
-	mutate(BCCT_tmp = exp_lp*survprob*dN0) %>%
-	mutate(BCCT = cumsum(BCCT_tmp)) %>%
-	filter(t.startage>=20) %>%
-	ungroup() %>%
-	as.data.frame()
-for_web_BCCT = as.data.frame(tidyr::pivot_wider(BCCT,id_cols = mrn, names_from=time_cat,values_from=BCCT))
-for_web_BCCT =for_web_BCCT[,-1]
-#### for non-recurrent ones, maximum burden is 1 if the grouped conditions had only 1 condition. (11, 19,29) had only 1 conditons non-recurrent. (15,17,25) had 2 conditons. Take 25 as an example, it had obesity/underweight where underweight was so rare. So max 1 is still good.
-#### non-recurrent CHCs are 11, 15, 17, 19, 25, 29. ==I think making it maximum 1 is not good always, becuase these are grouped conditions. For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
-ncoltmp=75  ## from 20 to 94
-if(chc_num %in% c(11, 15, 17, 19, 25, 29)){
-for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,1))
-for_web_BCCT=as.data.frame(for_web_BCCT2)
-colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
-}
-#For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
-if(chc_num %in% c(10)){
-for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,3))
-for_web_BCCT=as.data.frame(for_web_BCCT2)
-colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
-}
-##### if female condition 6, then it is 0 for males.
-if(chc_num %in% c(6) & sexval==1){
-	for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
-for_web_BCCT=as.data.frame(for_web_BCCT2)
-colnames(for_web_BCCT)=colnames(person_burden[1:75])
-}
-##### if male condition 7, then it is 0 for females.d
-if(chc_num %in% c(7) & sexval==0){
-	for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
-for_web_BCCT=as.data.frame(for_web_BCCT2)
-colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
+get_estimate <- function(chc_num) {  #### Edgar, you may make this in separate runs to save time.
+	# print(chc_num)
+	newdata_chc_sampled=newdata_chc_sampled1
+	# linear predictor
+	newdata_chc_sampled$exp_lp = predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='risk',reference="zero")
+	# Baseline nelson-aalan est
+	# https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
+	j=chc_num
+	base = basehaz(cphfits2[[chc_num]],centered = F) # this is a cumulative hazard, so need to convert it into non-cumulative version
+	#centered,	if TRUE return data from a predicted survival curve at the mean values of the covariates fit$mean, if FALSE return a prediction for all covariates equal to zero.
+	#request the hazard for that covariate combination from the survfit() function that is called by basehaz(). https://stats.stackexchange.com/questions/565210/about-getting-baseline-survival-probability-for-a-piecewise-cox-model-with-inter
+	### Max time in the data is 70.42. We need to estimate up to 90.
+	#Yutaka: I think we should smooth the cumulative hazard and then take the derivative to get the hazard.
+	#One thread I found on Web is: "As an approximation you can smooth the fitted baseline cumulative hazard (e.g. by package pspline) and ask for its derivative."  Can you try using smooth.spline and smooth the cumulative hazard and then get the derivative?  https://cran.r-project.org/web/packages/pspline/pspline.pdf
+	#### Qi added: base is for different DX. Now we run within each pr, so neeed cumulaive hazrd for that pr only
+	base=base[base$strata==paste("primary=",pr,sep=""),] #cumulative hazard
+	base=base[base$time<=agecut,]  ### shouldn't we use the same age cutoff as the survival function splines? Yes, do so.
+	##### study the smooth parameter. I think spar=1 is the best one to use (most smoothest)
+	cumHspline=smooth.spline(base$time,base$hazard,spar=1)
+	predcumhz=predict(cumHspline,seq(0,95,1))  ### predicted cumulative hazard
+	##### In order to use the above way to get dN0, do Daisuke's original way using cumhz difference. But the  difference is that: we fit cumhz with smooth.spline and can extend it to 90 years old.
+	base=data.frame(time=predcumhz$x,hazard=predcumhz$y)  ##Daisuke used the cumHz, here we smoothed it and then use it.
+	#### fitted values had <0 values in age 0-8 or so. change to 0 cumulative hazard.
+	base$hazard[base$hazard<0]=0
+	base2 = base %>%
+		mutate(hazard2 = hazard - c(0,hazard[-length(hazard)])) %>%
+		ungroup() %>% as.data.frame()
+	base2 = base2 %>%
+		mutate(time_cat = cut(time,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)) %>%
+		ungroup()
+	base3 = base2 %>%
+		group_by(time_cat) %>%
+		dplyr::summarize(dN0 = sum(hazard2)) %>%
+		filter(!is.na(time_cat))
+	###############
+	# BCCT
+	###############
+	newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)
+	#newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,90,5),right = FALSE, include.lowest = TRUE) this won't work, because the input donors file had "t.startage" up to 55 only
+	newdata_chc_sampled = newdata_chc_sampled %>%
+		left_join(base3,by="time_cat")
+	newdata_chc_sampled$dN0 = ifelse(is.na(newdata_chc_sampled$dN0),0,newdata_chc_sampled$dN0)
+	BCCT = newdata_chc_sampled %>%
+		group_by(mrn) %>%
+		mutate(BCCT_tmp = exp_lp*survprob*dN0) %>%
+		mutate(BCCT = cumsum(BCCT_tmp)) %>%
+		filter(t.startage>=20) %>%
+		ungroup() %>%
+		as.data.frame()
+	for_web_BCCT = as.data.frame(tidyr::pivot_wider(BCCT,id_cols = mrn, names_from=time_cat,values_from=BCCT))
+	for_web_BCCT =for_web_BCCT[,-1]
+	#### for non-recurrent ones, maximum burden is 1 if the grouped conditions had only 1 condition. (11, 19,29) had only 1 conditons non-recurrent. (15,17,25) had 2 conditons. Take 25 as an example, it had obesity/underweight where underweight was so rare. So max 1 is still good.
+	#### non-recurrent CHCs are 11, 15, 17, 19, 25, 29. ==I think making it maximum 1 is not good always, becuase these are grouped conditions. For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
+	ncoltmp=75  ## from 20 to 94
+	if(chc_num %in% c(11, 15, 17, 19, 25, 29)){
+	for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,1))
+	for_web_BCCT=as.data.frame(for_web_BCCT2)
+	colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
+	}
+	#For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
+	if(chc_num %in% c(10)){
+	for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,3))
+	for_web_BCCT=as.data.frame(for_web_BCCT2)
+	colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
+	}
+	##### if female condition 6, then it is 0 for males.
+	if(chc_num %in% c(6) & sexval==1){
+		for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
+	for_web_BCCT=as.data.frame(for_web_BCCT2)
+	colnames(for_web_BCCT)=colnames(person_burden[1:75])
+	}
+	##### if male condition 7, then it is 0 for females.d
+	if(chc_num %in% c(7) & sexval==0){
+		for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
+	for_web_BCCT=as.data.frame(for_web_BCCT2)
+	colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
+	}
+	for_web_BCCT$chc=chc_num
+	return(for_web_BCCT)
 }
-for_web_BCCT$chc=chc_num
-person_burden=rbind(person_burden,for_web_BCCT)
-} #end of chc_num loop
+# this serial loop works
+# for(chc_num in chc_nums) {
+# 	person_burden=rbind(person_burden, get_estimate(chc_num))
+# }
+# get estimates
+# parallelize across chc_nums
+results <- mclapply(X = chc_nums, FUN = get_estimate, mc.cores = cores)
+# combine rows into person_burden data frame
+for (n in 1:length(results)) {
+  row <- results[[n]]
+  if (!identical(names(row), names(results[[1]]))) {
+    # some rows may have empty column names because they
+    # used the columns names from the person_burden table, which
+    # is NULL when get_estimate() is run in parallel (see the
+    # if() statements in get_estimate())
+    # in this situation, use the column names from the first row
+    names(row) <- names(results[[1]])
+  }
+  person_burden <- rbind(person_burden, row)
+}
 # person_burden[,30:31]
 # sum(person_burden[,31])  ## total burden at 50 years old. 8.971574 for this example.

package/utils/fastclust.R CHANGED Viewed

@@ -49,6 +49,13 @@ if (length(input$valueIsTransformed) == 0 || input$valueIsTransformed == FALSE)
  normalized_matrix <- input$matrix
 }
+rownames(normalized_matrix) <- input$row_names
+colnames(normalized_matrix) <- input$col_names
+normalized_matrix <- na.omit(normalized_matrix) # Removes rows with NA values
+#print ("normalized_matrix")
+#print (dim(normalized_matrix))
 # For columns (i.e samples)
 RowDist <- dist(normalized_matrix, method = "euclidean") # Transposing the matrix
@@ -103,14 +110,16 @@ print ("Done")
 # Sorting the matrix
 SortedMatrix  <- normalized_matrix[RowDend$order, ColumnDend$order]
-SortedRowNames <- input$row_names[RowDend$order]
-SortedColumnNames <- input$col_names[ColumnDend$order]
-m <- matrix(SortedMatrix,length(SortedRowNames),length(SortedColumnNames))
-colnames(m) <- SortedColumnNames
-rownames(m) <- SortedRowNames
-cat("rownames",RowDend$order,"\n",sep="\t")
-cat("colnames",ColumnDend$order,"\n",sep="\t")
+SortedRowNames <- rownames(normalized_matrix)[RowDend$order]
+SortedColumnNames <- colnames(normalized_matrix)[ColumnDend$order]
+#m <- matrix(SortedMatrix,length(SortedRowNames),length(SortedColumnNames))
+#colnames(m) <- SortedColumnNames
+#rownames(m) <- SortedRowNames
+cat("rowindexes",RowDend$order,"\n",sep="\t") # Prints out row indices
+cat("colindexes",ColumnDend$order,"\n",sep="\t") # Prints out column indicies
+cat("rownames",SortedRowNames,"\n",sep="\t") # Prints out row names
+cat("colnames",SortedColumnNames,"\n",sep="\t") # Prints out column names
 cat ("OutputMatrix",normalized_matrix,"\n",sep="\t") # This outputs the 2D array in 1D column-wise. This is later converted to 2D array in nodejs.