@sjcrh/proteinpaint-server 2.26.1 → 2.27.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/utils/burden.R ADDED
@@ -0,0 +1,334 @@
1
+
2
+ ##### This code takes about 30 seconds to run. When user input the parameters (sexval to hdmtxval), run this for the original data and 20 for the bootstraped data at the same time, so we can have the burdern and 95% CI in about 30 seconds.
3
+
4
+ rm(list=ls())
5
+
6
+ suppressPackageStartupMessages(library(dplyr)) ### Qi changed to load plyr first, due to R message: If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
7
+ suppressPackageStartupMessages(library(survival))
8
+ library(jsonlite)
9
+
10
+ options(warn=-1)
11
+
12
+ # Input from lines2R
13
+ args <- commandArgs(trailingOnly = T)
14
+ if (length(args) != 4) stop("Usage: Rscript burden.R in.json > results")
15
+ infile <- args[1]
16
+ fitsData <- args[2]
17
+ survData <- args[3]
18
+ sampleData <- args[4]
19
+
20
+ #####################
21
+ # Functions for our method
22
+ # Ref: https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
23
+ #####################
24
+ # setwd("R:/Biostatistics/Biostatistics2/Qi/QiCommon/St Jude/Nature Review/CHCs/App/Rdata")
25
+
26
+ load(fitsData)
27
+ load(survData)
28
+ # survs[[1]]
29
+
30
+ ############################ These are the input values in APP that users can change. Edgar, these should be the same as the APP before, variable names and units. #############
31
+ ### Input the primary DX.
32
+ # pr=5
33
+ agecut=40 ##### Edgar, This is not an user input paramter, but we input this. This depends on the DX. For example, here for CNS we use 40. For HL DX, it is 55. I will give this value for each DX.
34
+
35
+ # # # Input person's values, 18 input X's , plus the input primary DX
36
+ # sexval=1 #sex, take value 1 for male and 0 for female
37
+ # whiteval=1 # Race white or not, 1 for white, 0 for non-white
38
+ # agedxval=6 # age at primary cancer DX
39
+
40
+ # #### Chemotherapy
41
+ # steroidval=0 #Steroids 1 for yes 0 for no
42
+ # bleoval=0; ##Bleomycin
43
+ # vcrval=12; #Vincristine
44
+ # etopval=2500; #Etoposide
45
+ # itmtval=0; #Intrathecal Methotrexate
46
+ # cedval=1.6 # Cyclophosphamide, 0.7692 mean 7692.
47
+ # cispval=300 #Cisplatin
48
+ # doxval=0 #Anthracycline, 3 mean 300 ml/m2
49
+ # carboval=0 ## Carboplatin
50
+ # hdmtxval=0 ## High-Dose Methotrexate
51
+
52
+ # # Radiation
53
+ # brainval=5.4 #Brain, 5.4 means 54Gy, 5400 cGy. #####Same for all RT doses.#####
54
+ # chestval=2.4 # chest/neck RT, 2.4 for 24 Gy
55
+ # heartval=0 # Heart RT
56
+ # pelvisval=0 #pelvis RT
57
+ # abdval=2.4 # Abdominal RT
58
+
59
+ ####################################################################################
60
+
61
+ ##### if no TX, use these.
62
+ # steroidval=0; bleoval=0; vcrval=0; etopval=0; itmtval=0; cedval=0; cispval=0; brainval=0;
63
+ # doxval=0; chestval=0; abdval=0;
64
+
65
+ # survs[[1]]
66
+
67
+ ############### no TX
68
+ # steroidval=0; bleoval=0; vcrval=0; etopval=0; itmtval=0; cedval=0; cispval=0; brainval=0; doxval=0; chestval=0; abdval=0; heartval=0; pelvisval=0; carboval=0; hdmtxval=0
69
+
70
+ # Qi made many newdata_chc_sampled so we have 1000 times more donors -- but in different files.
71
+ load(sampleData)
72
+
73
+ newdata_chc_sampled=do.call("rbind", replicate(6,newdata_chc_sampled, simplify = FALSE))
74
+ newdata_chc_sampled$t.startage=seq(5,70,1)
75
+ newdata_chc_sampled$t.endage=seq(6,71,1)
76
+ ### originally data fit to 60 only. using cphfits can get est up to 60 only. ==> later I further cut at 50 or so to fit lines, becuase original data had 95th percentile around age 50 or so.
77
+ newdata_chc_sampled=newdata_chc_sampled[newdata_chc_sampled$t.endage<=60,]
78
+
79
+ input <- fromJSON(infile)
80
+ # paste(names(input), input, sep = ":", collapse = ",")
81
+ pr=input$diaggrp
82
+ sexval=input$sex
83
+ newdata_chc_sampled$sex=input$sex # sexval
84
+ newdata_chc_sampled$white=input$white # whiteval
85
+ newdata_chc_sampled$agedx2=input$agedx # agedxval
86
+ newdata_chc_sampled$steroid=input$steroid # steroidval
87
+ newdata_chc_sampled$bleodose=input$bleo # bleoval
88
+ newdata_chc_sampled$vcrdose=input$vcr # vcrval
89
+ newdata_chc_sampled$etopdose=input$etop # etopval
90
+ newdata_chc_sampled$itmtxdose=input$itmt # itmtval
91
+ newdata_chc_sampled$ced_sum2=input$ced # cedval
92
+ newdata_chc_sampled$cisplatdose=input$cisp # cispval
93
+ newdata_chc_sampled$brainrad2=input$brain # brainval
94
+ newdata_chc_sampled$doxed_sum2=input$dox # doxval
95
+ newdata_chc_sampled$chestrad2=input$chest # chestval
96
+ newdata_chc_sampled$abdrad2=input$abd # abdval
97
+ newdata_chc_sampled$heartradboth2=input$heart # heartval
98
+ newdata_chc_sampled$pelvisrad2=input$pelvis # pelvisval
99
+ newdata_chc_sampled$carboplatdose=input$carbo # carboval
100
+ newdata_chc_sampled$hdmtxdose=input$hdmtx # hdmtxval
101
+
102
+ # newdata_chc_sampled$sex=sexval
103
+ # newdata_chc_sampled$white=whiteval
104
+ # newdata_chc_sampled$agedx2=agedxval
105
+ # newdata_chc_sampled$steroid=steroidval
106
+ # newdata_chc_sampled$bleodose=bleoval
107
+ # newdata_chc_sampled$vcrdose=vcrval
108
+ # newdata_chc_sampled$etopdose=etopval
109
+ # newdata_chc_sampled$itmtxdose=itmtval
110
+ # newdata_chc_sampled$ced_sum2=cedval
111
+ # newdata_chc_sampled$cisplatdose=cispval
112
+ # newdata_chc_sampled$brainrad2=brainval
113
+ # newdata_chc_sampled$doxed_sum2=doxval
114
+ # newdata_chc_sampled$chestrad2=chestval
115
+ # newdata_chc_sampled$abdrad2=abdval
116
+ # newdata_chc_sampled$heartradboth2=heartval
117
+ # newdata_chc_sampled$pelvisrad2=pelvisval
118
+ # newdata_chc_sampled$carboplatdose=carboval
119
+ # newdata_chc_sampled$hdmtxdose=hdmtxval
120
+
121
+ # 1="Acute lymphoblastic leukemia"
122
+ # 2="AML"
123
+ # 3="Hodgkin lymphoma"
124
+ # 4="Non-Hodgkin lymphoma"
125
+ # 5="Central nervous system"
126
+ # 6="Bone tumor"
127
+ # 7="STS"
128
+ # 8="Wilms tumor"
129
+ # 9="Neuroblastoma"
130
+ # 10="Retinoblastoma"
131
+ # 11="Germ cell tumor";
132
+
133
+
134
+ for(j in c(1:32)[-c(2,5,14,20,23,26)]){ ## CHCs. 6 out of 32 CHCs not used.
135
+ tmp_Nj = predict(cphfits2[[j]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected')
136
+ newdata_chc_sampled = data.frame(newdata_chc_sampled,tmp_Nj)
137
+ }
138
+ names(newdata_chc_sampled)[25:50]=paste0("est_chc",c(1:32)[-c(2,5,14,20,23,26)])
139
+ newdata_chc_sampled = newdata_chc_sampled %>%
140
+ mutate(sumN_tmp = rowSums(dplyr::select(.,starts_with("est_chc"))))%>%
141
+ group_by(mrn) %>%
142
+ mutate(sumN_obs = cumsum(sumN_tmp)) %>%
143
+ as.data.frame()
144
+
145
+ ##Qi: the sumN here depends on all the 26 grouped conditions. So the input X's all matter. That is, if sex is not in a CHC of interest, it would make a difference here on sumN (becuase sex was on some CHCs), and hence make a difference on burden of that CHC even that it is not in the cphfits of that CHC.
146
+ newdata_chc_sampled = newdata_chc_sampled %>%
147
+ group_by(mrn) %>%
148
+ mutate(chc20 = sumN_obs[t.endage == 20]) %>%
149
+ ungroup() %>%
150
+ as.data.frame()
151
+ newdata_chc_sampled$death =1
152
+ newdata_chc_sampled$obsCHCat20 = newdata_chc_sampled$current.chc
153
+
154
+
155
+ # survival probability
156
+ # https://stats.stackexchange.com/questions/288393/calculating-survival-probability-per-person-at-time-t-from-cox-ph
157
+
158
+
159
+ newdata_chc_sampled$survprob = exp(-predict(survs[[1]],newdata=data.frame(newdata_chc_sampled,primary=pr),type='expected'))
160
+
161
+ #----------------------------------------------------------------------------------------------------------------#
162
+ ##### Qi added the below "cumprod" for survival by time t. But need to figure out: What is the "survprob" in BCCT formulat? Should it be survival of the segment, or survival by time t? == need to figure out with YY. Discussed, YY confirmed my way: survival prob in the formula is cumulative, not for that segment.
163
+ #----------------------------------------------------------------------------------------------------------------#
164
+
165
+ #----------------------------------------------------------------------------------------------------------------#
166
+ ## If assume "survprob" is over time (not for each segment):
167
+ #### why does the survprob does not decrease over time? I think this is not the real survival probability over time. Do I have to do multiplication over time thinking survprob is the survival over that segment? Try the multiplication over time.===== I think this make sense. In the "predict" above, survial=exp(-expected) was for each row (thinking each row is a separate person). While in newdata_chc_sampled, the rows are for the same person, and the survival depends on the previoys line, so need to multiply the survival from the previous line.
168
+ newdata_chc_sampled$survprob4=cumprod(newdata_chc_sampled$survprob)
169
+ newdata_chc_sampled$survprob=newdata_chc_sampled$survprob4
170
+
171
+ # plot(c(0,90),c(0,1),type="n")
172
+ survspline=smooth.spline(newdata_chc_sampled$t.endage[newdata_chc_sampled$t.endage<=agecut],newdata_chc_sampled$survprob[newdata_chc_sampled$t.endage<=agecut],spar=0.5)
173
+ predsurv=predict(survspline,seq(0,95,1))
174
+
175
+ # lines(predsurv$x,predsurv$y,col=3,lty=2)
176
+
177
+ ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
178
+
179
+ ###### get rid of the est_chcXX and "sumN"columns which were used to calculate the survival probability only.
180
+ # invisible(dim(newdata_chc_sampled))
181
+ newdata_chc_sampled=newdata_chc_sampled[,-grep("est_chc", colnames(newdata_chc_sampled))]
182
+ newdata_chc_sampled=newdata_chc_sampled[,-grep("sumN", colnames(newdata_chc_sampled))]
183
+ # invisible(dim(newdata_chc_sampled))
184
+
185
+ ### Add rows t.startage from 60 to 94, and t.endage from 65 to 95; so we can get burden 60-90.
186
+ add=newdata_chc_sampled[newdata_chc_sampled$t.startage<=39,]
187
+ # table(add$t.startage)
188
+ # table(add$t.endage)
189
+ add$t.startage=add$t.startage+55
190
+ add$t.endage=add$t.endage+55
191
+ # table(add$t.startage)
192
+ # table(add$t.endage)
193
+ newdata_chc_sampled=rbind(newdata_chc_sampled,add)
194
+ newdata_chc_sampled=newdata_chc_sampled[order(newdata_chc_sampled$mrn,newdata_chc_sampled$t.startage),]
195
+ ### replace the survival prob with the calculated/extrapolated survival probability
196
+ smooth_surv=data.frame(age=predsurv$x,surv=predsurv$y)
197
+ smooth_surv$surv[smooth_surv$age<=20]=1
198
+ #### survival probability cannot be <0. Hanle the years with survival prob<0
199
+ #https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310013501 This page had the conditional survival based on age
200
+ ## take the last year with a positive survival prob, and its survival prob
201
+ positive=smooth_surv[smooth_surv$surv>0,]
202
+ alast=tail(positive,1)[1,1]
203
+ slast=tail(positive,1)[1,2]
204
+ #smooth_surv$alast=alast
205
+ #smooth_surv$alast=slast
206
+ smooth_surv$interval=smooth_surv$age-alast
207
+ ### use the last positive survival prob*0.5^(years from the last age with positive survival probability), assuming the conditions survival prob after that age 50% each year.
208
+ cave <- function(x) slast*0.5^(max(x["interval"],0))
209
+ smooth_surv$surv1=apply(smooth_surv,1,cave)
210
+ smooth_surv$surv[smooth_surv$surv<0]=smooth_surv$surv1[smooth_surv$surv<0]
211
+
212
+ newdata_chc_sampled=merge(newdata_chc_sampled,smooth_surv,by.x="t.endage",by.y="age")
213
+ newdata_chc_sampled$survprob=newdata_chc_sampled$surv
214
+
215
+ # when there is an interaction in the model, it gave warning. So I would make a new data with all 0's to make it work.
216
+ newdata0=matrix(0,nrow=1,ncol=18)
217
+ newdata0=as.data.frame(newdata0)
218
+ colnames(newdata0)=c("sex","white","agedx2","steroids","bleodose","vcrdose","etopdose","itmtxdose","ced_sum2",
219
+ "cisplatdose","brainrad2","doxed_sum2","chestrad2","abdrad2","heartradboth2","pelvisrad2","carboplatdose","hdmtxdose")
220
+
221
+
222
+ newdata_chc_sampled1=newdata_chc_sampled ## do this so each run on chc_num loops below starts with the original newdata_chc_sampled1
223
+
224
+
225
+ ##########################################################################
226
+ person_burden=NULL
227
+
228
+ for(chc_num in c(1:32)[-c(2,5,14,20,23,26)]){ #### Edgar, you may make this in separate runs to save time.
229
+ # print(chc_num)
230
+ newdata_chc_sampled=newdata_chc_sampled1
231
+
232
+ # linear predictor
233
+ newdata_chc_sampled$exp_lp = predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='risk',reference="zero")
234
+
235
+ # Baseline nelson-aalan est
236
+ # https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
237
+ j=chc_num
238
+ base = basehaz(cphfits2[[chc_num]],centered = F) # this is a cumulative hazard, so need to convert it into non-cumulative version
239
+ #centered, if TRUE return data from a predicted survival curve at the mean values of the covariates fit$mean, if FALSE return a prediction for all covariates equal to zero.
240
+ #request the hazard for that covariate combination from the survfit() function that is called by basehaz(). https://stats.stackexchange.com/questions/565210/about-getting-baseline-survival-probability-for-a-piecewise-cox-model-with-inter
241
+
242
+
243
+ ### Max time in the data is 70.42. We need to estimate up to 90.
244
+ #Yutaka: I think we should smooth the cumulative hazard and then take the derivative to get the hazard.
245
+ #One thread I found on Web is: "As an approximation you can smooth the fitted baseline cumulative hazard (e.g. by package pspline) and ask for its derivative." Can you try using smooth.spline and smooth the cumulative hazard and then get the derivative? https://cran.r-project.org/web/packages/pspline/pspline.pdf
246
+
247
+
248
+ #### Qi added: base is for different DX. Now we run within each pr, so neeed cumulaive hazrd for that pr only
249
+ base=base[base$strata==paste("primary=",pr,sep=""),] #cumulative hazard
250
+ base=base[base$time<=agecut,] ### shouldn't we use the same age cutoff as the survival function splines? Yes, do so.
251
+
252
+ ##### study the smooth parameter. I think spar=1 is the best one to use (most smoothest)
253
+ cumHspline=smooth.spline(base$time,base$hazard,spar=1)
254
+ predcumhz=predict(cumHspline,seq(0,95,1)) ### predicted cumulative hazard
255
+
256
+
257
+ ##### In order to use the above way to get dN0, do Daisuke's original way using cumhz difference. But the difference is that: we fit cumhz with smooth.spline and can extend it to 90 years old.
258
+ base=data.frame(time=predcumhz$x,hazard=predcumhz$y) ##Daisuke used the cumHz, here we smoothed it and then use it.
259
+ #### fitted values had <0 values in age 0-8 or so. change to 0 cumulative hazard.
260
+ base$hazard[base$hazard<0]=0
261
+ base2 = base %>%
262
+ mutate(hazard2 = hazard - c(0,hazard[-length(hazard)])) %>%
263
+ ungroup() %>% as.data.frame()
264
+
265
+ base2 = base2 %>%
266
+ mutate(time_cat = cut(time,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)) %>%
267
+ ungroup()
268
+
269
+ base3 = base2 %>%
270
+ group_by(time_cat) %>%
271
+ dplyr::summarize(dN0 = sum(hazard2)) %>%
272
+ filter(!is.na(time_cat))
273
+
274
+ ###############
275
+ # BCCT
276
+ ###############
277
+ newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)
278
+
279
+ #newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,90,5),right = FALSE, include.lowest = TRUE) this won't work, because the input donors file had "t.startage" up to 55 only
280
+
281
+ newdata_chc_sampled = newdata_chc_sampled %>%
282
+ left_join(base3,by="time_cat")
283
+ newdata_chc_sampled$dN0 = ifelse(is.na(newdata_chc_sampled$dN0),0,newdata_chc_sampled$dN0)
284
+
285
+ BCCT = newdata_chc_sampled %>%
286
+ group_by(mrn) %>%
287
+ mutate(BCCT_tmp = exp_lp*survprob*dN0) %>%
288
+ mutate(BCCT = cumsum(BCCT_tmp)) %>%
289
+ filter(t.startage>=20) %>%
290
+ ungroup() %>%
291
+ as.data.frame()
292
+
293
+ for_web_BCCT = as.data.frame(tidyr::pivot_wider(BCCT,id_cols = mrn, names_from=time_cat,values_from=BCCT))
294
+ for_web_BCCT =for_web_BCCT[,-1]
295
+
296
+ #### for non-recurrent ones, maximum burden is 1 if the grouped conditions had only 1 condition. (11, 19,29) had only 1 conditons non-recurrent. (15,17,25) had 2 conditons. Take 25 as an example, it had obesity/underweight where underweight was so rare. So max 1 is still good.
297
+ #### non-recurrent CHCs are 11, 15, 17, 19, 25, 29. ==I think making it maximum 1 is not good always, becuase these are grouped conditions. For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
298
+ ncoltmp=75 ## from 20 to 94
299
+ if(chc_num %in% c(11, 15, 17, 19, 25, 29)){
300
+ for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,1))
301
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
302
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
303
+ }
304
+ #For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
305
+ if(chc_num %in% c(10)){
306
+ for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,3))
307
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
308
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
309
+ }
310
+ ##### if female condition 6, then it is 0 for males.
311
+ if(chc_num %in% c(6) & sexval==1){
312
+ for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
313
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
314
+ colnames(for_web_BCCT)=colnames(person_burden[1:75])
315
+ }
316
+ ##### if male condition 7, then it is 0 for females.d
317
+ if(chc_num %in% c(7) & sexval==0){
318
+ for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
319
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
320
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
321
+ }
322
+
323
+ for_web_BCCT$chc=chc_num
324
+
325
+ person_burden=rbind(person_burden,for_web_BCCT)
326
+
327
+ } #end of chc_num loop
328
+
329
+ # person_burden[,30:31]
330
+ # sum(person_burden[,31]) ## total burden at 50 years old. 8.971574 for this example.
331
+
332
+ #### The predicated burden for 26 grouped CHCs from age 20 to 95.
333
+ # write.csv(person_burden,file=paste("primary",pr,".csv"),row.names=F)
334
+ toJSON(person_burden, digits = NA, na = "string")
package/utils/fastclust.R CHANGED
@@ -42,10 +42,17 @@ args <- commandArgs(trailingOnly = T)
42
42
  if (length(args) != 1) stop("Usage: Rscript test.R in.json > results")
43
43
  infile <- args[1]
44
44
  input <- fromJSON(infile)
45
- normalized_matrix <- t(scale(t(input$matrix))) # Applying z-score normalization
45
+
46
+ if (length(input$valueIsTransformed) == 0 || input$valueIsTransformed == FALSE) {
47
+ normalized_matrix <- t(scale(t(input$matrix))) # Applying z-score normalization
48
+ } else { # No normalization
49
+ normalized_matrix <- input$matrix
50
+ }
51
+
46
52
  # For columns (i.e samples)
47
53
  RowDist <- dist(normalized_matrix, method = "euclidean") # Transposing the matrix
48
54
 
55
+
49
56
  # Hierarchical clustering
50
57
  print (input$cluster_method)
51
58
  RowDend <- hclust(RowDist, method = tolower(input$cluster_method))