@sjcrh/proteinpaint-server 2.105.0 → 2.107.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,134 @@
1
+ rm(list=ls())
2
+
3
+ suppressPackageStartupMessages({
4
+ library(dplyr) ### Qi changed to load plyr first, due to R message: If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
5
+ library(survival)
6
+ library(jsonlite)
7
+ library(parallel)
8
+ library(doParallel)
9
+ })
10
+
11
+ options(warn=-1)
12
+
13
+ # stream in json input data
14
+ con <- file("stdin", "r")
15
+ json <- readLines(con)
16
+ close(con)
17
+ input <- fromJSON(json)
18
+ # handle input arguments
19
+ args <- commandArgs(trailingOnly = T)
20
+ if (length(args) != 0) stop("Usage: echo <in_json> | Rscript burden.R > <out_json>")
21
+
22
+ # register the parallel backend (used by foreach() for parallelization)
23
+ availCores <- detectCores()
24
+ if (is.na(availCores)) stop("cannot detect number of available cores")
25
+ registerDoParallel(cores = availCores - 1) # use all available cores except one
26
+
27
+ chc_nums <- c(1:32)[-c(2,5,14,20,23,26)] # CHCs. 6 out of 32 CHCs not used.
28
+
29
+ #####################
30
+ # Functions for our method
31
+ # Ref: https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
32
+ #####################
33
+ # setwd("R:/Biostatistics/Biostatistics2/Qi/QiCommon/St Jude/Nature Review/CHCs/App/Rdata")
34
+
35
+
36
+ #################################
37
+ # Bootstrapping burden estimate #
38
+ #################################
39
+
40
+ # import get_burden() function
41
+ source(file.path(input$binpath, "utils/getBurden.R"))
42
+
43
+ # compute burden estimate for each bootstrap
44
+ # parallelize across bootstraps (not across chcs)
45
+ bootnums <- 20 # number of bootstraps
46
+ f <- input$datafiles
47
+ sampleData <- file.path(f$dir, f$files$sample) # dataframe with all the X's needed, and X's are updated by input values
48
+ outall <- foreach(i = 1:bootnums, .combine = rbind) %dopar% {
49
+ fitsData <- file.path(f$dir, f$boosubdir, paste0("boot",i), f$files$fit)
50
+ survData <- file.path(f$dir, f$boosubdir, paste0("boot",i), f$files$surv)
51
+ person_burden <- get_burden(fitsData, survData, sampleData, FALSE)
52
+ person_burden$boot <- i
53
+ person_burden
54
+ }
55
+
56
+
57
+ ###########################
58
+ # 95% confidence interval #
59
+ ###########################
60
+
61
+ # pr=5
62
+ # outall=NULL
63
+ # for(bootnum in 1:20){
64
+ # #bootnum=1 ##### loop this from 1 to 20.
65
+ # print(bootnum)
66
+ # setwd(paste("/Users/gmatt/data/tp/files/hg38/sjlife/burden/boot/boot",bootnum,sep=""))
67
+ # out1=read.csv(file=paste("./bootprimary",pr,".csv",sep=""))
68
+ # out1$boot=bootnum
69
+ # outall=rbind(outall,out1)
70
+ # }
71
+
72
+
73
+ ### For each cell (each chc and age combination), there are 20 values. get the SD from the 20 bootstrapped burdens.
74
+ SDall=NULL
75
+ for(chc_num in chc_nums){
76
+ # chc_num=1
77
+ data=outall[outall$chc==chc_num,] ## data for this chc_num from the 20 bootstraps on each age point
78
+ data=data[,!colnames(data) %in% c("chc","boot")]
79
+ # for each column (each age point), get the SD
80
+ sd1=apply(data,2,sd)
81
+ sd1$chc=chc_num
82
+ SDall=rbind(SDall,sd1)
83
+ }
84
+
85
+ # #### burden for total of the 26 CHCs is the sum of the 26 burdens. So for each boot, take all the 26 and then sum up, to result in a column with 20 rows.
86
+
87
+ btotal=NULL
88
+ for(bootnum in 1:bootnums){
89
+ # bootnum=1
90
+ data=outall[outall$boot==bootnum,]
91
+ data=data[,!colnames(data) %in% c("chc","boot")]
92
+ total=apply(data,2,sum)
93
+ btotal=rbind(btotal,total)
94
+ }
95
+ ### get SD for the total burden from the 20 rows
96
+ sdtotal=apply(btotal,2,sd)
97
+ sdtotal=data.frame(t(sdtotal), check.names=F)
98
+ sdtotal$chc=0 ### indicating the total CHCs
99
+
100
+ sd=rbind(SDall,sdtotal)
101
+ sd=apply(sd,c(1,2),as.numeric)
102
+
103
+ ##### read the burden from the original data. Use that and SD to get lower and upper bound.
104
+ # oburden=read.csv(paste("R:/Biostatistics/Biostatistics2/Qi/QiCommon/St Jude/Nature Review/CHCs/App/Rdata/primary",pr,".csv",sep=""))
105
+ oburden=input$burden
106
+ # oburden$boot=0
107
+ ### total burden for the original data
108
+ total=apply(oburden[,!colnames(oburden) %in% c("chc","boot")],2,sum)
109
+ burdentotal=data.frame(t(total), check.names=F)
110
+ burdentotal$chc=0 ### indicating the total CHCs
111
+ oburden=rbind(oburden,burdentotal) #### burden for each chc with age in the columns. The last row is for the total burden.
112
+ oburden=data.frame(oburden, check.names=F)
113
+
114
+
115
+ #### lower bound, the lowest is 0 burden.
116
+ low=oburden[,!colnames(oburden) %in% "chc"]-1.96*sd[,!colnames(oburden) %in% "chc"]
117
+ low[low<0]=0
118
+ low$chc=oburden$chc
119
+ #### The upper bound
120
+ up=oburden[,!colnames(oburden) %in% "chc"]+1.96*sd[,!colnames(oburden) %in% "chc"]
121
+ up$chc=oburden$chc
122
+
123
+ ### Take primary=5 CNS as an example (with high TXs in step 3). At age 50, the burden is 9.04 with 95% CI (7.92 to 10.16)
124
+ # oburden$X.50.51[oburden$chc==0]
125
+ # low$X.50.51[low$chc==0]
126
+ # up$X.50.51[up$chc==0]
127
+
128
+ # plot(c(20,95),c(0,15),type="n",xlab="Age",ylab="Burden",font=2)
129
+ # lines(seq(20,94,1),oburden[oburden$chc==0,!colnames(oburden) %in% "chc"],lty=1)
130
+ # lines(seq(20,94,1),low[low$chc==0,!colnames(low) %in% "chc"],lty=2)
131
+ # lines(seq(20,94,1),up[up$chc==0,!colnames(up) %in% "chc"],lty=2)
132
+
133
+ ci <- list(low = low, up = up, overall=burdentotal)
134
+ toJSON(ci, digits = NA, na = "string")
@@ -0,0 +1,46 @@
1
+ rm(list=ls())
2
+
3
+ suppressPackageStartupMessages({
4
+ library(dplyr) ### Qi changed to load plyr first, due to R message: If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
5
+ library(survival)
6
+ library(jsonlite)
7
+ library(parallel)
8
+ library(doParallel)
9
+ })
10
+
11
+ options(warn=-1)
12
+
13
+ # stream in json input data
14
+ con <- file("stdin", "r")
15
+ json <- readLines(con)
16
+ close(con)
17
+ input <- fromJSON(json)
18
+ # handle input arguments
19
+ args <- commandArgs(trailingOnly = T)
20
+ if (length(args) != 0) stop("Usage: echo <in_json> | Rscript burden.R > <out_json>")
21
+
22
+ # register the parallel backend (used by foreach() for parallelization)
23
+ availCores <- detectCores()
24
+ if (is.na(availCores)) stop("cannot detect number of available cores")
25
+ registerDoParallel(cores = availCores - 1) # use all available cores except one
26
+
27
+ chc_nums <- c(1:32)[-c(2,5,14,20,23,26)] # CHCs. 6 out of 32 CHCs not used.
28
+
29
+ #####################
30
+ # Functions for our method
31
+ # Ref: https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
32
+ #####################
33
+ # setwd("R:/Biostatistics/Biostatistics2/Qi/QiCommon/St Jude/Nature Review/CHCs/App/Rdata")
34
+
35
+ # import get_burden() function
36
+ source(file.path(input$binpath, "utils/getBurden.R"))
37
+
38
+ # compute main burden estimate
39
+ # parallelize across CHCs
40
+ f <- input$datafiles
41
+ fitsData <- file.path(f$dir, f$files$fit)
42
+ survData <- file.path(f$dir, f$files$surv)
43
+ sampleData <- file.path(f$dir, f$files$sample) # dataframe with all the X's needed, and X's are updated by input values
44
+ person_burden <- get_burden(fitsData, survData, sampleData, TRUE)
45
+
46
+ toJSON(person_burden, digits = NA, na = "string")
package/utils/edge.R CHANGED
@@ -144,7 +144,7 @@ if (length(input$conf1) == 0) { # No adjustment of confounding factors
144
144
  } else { # Adjusting for confounding factors
145
145
  y$samples <- data.frame(conditions = conditions, conf1 = input$conf1)
146
146
  model_gen_time <- system.time({
147
- design <- model.matrix(~ conf1 + conditions, data = y$samples)
147
+ design <- model.matrix(~ conditions + conf1, data = y$samples)
148
148
  })
149
149
  #cat("Time for making design matrix: ", model_gen_time[3], " seconds\n")
150
150
 
@@ -0,0 +1,371 @@
1
+ # TODOs: see TODOs in the code
2
+
3
+ # function to compute burden estimate
4
+ # fitsData: file path to fit data
5
+ # survData: file path to surv data
6
+ # sampleData: file path to sample data
7
+ # parallelizeChcs: boolean for whether to parallelize across chcs
8
+ get_burden <- function(fitsData, survData, sampleData, parallelizeChcs) {
9
+ load(fitsData)
10
+ load(survData)
11
+ # Qi made many newdata_chc_sampled so we have 1000 times more donors -- but in different files.
12
+ load(sampleData)
13
+ # survs[[1]]
14
+
15
+ ############################ These are the input values in APP that users can change. Edgar, these should be the same as the APP before, variable names and units. #############
16
+ ### Input the primary DX.
17
+ # pr=5
18
+ # agecut=40 ##### Edgar, This is not a user input parameter, but we input this. This depends on the DX. For example, here for CNS we use 40. For HL DX, it is 55. I will give this value for each DX.
19
+
20
+ # # # Input person's values, 18 input X's , plus the input primary DX
21
+ # sexval=1 #sex, take value 1 for male and 0 for female
22
+ # whiteval=1 # Race white or not, 1 for white, 0 for non-white
23
+ # agedxval=6 # age at primary cancer DX
24
+
25
+ # #### Chemotherapy
26
+ # steroidval=0 #Steroids 1 for yes 0 for no
27
+ # bleoval=0; ##Bleomycin
28
+ # vcrval=12; #Vincristine
29
+ # etopval=2500; #Etoposide
30
+ # itmtval=0; #Intrathecal Methotrexate
31
+ # cedval=1.6 # Cyclophosphamide, 0.7692 mean 7692.
32
+ # cispval=300 #Cisplatin
33
+ # doxval=0 #Anthracycline, 3 mean 300 ml/m2
34
+ # carboval=0 ## Carboplatin
35
+ # hdmtxval=0 ## High-Dose Methotrexate
36
+
37
+ # # Radiation
38
+ # brainval=5.4 #Brain, 5.4 means 54Gy, 5400 cGy. #####Same for all RT doses.#####
39
+ # chestval=2.4 # chest/neck RT, 2.4 for 24 Gy
40
+ # heartval=0 # Heart RT
41
+ # pelvisval=0 #pelvis RT
42
+ # abdval=2.4 # Abdominal RT
43
+
44
+ ####################################################################################
45
+
46
+ ##### if no TX, use these.
47
+ # steroidval=0; bleoval=0; vcrval=0; etopval=0; itmtval=0; cedval=0; cispval=0; brainval=0;
48
+ # doxval=0; chestval=0; abdval=0;
49
+
50
+ # survs[[1]]
51
+
52
+ ############### no TX
53
+ # steroidval=0; bleoval=0; vcrval=0; etopval=0; itmtval=0; cedval=0; cispval=0; brainval=0; doxval=0; chestval=0; abdval=0; heartval=0; pelvisval=0; carboval=0; hdmtxval=0
54
+
55
+ newdata_chc_sampled=do.call("rbind", replicate(6,newdata_chc_sampled, simplify = FALSE))
56
+ newdata_chc_sampled$t.startage=seq(5,70,1)
57
+ newdata_chc_sampled$t.endage=seq(6,71,1)
58
+ ### originally data fit to 60 only. using cphfits can get est up to 60 only. ==> later I further cut at 50 or so to fit lines, because original data had 95th percentile around age 50 or so.
59
+ newdata_chc_sampled=newdata_chc_sampled[newdata_chc_sampled$t.endage<=60,]
60
+
61
+ # paste(names(input), input, sep = ":", collapse = ",")
62
+ pr=input$diaggrp
63
+ # agecut was previously hardcoded to 40 above
64
+ agecut=c('1'=50, '2'=45, '3'=55, '4'=50, '5'=40, '6'=60, '7'=50, '8'=45, '9'=45, '10'=45, '11'=50 )[pr]
65
+ sexval=input$sex
66
+ newdata_chc_sampled$sex=input$sex # sexval
67
+ newdata_chc_sampled$white=input$white # whiteval
68
+ newdata_chc_sampled$agedx2=input$agedx # agedxval
69
+ newdata_chc_sampled$steroid=input$steroid # steroidval
70
+ newdata_chc_sampled$bleodose=input$bleo # bleoval
71
+ newdata_chc_sampled$vcrdose=input$vcr # vcrval
72
+ newdata_chc_sampled$etopdose=input$etop # etopval
73
+ newdata_chc_sampled$itmtxdose=input$itmt # itmtval
74
+ newdata_chc_sampled$ced_sum2=input$ced # cedval
75
+ newdata_chc_sampled$cisplatdose=input$cisp # cispval
76
+ newdata_chc_sampled$brainrad2=input$brain # brainval
77
+ newdata_chc_sampled$doxed_sum2=input$dox # doxval
78
+ newdata_chc_sampled$chestrad2=input$chest # chestval
79
+ newdata_chc_sampled$abdrad2=input$abd # abdval
80
+ newdata_chc_sampled$heartradboth2=input$heart # heartval
81
+ newdata_chc_sampled$pelvisrad2=input$pelvis # pelvisval
82
+ newdata_chc_sampled$carboplatdose=input$carbo # carboval
83
+ newdata_chc_sampled$hdmtxdose=input$hdmtx # hdmtxval
84
+
85
+ # newdata_chc_sampled$sex=sexval
86
+ # newdata_chc_sampled$white=whiteval
87
+ # newdata_chc_sampled$agedx2=agedxval
88
+ # newdata_chc_sampled$steroid=steroidval
89
+ # newdata_chc_sampled$bleodose=bleoval
90
+ # newdata_chc_sampled$vcrdose=vcrval
91
+ # newdata_chc_sampled$etopdose=etopval
92
+ # newdata_chc_sampled$itmtxdose=itmtval
93
+ # newdata_chc_sampled$ced_sum2=cedval
94
+ # newdata_chc_sampled$cisplatdose=cispval
95
+ # newdata_chc_sampled$brainrad2=brainval
96
+ # newdata_chc_sampled$doxed_sum2=doxval
97
+ # newdata_chc_sampled$chestrad2=chestval
98
+ # newdata_chc_sampled$abdrad2=abdval
99
+ # newdata_chc_sampled$heartradboth2=heartval
100
+ # newdata_chc_sampled$pelvisrad2=pelvisval
101
+ # newdata_chc_sampled$carboplatdose=carboval
102
+ # newdata_chc_sampled$hdmtxdose=hdmtxval
103
+
104
+ # 1="Acute lymphoblastic leukemia"
105
+ # 2="AML"
106
+ # 3="Hodgkin lymphoma"
107
+ # 4="Non-Hodgkin lymphoma"
108
+ # 5="Central nervous system"
109
+ # 6="Bone tumor"
110
+ # 7="STS"
111
+ # 8="Wilms tumor"
112
+ # 9="Neuroblastoma"
113
+ # 10="Retinoblastoma"
114
+ # 11="Germ cell tumor";
115
+
116
+ #results <- mclapply(X = chc_nums, FUN = function(chc_num) predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected'), mc.cores = cores)
117
+ predictChc <- function(chc_num) {
118
+ predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected')
119
+ }
120
+ chcCols <- NULL
121
+ if (parallelizeChcs) {
122
+ # parallelize over chc_nums
123
+ chcCols <- foreach(chc_num = chc_nums, .combine = cbind) %dopar% {
124
+ predictChc(chc_num)
125
+ }
126
+ } else {
127
+ # do not parallelize over chc_nums
128
+ # parallelizing over bootstraps instead
129
+ chcCols <- foreach(chc_num = chc_nums, .combine = cbind) %do% {
130
+ predictChc(chc_num)
131
+ }
132
+ }
133
+
134
+ newdata_chc_sampled = data.frame(newdata_chc_sampled, chcCols)
135
+ names(newdata_chc_sampled)[25:50]=paste0("est_chc",chc_nums)
136
+ newdata_chc_sampled = newdata_chc_sampled %>%
137
+ mutate(sumN_tmp = rowSums(dplyr::select(.,starts_with("est_chc"))))%>%
138
+ group_by(mrn) %>%
139
+ mutate(sumN_obs = cumsum(sumN_tmp)) %>%
140
+ as.data.frame()
141
+
142
+ ##Qi: the sumN here depends on all the 26 grouped conditions. So the input X's all matter. That is, if sex is not in a CHC of interest, it would make a difference here on sumN (because sex was on some CHCs), and hence make a difference on burden of that CHC even though it is not in the cphfits of that CHC.
143
+ newdata_chc_sampled = newdata_chc_sampled %>%
144
+ group_by(mrn) %>%
145
+ mutate(chc20 = sumN_obs[t.endage == 20]) %>%
146
+ ungroup() %>%
147
+ as.data.frame()
148
+ newdata_chc_sampled$death =1
149
+ newdata_chc_sampled$obsCHCat20 = newdata_chc_sampled$current.chc
150
+
151
+
152
+ # survival probability
153
+ # https://stats.stackexchange.com/questions/288393/calculating-survival-probability-per-person-at-time-t-from-cox-ph
154
+
155
+
156
+ newdata_chc_sampled$survprob = exp(-predict(survs[[1]],newdata=data.frame(newdata_chc_sampled,primary=pr),type='expected'))
157
+
158
+ #----------------------------------------------------------------------------------------------------------------#
159
+ ##### Qi added the below "cumprod" for survival by time t. But need to figure out: What is the "survprob" in BCCT formulat? Should it be survival of the segment, or survival by time t? == need to figure out with YY. Discussed, YY confirmed my way: survival prob in the formula is cumulative, not for that segment.
160
+ #----------------------------------------------------------------------------------------------------------------#
161
+
162
+ #----------------------------------------------------------------------------------------------------------------#
163
+ ## If assume "survprob" is over time (not for each segment):
164
+ #### why does the survprob not decrease over time? I think this is not the real survival probability over time. Do I have to do multiplication over time thinking survprob is the survival over that segment? Try the multiplication over time.===== I think this makes sense. In the "predict" above, survival=exp(-expected) was for each row (thinking each row is a separate person). While in newdata_chc_sampled, the rows are for the same person, and the survival depends on the previous line, so need to multiply the survival from the previous line.
165
+ newdata_chc_sampled$survprob4=cumprod(newdata_chc_sampled$survprob)
166
+ newdata_chc_sampled$survprob=newdata_chc_sampled$survprob4
167
+
168
+ # plot(c(0,90),c(0,1),type="n")
169
+ survspline=smooth.spline(newdata_chc_sampled$t.endage[newdata_chc_sampled$t.endage<=agecut],newdata_chc_sampled$survprob[newdata_chc_sampled$t.endage<=agecut],spar=0.5)
170
+ predsurv=predict(survspline,seq(0,95,1))
171
+
172
+ # lines(predsurv$x,predsurv$y,col=3,lty=2)
173
+
174
+ ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
175
+
176
+ ###### get rid of the est_chcXX and "sumN"columns which were used to calculate the survival probability only.
177
+ # invisible(dim(newdata_chc_sampled))
178
+ newdata_chc_sampled=newdata_chc_sampled[,-grep("est_chc", colnames(newdata_chc_sampled))]
179
+ newdata_chc_sampled=newdata_chc_sampled[,-grep("sumN", colnames(newdata_chc_sampled))]
180
+ # invisible(dim(newdata_chc_sampled))
181
+
182
+ ### Add rows t.startage from 60 to 94, and t.endage from 65 to 95; so we can get burden 60-90.
183
+ add=newdata_chc_sampled[newdata_chc_sampled$t.startage<=39,]
184
+ # table(add$t.startage)
185
+ # table(add$t.endage)
186
+ add$t.startage=add$t.startage+55
187
+ add$t.endage=add$t.endage+55
188
+ # table(add$t.startage)
189
+ # table(add$t.endage)
190
+ newdata_chc_sampled=rbind(newdata_chc_sampled,add)
191
+ newdata_chc_sampled=newdata_chc_sampled[order(newdata_chc_sampled$mrn,newdata_chc_sampled$t.startage),]
192
+ ### replace the survival prob with the calculated/extrapolated survival probability
193
+ smooth_surv=data.frame(age=predsurv$x,surv=predsurv$y)
194
+ smooth_surv$surv[smooth_surv$age<=20]=1
195
+ #### survival probability cannot be <0. Handle the years with survival prob<0
196
+ #https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310013501 This page had the conditional survival based on age
197
+ ## take the last year with a positive survival prob, and its survival prob
198
+ positive=smooth_surv[smooth_surv$surv>0,]
199
+ alast=tail(positive,1)[1,1]
200
+ slast=tail(positive,1)[1,2]
201
+ #smooth_surv$alast=alast
202
+ #smooth_surv$alast=slast
203
+ smooth_surv$interval=smooth_surv$age-alast
204
+ ### use the last positive survival prob*0.5^(years from the last age with positive survival probability), assuming the conditional survival prob after that age is 50% each year.
205
+ cave <- function(x) slast*0.5^(max(x["interval"],0))
206
+ smooth_surv$surv1=apply(smooth_surv,1,cave)
207
+ smooth_surv$surv[smooth_surv$surv<0]=smooth_surv$surv1[smooth_surv$surv<0]
208
+
209
+ newdata_chc_sampled=merge(newdata_chc_sampled,smooth_surv,by.x="t.endage",by.y="age")
210
+ newdata_chc_sampled$survprob=newdata_chc_sampled$surv
211
+
212
+ # when there is an interaction in the model, it gave warning. So I would make a new data with all 0's to make it work.
213
+ newdata0=matrix(0,nrow=1,ncol=18)
214
+ newdata0=as.data.frame(newdata0)
215
+ colnames(newdata0)=c("sex","white","agedx2","steroids","bleodose","vcrdose","etopdose","itmtxdose","ced_sum2",
216
+ "cisplatdose","brainrad2","doxed_sum2","chestrad2","abdrad2","heartradboth2","pelvisrad2","carboplatdose","hdmtxdose")
217
+
218
+
219
+ newdata_chc_sampled1=newdata_chc_sampled ## do this so each run on chc_num loops below starts with the original newdata_chc_sampled1
220
+
221
+
222
+ ##########################################################################
223
+ person_burden=NULL
224
+
225
+ get_estimate <- function(chc_num) { #### Edgar, you may make this in separate runs to save time.
226
+ # print(chc_num)
227
+ newdata_chc_sampled=newdata_chc_sampled1
228
+
229
+ # linear predictor
230
+ newdata_chc_sampled$exp_lp = predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='risk',reference="zero")
231
+
232
+ # Baseline nelson-aalan est
233
+ # https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
234
+ j=chc_num
235
+ base = basehaz(cphfits2[[chc_num]],centered = F) # this is a cumulative hazard, so need to convert it into non-cumulative version
236
+ #centered, if TRUE return data from a predicted survival curve at the mean values of the covariates fit$mean, if FALSE return a prediction for all covariates equal to zero.
237
+ #request the hazard for that covariate combination from the survfit() function that is called by basehaz(). https://stats.stackexchange.com/questions/565210/about-getting-baseline-survival-probability-for-a-piecewise-cox-model-with-inter
238
+
239
+
240
+ ### Max time in the data is 70.42. We need to estimate up to 90.
241
+ #Yutaka: I think we should smooth the cumulative hazard and then take the derivative to get the hazard.
242
+ #One thread I found on Web is: "As an approximation you can smooth the fitted baseline cumulative hazard (e.g. by package pspline) and ask for its derivative." Can you try using smooth.spline and smooth the cumulative hazard and then get the derivative? https://cran.r-project.org/web/packages/pspline/pspline.pdf
243
+
244
+
245
+ #### Qi added: base is for different DX. Now we run within each pr, so need cumulative hazard for that pr only
246
+ base=base[base$strata==paste("primary=",pr,sep=""),] #cumulative hazard
247
+ base=base[base$time<=agecut,] ### shouldn't we use the same age cutoff as the survival function splines? Yes, do so.
248
+
249
+ ##### study the smooth parameter. I think spar=1 is the best one to use (smoothest)
250
+ cumHspline=smooth.spline(base$time,base$hazard,spar=1)
251
+ predcumhz=predict(cumHspline,seq(0,95,1)) ### predicted cumulative hazard
252
+
253
+
254
+ ##### In order to use the above way to get dN0, do Daisuke's original way using cumhz difference. But the difference is that: we fit cumhz with smooth.spline and can extend it to 90 years old.
255
+ base=data.frame(time=predcumhz$x,hazard=predcumhz$y) ##Daisuke used the cumHz, here we smoothed it and then use it.
256
+ #### fitted values had <0 values in age 0-8 or so. change to 0 cumulative hazard.
257
+ base$hazard[base$hazard<0]=0
258
+ base2 = base %>%
259
+ mutate(hazard2 = hazard - c(0,hazard[-length(hazard)])) %>%
260
+ ungroup() %>% as.data.frame()
261
+
262
+ base2 = base2 %>%
263
+ mutate(time_cat = cut(time,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)) %>%
264
+ ungroup()
265
+
266
+ base3 = base2 %>%
267
+ group_by(time_cat) %>%
268
+ dplyr::summarize(dN0 = sum(hazard2)) %>%
269
+ filter(!is.na(time_cat))
270
+
271
+ ###############
272
+ # BCCT
273
+ ###############
274
+ newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)
275
+
276
+ #newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,90,5),right = FALSE, include.lowest = TRUE) this won't work, because the input donors file had "t.startage" up to 55 only
277
+
278
+ newdata_chc_sampled = newdata_chc_sampled %>%
279
+ left_join(base3,by="time_cat")
280
+ newdata_chc_sampled$dN0 = ifelse(is.na(newdata_chc_sampled$dN0),0,newdata_chc_sampled$dN0)
281
+
282
+ BCCT = newdata_chc_sampled %>%
283
+ group_by(mrn) %>%
284
+ mutate(BCCT_tmp = exp_lp*survprob*dN0) %>%
285
+ mutate(BCCT = cumsum(BCCT_tmp)) %>%
286
+ filter(t.startage>=20) %>%
287
+ ungroup() %>%
288
+ as.data.frame()
289
+
290
+ for_web_BCCT = as.data.frame(tidyr::pivot_wider(BCCT,id_cols = mrn, names_from=time_cat,values_from=BCCT))
291
+ for_web_BCCT =for_web_BCCT[,-1]
292
+
293
+ #### for non-recurrent ones, maximum burden is 1 if the grouped conditions had only 1 condition. (11, 19,29) had only 1 condition non-recurrent. (15,17,25) had 2 conditions. Take 25 as an example, it had obesity/underweight where underweight was so rare. So max 1 is still good.
294
+ #### non-recurrent CHCs are 11, 15, 17, 19, 25, 29. ==I think making it maximum 1 is not good always, because these are grouped conditions. For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
295
+ ncoltmp=75 ## from 20 to 94
296
+ if(chc_num %in% c(11, 15, 17, 19, 25, 29)){
297
+ for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,1))
298
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
299
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
300
+ }
301
+ #For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
302
+ if(chc_num %in% c(10)){
303
+ for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,3))
304
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
305
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
306
+ }
307
+ ##### if female condition 6, then it is 0 for males.
308
+ if(chc_num %in% c(6) & sexval==1){
309
+ for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
310
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
311
+ colnames(for_web_BCCT)=colnames(person_burden[1:75])
312
+ }
313
+ ##### if male condition 7, then it is 0 for females.
314
+ if(chc_num %in% c(7) & sexval==0){
315
+ for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
316
+ for_web_BCCT=as.data.frame(for_web_BCCT2)
317
+ colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
318
+ }
319
+
320
+ for_web_BCCT$chc=chc_num
321
+
322
+ return(for_web_BCCT)
323
+ }
324
+
325
+ results <- NULL
326
+ if (parallelizeChcs) {
327
+ # parallelize over chc_nums
328
+ results <- foreach(chc_num = chc_nums) %dopar% {
329
+ get_estimate(chc_num)
330
+ }
331
+ } else {
332
+ # do not parallelize over chc_nums
333
+ # parallelizing over bootstraps instead
334
+ results <- foreach(chc_num = chc_nums) %do% {
335
+ get_estimate(chc_num)
336
+ }
337
+ }
338
+
339
+ # this serial loop works
340
+ # for(chc_num in chc_nums) {
341
+ # person_burden=rbind(person_burden, get_estimate(chc_num))
342
+ # }
343
+
344
+ # get estimates
345
+ # parallelize across chc_nums
346
+ #results <- mclapply(X = chc_nums, FUN = get_estimate, mc.cores = cores)
347
+ #results <- lapply(X = chc_nums, FUN = get_estimate)
348
+
349
+ # combine rows into person_burden data frame
350
+ # TODO: need to verify this
351
+ for (n in 1:length(results)) {
352
+ row <- results[[n]]
353
+ if (!identical(names(row), names(results[[1]]))) {
354
+ # some rows may have empty column names because they
355
+ # used the columns names from the person_burden table, which
356
+ # is NULL when get_estimate() is run in parallel (see the
357
+ # if() statements in get_estimate())
358
+ # in this situation, use the column names from the first row
359
+ names(row) <- names(results[[1]])
360
+ }
361
+ person_burden <- rbind(person_burden, row)
362
+ }
363
+
364
+ # person_burden[,30:31]
365
+ # sum(person_burden[,31]) ## total burden at 50 years old. 8.971574 for this example.
366
+
367
+ #### The predicated burden for 26 grouped CHCs from age 20 to 95.
368
+ # write.csv(person_burden,file=paste("primary",pr,".csv"),row.names=F)
369
+ #toJSON(person_burden, digits = NA, na = "string")
370
+ return(person_burden)
371
+ }