@sjcrh/proteinpaint-server 2.122.0 → 2.124.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,804 +0,0 @@
1
- ########################
2
- # REGRESSION UTILITIES #
3
- ########################
4
-
5
- ####################
6
- # LIST OF FUNCTIONS
7
- ####################
8
-
9
- # prepareDataTable
10
- # cubic_spline
11
- # buildFormulas
12
- # linearRegression
13
- # logisticRegression
14
- # coxRegression
15
- # runRegression
16
- # plot_spline
17
- # build_coef_table
18
- # formatCoefficients
19
- # formatType3
20
-
21
-
22
- #######
23
- # CODE
24
- #######
25
-
26
- # prepare data table
27
- prepareDataTable <- function(tempdat, independent) {
28
- dat <- tempdat[,colnames(tempdat) != "__sample"] # remove sample column
29
- for (r in 1:nrow(independent)) {
30
- variable <- independent[r,]
31
- id <- variable$id
32
- if (variable$rtype == "factor") {
33
- # factor variable
34
- # assign reference group
35
- dat[,id] <- factor(dat[,id])
36
- dat[,id] <- relevel(dat[,id], ref = variable$refGrp)
37
- }
38
- }
39
- return(dat)
40
- }
41
-
42
- # compute cubic spline
43
- # args: values = variable values; knots = knot values
44
- cubic_spline <- function(values, knots) {
45
- nknots <- length(knots) # there will be (nknots-1) cubic spline functions
46
- f <- matrix(nrow = length(values), ncol = (nknots-1))
47
- f[,1] <- values
48
- for (j in 1:(nknots-2)) {
49
- for (i in 1:length(values)) {
50
- f[i,(j+1)] <- max(0,(values[i]-knots[j])^3)-max(0,(values[i]-knots[nknots-1])^3)*(knots[nknots]-knots[j])/(knots[nknots]-knots[nknots-1])+
51
- max(0,(values[i]-knots[nknots])^3)*(knots[nknots-1]-knots[j])/(knots[nknots]-knots[nknots-1])
52
- }
53
- }
54
- return(f)
55
- }
56
-
57
- # build formulas
58
- buildFormulas <- function(outcome, independent, includeUnivariate) {
59
- # first, format variables for building formulas
60
-
61
- # declare new objects
62
- formula_outcome <- vector(mode = "character") # outcome variables formatted for formula
63
- formula_independent <- vector(mode = "character") # independent variables formatted for formula
64
- formula_interaction <- vector(mode = "character") # interacting variables formatted for formula
65
- snpLocusSnps <- independent[0,] # snplocus snps
66
- splineVariables <- independent[0,] # spline variables
67
-
68
- # outcome variable
69
- if ("timeToEvent" %in% names(outcome)) {
70
- # cox outcome variable
71
- # time-to-event data
72
- outcomeEventId <- outcome$timeToEvent$eventId
73
- if (outcome$timeToEvent$timeScale == "time") {
74
- outcomeTimeId <- outcome$timeToEvent$timeId
75
- formula_outcome <- paste0("Surv(",outcomeTimeId,", ",outcomeEventId,")")
76
- } else if (outcome$timeToEvent$timeScale == "age") {
77
- outcomeAgeStartId <- outcome$timeToEvent$agestartId
78
- outcomeAgeEndId <- outcome$timeToEvent$ageendId
79
- formula_outcome <- paste0("Surv(",outcomeAgeStartId,", ",outcomeAgeEndId,", ",outcomeEventId,")")
80
- } else {
81
- stop ("unknown cox regression time scale")
82
- }
83
- } else {
84
- # other outcome variable
85
- formula_outcome <- outcome$id
86
- }
87
-
88
- # independent variables
89
- for (r in 1:nrow(independent)) {
90
- variable <- independent[r,]
91
- if (variable$type == "spline") {
92
- # cubic spline variable
93
- # use call to cubic spline function in regression formula
94
- splineCmd <- paste0("cubic_spline(", variable$id, ", ", paste0("c(", paste(variable$spline$knots[[1]],collapse = ","), ")"), ")")
95
- formula_independent <- c(formula_independent, splineCmd)
96
- splineVariables <- rbind(splineVariables, variable)
97
- } else if (variable$type == "snplocus") {
98
- # snplocus snp
99
- # set aside so that separate formulas can be made for each snplocus snp
100
- snpLocusSnps <- rbind(snpLocusSnps, variable)
101
- next
102
- } else {
103
- # other independent variable
104
- formula_independent <- c(formula_independent, variable$id)
105
- }
106
- if ("interactions" %in% names(variable) & length(variable$interactions[[1]]) > 0) {
107
- # interactions
108
- if (variable$type == "spline") stop("interactions not allowed with spline variable")
109
- interactionIds <- variable$interactions[[1]]
110
- for (intId in interactionIds) {
111
- # get unique set of interactions
112
- int1 <- paste(variable$id, intId, sep = ":")
113
- int2 <- paste(intId, variable$id, sep = ":")
114
- if (!(int1 %in% formula_interaction) & !(int2 %in% formula_interaction)) {
115
- formula_interaction <- c(formula_interaction, int1)
116
- }
117
- }
118
- }
119
- }
120
-
121
- # combine variables into formula(s)
122
- # if snplocus snps are present, then prepare a
123
- # separate formula for each snplocus snp
124
- formulas <- list()
125
- if (nrow(snpLocusSnps) > 0) {
126
- # snplocus snps present
127
- # build separate formula for each snplocus snp
128
- for (r in 1:nrow(snpLocusSnps)) {
129
- temp_formula_independent <- formula_independent
130
- temp_formula_interaction <- formula_interaction
131
- snp <- snpLocusSnps[r,]
132
- temp_formula_independent <- c(temp_formula_independent, snp$id)
133
- if ("interactions" %in% names(snp) & length(snp$interactions[[1]]) > 0) {
134
- # interactions
135
- interactionIds <- snp$interactions[[1]]
136
- for (intId in interactionIds) {
137
- # get unique set of interactions
138
- int1 <- paste(snp$id, intId, sep = ":")
139
- int2 <- paste(intId, snp$id, sep = ":")
140
- if (!(int1 %in% temp_formula_interaction) & !(int2 %in% temp_formula_interaction)) {
141
- temp_formula_interaction <- c(temp_formula_interaction, int1)
142
- }
143
- }
144
- }
145
- formula <- as.formula(paste(formula_outcome, paste(c(temp_formula_independent, temp_formula_interaction), collapse = "+"), sep = "~"))
146
- entry <- length(formulas) + 1
147
- formulas[[entry]] <- list("id" = snp$id, "formula" = formula)
148
- if (nrow(splineVariables) > 0) {
149
- formulas[[entry]][["splineVariables"]] = splineVariables
150
- }
151
- }
152
- } else {
153
- # no snplocus snps
154
- if (isTRUE(includeUnivariate)) {
155
- # include univariate formulas along with the multivariate formula
156
- if (length(formula_independent) < 2) stop("must have multiple covariates to build multivariate and univariate formulas")
157
- if (length(formula_interaction) > 0) stop("interactions not supported in univariate models")
158
- # multivariate formula
159
- formula <- as.formula(paste(formula_outcome, paste(formula_independent, collapse = "+"), sep = "~"))
160
- formulas[[1]] <- list("id" = "", "type" = "multivariate", "formula" = formula)
161
- if (nrow(splineVariables) > 0) {
162
- formulas[[1]][["splineVariables"]] = splineVariables
163
- }
164
- # univariate formulas
165
- for (var in formula_independent) {
166
- formula <- as.formula(paste(formula_outcome, var, sep = "~"))
167
- entry <- length(formulas) + 1
168
- formulas[[entry]] <- list("id" = "", "type" = "univariate", "formula" = formula)
169
- if (startsWith(var, "cubic_spline")) {
170
- id <- gsub("cubic_spline\\(", "", gsub("\\,.*", "", var))
171
- splineVariable <- splineVariables[splineVariables$id == id, ,drop = F]
172
- if (nrow(splineVariable) == 0) stop("expected spline variable")
173
- formulas[[entry]][["splineVariables"]] = splineVariable
174
- }
175
- }
176
- } else {
177
- # use single formula for all variables
178
- formula <- as.formula(paste(formula_outcome, paste(c(formula_independent, formula_interaction), collapse = "+"), sep = "~"))
179
- formulas[[1]] <- list("id" = "", "formula" = formula)
180
- if (nrow(splineVariables) > 0) {
181
- formulas[[1]][["splineVariables"]] = splineVariables
182
- }
183
- }
184
- }
185
- return(formulas)
186
- }
187
-
188
- # linear regression
189
- linearRegression <- function(formula, dat) {
190
- res <- lm(formula$formula, data = dat)
191
- sampleSize <- res$df.residual + length(res$coefficients[!is.na(res$coefficients)])
192
- res_summ <- summary(res)
193
-
194
- # residuals table
195
- residuals_table <- list("header" = c("Minimum","1st quartile","Median","3rd quartile","Maximum"), "rows" = unname(round(fivenum(res_summ$residuals),3)))
196
-
197
- # coefficients table
198
- coefficients_table <- build_coef_table(res_summ)
199
- colnames(coefficients_table)[1] <- "Beta"
200
- # compute confidence intervals of beta values
201
- ci <- suppressMessages(confint(res))
202
- colnames(ci) <- c("95% CI (low)","95% CI (high)")
203
- coefficients_table <- cbind(coefficients_table, ci)
204
- coefficients_table <- coefficients_table[,c(1,5,6,2,3,4)]
205
-
206
- # type III statistics table
207
- type3_table <- as.matrix(drop1(res, scope = ~., test = "F"))
208
- # if there are interactions, then set the results
209
- # of the main effects to "NA" because these type III
210
- # stats cannot be accurately estimated
211
- ints <- grep(":", row.names(type3_table), value = T, fixed = T)
212
- intsIds <- unique(unlist(strsplit(ints, ":")))
213
- type3_table[intsIds,] <- NA
214
- # round values
215
- type3_table[,c("Sum of Sq","RSS","AIC")] <- round(type3_table[,c("Sum of Sq","RSS","AIC")], 1)
216
- type3_table[,"F value"] <- round(type3_table[,"F value"], 3)
217
- type3_table[,"Pr(>F)"] <- signif(type3_table[,"Pr(>F)"], 4)
218
-
219
- # total SNP effect table
220
- # if a snplocus snp has an interaction, then compute
221
- # the total effect of the snp on the model
222
- # by determining the combined effect of removing the
223
- # snp and all of its interactions from the model
224
- totalSnpEffect_table <- NULL
225
- if (formula$id != "" && length(ints) > 0 && any(grepl(formula$id, ints, fixed = T))) {
226
- snp_vars <- grep(formula$id, row.names(type3_table), value = T, fixed = T)
227
- formula_reduce <- update(formula$formula, paste0("~.",paste0("-", snp_vars, collapse = "")))
228
- res_reduce <- lm(formula_reduce, data = dat)
229
- totalSnpEffect_table <- as.matrix(anova(res, res_reduce, test = "F"))[2, c("Df","F","Pr(>F)"), drop = F]
230
- row.names(totalSnpEffect_table) <- "Total"
231
- totalSnpEffect_table[,"Df"] <- totalSnpEffect_table[,"Df"] * -1
232
- totalSnpEffect_table[,"F"] <- round(totalSnpEffect_table[,"F"], 3)
233
- totalSnpEffect_table[,"Pr(>F)"] <- signif(totalSnpEffect_table[,"Pr(>F)"], 4)
234
- totalSnpEffect_table <- cbind("Variable" = row.names(totalSnpEffect_table), totalSnpEffect_table, "VariableIDs" = paste(snp_vars, collapse = ";"))
235
- totalSnpEffect_table <- list("header" = colnames(totalSnpEffect_table), "rows" = totalSnpEffect_table)
236
- }
237
-
238
- # other summary stats table
239
- other_table <- list(
240
- "header" = c("Residual standard error", "Residual degrees of freedom", "R-squared", "Adjusted R-squared", "F-statistic", "P-value"),
241
- "rows" = c(round(res_summ$sigma, 2), round(res$df.residual, 0), round(res_summ$r.squared, 5), round(res_summ$adj.r.squared, 5))
242
- )
243
- # check for F-statistic
244
- # this statistic is not computed if variables have no variability
245
- if ("fstatistic" %in% names(res_summ)) {
246
- other_table[["rows"]] <- c(other_table[["rows"]], round(unname(res_summ$fstatistic[1]), 2), signif(unname(pf(res_summ$fstatistic[1], res_summ$fstatistic[2], res_summ$fstatistic[3], lower.tail = F)), 4))
247
- } else {
248
- other_table[["rows"]] <- c(other_table[["rows"]], NA, NA)
249
- }
250
-
251
- # export the results tables
252
- out <- list("res" = res, "sampleSize" = unbox(sampleSize), "residuals" = residuals_table, "coefficients" = coefficients_table, "type3" = type3_table, "other" = other_table)
253
- if (!is.null(totalSnpEffect_table)) out[["totalSnpEffect"]] <- totalSnpEffect_table
254
- return(out)
255
- }
256
-
257
- # logistic regression
258
- logisticRegression <- function(formula, dat) {
259
- res <- glm(formula$formula, family = binomial(link='logit'), data = dat)
260
- sampleSize <- res$df.residual + length(res$coefficients[!is.na(res$coefficients)])
261
- res_summ <- summary(res)
262
-
263
- # residuals table
264
- residuals_table <- list("header" = c("Minimum","1st quartile","Median","3rd quartile","Maximum"), "rows" = unname(round(fivenum(res_summ$deviance.resid),3)))
265
-
266
- # coefficients table
267
- coefficients_table <- build_coef_table(res_summ)
268
- colnames(coefficients_table)[1] <- "Log odds"
269
- # compute odds ratio
270
- coefficients_table <- cbind("Odds ratio" = exp(coef(res)), coefficients_table)
271
- # compute confidence intervals of odds ratios
272
- ci <- tryCatch(
273
- exp(suppressMessages(confint(res))),
274
- error = function(err) {
275
- # computation of confidence intervals might fail
276
- # because of small sample sizes of coefficients.
277
- # use custom confidence interval computation instead
278
- low <- coefficients_table[,"Log odds"] - (1.96 * coefficients_table[,"Std. Error"])
279
- up <- coefficients_table[,"Log odds"] + (1.96 * coefficients_table[,"Std. Error"])
280
- ci <- cbind(low,up)
281
- row.names(ci) <- row.names(coefficients_table)
282
- ci <- exp(ci)
283
- return(ci)
284
- }
285
- )
286
- colnames(ci) <- c("95% CI (low)","95% CI (high)")
287
- coefficients_table <- cbind(coefficients_table, ci)
288
- coefficients_table <- coefficients_table[,c(1,6,7,2,3,4,5)]
289
-
290
- # type III statistics table
291
- type3_table <- as.matrix(drop1(res, scope = ~., test = "LRT"))
292
- # if there are interactions, then set the results
293
- # of the main effects to "NA" because these type III
294
- # stats cannot be accurately estimated
295
- ints <- grep(":", row.names(type3_table), value = T, fixed = T)
296
- intsIds <- unique(unlist(strsplit(ints, ":")))
297
- type3_table[intsIds,] <- NA
298
- # round values
299
- type3_table[,c("Deviance","AIC")] <- round(type3_table[,c("Deviance","AIC")], 1)
300
- type3_table[,"LRT"] <- round(type3_table[,"LRT"], 3)
301
- type3_table[,"Pr(>Chi)"] <- signif(type3_table[,"Pr(>Chi)"], 4)
302
-
303
- # total SNP effect table
304
- # if a snplocus snp has an interaction, then compute
305
- # the total effect of the snp on the model
306
- # by determining the combined effect of removing the
307
- # snp and all of its interactions from the model
308
- totalSnpEffect_table <- NULL
309
- if (formula$id != "" && length(ints) > 0 && any(grepl(formula$id, ints, fixed = T))) {
310
- snp_vars <- grep(formula$id, row.names(type3_table), value = T, fixed = T)
311
- formula_reduce <- update(formula$formula, paste0("~.",paste0("-", snp_vars, collapse = "")))
312
- res_reduce <- glm(formula_reduce, family = binomial(link='logit'), data = dat)
313
- totalSnpEffect_table <- as.matrix(lrtest(res, res_reduce))[2, 3:5, drop = F]
314
- row.names(totalSnpEffect_table) <- "Total"
315
- totalSnpEffect_table[,"Df"] <- totalSnpEffect_table[,"Df"] * -1
316
- totalSnpEffect_table[,"Chisq"] <- round(totalSnpEffect_table[,"Chisq"], 3)
317
- totalSnpEffect_table[,"Pr(>Chisq)"] <- signif(totalSnpEffect_table[,"Pr(>Chisq)"], 4)
318
- totalSnpEffect_table <- cbind("Variable" = row.names(totalSnpEffect_table), totalSnpEffect_table, "VariableIDs" = paste(snp_vars, collapse = ";"))
319
- totalSnpEffect_table <- list("header" = colnames(totalSnpEffect_table), "rows" = totalSnpEffect_table)
320
- }
321
-
322
- # other summary stats table
323
- other_table <- list(
324
- "header" = c("Dispersion parameter", "Null deviance", "Null deviance degrees of freedom", "Residual deviance", "Residual deviance degrees of freedom", "AIC"),
325
- "rows" = round(c(res_summ$dispersion, res_summ$null.deviance, res_summ$df.null, res_summ$deviance, res_summ$df.residual, res_summ$aic), 1)
326
- )
327
-
328
- # export the results tables
329
- out <- list("res" = res, "sampleSize" = unbox(sampleSize), "residuals" = residuals_table, "coefficients" = coefficients_table, "type3" = type3_table, "other" = other_table)
330
- if (!is.null(totalSnpEffect_table)) out[["totalSnpEffect"]] <- totalSnpEffect_table
331
- return(out)
332
- }
333
-
334
- # cox regression
335
- coxRegression <- function(formula, dat) {
336
- res <- coxph(formula$formula, data = dat, model = T)
337
-
338
- # extract summary object
339
- res_summ <- summary(res)
340
-
341
- # numbers of samples and events
342
- sampleSize <- res_summ$n
343
- eventCnt <- res_summ$nevent
344
-
345
- # coefficients table
346
- coefficients_table <- res_summ$coefficients
347
- # add in confidence intervals
348
- confint_table <- res_summ$conf.int
349
- coefficients_table <- cbind(coefficients_table, confint_table[, c("lower .95", "upper .95"), drop = F])
350
- # rename and reorder columns
351
- colnames(coefficients_table)[colnames(coefficients_table) == "coef"] <- "Beta"
352
- colnames(coefficients_table)[colnames(coefficients_table) == "exp(coef)"] <- "HR"
353
- colnames(coefficients_table)[colnames(coefficients_table) == "se(coef)"] <- "Std. Error"
354
- colnames(coefficients_table)[colnames(coefficients_table) == "lower .95"] <- "95% CI (low)"
355
- colnames(coefficients_table)[colnames(coefficients_table) == "upper .95"] <- "95% CI (high)"
356
- coefficients_table <- coefficients_table[, c(2,6,7,1,3,4,5), drop = F]
357
-
358
- # type III statistics table
359
- # do not use drop1() to compute type III stats for cox regression
360
- # because it will use the drop1.default() method and this will
361
- # error out if samples are filtered from the data table
362
- # TODO: may replace with car::Anova() when car package becomes available on servers
363
- vlst <- attr(res$terms, "term.labels", exact = T)
364
- type3_table <- matrix(data = NA, nrow = length(vlst), ncol = 5)
365
- row.names(type3_table) <- vlst
366
- for (i in 1:length(vlst)) {
367
- v <- vlst[i]
368
- formula_reduce <- update(formula$formula, paste0("~.-", v))
369
- res_reduce <- coxph(formula_reduce, data = dat, model = T)
370
- lt <- as.matrix(lrtest(res, res_reduce))
371
- type3_table[i,] <- lt[2,]
372
- }
373
- colnames(type3_table) <- colnames(lt)
374
- type3_table <- type3_table[,3:5,drop = F]
375
- type3_table[,"Df"] <- type3_table[,"Df"] * -1 #convert Df diff to Df
376
- # if there are interactions, then set the results
377
- # of the main effects to "NA" because these type III
378
- # stats cannot be accurately estimated
379
- ints <- grep(":", row.names(type3_table), value = T, fixed = T)
380
- intsIds <- unique(unlist(strsplit(ints, ":")))
381
- type3_table[intsIds,] <- NA
382
- # round values
383
- type3_table[,"Chisq"] <- round(type3_table[,"Chisq"], 3)
384
- type3_table[,"Pr(>Chisq)"] <- signif(type3_table[,"Pr(>Chisq)"], 4)
385
-
386
- # total SNP effect table
387
- # if a snplocus snp has an interaction, then compute
388
- # the total effect of the snp on the model
389
- # by determining the combined effect of removing the
390
- # snp and all of its interactions from the model
391
- totalSnpEffect_table <- NULL
392
- if (formula$id != "" && length(ints) > 0 && any(grepl(formula$id, ints, fixed = T))) {
393
- snp_vars <- grep(formula$id, row.names(type3_table), value = T, fixed = T)
394
- formula_reduce <- update(formula$formula, paste0("~.",paste0("-", snp_vars, collapse = "")))
395
- res_reduce <- coxph(formula_reduce, data = dat, model = T)
396
- totalSnpEffect_table <- as.matrix(lrtest(res, res_reduce))[2, 3:5, drop = F]
397
- row.names(totalSnpEffect_table) <- "Total"
398
- totalSnpEffect_table[,"Df"] <- totalSnpEffect_table[,"Df"] * -1
399
- totalSnpEffect_table[,"Chisq"] <- round(totalSnpEffect_table[,"Chisq"], 3)
400
- totalSnpEffect_table[,"Pr(>Chisq)"] <- signif(totalSnpEffect_table[,"Pr(>Chisq)"], 4)
401
- totalSnpEffect_table <- cbind("Variable" = row.names(totalSnpEffect_table), totalSnpEffect_table, "VariableIDs" = paste(snp_vars, collapse = ";"))
402
- totalSnpEffect_table <- list("header" = colnames(totalSnpEffect_table), "rows" = totalSnpEffect_table)
403
- }
404
-
405
- # statistical tests table
406
- tests_table <- rbind(res_summ$logtest, res_summ$waldtest, res_summ$sctest)
407
- colnames(tests_table) <- c("Test statistic", "Df", "P-value")
408
- tests_table[,"Test statistic"] <- round(tests_table[,"Test statistic"], 2)
409
- tests_table[,"Df"] <- round(tests_table[,"Df"], 0)
410
- tests_table[,"P-value"] <- signif(tests_table[,"P-value"], 4)
411
- tests_table <- cbind("Test" = c("Likelihood ratio test", "Wald test", "Score (log rank) test"), tests_table)
412
- tests_table <- list("header" = colnames(tests_table), "rows" = tests_table)
413
-
414
- # other summary stats table
415
- other_table <- list(
416
- "header" = c("Concordance", "Concordance standard error"),
417
- "rows" = c(unname(round(res_summ$concordance["C"], 3)), unname(round(res_summ$concordance["se(C)"], 3)))
418
- )
419
-
420
- # export the results tables
421
- out <- list("res" = res, "sampleSize" = unbox(sampleSize), "eventCnt" = unbox(eventCnt),"coefficients" = coefficients_table, "type3" = type3_table, "tests" = tests_table, "other" = other_table)
422
- if (!is.null(totalSnpEffect_table)) out[["totalSnpEffect"]] <- totalSnpEffect_table
423
- return(out)
424
- }
425
-
426
- # run regression analysis
427
- runRegression <- function(formula, regtype, dat, outcome, cachedir) {
428
- # remove samples with NA values in any variable in the formula
429
- # NOTE: even though regression functions (e.g. lm, glm, etc.)
430
- # perform this filtration by default, this filtration
431
- # should be done before any regression analysis because
432
- # multiple regression models might need to be compared to one
433
- # another (e.g. computation of total snp effect) and there might
434
- # be samples that are NA for variables in one model but
435
- # not in another model.
436
- fdat <- dat[complete.cases(dat[,all.vars(formula$formula)]),]
437
- warns <- vector(mode = "character")
438
- handleWarns <- function(w) {
439
- # handler for warning messages
440
- warns <<- c(warns, conditionMessage(w))
441
- invokeRestart("muffleWarning")
442
- }
443
- if (regtype == "linear") {
444
- results <- withCallingHandlers(linearRegression(formula, fdat), warning = handleWarns)
445
- } else if (regtype == "logistic") {
446
- results <- withCallingHandlers(logisticRegression(formula, fdat), warning = handleWarns)
447
- } else if (regtype == "cox") {
448
- results <- withCallingHandlers(coxRegression(formula, fdat), warning = handleWarns)
449
- } else {
450
- stop("unknown regression type")
451
- }
452
- if ("splineVariables" %in% names(formula)) {
453
- # spline variables present
454
- # plot cubic spline for each spline variable (if applicable)
455
- results$splinePlotFiles <- vector(mode = "character")
456
- splineVariables <- formula$splineVariables
457
- for (r in 1:nrow(splineVariables)) {
458
- splineVariable <- splineVariables[r,]
459
- if (isTRUE(splineVariable$spline$plot)) {
460
- spline_plot_file <- plot_spline(splineVariable, fdat, outcome, results$res, regtype, formula$type, cachedir)
461
- results$splinePlotFiles <- c(results$splinePlotFiles, spline_plot_file)
462
- }
463
- }
464
- }
465
- if (length(warns) > 0) results[["warnings"]] <- warns
466
- results$coefficients <- formatCoefficients(results$coefficients, results$res, input$regressionType, fdat)
467
- results$type3 <- formatType3(results$type3)
468
- out <- list("id" = unbox(formula$id), "data" = results[names(results) != "res"])
469
- if (!is.null(formula$type)) out$type <- unbox(formula$type)
470
- return(out)
471
- }
472
-
473
- # generate cubic spline plot
474
- plot_spline <- function(splineVariable, dat, outcome, res, regtype, formulatype, cachedir) {
475
- # prepare test data table for predicting model outcome values
476
- # columns are all independent variables
477
- # for the spline variable, use regularly spaced data; for continuous variables, use data median; for categorical variables, use reference category
478
- if (regtype == "cox") {
479
- outcomeIds <- c(outcome$timeToEvent$timeId, outcome$timeToEvent$eventId)
480
- } else {
481
- outcomeIds <- outcome$id
482
- }
483
- independentIds <- colnames(dat)[!colnames(dat) %in% outcomeIds]
484
- sampleSize <- ifelse(nrow(dat) > 1000, 1000, nrow(dat))
485
- newdat <- dat[1:sampleSize, independentIds, drop = F]
486
-
487
- # prepare test data table for adjusting model outcome values
488
- # columns are the proportions and effect sizes of non-reference categorical coefficients
489
- newdat2 <- matrix(data = NA, nrow = 0, ncol = 2, dimnames = list(c(),c("prop", "effectSize")))
490
-
491
- # populate the test data tables
492
- for (term in colnames(newdat)) {
493
- if (term == splineVariable$id) {
494
- newdat[,term] <- seq(from = min(dat[,term]), to = max(dat[,term]), length.out = sampleSize)
495
- } else if (is.factor(dat[,term])) {
496
- ref <- levels(dat[,term])[1]
497
- newdat[,term] <- ref
498
- props <- table(dat[,term], exclude = ref)/length(dat[,term])
499
- for (category in names(props)) {
500
- newdat2 <- rbind(newdat2, c(props[category], res$coefficients[paste0(term,category)]))
501
- }
502
- } else {
503
- newdat[,term] <- median(dat[,term])
504
- }
505
- }
506
-
507
- # test model
508
- # predict outcome values of the model using the test data
509
- preddat <- predict(res, newdata = newdat, se.fit = T)
510
- # compute 95% confidence intervals
511
- ci_upr <- preddat$fit + (1.96 * preddat$se.fit)
512
- ci_lwr <- preddat$fit - (1.96 * preddat$se.fit)
513
- preddat_ci <- cbind("fit" = preddat$fit, "lwr" = ci_lwr, "upr" = ci_upr)
514
- # adjust the predicted values
515
- # adjusted predicted values = predicted values + ((prop category A * coef category A) + (prop category B * coef category B) + etc.)
516
- preddat_ci_adj <- preddat_ci + sum(apply(newdat2, 1, prod), na.rm = T)
517
-
518
- # plot data
519
- plotfile <- paste0(cachedir, "splinePlot_", ifelse(is.null(formulatype), "", paste0(formulatype, "_")), createRandString(), ".svg")
520
- svg(filename = plotfile, width = 6.7, height = ifelse(is.null(formulatype),5.25,5.35), pointsize = 20)
521
- par(mar = c(2, 2, ifelse(is.null(formulatype),0.7,1), 5) + 0.1, mgp = c(1, 1, 0))
522
- if (regtype == "linear" | regtype == "logistic") {
523
- if (regtype == "linear") {
524
- # for linear, plot predicted values
525
- pointtype <- 16
526
- pointsize <- 0.3
527
- pointalpha <- 0.8
528
- ylab <- outcome$name
529
- } else {
530
- # for logistic, plot predicted probabilities
531
- preddat_ci_adj <- 1/(1+exp(-preddat_ci_adj))
532
- pointtype <- 124
533
- pointsize <- 0.7
534
- pointalpha <- 0.5
535
- ylab <- paste0("Pr(", outcome$name, " ", outcome$categories$nonref, ")")
536
- }
537
- # use only finite predicted data
538
- toKeep <- rowSums(!is.finite(preddat_ci_adj)) == 0
539
- preddat_ci_adj <- preddat_ci_adj[toKeep,]
540
- newdat <- newdat[toKeep,,drop = F]
541
- # first plot actual (not predicted) data
542
- # predicted data will be overlayed later
543
- plot(dat[,splineVariable$id],
544
- dat[,outcome$id],
545
- ann = F,
546
- xaxt = "n",
547
- yaxt = "n",
548
- type = "n"
549
- )
550
- points(dat[,splineVariable$id],
551
- dat[,outcome$id],
552
- pch = pointtype,
553
- cex = pointsize,
554
- col = adjustcolor("#ce768e", alpha.f = pointalpha)
555
- )
556
- } else if (regtype == "cox") {
557
- # for cox, plot hazard ratios
558
- preddat_ci_adj <- exp(preddat_ci_adj)
559
- # use only finite predicted data
560
- toKeep <- rowSums(!is.finite(preddat_ci_adj)) == 0
561
- preddat_ci_adj <- preddat_ci_adj[toKeep,]
562
- newdat <- newdat[toKeep,,drop = F]
563
- ylab <- "Hazard Ratio"
564
- # plot only predicted data
565
- # do not also plot actual data (like for linear/logistic) because cox outcome data is time-to-event and will not be comparable to the predicted hazard ratios
566
- plot(newdat[,splineVariable$id],
567
- preddat_ci_adj[,"fit"],
568
- ylim = c(min(preddat_ci_adj[,"lwr"]), max(preddat_ci_adj[,"upr"])),
569
- ann = F,
570
- xaxt = "n",
571
- yaxt = "n",
572
- type = "n"
573
- )
574
- } else {
575
- stop("unrecognized regression type")
576
- }
577
-
578
- # axes
579
- axis(1, cex.axis = 0.5, mgp = c(0, 0.2, 0))
580
- axis(2, cex.axis = 0.5, mgp = c(0, 0.5, 0))
581
-
582
- # titles
583
- title(xlab = splineVariable$name, line = 1, cex.lab = 0.5)
584
- title(ylab = ylab, line = 1.5, cex.lab = 0.5)
585
- if (!is.null(formulatype)) {
586
- if (formulatype == "univariate") plotTitle <- "Univariate"
587
- else if (formulatype == "multivariate") plotTitle <- "Multivariable-adjusted"
588
- else stop("unexpected formula type")
589
- title(main = plotTitle, cex.main = 0.5, line = 0.45)
590
- }
591
-
592
- # knots
593
- abline(v = splineVariable$spline$knots[[1]],
594
- col = "grey60",
595
- lty = 2,
596
- lwd = 1,
597
- xpd = F
598
- )
599
-
600
- # confidence intervals of regression line
601
- polygon(x = c(newdat[,splineVariable$id], rev(newdat[,splineVariable$id])),
602
- y = c(preddat_ci_adj[,"lwr"], rev(preddat_ci_adj[,"upr"])),
603
- col = adjustcolor("grey", alpha.f = 0.8),
604
- border = NA
605
- )
606
-
607
- # regression line
608
- lines(newdat[,splineVariable$id],
609
- preddat_ci_adj[,"fit"],
610
- col = "blue",
611
- lwd = 1.7
612
- )
613
-
614
- # legend for lines
615
- legend("topright",
616
- inset = c(-0.39, 0),
617
- cex = 0.5,
618
- legend = c("knots", "cubic spline fit", "95% CI"),
619
- text.col = "white",
620
- lty = c(2, 1, NA),
621
- col = c("grey60", "blue", NA),
622
- xpd = TRUE
623
- )
624
-
625
- # legend for ci
626
- legend("topright",
627
- inset = c(-0.39, 0),
628
- cex = 0.5,
629
- legend = c("knots", "cubic spline fit", "95% CI"),
630
- text.col = "black",
631
- bty = "n",
632
- fill = c(NA, NA, adjustcolor("grey", alpha.f = 0.8)),
633
- border = c(NA, NA, NA),
634
- xpd = TRUE
635
- )
636
- dev.off()
637
- return(plotfile)
638
- }
639
-
640
- createRandString <- function() {
641
- digits = 0:9
642
- v = c(sample(LETTERS, 4, replace = TRUE),
643
- sample(digits, 4, replace = TRUE),
644
- sample(LETTERS, 1, replace = TRUE))
645
- return(paste0(v, collapse = ""))
646
- }
647
-
648
- # build coefficients table
649
- build_coef_table <- function(res_summ) {
650
- coefficients_table <- res_summ$coefficients
651
- if (any(aliased <- res_summ$aliased)) {
652
- # to keep coefficients with "NA" estimates in the table
653
- cn <- names(aliased)
654
- coefficients_table <- matrix(NA, length(aliased), 4, dimnames = list(cn, colnames(coefficients_table)))
655
- coefficients_table[!aliased, ] <- res_summ$coefficients
656
- }
657
- return(coefficients_table)
658
- }
659
-
660
- # reformat the coefficients table
661
- formatCoefficients <- function(coefficients_table, res, regtype, dat) {
662
- # round all columns to 4 significant digits
663
- coefficients_table <- signif(coefficients_table, 4)
664
- # add variable and category columns
665
- if (regtype == "cox") {
666
- vCol <- vector(mode = "character")
667
- cCol <- vector(mode = "character")
668
- } else {
669
- vCol <- c("Intercept")
670
- cCol <- c("")
671
- }
672
- vlst <- attr(res$terms, "term.labels", exact = T)
673
- for (v in vlst) {
674
- if (grepl(":", v, fixed = T)) {
675
- # interacting variables
676
- v1 <- strsplit(v, split = ":", fixed = T)[[1]][1]
677
- v2 <- strsplit(v, split = ":", fixed = T)[[1]][2]
678
- clst1 <- ""
679
- clst2 <- ""
680
- if (v1 %in% names(res$xlevels)) {
681
- clst1 <- res$xlevels[[v1]][-1] # extract categories (without reference category)
682
- }
683
- if (v2 %in% names(res$xlevels)) {
684
- clst2 <- res$xlevels[[v2]][-1] # extract categories (without reference category)
685
- }
686
- for (c2 in clst2) {
687
- for (c1 in clst1) {
688
- cCol <- c(cCol, paste(c1, c2, sep = ":"))
689
- vCol <- c(vCol, v)
690
- }
691
- }
692
- } else {
693
- # single variable
694
- if (grepl("^cubic_spline", v)) {
695
- # spline variable
696
- vid <- sub("cubic_spline\\(", "", sub(", c\\(.*", "", v))
697
- clst <- paste("spline function", 1:ncol(res$model[,v])) # determine the number of spline functions
698
- for (c in clst) {
699
- vCol <- c(vCol, vid)
700
- cCol <- c(cCol, c)
701
- }
702
- } else if (v %in% names(res$xlevels)) {
703
- # categorical variable
704
- clst <- res$xlevels[[v]][-1] # extract non-ref categories
705
- for (c in clst) {
706
- vCol <- c(vCol, v)
707
- cCol <- c(cCol, c)
708
- }
709
- } else {
710
- # continuous variable
711
- vCol <- c(vCol, v)
712
- cCol <- c(cCol, "")
713
- }
714
- }
715
- }
716
-
717
- coefficients_table <- cbind("Variable" = vCol, "Category" = cCol, coefficients_table)
718
-
719
- # extract columns of interest
720
- if (regtype == "linear") {
721
- coefficients_table <- coefficients_table[, c("Variable", "Category", "Beta", "95% CI (low)", "95% CI (high)", "Pr(>|t|)"), drop = F]
722
- } else if (regtype == "logistic") {
723
- coefficients_table <- coefficients_table[, c("Variable", "Category", "Odds ratio", "95% CI (low)", "95% CI (high)", "Pr(>|z|)"), drop = F]
724
- } else if (regtype == "cox") {
725
- # cox regression
726
- # report sample size and event counts of coefficients
727
- sCol <- vector(mode = "character")
728
- eCol <- vector(mode = "character")
729
- for (i in 1:length(vCol)) {
730
- v <- vCol[i]
731
- c <- cCol[i]
732
- if (v %in% names(res$xlevels)) {
733
- # categorical variable
734
- # determine sample size and event count of both ref and non-ref categories
735
- # values will be stored in separate columns in the format "ref/nonref"
736
- ref <- res$xlevels[[v]][1]
737
- m <- table(dat[,"outcome_event"], dat[,v])
738
- samplesize_ref <- sum(m[,ref])
739
- samplesize_c <- sum(m[,c])
740
- sCol <- c(sCol, paste(samplesize_ref, samplesize_c, sep = "/"))
741
- eventcnt_ref <- m["1",ref]
742
- eventcnt_c <- m["1",c]
743
- eCol <- c(eCol, paste(eventcnt_ref, eventcnt_c, sep = "/"))
744
- } else {
745
- # continuous or spline variable
746
- # set sample size and event count to NA
747
- sCol <- c(sCol, NA)
748
- eCol <- c(eCol, NA)
749
- }
750
- }
751
- coefficients_table <- cbind(coefficients_table, "Sample Size (ref/non-ref)" = sCol, "Events (ref/non-ref)" = eCol)
752
- coefficients_table <- coefficients_table[, c("Variable", "Category", "Sample Size (ref/non-ref)", "Events (ref/non-ref)", "HR", "95% CI (low)", "95% CI (high)", "Pr(>|z|)"), drop = F]
753
- } else {
754
- stop("regression type is not recognized")
755
- }
756
-
757
- colnames(coefficients_table)[ncol(coefficients_table)] <- "P" # p-value column
758
- coefficients_table <- list("header" = colnames(coefficients_table), "rows" = coefficients_table)
759
- return(coefficients_table)
760
- }
761
-
762
- # reformat the type III statistics table
763
- formatType3 <- function(type3_table) {
764
- # add a variable column
765
- type3_table <- cbind("Variable" = sub("cubic_spline\\(", "", sub(", c\\(.*", "", row.names(type3_table))), type3_table)
766
- type3_table <- list("header" = colnames(type3_table), "rows" = type3_table)
767
- return(type3_table)
768
- }
769
-
770
- # parse results from univariate and multivariate analyses
771
- parseUniMultiResults <- function(reg_results, regtype) {
772
- multiCoefficients <- NULL
773
- uniCoefficients <- NULL
774
- splinePlotFiles <- NULL
775
- for (res in reg_results) {
776
- # parse coefficients
777
- coefs <- res$data$coefficients$rows
778
- # remove intercept row because cannot merge together intercepts
779
- # from different univariate analyses
780
- coefs <- coefs[row.names(coefs) != "(Intercept)", ,drop = F]
781
- if (res$type == "multivariate") {
782
- multiCoefficients <- coefs
783
- } else if (res$type == "univariate") {
784
- if (is.null(uniCoefficients)) {
785
- uniCoefficients <- coefs
786
- } else {
787
- uniCoefficients <- rbind(uniCoefficients, coefs)
788
- }
789
- } else {
790
- stop ("results type not recognized")
791
- }
792
- # parse spline plots
793
- splinePlotFiles <- c(splinePlotFiles, res$data$splinePlotFiles)
794
- }
795
- # prepare separate univariate and multivariate coefficients tables
796
- uniCoefficients_table <- list("header" = colnames(uniCoefficients), "rows" = uniCoefficients)
797
- multiCoefficients_table <- list("header" = colnames(multiCoefficients), "rows" = multiCoefficients)
798
- # return parsed results containing the separate coefficients tables
799
- reg_results_parsed <- list()
800
- reg_results_parsed[[1]] <- list("id" = res$id, "data" = list("sampleSize" = res$data$sampleSize, "coefficients_uni" = uniCoefficients_table, "coefficients_multi" = multiCoefficients_table))
801
- if (regtype == "cox") reg_results_parsed[[1]]$data$eventCnt = res$data$eventCnt
802
- if (!is.null(splinePlotFiles)) reg_results_parsed[[1]]$data$splinePlotFiles = splinePlotFiles
803
- return(reg_results_parsed)
804
- }