@sjcrh/proteinpaint-server 2.105.0 → 2.107.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/utils/gsea.py CHANGED
@@ -1,129 +1,145 @@
1
- # cat ~/sjpp/test.txt | python gsea.py
1
+ # Test syntax: cat ~/sjpp/test.txt | time python gsea.py
2
+ # test.txt contains the JSON string autogenerated by the commented-out Node.js code.
3
+ import blitzgsea as blitz
4
+ import json
5
+ import time
6
+ import sys
7
+ import sqlite3
8
+ import os
9
+ import numpy as np
10
+ import pandas as pd
2
11
 
3
- import blitzgsea as blitz
4
- import json
5
- import time
6
- import sys
7
- import sqlite3
8
- import os
9
- import numpy as np
10
- import pandas as pd
11
-
12
+ # Helper function to extract gene symbols from a dictionary
12
13
  def extract_symbols(x):
13
- return x['symbol']
14
-
15
- def extract_plot_data(signature, geneset, library, result, center=True):
16
- signature = signature.copy()
17
- signature.columns = ["i","v"]
18
- signature = signature.sort_values("v", ascending=False).set_index("i")
19
- signature = signature[~signature.index.duplicated(keep='first')]
20
- if center:
21
- signature.loc[:,"v"] -= np.mean(signature.loc[:,"v"])
22
- signature_map = {}
23
- for i,h in enumerate(signature.index):
24
- signature_map[h] = i
25
-
26
- gs = set(library[geneset])
27
- hits = [i for i,x in enumerate(signature.index) if x in gs]
28
-
29
- running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:,0])), signature_map, gs)
30
- running_sum = list(running_sum)
31
- nn = np.where(np.abs(running_sum)==np.max(np.abs(running_sum)))[0][0]
32
- #print ("nn:",nn)
33
- #print ("running_sum:",running_sum)
34
- #print ("es:",es)
35
- running_sum_str=[str(elem) for elem in running_sum]
36
- print ('result: {"nn":'+str(nn)+',"running_sum":"'+",".join(running_sum_str)+'","es":'+str(es)+'}')
14
+ return x['symbol'] # Return the 'symbol' field from the dictionary
37
15
 
38
-
39
- # Main function
16
+ # Main function
40
17
  try:
41
- # Try to read a single character from stdin without blocking
18
+ # Check if there is input from stdin
42
19
  if sys.stdin.read(1):
43
- # Read from stdin
20
+ # Read each line from stdin
44
21
  for line in sys.stdin:
45
- # Process each line
22
+ # Parse the JSON input
46
23
  json_object = json.loads(line)
47
- cachedir=json_object['cachedir']
48
- genes=json_object['genes']
49
- fold_change=json_object['fold_change']
50
- table_name=json_object['geneset_group']
51
- filter_non_coding_genes=json_object['filter_non_coding_genes']
52
- df = {'Genes': genes, 'fold_change': fold_change}
53
- signature=pd.DataFrame(df)
54
- db=json_object['db']
55
- # Connect to the SQLite database
56
- conn = sqlite3.connect(db)
57
-
58
- # Create a cursor object using the cursor() method
59
- cursor = conn.cursor()
60
-
61
- # SQL query to select all data from the table
62
- query = f"select id from terms where parent_id='" + table_name + "'"
63
- # Execute the SQL query
64
- cursor.execute(query)
65
- if filter_non_coding_genes == True:
66
- # SQL query to code all the protein coding genes
67
- coding_genes_query = f"select * from codingGenes"
68
- genedb = json_object['genedb']
69
- gene_conn = sqlite3.connect(genedb)
70
- gene_cursor = gene_conn.cursor()
71
- gene_cursor.execute(coding_genes_query)
72
- coding_genes_list=gene_cursor.fetchall()
73
- coding_genes_list=list(map(lambda x: x[0],coding_genes_list))
74
- signature=signature[signature['Genes'].isin(coding_genes_list)]
75
-
76
- # Fetch all rows from the executed SQL query
77
- rows = cursor.fetchall()
78
-
79
- start_loop_time = time.time()
80
- msigdb_library={}
81
- # Iterate over the rows and print them
82
- for row in rows:
83
- #print(row[0])
84
- query2=f"select genes from term2genes where id='" + row[0] + "'"
85
- cursor.execute(query2)
86
- rows2 = cursor.fetchall()
87
- row3=json.loads(rows2[0][0])
88
- msigdb_library[row[0]] = list(map(extract_symbols,row3))
24
+ cachedir = json_object['cachedir'] # Get the cache directory from the JSON object
25
+ genes = json_object['genes'] # Get the genes from the JSON object
26
+ fold_change = json_object['fold_change'] # Get the fold change values from the JSON object
27
+ num_permutations = json_object['num_permutations'] # Number of permutations for GSEA analysis
28
+ table_name = json_object['geneset_group'] # Get the gene set group from the JSON object
29
+ filter_non_coding_genes = json_object['filter_non_coding_genes'] # Get the filter_non_coding_genes flag from the JSON object
30
+ db = json_object['db'] # Get the database path from the JSON object
31
+ # Create a DataFrame for the signature
32
+ df = {'Genes': genes, 'fold_change': fold_change} # Create a dictionary with genes and fold change
33
+ signature = pd.DataFrame(df) # Convert the dictionary to a DataFrame
89
34
 
90
- #print ("msigdb_library:",msigdb_library)
91
- # Close the cursor and connection to the database
92
- cursor.close()
93
- conn.close()
94
- stop_loop_time = time.time()
95
- execution_time = stop_loop_time - start_loop_time
96
- print(f"Execution time: {execution_time} seconds")
97
- try: # Extract ES data to be plotted on client side
98
- geneset_name=json_object['geneset_name'] # Checks if geneset_name is present, if yes it indicates the server request is for generating the image. It retrieves the result.pkl file and generates the image without having to recompute gsea again.
99
- pickle_file=json_object['pickle_file']
100
- result = pd.read_pickle(os.path.join(cachedir,pickle_file))
101
- fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True)
102
- random_num = np.random.rand()
103
- png_filename = "gsea_plot_" + str(random_num) + ".png"
104
- fig.savefig(os.path.join(cachedir,png_filename), bbox_inches='tight')
105
- #extract_plot_data(signature, geneset_name, msigdb_library, result) # This returns raw data to client side, not currently used
106
- print ('image: {"image_file":"' + png_filename + '"}')
107
- except KeyError: #Initial GSEA calculation, result saved to a result.pkl pickle file
108
- # run enrichment analysis
109
- start_gsea_time = time.time()
110
- if __name__ == "__main__":
111
- result = blitz.gsea(signature, msigdb_library).T
112
- random_num = np.random.rand()
113
- pickle_filename="gsea_result_"+ str(random_num) +".pkl"
114
- result.to_pickle(os.path.join(cachedir,pickle_filename))
115
- gsea_str='{"data":' + result.to_json() + '}'
116
- pickle_str='{"pickle_file":"' + pickle_filename + '"}'
117
- #print ("pickle_file:",pickle_str)
118
- gsea_dict = json.loads(gsea_str)
119
- pickle_dict = json.loads(pickle_str)
120
- result_dict = {**gsea_dict, **pickle_dict}
121
- print ("result:",json.dumps(result_dict))
122
- stop_gsea_time = time.time()
123
- gsea_time = stop_gsea_time - start_gsea_time
124
- print (f"GSEA time: {gsea_time} seconds")
35
+ # Connect to the SQLite database
36
+ conn = sqlite3.connect(db) # Connect to the SQLite database
37
+ cursor = conn.cursor() # Create a cursor object
38
+
39
+ msigdb_library = {} # Initialize an empty dictionary for the gene set library
40
+ if table_name == "REACTOME--blitzgsea": # Parse from blitzgsea reactome library
41
+ msigdb_library = blitz.enrichr.get_library("Reactome_2022")
42
+ elif table_name == "KEGG--blitzgsea": # Parse from blitzgsea KEGG library
43
+ msigdb_library = blitz.enrichr.get_library("KEGG_2021_Human")
44
+ elif table_name == "WikiPathways--blitzgsea": # Parse from blitzgsea WikiPathways library
45
+ msigdb_library = blitz.enrichr.get_library("WikiPathways_2019_Human")
46
+ else: # Use geneset groups from msigdb
47
+ # Query to get gene set IDs
48
+ query = f"SELECT id FROM terms WHERE parent_id='{table_name}'" # SQL query to get gene set IDs
49
+ cursor.execute(query) # Execute the query
50
+
51
+ # Fetch all gene set IDs
52
+ rows = cursor.fetchall() # Fetch all rows from the executed query
53
+
54
+ start_loop_time = time.time() # Record the start time of the loop
55
+
56
+ # Iterate over gene set IDs and fetch corresponding genes
57
+ for row in rows:
58
+ query2 = f"SELECT genes FROM term2genes WHERE id='{row[0]}'" # SQL query to get genes for a gene set ID
59
+ cursor.execute(query2) # Execute the query
60
+ rows2 = cursor.fetchall() # Fetch all rows from the executed query
61
+ row3 = json.loads(rows2[0][0]) # Parse the JSON data
62
+ msigdb_library[row[0]] = list(set(map(extract_symbols, row3))) # Extract gene symbols and add them to the library; set() keeps only unique genes
63
+ #print ("msigdb_library:",msigdb_library)
64
+
65
+ # Close the cursor and connection to the database
66
+ cursor.close() # Close the cursor
67
+ conn.close() # Close the connection
68
+
69
+ stop_loop_time = time.time() # Record the stop time of the loop
70
+ execution_time = stop_loop_time - start_loop_time # Calculate the execution time
71
+ print(f"Execution time: {execution_time} seconds") # Print the execution time
72
+
73
+ # Filter out non-coding genes if specified
74
+ if filter_non_coding_genes:
75
+ coding_genes_query = "SELECT * FROM codingGenes" # SQL query to get coding genes
76
+ genedb = json_object['genedb'] # Get the gene database path from the JSON object
77
+ gene_conn = sqlite3.connect(genedb) # Connect to the gene database
78
+ gene_cursor = gene_conn.cursor() # Create a cursor object for the gene database
79
+ gene_cursor.execute(coding_genes_query) # Execute the query to get coding genes
80
+ coding_genes_list = gene_cursor.fetchall() # Fetch all coding genes
81
+ coding_genes_list = list(map(lambda x: x[0], coding_genes_list)) # Extract the gene symbols
82
+ signature = signature[signature['Genes'].isin(coding_genes_list)] # Filter the signature to include only coding genes
125
83
 
84
+ try:
85
+ # Check if geneset_name and pickle_file are present for generating the plot
86
+ geneset_name = json_object['geneset_name'] # Get the gene set name from the JSON object
87
+ pickle_file = json_object['pickle_file'] # Get the pickle file name from the JSON object
88
+ if os.path.isfile(os.path.join(cachedir, pickle_file)): # Check if the pickle file exists, as it may not be on the same server that ran the original GSEA computation
89
+ result = pd.read_pickle(os.path.join(cachedir, pickle_file)) # Load the result from the pickle file
90
+ fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True) # Generate the running sum plot
91
+ else: # If pickle file is not found, redo the GSEA computation from scratch
92
+ result = blitz.gsea(signature, msigdb_library, permutations=num_permutations).T # Perform GSEA computation and transpose the result
93
+ fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True) # Generate the running sum plot
94
+ result.to_pickle(os.path.join(cachedir, pickle_file)) # Save the result to a pickle file with the same name
95
+ random_num = np.random.rand() # Generate a random number for unique png filename
96
+ png_filename = f"gsea_plot_{random_num}.png" # Create a filename for the plot
97
+ fig.savefig(os.path.join(cachedir, png_filename), bbox_inches='tight') # Save the plot as a PNG file
98
+ print(f'image: {{"image_file": "{png_filename}"}}') # Print the image file path in JSON format
99
+ except KeyError:
100
+ # Initial GSEA calculation and save the result to a pickle file
101
+ start_gsea_time = time.time() # Record the start time of GSEA
102
+ if __name__ == "__main__":
103
+ result = blitz.gsea(signature, msigdb_library, permutations=num_permutations).T # Perform GSEA computation and transpose the result
104
+ random_num = np.random.rand() # Generate a random number for unique pickle filename
105
+ pickle_filename = f"gsea_result_{random_num}.pkl" # Create a filename for the pickle file
106
+ result.to_pickle(os.path.join(cachedir, pickle_filename)) # Save the result to the pickle file
107
+ gsea_str = f'{{"data": {result.to_json()}}}' # Convert the result to JSON format
108
+ pickle_str = f'{{"pickle_file": "{pickle_filename}"}}' # Create a JSON string for the pickle file
109
+ gsea_dict = json.loads(gsea_str) # Parse the JSON string
110
+ pickle_dict = json.loads(pickle_str) # Parse the JSON string
111
+ result_dict = {**gsea_dict, **pickle_dict} # Merge the dictionaries
112
+ print(f"result: {json.dumps(result_dict)}") # Print the result in JSON format
113
+ stop_gsea_time = time.time() # Record the stop time of GSEA
114
+ gsea_time = stop_gsea_time - start_gsea_time # Calculate the GSEA execution time
115
+ print(f"GSEA time: {gsea_time} seconds") # Print the GSEA execution time
126
116
  else:
127
- pass
117
+ pass # Do nothing if there is no input from stdin
128
118
  except (EOFError, IOError):
129
- pass
119
+ pass # Handle EOFError and IOError exceptions gracefully
120
+
121
+ # Function to extract plot data for GSEA visualization (NOT currently used, but will be used for generating client-side GSEA plots)
122
+ def extract_plot_data(signature, geneset, library, result, center=True):
123
+ print("signature", signature)
124
+ print("result", result)
125
+ print("geneset", geneset)
126
+ print("library", library)
127
+ signature = signature.copy() # Create a copy of the signature DataFrame
128
+ signature.columns = ["i", "v"] # Rename columns to 'i' and 'v'
129
+ signature = signature.sort_values("v", ascending=False).set_index("i") # Sort by 'v' in descending order and set 'i' as index
130
+ signature = signature[~signature.index.duplicated(keep='first')] # Remove duplicate indices, keeping the first occurrence
131
+
132
+ if center:
133
+ signature.loc[:, "v"] -= np.mean(signature.loc[:, "v"]) # Center the signature values by subtracting the mean
134
+
135
+ signature_map = {h: i for i, h in enumerate(signature.index)} # Create a mapping of signature indices
136
+
137
+ gs = set(library[geneset]) # Get the gene set from the library
138
+ hits = [i for i, x in enumerate(signature.index) if x in gs] # Find the indices of hits in the signature
139
+
140
+ running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:, 0])), signature_map, gs) # Compute running sum and enrichment score
141
+ running_sum = list(running_sum) # Convert running sum to a list
142
+ nn = np.where(np.abs(running_sum) == np.max(np.abs(running_sum)))[0][0] # Find the index of the maximum absolute running sum
143
+
144
+ running_sum_str = [str(elem) for elem in running_sum] # Convert running sum elements to strings
145
+ print(f'result: {{"nn": {nn}, "running_sum": "{",".join(running_sum_str)}", "es": {es}}}') # Print the result in JSON format
package/utils/burden.R DELETED
@@ -1,366 +0,0 @@
1
-
2
- ##### This code takes about 30 seconds to run. When the user inputs the parameters (sexval to hdmtxval), run this for the original data and 20 for the bootstrapped data at the same time, so we can have the burden and 95% CI in about 30 seconds.
3
-
4
- rm(list=ls())
5
-
6
- suppressPackageStartupMessages(library(dplyr)) ### Qi changed to load plyr first, due to R message: If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
7
- suppressPackageStartupMessages(library(survival))
8
- library(jsonlite)
9
- library(parallel)
10
-
11
- options(warn=-1)
12
-
13
- # stream in json input data
14
- con <- file("stdin", "r")
15
- json <- readLines(con)
16
- close(con)
17
- input <- fromJSON(json)
18
- # handle input arguments
19
- args <- commandArgs(trailingOnly = T)
20
- if (length(args) != 3) stop("Usage: echo <in_json> | Rscript burden.R fitsData survData sampleData > <out_json>")
21
- fitsData <- args[1]
22
- survData <- args[2]
23
- sampleData <- args[3]
24
-
25
- chc_nums <- c(1:32)[-c(2,5,14,20,23,26)] # CHCs. 6 out of 32 CHCs not used.
26
- availCores <- detectCores()
27
- if (is.na(availCores)) stop("cannot detect number of available cores")
28
- cores <- ifelse(length(chc_nums) < availCores, length(chc_nums), availCores)
29
-
30
- #####################
31
- # Functions for our method
32
- # Ref: https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
33
- #####################
34
- # setwd("R:/Biostatistics/Biostatistics2/Qi/QiCommon/St Jude/Nature Review/CHCs/App/Rdata")
35
-
36
- load(fitsData)
37
- load(survData)
38
- # survs[[1]]
39
-
40
- ############################ These are the input values in APP that users can change. Edgar, these should be the same as the APP before, variable names and units. #############
41
- ### Input the primary DX.
42
- # pr=5
43
- # agecut=40 ##### Edgar, this is not a user input parameter, but we input this. This depends on the DX. For example, here for CNS we use 40. For HL DX, it is 55. I will give this value for each DX.
44
-
45
- # # # Input person's values, 18 input X's , plus the input primary DX
46
- # sexval=1 #sex, take value 1 for male and 0 for female
47
- # whiteval=1 # Race white or not, 1 for white, 0 for non-white
48
- # agedxval=6 # age at primary cancer DX
49
-
50
- # #### Chemotherapy
51
- # steroidval=0 #Steroids 1 for yes 0 for no
52
- # bleoval=0; ##Bleomycin
53
- # vcrval=12; #Vincristine
54
- # etopval=2500; #Etoposide
55
- # itmtval=0; #Intrathecal Methotrexate
56
- # cedval=1.6 # Cyclophosphamide, 0.7692 mean 7692.
57
- # cispval=300 #Cisplatin
58
- # doxval=0 #Anthracycline, 3 mean 300 ml/m2
59
- # carboval=0 ## Carboplatin
60
- # hdmtxval=0 ## High-Dose Methotrexate
61
-
62
- # # Radiation
63
- # brainval=5.4 #Brain, 5.4 means 54Gy, 5400 cGy. #####Same for all RT doses.#####
64
- # chestval=2.4 # chest/neck RT, 2.4 for 24 Gy
65
- # heartval=0 # Heart RT
66
- # pelvisval=0 #pelvis RT
67
- # abdval=2.4 # Abdominal RT
68
-
69
- ####################################################################################
70
-
71
- ##### if no TX, use these.
72
- # steroidval=0; bleoval=0; vcrval=0; etopval=0; itmtval=0; cedval=0; cispval=0; brainval=0;
73
- # doxval=0; chestval=0; abdval=0;
74
-
75
- # survs[[1]]
76
-
77
- ############### no TX
78
- # steroidval=0; bleoval=0; vcrval=0; etopval=0; itmtval=0; cedval=0; cispval=0; brainval=0; doxval=0; chestval=0; abdval=0; heartval=0; pelvisval=0; carboval=0; hdmtxval=0
79
-
80
- # Qi made many newdata_chc_sampled so we have 1000 times more donors -- but in different files.
81
- load(sampleData)
82
-
83
- newdata_chc_sampled=do.call("rbind", replicate(6,newdata_chc_sampled, simplify = FALSE))
84
- newdata_chc_sampled$t.startage=seq(5,70,1)
85
- newdata_chc_sampled$t.endage=seq(6,71,1)
86
- ### originally the data were fit to 60 only. using cphfits can get estimates up to 60 only. ==> later I further cut at 50 or so to fit lines, because the original data had their 95th percentile around age 50 or so.
87
- newdata_chc_sampled=newdata_chc_sampled[newdata_chc_sampled$t.endage<=60,]
88
-
89
- # paste(names(input), input, sep = ":", collapse = ",")
90
- pr=input$diaggrp
91
- # agecut was previously hardcoded to 40 above
92
- agecut=c('1'=50, '2'=45, '3'=55, '4'=50, '5'=40, '6'=60, '7'=50, '8'=45, '9'=45, '10'=45, '11'=50 )[pr]
93
- sexval=input$sex
94
- newdata_chc_sampled$sex=input$sex # sexval
95
- newdata_chc_sampled$white=input$white # whiteval
96
- newdata_chc_sampled$agedx2=input$agedx # agedxval
97
- newdata_chc_sampled$steroid=input$steroid # steroidval
98
- newdata_chc_sampled$bleodose=input$bleo # bleoval
99
- newdata_chc_sampled$vcrdose=input$vcr # vcrval
100
- newdata_chc_sampled$etopdose=input$etop # etopval
101
- newdata_chc_sampled$itmtxdose=input$itmt # itmtval
102
- newdata_chc_sampled$ced_sum2=input$ced # cedval
103
- newdata_chc_sampled$cisplatdose=input$cisp # cispval
104
- newdata_chc_sampled$brainrad2=input$brain # brainval
105
- newdata_chc_sampled$doxed_sum2=input$dox # doxval
106
- newdata_chc_sampled$chestrad2=input$chest # chestval
107
- newdata_chc_sampled$abdrad2=input$abd # abdval
108
- newdata_chc_sampled$heartradboth2=input$heart # heartval
109
- newdata_chc_sampled$pelvisrad2=input$pelvis # pelvisval
110
- newdata_chc_sampled$carboplatdose=input$carbo # carboval
111
- newdata_chc_sampled$hdmtxdose=input$hdmtx # hdmtxval
112
-
113
- # newdata_chc_sampled$sex=sexval
114
- # newdata_chc_sampled$white=whiteval
115
- # newdata_chc_sampled$agedx2=agedxval
116
- # newdata_chc_sampled$steroid=steroidval
117
- # newdata_chc_sampled$bleodose=bleoval
118
- # newdata_chc_sampled$vcrdose=vcrval
119
- # newdata_chc_sampled$etopdose=etopval
120
- # newdata_chc_sampled$itmtxdose=itmtval
121
- # newdata_chc_sampled$ced_sum2=cedval
122
- # newdata_chc_sampled$cisplatdose=cispval
123
- # newdata_chc_sampled$brainrad2=brainval
124
- # newdata_chc_sampled$doxed_sum2=doxval
125
- # newdata_chc_sampled$chestrad2=chestval
126
- # newdata_chc_sampled$abdrad2=abdval
127
- # newdata_chc_sampled$heartradboth2=heartval
128
- # newdata_chc_sampled$pelvisrad2=pelvisval
129
- # newdata_chc_sampled$carboplatdose=carboval
130
- # newdata_chc_sampled$hdmtxdose=hdmtxval
131
-
132
- # 1="Acute lymphoblastic leukemia"
133
- # 2="AML"
134
- # 3="Hodgkin lymphoma"
135
- # 4="Non-Hodgkin lymphoma"
136
- # 5="Central nervous system"
137
- # 6="Bone tumor"
138
- # 7="STS"
139
- # 8="Wilms tumor"
140
- # 9="Neuroblastoma"
141
- # 10="Retinoblastoma"
142
- # 11="Germ cell tumor";
143
-
144
- results <- mclapply(X = chc_nums, FUN = function(chc_num) predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected'), mc.cores = cores)
145
- for(n in 1:length(results)){
146
- newdata_chc_sampled = data.frame(newdata_chc_sampled,results[[n]])
147
- }
148
- names(newdata_chc_sampled)[25:50]=paste0("est_chc",chc_nums)
149
- newdata_chc_sampled = newdata_chc_sampled %>%
150
- mutate(sumN_tmp = rowSums(dplyr::select(.,starts_with("est_chc"))))%>%
151
- group_by(mrn) %>%
152
- mutate(sumN_obs = cumsum(sumN_tmp)) %>%
153
- as.data.frame()
154
-
155
- ##Qi: the sumN here depends on all the 26 grouped conditions. So the input X's all matter. That is, if sex is not in a CHC of interest, it would make a difference here on sumN (because sex was on some CHCs), and hence make a difference on the burden of that CHC even though it is not in the cphfits of that CHC.
156
- newdata_chc_sampled = newdata_chc_sampled %>%
157
- group_by(mrn) %>%
158
- mutate(chc20 = sumN_obs[t.endage == 20]) %>%
159
- ungroup() %>%
160
- as.data.frame()
161
- newdata_chc_sampled$death =1
162
- newdata_chc_sampled$obsCHCat20 = newdata_chc_sampled$current.chc
163
-
164
-
165
- # survival probability
166
- # https://stats.stackexchange.com/questions/288393/calculating-survival-probability-per-person-at-time-t-from-cox-ph
167
-
168
-
169
- newdata_chc_sampled$survprob = exp(-predict(survs[[1]],newdata=data.frame(newdata_chc_sampled,primary=pr),type='expected'))
170
-
171
- #----------------------------------------------------------------------------------------------------------------#
172
- ##### Qi added the below "cumprod" for survival by time t. But need to figure out: What is the "survprob" in BCCT formulat? Should it be survival of the segment, or survival by time t? == need to figure out with YY. Discussed, YY confirmed my way: survival prob in the formula is cumulative, not for that segment.
173
- #----------------------------------------------------------------------------------------------------------------#
174
-
175
- #----------------------------------------------------------------------------------------------------------------#
176
- ## If assume "survprob" is over time (not for each segment):
177
- #### why does the survprob not decrease over time? I think this is not the real survival probability over time. Do I have to do multiplication over time thinking survprob is the survival over that segment? Try the multiplication over time.===== I think this makes sense. In the "predict" above, survival=exp(-expected) was for each row (thinking each row is a separate person). While in newdata_chc_sampled, the rows are for the same person, and the survival depends on the previous line, so need to multiply the survival from the previous line.
178
- newdata_chc_sampled$survprob4=cumprod(newdata_chc_sampled$survprob)
179
- newdata_chc_sampled$survprob=newdata_chc_sampled$survprob4
180
-
181
- # plot(c(0,90),c(0,1),type="n")
182
- survspline=smooth.spline(newdata_chc_sampled$t.endage[newdata_chc_sampled$t.endage<=agecut],newdata_chc_sampled$survprob[newdata_chc_sampled$t.endage<=agecut],spar=0.5)
183
- predsurv=predict(survspline,seq(0,95,1))
184
-
185
- # lines(predsurv$x,predsurv$y,col=3,lty=2)
186
-
187
- ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
188
-
189
- ###### get rid of the est_chcXX and "sumN"columns which were used to calculate the survival probability only.
190
- # invisible(dim(newdata_chc_sampled))
191
- newdata_chc_sampled=newdata_chc_sampled[,-grep("est_chc", colnames(newdata_chc_sampled))]
192
- newdata_chc_sampled=newdata_chc_sampled[,-grep("sumN", colnames(newdata_chc_sampled))]
193
- # invisible(dim(newdata_chc_sampled))
194
-
195
- ### Add rows t.startage from 60 to 94, and t.endage from 65 to 95; so we can get burden 60-90.
196
- add=newdata_chc_sampled[newdata_chc_sampled$t.startage<=39,]
197
- # table(add$t.startage)
198
- # table(add$t.endage)
199
- add$t.startage=add$t.startage+55
200
- add$t.endage=add$t.endage+55
201
- # table(add$t.startage)
202
- # table(add$t.endage)
203
- newdata_chc_sampled=rbind(newdata_chc_sampled,add)
204
- newdata_chc_sampled=newdata_chc_sampled[order(newdata_chc_sampled$mrn,newdata_chc_sampled$t.startage),]
205
- ### replace the survival prob with the calculated/extrapolated survival probability
206
- smooth_surv=data.frame(age=predsurv$x,surv=predsurv$y)
207
- smooth_surv$surv[smooth_surv$age<=20]=1
208
- #### survival probability cannot be <0. Handle the years with survival prob<0
209
- #https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310013501 This page had the conditional survival based on age
210
- ## take the last year with a positive survival prob, and its survival prob
211
- positive=smooth_surv[smooth_surv$surv>0,]
212
- alast=tail(positive,1)[1,1]
213
- slast=tail(positive,1)[1,2]
214
- #smooth_surv$alast=alast
215
- #smooth_surv$alast=slast
216
- smooth_surv$interval=smooth_surv$age-alast
217
- ### use the last positive survival prob*0.5^(years from the last age with positive survival probability), assuming the conditional survival prob after that age is 50% each year.
218
- cave <- function(x) slast*0.5^(max(x["interval"],0))
219
- smooth_surv$surv1=apply(smooth_surv,1,cave)
220
- smooth_surv$surv[smooth_surv$surv<0]=smooth_surv$surv1[smooth_surv$surv<0]
221
-
222
- newdata_chc_sampled=merge(newdata_chc_sampled,smooth_surv,by.x="t.endage",by.y="age")
223
- newdata_chc_sampled$survprob=newdata_chc_sampled$surv
224
-
225
- # when there is an interaction in the model, it gave warning. So I would make a new data with all 0's to make it work.
226
- newdata0=matrix(0,nrow=1,ncol=18)
227
- newdata0=as.data.frame(newdata0)
228
- colnames(newdata0)=c("sex","white","agedx2","steroids","bleodose","vcrdose","etopdose","itmtxdose","ced_sum2",
229
- "cisplatdose","brainrad2","doxed_sum2","chestrad2","abdrad2","heartradboth2","pelvisrad2","carboplatdose","hdmtxdose")
230
-
231
-
232
- newdata_chc_sampled1=newdata_chc_sampled ## do this so each run on chc_num loops below starts with the original newdata_chc_sampled1
233
-
234
-
235
- ##########################################################################
236
- person_burden=NULL
237
-
238
- get_estimate <- function(chc_num) { #### Edgar, you may make this in separate runs to save time.
239
- # print(chc_num)
240
- newdata_chc_sampled=newdata_chc_sampled1
241
-
242
- # linear predictor
243
- newdata_chc_sampled$exp_lp = predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='risk',reference="zero")
244
-
245
- # Baseline nelson-aalan est
246
- # https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
247
- j=chc_num
248
- base = basehaz(cphfits2[[chc_num]],centered = F) # this is a cumulative hazard, so need to convert it into non-cumulative version
249
- #centered, if TRUE return data from a predicted survival curve at the mean values of the covariates fit$mean, if FALSE return a prediction for all covariates equal to zero.
250
- #request the hazard for that covariate combination from the survfit() function that is called by basehaz(). https://stats.stackexchange.com/questions/565210/about-getting-baseline-survival-probability-for-a-piecewise-cox-model-with-inter
251
-
252
-
253
- ### Max time in the data is 70.42. We need to estimate up to 90.
254
- #Yutaka: I think we should smooth the cumulative hazard and then take the derivative to get the hazard.
255
- #One thread I found on Web is: "As an approximation you can smooth the fitted baseline cumulative hazard (e.g. by package pspline) and ask for its derivative." Can you try using smooth.spline and smooth the cumulative hazard and then get the derivative? https://cran.r-project.org/web/packages/pspline/pspline.pdf
256
-
257
-
258
- #### Qi added: base is for different DX. Now we run within each pr, so we need the cumulative hazard for that pr only
259
- base=base[base$strata==paste("primary=",pr,sep=""),] #cumulative hazard
260
- base=base[base$time<=agecut,] ### shouldn't we use the same age cutoff as the survival function splines? Yes, do so.
261
-
262
- ##### study the smooth parameter. I think spar=1 is the best one to use (the smoothest)
263
- cumHspline=smooth.spline(base$time,base$hazard,spar=1)
264
- predcumhz=predict(cumHspline,seq(0,95,1)) ### predicted cumulative hazard
265
-
266
-
267
- ##### In order to use the above way to get dN0, do Daisuke's original way using cumhz difference. But the difference is that: we fit cumhz with smooth.spline and can extend it to 90 years old.
268
- base=data.frame(time=predcumhz$x,hazard=predcumhz$y) ##Daisuke used the cumHz, here we smoothed it and then use it.
269
- #### fitted values had <0 values in age 0-8 or so. change to 0 cumulative hazard.
270
- base$hazard[base$hazard<0]=0
271
- base2 = base %>%
272
- mutate(hazard2 = hazard - c(0,hazard[-length(hazard)])) %>%
273
- ungroup() %>% as.data.frame()
274
-
275
- base2 = base2 %>%
276
- mutate(time_cat = cut(time,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)) %>%
277
- ungroup()
278
-
279
- base3 = base2 %>%
280
- group_by(time_cat) %>%
281
- dplyr::summarize(dN0 = sum(hazard2)) %>%
282
- filter(!is.na(time_cat))
283
-
284
- ###############
285
- # BCCT
286
- ###############
287
- newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)
288
-
289
- #newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,90,5),right = FALSE, include.lowest = TRUE) this won't work, because the input donors file had "t.startage" up to 55 only
290
-
291
- newdata_chc_sampled = newdata_chc_sampled %>%
292
- left_join(base3,by="time_cat")
293
- newdata_chc_sampled$dN0 = ifelse(is.na(newdata_chc_sampled$dN0),0,newdata_chc_sampled$dN0)
294
-
295
- BCCT = newdata_chc_sampled %>%
296
- group_by(mrn) %>%
297
- mutate(BCCT_tmp = exp_lp*survprob*dN0) %>%
298
- mutate(BCCT = cumsum(BCCT_tmp)) %>%
299
- filter(t.startage>=20) %>%
300
- ungroup() %>%
301
- as.data.frame()
302
-
303
- for_web_BCCT = as.data.frame(tidyr::pivot_wider(BCCT,id_cols = mrn, names_from=time_cat,values_from=BCCT))
304
- for_web_BCCT =for_web_BCCT[,-1]
305
-
306
- #### for non-recurrent ones, maximum burden is 1 if the grouped conditions had only 1 condition. (11, 19, 29) had only 1 non-recurrent condition. (15, 17, 25) had 2 conditions. Take 25 as an example, it had obesity/underweight where underweight was so rare. So max 1 is still good.
307
- #### non-recurrent CHCs are 11, 15, 17, 19, 25, 29. ==I think making it maximum 1 is not always good, because these are grouped conditions. For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
308
- ncoltmp=75 ## from 20 to 94
309
- if(chc_num %in% c(11, 15, 17, 19, 25, 29)){
310
- for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,1))
311
- for_web_BCCT=as.data.frame(for_web_BCCT2)
312
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
313
- }
314
- #For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
315
- if(chc_num %in% c(10)){
316
- for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,3))
317
- for_web_BCCT=as.data.frame(for_web_BCCT2)
318
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
319
- }
320
- ##### if female condition 6, then it is 0 for males.
321
- if(chc_num %in% c(6) & sexval==1){
322
- for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
323
- for_web_BCCT=as.data.frame(for_web_BCCT2)
324
- colnames(for_web_BCCT)=colnames(person_burden[1:75])
325
- }
326
- ##### if male condition 7, then it is 0 for females.
327
- if(chc_num %in% c(7) & sexval==0){
328
- for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
329
- for_web_BCCT=as.data.frame(for_web_BCCT2)
330
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
331
- }
332
-
333
- for_web_BCCT$chc=chc_num
334
-
335
- return(for_web_BCCT)
336
- }
337
-
338
- # this serial loop works
339
- # for(chc_num in chc_nums) {
340
- # person_burden=rbind(person_burden, get_estimate(chc_num))
341
- # }
342
-
343
- # get estimates
344
- # parallelize across chc_nums
345
- results <- mclapply(X = chc_nums, FUN = get_estimate, mc.cores = cores)
346
-
347
- # combine rows into person_burden data frame
348
- for (n in 1:length(results)) {
349
- row <- results[[n]]
350
- if (!identical(names(row), names(results[[1]]))) {
351
- # some rows may have empty column names because they
352
- # used the columns names from the person_burden table, which
353
- # is NULL when get_estimate() is run in parallel (see the
354
- # if() statements in get_estimate())
355
- # in this situation, use the column names from the first row
356
- names(row) <- names(results[[1]])
357
- }
358
- person_burden <- rbind(person_burden, row)
359
- }
360
-
361
- # person_burden[,30:31]
362
- # sum(person_burden[,31]) ## total burden at 50 years old. 8.971574 for this example.
363
-
364
- #### The predicated burden for 26 grouped CHCs from age 20 to 95.
365
- # write.csv(person_burden,file=paste("primary",pr,".csv"),row.names=F)
366
- toJSON(person_burden, digits = NA, na = "string")
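burden.R is removed in this release. For reference, the deleted script documented its own invocation as "echo <in_json> | Rscript burden.R fitsData survData sampleData > <out_json>" and read a flat JSON object of demographic, chemotherapy, and radiation fields from stdin. The sketch below reconstructs such a call using the field names the script read (input$diaggrp through input$hdmtx) and the example values from its commented-out block; the .RData paths are hypothetical placeholders, and this is only an illustration of the old interface, not code shipped with the package.

    # Minimal sketch of how the removed burden.R was driven, based on its usage string
    # and the input$ fields it read. Values mirror the commented example in the script;
    # the .RData paths are hypothetical placeholders.
    import json
    import subprocess

    burden_input = {
        "diaggrp": 5,      # primary diagnosis group (5 = Central nervous system per the script's comments)
        "sex": 1,          # 1 = male, 0 = female
        "white": 1,        # 1 = white, 0 = non-white
        "agedx": 6,        # age at primary cancer diagnosis
        "steroid": 0, "bleo": 0, "vcr": 12, "etop": 2500, "itmt": 0,
        "ced": 1.6, "cisp": 300, "dox": 0, "carbo": 0, "hdmtx": 0,
        "brain": 5.4, "chest": 2.4, "heart": 0, "pelvis": 0, "abd": 2.4,
    }

    # Usage from the script: echo <in_json> | Rscript burden.R fitsData survData sampleData > <out_json>
    proc = subprocess.run(
        ["Rscript", "burden.R", "fits.RData", "surv.RData", "sample.RData"],  # hypothetical .RData paths
        input=json.dumps(burden_input),
        capture_output=True,
        text=True,
    )

    # The script's final expression, toJSON(person_burden, digits = NA, na = "string"),
    # is what lands on stdout: one row per grouped CHC with burden estimates by yearly
    # age interval from 20 onward (assuming nothing else writes to stdout).
    person_burden = json.loads(proc.stdout)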