@sjcrh/proteinpaint-server 2.105.0 → 2.107.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/utils/gsea.py CHANGED
@@ -1,129 +1,145 @@
1
- # cat ~/sjpp/test.txt | python gsea.py
1
+ # Test syntax: cat ~/sjpp/test.txt | time python gsea.py
2
+ # test.txt contains the JSON string autogenerated by the commented-out Node.js code.
3
+ import blitzgsea as blitz
4
+ import json
5
+ import time
6
+ import sys
7
+ import sqlite3
8
+ import os
9
+ import numpy as np
10
+ import pandas as pd
2
11
 
3
- import blitzgsea as blitz
4
- import json
5
- import time
6
- import sys
7
- import sqlite3
8
- import os
9
- import numpy as np
10
- import pandas as pd
11
-
12
+ # Helper function to extract gene symbols from a dictionary
12
13
  def extract_symbols(x):
13
- return x['symbol']
14
-
15
- def extract_plot_data(signature, geneset, library, result, center=True):
16
- signature = signature.copy()
17
- signature.columns = ["i","v"]
18
- signature = signature.sort_values("v", ascending=False).set_index("i")
19
- signature = signature[~signature.index.duplicated(keep='first')]
20
- if center:
21
- signature.loc[:,"v"] -= np.mean(signature.loc[:,"v"])
22
- signature_map = {}
23
- for i,h in enumerate(signature.index):
24
- signature_map[h] = i
25
-
26
- gs = set(library[geneset])
27
- hits = [i for i,x in enumerate(signature.index) if x in gs]
28
-
29
- running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:,0])), signature_map, gs)
30
- running_sum = list(running_sum)
31
- nn = np.where(np.abs(running_sum)==np.max(np.abs(running_sum)))[0][0]
32
- #print ("nn:",nn)
33
- #print ("running_sum:",running_sum)
34
- #print ("es:",es)
35
- running_sum_str=[str(elem) for elem in running_sum]
36
- print ('result: {"nn":'+str(nn)+',"running_sum":"'+",".join(running_sum_str)+'","es":'+str(es)+'}')
14
+ return x['symbol'] # Return the 'symbol' field from the dictionary
37
15
 
38
-
39
- # Main function
16
+ # Main function
40
17
  try:
41
- # Try to read a single character from stdin without blocking
18
+ # Check if there is input from stdin
42
19
  if sys.stdin.read(1):
43
- # Read from stdin
20
+ # Read each line from stdin
44
21
  for line in sys.stdin:
45
- # Process each line
22
+ # Parse the JSON input
46
23
  json_object = json.loads(line)
47
- cachedir=json_object['cachedir']
48
- genes=json_object['genes']
49
- fold_change=json_object['fold_change']
50
- table_name=json_object['geneset_group']
51
- filter_non_coding_genes=json_object['filter_non_coding_genes']
52
- df = {'Genes': genes, 'fold_change': fold_change}
53
- signature=pd.DataFrame(df)
54
- db=json_object['db']
55
- # Connect to the SQLite database
56
- conn = sqlite3.connect(db)
57
-
58
- # Create a cursor object using the cursor() method
59
- cursor = conn.cursor()
60
-
61
- # SQL query to select all data from the table
62
- query = f"select id from terms where parent_id='" + table_name + "'"
63
- # Execute the SQL query
64
- cursor.execute(query)
65
- if filter_non_coding_genes == True:
66
- # SQL query to code all the protein coding genes
67
- coding_genes_query = f"select * from codingGenes"
68
- genedb = json_object['genedb']
69
- gene_conn = sqlite3.connect(genedb)
70
- gene_cursor = gene_conn.cursor()
71
- gene_cursor.execute(coding_genes_query)
72
- coding_genes_list=gene_cursor.fetchall()
73
- coding_genes_list=list(map(lambda x: x[0],coding_genes_list))
74
- signature=signature[signature['Genes'].isin(coding_genes_list)]
75
-
76
- # Fetch all rows from the executed SQL query
77
- rows = cursor.fetchall()
78
-
79
- start_loop_time = time.time()
80
- msigdb_library={}
81
- # Iterate over the rows and print them
82
- for row in rows:
83
- #print(row[0])
84
- query2=f"select genes from term2genes where id='" + row[0] + "'"
85
- cursor.execute(query2)
86
- rows2 = cursor.fetchall()
87
- row3=json.loads(rows2[0][0])
88
- msigdb_library[row[0]] = list(map(extract_symbols,row3))
24
+ cachedir = json_object['cachedir'] # Get the cache directory from the JSON object
25
+ genes = json_object['genes'] # Get the genes from the JSON object
26
+ fold_change = json_object['fold_change'] # Get the fold change values from the JSON object
27
+ num_permutations = json_object['num_permutations'] # Number of permutations for GSEA analysis
28
+ table_name = json_object['geneset_group'] # Get the gene set group from the JSON object
29
+ filter_non_coding_genes = json_object['filter_non_coding_genes'] # Get the filter_non_coding_genes flag from the JSON object
30
+ db = json_object['db'] # Get the database path from the JSON object
31
+ # Create a DataFrame for the signature
32
+ df = {'Genes': genes, 'fold_change': fold_change} # Create a dictionary with genes and fold change
33
+ signature = pd.DataFrame(df) # Convert the dictionary to a DataFrame
89
34
 
90
- #print ("msigdb_library:",msigdb_library)
91
- # Close the cursor and connection to the database
92
- cursor.close()
93
- conn.close()
94
- stop_loop_time = time.time()
95
- execution_time = stop_loop_time - start_loop_time
96
- print(f"Execution time: {execution_time} seconds")
97
- try: # Extract ES data to be plotted on client side
98
- geneset_name=json_object['geneset_name'] # Checks if geneset_name is present, if yes it indicates the server request is for generating the image. It retrieves the result.pkl file and generates the image without having to recompute gsea again.
99
- pickle_file=json_object['pickle_file']
100
- result = pd.read_pickle(os.path.join(cachedir,pickle_file))
101
- fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True)
102
- random_num = np.random.rand()
103
- png_filename = "gsea_plot_" + str(random_num) + ".png"
104
- fig.savefig(os.path.join(cachedir,png_filename), bbox_inches='tight')
105
- #extract_plot_data(signature, geneset_name, msigdb_library, result) # This returns raw data to client side, not currently used
106
- print ('image: {"image_file":"' + png_filename + '"}')
107
- except KeyError: #Initial GSEA calculation, result saved to a result.pkl pickle file
108
- # run enrichment analysis
109
- start_gsea_time = time.time()
110
- if __name__ == "__main__":
111
- result = blitz.gsea(signature, msigdb_library).T
112
- random_num = np.random.rand()
113
- pickle_filename="gsea_result_"+ str(random_num) +".pkl"
114
- result.to_pickle(os.path.join(cachedir,pickle_filename))
115
- gsea_str='{"data":' + result.to_json() + '}'
116
- pickle_str='{"pickle_file":"' + pickle_filename + '"}'
117
- #print ("pickle_file:",pickle_str)
118
- gsea_dict = json.loads(gsea_str)
119
- pickle_dict = json.loads(pickle_str)
120
- result_dict = {**gsea_dict, **pickle_dict}
121
- print ("result:",json.dumps(result_dict))
122
- stop_gsea_time = time.time()
123
- gsea_time = stop_gsea_time - start_gsea_time
124
- print (f"GSEA time: {gsea_time} seconds")
35
+ # Connect to the SQLite database
36
+ conn = sqlite3.connect(db) # Connect to the SQLite database
37
+ cursor = conn.cursor() # Create a cursor object
38
+
39
+ msigdb_library = {} # Initialize an empty dictionary for the gene set library
40
+ if table_name == "REACTOME--blitzgsea": # Parse from blitzgsea reactome library
41
+ msigdb_library = blitz.enrichr.get_library("Reactome_2022")
42
+ elif table_name == "KEGG--blitzgsea": # Parse from blitzgsea KEGG library
43
+ msigdb_library = blitz.enrichr.get_library("KEGG_2021_Human")
44
+ elif table_name == "WikiPathways--blitzgsea": # Parse from blitzgsea WikiPathways library
45
+ msigdb_library = blitz.enrichr.get_library("WikiPathways_2019_Human")
46
+ else: # Use geneset groups from msigdb
47
+ # Query to get gene set IDs
48
+ query = f"SELECT id FROM terms WHERE parent_id='{table_name}'" # SQL query to get gene set IDs
49
+ cursor.execute(query) # Execute the query
50
+
51
+ # Fetch all gene set IDs
52
+ rows = cursor.fetchall() # Fetch all rows from the executed query
53
+
54
+ start_loop_time = time.time() # Record the start time of the loop
55
+
56
+ # Iterate over gene set IDs and fetch corresponding genes
57
+ for row in rows:
58
+ query2 = f"SELECT genes FROM term2genes WHERE id='{row[0]}'" # SQL query to get genes for a gene set ID
59
+ cursor.execute(query2) # Execute the query
60
+ rows2 = cursor.fetchall() # Fetch all rows from the executed query
61
+ row3 = json.loads(rows2[0][0]) # Parse the JSON data
62
+ msigdb_library[row[0]] = list(set(map(extract_symbols, row3))) # Extract gene symbols and add them to the library; set() keeps only unique genes
63
+ #print ("msigdb_library:",msigdb_library)
64
+
65
+ # Close the cursor and connection to the database
66
+ cursor.close() # Close the cursor
67
+ conn.close() # Close the connection
68
+
69
+ stop_loop_time = time.time() # Record the stop time of the loop
70
+ execution_time = stop_loop_time - start_loop_time # Calculate the execution time
71
+ print(f"Execution time: {execution_time} seconds") # Print the execution time
72
+
73
+ # Filter out non-coding genes if specified
74
+ if filter_non_coding_genes:
75
+ coding_genes_query = "SELECT * FROM codingGenes" # SQL query to get coding genes
76
+ genedb = json_object['genedb'] # Get the gene database path from the JSON object
77
+ gene_conn = sqlite3.connect(genedb) # Connect to the gene database
78
+ gene_cursor = gene_conn.cursor() # Create a cursor object for the gene database
79
+ gene_cursor.execute(coding_genes_query) # Execute the query to get coding genes
80
+ coding_genes_list = gene_cursor.fetchall() # Fetch all coding genes
81
+ coding_genes_list = list(map(lambda x: x[0], coding_genes_list)) # Extract the gene symbols
82
+ signature = signature[signature['Genes'].isin(coding_genes_list)] # Filter the signature to include only coding genes
125
83
 
84
+ try:
85
+ # Check if geneset_name and pickle_file are present for generating the plot
86
+ geneset_name = json_object['geneset_name'] # Get the gene set name from the JSON object
87
+ pickle_file = json_object['pickle_file'] # Get the pickle file name from the JSON object
88
+ if os.path.isfile(os.path.join(cachedir, pickle_file)): # Check if the pickle file exists, as it may not be on the same server that ran the original GSEA computation
89
+ result = pd.read_pickle(os.path.join(cachedir, pickle_file)) # Load the result from the pickle file
90
+ fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True) # Generate the running sum plot
91
+ else: # If pickle file is not found, redo the GSEA computation from scratch
92
+ result = blitz.gsea(signature, msigdb_library, permutations=num_permutations).T # Perform GSEA computation and transpose the result
93
+ fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True) # Generate the running sum plot
94
+ result.to_pickle(os.path.join(cachedir, pickle_file)) # Save the result to a pickle file with the same name
95
+ random_num = np.random.rand() # Generate a random number for unique png filename
96
+ png_filename = f"gsea_plot_{random_num}.png" # Create a filename for the plot
97
+ fig.savefig(os.path.join(cachedir, png_filename), bbox_inches='tight') # Save the plot as a PNG file
98
+ print(f'image: {{"image_file": "{png_filename}"}}') # Print the image file path in JSON format
99
+ except KeyError:
100
+ # Initial GSEA calculation and save the result to a pickle file
101
+ start_gsea_time = time.time() # Record the start time of GSEA
102
+ if __name__ == "__main__":
103
+ result = blitz.gsea(signature, msigdb_library, permutations=num_permutations).T # Perform GSEA computation and transpose the result
104
+ random_num = np.random.rand() # Generate a random number for unique pickle filename
105
+ pickle_filename = f"gsea_result_{random_num}.pkl" # Create a filename for the pickle file
106
+ result.to_pickle(os.path.join(cachedir, pickle_filename)) # Save the result to the pickle file
107
+ gsea_str = f'{{"data": {result.to_json()}}}' # Convert the result to JSON format
108
+ pickle_str = f'{{"pickle_file": "{pickle_filename}"}}' # Create a JSON string for the pickle file
109
+ gsea_dict = json.loads(gsea_str) # Parse the JSON string
110
+ pickle_dict = json.loads(pickle_str) # Parse the JSON string
111
+ result_dict = {**gsea_dict, **pickle_dict} # Merge the dictionaries
112
+ print(f"result: {json.dumps(result_dict)}") # Print the result in JSON format
113
+ stop_gsea_time = time.time() # Record the stop time of GSEA
114
+ gsea_time = stop_gsea_time - start_gsea_time # Calculate the GSEA execution time
115
+ print(f"GSEA time: {gsea_time} seconds") # Print the GSEA execution time
126
116
  else:
127
- pass
117
+ pass # Do nothing if there is no input from stdin
128
118
  except (EOFError, IOError):
129
- pass
119
+ pass # Handle EOFError and IOError exceptions gracefully
120
+
121
+ # Function to extract plot data for GSEA visualization (NOT currently used, but will be used for generating client-side GSEA plots)
122
+ def extract_plot_data(signature, geneset, library, result, center=True):
123
+ print("signature", signature)
124
+ print("result", result)
125
+ print("geneset", geneset)
126
+ print("library", library)
127
+ signature = signature.copy() # Create a copy of the signature DataFrame
128
+ signature.columns = ["i", "v"] # Rename columns to 'i' and 'v'
129
+ signature = signature.sort_values("v", ascending=False).set_index("i") # Sort by 'v' in descending order and set 'i' as index
130
+ signature = signature[~signature.index.duplicated(keep='first')] # Remove duplicate indices, keeping the first occurrence
131
+
132
+ if center:
133
+ signature.loc[:, "v"] -= np.mean(signature.loc[:, "v"]) # Center the signature values by subtracting the mean
134
+
135
+ signature_map = {h: i for i, h in enumerate(signature.index)} # Create a mapping of signature indices
136
+
137
+ gs = set(library[geneset]) # Get the gene set from the library
138
+ hits = [i for i, x in enumerate(signature.index) if x in gs] # Find the indices of hits in the signature
139
+
140
+ running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:, 0])), signature_map, gs) # Compute running sum and enrichment score
141
+ running_sum = list(running_sum) # Convert running sum to a list
142
+ nn = np.where(np.abs(running_sum) == np.max(np.abs(running_sum)))[0][0] # Find the index of the maximum absolute running sum
143
+
144
+ running_sum_str = [str(elem) for elem in running_sum] # Convert running sum elements to strings
145
+ print(f'result: {{"nn": {nn}, "running_sum": "{",".join(running_sum_str)}", "es": {es}}}') # Print the result in JSON format
package/utils/burden.R DELETED
@@ -1,366 +0,0 @@
1
-
2
- ##### This code takes about 30 seconds to run. When the user inputs the parameters (sexval to hdmtxval), run this for the original data and 20 for the bootstrapped data at the same time, so we can have the burden and 95% CI in about 30 seconds.
3
-
4
- rm(list=ls())
5
-
6
- suppressPackageStartupMessages(library(dplyr)) ### Qi changed to load plyr first, due to R message: If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
7
- suppressPackageStartupMessages(library(survival))
8
- library(jsonlite)
9
- library(parallel)
10
-
11
- options(warn=-1)
12
-
13
- # stream in json input data
14
- con <- file("stdin", "r")
15
- json <- readLines(con)
16
- close(con)
17
- input <- fromJSON(json)
18
- # handle input arguments
19
- args <- commandArgs(trailingOnly = T)
20
- if (length(args) != 3) stop("Usage: echo <in_json> | Rscript burden.R fitsData survData sampleData > <out_json>")
21
- fitsData <- args[1]
22
- survData <- args[2]
23
- sampleData <- args[3]
24
-
25
- chc_nums <- c(1:32)[-c(2,5,14,20,23,26)] # CHCs. 6 out of 32 CHCs not used.
26
- availCores <- detectCores()
27
- if (is.na(availCores)) stop("cannot detect number of available cores")
28
- cores <- ifelse(length(chc_nums) < availCores, length(chc_nums), availCores)
29
-
30
- #####################
31
- # Functions for our method
32
- # Ref: https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
33
- #####################
34
- # setwd("R:/Biostatistics/Biostatistics2/Qi/QiCommon/St Jude/Nature Review/CHCs/App/Rdata")
35
-
36
- load(fitsData)
37
- load(survData)
38
- # survs[[1]]
39
-
40
- ############################ These are the input values in APP that users can change. Edgar, these should be the same as the APP before, variable names and units. #############
41
- ### Input the primary DX.
42
- # pr=5
43
- # agecut=40 ##### Edgar, this is not a user input parameter, but we input this. This depends on the DX. For example, here for CNS we use 40. For HL DX, it is 55. I will give this value for each DX.
44
-
45
- # # # Input person's values, 18 input X's , plus the input primary DX
46
- # sexval=1 #sex, take value 1 for male and 0 for female
47
- # whiteval=1 # Race white or not, 1 for white, 0 for non-white
48
- # agedxval=6 # age at primary cancer DX
49
-
50
- # #### Chemotherapy
51
- # steroidval=0 #Steroids 1 for yes 0 for no
52
- # bleoval=0; ##Bleomycin
53
- # vcrval=12; #Vincristine
54
- # etopval=2500; #Etoposide
55
- # itmtval=0; #Intrathecal Methotrexate
56
- # cedval=1.6 # Cyclophosphamide, 0.7692 mean 7692.
57
- # cispval=300 #Cisplatin
58
- # doxval=0 #Anthracycline, 3 mean 300 ml/m2
59
- # carboval=0 ## Carboplatin
60
- # hdmtxval=0 ## High-Dose Methotrexate
61
-
62
- # # Radiation
63
- # brainval=5.4 #Brain, 5.4 means 54Gy, 5400 cGy. #####Same for all RT doses.#####
64
- # chestval=2.4 # chest/neck RT, 2.4 for 24 Gy
65
- # heartval=0 # Heart RT
66
- # pelvisval=0 #pelvis RT
67
- # abdval=2.4 # Abdominal RT
68
-
69
- ####################################################################################
70
-
71
- ##### if no TX, use these.
72
- # steroidval=0; bleoval=0; vcrval=0; etopval=0; itmtval=0; cedval=0; cispval=0; brainval=0;
73
- # doxval=0; chestval=0; abdval=0;
74
-
75
- # survs[[1]]
76
-
77
- ############### no TX
78
- # steroidval=0; bleoval=0; vcrval=0; etopval=0; itmtval=0; cedval=0; cispval=0; brainval=0; doxval=0; chestval=0; abdval=0; heartval=0; pelvisval=0; carboval=0; hdmtxval=0
79
-
80
- # Qi made many newdata_chc_sampled so we have 1000 times more donors -- but in different files.
81
- load(sampleData)
82
-
83
- newdata_chc_sampled=do.call("rbind", replicate(6,newdata_chc_sampled, simplify = FALSE))
84
- newdata_chc_sampled$t.startage=seq(5,70,1)
85
- newdata_chc_sampled$t.endage=seq(6,71,1)
86
- ### originally the data were fit to 60 only. using cphfits can get estimates up to 60 only. ==> later I further cut at 50 or so to fit lines, because the original data had their 95th percentile around age 50 or so.
87
- newdata_chc_sampled=newdata_chc_sampled[newdata_chc_sampled$t.endage<=60,]
88
-
89
- # paste(names(input), input, sep = ":", collapse = ",")
90
- pr=input$diaggrp
91
- # agecut was previously hardcoded to 40 above
92
- agecut=c('1'=50, '2'=45, '3'=55, '4'=50, '5'=40, '6'=60, '7'=50, '8'=45, '9'=45, '10'=45, '11'=50 )[pr]
93
- sexval=input$sex
94
- newdata_chc_sampled$sex=input$sex # sexval
95
- newdata_chc_sampled$white=input$white # whiteval
96
- newdata_chc_sampled$agedx2=input$agedx # agedxval
97
- newdata_chc_sampled$steroid=input$steroid # steroidval
98
- newdata_chc_sampled$bleodose=input$bleo # bleoval
99
- newdata_chc_sampled$vcrdose=input$vcr # vcrval
100
- newdata_chc_sampled$etopdose=input$etop # etopval
101
- newdata_chc_sampled$itmtxdose=input$itmt # itmtval
102
- newdata_chc_sampled$ced_sum2=input$ced # cedval
103
- newdata_chc_sampled$cisplatdose=input$cisp # cispval
104
- newdata_chc_sampled$brainrad2=input$brain # brainval
105
- newdata_chc_sampled$doxed_sum2=input$dox # doxval
106
- newdata_chc_sampled$chestrad2=input$chest # chestval
107
- newdata_chc_sampled$abdrad2=input$abd # abdval
108
- newdata_chc_sampled$heartradboth2=input$heart # heartval
109
- newdata_chc_sampled$pelvisrad2=input$pelvis # pelvisval
110
- newdata_chc_sampled$carboplatdose=input$carbo # carboval
111
- newdata_chc_sampled$hdmtxdose=input$hdmtx # hdmtxval
112
-
113
- # newdata_chc_sampled$sex=sexval
114
- # newdata_chc_sampled$white=whiteval
115
- # newdata_chc_sampled$agedx2=agedxval
116
- # newdata_chc_sampled$steroid=steroidval
117
- # newdata_chc_sampled$bleodose=bleoval
118
- # newdata_chc_sampled$vcrdose=vcrval
119
- # newdata_chc_sampled$etopdose=etopval
120
- # newdata_chc_sampled$itmtxdose=itmtval
121
- # newdata_chc_sampled$ced_sum2=cedval
122
- # newdata_chc_sampled$cisplatdose=cispval
123
- # newdata_chc_sampled$brainrad2=brainval
124
- # newdata_chc_sampled$doxed_sum2=doxval
125
- # newdata_chc_sampled$chestrad2=chestval
126
- # newdata_chc_sampled$abdrad2=abdval
127
- # newdata_chc_sampled$heartradboth2=heartval
128
- # newdata_chc_sampled$pelvisrad2=pelvisval
129
- # newdata_chc_sampled$carboplatdose=carboval
130
- # newdata_chc_sampled$hdmtxdose=hdmtxval
131
-
132
- # 1="Acute lymphoblastic leukemia"
133
- # 2="AML"
134
- # 3="Hodgkin lymphoma"
135
- # 4="Non-Hodgkin lymphoma"
136
- # 5="Central nervous system"
137
- # 6="Bone tumor"
138
- # 7="STS"
139
- # 8="Wilms tumor"
140
- # 9="Neuroblastoma"
141
- # 10="Retinoblastoma"
142
- # 11="Germ cell tumor";
143
-
144
- results <- mclapply(X = chc_nums, FUN = function(chc_num) predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='expected'), mc.cores = cores)
145
- for(n in 1:length(results)){
146
- newdata_chc_sampled = data.frame(newdata_chc_sampled,results[[n]])
147
- }
148
- names(newdata_chc_sampled)[25:50]=paste0("est_chc",chc_nums)
149
- newdata_chc_sampled = newdata_chc_sampled %>%
150
- mutate(sumN_tmp = rowSums(dplyr::select(.,starts_with("est_chc"))))%>%
151
- group_by(mrn) %>%
152
- mutate(sumN_obs = cumsum(sumN_tmp)) %>%
153
- as.data.frame()
154
-
155
- ##Qi: the sumN here depends on all the 26 grouped conditions. So the input X's all matter. That is, if sex is not in a CHC of interest, it would make a difference here on sumN (because sex was on some CHCs), and hence make a difference on the burden of that CHC even though it is not in the cphfits of that CHC.
156
- newdata_chc_sampled = newdata_chc_sampled %>%
157
- group_by(mrn) %>%
158
- mutate(chc20 = sumN_obs[t.endage == 20]) %>%
159
- ungroup() %>%
160
- as.data.frame()
161
- newdata_chc_sampled$death =1
162
- newdata_chc_sampled$obsCHCat20 = newdata_chc_sampled$current.chc
163
-
164
-
165
- # survival probability
166
- # https://stats.stackexchange.com/questions/288393/calculating-survival-probability-per-person-at-time-t-from-cox-ph
167
-
168
-
169
- newdata_chc_sampled$survprob = exp(-predict(survs[[1]],newdata=data.frame(newdata_chc_sampled,primary=pr),type='expected'))
170
-
171
- #----------------------------------------------------------------------------------------------------------------#
172
- ##### Qi added the below "cumprod" for survival by time t. But need to figure out: What is the "survprob" in BCCT formulat? Should it be survival of the segment, or survival by time t? == need to figure out with YY. Discussed, YY confirmed my way: survival prob in the formula is cumulative, not for that segment.
173
- #----------------------------------------------------------------------------------------------------------------#
174
-
175
- #----------------------------------------------------------------------------------------------------------------#
176
- ## If assume "survprob" is over time (not for each segment):
177
- #### why does the survprob not decrease over time? I think this is not the real survival probability over time. Do I have to do multiplication over time thinking survprob is the survival over that segment? Try the multiplication over time.===== I think this makes sense. In the "predict" above, survival=exp(-expected) was for each row (thinking each row is a separate person). While in newdata_chc_sampled, the rows are for the same person, and the survival depends on the previous line, so need to multiply the survival from the previous line.
178
- newdata_chc_sampled$survprob4=cumprod(newdata_chc_sampled$survprob)
179
- newdata_chc_sampled$survprob=newdata_chc_sampled$survprob4
180
-
181
- # plot(c(0,90),c(0,1),type="n")
182
- survspline=smooth.spline(newdata_chc_sampled$t.endage[newdata_chc_sampled$t.endage<=agecut],newdata_chc_sampled$survprob[newdata_chc_sampled$t.endage<=agecut],spar=0.5)
183
- predsurv=predict(survspline,seq(0,95,1))
184
-
185
- # lines(predsurv$x,predsurv$y,col=3,lty=2)
186
-
187
- ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
188
-
189
- ###### get rid of the est_chcXX and "sumN"columns which were used to calculate the survival probability only.
190
- # invisible(dim(newdata_chc_sampled))
191
- newdata_chc_sampled=newdata_chc_sampled[,-grep("est_chc", colnames(newdata_chc_sampled))]
192
- newdata_chc_sampled=newdata_chc_sampled[,-grep("sumN", colnames(newdata_chc_sampled))]
193
- # invisible(dim(newdata_chc_sampled))
194
-
195
- ### Add rows t.startage from 60 to 94, and t.endage from 65 to 95; so we can get burden 60-90.
196
- add=newdata_chc_sampled[newdata_chc_sampled$t.startage<=39,]
197
- # table(add$t.startage)
198
- # table(add$t.endage)
199
- add$t.startage=add$t.startage+55
200
- add$t.endage=add$t.endage+55
201
- # table(add$t.startage)
202
- # table(add$t.endage)
203
- newdata_chc_sampled=rbind(newdata_chc_sampled,add)
204
- newdata_chc_sampled=newdata_chc_sampled[order(newdata_chc_sampled$mrn,newdata_chc_sampled$t.startage),]
205
- ### replace the survival prob with the calculated/extrapolated survival probability
206
- smooth_surv=data.frame(age=predsurv$x,surv=predsurv$y)
207
- smooth_surv$surv[smooth_surv$age<=20]=1
208
- #### survival probability cannot be <0. Handle the years with survival prob<0
209
- #https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1310013501 This page had the conditional survival based on age
210
- ## take the last year with a positive survival prob, and its survival prob
211
- positive=smooth_surv[smooth_surv$surv>0,]
212
- alast=tail(positive,1)[1,1]
213
- slast=tail(positive,1)[1,2]
214
- #smooth_surv$alast=alast
215
- #smooth_surv$alast=slast
216
- smooth_surv$interval=smooth_surv$age-alast
217
- ### use the last positive survival prob*0.5^(years from the last age with positive survival probability), assuming the conditional survival prob after that age is 50% each year.
218
- cave <- function(x) slast*0.5^(max(x["interval"],0))
219
- smooth_surv$surv1=apply(smooth_surv,1,cave)
220
- smooth_surv$surv[smooth_surv$surv<0]=smooth_surv$surv1[smooth_surv$surv<0]
221
-
222
- newdata_chc_sampled=merge(newdata_chc_sampled,smooth_surv,by.x="t.endage",by.y="age")
223
- newdata_chc_sampled$survprob=newdata_chc_sampled$surv
224
-
225
- # when there is an interaction in the model, it gave warning. So I would make a new data with all 0's to make it work.
226
- newdata0=matrix(0,nrow=1,ncol=18)
227
- newdata0=as.data.frame(newdata0)
228
- colnames(newdata0)=c("sex","white","agedx2","steroids","bleodose","vcrdose","etopdose","itmtxdose","ced_sum2",
229
- "cisplatdose","brainrad2","doxed_sum2","chestrad2","abdrad2","heartradboth2","pelvisrad2","carboplatdose","hdmtxdose")
230
-
231
-
232
- newdata_chc_sampled1=newdata_chc_sampled ## do this so each run on chc_num loops below starts with the original newdata_chc_sampled1
233
-
234
-
235
- ##########################################################################
236
- person_burden=NULL
237
-
238
- get_estimate <- function(chc_num) { #### Edgar, you may make this in separate runs to save time.
239
- # print(chc_num)
240
- newdata_chc_sampled=newdata_chc_sampled1
241
-
242
- # linear predictor
243
- newdata_chc_sampled$exp_lp = predict(cphfits2[[chc_num]], newdata = data.frame(newdata_chc_sampled,primary=pr),type='risk',reference="zero")
244
-
245
- # Baseline nelson-aalan est
246
- # https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
247
- j=chc_num
248
- base = basehaz(cphfits2[[chc_num]],centered = F) # this is a cumulative hazard, so need to convert it into non-cumulative version
249
- #centered, if TRUE return data from a predicted survival curve at the mean values of the covariates fit$mean, if FALSE return a prediction for all covariates equal to zero.
250
- #request the hazard for that covariate combination from the survfit() function that is called by basehaz(). https://stats.stackexchange.com/questions/565210/about-getting-baseline-survival-probability-for-a-piecewise-cox-model-with-inter
251
-
252
-
253
- ### Max time in the data is 70.42. We need to estimate up to 90.
254
- #Yutaka: I think we should smooth the cumulative hazard and then take the derivative to get the hazard.
255
- #One thread I found on Web is: "As an approximation you can smooth the fitted baseline cumulative hazard (e.g. by package pspline) and ask for its derivative." Can you try using smooth.spline and smooth the cumulative hazard and then get the derivative? https://cran.r-project.org/web/packages/pspline/pspline.pdf
256
-
257
-
258
- #### Qi added: base is for different DX. Now we run within each pr, so we need the cumulative hazard for that pr only
259
- base=base[base$strata==paste("primary=",pr,sep=""),] #cumulative hazard
260
- base=base[base$time<=agecut,] ### shouldn't we use the same age cutoff as the survival function splines? Yes, do so.
261
-
262
- ##### study the smooth parameter. I think spar=1 is the best one to use (the smoothest)
263
- cumHspline=smooth.spline(base$time,base$hazard,spar=1)
264
- predcumhz=predict(cumHspline,seq(0,95,1)) ### predicted cumulative hazard
265
-
266
-
267
- ##### In order to use the above way to get dN0, do Daisuke's original way using cumhz difference. But the difference is that: we fit cumhz with smooth.spline and can extend it to 90 years old.
268
- base=data.frame(time=predcumhz$x,hazard=predcumhz$y) ##Daisuke used the cumHz, here we smoothed it and then use it.
269
- #### fitted values had <0 values in age 0-8 or so. change to 0 cumulative hazard.
270
- base$hazard[base$hazard<0]=0
271
- base2 = base %>%
272
- mutate(hazard2 = hazard - c(0,hazard[-length(hazard)])) %>%
273
- ungroup() %>% as.data.frame()
274
-
275
- base2 = base2 %>%
276
- mutate(time_cat = cut(time,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)) %>%
277
- ungroup()
278
-
279
- base3 = base2 %>%
280
- group_by(time_cat) %>%
281
- dplyr::summarize(dN0 = sum(hazard2)) %>%
282
- filter(!is.na(time_cat))
283
-
284
- ###############
285
- # BCCT
286
- ###############
287
- newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,95,1),right = FALSE, include.lowest = TRUE)
288
-
289
- #newdata_chc_sampled$time_cat = cut(newdata_chc_sampled$t.startage,breaks=seq(0,90,5),right = FALSE, include.lowest = TRUE) this won't work, because the input donors file had "t.startage" up to 55 only
290
-
291
- newdata_chc_sampled = newdata_chc_sampled %>%
292
- left_join(base3,by="time_cat")
293
- newdata_chc_sampled$dN0 = ifelse(is.na(newdata_chc_sampled$dN0),0,newdata_chc_sampled$dN0)
294
-
295
- BCCT = newdata_chc_sampled %>%
296
- group_by(mrn) %>%
297
- mutate(BCCT_tmp = exp_lp*survprob*dN0) %>%
298
- mutate(BCCT = cumsum(BCCT_tmp)) %>%
299
- filter(t.startage>=20) %>%
300
- ungroup() %>%
301
- as.data.frame()
302
-
303
- for_web_BCCT = as.data.frame(tidyr::pivot_wider(BCCT,id_cols = mrn, names_from=time_cat,values_from=BCCT))
304
- for_web_BCCT =for_web_BCCT[,-1]
305
-
306
- #### for non-recurrent ones, maximum burden is 1 if the grouped conditions had only 1 condition. (11, 19, 29) had only 1 non-recurrent condition. (15, 17, 25) had 2 conditions. Take 25 as an example, it had obesity/underweight where underweight was so rare. So max 1 is still good.
307
- #### non-recurrent CHCs are 11, 15, 17, 19, 25, 29. ==I think making it maximum 1 is not always good, because these are grouped conditions. For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
308
- ncoltmp=75 ## from 20 to 94
309
- if(chc_num %in% c(11, 15, 17, 19, 25, 29)){
310
- for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,1))
311
- for_web_BCCT=as.data.frame(for_web_BCCT2)
312
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
313
- }
314
- #For example, chc=10 contains 3 non-recurrent events, so one person could have each of these once, making it maximum 3 in this person for chc=10.
315
- if(chc_num %in% c(10)){
316
- for_web_BCCT2=apply(for_web_BCCT,c(1,2),function(x) min(x,3))
317
- for_web_BCCT=as.data.frame(for_web_BCCT2)
318
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
319
- }
320
- ##### if female condition 6, then it is 0 for males.
321
- if(chc_num %in% c(6) & sexval==1){
322
- for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
323
- for_web_BCCT=as.data.frame(for_web_BCCT2)
324
- colnames(for_web_BCCT)=colnames(person_burden[1:75])
325
- }
326
- ##### if male condition 7, then it is 0 for females.
327
- if(chc_num %in% c(7) & sexval==0){
328
- for_web_BCCT2=matrix(0,nrow=1,ncol=ncoltmp)
329
- for_web_BCCT=as.data.frame(for_web_BCCT2)
330
- colnames(for_web_BCCT)=colnames(person_burden[1:ncoltmp])
331
- }
332
-
333
- for_web_BCCT$chc=chc_num
334
-
335
- return(for_web_BCCT)
336
- }
337
-
338
- # this serial loop works
339
- # for(chc_num in chc_nums) {
340
- # person_burden=rbind(person_burden, get_estimate(chc_num))
341
- # }
342
-
343
- # get estimates
344
- # parallelize across chc_nums
345
- results <- mclapply(X = chc_nums, FUN = get_estimate, mc.cores = cores)
346
-
347
- # combine rows into person_burden data frame
348
- for (n in 1:length(results)) {
349
- row <- results[[n]]
350
- if (!identical(names(row), names(results[[1]]))) {
351
- # some rows may have empty column names because they
352
- # used the columns names from the person_burden table, which
353
- # is NULL when get_estimate() is run in parallel (see the
354
- # if() statements in get_estimate())
355
- # in this situation, use the column names from the first row
356
- names(row) <- names(results[[1]])
357
- }
358
- person_burden <- rbind(person_burden, row)
359
- }
360
-
361
- # person_burden[,30:31]
362
- # sum(person_burden[,31]) ## total burden at 50 years old. 8.971574 for this example.
363
-
364
- #### The predicated burden for 26 grouped CHCs from age 20 to 95.
365
- # write.csv(person_burden,file=paste("primary",pr,".csv"),row.names=F)
366
- toJSON(person_burden, digits = NA, na = "string")
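burden.R is removed in this release. For reference, the deleted script documented its own invocation as "echo <in_json> | Rscript burden.R fitsData survData sampleData > <out_json>" and read a flat JSON object of demographic, chemotherapy, and radiation fields from stdin. The sketch below reconstructs such a call using the field names the script read (input$diaggrp through input$hdmtx) and the example values from its commented-out block; the .RData paths are hypothetical placeholders, and this is only an illustration of the old interface, not code shipped with the package.

    # Minimal sketch of how the removed burden.R was driven, based on its usage string
    # and the input$ fields it read. Values mirror the commented example in the script;
    # the .RData paths are hypothetical placeholders.
    import json
    import subprocess

    burden_input = {
        "diaggrp": 5,      # primary diagnosis group (5 = Central nervous system per the script's comments)
        "sex": 1,          # 1 = male, 0 = female
        "white": 1,        # 1 = white, 0 = non-white
        "agedx": 6,        # age at primary cancer diagnosis
        "steroid": 0, "bleo": 0, "vcr": 12, "etop": 2500, "itmt": 0,
        "ced": 1.6, "cisp": 300, "dox": 0, "carbo": 0, "hdmtx": 0,
        "brain": 5.4, "chest": 2.4, "heart": 0, "pelvis": 0, "abd": 2.4,
    }

    # Usage from the script: echo <in_json> | Rscript burden.R fitsData survData sampleData > <out_json>
    proc = subprocess.run(
        ["Rscript", "burden.R", "fits.RData", "surv.RData", "sample.RData"],  # hypothetical .RData paths
        input=json.dumps(burden_input),
        capture_output=True,
        text=True,
    )

    # The script's final expression, toJSON(person_burden, digits = NA, na = "string"),
    # is what lands on stdout: one row per grouped CHC with burden estimates by yearly
    # age interval from 20 onward (assuming nothing else writes to stdout).
    person_burden = json.loads(proc.stdout)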