@platforma-open/milaboratories.run-tcrdisco-enrichment.software 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. package/.turbo/turbo-build.log +2183 -0
  2. package/CHANGELOG.md +7 -0
  3. package/Dockerfile +56 -0
  4. package/dist/artifacts/get-enriched-frequencies/archive.json +1 -0
  5. package/dist/artifacts/get-enriched-frequencies/docker_x64.json +1 -0
  6. package/dist/artifacts/subset-assignment/archive.json +1 -0
  7. package/dist/artifacts/subset-assignment/docker_x64.json +1 -0
  8. package/dist/artifacts/tcr-ab-pairs/archive.json +1 -0
  9. package/dist/artifacts/tcr-ab-pairs/docker_x64.json +1 -0
  10. package/dist/artifacts/tcr-disco/archive.json +1 -0
  11. package/dist/artifacts/tcr-disco/docker_x64.json +1 -0
  12. package/dist/tengo/software/get-enriched-frequencies.sw.json +1 -0
  13. package/dist/tengo/software/subset-assignment.sw.json +1 -0
  14. package/dist/tengo/software/tcr-ab-pairs.sw.json +1 -0
  15. package/dist/tengo/software/tcr-disco.sw.json +1 -0
  16. package/package.json +116 -0
  17. package/pkg-platforma-open-milaboratories.run-tcrdisco-enrichment.software-get-enriched-frequencies-1.1.0.tgz +0 -0
  18. package/pkg-platforma-open-milaboratories.run-tcrdisco-enrichment.software-subset-assignment-1.1.0.tgz +0 -0
  19. package/pkg-platforma-open-milaboratories.run-tcrdisco-enrichment.software-tcr-ab-pairs-1.1.0.tgz +0 -0
  20. package/pkg-platforma-open-milaboratories.run-tcrdisco-enrichment.software-tcr-disco-1.1.0.tgz +0 -0
  21. package/src/tcr-disco/find-pairs.R +271 -0
  22. package/src/tcr-disco/get-enriched-frequencies.R +102 -0
  23. package/src/tcr-disco/main.R +291 -0
  24. package/src/tcr-disco/renv.lock +6375 -0
  25. package/src/tcr-disco/subset-assignment.R +194 -0
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env Rscript
2
+
3
+ # Load required libraries
4
+ suppressMessages(library("optparse"))
5
+ #----------------------------------------
6
+
7
+ # Required functions
8
# 1. *create_subsets_df(metadata_table, cd_subset_col, clonotypes, clonotypeKeyCol)*
#
# Builds a per-clonotype CD4/CD8 assignment table that is merged into the main
# clonotype tables further down in this script.
#
# Arguments:
#   metadata_table   data.frame with an "internalSampleId" column and the
#                    column named by `cd_subset_col`.
#   cd_subset_col    name of the metadata column holding the CD4/CD8 label
#                    (labels are upper-cased before comparison).
#   clonotypes       data.frame with "internalSampleId", "count", "fraction"
#                    and the column named by `clonotypeKeyCol`.
#   clonotypeKeyCol  name of the clonotype identifier column used for merging.
#   min_ratio        minimum count ratio between the two subsets required to
#                    assign a clonotype that was seen in both (default 5).
#
# Returns a data.frame with one row per clonotype key and the columns
#   <clonotypeKeyCol>, umi_count_CD4, umi_freq_CD4, umi_count_CD8,
#   umi_freq_CD8, subset ("CD4", "CD8" or NA) and subset_frequency
#   (log10 of the frequency in the assigned subset, 0 when unassigned).
create_subsets_df <- function(metadata_table, cd_subset_col, clonotypes,
                              clonotypeKeyCol, min_ratio = 5) {
  subset_labels <- toupper(metadata_table[[cd_subset_col]])

  # Clonotype rows that belong to the samples labelled `population`.
  # This selects both tra and trb samples from the metadata, but `clonotypes`
  # only carries one chain, so only that chain's rows are returned.
  make_subset <- function(population) {
    # %in% never yields NA, so metadata rows without a label are skipped
    # instead of turning into NA sample ids.
    sample_ids <- metadata_table[subset_labels %in% population, "internalSampleId"]
    picked <- clonotypes[clonotypes[["internalSampleId"]] %in% sample_ids, , drop = FALSE]
    # rep() keeps the assignment valid when no row was selected; assigning a
    # length-1 value to a zero-row data.frame raises an error.
    picked[["subset"]] <- rep(population, nrow(picked))
    picked[c(clonotypeKeyCol, "count", "fraction", "subset")]
  }

  cd4_subset <- make_subset("CD4")
  cd8_subset <- make_subset("CD8")

  # Full outer join on the clonotype key: ".x" columns come from CD4,
  # ".y" columns from CD8.
  subsets <- merge(cd4_subset, cd8_subset, by = clonotypeKeyCol, all = TRUE)

  # Keep one row per clonotype.
  # NOTE(review): when a clonotype occurs in several samples of a subset the
  # merge yields one row per CD4 x CD8 row pair and only the first one is kept
  # here, so the ratio below is based on that single pair rather than on
  # pooled counts - confirm this is intended.
  subsets <- subsets[!duplicated(subsets[[clonotypeKeyCol]]), , drop = FALSE]

  # Missing counts (clonotype absent from one subset) compare as 0.
  count_x <- ifelse(is.na(subsets[["count.x"]]), 0, subsets[["count.x"]])
  count_y <- ifelse(is.na(subsets[["count.y"]]), 0, subsets[["count.y"]])

  seen_cd4 <- !is.na(subsets[["subset.x"]])
  seen_cd8 <- !is.na(subsets[["subset.y"]])
  only_cd4 <- seen_cd4 & !seen_cd8
  only_cd8 <- seen_cd8 & !seen_cd4
  mostly_cd4 <- count_x > 0 & count_y > 0 & count_x / count_y >= min_ratio
  mostly_cd8 <- count_y > 0 &
    (count_x == 0 | (count_x > 0 & count_y / count_x >= min_ratio))

  # Priority (highest first): only CD4, CD4 by ratio, only CD8, CD8 by ratio.
  # Masks are applied lowest priority first so later ones overwrite.
  assigned <- rep(NA_character_, nrow(subsets))
  assigned[mostly_cd8] <- "CD8"
  assigned[only_cd8] <- "CD8"
  assigned[mostly_cd4] <- "CD4"
  assigned[only_cd4] <- "CD4"

  # Select and rename columns
  result <- subsets[, c(clonotypeKeyCol, "count.x", "fraction.x", "count.y", "fraction.y"),
                    drop = FALSE]
  colnames(result)[colnames(result) == "count.x"] <- "umi_count_CD4"
  colnames(result)[colnames(result) == "fraction.x"] <- "umi_freq_CD4"
  colnames(result)[colnames(result) == "count.y"] <- "umi_count_CD8"
  colnames(result)[colnames(result) == "fraction.y"] <- "umi_freq_CD8"
  result[["subset"]] <- assigned

  # log10 frequency in the assigned subset; unassigned clonotypes get 0.
  is_cd4 <- !is.na(assigned) & assigned == "CD4"
  is_cd8 <- !is.na(assigned) & assigned == "CD8"
  subset_frequency <- rep(0, nrow(result))
  subset_frequency[is_cd4] <- log10(result[["umi_freq_CD4"]][is_cd4])
  subset_frequency[is_cd8] <- log10(result[["umi_freq_CD8"]][is_cd8])
  result[["subset_frequency"]] <- subset_frequency

  result
}
70
+
71
+ #----------------------------------------
72
+
73
+ # Main code
74
+
75
# Parse command line arguments
# Every option is a character-valued flag, so a small constructor keeps the
# declarations compact. The CD-related options default to NA.
character_option <- function(flags, default, help) {
  make_option(flags,
    type = "character", default = default,
    help = help, metavar = "character"
  )
}

option_list <- list(
  character_option("--main_alpha", "mainAlpha.tsv", "Path to main TCR alpha clonotypes TSV file"),
  character_option("--main_beta", "mainBeta.tsv", "Path to main TCR beta clonotypes TSV file"),
  character_option("--metadata", "metadata.tsv", "Path to metadata TSV file"),
  character_option("--cd_alpha", NA, "Path to CD alpha clonotypes TSV file"),
  character_option("--cd_beta", NA, "Path to CD beta clonotypes TSV file"),
  character_option("--cd_subset_col", NA, "Metadata column with CD4/8 information"),
  character_option(c("-o", "--output"), ".", "Output folder for TSV results")
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Get input data
metadata <- opt$metadata
main_alpha <- opt$main_alpha
main_beta <- opt$main_beta
cd_alpha <- opt$cd_alpha
cd_beta <- opt$cd_beta
output_folder <- opt$output
cd_subset_col <- opt$cd_subset_col

# Values for running the script interactively (kept commented out):
# metadata <- "./metadata.tsv"
# main_alpha <- "./mainAlpha.tsv"
# main_beta <- "./mainBeta.tsv"
# cd_alpha <- "./cdAlpha.tsv"
# cd_beta <- "./cdBeta.tsv"
# output_folder <- "./results"
# cd_subset_col <- "Subset"

# Echo the resolved inputs, one "name: value" line each.
resolved_inputs <- list(
  cd_alpha = cd_alpha,
  cd_beta = cd_beta,
  cd_subset_col = cd_subset_col,
  output_folder = output_folder,
  metadata = metadata,
  main_alpha = main_alpha,
  main_beta = main_beta
)
for (input_name in names(resolved_inputs)) {
  print(paste0(input_name, ": ", resolved_inputs[[input_name]]))
}
136
+
137
## 1.1. TCR Discovery
### Load main data
metadata_table <- read.table(metadata, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
main_alpha_table <- read.table(main_alpha, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
main_beta_table <- read.table(main_beta, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# A sample id that appears in both the alpha and the beta table is assumed to
# be contamination: it is removed from the table where it has fewer rows
# (ties are removed from beta).
repeatedSamples <- unique(main_alpha_table$internalSampleId)
repeatedSamples <- repeatedSamples[repeatedSamples %in% unique(main_beta_table$internalSampleId)]
# table() ignores NA ids, so they cannot be compared by row count.
repeatedSamples <- repeatedSamples[!is.na(repeatedSamples)]
if (length(repeatedSamples) > 0) {
  # Index the tables by name: ids that read.table parsed as numbers would
  # otherwise be used as positions.
  sample_names <- as.character(repeatedSamples)
  alpha_counts <- as.vector(table(main_alpha_table$internalSampleId)[sample_names])
  beta_counts <- as.vector(table(main_beta_table$internalSampleId)[sample_names])

  remove_from_alpha <- repeatedSamples[alpha_counts < beta_counts]
  remove_from_beta <- repeatedSamples[beta_counts <= alpha_counts]
  # Remove duplicated sampleIds from files in which they appear less (when comparing between tcra and tcrb)
  main_alpha_table <- main_alpha_table[!main_alpha_table$internalSampleId %in% remove_from_alpha, ]
  main_beta_table <- main_beta_table[!main_beta_table$internalSampleId %in% remove_from_beta, ]

  print(paste0("Removed ", length(remove_from_alpha), " repeated sample(s) from alpha"))
  print(paste0("Removed ", length(remove_from_beta), " repeated sample(s) from beta"))
}
160
+
161
# Load CD4 and CD8 dataframe (optional step)
# Runs only when both CD clonotype files and the subset column were supplied.
if (!is.na(cd_alpha) && !is.na(cd_beta) && !is.na(cd_subset_col)) {
  # Load CD data
  cd_alpha_table <- read.table(cd_alpha, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  cd_beta_table <- read.table(cd_beta, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  clonotypeKeyCol <- "clonotypeKey"

  subsets_tra <- create_subsets_df(metadata_table, cd_subset_col, cd_alpha_table, clonotypeKeyCol)
  subsets_trb <- create_subsets_df(metadata_table, cd_subset_col, cd_beta_table, clonotypeKeyCol)

  # Assign T cell subset to main data
  cat("\n Assigning T cell subset...")
  # merge() places the key column first; move it to directly after
  # internalSampleId and keep every other column in its current order.
  reorder_cols <- function(tbl, key_col = clonotypeKeyCol) {
    other_cols <- setdiff(colnames(tbl), key_col)
    idx <- match("internalSampleId", other_cols)
    # append() also covers internalSampleId being the last column, where
    # (idx + 1):length(cols) would run backwards.
    tbl[, append(other_cols, key_col, after = idx), drop = FALSE]
  }
  main_alpha_table <- reorder_cols(
    merge(main_alpha_table, subsets_tra, by = clonotypeKeyCol, all.x = TRUE)
  )
  main_beta_table <- reorder_cols(
    merge(main_beta_table, subsets_trb, by = clonotypeKeyCol, all.x = TRUE)
  )
  cat("Done")
} else {
  cat("\n No CD4/CD8 T cell subset available")
}

# Write merged tables
if (!dir.exists(output_folder)) {
  dir.create(output_folder, recursive = TRUE)
}
write.table(main_alpha_table, paste0(output_folder, "/main_alpha_table.tsv"),
  sep = "\t", row.names = FALSE, quote = FALSE
)
write.table(main_beta_table, paste0(output_folder, "/main_beta_table.tsv"),
  sep = "\t", row.names = FALSE, quote = FALSE
)