@platforma-open/milaboratories.mixcr-amplicon-alignment.workflow 1.17.0 → 1.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
   WARN  Issue while reading "/home/runner/work/mixcr-amplicon-alignment/mixcr-amplicon-alignment/.npmrc". Failed to replace env in config: ${NPMJS_TOKEN}
2
2
 
3
- > @platforma-open/milaboratories.mixcr-amplicon-alignment.workflow@1.17.0 build /home/runner/work/mixcr-amplicon-alignment/mixcr-amplicon-alignment/workflow
3
+ > @platforma-open/milaboratories.mixcr-amplicon-alignment.workflow@1.18.1 build /home/runner/work/mixcr-amplicon-alignment/mixcr-amplicon-alignment/workflow
4
4
  > rm -rf dist && pl-tengo check && pl-tengo build
5
5
 
6
6
  Processing "src/aggregate-by-clonotype-key.tpl.tengo"...
package/CHANGELOG.md CHANGED
@@ -1,5 +1,22 @@
1
1
  # @platforma-open/milaboratories.mixcr-amplicon-alignment.workflow
2
2
 
3
+ ## 1.18.1
4
+
5
+ ### Patch Changes
6
+
7
+ - 5198666: Upgrade MiXCR to 4.7.0-300-develop, add MI_LICENSE_DEBUG env, use --use-local-temp, show loading spinner while sample list loads
8
+
9
+ ## 1.18.0
10
+
11
+ ### Minor Changes
12
+
13
+ - 656f2fe: stop codon replacement and dep updates
14
+
15
+ ### Patch Changes
16
+
17
+ - Updated dependencies [656f2fe]
18
+ - @platforma-open/milaboratories.mixcr-amplicon-alignment.software@1.1.0
19
+
3
20
  ## 1.17.0
4
21
 
5
22
  ### Minor Changes
@@ -298,6 +298,9 @@ calculateExportSpecs := func(presetSpecForBack, blockId) {
298
298
  columnsSpecPerClonotypeAggregates += [ sampleCountColumn ]
299
299
 
300
300
  orderP := 80000
301
+ aminoAcidSeqColumns := []
302
+ aminoAcidSeqColumnPairs := []
303
+ cdr3SeqColumns := []
301
304
 
302
305
 
303
306
 
@@ -325,6 +328,16 @@ inFrameFeatures := {
325
328
  alphabetShortMixcr := isAminoAcid ? "aa" : "n"
326
329
  columnName := alphabetShortMixcr + "Seq" + featureInFrameU
327
330
  visibility := featureU == "VDJRegion" || featureU == "CDR3"
331
+ if featureU == "CDR3" {
332
+ cdr3SeqColumns += [ columnName ]
333
+ }
334
+ if isAminoAcid {
335
+ aminoAcidSeqColumns += [ columnName ]
336
+ aminoAcidSeqColumnPairs += [ {
337
+ aa: columnName,
338
+ nt: "nSeq" + featureU
339
+ } ]
340
+ }
328
341
  columnsSpecPerClonotypeNoAggregates += [ {
329
342
  column: columnName,
330
343
  id: alphabetShortMixcr + "-seq-" + featureInFrameL,
@@ -807,6 +820,7 @@ inFrameFeatures := {
807
820
  } ]
808
821
 
809
822
  return {
823
+ productiveFeature: productiveFeature,
810
824
  clonotypeKeyColumns: clonotypeKeyColumns,
811
825
  clonotypeKeyArgs: clonotypeKeyArgs,
812
826
 
@@ -816,6 +830,9 @@ inFrameFeatures := {
816
830
  columnsSpecPerClonotypeNoAggregates: columnsSpecPerClonotypeNoAggregates,
817
831
  columnsSpecPerClonotypeAggregates: columnsSpecPerClonotypeAggregates,
818
832
  cdr3DistanceColumnsSpec: cdr3DistanceColumnsSpec,
833
+ aminoAcidSeqColumns: aminoAcidSeqColumns,
834
+ aminoAcidSeqColumnPairs: aminoAcidSeqColumnPairs,
835
+ cdr3SeqColumns: cdr3SeqColumns,
819
836
 
820
837
  columnsSpec: columnsSpec,
821
838
 
@@ -571,6 +571,38 @@ getQcReportColumns := func(hasUmi, sampleIdAxisSpec, chains, umiTags) {
571
571
  }
572
572
  }
573
573
  },
574
+ {
575
+ column: "assemble.clonotypesDroppedByStopCodons",
576
+ id: "assemble-clonotypes-dropped-by-stop-codons",
577
+ allowNA: true,
578
+ naRegex: "NaN",
579
+ spec: {
580
+ name: "mixcr.com/reports/assemble/clonotypesDroppedByStopCodons",
581
+ valueType: "Long",
582
+ annotations: {
583
+ "pl7.app/min": "0",
584
+ "pl7.app/table/orderPriority": "108200",
585
+ "pl7.app/table/visibility": "optional",
586
+ "pl7.app/label": "Clonotypes Dropped - Stop Codons"
587
+ }
588
+ }
589
+ },
590
+ {
591
+ column: "assemble.clonotypesDroppedByOutOfFrame",
592
+ id: "assemble-clonotypes-dropped-by-out-of-frame",
593
+ allowNA: true,
594
+ naRegex: "NaN",
595
+ spec: {
596
+ name: "mixcr.com/reports/assemble/clonotypesDroppedByOutOfFrame",
597
+ valueType: "Long",
598
+ annotations: {
599
+ "pl7.app/min": "0",
600
+ "pl7.app/table/orderPriority": "108100",
601
+ "pl7.app/table/visibility": "optional",
602
+ "pl7.app/label": "Clonotypes Dropped - Out of Frame"
603
+ }
604
+ }
605
+ },
574
606
  {
575
607
  column: "totalClonotypes",
576
608
  id: "total-clonotypes",
Binary file
Binary file
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,16 +1,16 @@
1
1
  {
2
2
  "name": "@platforma-open/milaboratories.mixcr-amplicon-alignment.workflow",
3
- "version": "1.17.0",
3
+ "version": "1.18.1",
4
4
  "description": "MiXCR Amplicon Alignment Workflow",
5
5
  "type": "module",
6
6
  "dependencies": {
7
- "@platforma-sdk/workflow-tengo": "5.8.0",
8
- "@platforma-open/milaboratories.software-mixcr": "4.7.0-279-develop",
7
+ "@platforma-sdk/workflow-tengo": "5.8.2",
8
+ "@platforma-open/milaboratories.software-mixcr": "4.7.0-300-develop",
9
9
  "@platforma-open/milaboratories.software-repseqio": "^2.5.0-13-master",
10
- "@platforma-open/milaboratories.mixcr-amplicon-alignment.software": "1.0.0"
10
+ "@platforma-open/milaboratories.mixcr-amplicon-alignment.software": "1.1.0"
11
11
  },
12
12
  "devDependencies": {
13
- "@platforma-sdk/tengo-builder": "2.4.11"
13
+ "@platforma-sdk/tengo-builder": "2.4.17"
14
14
  },
15
15
  "scripts": {
16
16
  "build": "rm -rf dist && pl-tengo check && pl-tengo build",
@@ -298,6 +298,9 @@ calculateExportSpecs := func(presetSpecForBack, blockId) {
298
298
  columnsSpecPerClonotypeAggregates += [ sampleCountColumn ]
299
299
 
300
300
  orderP := 80000
301
+ aminoAcidSeqColumns := []
302
+ aminoAcidSeqColumnPairs := []
303
+ cdr3SeqColumns := []
301
304
 
302
305
  // Sequences
303
306
 
@@ -325,6 +328,16 @@ inFrameFeatures := {
325
328
  alphabetShortMixcr := isAminoAcid ? "aa" : "n"
326
329
  columnName := alphabetShortMixcr + "Seq" + featureInFrameU
327
330
  visibility := featureU == "VDJRegion" || featureU == "CDR3"
331
+ if featureU == "CDR3" {
332
+ cdr3SeqColumns += [ columnName ]
333
+ }
334
+ if isAminoAcid {
335
+ aminoAcidSeqColumns += [ columnName ]
336
+ aminoAcidSeqColumnPairs += [ {
337
+ aa: columnName,
338
+ nt: "nSeq" + featureU
339
+ } ]
340
+ }
328
341
  columnsSpecPerClonotypeNoAggregates += [ {
329
342
  column: columnName,
330
343
  id: alphabetShortMixcr + "-seq-" + featureInFrameL,
@@ -807,6 +820,7 @@ inFrameFeatures := {
807
820
  } ]
808
821
 
809
822
  return {
823
+ productiveFeature: productiveFeature,
810
824
  clonotypeKeyColumns: clonotypeKeyColumns,
811
825
  clonotypeKeyArgs: clonotypeKeyArgs,
812
826
 
@@ -816,6 +830,9 @@ inFrameFeatures := {
816
830
  columnsSpecPerClonotypeNoAggregates: columnsSpecPerClonotypeNoAggregates,
817
831
  columnsSpecPerClonotypeAggregates: columnsSpecPerClonotypeAggregates,
818
832
  cdr3DistanceColumnsSpec: cdr3DistanceColumnsSpec,
833
+ aminoAcidSeqColumns: aminoAcidSeqColumns,
834
+ aminoAcidSeqColumnPairs: aminoAcidSeqColumnPairs,
835
+ cdr3SeqColumns: cdr3SeqColumns,
819
836
 
820
837
  columnsSpec: columnsSpec,
821
838
 
@@ -32,6 +32,75 @@ self.body(func(inputs) {
32
32
 
33
33
  umiTags := inputs.umiTags
34
34
  hasUmi := !is_undefined(umiTags) && len(umiTags) > 0
35
+ stopCodonTypes := inputs.stopCodonTypes
36
+ stopCodonReplacements := inputs.stopCodonReplacements
37
+ if is_undefined(stopCodonTypes) || !is_array(stopCodonTypes) {
38
+ stopCodonTypes = []
39
+ }
40
+ useStopCodonReplacement := !is_undefined(stopCodonTypes) && is_array(stopCodonTypes) && len(stopCodonTypes) > 0
41
+ if is_undefined(stopCodonReplacements) || !is_map(stopCodonReplacements) {
42
+ stopCodonReplacements = {}
43
+ }
44
+
45
+ featureForFlags := inputs.productiveFeature
46
+ if is_undefined(featureForFlags) || featureForFlags == "" {
47
+ featureForFlags = "CDR3"
48
+ }
49
+ if is_array(featureForFlags) && len(featureForFlags) > 0 {
50
+ featureForFlags = featureForFlags[0]
51
+ }
52
+ isOOFColumn := "isOOF" + featureForFlags
53
+ hasStopsColumn := "hasStopsIn" + featureForFlags
54
+
55
+ contains := func(arr, value) {
56
+ for v in arr {
57
+ if v == value { return true }
58
+ }
59
+ return false
60
+ }
61
+
62
+ stopReplacement := func(stopType) {
63
+ if !contains(stopCodonTypes, stopType) {
64
+ return "*"
65
+ }
66
+ aa := stopCodonReplacements[stopType]
67
+ if is_undefined(aa) || aa == "" {
68
+ return "*"
69
+ }
70
+ return text.to_upper(aa)
71
+ }
72
+
73
+ codonMapReplace := {
74
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
75
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
76
+ "TAT": "Y", "TAC": "Y", "TAA": stopReplacement("ochre"),
77
+ "TAG": stopReplacement("amber"), "TGT": "C", "TGC": "C",
78
+ "TGA": stopReplacement("opal"), "TGG": "W",
79
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
80
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
81
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
82
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
83
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
84
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
85
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
86
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
87
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
88
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
89
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
90
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G"
91
+ }
92
+
93
+ translateNtToAaExpr := func(ntExpr, codonMap) {
94
+ seq := ntExpr.fillNull("").strToUpper()
95
+ seq = seq.strReplace("(.{3})", "$1|", { replaceAll: true })
96
+ for codon, aa in codonMap {
97
+ seq = seq.strReplace(codon + "|", aa + "|", { replaceAll: true, literal: true })
98
+ }
99
+ seq = seq.strReplace("\\|$", "", { replaceAll: false })
100
+ seq = seq.strReplace("|", "", { replaceAll: true, literal: true })
101
+ seq = seq.strReplace("[ACGT]{1,2}$", "", { replaceAll: true })
102
+ return seq
103
+ }
35
104
 
36
105
  chainInfos := {
37
106
  "IGHeavy": { mixcrFilter: "IGH", name: "IG Heavy", shortName: "Heavy" },
@@ -56,6 +125,7 @@ self.body(func(inputs) {
56
125
  software(mixcrSw).
57
126
  env("MI_USE_SYSTEM_CA", "true").
58
127
  secret("MI_LICENSE", "MI_LICENSE").
128
+ env("MI_LICENSE_DEBUG", "MI_LICENSE_DEBUG").
59
129
  arg("exportReportsTable")
60
130
 
61
131
  // Add all clns files as input
@@ -124,6 +194,78 @@ self.body(func(inputs) {
124
194
  // Join counts and overwrite totalClonotypes to reflect exported (productive) clones
125
195
  joinedDf := processedDf.join(aggregatedCounts, { how: "left", on: ["sampleId"] })
126
196
 
197
+ // Count clonotypes filtered by stop codons and out-of-frame per sample
198
+ filterCountDfs := []
199
+ mixcrChainsArg := text.join(chainsForMixcr, ",")
200
+ for key, clnsFile in clnsFiles {
201
+ sampleId := json.decode(key)[0]
202
+ exportFiltersCmd := exec.builder().
203
+ inMediumQueue().
204
+ mem("16GiB").
205
+ cpu(2).
206
+ software(mixcrSw).
207
+ env("MI_USE_SYSTEM_CA", "true").
208
+ secret("MI_LICENSE", "MI_LICENSE").
209
+ env("MI_LICENSE_DEBUG", "MI_LICENSE_DEBUG").
210
+ arg("exportClones").
211
+ arg("--dont-split-files").
212
+ arg("--drop-default-fields").
213
+ arg("--reset-export-clone-table-splitting")
214
+ if mixcrChainsArg != "" {
215
+ exportFiltersCmd.arg("--chains").arg(mixcrChainsArg)
216
+ }
217
+ exportFiltersCmd = exportFiltersCmd.
218
+ arg("-isOOF").arg(featureForFlags).
219
+ arg("-hasStops").arg(featureForFlags)
220
+ if useStopCodonReplacement {
221
+ exportFiltersCmd = exportFiltersCmd.arg("-nFeature").arg(featureForFlags)
222
+ }
223
+ exportFiltersCmd = exportFiltersCmd.
224
+ arg("clones.clns").
225
+ addFile("clones.clns", clnsFile).
226
+ arg("clones.tsv").
227
+ saveFile("clones.tsv")
228
+ if library {
229
+ if isLibraryFileGzipped {
230
+ exportFiltersCmd.addFile("library.json.gz", library)
231
+ } else {
232
+ exportFiltersCmd.addFile("library.json", library)
233
+ }
234
+ }
235
+ exportFiltersResult := exportFiltersCmd.cacheHours(3).run()
236
+ filterTsv := exportFiltersResult.getFile("clones.tsv")
237
+ schema := [ { column: isOOFColumn, type: "String" }, { column: hasStopsColumn, type: "String" } ]
238
+ if useStopCodonReplacement {
239
+ schema = append(schema, { column: "nSeq" + featureForFlags, type: "String" })
240
+ }
241
+ dfFilters := wf.frame(filterTsv, { xsvType: "tsv", inferSchema: false, schema: schema })
242
+ stopExpr := pt.when(pt.col(hasStopsColumn).strToUpper().eq("TRUE")).then(pt.lit(1)).otherwise(pt.lit(0))
243
+ if useStopCodonReplacement {
244
+ translated := translateNtToAaExpr(pt.col("nSeq" + featureForFlags), codonMapReplace)
245
+ stopExpr = pt.when(translated.strContains("*", { literal: true })).then(pt.lit(1)).otherwise(pt.lit(0))
246
+ }
247
+ dfFilters = dfFilters.withColumns(
248
+ pt.when(pt.col(isOOFColumn).strToUpper().eq("TRUE")).then(pt.lit(1)).otherwise(pt.lit(0)).alias("__oof"),
249
+ stopExpr.alias("__stop")
250
+ )
251
+ dfFilterCount := dfFilters.select(
252
+ pt.lit(sampleId).alias("sampleId"),
253
+ pt.col("__oof").sum().alias("assemble.clonotypesDroppedByOutOfFrame"),
254
+ pt.col("__stop").sum().alias("assemble.clonotypesDroppedByStopCodons")
255
+ )
256
+ filterCountDfs = append(filterCountDfs, dfFilterCount)
257
+ }
258
+
259
+ if len(filterCountDfs) > 0 {
260
+ filterCountsDf := len(filterCountDfs) > 1 ? pt.concat(filterCountDfs) : filterCountDfs[0]
261
+ joinedDf = joinedDf.join(filterCountsDf, { how: "left", on: ["sampleId"] })
262
+ } else {
263
+ joinedDf = joinedDf.withColumns(
264
+ pt.lit(0).alias("assemble.clonotypesDroppedByOutOfFrame"),
265
+ pt.lit(0).alias("assemble.clonotypesDroppedByStopCodons")
266
+ )
267
+ }
268
+
127
269
  // Per-chain clonotype counts
128
270
  perChainJoined := joinedDf
129
271
  for chain in chains {
@@ -159,6 +301,10 @@ self.body(func(inputs) {
159
301
  pt.col("exportedClonotypes").fillNull(0).cast("Long").alias("totalClonotypes"),
160
302
  pt.col("readsUsedInClonotypesNew").fillNull(0).cast("Long").alias("readsUsedInClonotypes")
161
303
  )
304
+ finalDf = finalDf.withColumns(
305
+ pt.col("assemble.clonotypesDroppedByOutOfFrame").fillNull(0).cast("Long").alias("assemble.clonotypesDroppedByOutOfFrame"),
306
+ pt.col("assemble.clonotypesDroppedByStopCodons").fillNull(0).cast("Long").alias("assemble.clonotypesDroppedByStopCodons")
307
+ )
162
308
  for chain in chains {
163
309
  col := "clonotypesByChain." + chain
164
310
  finalDf = finalDf.withColumns(pt.col(col).fillNull(0).cast("Long").alias(col))
@@ -68,7 +68,9 @@ wf.body(func(args) {
68
68
  mixcrChains: chainInfos[chains].mixcrFilter,
69
69
  cloneClusteringMode: cloneClusteringMode,
70
70
  tagPattern: args.tagPattern,
71
- assemblingFeature: args.assemblingFeature
71
+ assemblingFeature: args.assemblingFeature,
72
+ stopCodonTypes: args.stopCodonTypes,
73
+ stopCodonReplacements: args.stopCodonReplacements
72
74
  })
73
75
  })
74
76
 
@@ -58,7 +58,9 @@ self.body(func(inputs) {
58
58
  env("MI_PROGRESS_PREFIX", progressPrefix).
59
59
  software(mixcrSw).
60
60
  secret("MI_LICENSE", "MI_LICENSE").
61
- arg("analyze")
61
+ env("MI_LICENSE_DEBUG", "MI_LICENSE_DEBUG").
62
+ arg("analyze").
63
+ arg("--use-local-temp")
62
64
 
63
65
  // Select preset based on UMI presence
64
66
  if !is_undefined(params.hasUMI) && params.hasUMI {
@@ -9,11 +9,157 @@ pt := import("@platforma-sdk/workflow-tengo:pt")
9
9
  clonotypeLabel := import(":clonotype-label")
10
10
 
11
11
  json := import("json")
12
+ text := import("text")
12
13
 
13
14
  mixcrSw := assets.importSoftware("@platforma-open/milaboratories.software-mixcr:main")
14
15
 
15
16
  self.defineOutputs("tsv")
16
17
 
18
+ applyStopCodonReplacementsPt := func(df, opts) {
19
+ if is_undefined(opts) {
20
+ return df
21
+ }
22
+ aminoAcidSeqColumns := opts.aminoAcidSeqColumns
23
+ cdr3SeqColumns := opts.cdr3SeqColumns
24
+ stopCodonTypes := opts.stopCodonTypes
25
+ stopCodonReplacements := opts.stopCodonReplacements
26
+
27
+ if is_undefined(aminoAcidSeqColumns) || len(aminoAcidSeqColumns) == 0 {
28
+ return df
29
+ }
30
+ if is_undefined(stopCodonTypes) || !is_array(stopCodonTypes) || len(stopCodonTypes) == 0 {
31
+ return df
32
+ }
33
+ if !is_undefined(stopCodonReplacements) && !is_map(stopCodonReplacements) {
34
+ stopCodonReplacements = undefined
35
+ }
36
+
37
+ contains := func(arr, value) {
38
+ for v in arr {
39
+ if v == value { return true }
40
+ }
41
+ return false
42
+ }
43
+
44
+ stopReplacement := func(stopType) {
45
+ if !contains(stopCodonTypes, stopType) {
46
+ return "*"
47
+ }
48
+ if is_undefined(stopCodonReplacements) {
49
+ return "*"
50
+ }
51
+ aa := stopCodonReplacements[stopType]
52
+ if is_undefined(aa) || aa == "" {
53
+ return "*"
54
+ }
55
+ return text.to_upper(aa)
56
+ }
57
+
58
+ codonMapBase := {
59
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
60
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
61
+ "TAT": "Y", "TAC": "Y", "TAA": "*",
62
+ "TAG": "*", "TGT": "C", "TGC": "C",
63
+ "TGA": "*", "TGG": "W",
64
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
65
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
66
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
67
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
68
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
69
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
70
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
71
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
72
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
73
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
74
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
75
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G"
76
+ }
77
+
78
+ codonMapReplace := {
79
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
80
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
81
+ "TAT": "Y", "TAC": "Y", "TAA": stopReplacement("ochre"),
82
+ "TAG": stopReplacement("amber"), "TGT": "C", "TGC": "C",
83
+ "TGA": stopReplacement("opal"), "TGG": "W",
84
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
85
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
86
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
87
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
88
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
89
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
90
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
91
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
92
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
93
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
94
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
95
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G"
96
+ }
97
+
98
+ translateNtToAaExpr := func(ntExpr, codonMap) {
99
+ seq := ntExpr.fillNull("").strToUpper()
100
+ seq = seq.strReplace("(.{3})", "$1|", { replaceAll: true })
101
+ for codon, aa in codonMap {
102
+ seq = seq.strReplace(codon + "|", aa + "|", { replaceAll: true, literal: true })
103
+ }
104
+ seq = seq.strReplace("\\|$", "", { replaceAll: false })
105
+ seq = seq.strReplace("|", "", { replaceAll: true, literal: true })
106
+ seq = seq.strReplace("[ACGT]{1,2}$", "", { replaceAll: true })
107
+ return seq
108
+ }
109
+
110
+ pairs := []
111
+ for aaCol in aminoAcidSeqColumns {
112
+ ntCol := text.replace(aaCol, "aaSeq", "nSeq", 1)
113
+ pairs = append(pairs, { aa: aaCol, nt: ntCol })
114
+ }
115
+
116
+ expressions := []
117
+ replacedAnyExprs := []
118
+ replacedColsExprs := []
119
+ for pair in pairs {
120
+ aaCol := pair.aa
121
+ ntCol := pair.nt
122
+ translatedBase := translateNtToAaExpr(pt.col(ntCol), codonMapBase)
123
+ translatedReplaced := translateNtToAaExpr(pt.col(ntCol), codonMapReplace)
124
+ expressions = append(expressions, translatedReplaced.alias(aaCol))
125
+ cond := translatedReplaced.neq(translatedBase)
126
+ replacedAnyExprs = append(replacedAnyExprs, cond)
127
+ replacedColsExprs = append(replacedColsExprs, pt.when(cond).then(pt.lit(aaCol)).otherwise(pt.lit("")))
128
+ }
129
+ if len(expressions) > 0 {
130
+ df = df.withColumns(expressions...)
131
+ }
132
+
133
+ if len(replacedAnyExprs) > 0 {
134
+ colsList := pt.concatStr(replacedColsExprs, { delimiter: "," })
135
+ colsList = colsList.strReplace(",+", ",", { replaceAll: true }).strReplace("^,|,$", "", { replaceAll: true })
136
+ df = df.withColumns(
137
+ pt.anyHorizontal(replacedAnyExprs...).alias("stopCodonReplaced"),
138
+ colsList.alias("stopCodonReplacedColumns")
139
+ )
140
+ }
141
+
142
+ stopChecks := []
143
+ for colName in aminoAcidSeqColumns {
144
+ stopChecks = append(stopChecks, pt.col(colName).strContains("*", { literal: true }))
145
+ }
146
+ if len(stopChecks) > 0 {
147
+ df = df.filter(pt.anyHorizontal(stopChecks...).eq(false))
148
+ }
149
+
150
+ if !is_undefined(cdr3SeqColumns) && len(cdr3SeqColumns) > 0 {
151
+ regionChecks := []
152
+ for colName in cdr3SeqColumns {
153
+ regionChecks = append(regionChecks, pt.col(colName).strToUpper().eq("REGION_NOT_COVERED"))
154
+ }
155
+ if len(regionChecks) > 0 {
156
+ df = df.filter(pt.anyHorizontal(regionChecks...).eq(false))
157
+ }
158
+ }
159
+
160
+ return df
161
+ }
162
+
17
163
  self.body(func(inputs) {
18
164
  clnsFile := inputs[pConstants.VALUE_FIELD_NAME]
19
165
 
@@ -22,6 +168,13 @@ self.body(func(inputs) {
22
168
 
23
169
  clonotypeKeyColumns := params.clonotypeKeyColumns
24
170
  mainIsProductiveColumn := params.mainIsProductiveColumn
171
+ aminoAcidSeqColumns := params.aminoAcidSeqColumns
172
+ aminoAcidSeqColumnPairs := params.aminoAcidSeqColumnPairs
173
+ cdr3SeqColumns := params.cdr3SeqColumns
174
+ stopCodonTypes := params.stopCodonTypes
175
+ stopCodonReplacements := params.stopCodonReplacements
176
+
177
+ useProductiveFilter := is_undefined(stopCodonTypes) || len(stopCodonTypes) == 0
25
178
 
26
179
  hashKeyDerivationExpressionPt := func(sourceColumns) {
27
180
  return pt.concatStr(
@@ -38,12 +191,18 @@ self.body(func(inputs) {
38
191
  printErrStreamToStdout().
39
192
  software(mixcrSw).
40
193
  secret("MI_LICENSE", "MI_LICENSE").
194
+ env("MI_LICENSE_DEBUG", "MI_LICENSE_DEBUG").
41
195
  arg("exportClones").
42
196
  arg("--dont-split-files").
43
197
  arg("--drop-default-fields").
44
198
  arg("--reset-export-clone-table-splitting").
45
- arg("--chains").arg(params.mixcrChains).
46
- arg("--export-productive-clones-only")
199
+ arg("--chains").arg(params.mixcrChains)
200
+
201
+ if useProductiveFilter {
202
+ mixcrCmdBuilder = mixcrCmdBuilder.arg("--export-productive-clones-only")
203
+ } else {
204
+ mixcrCmdBuilder = mixcrCmdBuilder.arg("--filter-out-of-frames")
205
+ }
47
206
 
48
207
  additionalAction(mixcrCmdBuilder)
49
208
 
@@ -96,6 +255,15 @@ self.body(func(inputs) {
96
255
  alias(mainIsProductiveColumn)
97
256
  )
98
257
  }
258
+ if !is_undefined(stopCodonTypes) && len(stopCodonTypes) > 0 {
259
+ dfMain = applyStopCodonReplacementsPt(dfMain, {
260
+ aminoAcidSeqColumns: aminoAcidSeqColumns,
261
+ aminoAcidSeqColumnPairs: aminoAcidSeqColumnPairs,
262
+ cdr3SeqColumns: cdr3SeqColumns,
263
+ stopCodonTypes: stopCodonTypes,
264
+ stopCodonReplacements: stopCodonReplacements
265
+ })
266
+ }
99
267
  dfMain.addColumns(
100
268
  hashKeyDerivationExpressionPt(clonotypeKeyColumns).alias("clonotypeKey")
101
269
  )
@@ -94,7 +94,11 @@ self.body(func(inputs) {
94
94
  mainAbundanceColumnNormalized := exportSpecs.mainAbundanceColumnNormalized
95
95
  mainAbundanceColumnUnnormalized := exportSpecs.mainAbundanceColumnUnnormalized
96
96
  mainIsProductiveColumn := exportSpecs.mainIsProductiveColumn
97
+ productiveFeature := exportSpecs.productiveFeature
97
98
  axesByClonotypeKey := exportSpecs.axesByClonotypeKey
99
+ aminoAcidSeqColumns := exportSpecs.aminoAcidSeqColumns
100
+ aminoAcidSeqColumnPairs := exportSpecs.aminoAcidSeqColumnPairs
101
+ cdr3SeqColumns := exportSpecs.cdr3SeqColumns
98
102
 
99
103
  columnsToSchema := func(columns) {
100
104
  schema := []
@@ -256,13 +260,18 @@ self.body(func(inputs) {
256
260
  exportOutputs,
257
261
  {
258
262
  extra: {
259
- params: {
263
+ params: maps.clone({
260
264
  clonotypeKeyColumns: clonotypeKeyColumns,
261
265
  exportArgs: exportArgs,
262
266
  referenceLibrary: referenceLibrary,
263
- mixcrChains: mixcrChains,
264
- mainIsProductiveColumn: mainIsProductiveColumn
265
- }
267
+ mixcrChains: mixcrChains,
268
+ mainIsProductiveColumn: mainIsProductiveColumn,
269
+ aminoAcidSeqColumns: aminoAcidSeqColumns,
270
+ aminoAcidSeqColumnPairs: aminoAcidSeqColumnPairs,
271
+ cdr3SeqColumns: cdr3SeqColumns,
272
+ stopCodonTypes: params.stopCodonTypes,
273
+ stopCodonReplacements: params.stopCodonReplacements
274
+ }, { removeUndefs: true })
266
275
  }
267
276
  }
268
277
  )
@@ -346,7 +355,10 @@ self.body(func(inputs) {
346
355
  isLibraryFileGzipped: false,
347
356
  clonotypeTablesData: clonotypeTablesData,
348
357
  hasUmi: hasUMI,
349
- umiTags: umiTags
358
+ umiTags: umiTags,
359
+ productiveFeature: productiveFeature,
360
+ stopCodonTypes: params.stopCodonTypes,
361
+ stopCodonReplacements: params.stopCodonReplacements
350
362
  })
351
363
 
352
364
  return {
@@ -571,6 +571,38 @@ getQcReportColumns := func(hasUmi, sampleIdAxisSpec, chains, umiTags) {
571
571
  }
572
572
  }
573
573
  },
574
+ {
575
+ column: "assemble.clonotypesDroppedByStopCodons",
576
+ id: "assemble-clonotypes-dropped-by-stop-codons",
577
+ allowNA: true,
578
+ naRegex: "NaN",
579
+ spec: {
580
+ name: "mixcr.com/reports/assemble/clonotypesDroppedByStopCodons",
581
+ valueType: "Long",
582
+ annotations: {
583
+ "pl7.app/min": "0",
584
+ "pl7.app/table/orderPriority": "108200",
585
+ "pl7.app/table/visibility": "optional",
586
+ "pl7.app/label": "Clonotypes Dropped - Stop Codons"
587
+ }
588
+ }
589
+ },
590
+ {
591
+ column: "assemble.clonotypesDroppedByOutOfFrame",
592
+ id: "assemble-clonotypes-dropped-by-out-of-frame",
593
+ allowNA: true,
594
+ naRegex: "NaN",
595
+ spec: {
596
+ name: "mixcr.com/reports/assemble/clonotypesDroppedByOutOfFrame",
597
+ valueType: "Long",
598
+ annotations: {
599
+ "pl7.app/min": "0",
600
+ "pl7.app/table/orderPriority": "108100",
601
+ "pl7.app/table/visibility": "optional",
602
+ "pl7.app/label": "Clonotypes Dropped - Out of Frame"
603
+ }
604
+ }
605
+ },
574
606
  {
575
607
  column: "totalClonotypes",
576
608
  id: "total-clonotypes",