@platforma-open/milaboratories.mixcr-amplicon-alignment.workflow 1.17.0 → 1.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
   WARN  Issue while reading "/home/runner/work/mixcr-amplicon-alignment/mixcr-amplicon-alignment/.npmrc". Failed to replace env in config: ${NPMJS_TOKEN}
2
2
 
3
- > @platforma-open/milaboratories.mixcr-amplicon-alignment.workflow@1.17.0 build /home/runner/work/mixcr-amplicon-alignment/mixcr-amplicon-alignment/workflow
3
+ > @platforma-open/milaboratories.mixcr-amplicon-alignment.workflow@1.18.0 build /home/runner/work/mixcr-amplicon-alignment/mixcr-amplicon-alignment/workflow
4
4
  > rm -rf dist && pl-tengo check && pl-tengo build
5
5
 
6
6
  Processing "src/aggregate-by-clonotype-key.tpl.tengo"...
package/CHANGELOG.md CHANGED
@@ -1,5 +1,16 @@
1
1
  # @platforma-open/milaboratories.mixcr-amplicon-alignment.workflow
2
2
 
3
+ ## 1.18.0
4
+
5
+ ### Minor Changes
6
+
7
+ - 656f2fe: stop codon replacement and dep updates
8
+
9
+ ### Patch Changes
10
+
11
+ - Updated dependencies [656f2fe]
12
+ - @platforma-open/milaboratories.mixcr-amplicon-alignment.software@1.1.0
13
+
3
14
  ## 1.17.0
4
15
 
5
16
  ### Minor Changes
@@ -298,6 +298,9 @@ calculateExportSpecs := func(presetSpecForBack, blockId) {
298
298
  columnsSpecPerClonotypeAggregates += [ sampleCountColumn ]
299
299
 
300
300
  orderP := 80000
301
+ aminoAcidSeqColumns := []
302
+ aminoAcidSeqColumnPairs := []
303
+ cdr3SeqColumns := []
301
304
 
302
305
 
303
306
 
@@ -325,6 +328,16 @@ inFrameFeatures := {
325
328
  alphabetShortMixcr := isAminoAcid ? "aa" : "n"
326
329
  columnName := alphabetShortMixcr + "Seq" + featureInFrameU
327
330
  visibility := featureU == "VDJRegion" || featureU == "CDR3"
331
+ if featureU == "CDR3" {
332
+ cdr3SeqColumns += [ columnName ]
333
+ }
334
+ if isAminoAcid {
335
+ aminoAcidSeqColumns += [ columnName ]
336
+ aminoAcidSeqColumnPairs += [ {
337
+ aa: columnName,
338
+ nt: "nSeq" + featureU
339
+ } ]
340
+ }
328
341
  columnsSpecPerClonotypeNoAggregates += [ {
329
342
  column: columnName,
330
343
  id: alphabetShortMixcr + "-seq-" + featureInFrameL,
@@ -807,6 +820,7 @@ inFrameFeatures := {
807
820
  } ]
808
821
 
809
822
  return {
823
+ productiveFeature: productiveFeature,
810
824
  clonotypeKeyColumns: clonotypeKeyColumns,
811
825
  clonotypeKeyArgs: clonotypeKeyArgs,
812
826
 
@@ -816,6 +830,9 @@ inFrameFeatures := {
816
830
  columnsSpecPerClonotypeNoAggregates: columnsSpecPerClonotypeNoAggregates,
817
831
  columnsSpecPerClonotypeAggregates: columnsSpecPerClonotypeAggregates,
818
832
  cdr3DistanceColumnsSpec: cdr3DistanceColumnsSpec,
833
+ aminoAcidSeqColumns: aminoAcidSeqColumns,
834
+ aminoAcidSeqColumnPairs: aminoAcidSeqColumnPairs,
835
+ cdr3SeqColumns: cdr3SeqColumns,
819
836
 
820
837
  columnsSpec: columnsSpec,
821
838
 
@@ -571,6 +571,38 @@ getQcReportColumns := func(hasUmi, sampleIdAxisSpec, chains, umiTags) {
571
571
  }
572
572
  }
573
573
  },
574
+ {
575
+ column: "assemble.clonotypesDroppedByStopCodons",
576
+ id: "assemble-clonotypes-dropped-by-stop-codons",
577
+ allowNA: true,
578
+ naRegex: "NaN",
579
+ spec: {
580
+ name: "mixcr.com/reports/assemble/clonotypesDroppedByStopCodons",
581
+ valueType: "Long",
582
+ annotations: {
583
+ "pl7.app/min": "0",
584
+ "pl7.app/table/orderPriority": "108200",
585
+ "pl7.app/table/visibility": "optional",
586
+ "pl7.app/label": "Clonotypes Dropped - Stop Codons"
587
+ }
588
+ }
589
+ },
590
+ {
591
+ column: "assemble.clonotypesDroppedByOutOfFrame",
592
+ id: "assemble-clonotypes-dropped-by-out-of-frame",
593
+ allowNA: true,
594
+ naRegex: "NaN",
595
+ spec: {
596
+ name: "mixcr.com/reports/assemble/clonotypesDroppedByOutOfFrame",
597
+ valueType: "Long",
598
+ annotations: {
599
+ "pl7.app/min": "0",
600
+ "pl7.app/table/orderPriority": "108100",
601
+ "pl7.app/table/visibility": "optional",
602
+ "pl7.app/label": "Clonotypes Dropped - Out of Frame"
603
+ }
604
+ }
605
+ },
574
606
  {
575
607
  column: "totalClonotypes",
576
608
  id: "total-clonotypes",
Binary file
Binary file
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,16 +1,16 @@
1
1
  {
2
2
  "name": "@platforma-open/milaboratories.mixcr-amplicon-alignment.workflow",
3
- "version": "1.17.0",
3
+ "version": "1.18.0",
4
4
  "description": "MiXCR Amplicon Alignment Workflow",
5
5
  "type": "module",
6
6
  "dependencies": {
7
- "@platforma-sdk/workflow-tengo": "5.8.0",
7
+ "@platforma-sdk/workflow-tengo": "5.8.1",
8
8
  "@platforma-open/milaboratories.software-mixcr": "4.7.0-279-develop",
9
9
  "@platforma-open/milaboratories.software-repseqio": "^2.5.0-13-master",
10
- "@platforma-open/milaboratories.mixcr-amplicon-alignment.software": "1.0.0"
10
+ "@platforma-open/milaboratories.mixcr-amplicon-alignment.software": "1.1.0"
11
11
  },
12
12
  "devDependencies": {
13
- "@platforma-sdk/tengo-builder": "2.4.11"
13
+ "@platforma-sdk/tengo-builder": "2.4.12"
14
14
  },
15
15
  "scripts": {
16
16
  "build": "rm -rf dist && pl-tengo check && pl-tengo build",
@@ -298,6 +298,9 @@ calculateExportSpecs := func(presetSpecForBack, blockId) {
298
298
  columnsSpecPerClonotypeAggregates += [ sampleCountColumn ]
299
299
 
300
300
  orderP := 80000
301
+ aminoAcidSeqColumns := []
302
+ aminoAcidSeqColumnPairs := []
303
+ cdr3SeqColumns := []
301
304
 
302
305
  // Sequences
303
306
 
@@ -325,6 +328,16 @@ inFrameFeatures := {
325
328
  alphabetShortMixcr := isAminoAcid ? "aa" : "n"
326
329
  columnName := alphabetShortMixcr + "Seq" + featureInFrameU
327
330
  visibility := featureU == "VDJRegion" || featureU == "CDR3"
331
+ if featureU == "CDR3" {
332
+ cdr3SeqColumns += [ columnName ]
333
+ }
334
+ if isAminoAcid {
335
+ aminoAcidSeqColumns += [ columnName ]
336
+ aminoAcidSeqColumnPairs += [ {
337
+ aa: columnName,
338
+ nt: "nSeq" + featureU
339
+ } ]
340
+ }
328
341
  columnsSpecPerClonotypeNoAggregates += [ {
329
342
  column: columnName,
330
343
  id: alphabetShortMixcr + "-seq-" + featureInFrameL,
@@ -807,6 +820,7 @@ inFrameFeatures := {
807
820
  } ]
808
821
 
809
822
  return {
823
+ productiveFeature: productiveFeature,
810
824
  clonotypeKeyColumns: clonotypeKeyColumns,
811
825
  clonotypeKeyArgs: clonotypeKeyArgs,
812
826
 
@@ -816,6 +830,9 @@ inFrameFeatures := {
816
830
  columnsSpecPerClonotypeNoAggregates: columnsSpecPerClonotypeNoAggregates,
817
831
  columnsSpecPerClonotypeAggregates: columnsSpecPerClonotypeAggregates,
818
832
  cdr3DistanceColumnsSpec: cdr3DistanceColumnsSpec,
833
+ aminoAcidSeqColumns: aminoAcidSeqColumns,
834
+ aminoAcidSeqColumnPairs: aminoAcidSeqColumnPairs,
835
+ cdr3SeqColumns: cdr3SeqColumns,
819
836
 
820
837
  columnsSpec: columnsSpec,
821
838
 
@@ -32,6 +32,75 @@ self.body(func(inputs) {
32
32
 
33
33
  umiTags := inputs.umiTags
34
34
  hasUmi := !is_undefined(umiTags) && len(umiTags) > 0
35
+ stopCodonTypes := inputs.stopCodonTypes
36
+ stopCodonReplacements := inputs.stopCodonReplacements
37
+ if is_undefined(stopCodonTypes) || !is_array(stopCodonTypes) {
38
+ stopCodonTypes = []
39
+ }
40
+ useStopCodonReplacement := !is_undefined(stopCodonTypes) && is_array(stopCodonTypes) && len(stopCodonTypes) > 0
41
+ if is_undefined(stopCodonReplacements) || !is_map(stopCodonReplacements) {
42
+ stopCodonReplacements = {}
43
+ }
44
+
45
+ featureForFlags := inputs.productiveFeature
46
+ if is_undefined(featureForFlags) || featureForFlags == "" {
47
+ featureForFlags = "CDR3"
48
+ }
49
+ if is_array(featureForFlags) && len(featureForFlags) > 0 {
50
+ featureForFlags = featureForFlags[0]
51
+ }
52
+ isOOFColumn := "isOOF" + featureForFlags
53
+ hasStopsColumn := "hasStopsIn" + featureForFlags
54
+
55
+ contains := func(arr, value) {
56
+ for v in arr {
57
+ if v == value { return true }
58
+ }
59
+ return false
60
+ }
61
+
62
+ stopReplacement := func(stopType) {
63
+ if !contains(stopCodonTypes, stopType) {
64
+ return "*"
65
+ }
66
+ aa := stopCodonReplacements[stopType]
67
+ if is_undefined(aa) || aa == "" {
68
+ return "*"
69
+ }
70
+ return text.to_upper(aa)
71
+ }
72
+
73
+ codonMapReplace := {
74
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
75
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
76
+ "TAT": "Y", "TAC": "Y", "TAA": stopReplacement("ochre"),
77
+ "TAG": stopReplacement("amber"), "TGT": "C", "TGC": "C",
78
+ "TGA": stopReplacement("opal"), "TGG": "W",
79
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
80
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
81
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
82
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
83
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
84
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
85
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
86
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
87
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
88
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
89
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
90
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G"
91
+ }
92
+
93
+ translateNtToAaExpr := func(ntExpr, codonMap) {
94
+ seq := ntExpr.fillNull("").strToUpper()
95
+ seq = seq.strReplace("(.{3})", "$1|", { replaceAll: true })
96
+ for codon, aa in codonMap {
97
+ seq = seq.strReplace(codon + "|", aa + "|", { replaceAll: true, literal: true })
98
+ }
99
+ seq = seq.strReplace("\\|$", "", { replaceAll: false })
100
+ seq = seq.strReplace("|", "", { replaceAll: true, literal: true })
101
+ seq = seq.strReplace("[ACGT]{1,2}$", "", { replaceAll: true })
102
+ return seq
103
+ }
35
104
 
36
105
  chainInfos := {
37
106
  "IGHeavy": { mixcrFilter: "IGH", name: "IG Heavy", shortName: "Heavy" },
@@ -124,6 +193,77 @@ self.body(func(inputs) {
124
193
  // Join counts and overwrite totalClonotypes to reflect exported (productive) clones
125
194
  joinedDf := processedDf.join(aggregatedCounts, { how: "left", on: ["sampleId"] })
126
195
 
196
+ // Count clonotypes filtered by stop codons and out-of-frame per sample
197
+ filterCountDfs := []
198
+ mixcrChainsArg := text.join(chainsForMixcr, ",")
199
+ for key, clnsFile in clnsFiles {
200
+ sampleId := json.decode(key)[0]
201
+ exportFiltersCmd := exec.builder().
202
+ inMediumQueue().
203
+ mem("16GiB").
204
+ cpu(2).
205
+ software(mixcrSw).
206
+ env("MI_USE_SYSTEM_CA", "true").
207
+ secret("MI_LICENSE", "MI_LICENSE").
208
+ arg("exportClones").
209
+ arg("--dont-split-files").
210
+ arg("--drop-default-fields").
211
+ arg("--reset-export-clone-table-splitting")
212
+ if mixcrChainsArg != "" {
213
+ exportFiltersCmd.arg("--chains").arg(mixcrChainsArg)
214
+ }
215
+ exportFiltersCmd = exportFiltersCmd.
216
+ arg("-isOOF").arg(featureForFlags).
217
+ arg("-hasStops").arg(featureForFlags)
218
+ if useStopCodonReplacement {
219
+ exportFiltersCmd = exportFiltersCmd.arg("-nFeature").arg(featureForFlags)
220
+ }
221
+ exportFiltersCmd = exportFiltersCmd.
222
+ arg("clones.clns").
223
+ addFile("clones.clns", clnsFile).
224
+ arg("clones.tsv").
225
+ saveFile("clones.tsv")
226
+ if library {
227
+ if isLibraryFileGzipped {
228
+ exportFiltersCmd.addFile("library.json.gz", library)
229
+ } else {
230
+ exportFiltersCmd.addFile("library.json", library)
231
+ }
232
+ }
233
+ exportFiltersResult := exportFiltersCmd.cacheHours(3).run()
234
+ filterTsv := exportFiltersResult.getFile("clones.tsv")
235
+ schema := [ { column: isOOFColumn, type: "String" }, { column: hasStopsColumn, type: "String" } ]
236
+ if useStopCodonReplacement {
237
+ schema = append(schema, { column: "nSeq" + featureForFlags, type: "String" })
238
+ }
239
+ dfFilters := wf.frame(filterTsv, { xsvType: "tsv", inferSchema: false, schema: schema })
240
+ stopExpr := pt.when(pt.col(hasStopsColumn).strToUpper().eq("TRUE")).then(pt.lit(1)).otherwise(pt.lit(0))
241
+ if useStopCodonReplacement {
242
+ translated := translateNtToAaExpr(pt.col("nSeq" + featureForFlags), codonMapReplace)
243
+ stopExpr = pt.when(translated.strContains("*", { literal: true })).then(pt.lit(1)).otherwise(pt.lit(0))
244
+ }
245
+ dfFilters = dfFilters.withColumns(
246
+ pt.when(pt.col(isOOFColumn).strToUpper().eq("TRUE")).then(pt.lit(1)).otherwise(pt.lit(0)).alias("__oof"),
247
+ stopExpr.alias("__stop")
248
+ )
249
+ dfFilterCount := dfFilters.select(
250
+ pt.lit(sampleId).alias("sampleId"),
251
+ pt.col("__oof").sum().alias("assemble.clonotypesDroppedByOutOfFrame"),
252
+ pt.col("__stop").sum().alias("assemble.clonotypesDroppedByStopCodons")
253
+ )
254
+ filterCountDfs = append(filterCountDfs, dfFilterCount)
255
+ }
256
+
257
+ if len(filterCountDfs) > 0 {
258
+ filterCountsDf := len(filterCountDfs) > 1 ? pt.concat(filterCountDfs) : filterCountDfs[0]
259
+ joinedDf = joinedDf.join(filterCountsDf, { how: "left", on: ["sampleId"] })
260
+ } else {
261
+ joinedDf = joinedDf.withColumns(
262
+ pt.lit(0).alias("assemble.clonotypesDroppedByOutOfFrame"),
263
+ pt.lit(0).alias("assemble.clonotypesDroppedByStopCodons")
264
+ )
265
+ }
266
+
127
267
  // Per-chain clonotype counts
128
268
  perChainJoined := joinedDf
129
269
  for chain in chains {
@@ -159,6 +299,10 @@ self.body(func(inputs) {
159
299
  pt.col("exportedClonotypes").fillNull(0).cast("Long").alias("totalClonotypes"),
160
300
  pt.col("readsUsedInClonotypesNew").fillNull(0).cast("Long").alias("readsUsedInClonotypes")
161
301
  )
302
+ finalDf = finalDf.withColumns(
303
+ pt.col("assemble.clonotypesDroppedByOutOfFrame").fillNull(0).cast("Long").alias("assemble.clonotypesDroppedByOutOfFrame"),
304
+ pt.col("assemble.clonotypesDroppedByStopCodons").fillNull(0).cast("Long").alias("assemble.clonotypesDroppedByStopCodons")
305
+ )
162
306
  for chain in chains {
163
307
  col := "clonotypesByChain." + chain
164
308
  finalDf = finalDf.withColumns(pt.col(col).fillNull(0).cast("Long").alias(col))
@@ -68,7 +68,9 @@ wf.body(func(args) {
68
68
  mixcrChains: chainInfos[chains].mixcrFilter,
69
69
  cloneClusteringMode: cloneClusteringMode,
70
70
  tagPattern: args.tagPattern,
71
- assemblingFeature: args.assemblingFeature
71
+ assemblingFeature: args.assemblingFeature,
72
+ stopCodonTypes: args.stopCodonTypes,
73
+ stopCodonReplacements: args.stopCodonReplacements
72
74
  })
73
75
  })
74
76
 
@@ -9,11 +9,157 @@ pt := import("@platforma-sdk/workflow-tengo:pt")
9
9
  clonotypeLabel := import(":clonotype-label")
10
10
 
11
11
  json := import("json")
12
+ text := import("text")
12
13
 
13
14
  mixcrSw := assets.importSoftware("@platforma-open/milaboratories.software-mixcr:main")
14
15
 
15
16
  self.defineOutputs("tsv")
16
17
 
18
+ applyStopCodonReplacementsPt := func(df, opts) {
19
+ if is_undefined(opts) {
20
+ return df
21
+ }
22
+ aminoAcidSeqColumns := opts.aminoAcidSeqColumns
23
+ cdr3SeqColumns := opts.cdr3SeqColumns
24
+ stopCodonTypes := opts.stopCodonTypes
25
+ stopCodonReplacements := opts.stopCodonReplacements
26
+
27
+ if is_undefined(aminoAcidSeqColumns) || len(aminoAcidSeqColumns) == 0 {
28
+ return df
29
+ }
30
+ if is_undefined(stopCodonTypes) || !is_array(stopCodonTypes) || len(stopCodonTypes) == 0 {
31
+ return df
32
+ }
33
+ if !is_undefined(stopCodonReplacements) && !is_map(stopCodonReplacements) {
34
+ stopCodonReplacements = undefined
35
+ }
36
+
37
+ contains := func(arr, value) {
38
+ for v in arr {
39
+ if v == value { return true }
40
+ }
41
+ return false
42
+ }
43
+
44
+ stopReplacement := func(stopType) {
45
+ if !contains(stopCodonTypes, stopType) {
46
+ return "*"
47
+ }
48
+ if is_undefined(stopCodonReplacements) {
49
+ return "*"
50
+ }
51
+ aa := stopCodonReplacements[stopType]
52
+ if is_undefined(aa) || aa == "" {
53
+ return "*"
54
+ }
55
+ return text.to_upper(aa)
56
+ }
57
+
58
+ codonMapBase := {
59
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
60
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
61
+ "TAT": "Y", "TAC": "Y", "TAA": "*",
62
+ "TAG": "*", "TGT": "C", "TGC": "C",
63
+ "TGA": "*", "TGG": "W",
64
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
65
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
66
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
67
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
68
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
69
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
70
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
71
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
72
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
73
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
74
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
75
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G"
76
+ }
77
+
78
+ codonMapReplace := {
79
+ "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
80
+ "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
81
+ "TAT": "Y", "TAC": "Y", "TAA": stopReplacement("ochre"),
82
+ "TAG": stopReplacement("amber"), "TGT": "C", "TGC": "C",
83
+ "TGA": stopReplacement("opal"), "TGG": "W",
84
+ "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
85
+ "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
86
+ "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
87
+ "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
88
+ "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
89
+ "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
90
+ "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
91
+ "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
92
+ "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
93
+ "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
94
+ "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
95
+ "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G"
96
+ }
97
+
98
+ translateNtToAaExpr := func(ntExpr, codonMap) {
99
+ seq := ntExpr.fillNull("").strToUpper()
100
+ seq = seq.strReplace("(.{3})", "$1|", { replaceAll: true })
101
+ for codon, aa in codonMap {
102
+ seq = seq.strReplace(codon + "|", aa + "|", { replaceAll: true, literal: true })
103
+ }
104
+ seq = seq.strReplace("\\|$", "", { replaceAll: false })
105
+ seq = seq.strReplace("|", "", { replaceAll: true, literal: true })
106
+ seq = seq.strReplace("[ACGT]{1,2}$", "", { replaceAll: true })
107
+ return seq
108
+ }
109
+
110
+ pairs := []
111
+ for aaCol in aminoAcidSeqColumns {
112
+ ntCol := text.replace(aaCol, "aaSeq", "nSeq", 1)
113
+ pairs = append(pairs, { aa: aaCol, nt: ntCol })
114
+ }
115
+
116
+ expressions := []
117
+ replacedAnyExprs := []
118
+ replacedColsExprs := []
119
+ for pair in pairs {
120
+ aaCol := pair.aa
121
+ ntCol := pair.nt
122
+ translatedBase := translateNtToAaExpr(pt.col(ntCol), codonMapBase)
123
+ translatedReplaced := translateNtToAaExpr(pt.col(ntCol), codonMapReplace)
124
+ expressions = append(expressions, translatedReplaced.alias(aaCol))
125
+ cond := translatedReplaced.neq(translatedBase)
126
+ replacedAnyExprs = append(replacedAnyExprs, cond)
127
+ replacedColsExprs = append(replacedColsExprs, pt.when(cond).then(pt.lit(aaCol)).otherwise(pt.lit("")))
128
+ }
129
+ if len(expressions) > 0 {
130
+ df = df.withColumns(expressions...)
131
+ }
132
+
133
+ if len(replacedAnyExprs) > 0 {
134
+ colsList := pt.concatStr(replacedColsExprs, { delimiter: "," })
135
+ colsList = colsList.strReplace(",+", ",", { replaceAll: true }).strReplace("^,|,$", "", { replaceAll: true })
136
+ df = df.withColumns(
137
+ pt.anyHorizontal(replacedAnyExprs...).alias("stopCodonReplaced"),
138
+ colsList.alias("stopCodonReplacedColumns")
139
+ )
140
+ }
141
+
142
+ stopChecks := []
143
+ for colName in aminoAcidSeqColumns {
144
+ stopChecks = append(stopChecks, pt.col(colName).strContains("*", { literal: true }))
145
+ }
146
+ if len(stopChecks) > 0 {
147
+ df = df.filter(pt.anyHorizontal(stopChecks...).eq(false))
148
+ }
149
+
150
+ if !is_undefined(cdr3SeqColumns) && len(cdr3SeqColumns) > 0 {
151
+ regionChecks := []
152
+ for colName in cdr3SeqColumns {
153
+ regionChecks = append(regionChecks, pt.col(colName).strToUpper().eq("REGION_NOT_COVERED"))
154
+ }
155
+ if len(regionChecks) > 0 {
156
+ df = df.filter(pt.anyHorizontal(regionChecks...).eq(false))
157
+ }
158
+ }
159
+
160
+ return df
161
+ }
162
+
17
163
  self.body(func(inputs) {
18
164
  clnsFile := inputs[pConstants.VALUE_FIELD_NAME]
19
165
 
@@ -22,6 +168,13 @@ self.body(func(inputs) {
22
168
 
23
169
  clonotypeKeyColumns := params.clonotypeKeyColumns
24
170
  mainIsProductiveColumn := params.mainIsProductiveColumn
171
+ aminoAcidSeqColumns := params.aminoAcidSeqColumns
172
+ aminoAcidSeqColumnPairs := params.aminoAcidSeqColumnPairs
173
+ cdr3SeqColumns := params.cdr3SeqColumns
174
+ stopCodonTypes := params.stopCodonTypes
175
+ stopCodonReplacements := params.stopCodonReplacements
176
+
177
+ useProductiveFilter := is_undefined(stopCodonTypes) || len(stopCodonTypes) == 0
25
178
 
26
179
  hashKeyDerivationExpressionPt := func(sourceColumns) {
27
180
  return pt.concatStr(
@@ -42,8 +195,13 @@ self.body(func(inputs) {
42
195
  arg("--dont-split-files").
43
196
  arg("--drop-default-fields").
44
197
  arg("--reset-export-clone-table-splitting").
45
- arg("--chains").arg(params.mixcrChains).
46
- arg("--export-productive-clones-only")
198
+ arg("--chains").arg(params.mixcrChains)
199
+
200
+ if useProductiveFilter {
201
+ mixcrCmdBuilder = mixcrCmdBuilder.arg("--export-productive-clones-only")
202
+ } else {
203
+ mixcrCmdBuilder = mixcrCmdBuilder.arg("--filter-out-of-frames")
204
+ }
47
205
 
48
206
  additionalAction(mixcrCmdBuilder)
49
207
 
@@ -96,6 +254,15 @@ self.body(func(inputs) {
96
254
  alias(mainIsProductiveColumn)
97
255
  )
98
256
  }
257
+ if !is_undefined(stopCodonTypes) && len(stopCodonTypes) > 0 {
258
+ dfMain = applyStopCodonReplacementsPt(dfMain, {
259
+ aminoAcidSeqColumns: aminoAcidSeqColumns,
260
+ aminoAcidSeqColumnPairs: aminoAcidSeqColumnPairs,
261
+ cdr3SeqColumns: cdr3SeqColumns,
262
+ stopCodonTypes: stopCodonTypes,
263
+ stopCodonReplacements: stopCodonReplacements
264
+ })
265
+ }
99
266
  dfMain.addColumns(
100
267
  hashKeyDerivationExpressionPt(clonotypeKeyColumns).alias("clonotypeKey")
101
268
  )
@@ -94,7 +94,11 @@ self.body(func(inputs) {
94
94
  mainAbundanceColumnNormalized := exportSpecs.mainAbundanceColumnNormalized
95
95
  mainAbundanceColumnUnnormalized := exportSpecs.mainAbundanceColumnUnnormalized
96
96
  mainIsProductiveColumn := exportSpecs.mainIsProductiveColumn
97
+ productiveFeature := exportSpecs.productiveFeature
97
98
  axesByClonotypeKey := exportSpecs.axesByClonotypeKey
99
+ aminoAcidSeqColumns := exportSpecs.aminoAcidSeqColumns
100
+ aminoAcidSeqColumnPairs := exportSpecs.aminoAcidSeqColumnPairs
101
+ cdr3SeqColumns := exportSpecs.cdr3SeqColumns
98
102
 
99
103
  columnsToSchema := func(columns) {
100
104
  schema := []
@@ -256,13 +260,18 @@ self.body(func(inputs) {
256
260
  exportOutputs,
257
261
  {
258
262
  extra: {
259
- params: {
263
+ params: maps.clone({
260
264
  clonotypeKeyColumns: clonotypeKeyColumns,
261
265
  exportArgs: exportArgs,
262
266
  referenceLibrary: referenceLibrary,
263
- mixcrChains: mixcrChains,
264
- mainIsProductiveColumn: mainIsProductiveColumn
265
- }
267
+ mixcrChains: mixcrChains,
268
+ mainIsProductiveColumn: mainIsProductiveColumn,
269
+ aminoAcidSeqColumns: aminoAcidSeqColumns,
270
+ aminoAcidSeqColumnPairs: aminoAcidSeqColumnPairs,
271
+ cdr3SeqColumns: cdr3SeqColumns,
272
+ stopCodonTypes: params.stopCodonTypes,
273
+ stopCodonReplacements: params.stopCodonReplacements
274
+ }, { removeUndefs: true })
266
275
  }
267
276
  }
268
277
  )
@@ -346,7 +355,10 @@ self.body(func(inputs) {
346
355
  isLibraryFileGzipped: false,
347
356
  clonotypeTablesData: clonotypeTablesData,
348
357
  hasUmi: hasUMI,
349
- umiTags: umiTags
358
+ umiTags: umiTags,
359
+ productiveFeature: productiveFeature,
360
+ stopCodonTypes: params.stopCodonTypes,
361
+ stopCodonReplacements: params.stopCodonReplacements
350
362
  })
351
363
 
352
364
  return {
@@ -571,6 +571,38 @@ getQcReportColumns := func(hasUmi, sampleIdAxisSpec, chains, umiTags) {
571
571
  }
572
572
  }
573
573
  },
574
+ {
575
+ column: "assemble.clonotypesDroppedByStopCodons",
576
+ id: "assemble-clonotypes-dropped-by-stop-codons",
577
+ allowNA: true,
578
+ naRegex: "NaN",
579
+ spec: {
580
+ name: "mixcr.com/reports/assemble/clonotypesDroppedByStopCodons",
581
+ valueType: "Long",
582
+ annotations: {
583
+ "pl7.app/min": "0",
584
+ "pl7.app/table/orderPriority": "108200",
585
+ "pl7.app/table/visibility": "optional",
586
+ "pl7.app/label": "Clonotypes Dropped - Stop Codons"
587
+ }
588
+ }
589
+ },
590
+ {
591
+ column: "assemble.clonotypesDroppedByOutOfFrame",
592
+ id: "assemble-clonotypes-dropped-by-out-of-frame",
593
+ allowNA: true,
594
+ naRegex: "NaN",
595
+ spec: {
596
+ name: "mixcr.com/reports/assemble/clonotypesDroppedByOutOfFrame",
597
+ valueType: "Long",
598
+ annotations: {
599
+ "pl7.app/min": "0",
600
+ "pl7.app/table/orderPriority": "108100",
601
+ "pl7.app/table/visibility": "optional",
602
+ "pl7.app/label": "Clonotypes Dropped - Out of Frame"
603
+ }
604
+ }
605
+ },
574
606
  {
575
607
  column: "totalClonotypes",
576
608
  id: "total-clonotypes",