@platforma-open/milaboratories.immune-assay-data.workflow 1.7.1 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +9 -1
- package/CHANGELOG.md +6 -0
- package/dist/index.cjs +4 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +4 -0
- package/dist/tengo/tpl/analysis.plj.gz +0 -0
- package/dist/tengo/tpl/build-outputs.plj.gz +0 -0
- package/dist/tengo/tpl/check-content-empty.plj.gz +0 -0
- package/dist/tengo/tpl/extract-unique-values.plj.gz +0 -0
- package/dist/tengo/tpl/get-unique-values.plj.gz +0 -0
- package/dist/tengo/tpl/main.plj.gz +0 -0
- package/dist/tengo/tpl/process-outputs.plj.gz +0 -0
- package/dist/tengo/tpl/run-alignment.plj.gz +0 -0
- package/package.json +6 -6
- package/src/analysis.tpl.tengo +370 -0
- package/src/build-outputs.tpl.tengo +2 -19
- package/src/check-content-empty.tpl.tengo +21 -0
- package/src/extract-unique-values.tpl.tengo +2 -5
- package/src/get-unique-values.tpl.tengo +51 -0
- package/src/main.tpl.tengo +80 -337
- package/src/process-outputs.tpl.tengo +39 -0
- package/src/run-alignment.tpl.tengo +4 -4
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,17 +1,25 @@
|
|
|
1
1
|
WARN Issue while reading "/home/runner/work/immune-assay-data/immune-assay-data/.npmrc". Failed to replace env in config: ${NPMJS_TOKEN}
|
|
2
2
|
|
|
3
|
-
> @platforma-open/milaboratories.immune-assay-data.workflow@1.7.1 build /home/runner/work/immune-assay-data/immune-assay-data/workflow
|
|
3
|
+
> @platforma-open/milaboratories.immune-assay-data.workflow@1.8.0 build /home/runner/work/immune-assay-data/immune-assay-data/workflow
|
|
4
4
|
> rm -rf dist && pl-tengo check && pl-tengo build
|
|
5
5
|
|
|
6
|
+
Processing "src/analysis.tpl.tengo"...
|
|
6
7
|
Processing "src/build-outputs.tpl.tengo"...
|
|
8
|
+
Processing "src/check-content-empty.tpl.tengo"...
|
|
7
9
|
Processing "src/extract-unique-values.tpl.tengo"...
|
|
10
|
+
Processing "src/get-unique-values.tpl.tengo"...
|
|
8
11
|
Processing "src/main.tpl.tengo"...
|
|
12
|
+
Processing "src/process-outputs.tpl.tengo"...
|
|
9
13
|
Processing "src/run-alignment.tpl.tengo"...
|
|
10
14
|
No syntax errors found.
|
|
11
15
|
info: Compiling 'dist'...
|
|
12
16
|
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/build-outputs.plj.gz
|
|
17
|
+
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/check-content-empty.plj.gz
|
|
13
18
|
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/extract-unique-values.plj.gz
|
|
19
|
+
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/get-unique-values.plj.gz
|
|
20
|
+
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/process-outputs.plj.gz
|
|
14
21
|
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/run-alignment.plj.gz
|
|
22
|
+
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/analysis.plj.gz
|
|
15
23
|
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/main.plj.gz
|
|
16
24
|
info: Template Pack build done.
|
|
17
25
|
info: Template Pack build done.
|
package/CHANGELOG.md
CHANGED
package/dist/index.cjs
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
module.exports = { Templates: {
|
|
2
2
|
'build-outputs': { type: 'from-file', path: require.resolve('./tengo/tpl/build-outputs.plj.gz') },
|
|
3
|
+
'check-content-empty': { type: 'from-file', path: require.resolve('./tengo/tpl/check-content-empty.plj.gz') },
|
|
3
4
|
'extract-unique-values': { type: 'from-file', path: require.resolve('./tengo/tpl/extract-unique-values.plj.gz') },
|
|
5
|
+
'get-unique-values': { type: 'from-file', path: require.resolve('./tengo/tpl/get-unique-values.plj.gz') },
|
|
6
|
+
'process-outputs': { type: 'from-file', path: require.resolve('./tengo/tpl/process-outputs.plj.gz') },
|
|
4
7
|
'run-alignment': { type: 'from-file', path: require.resolve('./tengo/tpl/run-alignment.plj.gz') },
|
|
8
|
+
'analysis': { type: 'from-file', path: require.resolve('./tengo/tpl/analysis.plj.gz') },
|
|
5
9
|
'main': { type: 'from-file', path: require.resolve('./tengo/tpl/main.plj.gz') }
|
|
6
10
|
}};
|
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
declare type TemplateFromFile = { readonly type: "from-file"; readonly path: string; };
|
|
2
|
-
declare type TplName = "build-outputs" | "extract-unique-values" | "run-alignment" | "main";
|
|
2
|
+
declare type TplName = "build-outputs" | "check-content-empty" | "extract-unique-values" | "get-unique-values" | "process-outputs" | "run-alignment" | "analysis" | "main";
|
|
3
3
|
declare const Templates: Record<TplName, TemplateFromFile>;
|
|
4
4
|
export { Templates };
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
import { resolve } from 'node:path';
|
|
2
2
|
export const Templates = {
|
|
3
3
|
'build-outputs': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/build-outputs.plj.gz') },
|
|
4
|
+
'check-content-empty': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/check-content-empty.plj.gz') },
|
|
4
5
|
'extract-unique-values': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/extract-unique-values.plj.gz') },
|
|
6
|
+
'get-unique-values': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/get-unique-values.plj.gz') },
|
|
7
|
+
'process-outputs': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/process-outputs.plj.gz') },
|
|
5
8
|
'run-alignment': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/run-alignment.plj.gz') },
|
|
9
|
+
'analysis': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/analysis.plj.gz') },
|
|
6
10
|
'main': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/main.plj.gz') }
|
|
7
11
|
};
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@platforma-open/milaboratories.immune-assay-data.workflow",
|
|
3
|
-
"version": "1.7.1",
|
|
3
|
+
"version": "1.8.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Tengo-based template",
|
|
6
6
|
"dependencies": {
|
|
7
|
-
"@platforma-sdk/workflow-tengo": "5.9.
|
|
7
|
+
"@platforma-sdk/workflow-tengo": "5.9.1",
|
|
8
8
|
"@platforma-open/soedinglab.software-mmseqs2": "1.18.3",
|
|
9
|
-
"@platforma-open/milaboratories.immune-assay-data.prepare-fasta": "1.1.3",
|
|
10
9
|
"@platforma-open/milaboratories.immune-assay-data.add-header": "1.1.3",
|
|
11
|
-
"@platforma-open/milaboratories.immune-assay-data.
|
|
10
|
+
"@platforma-open/milaboratories.immune-assay-data.prepare-fasta": "1.1.3",
|
|
12
11
|
"@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc": "1.2.0",
|
|
13
|
-
"@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv": "1.1.3"
|
|
12
|
+
"@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv": "1.1.3",
|
|
13
|
+
"@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv": "1.1.0"
|
|
14
14
|
},
|
|
15
15
|
"devDependencies": {
|
|
16
|
-
"@platforma-sdk/tengo-builder": "2.4.
|
|
16
|
+
"@platforma-sdk/tengo-builder": "2.4.27"
|
|
17
17
|
},
|
|
18
18
|
"scripts": {
|
|
19
19
|
"build": "rm -rf dist && pl-tengo check && pl-tengo build",
|
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
2
|
+
ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
3
|
+
exec := import("@platforma-sdk/workflow-tengo:exec")
|
|
4
|
+
assets := import("@platforma-sdk/workflow-tengo:assets")
|
|
5
|
+
pt := import("@platforma-sdk/workflow-tengo:pt")
|
|
6
|
+
text := import("text")
|
|
7
|
+
json := import("json")
|
|
8
|
+
render := import("@platforma-sdk/workflow-tengo:render")
|
|
9
|
+
|
|
10
|
+
prepareFastaSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.prepare-fasta:main")
|
|
11
|
+
fastaToTsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv:main")
|
|
12
|
+
addHeaderSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.add-header:main")
|
|
13
|
+
covModeCalcSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc:main")
|
|
14
|
+
xlsxToCsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv:main")
|
|
15
|
+
|
|
16
|
+
runAlignmentTpl := assets.importTemplate(":run-alignment")
|
|
17
|
+
checkContentEmptyTpl := assets.importTemplate(":check-content-empty")
|
|
18
|
+
getUniqueValuesTpl := assets.importTemplate(":get-unique-values")
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Convert xlsx file to csv file
|
|
22
|
+
* @param fileXlsx - xlsx file
|
|
23
|
+
* @return csv file run result
|
|
24
|
+
*/
|
|
25
|
+
runXlsxToCsv := func(fileXlsx) {
|
|
26
|
+
e := exec.builder().
|
|
27
|
+
software(xlsxToCsvSw).
|
|
28
|
+
mem("16GiB").
|
|
29
|
+
cpu(1).
|
|
30
|
+
addFile("input.xlsx", fileXlsx).
|
|
31
|
+
arg("-i").arg("input.xlsx").
|
|
32
|
+
arg("-o").arg("output.csv").
|
|
33
|
+
saveFile("output.csv")
|
|
34
|
+
return e.run()
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Convert fasta file to tsv file
|
|
39
|
+
* @param fileFasta - fasta file
|
|
40
|
+
* @return tsv file run result
|
|
41
|
+
*/
|
|
42
|
+
runFastaToTsv := func(fileFasta) {
|
|
43
|
+
e := exec.builder().
|
|
44
|
+
software(fastaToTsvSw).
|
|
45
|
+
mem("8GiB").
|
|
46
|
+
cpu(1).
|
|
47
|
+
addFile("input.fasta", fileFasta).
|
|
48
|
+
arg("-i").arg("input.fasta").
|
|
49
|
+
arg("-o").arg("output.tsv").
|
|
50
|
+
saveFile("output.tsv")
|
|
51
|
+
return e.run()
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* Convert tsv file to fasta file
|
|
56
|
+
* @param fileTsv - tsv file
|
|
57
|
+
* @param idCol - id column header
|
|
58
|
+
* @param seqCol - sequence column header
|
|
59
|
+
* @return fasta file run result
|
|
60
|
+
*/
|
|
61
|
+
runTsvToFasta := func(fileTsv, idCol, seqCol) {
|
|
62
|
+
e := exec.builder().
|
|
63
|
+
software(prepareFastaSw).
|
|
64
|
+
mem("8GiB").
|
|
65
|
+
cpu(1).
|
|
66
|
+
addFile("input.tsv", fileTsv).
|
|
67
|
+
arg("-i").arg("input.tsv").
|
|
68
|
+
arg("-o").arg("output.fasta").
|
|
69
|
+
arg("--seq_col").arg(seqCol).
|
|
70
|
+
arg("--id_col").arg(idCol).
|
|
71
|
+
saveFile("output.fasta")
|
|
72
|
+
return e.run()
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
prepareAssayFile := func(file, xsvType, sequenceColumnHeader) {
|
|
76
|
+
// assign ids to assay sequences
|
|
77
|
+
ptw := pt.workflow()
|
|
78
|
+
df := ptw.frame({
|
|
79
|
+
file: file,
|
|
80
|
+
xsvType: xsvType
|
|
81
|
+
})
|
|
82
|
+
|
|
83
|
+
//////// calculate sequence id ////////
|
|
84
|
+
// Create unique seqId for each row by combining sequence with row index
|
|
85
|
+
// First add row index using ordinal rank
|
|
86
|
+
df = df.withColumns(
|
|
87
|
+
pt.rank(pt.col(sequenceColumnHeader)).
|
|
88
|
+
over(pt.col(sequenceColumnHeader)).
|
|
89
|
+
alias("rowIndex")
|
|
90
|
+
)
|
|
91
|
+
// Concatenate sequence with row index and then hash
|
|
92
|
+
df = df.withColumns(
|
|
93
|
+
pt.when(pt.col("rowIndex").gt(pt.lit(1))).
|
|
94
|
+
then(pt.concatStr([pt.col(sequenceColumnHeader), pt.col("rowIndex").cast("String")],
|
|
95
|
+
{delimiter: "_"})).
|
|
96
|
+
otherwise(pt.col(sequenceColumnHeader)).
|
|
97
|
+
alias("uniqueKey")
|
|
98
|
+
)
|
|
99
|
+
// Create hash from the unique key
|
|
100
|
+
df = df.addColumns(
|
|
101
|
+
pt.col("uniqueKey").hash("sha256", "base64_alphanumeric", 120).alias("seqId")
|
|
102
|
+
)
|
|
103
|
+
df = df.withoutColumns("uniqueKey", "rowIndex")
|
|
104
|
+
//////// add label to ids ////////
|
|
105
|
+
df = df.withColumns(
|
|
106
|
+
pt.col("seqId").
|
|
107
|
+
strReplace("\\d", "", { replaceAll: true }).
|
|
108
|
+
strSlice(0, 5). // Take first 5 characters
|
|
109
|
+
strToUpper(). // Convert to uppercase
|
|
110
|
+
alias("tmpLabel")
|
|
111
|
+
)
|
|
112
|
+
df = df.withColumns(
|
|
113
|
+
pt.rank(pt.col("seqId")). // Rank based on clonotypeKeyCol (default ascending)
|
|
114
|
+
over(pt.col("tmpLabel")). // Partition by prefixTempCol
|
|
115
|
+
alias("rank")
|
|
116
|
+
)
|
|
117
|
+
df = df.withColumns(
|
|
118
|
+
pt.when(pt.col("rank").gt(pt.lit(1))).
|
|
119
|
+
then(pt.concatStr([pt.lit("A"), pt.col("tmpLabel"), pt.col("rank").cast("String")], {
|
|
120
|
+
delimiter: "-" })).
|
|
121
|
+
otherwise(pt.concatStr([pt.lit("A"), pt.col("tmpLabel")], { delimiter: "-" })).
|
|
122
|
+
alias("seqIdLabel")
|
|
123
|
+
)
|
|
124
|
+
df = df.withoutColumns("rank", "tmpLabel")
|
|
125
|
+
|
|
126
|
+
//////// add sequence column ////////
|
|
127
|
+
df = df.addColumns(
|
|
128
|
+
pt.col(sequenceColumnHeader).alias("sequence")
|
|
129
|
+
)
|
|
130
|
+
df.save("output.tsv")
|
|
131
|
+
|
|
132
|
+
return ptw.run().getFile("output.tsv")
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
self.defineOutputs(
|
|
136
|
+
"bestAlignmentTsv",
|
|
137
|
+
"assayDataTsv",
|
|
138
|
+
"clonesDataTsv",
|
|
139
|
+
"mmseqsOutput",
|
|
140
|
+
"emptyResults",
|
|
141
|
+
"uniqueValuesMap",
|
|
142
|
+
"columnsToImport"
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
self.body(func(args) {
|
|
146
|
+
file := args.file
|
|
147
|
+
xsvType := args.xsvType
|
|
148
|
+
sequenceColumnHeader := args.sequenceColumnHeader
|
|
149
|
+
clonesFasta := args.clonesFasta
|
|
150
|
+
emptyClonesInput := args.emptyClonesInput
|
|
151
|
+
targetSequenceType := args.targetSequenceType
|
|
152
|
+
assaySequenceType := args.assaySequenceType
|
|
153
|
+
coverageThreshold := args.coverageThreshold
|
|
154
|
+
identityThreshold := args.identityThreshold
|
|
155
|
+
similarityType := args.similarityType
|
|
156
|
+
lessSensitive := args.lessSensitive
|
|
157
|
+
mem := args.metaInputs.mem
|
|
158
|
+
cpu := args.metaInputs.cpu
|
|
159
|
+
|
|
160
|
+
// Filter columns to import based on user selection
|
|
161
|
+
columnsToImport := args.importColumns
|
|
162
|
+
if args.selectedColumns != undefined && len(args.selectedColumns) > 0 {
|
|
163
|
+
selectedHeaders := {}
|
|
164
|
+
for header in args.selectedColumns {
|
|
165
|
+
selectedHeaders[header] = true
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
filteredColumns := []
|
|
169
|
+
for col in args.importColumns {
|
|
170
|
+
// Always include the main sequence column
|
|
171
|
+
if col.header == sequenceColumnHeader || selectedHeaders[col.header] {
|
|
172
|
+
filteredColumns = append(filteredColumns, col)
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
columnsToImport = filteredColumns
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
// Determine string columns for unique value extraction
|
|
179
|
+
stringColumns := []
|
|
180
|
+
for h in columnsToImport {
|
|
181
|
+
if h.type == "String" && h.header != sequenceColumnHeader {
|
|
182
|
+
stringColumns = append(stringColumns, h.header)
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// Check if clones input is empty (resolved from upstream check template)
|
|
187
|
+
if emptyClonesInput {
|
|
188
|
+
return {
|
|
189
|
+
bestAlignmentTsv: {},
|
|
190
|
+
assayDataTsv: {},
|
|
191
|
+
clonesDataTsv: {},
|
|
192
|
+
mmseqsOutput: {},
|
|
193
|
+
emptyResults: true,
|
|
194
|
+
uniqueValuesMap: {},
|
|
195
|
+
columnsToImport: columnsToImport
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
// Handle XLSX files (convert to CSV)
|
|
200
|
+
if xsvType == "xlsx" {
|
|
201
|
+
xlsxToCsvRun := runXlsxToCsv(file)
|
|
202
|
+
file = xlsxToCsvRun.getFile("output.csv")
|
|
203
|
+
xsvType = "csv"
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
// Handle FASTA files (convert to TSV)
|
|
207
|
+
if xsvType == "fasta" || xsvType == "fa" {
|
|
208
|
+
fastaToTsvRun := runFastaToTsv(file)
|
|
209
|
+
file = fastaToTsvRun.getFile("output.tsv")
|
|
210
|
+
xsvType = "tsv"
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
assayTsv := prepareAssayFile(file, xsvType, sequenceColumnHeader)
|
|
214
|
+
|
|
215
|
+
// Prepare assay FASTA (clones FASTA already prepared upstream)
|
|
216
|
+
assayFastaRun := runTsvToFasta(assayTsv, "seqId", "sequence")
|
|
217
|
+
assayFasta := assayFastaRun.getFile("output.fasta")
|
|
218
|
+
|
|
219
|
+
// Dynamically determine coverage mode by comparing average sequence lengths
|
|
220
|
+
coverageModeRun := exec.builder().
|
|
221
|
+
software(covModeCalcSw).
|
|
222
|
+
mem("16GiB").
|
|
223
|
+
cpu(1).
|
|
224
|
+
addFile("clones.fasta", clonesFasta).
|
|
225
|
+
addFile("assay.fasta", assayFasta).
|
|
226
|
+
arg("--clones-fasta").arg("clones.fasta").
|
|
227
|
+
arg("--assay-fasta").arg("assay.fasta").
|
|
228
|
+
arg("--output").arg("coverage_mode.txt").
|
|
229
|
+
saveFileContent("coverage_mode.txt").
|
|
230
|
+
run()
|
|
231
|
+
|
|
232
|
+
covMode := coverageModeRun.getFileContent("coverage_mode.txt")
|
|
233
|
+
|
|
234
|
+
// MMseqs2 Alignment
|
|
235
|
+
mmseqsSearchType := "0"
|
|
236
|
+
if targetSequenceType == "aminoacid" && assaySequenceType == "aminoacid" {
|
|
237
|
+
//1: amino acid
|
|
238
|
+
mmseqsSearchType = "1"
|
|
239
|
+
} else if targetSequenceType == "nucleotide" && assaySequenceType == "nucleotide" {
|
|
240
|
+
// 3: nucleotide
|
|
241
|
+
mmseqsSearchType = "3"
|
|
242
|
+
} else if targetSequenceType == "nucleotide" && assaySequenceType == "aminoacid" {
|
|
243
|
+
// 4: translated nucleotide alignment
|
|
244
|
+
mmseqsSearchType = "4"
|
|
245
|
+
} else if targetSequenceType == "aminoacid" && assaySequenceType == "nucleotide" {
|
|
246
|
+
// 2: nucleotide
|
|
247
|
+
mmseqsSearchType = "2"
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
runMmseqs := render.create(runAlignmentTpl, {
|
|
251
|
+
covMode: covMode,
|
|
252
|
+
mmseqsSearchType: mmseqsSearchType,
|
|
253
|
+
coverageThreshold: coverageThreshold,
|
|
254
|
+
identityThreshold: identityThreshold,
|
|
255
|
+
similarityType: similarityType,
|
|
256
|
+
clonesFasta: clonesFasta,
|
|
257
|
+
assayFasta: assayFasta,
|
|
258
|
+
lessSensitive: lessSensitive
|
|
259
|
+
}, {
|
|
260
|
+
metaInputs: {
|
|
261
|
+
mem: mem,
|
|
262
|
+
cpu: cpu
|
|
263
|
+
}
|
|
264
|
+
})
|
|
265
|
+
|
|
266
|
+
mmseqsOutput := runMmseqs.output("mmseqsOutput")
|
|
267
|
+
|
|
268
|
+
// @TODO remove header stuff and replace with pt when available (!)
|
|
269
|
+
addHeaderRun := exec.builder().
|
|
270
|
+
software(addHeaderSw).
|
|
271
|
+
mem("16GiB").
|
|
272
|
+
cpu(1).
|
|
273
|
+
arg("-i").arg("results.tsv").
|
|
274
|
+
arg("-o").arg("results_with_header.tsv").
|
|
275
|
+
addFile("results.tsv", mmseqsOutput).
|
|
276
|
+
saveFile("results_with_header.tsv").
|
|
277
|
+
saveFileContent("results_with_header.tsv").
|
|
278
|
+
run()
|
|
279
|
+
|
|
280
|
+
mmseqsResultTsv := addHeaderRun.getFile("results_with_header.tsv")
|
|
281
|
+
mmseqsResultContent := addHeaderRun.getFileContent("results_with_header.tsv")
|
|
282
|
+
|
|
283
|
+
// Use subtemplate to check if file is empty
|
|
284
|
+
checkResult := render.create(checkContentEmptyTpl, {
|
|
285
|
+
content: mmseqsResultContent,
|
|
286
|
+
mode: "headerOnly"
|
|
287
|
+
})
|
|
288
|
+
emptyResults := checkResult.output("result")
|
|
289
|
+
|
|
290
|
+
result := {
|
|
291
|
+
mmseqsOutput: mmseqsOutput,
|
|
292
|
+
emptyResults: emptyResults
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
// Post-processing with PT
|
|
296
|
+
ptw2 := pt.workflow()
|
|
297
|
+
dfRes := ptw2.frame({
|
|
298
|
+
file: mmseqsResultTsv,
|
|
299
|
+
xsvType: "tsv"
|
|
300
|
+
})
|
|
301
|
+
|
|
302
|
+
// Cast columns to ensure correct types for aggregation
|
|
303
|
+
dfRes = dfRes.withColumns(
|
|
304
|
+
pt.col("evalue").cast("Float64").alias("evalue"),
|
|
305
|
+
pt.col("bits").cast("Float64").alias("bits")
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
cols := []
|
|
309
|
+
for _, col in ["bits", "evalue", "query", "pident", "alnlen", "mismatch",
|
|
310
|
+
"gapopen", "qstart", "qend", "tstart", "tend"] {
|
|
311
|
+
cols = append(cols,
|
|
312
|
+
pt.col(col).maxBy(
|
|
313
|
+
pt.col("evalue").multiply(-1),
|
|
314
|
+
pt.col("bits")
|
|
315
|
+
).alias(col)
|
|
316
|
+
)
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
dfRes = dfRes.groupBy("target").agg(cols...)
|
|
320
|
+
// Add link column for linker pFrame (assayLinkerPframe)
|
|
321
|
+
dfRes = dfRes.withColumns(
|
|
322
|
+
pt.lit(1).cast("Int64").alias("link")
|
|
323
|
+
)
|
|
324
|
+
dfRes.save("best_alignment.tsv")
|
|
325
|
+
|
|
326
|
+
// Assay data summary
|
|
327
|
+
assayDf := ptw2.frame({
|
|
328
|
+
file: assayTsv,
|
|
329
|
+
xsvType: "tsv"
|
|
330
|
+
})
|
|
331
|
+
|
|
332
|
+
// import how many matches per assay sequence found
|
|
333
|
+
assayDf = assayDf.join(
|
|
334
|
+
dfRes.groupBy("query").agg(
|
|
335
|
+
pt.col("target").count().alias("queryCount")
|
|
336
|
+
),
|
|
337
|
+
{
|
|
338
|
+
how: "left",
|
|
339
|
+
leftOn: "seqId",
|
|
340
|
+
rightOn: "query"
|
|
341
|
+
}
|
|
342
|
+
)
|
|
343
|
+
assayDf.save("assay_data.tsv")
|
|
344
|
+
|
|
345
|
+
// Clones data
|
|
346
|
+
clonesDf := dfRes.join(assayDf,
|
|
347
|
+
{
|
|
348
|
+
how: "left",
|
|
349
|
+
leftOn: "query",
|
|
350
|
+
rightOn: "seqId"
|
|
351
|
+
}
|
|
352
|
+
)
|
|
353
|
+
clonesDf.save("clones_data.tsv")
|
|
354
|
+
|
|
355
|
+
ptRun2 := ptw2.run()
|
|
356
|
+
|
|
357
|
+
result.bestAlignmentTsv = ptRun2.getFile("best_alignment.tsv")
|
|
358
|
+
result.assayDataTsv = ptRun2.getFile("assay_data.tsv")
|
|
359
|
+
result.clonesDataTsv = ptRun2.getFile("clones_data.tsv")
|
|
360
|
+
|
|
361
|
+
// Extract unique values for string columns
|
|
362
|
+
getUniqueValuesResult := render.create(getUniqueValuesTpl, {
|
|
363
|
+
assayDataTsv: result.assayDataTsv,
|
|
364
|
+
stringColumns: stringColumns
|
|
365
|
+
})
|
|
366
|
+
result.uniqueValuesMap = getUniqueValuesResult.output("uniqueValuesMap")
|
|
367
|
+
result.columnsToImport = columnsToImport
|
|
368
|
+
|
|
369
|
+
return result
|
|
370
|
+
})
|
|
@@ -57,24 +57,7 @@ self.body(func(inputs) {
|
|
|
57
57
|
}
|
|
58
58
|
]
|
|
59
59
|
|
|
60
|
-
|
|
61
|
-
if inputs.selectedColumns != undefined && len(inputs.selectedColumns) > 0 {
|
|
62
|
-
selectedHeaders := {}
|
|
63
|
-
for header in inputs.selectedColumns {
|
|
64
|
-
selectedHeaders[header] = true
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
filteredColumns := []
|
|
68
|
-
for col in inputs.importColumns {
|
|
69
|
-
// Always include the main sequence column
|
|
70
|
-
if col.header == inputs.sequenceColumnHeader || selectedHeaders[col.header] {
|
|
71
|
-
filteredColumns = append(filteredColumns, col)
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
columnsToImport = filteredColumns
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
for h in columnsToImport {
|
|
60
|
+
for h in inputs.columnsToImport {
|
|
78
61
|
if h.header == inputs.sequenceColumnHeader {
|
|
79
62
|
continue
|
|
80
63
|
}
|
|
@@ -160,7 +143,7 @@ self.body(func(inputs) {
|
|
|
160
143
|
}
|
|
161
144
|
}]
|
|
162
145
|
|
|
163
|
-
for h in columnsToImport {
|
|
146
|
+
for h in inputs.columnsToImport {
|
|
164
147
|
annotations := {
|
|
165
148
|
"pl7.app/label": h.header,
|
|
166
149
|
"pl7.app/table/visibility": h.header == inputs.sequenceColumnHeader ? "optional" : "default"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
2
|
+
text := import("text")
|
|
3
|
+
|
|
4
|
+
self.defineOutputs("result")
|
|
5
|
+
|
|
6
|
+
self.body(func(args) {
|
|
7
|
+
content := string(args.content.getData())
|
|
8
|
+
mode := args.mode
|
|
9
|
+
|
|
10
|
+
isEmpty := false
|
|
11
|
+
if mode == "raw" {
|
|
12
|
+
// No header line (e.g. FASTA): empty content means no data
|
|
13
|
+
isEmpty = text.trim_space(content) == ""
|
|
14
|
+
} else if mode == "headerOnly" {
|
|
15
|
+
// File has a header line (e.g. TSV with added header): empty means <= 1 line
|
|
16
|
+
lines := text.split(text.trim_space(content), "\n")
|
|
17
|
+
isEmpty = len(lines) <= 1
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
return { result: isEmpty }
|
|
21
|
+
})
|
|
@@ -10,8 +10,7 @@ self.body(func(inputs) {
|
|
|
10
10
|
uniqueValuesMap := {}
|
|
11
11
|
|
|
12
12
|
for header, contentField in fileContents {
|
|
13
|
-
// In subtemplate, we can call getData() directly on
|
|
14
|
-
// (following clonotype-enrichment pattern)
|
|
13
|
+
// In subtemplate, we can call getData() directly on inputs
|
|
15
14
|
contentBytes := contentField.getData()
|
|
16
15
|
content := string(contentBytes)
|
|
17
16
|
lines := text.split(text.trim_space(content), "\n")
|
|
@@ -20,8 +19,7 @@ self.body(func(inputs) {
|
|
|
20
19
|
// Skip header and collect values
|
|
21
20
|
values := lines[1:]
|
|
22
21
|
// JSON encode the array of strings and convert to string
|
|
23
|
-
|
|
24
|
-
uniqueValuesMap[header] = string(encodedBytes)
|
|
22
|
+
uniqueValuesMap[header] = string(json.encode(values))
|
|
25
23
|
}
|
|
26
24
|
}
|
|
27
25
|
|
|
@@ -29,4 +27,3 @@ self.body(func(inputs) {
|
|
|
29
27
|
uniqueValuesMap: uniqueValuesMap
|
|
30
28
|
}
|
|
31
29
|
})
|
|
32
|
-
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
2
|
+
ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
3
|
+
pt := import("@platforma-sdk/workflow-tengo:pt")
|
|
4
|
+
render := import("@platforma-sdk/workflow-tengo:render")
|
|
5
|
+
assets := import("@platforma-sdk/workflow-tengo:assets")
|
|
6
|
+
strings := import("@platforma-sdk/workflow-tengo:strings")
|
|
7
|
+
|
|
8
|
+
extractUniqueValuesTpl := assets.importTemplate(":extract-unique-values")
|
|
9
|
+
|
|
10
|
+
self.defineOutputs("uniqueValuesMap")
|
|
11
|
+
|
|
12
|
+
self.body(func(inputs) {
|
|
13
|
+
assayDataTsv := inputs.assayDataTsv
|
|
14
|
+
stringColumns := inputs.stringColumns
|
|
15
|
+
|
|
16
|
+
if len(stringColumns) == 0 {
|
|
17
|
+
return { uniqueValuesMap: {} }
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
uniqueValuesWf := pt.workflow().mem("4GiB").cpu(1)
|
|
21
|
+
baseDf := uniqueValuesWf.frame({
|
|
22
|
+
file: assayDataTsv,
|
|
23
|
+
xsvType: "tsv"
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
// Process each String column to extract unique values
|
|
27
|
+
for colHeader in stringColumns {
|
|
28
|
+
uniqueValuesDf := baseDf.select(pt.col(colHeader).alias("value")).groupBy("value").agg(pt.col("value").count().alias("_count"))
|
|
29
|
+
uniqueValuesDf = uniqueValuesDf.select("value")
|
|
30
|
+
fileName := "unique_values_" + strings.substituteSpecialCharacters(colHeader) + ".csv"
|
|
31
|
+
uniqueValuesDf.saveContent(fileName)
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Run once and collect all results
|
|
35
|
+
uniqueValuesResult := uniqueValuesWf.run()
|
|
36
|
+
|
|
37
|
+
fileContentsMap := {}
|
|
38
|
+
for colHeader in stringColumns {
|
|
39
|
+
fileName := "unique_values_" + strings.substituteSpecialCharacters(colHeader) + ".csv"
|
|
40
|
+
fileContentsMap[colHeader] = uniqueValuesResult.getFileContent(fileName)
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Now call the extraction subtemplate which will use .getData() on these resources
|
|
44
|
+
extractResult := render.create(extractUniqueValuesTpl, {
|
|
45
|
+
fileContents: fileContentsMap
|
|
46
|
+
})
|
|
47
|
+
|
|
48
|
+
return {
|
|
49
|
+
uniqueValuesMap: extractResult.output("uniqueValuesMap")
|
|
50
|
+
}
|
|
51
|
+
})
|
package/src/main.tpl.tengo
CHANGED
|
@@ -3,25 +3,19 @@ ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
|
3
3
|
file := import("@platforma-sdk/workflow-tengo:file")
|
|
4
4
|
exec := import("@platforma-sdk/workflow-tengo:exec")
|
|
5
5
|
assets:= import("@platforma-sdk/workflow-tengo:assets")
|
|
6
|
-
maps:= import("@platforma-sdk/workflow-tengo:maps")
|
|
7
|
-
xsv := import("@platforma-sdk/workflow-tengo:pframes.xsv")
|
|
8
6
|
pframes := import("@platforma-sdk/workflow-tengo:pframes")
|
|
9
|
-
pSpec := import("@platforma-sdk/workflow-tengo:pframes.spec")
|
|
10
7
|
pt := import("@platforma-sdk/workflow-tengo:pt")
|
|
11
8
|
path := import("@platforma-sdk/workflow-tengo:path")
|
|
12
9
|
json := import("json")
|
|
13
10
|
text := import("text")
|
|
14
11
|
render := import("@platforma-sdk/workflow-tengo:render")
|
|
15
12
|
strings := import("@platforma-sdk/workflow-tengo:strings")
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
13
|
+
|
|
14
|
+
analysisTpl := assets.importTemplate(":analysis")
|
|
15
|
+
processOutputsTpl := assets.importTemplate(":process-outputs")
|
|
16
|
+
checkContentEmptyTpl := assets.importTemplate(":check-content-empty")
|
|
19
17
|
|
|
20
18
|
prepareFastaSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.prepare-fasta:main")
|
|
21
|
-
fastaToTsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv:main")
|
|
22
|
-
addHeaderSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.add-header:main")
|
|
23
|
-
covModeCalcSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc:main")
|
|
24
|
-
xlsxToCsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv:main")
|
|
25
19
|
|
|
26
20
|
wf.prepare(func(args){
|
|
27
21
|
bundleBuilder := wf.createPBundleBuilder()
|
|
@@ -32,65 +26,6 @@ wf.prepare(func(args){
|
|
|
32
26
|
}
|
|
33
27
|
})
|
|
34
28
|
|
|
35
|
-
prepareAssayFile := func(args, file, xsvType) {
|
|
36
|
-
// assign ids to assay sequences
|
|
37
|
-
ptw := pt.workflow()
|
|
38
|
-
df := ptw.frame({
|
|
39
|
-
file: file,
|
|
40
|
-
xsvType: xsvType
|
|
41
|
-
})
|
|
42
|
-
|
|
43
|
-
//////// calculate sequence id ////////
|
|
44
|
-
// Create unique seqId for each row by combining sequence with row index
|
|
45
|
-
// First add row index using ordinal rank
|
|
46
|
-
df = df.withColumns(
|
|
47
|
-
pt.rank(pt.col(args.sequenceColumnHeader)).
|
|
48
|
-
over(pt.col(args.sequenceColumnHeader)).
|
|
49
|
-
alias("rowIndex")
|
|
50
|
-
)
|
|
51
|
-
// Concatenate sequence with row index and then hash
|
|
52
|
-
df = df.withColumns(
|
|
53
|
-
pt.when(pt.col("rowIndex").gt(pt.lit(1))).
|
|
54
|
-
then(pt.concatStr([pt.col(args.sequenceColumnHeader), pt.col("rowIndex").cast("String")], {delimiter: "_"})).
|
|
55
|
-
otherwise(pt.col(args.sequenceColumnHeader)).
|
|
56
|
-
alias("uniqueKey")
|
|
57
|
-
)
|
|
58
|
-
// Create hash from the unique key
|
|
59
|
-
df = df.addColumns(
|
|
60
|
-
pt.col("uniqueKey").hash("sha256", "base64_alphanumeric", 120).alias("seqId")
|
|
61
|
-
)
|
|
62
|
-
// Remove the temporary columns
|
|
63
|
-
//df = df.withoutColumns("uniqueKey", "rowIndex")
|
|
64
|
-
//////// add label to ids ////////
|
|
65
|
-
df = df.withColumns(
|
|
66
|
-
pt.col("seqId").
|
|
67
|
-
strReplace("\\d", "", { replaceAll: true }).
|
|
68
|
-
strSlice(0, 5). // Take first 5 characters
|
|
69
|
-
strToUpper(). // Convert to uppercase
|
|
70
|
-
alias("tmpLabel")
|
|
71
|
-
)
|
|
72
|
-
df = df.withColumns(
|
|
73
|
-
pt.rank(pt.col("seqId")). // Rank based on clonotypeKeyCol (default ascending)
|
|
74
|
-
over(pt.col("tmpLabel")). // Partition by prefixTempCol
|
|
75
|
-
alias("rank")
|
|
76
|
-
)
|
|
77
|
-
df = df.withColumns(
|
|
78
|
-
pt.when(pt.col("rank").gt(pt.lit(1))).
|
|
79
|
-
then(pt.concatStr([pt.lit("A"), pt.col("tmpLabel"), pt.col("rank").cast("String")], { delimiter: "-" })).
|
|
80
|
-
otherwise(pt.concatStr([pt.lit("A"), pt.col("tmpLabel")], { delimiter: "-" })).
|
|
81
|
-
alias("seqIdLabel")
|
|
82
|
-
)
|
|
83
|
-
df = df.withoutColumns("rank", "tmpLabel")
|
|
84
|
-
|
|
85
|
-
//////// add sequence column ////////
|
|
86
|
-
df = df.addColumns(
|
|
87
|
-
pt.col(args.sequenceColumnHeader).alias("sequence")
|
|
88
|
-
)
|
|
89
|
-
df.save("output.tsv")
|
|
90
|
-
|
|
91
|
-
return ptw.run().getFile("output.tsv")
|
|
92
|
-
}
|
|
93
|
-
|
|
94
29
|
prepareClonesTsv := func(args) {
|
|
95
30
|
columns := args.columns
|
|
96
31
|
datasetSpec := columns.getSpec(args.datasetRef)
|
|
@@ -105,62 +40,6 @@ prepareClonesTsv := func(args) {
|
|
|
105
40
|
return cloneTable.build()
|
|
106
41
|
}
|
|
107
42
|
|
|
108
|
-
/**
|
|
109
|
-
* Convert tsv file to fasta file
|
|
110
|
-
* @param fileTsv - tsv file
|
|
111
|
-
* @return fasta file run result
|
|
112
|
-
*/
|
|
113
|
-
runTsvToFasta := func(fileTsv) {
|
|
114
|
-
e := exec.builder().
|
|
115
|
-
software(prepareFastaSw).
|
|
116
|
-
mem("8GiB").
|
|
117
|
-
cpu(1).
|
|
118
|
-
addFile("input.tsv", fileTsv).
|
|
119
|
-
arg("-i").arg("input.tsv").
|
|
120
|
-
arg("-o").arg("output.fasta").
|
|
121
|
-
arg("--seq_col").arg("sequence").
|
|
122
|
-
arg("--id_col").arg("seqId").
|
|
123
|
-
saveFile("output.fasta")
|
|
124
|
-
|
|
125
|
-
return e.run()
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
/**
|
|
129
|
-
* Convert fasta file to tsv file
|
|
130
|
-
* @param fileFasta - fasta file
|
|
131
|
-
* @return tsv file run result
|
|
132
|
-
*/
|
|
133
|
-
runFastaToTsv := func(fileFasta) {
|
|
134
|
-
e := exec.builder().
|
|
135
|
-
software(fastaToTsvSw).
|
|
136
|
-
mem("8GiB").
|
|
137
|
-
cpu(1).
|
|
138
|
-
addFile("input.fasta", fileFasta).
|
|
139
|
-
arg("-i").arg("input.fasta").
|
|
140
|
-
arg("-o").arg("output.tsv").
|
|
141
|
-
saveFile("output.tsv")
|
|
142
|
-
|
|
143
|
-
return e.run()
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
/**
|
|
147
|
-
* Convert xlsx file to csv file
|
|
148
|
-
* @param fileXlsx - xlsx file
|
|
149
|
-
* @return csv file run result
|
|
150
|
-
*/
|
|
151
|
-
runXlsxToCsv := func(fileXlsx) {
|
|
152
|
-
e := exec.builder().
|
|
153
|
-
software(xlsxToCsvSw).
|
|
154
|
-
mem("16GiB").
|
|
155
|
-
cpu(1).
|
|
156
|
-
addFile("input.xlsx", fileXlsx).
|
|
157
|
-
arg("-i").arg("input.xlsx").
|
|
158
|
-
arg("-o").arg("output.csv").
|
|
159
|
-
saveFile("output.csv")
|
|
160
|
-
|
|
161
|
-
return e.run()
|
|
162
|
-
}
|
|
163
|
-
|
|
164
43
|
wf.body(func(args) {
|
|
165
44
|
importFile := file.importFile(args.fileHandle)
|
|
166
45
|
datasetSpec := args.columns.getSpec(args.datasetRef)
|
|
@@ -211,238 +90,102 @@ wf.body(func(args) {
|
|
|
211
90
|
xsvType = args.detectedXsvType
|
|
212
91
|
}
|
|
213
92
|
|
|
214
|
-
// Handle XLSX files by converting to CSV first
|
|
215
|
-
if xsvType == "xlsx" {
|
|
216
|
-
xlsxToCsvRun := runXlsxToCsv(importFile.file)
|
|
217
|
-
importFile.file = xlsxToCsvRun.getFile("output.csv")
|
|
218
|
-
xsvType = "csv"
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
// Handle FASTA files by converting to TSV first
|
|
222
|
-
if xsvType == "fasta" || xsvType == "fa" {
|
|
223
|
-
fastaToTsvRun := runFastaToTsv(importFile.file)
|
|
224
|
-
importFile.file = fastaToTsvRun.getFile("output.tsv")
|
|
225
|
-
xsvType = "tsv"
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
assayTsv := prepareAssayFile(args, importFile.file, xsvType)
|
|
229
93
|
clonesTsv := prepareClonesTsv(args)
|
|
230
94
|
|
|
231
|
-
//
|
|
232
|
-
clonesFastaRun :=
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
assayFasta := assayFastaRun.getFile("output.fasta")
|
|
236
|
-
|
|
237
|
-
// Dynamically determine coverage mode by comparing average sequence lengths
|
|
238
|
-
coverageMode := exec.builder().
|
|
239
|
-
software(covModeCalcSw).
|
|
240
|
-
mem("16GiB").
|
|
95
|
+
// Convert clones TSV to FASTA and check if empty
|
|
96
|
+
clonesFastaRun := exec.builder().
|
|
97
|
+
software(prepareFastaSw).
|
|
98
|
+
mem("8GiB").
|
|
241
99
|
cpu(1).
|
|
242
|
-
addFile("
|
|
243
|
-
|
|
244
|
-
arg("
|
|
245
|
-
arg("--
|
|
246
|
-
arg("--
|
|
247
|
-
|
|
100
|
+
addFile("input.tsv", clonesTsv).
|
|
101
|
+
arg("-i").arg("input.tsv").
|
|
102
|
+
arg("-o").arg("output.fasta").
|
|
103
|
+
arg("--seq_col").arg("sequence").
|
|
104
|
+
arg("--id_col").arg("seqId").
|
|
105
|
+
saveFile("output.fasta").
|
|
106
|
+
saveFileContent("output.fasta").
|
|
248
107
|
run()
|
|
108
|
+
clonesFasta := clonesFastaRun.getFile("output.fasta")
|
|
109
|
+
clonesFastaContent := clonesFastaRun.getFileContent("output.fasta")
|
|
249
110
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
} else if targetSequenceType == "nucleotide" && assaySequenceType == "nucleotide" {
|
|
257
|
-
// 3: nucleotide
|
|
258
|
-
mmseqsSearchType = "3"
|
|
259
|
-
} else if targetSequenceType == "nucleotide" && assaySequenceType == "aminoacid" {
|
|
260
|
-
// 4: translated nucleotide alignment
|
|
261
|
-
mmseqsSearchType = "4"
|
|
262
|
-
} else if targetSequenceType == "aminoacid" && assaySequenceType == "nucleotide" {
|
|
263
|
-
// 2: nucleotide
|
|
264
|
-
mmseqsSearchType = "2"
|
|
265
|
-
}
|
|
111
|
+
// Check if clones are empty before running mmseqs2
|
|
112
|
+
checkClonesResult := render.create(checkContentEmptyTpl, {
|
|
113
|
+
content: clonesFastaContent,
|
|
114
|
+
mode: "raw"
|
|
115
|
+
})
|
|
116
|
+
emptyClonesInput := checkClonesResult.output("result")
|
|
266
117
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
118
|
+
// Run Analysis Template
|
|
119
|
+
analysisResult := render.create(analysisTpl, {
|
|
120
|
+
file: importFile.file,
|
|
121
|
+
xsvType: xsvType,
|
|
122
|
+
sequenceColumnHeader: args.sequenceColumnHeader,
|
|
123
|
+
clonesFasta: clonesFasta,
|
|
124
|
+
emptyClonesInput: emptyClonesInput,
|
|
125
|
+
targetSequenceType: targetSequenceType,
|
|
126
|
+
assaySequenceType: assaySequenceType,
|
|
270
127
|
coverageThreshold: args.settings.coverageThreshold,
|
|
271
128
|
identityThreshold: args.settings.identity,
|
|
272
129
|
similarityType: args.settings.similarityType,
|
|
273
|
-
clonesFasta: clonesFasta,
|
|
274
|
-
assayFasta: assayFasta,
|
|
275
130
|
lessSensitive: args.lessSensitive,
|
|
276
|
-
|
|
277
|
-
|
|
131
|
+
importColumns: args.importColumns,
|
|
132
|
+
selectedColumns: args.selectedColumns
|
|
133
|
+
}, {
|
|
134
|
+
metaInputs: {
|
|
135
|
+
mem: args.mem,
|
|
136
|
+
cpu: args.cpu
|
|
137
|
+
}
|
|
278
138
|
})
|
|
279
139
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
addHeaderRunResult := exec.builder().
|
|
284
|
-
software(addHeaderSw).
|
|
285
|
-
mem("16GiB").
|
|
286
|
-
cpu(1).
|
|
287
|
-
arg("-i").arg("results.tsv").
|
|
288
|
-
arg("-o").arg("results_with_header.tsv").
|
|
289
|
-
addFile("results.tsv", mmseqsOutput).
|
|
290
|
-
saveFile("results_with_header.tsv").
|
|
291
|
-
saveFileContent("results_with_header.tsv").
|
|
292
|
-
run()
|
|
293
|
-
|
|
294
|
-
mmseqsResultTsv := addHeaderRunResult.getFile("results_with_header.tsv")
|
|
295
|
-
mmseqsResultTsvContent := addHeaderRunResult.getFileContent("results_with_header.tsv")
|
|
296
|
-
|
|
297
|
-
emptyResults := len(text.trim_space(string(mmseqsResultTsvContent))) == 0
|
|
140
|
+
emptyResults := analysisResult.output("emptyResults")
|
|
141
|
+
mmseqsOutput := analysisResult.output("mmseqsOutput")
|
|
142
|
+
|
|
298
143
|
blockId := wf.blockId().getDataAsJson()
|
|
299
144
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
uniqueValuesMap :=
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
pt.col(col).maxBy(
|
|
327
|
-
pt.col("evalue").multiply(-1),
|
|
328
|
-
pt.col("bits")
|
|
329
|
-
).alias(col)
|
|
330
|
-
)
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
df = df.groupBy("target").agg(cols...)
|
|
334
|
-
|
|
335
|
-
// Add link column for linker pFrame (assayLinkerPframe)
|
|
336
|
-
df = df.withColumns(
|
|
337
|
-
pt.lit(1).cast("Int64").alias("link")
|
|
338
|
-
)
|
|
339
|
-
|
|
340
|
-
df.save("results.tsv")
|
|
341
|
-
|
|
342
|
-
// assay data import summary
|
|
343
|
-
assayDf := ptw.frame({
|
|
344
|
-
file: assayTsv,
|
|
345
|
-
xsvType: "tsv"
|
|
346
|
-
})
|
|
347
|
-
// import how many matches per assay sequence found
|
|
348
|
-
assayDf = assayDf.join(
|
|
349
|
-
df.groupBy("query").agg(
|
|
350
|
-
pt.col("target").count().alias("queryCount")
|
|
351
|
-
),
|
|
352
|
-
{
|
|
353
|
-
how: "left",
|
|
354
|
-
leftOn: "seqId",
|
|
355
|
-
rightOn: "query"
|
|
356
|
-
}
|
|
357
|
-
)
|
|
358
|
-
assayDf.save("assayData.tsv")
|
|
359
|
-
|
|
360
|
-
// clones
|
|
361
|
-
clonesDf := df.join(assayDf,
|
|
362
|
-
{
|
|
363
|
-
how: "left",
|
|
364
|
-
leftOn: "query",
|
|
365
|
-
rightOn: "seqId"
|
|
366
|
-
}
|
|
367
|
-
)
|
|
368
|
-
|
|
369
|
-
clonesDf.save("clonesData.tsv")
|
|
370
|
-
ptw = ptw.run()
|
|
371
|
-
|
|
372
|
-
//////// Extract unique values from String columns ////////
|
|
373
|
-
// Find all String columns
|
|
374
|
-
stringColumns := []
|
|
375
|
-
for h in args.importColumns {
|
|
376
|
-
if h.type == "String" && h.header != args.sequenceColumnHeader {
|
|
377
|
-
stringColumns = append(stringColumns, h.header)
|
|
378
|
-
}
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
// Extract unique values for all String columns
|
|
382
|
-
fileContentsMap := {}
|
|
383
|
-
if len(stringColumns) > 0 {
|
|
384
|
-
uniqueValuesWf := pt.workflow().mem("4GiB").cpu(1)
|
|
385
|
-
baseDf := uniqueValuesWf.frame({
|
|
386
|
-
file: ptw.getFile("assayData.tsv"),
|
|
387
|
-
xsvType: "tsv"
|
|
388
|
-
})
|
|
389
|
-
// Process each String column to extract unique values
|
|
390
|
-
for colHeader in stringColumns {
|
|
391
|
-
uniqueValuesDf := baseDf.select(pt.col(colHeader).alias("value")).groupBy("value").agg(pt.col("value").count().alias("_count"))
|
|
392
|
-
uniqueValuesDf = uniqueValuesDf.select("value")
|
|
393
|
-
fileName := "unique_values_" + strings.substituteSpecialCharacters(colHeader) + ".csv"
|
|
394
|
-
uniqueValuesDf.saveContent(fileName)
|
|
395
|
-
}
|
|
396
|
-
// Run once and collect all results
|
|
397
|
-
uniqueValuesResult := uniqueValuesWf.run()
|
|
398
|
-
for colHeader in stringColumns {
|
|
399
|
-
fileName := "unique_values_" + strings.substituteSpecialCharacters(colHeader) + ".csv"
|
|
400
|
-
fileContentsMap[colHeader] = uniqueValuesResult.getFileContent(fileName)
|
|
401
|
-
}
|
|
402
|
-
|
|
403
|
-
// Use subtemplate to extract content (getData() only works in subtemplates)
|
|
404
|
-
extractResult := render.create(extractUniqueValuesTpl, {
|
|
405
|
-
fileContents: fileContentsMap
|
|
406
|
-
})
|
|
407
|
-
uniqueValuesMap = extractResult.output("uniqueValuesMap")
|
|
408
|
-
}
|
|
409
|
-
|
|
410
|
-
//////// Building outputs & exports ////////
|
|
411
|
-
buildOutputsResult := render.createEphemeral(buildOutputsTpl, {
|
|
412
|
-
importColumns: args.importColumns,
|
|
413
|
-
selectedColumns: args.selectedColumns,
|
|
414
|
-
sequenceColumnHeader: args.sequenceColumnHeader,
|
|
415
|
-
sequenceColumnInfo: sequenceColumnInfo,
|
|
416
|
-
assaySequenceType: assaySequenceType,
|
|
417
|
-
blockId: blockId,
|
|
418
|
-
datasetSpec: datasetSpec,
|
|
419
|
-
bestAlignmentTsv: ptw.getFile("results.tsv"),
|
|
420
|
-
assayDataTsv: ptw.getFile("assayData.tsv"),
|
|
421
|
-
clonesDataTsv: ptw.getFile("clonesData.tsv"),
|
|
422
|
-
uniqueValuesMap: uniqueValuesMap,
|
|
423
|
-
settings: args.settings,
|
|
424
|
-
customBlockLabel: args.customBlockLabel,
|
|
425
|
-
defaultBlockLabel: args.defaultBlockLabel
|
|
426
|
-
})
|
|
427
|
-
assayPframe = buildOutputsResult.output("assayPframe")
|
|
428
|
-
epf = buildOutputsResult.output("epf")
|
|
429
|
-
assayLinkerPframe = buildOutputsResult.output("assayLinkerPframe")
|
|
430
|
-
}
|
|
145
|
+
bestAlignmentTsv := analysisResult.output("bestAlignmentTsv")
|
|
146
|
+
assayDataTsv := analysisResult.output("assayDataTsv")
|
|
147
|
+
clonesDataTsv := analysisResult.output("clonesDataTsv")
|
|
148
|
+
uniqueValuesMap := analysisResult.output("uniqueValuesMap")
|
|
149
|
+
columnsToImport := analysisResult.output("columnsToImport")
|
|
150
|
+
|
|
151
|
+
//////// Build outputs ////////
|
|
152
|
+
processOutputsResult := render.create(processOutputsTpl, {
|
|
153
|
+
emptyClonesInput: emptyClonesInput,
|
|
154
|
+
assayDataTsv: assayDataTsv,
|
|
155
|
+
bestAlignmentTsv: bestAlignmentTsv,
|
|
156
|
+
clonesDataTsv: clonesDataTsv,
|
|
157
|
+
columnsToImport: columnsToImport,
|
|
158
|
+
sequenceColumnHeader: args.sequenceColumnHeader,
|
|
159
|
+
sequenceColumnInfo: sequenceColumnInfo,
|
|
160
|
+
assaySequenceType: assaySequenceType,
|
|
161
|
+
datasetSpec: datasetSpec,
|
|
162
|
+
uniqueValuesMap: uniqueValuesMap,
|
|
163
|
+
settings: args.settings,
|
|
164
|
+
customBlockLabel: args.customBlockLabel,
|
|
165
|
+
defaultBlockLabel: args.defaultBlockLabel,
|
|
166
|
+
blockId: blockId
|
|
167
|
+
})
|
|
168
|
+
assayPframe := processOutputsResult.output("assayPframe")
|
|
169
|
+
epf := processOutputsResult.output("epf")
|
|
170
|
+
assayLinkerPframe := processOutputsResult.output("assayLinkerPframe")
|
|
431
171
|
|
|
432
|
-
|
|
172
|
+
return {
|
|
433
173
|
outputs: {
|
|
434
174
|
dataImportHandle: importFile.handle,
|
|
435
175
|
table: assayPframe,
|
|
436
176
|
assayLinkerPframe: assayLinkerPframe,
|
|
437
|
-
mmseqsOutput: mmseqsOutput,
|
|
438
|
-
emptyResults: emptyResults
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
177
|
+
mmseqsOutput: mmseqsOutput,
|
|
178
|
+
emptyResults: emptyResults,
|
|
179
|
+
emptyClonesInput: emptyClonesInput,
|
|
180
|
+
// Needed for deduplication
|
|
181
|
+
assayDataTsv: assayDataTsv,
|
|
182
|
+
bestAlignmentTsv: bestAlignmentTsv,
|
|
183
|
+
clonesDataTsv: clonesDataTsv,
|
|
184
|
+
uniqueValuesMap: uniqueValuesMap,
|
|
185
|
+
columnsToImport: columnsToImport
|
|
186
|
+
},
|
|
187
|
+
exports: {
|
|
443
188
|
epf: epf
|
|
444
189
|
}
|
|
445
190
|
}
|
|
446
|
-
|
|
447
|
-
return result
|
|
448
191
|
})
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
2
|
+
render := import("@platforma-sdk/workflow-tengo:render")
|
|
3
|
+
assets := import("@platforma-sdk/workflow-tengo:assets")
|
|
4
|
+
|
|
5
|
+
buildOutputsTpl := assets.importTemplate(":build-outputs")
|
|
6
|
+
|
|
7
|
+
self.defineOutputs("assayPframe", "epf", "assayLinkerPframe")
|
|
8
|
+
|
|
9
|
+
self.body(func(args) {
|
|
10
|
+
if args.emptyClonesInput {
|
|
11
|
+
return {
|
|
12
|
+
assayPframe: {},
|
|
13
|
+
epf: {},
|
|
14
|
+
assayLinkerPframe: {}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
buildOutputsResult := render.createEphemeral(buildOutputsTpl, {
|
|
19
|
+
columnsToImport: args.columnsToImport,
|
|
20
|
+
sequenceColumnHeader: args.sequenceColumnHeader,
|
|
21
|
+
sequenceColumnInfo: args.sequenceColumnInfo,
|
|
22
|
+
assaySequenceType: args.assaySequenceType,
|
|
23
|
+
datasetSpec: args.datasetSpec,
|
|
24
|
+
bestAlignmentTsv: args.bestAlignmentTsv,
|
|
25
|
+
assayDataTsv: args.assayDataTsv,
|
|
26
|
+
clonesDataTsv: args.clonesDataTsv,
|
|
27
|
+
uniqueValuesMap: args.uniqueValuesMap,
|
|
28
|
+
settings: args.settings,
|
|
29
|
+
customBlockLabel: args.customBlockLabel,
|
|
30
|
+
defaultBlockLabel: args.defaultBlockLabel,
|
|
31
|
+
blockId: args.blockId
|
|
32
|
+
})
|
|
33
|
+
|
|
34
|
+
return {
|
|
35
|
+
assayPframe: buildOutputsResult.output("assayPframe"),
|
|
36
|
+
epf: buildOutputsResult.output("epf"),
|
|
37
|
+
assayLinkerPframe: buildOutputsResult.output("assayLinkerPframe")
|
|
38
|
+
}
|
|
39
|
+
})
|
|
@@ -18,11 +18,11 @@ self.body(func(args) {
|
|
|
18
18
|
|
|
19
19
|
mem := "8GiB"
|
|
20
20
|
cpu := 1
|
|
21
|
-
if !is_undefined(args.mem) {
|
|
22
|
-
mem = string(args.mem) + "GiB"
|
|
21
|
+
if !is_undefined(args.metaInputs.mem) {
|
|
22
|
+
mem = string(args.metaInputs.mem) + "GiB"
|
|
23
23
|
}
|
|
24
|
-
if !is_undefined(args.cpu) {
|
|
25
|
-
cpu = args.cpu
|
|
24
|
+
if !is_undefined(args.metaInputs.cpu) {
|
|
25
|
+
cpu = args.metaInputs.cpu
|
|
26
26
|
}
|
|
27
27
|
|
|
28
28
|
mmseqs := exec.builder().
|