@platforma-open/milaboratories.immune-assay-data.workflow 1.7.1 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +9 -1
- package/CHANGELOG.md +14 -0
- package/dist/index.cjs +4 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +4 -0
- package/dist/tengo/tpl/analysis.plj.gz +0 -0
- package/dist/tengo/tpl/build-outputs.plj.gz +0 -0
- package/dist/tengo/tpl/check-content-empty.plj.gz +0 -0
- package/dist/tengo/tpl/extract-unique-values.plj.gz +0 -0
- package/dist/tengo/tpl/get-unique-values.plj.gz +0 -0
- package/dist/tengo/tpl/main.plj.gz +0 -0
- package/dist/tengo/tpl/process-outputs.plj.gz +0 -0
- package/dist/tengo/tpl/run-alignment.plj.gz +0 -0
- package/package.json +6 -5
- package/src/analysis.tpl.tengo +379 -0
- package/src/build-outputs.tpl.tengo +2 -19
- package/src/check-content-empty.tpl.tengo +9 -0
- package/src/extract-unique-values.tpl.tengo +2 -5
- package/src/get-unique-values.tpl.tengo +51 -0
- package/src/main.tpl.tengo +87 -335
- package/src/process-outputs.tpl.tengo +39 -0
- package/src/run-alignment.tpl.tengo +4 -4
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,17 +1,25 @@
|
|
|
1
1
|
WARN Issue while reading "/home/runner/work/immune-assay-data/immune-assay-data/.npmrc". Failed to replace env in config: ${NPMJS_TOKEN}
|
|
2
2
|
|
|
3
|
-
> @platforma-open/milaboratories.immune-assay-data.workflow@1.7.1 build /home/runner/work/immune-assay-data/immune-assay-data/workflow
|
|
3
|
+
> @platforma-open/milaboratories.immune-assay-data.workflow@1.8.1 build /home/runner/work/immune-assay-data/immune-assay-data/workflow
|
|
4
4
|
> rm -rf dist && pl-tengo check && pl-tengo build
|
|
5
5
|
|
|
6
|
+
Processing "src/analysis.tpl.tengo"...
|
|
6
7
|
Processing "src/build-outputs.tpl.tengo"...
|
|
8
|
+
Processing "src/check-content-empty.tpl.tengo"...
|
|
7
9
|
Processing "src/extract-unique-values.tpl.tengo"...
|
|
10
|
+
Processing "src/get-unique-values.tpl.tengo"...
|
|
8
11
|
Processing "src/main.tpl.tengo"...
|
|
12
|
+
Processing "src/process-outputs.tpl.tengo"...
|
|
9
13
|
Processing "src/run-alignment.tpl.tengo"...
|
|
10
14
|
No syntax errors found.
|
|
11
15
|
info: Compiling 'dist'...
|
|
12
16
|
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/build-outputs.plj.gz
|
|
17
|
+
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/check-content-empty.plj.gz
|
|
13
18
|
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/extract-unique-values.plj.gz
|
|
19
|
+
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/get-unique-values.plj.gz
|
|
20
|
+
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/process-outputs.plj.gz
|
|
14
21
|
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/run-alignment.plj.gz
|
|
22
|
+
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/analysis.plj.gz
|
|
15
23
|
info: - writing /home/runner/work/immune-assay-data/immune-assay-data/workflow/dist/tengo/tpl/main.plj.gz
|
|
16
24
|
info: Template Pack build done.
|
|
17
25
|
info: Template Pack build done.
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# @platforma-open/milaboratories.immune-assay-data.workflow
|
|
2
2
|
|
|
3
|
+
## 1.8.1
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 36ed105: Improve empty input detection
|
|
8
|
+
- Updated dependencies [36ed105]
|
|
9
|
+
- @platforma-open/milaboratories.immune-assay-data.check-content-empty@1.0.1
|
|
10
|
+
|
|
11
|
+
## 1.8.0
|
|
12
|
+
|
|
13
|
+
### Minor Changes
|
|
14
|
+
|
|
15
|
+
- de02090: Allow deduplication and minor fixes
|
|
16
|
+
|
|
3
17
|
## 1.7.1
|
|
4
18
|
|
|
5
19
|
### Patch Changes
|
package/dist/index.cjs
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
module.exports = { Templates: {
|
|
2
2
|
'build-outputs': { type: 'from-file', path: require.resolve('./tengo/tpl/build-outputs.plj.gz') },
|
|
3
|
+
'check-content-empty': { type: 'from-file', path: require.resolve('./tengo/tpl/check-content-empty.plj.gz') },
|
|
3
4
|
'extract-unique-values': { type: 'from-file', path: require.resolve('./tengo/tpl/extract-unique-values.plj.gz') },
|
|
5
|
+
'get-unique-values': { type: 'from-file', path: require.resolve('./tengo/tpl/get-unique-values.plj.gz') },
|
|
6
|
+
'process-outputs': { type: 'from-file', path: require.resolve('./tengo/tpl/process-outputs.plj.gz') },
|
|
4
7
|
'run-alignment': { type: 'from-file', path: require.resolve('./tengo/tpl/run-alignment.plj.gz') },
|
|
8
|
+
'analysis': { type: 'from-file', path: require.resolve('./tengo/tpl/analysis.plj.gz') },
|
|
5
9
|
'main': { type: 'from-file', path: require.resolve('./tengo/tpl/main.plj.gz') }
|
|
6
10
|
}};
|
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
declare type TemplateFromFile = { readonly type: "from-file"; readonly path: string; };
|
|
2
|
-
declare type TplName = "build-outputs" | "extract-unique-values" | "run-alignment" | "main";
|
|
2
|
+
declare type TplName = "build-outputs" | "check-content-empty" | "extract-unique-values" | "get-unique-values" | "process-outputs" | "run-alignment" | "analysis" | "main";
|
|
3
3
|
declare const Templates: Record<TplName, TemplateFromFile>;
|
|
4
4
|
export { Templates };
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
import { resolve } from 'node:path';
|
|
2
2
|
export const Templates = {
|
|
3
3
|
'build-outputs': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/build-outputs.plj.gz') },
|
|
4
|
+
'check-content-empty': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/check-content-empty.plj.gz') },
|
|
4
5
|
'extract-unique-values': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/extract-unique-values.plj.gz') },
|
|
6
|
+
'get-unique-values': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/get-unique-values.plj.gz') },
|
|
7
|
+
'process-outputs': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/process-outputs.plj.gz') },
|
|
5
8
|
'run-alignment': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/run-alignment.plj.gz') },
|
|
9
|
+
'analysis': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/analysis.plj.gz') },
|
|
6
10
|
'main': { type: 'from-file', path: resolve(import.meta.dirname, './tengo/tpl/main.plj.gz') }
|
|
7
11
|
};
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@platforma-open/milaboratories.immune-assay-data.workflow",
|
|
3
|
-
"version": "1.7.1",
|
|
3
|
+
"version": "1.8.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Tengo-based template",
|
|
6
6
|
"dependencies": {
|
|
7
|
-
"@platforma-sdk/workflow-tengo": "5.9.
|
|
7
|
+
"@platforma-sdk/workflow-tengo": "5.9.1",
|
|
8
8
|
"@platforma-open/soedinglab.software-mmseqs2": "1.18.3",
|
|
9
9
|
"@platforma-open/milaboratories.immune-assay-data.prepare-fasta": "1.1.3",
|
|
10
10
|
"@platforma-open/milaboratories.immune-assay-data.add-header": "1.1.3",
|
|
11
|
-
"@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv": "1.1.0",
|
|
12
11
|
"@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc": "1.2.0",
|
|
13
|
-
"@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv": "1.1.3"
|
|
12
|
+
"@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv": "1.1.3",
|
|
13
|
+
"@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv": "1.1.0",
|
|
14
|
+
"@platforma-open/milaboratories.immune-assay-data.check-content-empty": "1.0.1"
|
|
14
15
|
},
|
|
15
16
|
"devDependencies": {
|
|
16
|
-
"@platforma-sdk/tengo-builder": "2.4.
|
|
17
|
+
"@platforma-sdk/tengo-builder": "2.4.28"
|
|
17
18
|
},
|
|
18
19
|
"scripts": {
|
|
19
20
|
"build": "rm -rf dist && pl-tengo check && pl-tengo build",
|
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
2
|
+
ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
3
|
+
exec := import("@platforma-sdk/workflow-tengo:exec")
|
|
4
|
+
assets := import("@platforma-sdk/workflow-tengo:assets")
|
|
5
|
+
pt := import("@platforma-sdk/workflow-tengo:pt")
|
|
6
|
+
text := import("text")
|
|
7
|
+
json := import("json")
|
|
8
|
+
render := import("@platforma-sdk/workflow-tengo:render")
|
|
9
|
+
|
|
10
|
+
prepareFastaSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.prepare-fasta:main")
|
|
11
|
+
fastaToTsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv:main")
|
|
12
|
+
addHeaderSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.add-header:main")
|
|
13
|
+
covModeCalcSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc:main")
|
|
14
|
+
xlsxToCsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv:main")
|
|
15
|
+
checkContentEmptySw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.check-content-empty:main")
|
|
16
|
+
|
|
17
|
+
runAlignmentTpl := assets.importTemplate(":run-alignment")
|
|
18
|
+
checkContentEmptyTpl := assets.importTemplate(":check-content-empty")
|
|
19
|
+
getUniqueValuesTpl := assets.importTemplate(":get-unique-values")
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* Convert xlsx file to csv file
|
|
23
|
+
* @param fileXlsx - xlsx file
|
|
24
|
+
* @return csv file run result
|
|
25
|
+
*/
|
|
26
|
+
runXlsxToCsv := func(fileXlsx) {
|
|
27
|
+
e := exec.builder().
|
|
28
|
+
software(xlsxToCsvSw).
|
|
29
|
+
mem("16GiB").
|
|
30
|
+
cpu(1).
|
|
31
|
+
addFile("input.xlsx", fileXlsx).
|
|
32
|
+
arg("-i").arg("input.xlsx").
|
|
33
|
+
arg("-o").arg("output.csv").
|
|
34
|
+
saveFile("output.csv")
|
|
35
|
+
return e.run()
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Convert fasta file to tsv file
|
|
40
|
+
* @param fileFasta - fasta file
|
|
41
|
+
* @return tsv file run result
|
|
42
|
+
*/
|
|
43
|
+
runFastaToTsv := func(fileFasta) {
|
|
44
|
+
e := exec.builder().
|
|
45
|
+
software(fastaToTsvSw).
|
|
46
|
+
mem("8GiB").
|
|
47
|
+
cpu(1).
|
|
48
|
+
addFile("input.fasta", fileFasta).
|
|
49
|
+
arg("-i").arg("input.fasta").
|
|
50
|
+
arg("-o").arg("output.tsv").
|
|
51
|
+
saveFile("output.tsv")
|
|
52
|
+
return e.run()
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Convert tsv file to fasta file
|
|
57
|
+
* @param fileTsv - tsv file
|
|
58
|
+
* @param idCol - id column header
|
|
59
|
+
* @param seqCol - sequence column header
|
|
60
|
+
* @return fasta file run result
|
|
61
|
+
*/
|
|
62
|
+
runTsvToFasta := func(fileTsv, idCol, seqCol) {
|
|
63
|
+
e := exec.builder().
|
|
64
|
+
software(prepareFastaSw).
|
|
65
|
+
mem("8GiB").
|
|
66
|
+
cpu(1).
|
|
67
|
+
addFile("input.tsv", fileTsv).
|
|
68
|
+
arg("-i").arg("input.tsv").
|
|
69
|
+
arg("-o").arg("output.fasta").
|
|
70
|
+
arg("--seq_col").arg(seqCol).
|
|
71
|
+
arg("--id_col").arg(idCol).
|
|
72
|
+
saveFile("output.fasta")
|
|
73
|
+
return e.run()
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
prepareAssayFile := func(file, xsvType, sequenceColumnHeader) {
|
|
77
|
+
// assign ids to assay sequences
|
|
78
|
+
ptw := pt.workflow()
|
|
79
|
+
df := ptw.frame({
|
|
80
|
+
file: file,
|
|
81
|
+
xsvType: xsvType
|
|
82
|
+
})
|
|
83
|
+
|
|
84
|
+
//////// calculate sequence id ////////
|
|
85
|
+
// Create unique seqId for each row by combining sequence with row index
|
|
86
|
+
// First add row index using ordinal rank
|
|
87
|
+
df = df.withColumns(
|
|
88
|
+
pt.rank(pt.col(sequenceColumnHeader)).
|
|
89
|
+
over(pt.col(sequenceColumnHeader)).
|
|
90
|
+
alias("rowIndex")
|
|
91
|
+
)
|
|
92
|
+
// Concatenate sequence with row index and then hash
|
|
93
|
+
df = df.withColumns(
|
|
94
|
+
pt.when(pt.col("rowIndex").gt(pt.lit(1))).
|
|
95
|
+
then(pt.concatStr([pt.col(sequenceColumnHeader), pt.col("rowIndex").cast("String")],
|
|
96
|
+
{delimiter: "_"})).
|
|
97
|
+
otherwise(pt.col(sequenceColumnHeader)).
|
|
98
|
+
alias("uniqueKey")
|
|
99
|
+
)
|
|
100
|
+
// Create hash from the unique key
|
|
101
|
+
df = df.addColumns(
|
|
102
|
+
pt.col("uniqueKey").hash("sha256", "base64_alphanumeric", 120).alias("seqId")
|
|
103
|
+
)
|
|
104
|
+
df = df.withoutColumns("uniqueKey", "rowIndex")
|
|
105
|
+
//////// add label to ids ////////
|
|
106
|
+
df = df.withColumns(
|
|
107
|
+
pt.col("seqId").
|
|
108
|
+
strReplace("\\d", "", { replaceAll: true }).
|
|
109
|
+
strSlice(0, 5). // Take first 5 characters
|
|
110
|
+
strToUpper(). // Convert to uppercase
|
|
111
|
+
alias("tmpLabel")
|
|
112
|
+
)
|
|
113
|
+
df = df.withColumns(
|
|
114
|
+
pt.rank(pt.col("seqId")). // Rank based on clonotypeKeyCol (default ascending)
|
|
115
|
+
over(pt.col("tmpLabel")). // Partition by prefixTempCol
|
|
116
|
+
alias("rank")
|
|
117
|
+
)
|
|
118
|
+
df = df.withColumns(
|
|
119
|
+
pt.when(pt.col("rank").gt(pt.lit(1))).
|
|
120
|
+
then(pt.concatStr([pt.lit("A"), pt.col("tmpLabel"), pt.col("rank").cast("String")], {
|
|
121
|
+
delimiter: "-" })).
|
|
122
|
+
otherwise(pt.concatStr([pt.lit("A"), pt.col("tmpLabel")], { delimiter: "-" })).
|
|
123
|
+
alias("seqIdLabel")
|
|
124
|
+
)
|
|
125
|
+
df = df.withoutColumns("rank", "tmpLabel")
|
|
126
|
+
|
|
127
|
+
//////// add sequence column ////////
|
|
128
|
+
df = df.addColumns(
|
|
129
|
+
pt.col(sequenceColumnHeader).alias("sequence")
|
|
130
|
+
)
|
|
131
|
+
df.save("output.tsv")
|
|
132
|
+
|
|
133
|
+
return ptw.run().getFile("output.tsv")
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
self.defineOutputs(
|
|
137
|
+
"bestAlignmentTsv",
|
|
138
|
+
"assayDataTsv",
|
|
139
|
+
"clonesDataTsv",
|
|
140
|
+
"mmseqsOutput",
|
|
141
|
+
"emptyResults",
|
|
142
|
+
"uniqueValuesMap",
|
|
143
|
+
"columnsToImport"
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
self.body(func(args) {
|
|
147
|
+
file := args.file
|
|
148
|
+
xsvType := args.xsvType
|
|
149
|
+
sequenceColumnHeader := args.sequenceColumnHeader
|
|
150
|
+
clonesFasta := args.clonesFasta
|
|
151
|
+
emptyClonesInput := args.emptyClonesInput
|
|
152
|
+
targetSequenceType := args.targetSequenceType
|
|
153
|
+
assaySequenceType := args.assaySequenceType
|
|
154
|
+
coverageThreshold := args.coverageThreshold
|
|
155
|
+
identityThreshold := args.identityThreshold
|
|
156
|
+
similarityType := args.similarityType
|
|
157
|
+
lessSensitive := args.lessSensitive
|
|
158
|
+
mem := args.metaInputs.mem
|
|
159
|
+
cpu := args.metaInputs.cpu
|
|
160
|
+
|
|
161
|
+
// Filter columns to import based on user selection
|
|
162
|
+
columnsToImport := args.importColumns
|
|
163
|
+
if args.selectedColumns != undefined && len(args.selectedColumns) > 0 {
|
|
164
|
+
selectedHeaders := {}
|
|
165
|
+
for header in args.selectedColumns {
|
|
166
|
+
selectedHeaders[header] = true
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
filteredColumns := []
|
|
170
|
+
for col in args.importColumns {
|
|
171
|
+
// Always include the main sequence column
|
|
172
|
+
if col.header == sequenceColumnHeader || selectedHeaders[col.header] {
|
|
173
|
+
filteredColumns = append(filteredColumns, col)
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
columnsToImport = filteredColumns
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
// Determine string columns for unique value extraction
|
|
180
|
+
stringColumns := []
|
|
181
|
+
for h in columnsToImport {
|
|
182
|
+
if h.type == "String" && h.header != sequenceColumnHeader {
|
|
183
|
+
stringColumns = append(stringColumns, h.header)
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
// Check if clones input is empty (resolved from upstream check template)
|
|
188
|
+
if emptyClonesInput {
|
|
189
|
+
return {
|
|
190
|
+
bestAlignmentTsv: {},
|
|
191
|
+
assayDataTsv: {},
|
|
192
|
+
clonesDataTsv: {},
|
|
193
|
+
mmseqsOutput: {},
|
|
194
|
+
emptyResults: true,
|
|
195
|
+
uniqueValuesMap: {},
|
|
196
|
+
columnsToImport: columnsToImport
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// Handle XLSX files (convert to CSV)
|
|
201
|
+
if xsvType == "xlsx" {
|
|
202
|
+
xlsxToCsvRun := runXlsxToCsv(file)
|
|
203
|
+
file = xlsxToCsvRun.getFile("output.csv")
|
|
204
|
+
xsvType = "csv"
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Handle FASTA files (convert to TSV)
|
|
208
|
+
if xsvType == "fasta" || xsvType == "fa" {
|
|
209
|
+
fastaToTsvRun := runFastaToTsv(file)
|
|
210
|
+
file = fastaToTsvRun.getFile("output.tsv")
|
|
211
|
+
xsvType = "tsv"
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
assayTsv := prepareAssayFile(file, xsvType, sequenceColumnHeader)
|
|
215
|
+
|
|
216
|
+
// Prepare assay FASTA (clones FASTA already prepared upstream)
|
|
217
|
+
assayFastaRun := runTsvToFasta(assayTsv, "seqId", "sequence")
|
|
218
|
+
assayFasta := assayFastaRun.getFile("output.fasta")
|
|
219
|
+
|
|
220
|
+
// Dynamically determine coverage mode by comparing average sequence lengths
|
|
221
|
+
coverageModeRun := exec.builder().
|
|
222
|
+
software(covModeCalcSw).
|
|
223
|
+
mem("16GiB").
|
|
224
|
+
cpu(1).
|
|
225
|
+
addFile("clones.fasta", clonesFasta).
|
|
226
|
+
addFile("assay.fasta", assayFasta).
|
|
227
|
+
arg("--clones-fasta").arg("clones.fasta").
|
|
228
|
+
arg("--assay-fasta").arg("assay.fasta").
|
|
229
|
+
arg("--output").arg("coverage_mode.txt").
|
|
230
|
+
saveFileContent("coverage_mode.txt").
|
|
231
|
+
run()
|
|
232
|
+
|
|
233
|
+
covMode := coverageModeRun.getFileContent("coverage_mode.txt")
|
|
234
|
+
|
|
235
|
+
// MMseqs2 Alignment
|
|
236
|
+
mmseqsSearchType := "0"
|
|
237
|
+
if targetSequenceType == "aminoacid" && assaySequenceType == "aminoacid" {
|
|
238
|
+
//1: amino acid
|
|
239
|
+
mmseqsSearchType = "1"
|
|
240
|
+
} else if targetSequenceType == "nucleotide" && assaySequenceType == "nucleotide" {
|
|
241
|
+
// 3: nucleotide
|
|
242
|
+
mmseqsSearchType = "3"
|
|
243
|
+
} else if targetSequenceType == "nucleotide" && assaySequenceType == "aminoacid" {
|
|
244
|
+
// 4: translated nucleotide alignment
|
|
245
|
+
mmseqsSearchType = "4"
|
|
246
|
+
} else if targetSequenceType == "aminoacid" && assaySequenceType == "nucleotide" {
|
|
247
|
+
// 2: translated
|
|
248
|
+
mmseqsSearchType = "2"
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
runMmseqs := render.create(runAlignmentTpl, {
|
|
252
|
+
covMode: covMode,
|
|
253
|
+
mmseqsSearchType: mmseqsSearchType,
|
|
254
|
+
coverageThreshold: coverageThreshold,
|
|
255
|
+
identityThreshold: identityThreshold,
|
|
256
|
+
similarityType: similarityType,
|
|
257
|
+
clonesFasta: clonesFasta,
|
|
258
|
+
assayFasta: assayFasta,
|
|
259
|
+
lessSensitive: lessSensitive
|
|
260
|
+
}, {
|
|
261
|
+
metaInputs: {
|
|
262
|
+
mem: mem,
|
|
263
|
+
cpu: cpu
|
|
264
|
+
}
|
|
265
|
+
})
|
|
266
|
+
|
|
267
|
+
mmseqsOutput := runMmseqs.output("mmseqsOutput")
|
|
268
|
+
|
|
269
|
+
// @TODO remove header stuff and replace with pt when available (!)
|
|
270
|
+
addHeaderRun := exec.builder().
|
|
271
|
+
software(addHeaderSw).
|
|
272
|
+
mem("16GiB").
|
|
273
|
+
cpu(1).
|
|
274
|
+
arg("-i").arg("results.tsv").
|
|
275
|
+
arg("-o").arg("results_with_header.tsv").
|
|
276
|
+
addFile("results.tsv", mmseqsOutput).
|
|
277
|
+
saveFile("results_with_header.tsv").
|
|
278
|
+
run()
|
|
279
|
+
|
|
280
|
+
mmseqsResultTsv := addHeaderRun.getFile("results_with_header.tsv")
|
|
281
|
+
|
|
282
|
+
// Check if results are empty (only header line or nothing)
|
|
283
|
+
checkResultsRun := exec.builder().
|
|
284
|
+
software(checkContentEmptySw).
|
|
285
|
+
arg("-i").arg("input.file").
|
|
286
|
+
arg("-n").arg("2"). // Require at least 2 non-empty lines (header + 1 data line)
|
|
287
|
+
addFile("input.file", mmseqsResultTsv).
|
|
288
|
+
saveStdoutContent().
|
|
289
|
+
mem("8GiB").
|
|
290
|
+
cpu(1).
|
|
291
|
+
inLightQueue().
|
|
292
|
+
run()
|
|
293
|
+
|
|
294
|
+
checkResult := render.create(checkContentEmptyTpl, {
|
|
295
|
+
content: checkResultsRun.getStdoutContent()
|
|
296
|
+
})
|
|
297
|
+
emptyResults := checkResult.output("result")
|
|
298
|
+
|
|
299
|
+
result := {
|
|
300
|
+
mmseqsOutput: mmseqsOutput,
|
|
301
|
+
emptyResults: emptyResults
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
// Post-processing with PT
|
|
305
|
+
ptw2 := pt.workflow()
|
|
306
|
+
dfRes := ptw2.frame({
|
|
307
|
+
file: mmseqsResultTsv,
|
|
308
|
+
xsvType: "tsv"
|
|
309
|
+
})
|
|
310
|
+
|
|
311
|
+
// Cast columns to ensure correct types for aggregation
|
|
312
|
+
dfRes = dfRes.withColumns(
|
|
313
|
+
pt.col("evalue").cast("Float64").alias("evalue"),
|
|
314
|
+
pt.col("bits").cast("Float64").alias("bits")
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
cols := []
|
|
318
|
+
for _, col in ["bits", "evalue", "query", "pident", "alnlen", "mismatch",
|
|
319
|
+
"gapopen", "qstart", "qend", "tstart", "tend"] {
|
|
320
|
+
cols = append(cols,
|
|
321
|
+
pt.col(col).maxBy(
|
|
322
|
+
pt.col("evalue").multiply(-1),
|
|
323
|
+
pt.col("bits")
|
|
324
|
+
).alias(col)
|
|
325
|
+
)
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
dfRes = dfRes.groupBy("target").agg(cols...)
|
|
329
|
+
// Add link column for linker pFrame (assayLinkerPframe)
|
|
330
|
+
dfRes = dfRes.withColumns(
|
|
331
|
+
pt.lit(1).cast("Int64").alias("link")
|
|
332
|
+
)
|
|
333
|
+
dfRes.save("best_alignment.tsv")
|
|
334
|
+
|
|
335
|
+
// Assay data summary
|
|
336
|
+
assayDf := ptw2.frame({
|
|
337
|
+
file: assayTsv,
|
|
338
|
+
xsvType: "tsv"
|
|
339
|
+
})
|
|
340
|
+
|
|
341
|
+
// import how many matches per assay sequence found
|
|
342
|
+
assayDf = assayDf.join(
|
|
343
|
+
dfRes.groupBy("query").agg(
|
|
344
|
+
pt.col("target").count().alias("queryCount")
|
|
345
|
+
),
|
|
346
|
+
{
|
|
347
|
+
how: "left",
|
|
348
|
+
leftOn: "seqId",
|
|
349
|
+
rightOn: "query"
|
|
350
|
+
}
|
|
351
|
+
)
|
|
352
|
+
assayDf.save("assay_data.tsv")
|
|
353
|
+
|
|
354
|
+
// Clones data
|
|
355
|
+
clonesDf := dfRes.join(assayDf,
|
|
356
|
+
{
|
|
357
|
+
how: "left",
|
|
358
|
+
leftOn: "query",
|
|
359
|
+
rightOn: "seqId"
|
|
360
|
+
}
|
|
361
|
+
)
|
|
362
|
+
clonesDf.save("clones_data.tsv")
|
|
363
|
+
|
|
364
|
+
ptRun2 := ptw2.run()
|
|
365
|
+
|
|
366
|
+
result.bestAlignmentTsv = ptRun2.getFile("best_alignment.tsv")
|
|
367
|
+
result.assayDataTsv = ptRun2.getFile("assay_data.tsv")
|
|
368
|
+
result.clonesDataTsv = ptRun2.getFile("clones_data.tsv")
|
|
369
|
+
|
|
370
|
+
// Extract unique values for string columns
|
|
371
|
+
getUniqueValuesResult := render.create(getUniqueValuesTpl, {
|
|
372
|
+
assayDataTsv: result.assayDataTsv,
|
|
373
|
+
stringColumns: stringColumns
|
|
374
|
+
})
|
|
375
|
+
result.uniqueValuesMap = getUniqueValuesResult.output("uniqueValuesMap")
|
|
376
|
+
result.columnsToImport = columnsToImport
|
|
377
|
+
|
|
378
|
+
return result
|
|
379
|
+
})
|
|
@@ -57,24 +57,7 @@ self.body(func(inputs) {
|
|
|
57
57
|
}
|
|
58
58
|
]
|
|
59
59
|
|
|
60
|
-
|
|
61
|
-
if inputs.selectedColumns != undefined && len(inputs.selectedColumns) > 0 {
|
|
62
|
-
selectedHeaders := {}
|
|
63
|
-
for header in inputs.selectedColumns {
|
|
64
|
-
selectedHeaders[header] = true
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
filteredColumns := []
|
|
68
|
-
for col in inputs.importColumns {
|
|
69
|
-
// Always include the main sequence column
|
|
70
|
-
if col.header == inputs.sequenceColumnHeader || selectedHeaders[col.header] {
|
|
71
|
-
filteredColumns = append(filteredColumns, col)
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
columnsToImport = filteredColumns
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
for h in columnsToImport {
|
|
60
|
+
for h in inputs.columnsToImport {
|
|
78
61
|
if h.header == inputs.sequenceColumnHeader {
|
|
79
62
|
continue
|
|
80
63
|
}
|
|
@@ -160,7 +143,7 @@ self.body(func(inputs) {
|
|
|
160
143
|
}
|
|
161
144
|
}]
|
|
162
145
|
|
|
163
|
-
for h in columnsToImport {
|
|
146
|
+
for h in inputs.columnsToImport {
|
|
164
147
|
annotations := {
|
|
165
148
|
"pl7.app/label": h.header,
|
|
166
149
|
"pl7.app/table/visibility": h.header == inputs.sequenceColumnHeader ? "optional" : "default"
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
2
|
+
text := import("text")
|
|
3
|
+
|
|
4
|
+
self.defineOutputs("result")
|
|
5
|
+
|
|
6
|
+
self.body(func(args) {
|
|
7
|
+
// 'content' is the stdout of the check-software (already a "true"/"false" string)
|
|
8
|
+
return { result: text.trim_space(string(args.content.getData())) == "true" }
|
|
9
|
+
})
|
|
@@ -10,8 +10,7 @@ self.body(func(inputs) {
|
|
|
10
10
|
uniqueValuesMap := {}
|
|
11
11
|
|
|
12
12
|
for header, contentField in fileContents {
|
|
13
|
-
// In subtemplate, we can call getData() directly on
|
|
14
|
-
// (following clonotype-enrichment pattern)
|
|
13
|
+
// In subtemplate, we can call getData() directly on inputs
|
|
15
14
|
contentBytes := contentField.getData()
|
|
16
15
|
content := string(contentBytes)
|
|
17
16
|
lines := text.split(text.trim_space(content), "\n")
|
|
@@ -20,8 +19,7 @@ self.body(func(inputs) {
|
|
|
20
19
|
// Skip header and collect values
|
|
21
20
|
values := lines[1:]
|
|
22
21
|
// JSON encode the array of strings and convert to string
|
|
23
|
-
|
|
24
|
-
uniqueValuesMap[header] = string(encodedBytes)
|
|
22
|
+
uniqueValuesMap[header] = string(json.encode(values))
|
|
25
23
|
}
|
|
26
24
|
}
|
|
27
25
|
|
|
@@ -29,4 +27,3 @@ self.body(func(inputs) {
|
|
|
29
27
|
uniqueValuesMap: uniqueValuesMap
|
|
30
28
|
}
|
|
31
29
|
})
|
|
32
|
-
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
2
|
+
ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
3
|
+
pt := import("@platforma-sdk/workflow-tengo:pt")
|
|
4
|
+
render := import("@platforma-sdk/workflow-tengo:render")
|
|
5
|
+
assets := import("@platforma-sdk/workflow-tengo:assets")
|
|
6
|
+
strings := import("@platforma-sdk/workflow-tengo:strings")
|
|
7
|
+
|
|
8
|
+
extractUniqueValuesTpl := assets.importTemplate(":extract-unique-values")
|
|
9
|
+
|
|
10
|
+
self.defineOutputs("uniqueValuesMap")
|
|
11
|
+
|
|
12
|
+
self.body(func(inputs) {
|
|
13
|
+
assayDataTsv := inputs.assayDataTsv
|
|
14
|
+
stringColumns := inputs.stringColumns
|
|
15
|
+
|
|
16
|
+
if len(stringColumns) == 0 {
|
|
17
|
+
return { uniqueValuesMap: {} }
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
uniqueValuesWf := pt.workflow().mem("4GiB").cpu(1)
|
|
21
|
+
baseDf := uniqueValuesWf.frame({
|
|
22
|
+
file: assayDataTsv,
|
|
23
|
+
xsvType: "tsv"
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
// Process each String column to extract unique values
|
|
27
|
+
for colHeader in stringColumns {
|
|
28
|
+
uniqueValuesDf := baseDf.select(pt.col(colHeader).alias("value")).groupBy("value").agg(pt.col("value").count().alias("_count"))
|
|
29
|
+
uniqueValuesDf = uniqueValuesDf.select("value")
|
|
30
|
+
fileName := "unique_values_" + strings.substituteSpecialCharacters(colHeader) + ".csv"
|
|
31
|
+
uniqueValuesDf.saveContent(fileName)
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Run once and collect all results
|
|
35
|
+
uniqueValuesResult := uniqueValuesWf.run()
|
|
36
|
+
|
|
37
|
+
fileContentsMap := {}
|
|
38
|
+
for colHeader in stringColumns {
|
|
39
|
+
fileName := "unique_values_" + strings.substituteSpecialCharacters(colHeader) + ".csv"
|
|
40
|
+
fileContentsMap[colHeader] = uniqueValuesResult.getFileContent(fileName)
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Now call the extraction subtemplate which will use .getData() on these resources
|
|
44
|
+
extractResult := render.create(extractUniqueValuesTpl, {
|
|
45
|
+
fileContents: fileContentsMap
|
|
46
|
+
})
|
|
47
|
+
|
|
48
|
+
return {
|
|
49
|
+
uniqueValuesMap: extractResult.output("uniqueValuesMap")
|
|
50
|
+
}
|
|
51
|
+
})
|
package/src/main.tpl.tengo
CHANGED
|
@@ -3,25 +3,20 @@ ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
|
3
3
|
file := import("@platforma-sdk/workflow-tengo:file")
|
|
4
4
|
exec := import("@platforma-sdk/workflow-tengo:exec")
|
|
5
5
|
assets:= import("@platforma-sdk/workflow-tengo:assets")
|
|
6
|
-
maps:= import("@platforma-sdk/workflow-tengo:maps")
|
|
7
|
-
xsv := import("@platforma-sdk/workflow-tengo:pframes.xsv")
|
|
8
6
|
pframes := import("@platforma-sdk/workflow-tengo:pframes")
|
|
9
|
-
pSpec := import("@platforma-sdk/workflow-tengo:pframes.spec")
|
|
10
7
|
pt := import("@platforma-sdk/workflow-tengo:pt")
|
|
11
8
|
path := import("@platforma-sdk/workflow-tengo:path")
|
|
12
9
|
json := import("json")
|
|
13
10
|
text := import("text")
|
|
14
11
|
render := import("@platforma-sdk/workflow-tengo:render")
|
|
15
12
|
strings := import("@platforma-sdk/workflow-tengo:strings")
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
13
|
+
|
|
14
|
+
analysisTpl := assets.importTemplate(":analysis")
|
|
15
|
+
processOutputsTpl := assets.importTemplate(":process-outputs")
|
|
16
|
+
checkContentEmptyTpl := assets.importTemplate(":check-content-empty")
|
|
19
17
|
|
|
20
18
|
prepareFastaSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.prepare-fasta:main")
|
|
21
|
-
|
|
22
|
-
addHeaderSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.add-header:main")
|
|
23
|
-
covModeCalcSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc:main")
|
|
24
|
-
xlsxToCsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv:main")
|
|
19
|
+
checkContentEmptySw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.check-content-empty:main")
|
|
25
20
|
|
|
26
21
|
wf.prepare(func(args){
|
|
27
22
|
bundleBuilder := wf.createPBundleBuilder()
|
|
@@ -32,65 +27,6 @@ wf.prepare(func(args){
|
|
|
32
27
|
}
|
|
33
28
|
})
|
|
34
29
|
|
|
35
|
-
prepareAssayFile := func(args, file, xsvType) {
|
|
36
|
-
// assign ids to assay sequences
|
|
37
|
-
ptw := pt.workflow()
|
|
38
|
-
df := ptw.frame({
|
|
39
|
-
file: file,
|
|
40
|
-
xsvType: xsvType
|
|
41
|
-
})
|
|
42
|
-
|
|
43
|
-
//////// calculate sequence id ////////
|
|
44
|
-
// Create unique seqId for each row by combining sequence with row index
|
|
45
|
-
// First add row index using ordinal rank
|
|
46
|
-
df = df.withColumns(
|
|
47
|
-
pt.rank(pt.col(args.sequenceColumnHeader)).
|
|
48
|
-
over(pt.col(args.sequenceColumnHeader)).
|
|
49
|
-
alias("rowIndex")
|
|
50
|
-
)
|
|
51
|
-
// Concatenate sequence with row index and then hash
|
|
52
|
-
df = df.withColumns(
|
|
53
|
-
pt.when(pt.col("rowIndex").gt(pt.lit(1))).
|
|
54
|
-
then(pt.concatStr([pt.col(args.sequenceColumnHeader), pt.col("rowIndex").cast("String")], {delimiter: "_"})).
|
|
55
|
-
otherwise(pt.col(args.sequenceColumnHeader)).
|
|
56
|
-
alias("uniqueKey")
|
|
57
|
-
)
|
|
58
|
-
// Create hash from the unique key
|
|
59
|
-
df = df.addColumns(
|
|
60
|
-
pt.col("uniqueKey").hash("sha256", "base64_alphanumeric", 120).alias("seqId")
|
|
61
|
-
)
|
|
62
|
-
// Remove the temporary columns
|
|
63
|
-
//df = df.withoutColumns("uniqueKey", "rowIndex")
|
|
64
|
-
//////// add label to ids ////////
|
|
65
|
-
df = df.withColumns(
|
|
66
|
-
pt.col("seqId").
|
|
67
|
-
strReplace("\\d", "", { replaceAll: true }).
|
|
68
|
-
strSlice(0, 5). // Take first 5 characters
|
|
69
|
-
strToUpper(). // Convert to uppercase
|
|
70
|
-
alias("tmpLabel")
|
|
71
|
-
)
|
|
72
|
-
df = df.withColumns(
|
|
73
|
-
pt.rank(pt.col("seqId")). // Rank based on clonotypeKeyCol (default ascending)
|
|
74
|
-
over(pt.col("tmpLabel")). // Partition by prefixTempCol
|
|
75
|
-
alias("rank")
|
|
76
|
-
)
|
|
77
|
-
df = df.withColumns(
|
|
78
|
-
pt.when(pt.col("rank").gt(pt.lit(1))).
|
|
79
|
-
then(pt.concatStr([pt.lit("A"), pt.col("tmpLabel"), pt.col("rank").cast("String")], { delimiter: "-" })).
|
|
80
|
-
otherwise(pt.concatStr([pt.lit("A"), pt.col("tmpLabel")], { delimiter: "-" })).
|
|
81
|
-
alias("seqIdLabel")
|
|
82
|
-
)
|
|
83
|
-
df = df.withoutColumns("rank", "tmpLabel")
|
|
84
|
-
|
|
85
|
-
//////// add sequence column ////////
|
|
86
|
-
df = df.addColumns(
|
|
87
|
-
pt.col(args.sequenceColumnHeader).alias("sequence")
|
|
88
|
-
)
|
|
89
|
-
df.save("output.tsv")
|
|
90
|
-
|
|
91
|
-
return ptw.run().getFile("output.tsv")
|
|
92
|
-
}
|
|
93
|
-
|
|
94
30
|
prepareClonesTsv := func(args) {
|
|
95
31
|
columns := args.columns
|
|
96
32
|
datasetSpec := columns.getSpec(args.datasetRef)
|
|
@@ -105,62 +41,6 @@ prepareClonesTsv := func(args) {
|
|
|
105
41
|
return cloneTable.build()
|
|
106
42
|
}
|
|
107
43
|
|
|
108
|
-
/**
|
|
109
|
-
* Convert tsv file to fasta file
|
|
110
|
-
* @param fileTsv - tsv file
|
|
111
|
-
* @return fasta file run result
|
|
112
|
-
*/
|
|
113
|
-
runTsvToFasta := func(fileTsv) {
|
|
114
|
-
e := exec.builder().
|
|
115
|
-
software(prepareFastaSw).
|
|
116
|
-
mem("8GiB").
|
|
117
|
-
cpu(1).
|
|
118
|
-
addFile("input.tsv", fileTsv).
|
|
119
|
-
arg("-i").arg("input.tsv").
|
|
120
|
-
arg("-o").arg("output.fasta").
|
|
121
|
-
arg("--seq_col").arg("sequence").
|
|
122
|
-
arg("--id_col").arg("seqId").
|
|
123
|
-
saveFile("output.fasta")
|
|
124
|
-
|
|
125
|
-
return e.run()
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
/**
|
|
129
|
-
* Convert fasta file to tsv file
|
|
130
|
-
* @param fileFasta - fasta file
|
|
131
|
-
* @return tsv file run result
|
|
132
|
-
*/
|
|
133
|
-
runFastaToTsv := func(fileFasta) {
|
|
134
|
-
e := exec.builder().
|
|
135
|
-
software(fastaToTsvSw).
|
|
136
|
-
mem("8GiB").
|
|
137
|
-
cpu(1).
|
|
138
|
-
addFile("input.fasta", fileFasta).
|
|
139
|
-
arg("-i").arg("input.fasta").
|
|
140
|
-
arg("-o").arg("output.tsv").
|
|
141
|
-
saveFile("output.tsv")
|
|
142
|
-
|
|
143
|
-
return e.run()
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
/**
|
|
147
|
-
* Convert xlsx file to csv file
|
|
148
|
-
* @param fileXlsx - xlsx file
|
|
149
|
-
* @return csv file run result
|
|
150
|
-
*/
|
|
151
|
-
runXlsxToCsv := func(fileXlsx) {
|
|
152
|
-
e := exec.builder().
|
|
153
|
-
software(xlsxToCsvSw).
|
|
154
|
-
mem("16GiB").
|
|
155
|
-
cpu(1).
|
|
156
|
-
addFile("input.xlsx", fileXlsx).
|
|
157
|
-
arg("-i").arg("input.xlsx").
|
|
158
|
-
arg("-o").arg("output.csv").
|
|
159
|
-
saveFile("output.csv")
|
|
160
|
-
|
|
161
|
-
return e.run()
|
|
162
|
-
}
|
|
163
|
-
|
|
164
44
|
wf.body(func(args) {
|
|
165
45
|
importFile := file.importFile(args.fileHandle)
|
|
166
46
|
datasetSpec := args.columns.getSpec(args.datasetRef)
|
|
@@ -211,238 +91,110 @@ wf.body(func(args) {
|
|
|
211
91
|
xsvType = args.detectedXsvType
|
|
212
92
|
}
|
|
213
93
|
|
|
214
|
-
// Handle XLSX files by converting to CSV first
|
|
215
|
-
if xsvType == "xlsx" {
|
|
216
|
-
xlsxToCsvRun := runXlsxToCsv(importFile.file)
|
|
217
|
-
importFile.file = xlsxToCsvRun.getFile("output.csv")
|
|
218
|
-
xsvType = "csv"
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
// Handle FASTA files by converting to TSV first
|
|
222
|
-
if xsvType == "fasta" || xsvType == "fa" {
|
|
223
|
-
fastaToTsvRun := runFastaToTsv(importFile.file)
|
|
224
|
-
importFile.file = fastaToTsvRun.getFile("output.tsv")
|
|
225
|
-
xsvType = "tsv"
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
assayTsv := prepareAssayFile(args, importFile.file, xsvType)
|
|
229
94
|
clonesTsv := prepareClonesTsv(args)
|
|
230
95
|
|
|
231
|
-
//
|
|
232
|
-
clonesFastaRun :=
|
|
233
|
-
|
|
96
|
+
// Convert clones TSV to FASTA and check if empty
|
|
97
|
+
clonesFastaRun := exec.builder().
|
|
98
|
+
software(prepareFastaSw).
|
|
99
|
+
mem("8GiB").
|
|
100
|
+
cpu(1).
|
|
101
|
+
addFile("input.tsv", clonesTsv).
|
|
102
|
+
arg("-i").arg("input.tsv").
|
|
103
|
+
arg("-o").arg("output.fasta").
|
|
104
|
+
arg("--seq_col").arg("sequence").
|
|
105
|
+
arg("--id_col").arg("seqId").
|
|
106
|
+
saveFile("output.fasta").
|
|
107
|
+
run()
|
|
234
108
|
clonesFasta := clonesFastaRun.getFile("output.fasta")
|
|
235
|
-
assayFasta := assayFastaRun.getFile("output.fasta")
|
|
236
109
|
|
|
237
|
-
//
|
|
238
|
-
|
|
239
|
-
software(
|
|
240
|
-
|
|
110
|
+
// Check if clones are empty before running mmseqs2
|
|
111
|
+
checkClonesRun := exec.builder().
|
|
112
|
+
software(checkContentEmptySw).
|
|
113
|
+
arg("-i").arg("input.file").
|
|
114
|
+
arg("-n").arg("1").
|
|
115
|
+
addFile("input.file", clonesFasta).
|
|
116
|
+
saveStdoutContent().
|
|
117
|
+
mem("8GiB").
|
|
241
118
|
cpu(1).
|
|
242
|
-
|
|
243
|
-
addFile("assay.fasta", assayFasta).
|
|
244
|
-
arg("--clones-fasta").arg("clones.fasta").
|
|
245
|
-
arg("--assay-fasta").arg("assay.fasta").
|
|
246
|
-
arg("--output").arg("coverage_mode.txt").
|
|
247
|
-
saveFileContent("coverage_mode.txt").
|
|
119
|
+
inLightQueue().
|
|
248
120
|
run()
|
|
249
121
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
//1: amino acid
|
|
255
|
-
mmseqsSearchType = "1"
|
|
256
|
-
} else if targetSequenceType == "nucleotide" && assaySequenceType == "nucleotide" {
|
|
257
|
-
// 3: nucleotide
|
|
258
|
-
mmseqsSearchType = "3"
|
|
259
|
-
} else if targetSequenceType == "nucleotide" && assaySequenceType == "aminoacid" {
|
|
260
|
-
// 4: translated nucleotide alignment
|
|
261
|
-
mmseqsSearchType = "4"
|
|
262
|
-
} else if targetSequenceType == "aminoacid" && assaySequenceType == "nucleotide" {
|
|
263
|
-
// 2: nucleotide
|
|
264
|
-
mmseqsSearchType = "2"
|
|
265
|
-
}
|
|
122
|
+
checkClonesResult := render.create(checkContentEmptyTpl, {
|
|
123
|
+
content: checkClonesRun.getStdoutContent()
|
|
124
|
+
})
|
|
125
|
+
emptyClonesInput := checkClonesResult.output("result")
|
|
266
126
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
127
|
+
// Run Analysis Template
|
|
128
|
+
analysisResult := render.create(analysisTpl, {
|
|
129
|
+
file: importFile.file,
|
|
130
|
+
xsvType: xsvType,
|
|
131
|
+
sequenceColumnHeader: args.sequenceColumnHeader,
|
|
132
|
+
clonesFasta: clonesFasta,
|
|
133
|
+
emptyClonesInput: emptyClonesInput,
|
|
134
|
+
targetSequenceType: targetSequenceType,
|
|
135
|
+
assaySequenceType: assaySequenceType,
|
|
270
136
|
coverageThreshold: args.settings.coverageThreshold,
|
|
271
137
|
identityThreshold: args.settings.identity,
|
|
272
138
|
similarityType: args.settings.similarityType,
|
|
273
|
-
clonesFasta: clonesFasta,
|
|
274
|
-
assayFasta: assayFasta,
|
|
275
139
|
lessSensitive: args.lessSensitive,
|
|
276
|
-
|
|
277
|
-
|
|
140
|
+
importColumns: args.importColumns,
|
|
141
|
+
selectedColumns: args.selectedColumns
|
|
142
|
+
}, {
|
|
143
|
+
metaInputs: {
|
|
144
|
+
mem: args.mem,
|
|
145
|
+
cpu: args.cpu
|
|
146
|
+
}
|
|
278
147
|
})
|
|
279
148
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
addHeaderRunResult := exec.builder().
|
|
284
|
-
software(addHeaderSw).
|
|
285
|
-
mem("16GiB").
|
|
286
|
-
cpu(1).
|
|
287
|
-
arg("-i").arg("results.tsv").
|
|
288
|
-
arg("-o").arg("results_with_header.tsv").
|
|
289
|
-
addFile("results.tsv", mmseqsOutput).
|
|
290
|
-
saveFile("results_with_header.tsv").
|
|
291
|
-
saveFileContent("results_with_header.tsv").
|
|
292
|
-
run()
|
|
293
|
-
|
|
294
|
-
mmseqsResultTsv := addHeaderRunResult.getFile("results_with_header.tsv")
|
|
295
|
-
mmseqsResultTsvContent := addHeaderRunResult.getFileContent("results_with_header.tsv")
|
|
296
|
-
|
|
297
|
-
emptyResults := len(text.trim_space(string(mmseqsResultTsvContent))) == 0
|
|
149
|
+
emptyResults := analysisResult.output("emptyResults")
|
|
150
|
+
mmseqsOutput := analysisResult.output("mmseqsOutput")
|
|
151
|
+
|
|
298
152
|
blockId := wf.blockId().getDataAsJson()
|
|
299
153
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
uniqueValuesMap :=
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
pt.col(col).maxBy(
|
|
327
|
-
pt.col("evalue").multiply(-1),
|
|
328
|
-
pt.col("bits")
|
|
329
|
-
).alias(col)
|
|
330
|
-
)
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
df = df.groupBy("target").agg(cols...)
|
|
334
|
-
|
|
335
|
-
// Add link column for linker pFrame (assayLinkerPframe)
|
|
336
|
-
df = df.withColumns(
|
|
337
|
-
pt.lit(1).cast("Int64").alias("link")
|
|
338
|
-
)
|
|
339
|
-
|
|
340
|
-
df.save("results.tsv")
|
|
341
|
-
|
|
342
|
-
// assay data import summary
|
|
343
|
-
assayDf := ptw.frame({
|
|
344
|
-
file: assayTsv,
|
|
345
|
-
xsvType: "tsv"
|
|
346
|
-
})
|
|
347
|
-
// import how many matches per assay sequence found
|
|
348
|
-
assayDf = assayDf.join(
|
|
349
|
-
df.groupBy("query").agg(
|
|
350
|
-
pt.col("target").count().alias("queryCount")
|
|
351
|
-
),
|
|
352
|
-
{
|
|
353
|
-
how: "left",
|
|
354
|
-
leftOn: "seqId",
|
|
355
|
-
rightOn: "query"
|
|
356
|
-
}
|
|
357
|
-
)
|
|
358
|
-
assayDf.save("assayData.tsv")
|
|
359
|
-
|
|
360
|
-
// clones
|
|
361
|
-
clonesDf := df.join(assayDf,
|
|
362
|
-
{
|
|
363
|
-
how: "left",
|
|
364
|
-
leftOn: "query",
|
|
365
|
-
rightOn: "seqId"
|
|
366
|
-
}
|
|
367
|
-
)
|
|
368
|
-
|
|
369
|
-
clonesDf.save("clonesData.tsv")
|
|
370
|
-
ptw = ptw.run()
|
|
371
|
-
|
|
372
|
-
//////// Extract unique values from String columns ////////
|
|
373
|
-
// Find all String columns
|
|
374
|
-
stringColumns := []
|
|
375
|
-
for h in args.importColumns {
|
|
376
|
-
if h.type == "String" && h.header != args.sequenceColumnHeader {
|
|
377
|
-
stringColumns = append(stringColumns, h.header)
|
|
378
|
-
}
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
// Extract unique values for all String columns
|
|
382
|
-
fileContentsMap := {}
|
|
383
|
-
if len(stringColumns) > 0 {
|
|
384
|
-
uniqueValuesWf := pt.workflow().mem("4GiB").cpu(1)
|
|
385
|
-
baseDf := uniqueValuesWf.frame({
|
|
386
|
-
file: ptw.getFile("assayData.tsv"),
|
|
387
|
-
xsvType: "tsv"
|
|
388
|
-
})
|
|
389
|
-
// Process each String column to extract unique values
|
|
390
|
-
for colHeader in stringColumns {
|
|
391
|
-
uniqueValuesDf := baseDf.select(pt.col(colHeader).alias("value")).groupBy("value").agg(pt.col("value").count().alias("_count"))
|
|
392
|
-
uniqueValuesDf = uniqueValuesDf.select("value")
|
|
393
|
-
fileName := "unique_values_" + strings.substituteSpecialCharacters(colHeader) + ".csv"
|
|
394
|
-
uniqueValuesDf.saveContent(fileName)
|
|
395
|
-
}
|
|
396
|
-
// Run once and collect all results
|
|
397
|
-
uniqueValuesResult := uniqueValuesWf.run()
|
|
398
|
-
for colHeader in stringColumns {
|
|
399
|
-
fileName := "unique_values_" + strings.substituteSpecialCharacters(colHeader) + ".csv"
|
|
400
|
-
fileContentsMap[colHeader] = uniqueValuesResult.getFileContent(fileName)
|
|
401
|
-
}
|
|
402
|
-
|
|
403
|
-
// Use subtemplate to extract content (getData() only works in subtemplates)
|
|
404
|
-
extractResult := render.create(extractUniqueValuesTpl, {
|
|
405
|
-
fileContents: fileContentsMap
|
|
406
|
-
})
|
|
407
|
-
uniqueValuesMap = extractResult.output("uniqueValuesMap")
|
|
408
|
-
}
|
|
409
|
-
|
|
410
|
-
//////// Building outputs & exports ////////
|
|
411
|
-
buildOutputsResult := render.createEphemeral(buildOutputsTpl, {
|
|
412
|
-
importColumns: args.importColumns,
|
|
413
|
-
selectedColumns: args.selectedColumns,
|
|
414
|
-
sequenceColumnHeader: args.sequenceColumnHeader,
|
|
415
|
-
sequenceColumnInfo: sequenceColumnInfo,
|
|
416
|
-
assaySequenceType: assaySequenceType,
|
|
417
|
-
blockId: blockId,
|
|
418
|
-
datasetSpec: datasetSpec,
|
|
419
|
-
bestAlignmentTsv: ptw.getFile("results.tsv"),
|
|
420
|
-
assayDataTsv: ptw.getFile("assayData.tsv"),
|
|
421
|
-
clonesDataTsv: ptw.getFile("clonesData.tsv"),
|
|
422
|
-
uniqueValuesMap: uniqueValuesMap,
|
|
423
|
-
settings: args.settings,
|
|
424
|
-
customBlockLabel: args.customBlockLabel,
|
|
425
|
-
defaultBlockLabel: args.defaultBlockLabel
|
|
426
|
-
})
|
|
427
|
-
assayPframe = buildOutputsResult.output("assayPframe")
|
|
428
|
-
epf = buildOutputsResult.output("epf")
|
|
429
|
-
assayLinkerPframe = buildOutputsResult.output("assayLinkerPframe")
|
|
430
|
-
}
|
|
154
|
+
bestAlignmentTsv := analysisResult.output("bestAlignmentTsv")
|
|
155
|
+
assayDataTsv := analysisResult.output("assayDataTsv")
|
|
156
|
+
clonesDataTsv := analysisResult.output("clonesDataTsv")
|
|
157
|
+
uniqueValuesMap := analysisResult.output("uniqueValuesMap")
|
|
158
|
+
columnsToImport := analysisResult.output("columnsToImport")
|
|
159
|
+
|
|
160
|
+
//////// Build outputs ////////
|
|
161
|
+
processOutputsResult := render.create(processOutputsTpl, {
|
|
162
|
+
emptyClonesInput: emptyClonesInput,
|
|
163
|
+
assayDataTsv: assayDataTsv,
|
|
164
|
+
bestAlignmentTsv: bestAlignmentTsv,
|
|
165
|
+
clonesDataTsv: clonesDataTsv,
|
|
166
|
+
columnsToImport: columnsToImport,
|
|
167
|
+
sequenceColumnHeader: args.sequenceColumnHeader,
|
|
168
|
+
sequenceColumnInfo: sequenceColumnInfo,
|
|
169
|
+
assaySequenceType: assaySequenceType,
|
|
170
|
+
datasetSpec: datasetSpec,
|
|
171
|
+
uniqueValuesMap: uniqueValuesMap,
|
|
172
|
+
settings: args.settings,
|
|
173
|
+
customBlockLabel: args.customBlockLabel,
|
|
174
|
+
defaultBlockLabel: args.defaultBlockLabel,
|
|
175
|
+
blockId: blockId
|
|
176
|
+
})
|
|
177
|
+
assayPframe := processOutputsResult.output("assayPframe")
|
|
178
|
+
epf := processOutputsResult.output("epf")
|
|
179
|
+
assayLinkerPframe := processOutputsResult.output("assayLinkerPframe")
|
|
431
180
|
|
|
432
|
-
|
|
181
|
+
return {
|
|
433
182
|
outputs: {
|
|
434
183
|
dataImportHandle: importFile.handle,
|
|
435
184
|
table: assayPframe,
|
|
436
185
|
assayLinkerPframe: assayLinkerPframe,
|
|
437
|
-
mmseqsOutput: mmseqsOutput,
|
|
438
|
-
emptyResults: emptyResults
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
186
|
+
mmseqsOutput: mmseqsOutput,
|
|
187
|
+
emptyResults: emptyResults,
|
|
188
|
+
emptyClonesInput: emptyClonesInput,
|
|
189
|
+
// Needed for deduplication
|
|
190
|
+
assayDataTsv: assayDataTsv,
|
|
191
|
+
bestAlignmentTsv: bestAlignmentTsv,
|
|
192
|
+
clonesDataTsv: clonesDataTsv,
|
|
193
|
+
uniqueValuesMap: uniqueValuesMap,
|
|
194
|
+
columnsToImport: columnsToImport
|
|
195
|
+
},
|
|
196
|
+
exports: {
|
|
443
197
|
epf: epf
|
|
444
198
|
}
|
|
445
199
|
}
|
|
446
|
-
|
|
447
|
-
return result
|
|
448
200
|
})
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
2
|
+
render := import("@platforma-sdk/workflow-tengo:render")
|
|
3
|
+
assets := import("@platforma-sdk/workflow-tengo:assets")
|
|
4
|
+
|
|
5
|
+
buildOutputsTpl := assets.importTemplate(":build-outputs")
|
|
6
|
+
|
|
7
|
+
self.defineOutputs("assayPframe", "epf", "assayLinkerPframe")
|
|
8
|
+
|
|
9
|
+
self.body(func(args) {
|
|
10
|
+
if args.emptyClonesInput {
|
|
11
|
+
return {
|
|
12
|
+
assayPframe: {},
|
|
13
|
+
epf: {},
|
|
14
|
+
assayLinkerPframe: {}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
buildOutputsResult := render.createEphemeral(buildOutputsTpl, {
|
|
19
|
+
columnsToImport: args.columnsToImport,
|
|
20
|
+
sequenceColumnHeader: args.sequenceColumnHeader,
|
|
21
|
+
sequenceColumnInfo: args.sequenceColumnInfo,
|
|
22
|
+
assaySequenceType: args.assaySequenceType,
|
|
23
|
+
datasetSpec: args.datasetSpec,
|
|
24
|
+
bestAlignmentTsv: args.bestAlignmentTsv,
|
|
25
|
+
assayDataTsv: args.assayDataTsv,
|
|
26
|
+
clonesDataTsv: args.clonesDataTsv,
|
|
27
|
+
uniqueValuesMap: args.uniqueValuesMap,
|
|
28
|
+
settings: args.settings,
|
|
29
|
+
customBlockLabel: args.customBlockLabel,
|
|
30
|
+
defaultBlockLabel: args.defaultBlockLabel,
|
|
31
|
+
blockId: args.blockId
|
|
32
|
+
})
|
|
33
|
+
|
|
34
|
+
return {
|
|
35
|
+
assayPframe: buildOutputsResult.output("assayPframe"),
|
|
36
|
+
epf: buildOutputsResult.output("epf"),
|
|
37
|
+
assayLinkerPframe: buildOutputsResult.output("assayLinkerPframe")
|
|
38
|
+
}
|
|
39
|
+
})
|
|
@@ -18,11 +18,11 @@ self.body(func(args) {
|
|
|
18
18
|
|
|
19
19
|
mem := "8GiB"
|
|
20
20
|
cpu := 1
|
|
21
|
-
if !is_undefined(args.mem) {
|
|
22
|
-
mem = string(args.mem) + "GiB"
|
|
21
|
+
if !is_undefined(args.metaInputs.mem) {
|
|
22
|
+
mem = string(args.metaInputs.mem) + "GiB"
|
|
23
23
|
}
|
|
24
|
-
if !is_undefined(args.cpu) {
|
|
25
|
-
cpu = args.cpu
|
|
24
|
+
if !is_undefined(args.metaInputs.cpu) {
|
|
25
|
+
cpu = args.metaInputs.cpu
|
|
26
26
|
}
|
|
27
27
|
|
|
28
28
|
mmseqs := exec.builder().
|