@platforma-open/milaboratories.immune-assay-data.workflow 1.10.0 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
   WARN  Issue while reading "/home/runner/work/immune-assay-data/immune-assay-data/.npmrc". Failed to replace env in config: ${NPMJS_TOKEN}
2
2
 
3
- > @platforma-open/milaboratories.immune-assay-data.workflow@1.10.0 build /home/runner/work/immune-assay-data/immune-assay-data/workflow
3
+ > @platforma-open/milaboratories.immune-assay-data.workflow@1.12.0 build /home/runner/work/immune-assay-data/immune-assay-data/workflow
4
4
  > rm -rf dist && pl-tengo check && pl-tengo build
5
5
 
6
6
  Processing "src/analysis.tpl.tengo"...
package/CHANGELOG.md CHANGED
@@ -1,5 +1,24 @@
1
1
  # @platforma-open/milaboratories.immune-assay-data.workflow
2
2
 
3
+ ## 1.12.0
4
+
5
+ ### Minor Changes
6
+
7
+ - 5f43c2b: Improved scalability for large datasets
8
+
9
+ ### Patch Changes
10
+
11
+ - Updated dependencies [5f43c2b]
12
+ - @platforma-open/milaboratories.immune-assay-data.coverage-mode-calc@1.3.0
13
+ - @platforma-open/milaboratories.immune-assay-data.merge-results@1.1.0
14
+ - @platforma-open/milaboratories.immune-assay-data.split-fasta@1.1.0
15
+
16
+ ## 1.11.0
17
+
18
+ ### Minor Changes
19
+
20
+ - ac74170: Improved performance on large datasets, eliminating disk and memory pressure
21
+
3
22
  ## 1.10.0
4
23
 
5
24
  ### Minor Changes
Binary file
Binary file
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@platforma-open/milaboratories.immune-assay-data.workflow",
3
- "version": "1.10.0",
3
+ "version": "1.12.0",
4
4
  "type": "module",
5
5
  "description": "Tengo-based template",
6
6
  "dependencies": {
@@ -8,8 +8,10 @@
8
8
  "@platforma-open/soedinglab.software-mmseqs2": "1.18.3",
9
9
  "@platforma-open/milaboratories.immune-assay-data.prepare-fasta": "1.1.3",
10
10
  "@platforma-open/milaboratories.immune-assay-data.add-header": "1.1.3",
11
- "@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc": "1.2.0",
11
+ "@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc": "1.3.0",
12
12
  "@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv": "1.1.3",
13
+ "@platforma-open/milaboratories.immune-assay-data.merge-results": "1.1.0",
14
+ "@platforma-open/milaboratories.immune-assay-data.split-fasta": "1.1.0",
13
15
  "@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv": "1.1.0",
14
16
  "@platforma-open/milaboratories.immune-assay-data.check-content-empty": "1.0.1"
15
17
  },
@@ -9,10 +9,11 @@ render := import("@platforma-sdk/workflow-tengo:render")
9
9
 
10
10
  prepareFastaSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.prepare-fasta:main")
11
11
  fastaToTsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv:main")
12
- addHeaderSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.add-header:main")
13
12
  covModeCalcSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc:main")
14
13
  xlsxToCsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv:main")
15
14
  checkContentEmptySw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.check-content-empty:main")
15
+ splitFastaSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.split-fasta:main")
16
+ mergeResultsSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.merge-results:main")
16
17
 
17
18
  runAlignmentTpl := assets.importTemplate(":run-alignment")
18
19
  checkContentEmptyTpl := assets.importTemplate(":check-content-empty")
@@ -232,6 +233,27 @@ self.body(func(args) {
232
233
 
233
234
  covMode := coverageModeRun.getFileContent("coverage_mode.txt")
234
235
 
236
+ // Split clone FASTA into 2 equal chunks to limit mmseqs2 index disk usage.
237
+ // Running two searches against 25M sequences each uses half the peak disk
238
+ // of a single 50M search. E-values are normalized to the full database size.
239
+ splitRun := exec.builder().
240
+ software(splitFastaSw).
241
+ mem("8GiB").
242
+ cpu(1).
243
+ addFile("clones.fasta", clonesFasta).
244
+ arg("-i").arg("clones.fasta").
245
+ arg("--chunk1").arg("chunk_1.fasta").
246
+ arg("--chunk2").arg("chunk_2.fasta").
247
+ arg("--counts").arg("counts.json").
248
+ saveFile("chunk_1.fasta").
249
+ saveFile("chunk_2.fasta").
250
+ saveFile("counts.json").
251
+ run()
252
+
253
+ chunk1Fasta := splitRun.getFile("chunk_1.fasta")
254
+ chunk2Fasta := splitRun.getFile("chunk_2.fasta")
255
+ splitCounts := splitRun.getFile("counts.json")
256
+
235
257
  // MMseqs2 Alignment
236
258
  mmseqsSearchType := "0"
237
259
  if targetSequenceType == "aminoacid" && assaySequenceType == "aminoacid" {
@@ -248,13 +270,28 @@ self.body(func(args) {
248
270
  mmseqsSearchType = "2"
249
271
  }
250
272
 
251
- runMmseqs := render.create(runAlignmentTpl, {
273
+ runMmseqs1 := render.create(runAlignmentTpl, {
274
+ covMode: covMode,
275
+ mmseqsSearchType: mmseqsSearchType,
276
+ coverageThreshold: coverageThreshold,
277
+ identityThreshold: identityThreshold,
278
+ similarityType: similarityType,
279
+ clonesFasta: chunk1Fasta,
280
+ assayFasta: assayFasta,
281
+ lessSensitive: lessSensitive
282
+ }, {
283
+ metaInputs: {
284
+ mem: mem,
285
+ cpu: cpu
286
+ }
287
+ })
288
+ runMmseqs2 := render.create(runAlignmentTpl, {
252
289
  covMode: covMode,
253
290
  mmseqsSearchType: mmseqsSearchType,
254
291
  coverageThreshold: coverageThreshold,
255
292
  identityThreshold: identityThreshold,
256
293
  similarityType: similarityType,
257
- clonesFasta: clonesFasta,
294
+ clonesFasta: chunk2Fasta,
258
295
  assayFasta: assayFasta,
259
296
  lessSensitive: lessSensitive
260
297
  }, {
@@ -264,20 +301,25 @@ self.body(func(args) {
264
301
  }
265
302
  })
266
303
 
267
- mmseqsOutput := runMmseqs.output("mmseqsOutput")
304
+ mmseqsOutput1 := runMmseqs1.output("mmseqsOutput")
305
+ mmseqsOutput2 := runMmseqs2.output("mmseqsOutput")
268
306
 
269
- // @TODO remove header stuff and replace with pt when available (!)
270
- addHeaderRun := exec.builder().
271
- software(addHeaderSw).
307
+ // Merge both raw results, add header, and normalize e-values to full database size
308
+ mergeRun := exec.builder().
309
+ software(mergeResultsSw).
272
310
  mem("16GiB").
273
311
  cpu(1).
274
- arg("-i").arg("results.tsv").
312
+ addFile("results_1.tsv", mmseqsOutput1).
313
+ addFile("results_2.tsv", mmseqsOutput2).
314
+ addFile("counts.json", splitCounts).
315
+ arg("-i1").arg("results_1.tsv").
316
+ arg("-i2").arg("results_2.tsv").
317
+ arg("--counts").arg("counts.json").
275
318
  arg("-o").arg("results_with_header.tsv").
276
- addFile("results.tsv", mmseqsOutput).
277
319
  saveFile("results_with_header.tsv").
278
320
  run()
279
321
 
280
- mmseqsResultTsv := addHeaderRun.getFile("results_with_header.tsv")
322
+ mmseqsResultTsv := mergeRun.getFile("results_with_header.tsv")
281
323
 
282
324
  // Check if results are empty (only header line or nothing)
283
325
  checkResultsRun := exec.builder().
@@ -297,7 +339,7 @@ self.body(func(args) {
297
339
  emptyResults := checkResult.output("result")
298
340
 
299
341
  result := {
300
- mmseqsOutput: mmseqsOutput,
342
+ mmseqsOutput: mmseqsResultTsv,
301
343
  emptyResults: emptyResults
302
344
  }
303
345
 
@@ -27,10 +27,6 @@ self.body(func(args) {
27
27
  cpu = args.metaInputs.cpu
28
28
  }
29
29
 
30
- // Cap mmseqs2 in-RAM usage to 80% of allocated memory so it splits to disk
31
- // rather than getting OOM-killed by the kernel on large datasets.
32
- memLimit := "{int(ceil(system.ram.gb * 0.8))}" + "G"
33
-
34
30
  mmseqs := exec.builder().
35
31
  software(mmseqsSw).
36
32
  mem(mem).
@@ -40,7 +36,6 @@ self.body(func(args) {
40
36
  arg("clones.fasta").
41
37
  arg("results.tsv").
42
38
  arg("tmp").
43
- arg("--split-memory-limit").argWithVar(memLimit).
44
39
  arg("--threads").arg(string(cpu)).
45
40
  arg("--max-seqs").arg("10000").
46
41
  arg("--search-type").arg(mmseqsSearchType).