@platforma-open/milaboratories.immune-assay-data.workflow 1.11.0 → 1.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
   WARN  Issue while reading "/home/runner/work/immune-assay-data/immune-assay-data/.npmrc". Failed to replace env in config: ${NPMJS_TOKEN}
2
2
 
3
- > @platforma-open/milaboratories.immune-assay-data.workflow@1.11.0 build /home/runner/work/immune-assay-data/immune-assay-data/workflow
3
+ > @platforma-open/milaboratories.immune-assay-data.workflow@1.12.1 build /home/runner/work/immune-assay-data/immune-assay-data/workflow
4
4
  > rm -rf dist && pl-tengo check && pl-tengo build
5
5
 
6
6
  Processing "src/analysis.tpl.tengo"...
package/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
1
1
  # @platforma-open/milaboratories.immune-assay-data.workflow
2
2
 
3
+ ## 1.12.1
4
+
5
+ ### Patch Changes
6
+
7
+ - Updated dependencies [2eff9dc]
8
+ - @platforma-open/milaboratories.immune-assay-data.split-fasta@1.2.0
9
+
10
+ ## 1.12.0
11
+
12
+ ### Minor Changes
13
+
14
+ - 5f43c2b: Improved scalability for large datasets
15
+
16
+ ### Patch Changes
17
+
18
+ - Updated dependencies [5f43c2b]
19
+ - @platforma-open/milaboratories.immune-assay-data.coverage-mode-calc@1.3.0
20
+ - @platforma-open/milaboratories.immune-assay-data.merge-results@1.1.0
21
+ - @platforma-open/milaboratories.immune-assay-data.split-fasta@1.1.0
22
+
3
23
  ## 1.11.0
4
24
 
5
25
  ### Minor Changes
Binary file
Binary file
Binary file
Binary file
Binary file
package/package.json CHANGED
@@ -1,17 +1,19 @@
1
1
  {
2
2
  "name": "@platforma-open/milaboratories.immune-assay-data.workflow",
3
- "version": "1.11.0",
3
+ "version": "1.12.1",
4
4
  "type": "module",
5
5
  "description": "Tengo-based template",
6
6
  "dependencies": {
7
7
  "@platforma-sdk/workflow-tengo": "5.11.0",
8
8
  "@platforma-open/soedinglab.software-mmseqs2": "1.18.3",
9
+ "@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv": "1.1.3",
9
10
  "@platforma-open/milaboratories.immune-assay-data.add-header": "1.1.3",
10
- "@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc": "1.2.0",
11
- "@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv": "1.1.0",
12
11
  "@platforma-open/milaboratories.immune-assay-data.prepare-fasta": "1.1.3",
13
- "@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv": "1.1.3",
14
- "@platforma-open/milaboratories.immune-assay-data.check-content-empty": "1.0.1"
12
+ "@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv": "1.1.0",
13
+ "@platforma-open/milaboratories.immune-assay-data.check-content-empty": "1.0.1",
14
+ "@platforma-open/milaboratories.immune-assay-data.split-fasta": "1.2.0",
15
+ "@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc": "1.3.0",
16
+ "@platforma-open/milaboratories.immune-assay-data.merge-results": "1.1.0"
15
17
  },
16
18
  "devDependencies": {
17
19
  "@platforma-sdk/tengo-builder": "2.5.5"
@@ -9,10 +9,11 @@ render := import("@platforma-sdk/workflow-tengo:render")
9
9
 
10
10
  prepareFastaSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.prepare-fasta:main")
11
11
  fastaToTsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.fasta-to-tsv:main")
12
- addHeaderSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.add-header:main")
13
12
  covModeCalcSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.coverage-mode-calc:main")
14
13
  xlsxToCsvSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.xlsx-to-csv:main")
15
14
  checkContentEmptySw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.check-content-empty:main")
15
+ splitFastaSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.split-fasta:main")
16
+ mergeResultsSw := assets.importSoftware("@platforma-open/milaboratories.immune-assay-data.merge-results:main")
16
17
 
17
18
  runAlignmentTpl := assets.importTemplate(":run-alignment")
18
19
  checkContentEmptyTpl := assets.importTemplate(":check-content-empty")
@@ -232,6 +233,27 @@ self.body(func(args) {
232
233
 
233
234
  covMode := coverageModeRun.getFileContent("coverage_mode.txt")
234
235
 
236
+ // Split clone FASTA into 2 equal chunks to limit mmseqs2 index disk usage.
237
+ // Running two searches against 25M sequences each uses half the peak disk
238
+ // of a single 50M search. E-values are normalized to the full database size.
239
+ splitRun := exec.builder().
240
+ software(splitFastaSw).
241
+ mem("8GiB").
242
+ cpu(1).
243
+ addFile("clones.fasta", clonesFasta).
244
+ arg("-i").arg("clones.fasta").
245
+ arg("--chunk1").arg("chunk_1.fasta").
246
+ arg("--chunk2").arg("chunk_2.fasta").
247
+ arg("--counts").arg("counts.json").
248
+ saveFile("chunk_1.fasta").
249
+ saveFile("chunk_2.fasta").
250
+ saveFile("counts.json").
251
+ run()
252
+
253
+ chunk1Fasta := splitRun.getFile("chunk_1.fasta")
254
+ chunk2Fasta := splitRun.getFile("chunk_2.fasta")
255
+ splitCounts := splitRun.getFile("counts.json")
256
+
235
257
  // MMseqs2 Alignment
236
258
  mmseqsSearchType := "0"
237
259
  if targetSequenceType == "aminoacid" && assaySequenceType == "aminoacid" {
@@ -248,13 +270,28 @@ self.body(func(args) {
248
270
  mmseqsSearchType = "2"
249
271
  }
250
272
 
251
- runMmseqs := render.create(runAlignmentTpl, {
273
+ runMmseqs1 := render.create(runAlignmentTpl, {
274
+ covMode: covMode,
275
+ mmseqsSearchType: mmseqsSearchType,
276
+ coverageThreshold: coverageThreshold,
277
+ identityThreshold: identityThreshold,
278
+ similarityType: similarityType,
279
+ clonesFasta: chunk1Fasta,
280
+ assayFasta: assayFasta,
281
+ lessSensitive: lessSensitive
282
+ }, {
283
+ metaInputs: {
284
+ mem: mem,
285
+ cpu: cpu
286
+ }
287
+ })
288
+ runMmseqs2 := render.create(runAlignmentTpl, {
252
289
  covMode: covMode,
253
290
  mmseqsSearchType: mmseqsSearchType,
254
291
  coverageThreshold: coverageThreshold,
255
292
  identityThreshold: identityThreshold,
256
293
  similarityType: similarityType,
257
- clonesFasta: clonesFasta,
294
+ clonesFasta: chunk2Fasta,
258
295
  assayFasta: assayFasta,
259
296
  lessSensitive: lessSensitive
260
297
  }, {
@@ -264,20 +301,25 @@ self.body(func(args) {
264
301
  }
265
302
  })
266
303
 
267
- mmseqsOutput := runMmseqs.output("mmseqsOutput")
304
+ mmseqsOutput1 := runMmseqs1.output("mmseqsOutput")
305
+ mmseqsOutput2 := runMmseqs2.output("mmseqsOutput")
268
306
 
269
- // @TODO remove header stuff and replace with pt when available (!)
270
- addHeaderRun := exec.builder().
271
- software(addHeaderSw).
307
+ // Merge both raw results, add header, and normalize e-values to full database size
308
+ mergeRun := exec.builder().
309
+ software(mergeResultsSw).
272
310
  mem("16GiB").
273
311
  cpu(1).
274
- arg("-i").arg("results.tsv").
312
+ addFile("results_1.tsv", mmseqsOutput1).
313
+ addFile("results_2.tsv", mmseqsOutput2).
314
+ addFile("counts.json", splitCounts).
315
+ arg("-i1").arg("results_1.tsv").
316
+ arg("-i2").arg("results_2.tsv").
317
+ arg("--counts").arg("counts.json").
275
318
  arg("-o").arg("results_with_header.tsv").
276
- addFile("results.tsv", mmseqsOutput).
277
319
  saveFile("results_with_header.tsv").
278
320
  run()
279
321
 
280
- mmseqsResultTsv := addHeaderRun.getFile("results_with_header.tsv")
322
+ mmseqsResultTsv := mergeRun.getFile("results_with_header.tsv")
281
323
 
282
324
  // Check if results are empty (only header line or nothing)
283
325
  checkResultsRun := exec.builder().
@@ -297,7 +339,7 @@ self.body(func(args) {
297
339
  emptyResults := checkResult.output("result")
298
340
 
299
341
  result := {
300
- mmseqsOutput: mmseqsOutput,
342
+ mmseqsOutput: mmseqsResultTsv,
301
343
  emptyResults: emptyResults
302
344
  }
303
345
 
@@ -315,7 +357,7 @@ self.body(func(args) {
315
357
  )
316
358
 
317
359
  cols := []
318
- for _, col in ["bits", "evalue", "target", "pident", "alnlen", "mismatch",
360
+ for _, col in ["bits", "evalue", "query", "pident", "alnlen", "mismatch",
319
361
  "gapopen", "qstart", "qend", "tstart", "tend"] {
320
362
  cols = append(cols,
321
363
  pt.col(col).maxBy(
@@ -325,7 +367,7 @@ self.body(func(args) {
325
367
  )
326
368
  }
327
369
 
328
- dfRes = dfRes.groupBy("query").agg(cols...)
370
+ dfRes = dfRes.groupBy("target").agg(cols...)
329
371
  // Add link column for linker pFrame (assayLinkerPframe)
330
372
  dfRes = dfRes.withColumns(
331
373
  pt.lit(1).cast("Int64").alias("link")
@@ -340,13 +382,13 @@ self.body(func(args) {
340
382
 
341
383
  // import the number of matches found for each assay sequence
342
384
  assayDf = assayDf.join(
343
- dfRes.groupBy("target").agg(
344
- pt.col("query").count().alias("queryCount")
385
+ dfRes.groupBy("query").agg(
386
+ pt.col("target").count().alias("queryCount")
345
387
  ),
346
388
  {
347
389
  how: "left",
348
390
  leftOn: "seqId",
349
- rightOn: "target"
391
+ rightOn: "query"
350
392
  }
351
393
  )
352
394
  assayDf.save("assay_data.tsv")
@@ -355,7 +397,7 @@ self.body(func(args) {
355
397
  clonesDf := dfRes.join(assayDf,
356
398
  {
357
399
  how: "left",
358
- leftOn: "target",
400
+ leftOn: "query",
359
401
  rightOn: "seqId"
360
402
  }
361
403
  )
@@ -174,7 +174,7 @@ self.body(func(inputs) {
174
174
  cloneImportResults := xsv.importFile(
175
175
  inputs.clonesDataTsv, "tsv", {
176
176
  axes: [{
177
- column: "query",
177
+ column: "target",
178
178
  spec: inputs.datasetSpec.axesSpec[1]
179
179
  }],
180
180
  columns: cloneColumns,
@@ -189,11 +189,11 @@ self.body(func(inputs) {
189
189
  inputs.bestAlignmentTsv, "tsv", {
190
190
  axes: [
191
191
  {
192
- column: "query",
192
+ column: "target",
193
193
  spec: inputs.datasetSpec.axesSpec[1]
194
194
  },
195
195
  {
196
- column: "target",
196
+ column: "query",
197
197
  spec: {
198
198
  name: "pl7.app/vdj/assay/sequenceId",
199
199
  type: "String",
@@ -32,8 +32,8 @@ self.body(func(args) {
32
32
  mem(mem).
33
33
  cpu(cpu).
34
34
  arg("easy-search").
35
- arg("clones.fasta").
36
35
  arg("assay.fasta").
36
+ arg("clones.fasta").
37
37
  arg("results.tsv").
38
38
  arg("tmp").
39
39
  arg("--threads").arg(string(cpu)).