@platforma-open/milaboratories.top-antibodies.workflow 1.13.1 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,12 +7,16 @@ pframes := import("@platforma-sdk/workflow-tengo:pframes")
 slices := import("@platforma-sdk/workflow-tengo:slices")
 render := import("@platforma-sdk/workflow-tengo:render")
 ll := import("@platforma-sdk/workflow-tengo:ll")
-kabatConv := import(":pf-kabat-conv")
+pt := import("@platforma-sdk/workflow-tengo:pt")
+text := import("text")
+json := import("json")

-spectratypeConv := import(":pf-spectratype-conv")
-vjUsageConv := import(":pf-vj-usage-conv")
+dataUtils := import(":libs.data-utils")
+spectratypeConv := import(":libs.pf-spectratype-conv")
+vjUsageConv := import(":libs.pf-vj-usage-conv")
+sampledColsConv := import(":libs.sampled-cols-conv")
+kabatConv := import(":libs.pf-kabat-conv")

-filterAndSampleTpl := assets.importTemplate(":filter-and-sample")

 wf.prepare(func(args){
   if is_undefined(args.inputAnchor) {
@@ -123,252 +127,105 @@ wf.body(func(args) {

   // Needed conditional variable
   isSingleCell := datasetSpec.axesSpec[1].name == "pl7.app/vdj/scClonotypeKey"
-
-  ////////// Clonotype Filtering //////////
-  // Build clonotype table
-  cloneTable := pframes.csvFileBuilder()
-  cloneTable.setAxisHeader(datasetSpec.axesSpec[1], "clonotypeKey")
-
-  // Add Filters to table
-  addedAxes := []
-  filterMap := {}
-  rankingMap := {}
-  addedCols := false
-  if len(args.filters) > 0 {
-    for i, filter in args.filters {
-      if filter.value != undefined {
-        // Columns added here might also be in ranking list, so we add default IDs
-        cloneTable.add(columns.getColumn(filter.value.column),
-          {header: "Filter_" + string(i), id: "filter_" + string(i)})
-        addedCols = true
-        // Store reference value and filter type associated to this column
-        filterMap["Filter_" + string(i)] = filter.filter
-        filterMap["Filter_" + string(i)]["valueType"] = columns.getSpec(filter.value.column).valueType
-
-        // If column does not have main anchor axis we have to include theirs
-        colsSpec := columns.getSpec(filter.value.column)
-        axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
-        if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
-          for na, ax in colsSpec.axesSpec {
-            if ax.name != datasetSpec.axesSpec[1].name {
-              cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
-              addedAxes = append(addedAxes, ax.name)
-            }
-          }
-        }
-      }
-    }
-  }
-
-  // Add ranking columns to table
-  validRanks := false
-  if len(args.rankingOrder) > 0 {
-    for i, col in args.rankingOrder {
-      if col.value != undefined {
-        validRanks = true
-        cloneTable.add(columns.getColumn(col.value.column), {header: "Col" + string(i)})
-        addedCols = true
-        // Store ranking order for this column
-        rankingMap["Col" + string(i)] = col.rankingOrder
-
-        // If column does not have main anchor axis we have to include theirs
-        colsSpec := columns.getSpec(col.value.column)
-        axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
-        if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
-          for na, ax in colsSpec.axesSpec {
-            if ax.name != datasetSpec.axesSpec[1].name && !slices.hasElement(addedAxes, ax.name) {
-              cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
-            }
-          }
-        }
-      }
-    }
-  }
-  // If we didn't have any ranking column or all where not valid
-  if !validRanks {
-    // @TODO: this is a temporal patch for issue where rankingOrderDefault
-    // are not defined by the time prerun works
-    if args.rankingOrderDefault.value != undefined {
-      i := 0
-      cloneTable.add(columns.getColumn(args.rankingOrderDefault.value.column), {header: "Col" + string(i)})
-      addedCols = true
-      // Store default ranking order
-      rankingMap["Col" + string(i)] = args.rankingOrderDefault.rankingOrder
-
-      // If column does not have main anchor axis we have to include theirs
-      colsSpec := columns.getSpec(args.rankingOrderDefault.value.column)
-      axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
-      if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
-        for na, ax in colsSpec.axesSpec {
-          if ax.name != datasetSpec.axesSpec[1].name {
-            cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
-          }
-        }
-      }
-    }
-  }

-  // Get linker columns if needed
-  linkerAxisSpec := {}
-  if len(columns.getColumns("linkers")) > 0 {
-    for i, col in columns.getColumns("linkers") {
-      if datasetSpec.axesSpec[1].name == col.spec.axesSpec[1].name {
-        cloneTable.add(col, {header: "linker." + string(i)})
-        cloneTable.setAxisHeader(col.spec.axesSpec[0], "cluster_" + string(i))
-        linkerAxisSpec["cluster_" + string(i)] = col.spec.axesSpec[0]
-      } else if datasetSpec.axesSpec[1].name == col.spec.axesSpec[0].name {
-        cloneTable.add(col, {header: "linker." + string(i)})
-        cloneTable.setAxisHeader(col.spec.axesSpec[1], "cluster_" + string(i))
-        linkerAxisSpec["cluster_" + string(i)] = col.spec.axesSpec[1]
-      }
-      addedCols = true
-    }
-  }
-
-  // Add cluster size columns if available
-  if len(columns.getColumns("clusterSizes")) > 0 {
-    for i, col in columns.getColumns("clusterSizes") {
-      cloneTable.add(col, {header: "clusterSize." + string(i)})
-      addedCols = true
-      // Add the cluster axis header
-      for axisIdx, axis in col.spec.axesSpec {
-        if axis.name != datasetSpec.axesSpec[1].name {
-          cloneTable.setAxisHeader(axis, "clusterAxis_" + string(i) + "_" + string(axisIdx))
-        }
-      }
-    }
-  }
+  ////////// Clonotype Filtering //////////
+  clonotypeData := dataUtils.prepareClonotypeData(args.filters, args.rankingOrder, args.rankingOrderDefault, columns, datasetSpec)
+  structuredMap := clonotypeData.structuredMap
+  axisRenames := clonotypeData.axisRenames
+  filterMap := clonotypeData.filterMap
+  rankingMap := clonotypeData.rankingMap
+  addedCols := clonotypeData.addedCols
+  linkerAxisSpec := clonotypeData.linkerAxisSpec

   // Continue only if we have at least a column
   // This condition prevents temporal intermittent error while filters are
   // being processed and possibly in other situations too
   if addedCols {
-    cloneTable.mem("16GiB")
-    cloneTable.cpu(1)
-    cloneTable = cloneTable.build()
-
-    // Use ender.create to call the filter-clonotypes template
-    filterSampleResult := render.create(filterAndSampleTpl, {
-      inputAnchor: args.inputAnchor,
-      cloneTable: cloneTable,
-      rankingOrder: args.rankingOrder,
-      rankingOrderDefault: args.rankingOrderDefault,
-      filters: args.filters,
-      filterMap: filterMap,
-      rankingMap: rankingMap,
-      datasetSpec: datasetSpec,
-      topClonotypes: args.topClonotypes
-    })
-
-    // Get the filtered clonotypes from the template result
-    outputs["sampledRows"] = filterSampleResult.output("sampledRows", 24 * 60 * 60 * 1000)
-
-    // Get the filtered and sampled clonotypes P-frame and CSV from the template result
-    finalClonotypesCsv := filterSampleResult.output("finalClonotypesCsv", 24 * 60 * 60 * 1000)
-    // outputs["sampledRows"] = filterSampleResult.output("sampledRows", 24 * 60 * 60 * 1000)
-
-    ////////// CDR3 Length Calculation //////////
+    // Run ptabler-based filtering (matches filter.py logic)
+    filterResult := dataUtils.filterClonotypes(structuredMap, axisRenames, filterMap, datasetSpec)
+    // Run sampling script if topClonotypes is defined
+    finalClonotypesParquet := undefined
+    if args.topClonotypes != undefined {
+      sampleClones := exec.builder().
+        software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.sample-clonotypes:main")).
+        mem("16GiB").
+        cpu(1).
+        addFile("filteredClonotypes.parquet", filterResult.filteredParquet).
+        arg("--input").arg("filteredClonotypes.parquet").
+        arg("--n").arg(string(topClonotypes)).
+        arg("--ranking-map").arg(string(json.encode(rankingMap))).
+        arg("--out").arg("sampledClonotypes_top.csv").
+        arg("--out-parquet").arg("sampledClonotypes_top.parquet").
+        saveFile("sampledClonotypes_top.csv").
+        saveFile("sampledClonotypes_top.parquet").
+        printErrStreamToStdout().
+        saveStdoutContent().
+        cache(24 * 60 * 60 * 1000).
+        run()

-    cdr3SeqTable := pframes.tsvFileBuilder()
-    cdr3SeqTable.setAxisHeader(datasetSpec.axesSpec[1].name, "clonotypeKey")
-
-    // Must deal with multiple CDR3 sequences (two for each cell in single cell data)
-    // Chain will be added in the header as cdr3Sequence.chain and used in python script
-    // Notice chain is in spec.domain for single cell data and spec.axesSpec[0].domain for bulk data
-
-    // Helper function to add chain information to the headers dynamically
-    chainMapping := {
-      "IG": { "A": "Heavy", "B": "Light" },
-      "TCRAB": { "A": "TRA", "B": "TRB" },
-      "TCRGD": { "A": "TRG", "B": "TRD" }
-    }
-
-    makeHeaderName := func(col, baseHeaderName, isSingleCell) {
-      if isSingleCell {
-        chain := col.spec.domain["pl7.app/vdj/scClonotypeChain"] // e.g., "A", "B"
-        receptor := col.spec.axesSpec[0].domain["pl7.app/vdj/receptor"] // e.g., "IG", "TCRAB", "TCRGD"
-        chainLabel := chainMapping[receptor][chain]
-        return baseHeaderName + "." + chainLabel // e.g., "cdr3Sequence.Heavy"
-      } else {
-        // For bulk, if chain info is available (e.g. IGH, IGK, IGL)
-        chainFromDomain := col.spec.axesSpec[0].domain["pl7.app/vdj/chain"] // e.g. "IGH", "IGK"
-        if chainFromDomain != undefined {
-          return baseHeaderName + "." + chainFromDomain // e.g., "cdr3Sequence.IGH"
-        }
-      }
-      return baseHeaderName
-    };
-
-    // Process CDR3 sequences
-    cdr3Sequences := columns.getColumns("cdr3Sequences")
-
-    for col in cdr3Sequences {
-      headerName := makeHeaderName(col, "cdr3Sequence", isSingleCell)
-      if isSingleCell {
-        if col.spec.domain["pl7.app/vdj/scClonotypeChain/index"] == "primary" {
-          cdr3SeqTable.add(col, {header: headerName})
-        }
-      } else {
-        cdr3SeqTable.add(col, {header: headerName})
-      }
+      finalClonotypesCsv := sampleClones.getFile("sampledClonotypes_top.csv")
+      sampledColumnsPf := xsv.importFile(finalClonotypesCsv, "csv",
+        sampledColsConv.getColumns(datasetSpec, true), {cpu: 1, mem: "4GiB"})
+      outputs["sampledRows"] = pframes.exportFrame(sampledColumnsPf)
+      finalClonotypesParquet = sampleClones.getFile("sampledClonotypes_top.parquet")
+    } else {
+      // No sampling, use filtered parquet as final output
+      finalClonotypesParquet = filterResult.filteredParquet
+      outputs["sampledRows"] = pframes.exportFrame(filterResult.pframe)
     }
-
-    // Process V genes
-    vGenes := columns.getColumns("VGenes")
-
-    for col in vGenes {
-      headerName := makeHeaderName(col, "vGene", isSingleCell)
-      cdr3SeqTable.add(col, {header: headerName})
+    ////////// CDR3 Length Calculation //////////
+    cdr3Data := dataUtils.prepareCdr3Data(columns, datasetSpec, isSingleCell)
+    cdr3SeqStructuredMap := cdr3Data.structuredMap
+    cdr3SeqAxisRenames := cdr3Data.axisRenames
+
+    // Build ptabler workflow
+    wfCdr3Seq := pt.workflow().cacheInputs(24 * 60 * 60 * 1000)
+    cdr3SeqProjection := []
+    for origAxis, aliasName in cdr3SeqAxisRenames {
+      cdr3SeqProjection = append(cdr3SeqProjection, pt.axis(origAxis).alias(aliasName))
     }
-
-    // Process J genes
-    jGenes := columns.getColumns("JGenes")
-
-    for col in jGenes {
-      headerName := makeHeaderName(col, "jGene", isSingleCell)
-      cdr3SeqTable.add(col, {header: headerName})
+    for colName, _ in cdr3SeqStructuredMap {
+      cdr3SeqProjection = append(cdr3SeqProjection, pt.col(colName))
     }

-    cdr3SeqTable.mem("16GiB")
-    cdr3SeqTable.cpu(1)
-    cdr3SeqTableBuilt := cdr3SeqTable.build()
+    dfCdr3Seq := wfCdr3Seq.frame(pt.p.full(cdr3SeqStructuredMap)).select(cdr3SeqProjection...)
+    dfCdr3Seq.save("cdr3_sequences.parquet")
+    cdr3SeqResult := wfCdr3Seq.run()
+    cdr3SeqParquet := cdr3SeqResult.getFile("cdr3_sequences.parquet")

     cdr3VspectratypeCmd := exec.builder().
       software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.spectratype:main")).
       mem("16GiB").
       cpu(1).
-      addFile("cdr3_sequences_input.tsv", cdr3SeqTableBuilt).
-      arg("--input_tsv").arg("cdr3_sequences_input.tsv").
+      addFile("cdr3_sequences_input.parquet", cdr3SeqParquet).
+      arg("--input").arg("cdr3_sequences_input.parquet").
       arg("--spectratype_tsv").arg("spectratype.tsv").
       arg("--vj_usage_tsv").arg("vj_usage.tsv") // no dot here

     // Add top clonotypes argument and file to the builder if provided
-    if finalClonotypesCsv != undefined {
+    if finalClonotypesParquet != undefined {
       cdr3VspectratypeCmd = cdr3VspectratypeCmd.
-        arg("--final_clonotypes_csv").arg("finalClonotypes.csv").
-        addFile("finalClonotypes.csv", finalClonotypesCsv)
+        arg("--final_clonotypes_parquet").arg("finalClonotypes.parquet").
+        addFile("finalClonotypes.parquet", finalClonotypesParquet)
     }

     cdr3VspectratypeCmd = cdr3VspectratypeCmd. // continue building the command
       saveFile("spectratype.tsv").
       saveFile("vj_usage.tsv").
       printErrStreamToStdout().
-      saveStdoutContent().
       cache(24 * 60 * 60 * 1000).
       run()

-
     // Spectratype PFrame structure is [chain][cdr3Length][vGene] -> count
-
     cdr3VspectratypePf := xsv.importFile(cdr3VspectratypeCmd.getFile("spectratype.tsv"),
       "tsv", spectratypeConv.getColumns(),
-      {cpu: 1, mem: "16GiB"})
+      {cpu: 1, mem: "4GiB"})
     outputs["cdr3VspectratypePf"] = pframes.exportFrame(cdr3VspectratypePf)

     // For vjUsage structure is [chain][vGene][jGene] -> count
     vjUsagePf := xsv.importFile(cdr3VspectratypeCmd.getFile("vj_usage.tsv"),
       "tsv", vjUsageConv.getColumns(),
-      {cpu: 1, mem: "16GiB"})
+      {cpu: 1, mem: "4GiB"})
     outputs["vjUsagePf"] = pframes.exportFrame(vjUsagePf)

     if args.kabatNumbering == true {
@@ -379,7 +236,7 @@ wf.body(func(args) {

       seqCols := columns.getColumns("assemblingAaSeqs")
       for col in seqCols {
-        headerName := makeHeaderName(col, "assemblingFeature", isSingleCell)
+        headerName := dataUtils.makeHeaderName(col, "assemblingFeature", isSingleCell)
         assemSeqTable.add(col, {header: headerName})
       }

@@ -403,7 +260,7 @@ wf.body(func(args) {
       assem := render.create(assemFastaTpl, {
         inputTsv: assemSeqTableBuilt,
         keyColumn: "clonotypeKey",
-        finalClonotypesCsv: finalClonotypesCsv,
+        finalClonotypesParquet: finalClonotypesParquet,
         isSingleCell: isSingleCell,
         bulkChain: bulkChain
       })
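
The hunks above rework the workflow body: the pframes csv/tsv file builders and the render call into the ":filter-and-sample" template are replaced by helpers from the new :libs.data-utils module, a ptabler (pt) workflow, and parquet files passed to the external tools; the template itself is deleted in the hunk that follows. Consolidated from the added lines above, the new pt pattern is roughly the sketch below. It is illustrative only: it reuses the workflow-body variables columns, datasetSpec and isSingleCell, and uses only the pt and dataUtils calls that already appear in this diff.

  // Prepare a column -> data map plus axis-rename pairs via the new helper library
  cdr3Data := dataUtils.prepareCdr3Data(columns, datasetSpec, isSingleCell)

  // Build a projection: alias each original axis, keep every data column
  projection := []
  for origAxis, aliasName in cdr3Data.axisRenames {
    projection = append(projection, pt.axis(origAxis).alias(aliasName))
  }
  for colName, _ in cdr3Data.structuredMap {
    projection = append(projection, pt.col(colName))
  }

  // Run the ptabler workflow and pick up the parquet it materializes
  ptWf := pt.workflow().cacheInputs(24 * 60 * 60 * 1000)
  df := ptWf.frame(pt.p.full(cdr3Data.structuredMap)).select(projection...)
  df.save("cdr3_sequences.parquet")
  ptResult := ptWf.run()
  cdr3Parquet := ptResult.getFile("cdr3_sequences.parquet")

The saved parquet then replaces the TSV that the spectratype tool previously consumed via addFile/arg.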
@@ -1,83 +0,0 @@
-// Template for clonotype filtering and sampling
-self := import("@platforma-sdk/workflow-tengo:tpl")
-exec := import("@platforma-sdk/workflow-tengo:exec")
-assets := import("@platforma-sdk/workflow-tengo:assets")
-pframes := import("@platforma-sdk/workflow-tengo:pframes")
-xsv := import("@platforma-sdk/workflow-tengo:pframes.xsv")
-render := import("@platforma-sdk/workflow-tengo:render")
-sampledColsConv := import(":sampled-cols-conv")
-json := import("json")
-
-self.defineOutputs("sampledRows", "finalClonotypesCsv")
-
-self.body(func(inputs) {
-
-  cloneTable := inputs.cloneTable
-  datasetSpec := inputs.datasetSpec
-  filterMap := inputs.filterMap
-  rankingMap := inputs.rankingMap
-  topClonotypes := inputs.topClonotypes
-
-  outputs := {}
-  finalClonotypesCsv := undefined
-
-  // Run filtering script
-  filterResult := exec.builder().
-    software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.sample-clonotypes:filter")).
-    mem("16GiB").
-    cpu(1).
-    addFile("clonotypes.csv", cloneTable).
-    arg("--csv").arg("clonotypes.csv").
-    arg("--out").arg("filteredClonotypes.csv").
-    arg("--filter-map").arg(string(json.encode(filterMap))).
-    saveFile("filteredClonotypes.csv").
-    printErrStreamToStdout().
-    saveStdoutContent().
-    cache(24 * 60 * 60 * 1000).
-    run()
-
-  // Save filtered CSV file
-  filteredClonotypesCsv := filterResult.getFile("filteredClonotypes.csv")
-
-  // Store outputs
-  sampledColsParams := sampledColsConv.getColumns(datasetSpec, false) // No ranking column
-  filteredClonotypesPf := xsv.importFile(filteredClonotypesCsv, "csv", sampledColsParams,
-    {cpu: 1, mem: "16GiB"})
-
-  // Prepare outputs in case there is no top ranking
-  outputs["sampledRows"] = pframes.exportFrame(filteredClonotypesPf)
-  finalClonotypesCsv = filteredClonotypesCsv
-
-  if topClonotypes != undefined {
-
-    ////////// Top Clonotypes Sampling //////////
-    // Run sampling script on filtered data
-    sampleClones := exec.builder().
-      software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.sample-clonotypes:main")).
-      mem("16GiB").
-      cpu(1).
-      addFile("filteredClonotypes.csv", filteredClonotypesCsv).
-      arg("--csv").arg("filteredClonotypes.csv").
-      arg("--n").arg(string(topClonotypes)).
-      arg("--ranking-map").arg(string(json.encode(rankingMap))).
-      arg("--out").arg("sampledClonotypes_top.csv").
-      saveFile("sampledClonotypes_top.csv").
-      printErrStreamToStdout().
-      saveStdoutContent().
-      cache(24 * 60 * 60 * 1000).
-      run()
-
-    // Save top clonotypes CSV file
-    finalClonotypesCsv = sampleClones.getFile("sampledClonotypes_top.csv")
-
-    // Store outputs
-    sampledColsParams := sampledColsConv.getColumns(datasetSpec, true) // Add ranking column
-    sampledColumnsPf := xsv.importFile(finalClonotypesCsv, "csv", sampledColsParams,
-      {cpu: 1, mem: "16GiB"})
-    outputs["sampledRows"] = pframes.exportFrame(sampledColumnsPf)
-  }
-
-  outputs["finalClonotypesCsv"] = finalClonotypesCsv
-
-  return outputs
-})
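
The deleted template above drove the sample-clonotypes ":filter" and ":main" software over CSV files and re-imported the results with xsv.importFile. In 1.14.0 the filtering step moves into the workflow body (dataUtils.filterClonotypes) and only the ":main" sampling step remains an external call, now fed a parquet file, as shown in the first body hunk. Both versions rely on the same workflow-tengo exec builder; the minimal generic sketch below uses only builder methods that appear in this diff, and the software id, file names and the inputFile variable are placeholders, not part of the package.

  exec := import("@platforma-sdk/workflow-tengo:exec")
  assets := import("@platforma-sdk/workflow-tengo:assets")

  // inputFile stands for a file resource produced by an earlier step (placeholder)
  cmd := exec.builder().
    software(assets.importSoftware("@example-org/example.some-tool:main")). // placeholder software id
    mem("16GiB").
    cpu(1).
    addFile("input.parquet", inputFile). // stage the upstream file under a fixed name
    arg("--input").arg("input.parquet"). // one arg() call per CLI token
    arg("--out").arg("result.csv").
    saveFile("result.csv"). // collect this output after the run
    printErrStreamToStdout().
    saveStdoutContent().
    cache(24 * 60 * 60 * 1000). // cache results for 24 hours
    run()

  resultCsv := cmd.getFile("result.csv")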