@platforma-open/milaboratories.top-antibodies.workflow 1.13.2 → 1.14.0

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -7,12 +7,16 @@ pframes := import("@platforma-sdk/workflow-tengo:pframes")
  slices := import("@platforma-sdk/workflow-tengo:slices")
  render := import("@platforma-sdk/workflow-tengo:render")
  ll := import("@platforma-sdk/workflow-tengo:ll")
- kabatConv := import(":pf-kabat-conv")
+ pt := import("@platforma-sdk/workflow-tengo:pt")
+ text := import("text")
+ json := import("json")

- spectratypeConv := import(":pf-spectratype-conv")
- vjUsageConv := import(":pf-vj-usage-conv")
+ dataUtils := import(":libs.data-utils")
+ spectratypeConv := import(":libs.pf-spectratype-conv")
+ vjUsageConv := import(":libs.pf-vj-usage-conv")
+ sampledColsConv := import(":libs.sampled-cols-conv")
+ kabatConv := import(":libs.pf-kabat-conv")

- filterAndSampleTpl := assets.importTemplate(":filter-and-sample")

  wf.prepare(func(args){
  if is_undefined(args.inputAnchor) {
@@ -123,230 +127,86 @@ wf.body(func(args) {

  // Needed conditional variable
  isSingleCell := datasetSpec.axesSpec[1].name == "pl7.app/vdj/scClonotypeKey"
-
- ////////// Clonotype Filtering //////////
- // Build clonotype table
- cloneTable := pframes.csvFileBuilder()
- cloneTable.setAxisHeader(datasetSpec.axesSpec[1], "clonotypeKey")
-
- // Add Filters to table
- addedAxes := []
- filterMap := {}
- rankingMap := {}
- addedCols := false
- if len(args.filters) > 0 {
- for i, filter in args.filters {
- if filter.value != undefined {
- // Columns added here might also be in ranking list, so we add default IDs
- cloneTable.add(columns.getColumn(filter.value.column),
- {header: "Filter_" + string(i), id: "filter_" + string(i)})
- addedCols = true
- // Store reference value and filter type associated to this column
- filterMap["Filter_" + string(i)] = filter.filter
- filterMap["Filter_" + string(i)]["valueType"] = columns.getSpec(filter.value.column).valueType
-
- // If column does not have main anchor axis we have to include theirs
- colsSpec := columns.getSpec(filter.value.column)
- axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
- if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
- for na, ax in colsSpec.axesSpec {
- if ax.name != datasetSpec.axesSpec[1].name {
- cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
- addedAxes = append(addedAxes, ax.name)
- }
- }
- }
- }
- }
- }
-
- // Add ranking columns to table
- validRanks := false
- if len(args.rankingOrder) > 0 {
- for i, col in args.rankingOrder {
- if col.value != undefined {
- validRanks = true
- cloneTable.add(columns.getColumn(col.value.column), {header: "Col" + string(i)})
- addedCols = true
- // Store ranking order for this column
- rankingMap["Col" + string(i)] = col.rankingOrder
-
- // If column does not have main anchor axis we have to include theirs
- colsSpec := columns.getSpec(col.value.column)
- axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
- if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
- for na, ax in colsSpec.axesSpec {
- if ax.name != datasetSpec.axesSpec[1].name && !slices.hasElement(addedAxes, ax.name) {
- cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
- }
- }
- }
- }
- }
- }
- // If we didn't have any ranking column or all where not valid
- if !validRanks {
- // @TODO: this is a temporal patch for issue where rankingOrderDefault
- // are not defined by the time prerun works
- if args.rankingOrderDefault.value != undefined {
- i := 0
- cloneTable.add(columns.getColumn(args.rankingOrderDefault.value.column), {header: "Col" + string(i)})
- addedCols = true
- // Store default ranking order
- rankingMap["Col" + string(i)] = args.rankingOrderDefault.rankingOrder
-
- // If column does not have main anchor axis we have to include theirs
- colsSpec := columns.getSpec(args.rankingOrderDefault.value.column)
- axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
- if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
- for na, ax in colsSpec.axesSpec {
- if ax.name != datasetSpec.axesSpec[1].name {
- cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
- }
- }
- }
- }
- }

- // Get linker columns if needed
- linkerAxisSpec := {}
- if len(columns.getColumns("linkers")) > 0 {
- for i, col in columns.getColumns("linkers") {
- if datasetSpec.axesSpec[1].name == col.spec.axesSpec[1].name {
- cloneTable.add(col, {header: "linker." + string(i)})
- cloneTable.setAxisHeader(col.spec.axesSpec[0], "cluster_" + string(i))
- linkerAxisSpec["cluster_" + string(i)] = col.spec.axesSpec[0]
- } else if datasetSpec.axesSpec[1].name == col.spec.axesSpec[0].name {
- cloneTable.add(col, {header: "linker." + string(i)})
- cloneTable.setAxisHeader(col.spec.axesSpec[1], "cluster_" + string(i))
- linkerAxisSpec["cluster_" + string(i)] = col.spec.axesSpec[1]
- }
- addedCols = true
- }
- }
-
- // Add cluster size columns if available
- if len(columns.getColumns("clusterSizes")) > 0 {
- for i, col in columns.getColumns("clusterSizes") {
- cloneTable.add(col, {header: "clusterSize." + string(i)})
- addedCols = true
- // Add the cluster axis header
- for axisIdx, axis in col.spec.axesSpec {
- if axis.name != datasetSpec.axesSpec[1].name {
- cloneTable.setAxisHeader(axis, "clusterAxis_" + string(i) + "_" + string(axisIdx))
- }
- }
- }
- }
+ ////////// Clonotype Filtering //////////
+ clonotypeData := dataUtils.prepareClonotypeData(args.filters, args.rankingOrder, args.rankingOrderDefault, columns, datasetSpec)
+ structuredMap := clonotypeData.structuredMap
+ axisRenames := clonotypeData.axisRenames
+ filterMap := clonotypeData.filterMap
+ rankingMap := clonotypeData.rankingMap
+ addedCols := clonotypeData.addedCols
+ linkerAxisSpec := clonotypeData.linkerAxisSpec

  // Continue only if we have at least a column
  // This condition prevents temporal intermittent error while filters are
  // being processed and possibly in other situations too
  if addedCols {
- cloneTable.mem("16GiB")
- cloneTable.cpu(1)
- cloneTable = cloneTable.build()
-
- // Use ender.create to call the filter-clonotypes template
- filterSampleResult := render.create(filterAndSampleTpl, {
- inputAnchor: args.inputAnchor,
- cloneTable: cloneTable,
- rankingOrder: args.rankingOrder,
- rankingOrderDefault: args.rankingOrderDefault,
- filters: args.filters,
- filterMap: filterMap,
- rankingMap: rankingMap,
- datasetSpec: datasetSpec,
- topClonotypes: args.topClonotypes
- })
-
- // Get the filtered clonotypes from the template result
- outputs["sampledRows"] = filterSampleResult.output("sampledRows", 24 * 60 * 60 * 1000)
-
- // Get the filtered and sampled clonotypes P-frame and CSV from the template result
- finalClonotypesCsv := filterSampleResult.output("finalClonotypesCsv", 24 * 60 * 60 * 1000)
- // outputs["sampledRows"] = filterSampleResult.output("sampledRows", 24 * 60 * 60 * 1000)
-
- ////////// CDR3 Length Calculation //////////
+ // Run ptabler-based filtering (matches filter.py logic)
+ filterResult := dataUtils.filterClonotypes(structuredMap, axisRenames, filterMap, datasetSpec)
+ // Run sampling script if topClonotypes is defined
+ finalClonotypesParquet := undefined
+ if args.topClonotypes != undefined {
+ sampleClones := exec.builder().
+ software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.sample-clonotypes:main")).
+ mem("16GiB").
+ cpu(1).
+ addFile("filteredClonotypes.parquet", filterResult.filteredParquet).
+ arg("--input").arg("filteredClonotypes.parquet").
+ arg("--n").arg(string(topClonotypes)).
+ arg("--ranking-map").arg(string(json.encode(rankingMap))).
+ arg("--out").arg("sampledClonotypes_top.csv").
+ arg("--out-parquet").arg("sampledClonotypes_top.parquet").
+ saveFile("sampledClonotypes_top.csv").
+ saveFile("sampledClonotypes_top.parquet").
+ printErrStreamToStdout().
+ saveStdoutContent().
+ cache(24 * 60 * 60 * 1000).
+ run()

- cdr3SeqTable := pframes.tsvFileBuilder()
- cdr3SeqTable.setAxisHeader(datasetSpec.axesSpec[1].name, "clonotypeKey")
-
- // Must deal with multiple CDR3 sequences (two for each cell in single cell data)
- // Chain will be added in the header as cdr3Sequence.chain and used in python script
- // Notice chain is in spec.domain for single cell data and spec.axesSpec[0].domain for bulk data
-
- // Helper function to add chain information to the headers dynamically
- chainMapping := {
- "IG": { "A": "Heavy", "B": "Light" },
- "TCRAB": { "A": "TRA", "B": "TRB" },
- "TCRGD": { "A": "TRG", "B": "TRD" }
- }
-
- makeHeaderName := func(col, baseHeaderName, isSingleCell) {
- if isSingleCell {
- chain := col.spec.domain["pl7.app/vdj/scClonotypeChain"] // e.g., "A", "B"
- receptor := col.spec.axesSpec[0].domain["pl7.app/vdj/receptor"] // e.g., "IG", "TCRAB", "TCRGD"
- chainLabel := chainMapping[receptor][chain]
- return baseHeaderName + "." + chainLabel // e.g., "cdr3Sequence.Heavy"
- } else {
- // For bulk, if chain info is available (e.g. IGH, IGK, IGL)
- chainFromDomain := col.spec.axesSpec[0].domain["pl7.app/vdj/chain"] // e.g. "IGH", "IGK"
- if chainFromDomain != undefined {
- return baseHeaderName + "." + chainFromDomain // e.g., "cdr3Sequence.IGH"
- }
- }
- return baseHeaderName
- };
-
- // Process CDR3 sequences
- cdr3Sequences := columns.getColumns("cdr3Sequences")
-
- for col in cdr3Sequences {
- headerName := makeHeaderName(col, "cdr3Sequence", isSingleCell)
- if isSingleCell {
- if col.spec.domain["pl7.app/vdj/scClonotypeChain/index"] == "primary" {
- cdr3SeqTable.add(col, {header: headerName})
- }
- } else {
- cdr3SeqTable.add(col, {header: headerName})
- }
+ finalClonotypesCsv := sampleClones.getFile("sampledClonotypes_top.csv")
+ sampledColumnsPf := xsv.importFile(finalClonotypesCsv, "csv",
+ sampledColsConv.getColumns(datasetSpec, true), {cpu: 1, mem: "4GiB"})
+ outputs["sampledRows"] = pframes.exportFrame(sampledColumnsPf)
+ finalClonotypesParquet = sampleClones.getFile("sampledClonotypes_top.parquet")
+ } else {
+ // No sampling, use filtered parquet as final output
+ finalClonotypesParquet = filterResult.filteredParquet
+ outputs["sampledRows"] = pframes.exportFrame(filterResult.pframe)
  }
-
- // Process V genes
- vGenes := columns.getColumns("VGenes")
-
- for col in vGenes {
- headerName := makeHeaderName(col, "vGene", isSingleCell)
- cdr3SeqTable.add(col, {header: headerName})
+ ////////// CDR3 Length Calculation //////////
+ cdr3Data := dataUtils.prepareCdr3Data(columns, datasetSpec, isSingleCell)
+ cdr3SeqStructuredMap := cdr3Data.structuredMap
+ cdr3SeqAxisRenames := cdr3Data.axisRenames
+
+ // Build ptabler workflow
+ wfCdr3Seq := pt.workflow().cacheInputs(24 * 60 * 60 * 1000)
+ cdr3SeqProjection := []
+ for origAxis, aliasName in cdr3SeqAxisRenames {
+ cdr3SeqProjection = append(cdr3SeqProjection, pt.axis(origAxis).alias(aliasName))
  }
-
- // Process J genes
- jGenes := columns.getColumns("JGenes")
-
- for col in jGenes {
- headerName := makeHeaderName(col, "jGene", isSingleCell)
- cdr3SeqTable.add(col, {header: headerName})
+ for colName, _ in cdr3SeqStructuredMap {
+ cdr3SeqProjection = append(cdr3SeqProjection, pt.col(colName))
  }

- cdr3SeqTable.mem("16GiB")
- cdr3SeqTable.cpu(1)
- cdr3SeqTableBuilt := cdr3SeqTable.build()
+ dfCdr3Seq := wfCdr3Seq.frame(pt.p.full(cdr3SeqStructuredMap)).select(cdr3SeqProjection...)
+ dfCdr3Seq.save("cdr3_sequences.parquet")
+ cdr3SeqResult := wfCdr3Seq.run()
+ cdr3SeqParquet := cdr3SeqResult.getFile("cdr3_sequences.parquet")

  cdr3VspectratypeCmd := exec.builder().
  software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.spectratype:main")).
  mem("16GiB").
  cpu(1).
- addFile("cdr3_sequences_input.tsv", cdr3SeqTableBuilt).
- arg("--input_tsv").arg("cdr3_sequences_input.tsv").
+ addFile("cdr3_sequences_input.parquet", cdr3SeqParquet).
+ arg("--input").arg("cdr3_sequences_input.parquet").
  arg("--spectratype_tsv").arg("spectratype.tsv").
  arg("--vj_usage_tsv").arg("vj_usage.tsv") // no dot here

  // Add top clonotypes argument and file to the builder if provided
- if finalClonotypesCsv != undefined {
+ if finalClonotypesParquet != undefined {
  cdr3VspectratypeCmd = cdr3VspectratypeCmd.
- arg("--final_clonotypes_csv").arg("finalClonotypes.csv").
- addFile("finalClonotypes.csv", finalClonotypesCsv)
+ arg("--final_clonotypes_parquet").arg("finalClonotypes.parquet").
+ addFile("finalClonotypes.parquet", finalClonotypesParquet)
  }

  cdr3VspectratypeCmd = cdr3VspectratypeCmd. // continue building the command
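The hunk above swaps the `pframes.tsvFileBuilder()`/CSV path for the `pt` (ptabler) API with parquet files. A minimal sketch of that pattern, assuming a `structuredMap` of column data and an `axisRenames` map shaped like the values the (unshown) `:libs.data-utils` helpers return, and using only the `pt` calls visible in this diff:

    // Sketch only: project a structured column map into a parquet file via pt.
    // structuredMap and axisRenames are assumed inputs (shapes inferred from the diff).
    ptWf := pt.workflow().cacheInputs(24 * 60 * 60 * 1000)      // cache inputs for 24 h
    projection := []
    for origAxis, aliasName in axisRenames {
        projection = append(projection, pt.axis(origAxis).alias(aliasName))  // rename axes
    }
    for colName, _ in structuredMap {
        projection = append(projection, pt.col(colName))                     // keep data columns
    }
    df := ptWf.frame(pt.p.full(structuredMap)).select(projection...)
    df.save("table.parquet")
    result := ptWf.run()
    tableParquet := result.getFile("table.parquet")   // file handle usable with exec addFile(...)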
@@ -356,18 +216,16 @@ wf.body(func(args) {
  cache(24 * 60 * 60 * 1000).
  run()

-
  // Spectratype PFrame structure is [chain][cdr3Length][vGene] -> count
-
  cdr3VspectratypePf := xsv.importFile(cdr3VspectratypeCmd.getFile("spectratype.tsv"),
  "tsv", spectratypeConv.getColumns(),
- {cpu: 1, mem: "16GiB"})
+ {cpu: 1, mem: "4GiB"})
  outputs["cdr3VspectratypePf"] = pframes.exportFrame(cdr3VspectratypePf)

  // For vjUsage structure is [chain][vGene][jGene] -> count
  vjUsagePf := xsv.importFile(cdr3VspectratypeCmd.getFile("vj_usage.tsv"),
  "tsv", vjUsageConv.getColumns(),
- {cpu: 1, mem: "16GiB"})
+ {cpu: 1, mem: "4GiB"})
  outputs["vjUsagePf"] = pframes.exportFrame(vjUsagePf)

  if args.kabatNumbering == true {
@@ -378,7 +236,7 @@ wf.body(func(args) {

  seqCols := columns.getColumns("assemblingAaSeqs")
  for col in seqCols {
- headerName := makeHeaderName(col, "assemblingFeature", isSingleCell)
+ headerName := dataUtils.makeHeaderName(col, "assemblingFeature", isSingleCell)
  assemSeqTable.add(col, {header: headerName})
  }

@@ -402,7 +260,7 @@ wf.body(func(args) {
  assem := render.create(assemFastaTpl, {
  inputTsv: assemSeqTableBuilt,
  keyColumn: "clonotypeKey",
- finalClonotypesCsv: finalClonotypesCsv,
+ finalClonotypesParquet: finalClonotypesParquet,
  isSingleCell: isSingleCell,
  bulkChain: bulkChain
  })
@@ -1,81 +0,0 @@
- // Template for clonotype filtering and sampling
- self := import("@platforma-sdk/workflow-tengo:tpl")
- exec := import("@platforma-sdk/workflow-tengo:exec")
- assets := import("@platforma-sdk/workflow-tengo:assets")
- pframes := import("@platforma-sdk/workflow-tengo:pframes")
- xsv := import("@platforma-sdk/workflow-tengo:pframes.xsv")
- render := import("@platforma-sdk/workflow-tengo:render")
- sampledColsConv := import(":sampled-cols-conv")
- json := import("json")
-
- self.defineOutputs("sampledRows", "finalClonotypesCsv")
-
- self.body(func(inputs) {
-
- cloneTable := inputs.cloneTable
- datasetSpec := inputs.datasetSpec
- filterMap := inputs.filterMap
- rankingMap := inputs.rankingMap
- topClonotypes := inputs.topClonotypes
-
- outputs := {}
- finalClonotypesCsv := undefined
-
- // Run filtering script
- filterResult := exec.builder().
- software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.sample-clonotypes:filter")).
- mem("16GiB").
- cpu(1).
- addFile("clonotypes.csv", cloneTable).
- arg("--csv").arg("clonotypes.csv").
- arg("--out").arg("filteredClonotypes.csv").
- arg("--filter-map").arg(string(json.encode(filterMap))).
- saveFile("filteredClonotypes.csv").
- printErrStreamToStdout().
- cache(24 * 60 * 60 * 1000).
- run()
-
- // Save filtered CSV file
- filteredClonotypesCsv := filterResult.getFile("filteredClonotypes.csv")
-
- // Store outputs
- sampledColsParams := sampledColsConv.getColumns(datasetSpec, false) // No ranking column
- filteredClonotypesPf := xsv.importFile(filteredClonotypesCsv, "csv", sampledColsParams,
- {cpu: 1, mem: "16GiB"})
-
- // Prepare outputs in case there is no top ranking
- outputs["sampledRows"] = pframes.exportFrame(filteredClonotypesPf)
- finalClonotypesCsv = filteredClonotypesCsv
-
- if topClonotypes != undefined {
-
- ////////// Top Clonotypes Sampling //////////
- // Run sampling script on filtered data
- sampleClones := exec.builder().
- software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.sample-clonotypes:main")).
- mem("16GiB").
- cpu(1).
- addFile("filteredClonotypes.csv", filteredClonotypesCsv).
- arg("--csv").arg("filteredClonotypes.csv").
- arg("--n").arg(string(topClonotypes)).
- arg("--ranking-map").arg(string(json.encode(rankingMap))).
- arg("--out").arg("sampledClonotypes_top.csv").
- saveFile("sampledClonotypes_top.csv").
- printErrStreamToStdout().
- cache(24 * 60 * 60 * 1000).
- run()
-
- // Save top clonotypes CSV file
- finalClonotypesCsv = sampleClones.getFile("sampledClonotypes_top.csv")
-
- // Store outputs
- sampledColsParams := sampledColsConv.getColumns(datasetSpec, true) // Add ranking column
- sampledColumnsPf := xsv.importFile(finalClonotypesCsv, "csv", sampledColsParams,
- {cpu: 1, mem: "16GiB"})
- outputs["sampledRows"] = pframes.exportFrame(sampledColumnsPf)
- }
-
- outputs["finalClonotypesCsv"] = finalClonotypesCsv
-
- return outputs
- })
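The new `:libs.data-utils` module is imported but not included in this diff. Inferred only from the call sites above, its interface looks roughly like the sketch below; the field names come from the destructuring in the workflow and every body is a placeholder, not the released implementation:

    // Hypothetical outline of :libs.data-utils, reconstructed from usage in this diff.
    prepareClonotypeData := func(filters, rankingOrder, rankingOrderDefault, columns, datasetSpec) {
        // Builds the filter/ranking column data plus axis renames (details not shown in the diff).
        return {
            structuredMap: {},    // assumed: column name -> data, consumed by pt.p.full(...)
            axisRenames: {},      // assumed: original axis name -> alias
            filterMap: {},        // assumed: filter header -> filter spec (incl. valueType)
            rankingMap: {},       // assumed: ranking header -> ranking order
            addedCols: false,     // true once at least one column was added
            linkerAxisSpec: {}    // assumed: alias -> axis spec for linker/cluster axes
        }
    }

    filterClonotypes := func(structuredMap, axisRenames, filterMap, datasetSpec) {
        // ptabler-based filtering; the diff notes it matches the filter.py logic.
        return { filteredParquet: undefined, pframe: undefined }
    }

    prepareCdr3Data := func(columns, datasetSpec, isSingleCell) {
        return { structuredMap: {}, axisRenames: {} }
    }

    makeHeaderName := func(col, baseHeaderName, isSingleCell) {
        // Chain-aware header naming, previously inlined in the workflow (see removed code above).
        return baseHeaderName
    }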