@platforma-open/milaboratories.top-antibodies.workflow 1.14.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,16 +7,12 @@ pframes := import("@platforma-sdk/workflow-tengo:pframes")
7
7
  slices := import("@platforma-sdk/workflow-tengo:slices")
8
8
  render := import("@platforma-sdk/workflow-tengo:render")
9
9
  ll := import("@platforma-sdk/workflow-tengo:ll")
10
- pt := import("@platforma-sdk/workflow-tengo:pt")
11
- text := import("text")
12
- json := import("json")
10
+ kabatConv := import(":pf-kabat-conv")
13
11
 
14
- dataUtils := import(":libs.data-utils")
15
- spectratypeConv := import(":libs.pf-spectratype-conv")
16
- vjUsageConv := import(":libs.pf-vj-usage-conv")
17
- sampledColsConv := import(":libs.sampled-cols-conv")
18
- kabatConv := import(":libs.pf-kabat-conv")
12
+ spectratypeConv := import(":pf-spectratype-conv")
13
+ vjUsageConv := import(":pf-vj-usage-conv")
19
14
 
15
+ filterAndSampleTpl := assets.importTemplate(":filter-and-sample")
20
16
 
21
17
  wf.prepare(func(args){
22
18
  if is_undefined(args.inputAnchor) {
@@ -127,86 +123,295 @@ wf.body(func(args) {
127
123
 
128
124
  // Needed conditional variable
129
125
  isSingleCell := datasetSpec.axesSpec[1].name == "pl7.app/vdj/scClonotypeKey"
130
-
126
+
131
127
  ////////// Clonotype Filtering //////////
132
- clonotypeData := dataUtils.prepareClonotypeData(args.filters, args.rankingOrder, args.rankingOrderDefault, columns, datasetSpec)
133
- structuredMap := clonotypeData.structuredMap
134
- axisRenames := clonotypeData.axisRenames
135
- filterMap := clonotypeData.filterMap
136
- rankingMap := clonotypeData.rankingMap
137
- addedCols := clonotypeData.addedCols
138
- linkerAxisSpec := clonotypeData.linkerAxisSpec
128
+ // Build clonotype table
129
+ cloneTable := pframes.csvFileBuilder()
130
+ cloneTable.setAxisHeader(datasetSpec.axesSpec[1], "clonotypeKey")
131
+
132
+ // Add Filters to table
133
+ addedAxes := []
134
+ filterMap := {}
135
+ rankingMap := {}
136
+ addedCols := false
137
+ if len(args.filters) > 0 {
138
+ for i, filter in args.filters {
139
+ if filter.value != undefined {
140
+ // Columns added here might also be in the ranking list, so we add default IDs
141
+ cloneTable.add(columns.getColumn(filter.value.column),
142
+ {header: "Filter_" + string(i), id: "filter_" + string(i)})
143
+ addedCols = true
144
+ // Store reference value and filter type associated to this column
145
+ filterMap["Filter_" + string(i)] = filter.filter
146
+ filterMap["Filter_" + string(i)]["valueType"] = columns.getSpec(filter.value.column).valueType
147
+
148
+ // If the column does not have the main anchor axis we have to include its axes
149
+ colsSpec := columns.getSpec(filter.value.column)
150
+ axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
151
+ if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
152
+ for na, ax in colsSpec.axesSpec {
153
+ if ax.name != datasetSpec.axesSpec[1].name {
154
+ cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
155
+ addedAxes = append(addedAxes, ax.name)
156
+ }
157
+ }
158
+ }
159
+ }
160
+ }
161
+ }
162
+
163
+ // Add ranking columns to table
164
+ validRanks := false
165
+ if len(args.rankingOrder) > 0 {
166
+ for i, col in args.rankingOrder {
167
+ if col.value != undefined {
168
+ validRanks = true
169
+ cloneTable.add(columns.getColumn(col.value.column), {header: "Col" + string(i)})
170
+ addedCols = true
171
+ // Store ranking order for this column
172
+ rankingMap["Col" + string(i)] = col.rankingOrder
173
+
174
+ // If the column does not have the main anchor axis we have to include its axes
175
+ colsSpec := columns.getSpec(col.value.column)
176
+ axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
177
+ if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
178
+ for na, ax in colsSpec.axesSpec {
179
+ if ax.name != datasetSpec.axesSpec[1].name && !slices.hasElement(addedAxes, ax.name) {
180
+ cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
181
+ }
182
+ }
183
+ }
184
+ }
185
+ }
186
+ }
187
+ // If we didn't have any ranking columns or all were not valid
188
+ if !validRanks {
189
+ // @TODO: this is a temporary patch for an issue where rankingOrderDefault
190
+ // is not defined by the time the prerun runs
191
+ if args.rankingOrderDefault.value != undefined {
192
+ i := 0
193
+ cloneTable.add(columns.getColumn(args.rankingOrderDefault.value.column), {header: "Col" + string(i)})
194
+ addedCols = true
195
+ // Store default ranking order
196
+ rankingMap["Col" + string(i)] = args.rankingOrderDefault.rankingOrder
197
+
198
+ // If the column does not have the main anchor axis we have to include its axes
199
+ colsSpec := columns.getSpec(args.rankingOrderDefault.value.column)
200
+ axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
201
+ if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
202
+ for na, ax in colsSpec.axesSpec {
203
+ if ax.name != datasetSpec.axesSpec[1].name {
204
+ cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
205
+ }
206
+ }
207
+ }
208
+ }
209
+ }
210
+
211
+ // Get linker columns if needed
212
+ linkerAxisSpec := {}
213
+ linkerClusterIdAxes := []
214
+ if len(columns.getColumns("linkers")) > 0 {
215
+ for i, col in columns.getColumns("linkers") {
216
+ clusterIdAxis := undefined
217
+ if datasetSpec.axesSpec[1].name == col.spec.axesSpec[1].name {
218
+ cloneTable.add(col, {header: "linker." + string(i)})
219
+ cloneTable.setAxisHeader(col.spec.axesSpec[0], "cluster_" + string(i))
220
+ linkerAxisSpec["cluster_" + string(i)] = col.spec.axesSpec[0]
221
+ clusterIdAxis = col.spec.axesSpec[0]
222
+ } else if datasetSpec.axesSpec[1].name == col.spec.axesSpec[0].name {
223
+ cloneTable.add(col, {header: "linker." + string(i)})
224
+ cloneTable.setAxisHeader(col.spec.axesSpec[1], "cluster_" + string(i))
225
+ linkerAxisSpec["cluster_" + string(i)] = col.spec.axesSpec[1]
226
+ clusterIdAxis = col.spec.axesSpec[1]
227
+ }
228
+ // Collect clusterId axes from linker columns to match cluster size columns
229
+ if !is_undefined(clusterIdAxis) && clusterIdAxis.name == "pl7.app/vdj/clusterId" {
230
+ linkerClusterIdAxes = append(linkerClusterIdAxes, clusterIdAxis)
231
+ }
232
+ addedCols = true
233
+ }
234
+ }
235
+
236
+ // Add cluster size columns if available, but only those matching linker columns' clusterId axes
237
+ // This ensures we only join columns from the same clustering run
238
+ if len(columns.getColumns("clusterSizes")) > 0 {
239
+ clusterSizeIdx := 0
240
+ for col in columns.getColumns("clusterSizes") {
241
+ // Find the clusterId axis in this cluster size column
242
+ clusterSizeClusterIdAxis := undefined
243
+ for axis in col.spec.axesSpec {
244
+ if axis.name == "pl7.app/vdj/clusterId" {
245
+ clusterSizeClusterIdAxis = axis
246
+ break
247
+ }
248
+ }
249
+
250
+ // Only add if we have linker columns and this cluster size matches one of them
251
+ shouldAdd := false
252
+ if len(linkerClusterIdAxes) > 0 && !is_undefined(clusterSizeClusterIdAxis) {
253
+ // Check if this cluster size column matches any linker's clusterId axis
254
+ for linkerAxis in linkerClusterIdAxes {
255
+ // Compare domains - they must match exactly for same clustering run
256
+ if clusterSizeClusterIdAxis.name == linkerAxis.name &&
257
+ clusterSizeClusterIdAxis.type == linkerAxis.type {
258
+ // Check if domains match
259
+ domainsMatch := true
260
+ if is_undefined(clusterSizeClusterIdAxis.domain) != is_undefined(linkerAxis.domain) {
261
+ domainsMatch = false
262
+ } else if !is_undefined(clusterSizeClusterIdAxis.domain) && !is_undefined(linkerAxis.domain) {
263
+ // Compare domain keys and values
264
+ if len(clusterSizeClusterIdAxis.domain) != len(linkerAxis.domain) {
265
+ domainsMatch = false
266
+ } else {
267
+ for k, v in clusterSizeClusterIdAxis.domain {
268
+ if is_undefined(linkerAxis.domain[k]) || linkerAxis.domain[k] != v {
269
+ domainsMatch = false
270
+ break
271
+ }
272
+ }
273
+ }
274
+ }
275
+ if domainsMatch {
276
+ shouldAdd = true
277
+ break
278
+ }
279
+ }
280
+ }
281
+ }
282
+
283
+ // Only add cluster size columns that match a linker column's clustering run
284
+ if shouldAdd {
285
+ // Trace elements are already present in col.spec from the clustering block.
286
+ // deriveLabels (in label.ts) will use these existing trace elements to construct
287
+ // distinguishing labels when multiple clustering blocks are joined, similar to
288
+ // how LabelTypeFull ('__LABEL__@1') works. The trace includes:
289
+ // - Original dataset trace
290
+ // - "milaboratories.clonotype-clustering.sequences" trace element
291
+ // - "milaboratories.clonotype-clustering.clustering" trace element
292
+ // No modification needed - just preserve the existing trace.
293
+
294
+ cloneTable.add(col, {header: "clusterSize." + string(clusterSizeIdx)})
295
+ addedCols = true
296
+ // Add the cluster axis header
297
+ for axisIdx, axis in col.spec.axesSpec {
298
+ if axis.name != datasetSpec.axesSpec[1].name {
299
+ cloneTable.setAxisHeader(axis, "clusterAxis_" + string(clusterSizeIdx) + "_" + string(axisIdx))
300
+ }
301
+ }
302
+ clusterSizeIdx = clusterSizeIdx + 1
303
+ }
304
+ }
305
+ }
139
306
 
140
307
  // Continue only if we have at least one column
141
308
  // This condition prevents a temporary intermittent error while filters are
142
309
  // being processed and possibly in other situations too
143
310
  if addedCols {
144
- // Run ptabler-based filtering (matches filter.py logic)
145
- filterResult := dataUtils.filterClonotypes(structuredMap, axisRenames, filterMap, datasetSpec)
146
- // Run sampling script if topClonotypes is defined
147
- finalClonotypesParquet := undefined
148
- if args.topClonotypes != undefined {
149
- sampleClones := exec.builder().
150
- software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.sample-clonotypes:main")).
151
- mem("16GiB").
152
- cpu(1).
153
- addFile("filteredClonotypes.parquet", filterResult.filteredParquet).
154
- arg("--input").arg("filteredClonotypes.parquet").
155
- arg("--n").arg(string(topClonotypes)).
156
- arg("--ranking-map").arg(string(json.encode(rankingMap))).
157
- arg("--out").arg("sampledClonotypes_top.csv").
158
- arg("--out-parquet").arg("sampledClonotypes_top.parquet").
159
- saveFile("sampledClonotypes_top.csv").
160
- saveFile("sampledClonotypes_top.parquet").
161
- printErrStreamToStdout().
162
- saveStdoutContent().
163
- cache(24 * 60 * 60 * 1000).
164
- run()
311
+ cloneTable.mem("16GiB")
312
+ cloneTable.cpu(1)
313
+ cloneTable = cloneTable.build()
314
+
315
+ // Use render.create to call the filter-and-sample template
316
+ filterSampleResult := render.create(filterAndSampleTpl, {
317
+ inputAnchor: args.inputAnchor,
318
+ cloneTable: cloneTable,
319
+ rankingOrder: args.rankingOrder,
320
+ rankingOrderDefault: args.rankingOrderDefault,
321
+ filters: args.filters,
322
+ filterMap: filterMap,
323
+ rankingMap: rankingMap,
324
+ datasetSpec: datasetSpec,
325
+ topClonotypes: args.topClonotypes
326
+ })
327
+
328
+ // Get the filtered clonotypes from the template result
329
+ outputs["sampledRows"] = filterSampleResult.output("sampledRows", 24 * 60 * 60 * 1000)
330
+
331
+ // Get the filtered and sampled clonotypes P-frame and CSV from the template result
332
+ finalClonotypesCsv := filterSampleResult.output("finalClonotypesCsv", 24 * 60 * 60 * 1000)
333
+ // outputs["sampledRows"] = filterSampleResult.output("sampledRows", 24 * 60 * 60 * 1000)
334
+
335
+ ////////// CDR3 Length Calculation //////////
165
336
 
166
- finalClonotypesCsv := sampleClones.getFile("sampledClonotypes_top.csv")
167
- sampledColumnsPf := xsv.importFile(finalClonotypesCsv, "csv",
168
- sampledColsConv.getColumns(datasetSpec, true), {cpu: 1, mem: "4GiB"})
169
- outputs["sampledRows"] = pframes.exportFrame(sampledColumnsPf)
170
- finalClonotypesParquet = sampleClones.getFile("sampledClonotypes_top.parquet")
171
- } else {
172
- // No sampling, use filtered parquet as final output
173
- finalClonotypesParquet = filterResult.filteredParquet
174
- outputs["sampledRows"] = pframes.exportFrame(filterResult.pframe)
337
+ cdr3SeqTable := pframes.tsvFileBuilder()
338
+ cdr3SeqTable.setAxisHeader(datasetSpec.axesSpec[1].name, "clonotypeKey")
339
+
340
+ // Must deal with multiple CDR3 sequences (two for each cell in single cell data)
341
+ // Chain will be added in the header as cdr3Sequence.chain and used in python script
342
+ // Notice chain is in spec.domain for single cell data and spec.axesSpec[0].domain for bulk data
343
+
344
+ // Helper function to add chain information to the headers dynamically
345
+ chainMapping := {
346
+ "IG": { "A": "Heavy", "B": "Light" },
347
+ "TCRAB": { "A": "TRA", "B": "TRB" },
348
+ "TCRGD": { "A": "TRG", "B": "TRD" }
175
349
  }
176
- ////////// CDR3 Length Calculation //////////
177
- cdr3Data := dataUtils.prepareCdr3Data(columns, datasetSpec, isSingleCell)
178
- cdr3SeqStructuredMap := cdr3Data.structuredMap
179
- cdr3SeqAxisRenames := cdr3Data.axisRenames
180
-
181
- // Build ptabler workflow
182
- wfCdr3Seq := pt.workflow().cacheInputs(24 * 60 * 60 * 1000)
183
- cdr3SeqProjection := []
184
- for origAxis, aliasName in cdr3SeqAxisRenames {
185
- cdr3SeqProjection = append(cdr3SeqProjection, pt.axis(origAxis).alias(aliasName))
350
+
351
+ makeHeaderName := func(col, baseHeaderName, isSingleCell) {
352
+ if isSingleCell {
353
+ chain := col.spec.domain["pl7.app/vdj/scClonotypeChain"] // e.g., "A", "B"
354
+ receptor := col.spec.axesSpec[0].domain["pl7.app/vdj/receptor"] // e.g., "IG", "TCRAB", "TCRGD"
355
+ chainLabel := chainMapping[receptor][chain]
356
+ return baseHeaderName + "." + chainLabel // e.g., "cdr3Sequence.Heavy"
357
+ } else {
358
+ // For bulk, if chain info is available (e.g. IGH, IGK, IGL)
359
+ chainFromDomain := col.spec.axesSpec[0].domain["pl7.app/vdj/chain"] // e.g. "IGH", "IGK"
360
+ if chainFromDomain != undefined {
361
+ return baseHeaderName + "." + chainFromDomain // e.g., "cdr3Sequence.IGH"
362
+ }
363
+ }
364
+ return baseHeaderName
365
+ };
366
+
367
+ // Process CDR3 sequences
368
+ cdr3Sequences := columns.getColumns("cdr3Sequences")
369
+
370
+ for col in cdr3Sequences {
371
+ headerName := makeHeaderName(col, "cdr3Sequence", isSingleCell)
372
+ if isSingleCell {
373
+ if col.spec.domain["pl7.app/vdj/scClonotypeChain/index"] == "primary" {
374
+ cdr3SeqTable.add(col, {header: headerName})
375
+ }
376
+ } else {
377
+ cdr3SeqTable.add(col, {header: headerName})
378
+ }
379
+ }
380
+
381
+ // Process V genes
382
+ vGenes := columns.getColumns("VGenes")
383
+
384
+ for col in vGenes {
385
+ headerName := makeHeaderName(col, "vGene", isSingleCell)
386
+ cdr3SeqTable.add(col, {header: headerName})
186
387
  }
187
- for colName, _ in cdr3SeqStructuredMap {
188
- cdr3SeqProjection = append(cdr3SeqProjection, pt.col(colName))
388
+
389
+ // Process J genes
390
+ jGenes := columns.getColumns("JGenes")
391
+
392
+ for col in jGenes {
393
+ headerName := makeHeaderName(col, "jGene", isSingleCell)
394
+ cdr3SeqTable.add(col, {header: headerName})
189
395
  }
190
396
 
191
- dfCdr3Seq := wfCdr3Seq.frame(pt.p.full(cdr3SeqStructuredMap)).select(cdr3SeqProjection...)
192
- dfCdr3Seq.save("cdr3_sequences.parquet")
193
- cdr3SeqResult := wfCdr3Seq.run()
194
- cdr3SeqParquet := cdr3SeqResult.getFile("cdr3_sequences.parquet")
397
+ cdr3SeqTable.mem("16GiB")
398
+ cdr3SeqTable.cpu(1)
399
+ cdr3SeqTableBuilt := cdr3SeqTable.build()
195
400
 
196
401
  cdr3VspectratypeCmd := exec.builder().
197
402
  software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.spectratype:main")).
198
403
  mem("16GiB").
199
404
  cpu(1).
200
- addFile("cdr3_sequences_input.parquet", cdr3SeqParquet).
201
- arg("--input").arg("cdr3_sequences_input.parquet").
405
+ addFile("cdr3_sequences_input.tsv", cdr3SeqTableBuilt).
406
+ arg("--input_tsv").arg("cdr3_sequences_input.tsv").
202
407
  arg("--spectratype_tsv").arg("spectratype.tsv").
203
408
  arg("--vj_usage_tsv").arg("vj_usage.tsv") // no dot here
204
409
 
205
410
  // Add top clonotypes argument and file to the builder if provided
206
- if finalClonotypesParquet != undefined {
411
+ if finalClonotypesCsv != undefined {
207
412
  cdr3VspectratypeCmd = cdr3VspectratypeCmd.
208
- arg("--final_clonotypes_parquet").arg("finalClonotypes.parquet").
209
- addFile("finalClonotypes.parquet", finalClonotypesParquet)
413
+ arg("--final_clonotypes_csv").arg("finalClonotypes.csv").
414
+ addFile("finalClonotypes.csv", finalClonotypesCsv)
210
415
  }
211
416
 
212
417
  cdr3VspectratypeCmd = cdr3VspectratypeCmd. // continue building the command
@@ -216,16 +421,18 @@ wf.body(func(args) {
216
421
  cache(24 * 60 * 60 * 1000).
217
422
  run()
218
423
 
424
+
219
425
  // Spectratype PFrame structure is [chain][cdr3Length][vGene] -> count
426
+
220
427
  cdr3VspectratypePf := xsv.importFile(cdr3VspectratypeCmd.getFile("spectratype.tsv"),
221
428
  "tsv", spectratypeConv.getColumns(),
222
- {cpu: 1, mem: "4GiB"})
429
+ {cpu: 1, mem: "16GiB"})
223
430
  outputs["cdr3VspectratypePf"] = pframes.exportFrame(cdr3VspectratypePf)
224
431
 
225
432
  // For vjUsage structure is [chain][vGene][jGene] -> count
226
433
  vjUsagePf := xsv.importFile(cdr3VspectratypeCmd.getFile("vj_usage.tsv"),
227
434
  "tsv", vjUsageConv.getColumns(),
228
- {cpu: 1, mem: "4GiB"})
435
+ {cpu: 1, mem: "16GiB"})
229
436
  outputs["vjUsagePf"] = pframes.exportFrame(vjUsagePf)
230
437
 
231
438
  if args.kabatNumbering == true {
@@ -236,7 +443,7 @@ wf.body(func(args) {
236
443
 
237
444
  seqCols := columns.getColumns("assemblingAaSeqs")
238
445
  for col in seqCols {
239
- headerName := dataUtils.makeHeaderName(col, "assemblingFeature", isSingleCell)
446
+ headerName := makeHeaderName(col, "assemblingFeature", isSingleCell)
240
447
  assemSeqTable.add(col, {header: headerName})
241
448
  }
242
449
 
@@ -260,7 +467,7 @@ wf.body(func(args) {
260
467
  assem := render.create(assemFastaTpl, {
261
468
  inputTsv: assemSeqTableBuilt,
262
469
  keyColumn: "clonotypeKey",
263
- finalClonotypesParquet: finalClonotypesParquet,
470
+ finalClonotypesCsv: finalClonotypesCsv,
264
471
  isSingleCell: isSingleCell,
265
472
  bulkChain: bulkChain
266
473
  })
@@ -48,4 +48,3 @@ getColumns := func(datasetSpec, addRanking) {
48
48
  export ll.toStrict({
49
49
  getColumns: getColumns
50
50
  })
51
-