@platforma-open/milaboratories.top-antibodies.workflow 1.14.0 → 1.14.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -7,16 +7,12 @@ pframes := import("@platforma-sdk/workflow-tengo:pframes")
  slices := import("@platforma-sdk/workflow-tengo:slices")
  render := import("@platforma-sdk/workflow-tengo:render")
  ll := import("@platforma-sdk/workflow-tengo:ll")
- pt := import("@platforma-sdk/workflow-tengo:pt")
- text := import("text")
- json := import("json")
+ kabatConv := import(":pf-kabat-conv")

- dataUtils := import(":libs.data-utils")
- spectratypeConv := import(":libs.pf-spectratype-conv")
- vjUsageConv := import(":libs.pf-vj-usage-conv")
- sampledColsConv := import(":libs.sampled-cols-conv")
- kabatConv := import(":libs.pf-kabat-conv")
+ spectratypeConv := import(":pf-spectratype-conv")
+ vjUsageConv := import(":pf-vj-usage-conv")

+ filterAndSampleTpl := assets.importTemplate(":filter-and-sample")


  wf.prepare(func(args){
  if is_undefined(args.inputAnchor) {
@@ -127,86 +123,230 @@ wf.body(func(args) {

  // Needed conditional variable
  isSingleCell := datasetSpec.axesSpec[1].name == "pl7.app/vdj/scClonotypeKey"
-
+
  ////////// Clonotype Filtering //////////
- clonotypeData := dataUtils.prepareClonotypeData(args.filters, args.rankingOrder, args.rankingOrderDefault, columns, datasetSpec)
- structuredMap := clonotypeData.structuredMap
- axisRenames := clonotypeData.axisRenames
- filterMap := clonotypeData.filterMap
- rankingMap := clonotypeData.rankingMap
- addedCols := clonotypeData.addedCols
- linkerAxisSpec := clonotypeData.linkerAxisSpec
+ // Build clonotype table
+ cloneTable := pframes.csvFileBuilder()
+ cloneTable.setAxisHeader(datasetSpec.axesSpec[1], "clonotypeKey")
+
+ // Add Filters to table
+ addedAxes := []
+ filterMap := {}
+ rankingMap := {}
+ addedCols := false
+ if len(args.filters) > 0 {
+ for i, filter in args.filters {
+ if filter.value != undefined {
+ // Columns added here might also be in the ranking list, so we add default IDs
+ cloneTable.add(columns.getColumn(filter.value.column),
+ {header: "Filter_" + string(i), id: "filter_" + string(i)})
+ addedCols = true
+ // Store the reference value and filter type associated with this column
+ filterMap["Filter_" + string(i)] = filter.filter
+ filterMap["Filter_" + string(i)]["valueType"] = columns.getSpec(filter.value.column).valueType
+
+ // If the column does not have the main anchor axis we have to include its axes
+ colsSpec := columns.getSpec(filter.value.column)
+ axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
+ if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
+ for na, ax in colsSpec.axesSpec {
+ if ax.name != datasetSpec.axesSpec[1].name {
+ cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
+ addedAxes = append(addedAxes, ax.name)
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Add ranking columns to table
+ validRanks := false
+ if len(args.rankingOrder) > 0 {
+ for i, col in args.rankingOrder {
+ if col.value != undefined {
+ validRanks = true
+ cloneTable.add(columns.getColumn(col.value.column), {header: "Col" + string(i)})
+ addedCols = true
+ // Store ranking order for this column
+ rankingMap["Col" + string(i)] = col.rankingOrder
+
+ // If the column does not have the main anchor axis we have to include its axes
+ colsSpec := columns.getSpec(col.value.column)
+ axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
+ if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
+ for na, ax in colsSpec.axesSpec {
+ if ax.name != datasetSpec.axesSpec[1].name && !slices.hasElement(addedAxes, ax.name) {
+ cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
+ }
+ }
+ }
+ }
+ }
+ }
+ // If we had no ranking columns, or none were valid
+ if !validRanks {
+ // @TODO: this is a temporary patch for an issue where rankingOrderDefault
+ // is not defined by the time the prerun runs
+ if args.rankingOrderDefault.value != undefined {
+ i := 0
+ cloneTable.add(columns.getColumn(args.rankingOrderDefault.value.column), {header: "Col" + string(i)})
+ addedCols = true
+ // Store default ranking order
+ rankingMap["Col" + string(i)] = args.rankingOrderDefault.rankingOrder
+
+ // If the column does not have the main anchor axis we have to include its axes
+ colsSpec := columns.getSpec(args.rankingOrderDefault.value.column)
+ axesNames := slices.map(colsSpec.axesSpec, func (a) { return a.name})
+ if !slices.hasElement(axesNames, datasetSpec.axesSpec[1].name) {
+ for na, ax in colsSpec.axesSpec {
+ if ax.name != datasetSpec.axesSpec[1].name {
+ cloneTable.setAxisHeader(ax, "cluster_" + string(i) + string(na))
+ }
+ }
+ }
+ }
+ }
+
+ // Get linker columns if needed
+ linkerAxisSpec := {}
+ if len(columns.getColumns("linkers")) > 0 {
+ for i, col in columns.getColumns("linkers") {
+ if datasetSpec.axesSpec[1].name == col.spec.axesSpec[1].name {
+ cloneTable.add(col, {header: "linker." + string(i)})
+ cloneTable.setAxisHeader(col.spec.axesSpec[0], "cluster_" + string(i))
+ linkerAxisSpec["cluster_" + string(i)] = col.spec.axesSpec[0]
+ } else if datasetSpec.axesSpec[1].name == col.spec.axesSpec[0].name {
+ cloneTable.add(col, {header: "linker." + string(i)})
+ cloneTable.setAxisHeader(col.spec.axesSpec[1], "cluster_" + string(i))
+ linkerAxisSpec["cluster_" + string(i)] = col.spec.axesSpec[1]
+ }
+ addedCols = true
+ }
+ }
+
+ // Add cluster size columns if available
+ if len(columns.getColumns("clusterSizes")) > 0 {
+ for i, col in columns.getColumns("clusterSizes") {
+ cloneTable.add(col, {header: "clusterSize." + string(i)})
+ addedCols = true
+ // Add the cluster axis header
+ for axisIdx, axis in col.spec.axesSpec {
+ if axis.name != datasetSpec.axesSpec[1].name {
+ cloneTable.setAxisHeader(axis, "clusterAxis_" + string(i) + "_" + string(axisIdx))
+ }
+ }
+ }
+ }

  // Continue only if we have at least one column
  // This condition prevents a transient intermittent error while filters are
  // being processed and possibly in other situations too
  if addedCols {
- // Run ptabler-based filtering (matches filter.py logic)
- filterResult := dataUtils.filterClonotypes(structuredMap, axisRenames, filterMap, datasetSpec)
- // Run sampling script if topClonotypes is defined
- finalClonotypesParquet := undefined
- if args.topClonotypes != undefined {
- sampleClones := exec.builder().
- software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.sample-clonotypes:main")).
- mem("16GiB").
- cpu(1).
- addFile("filteredClonotypes.parquet", filterResult.filteredParquet).
- arg("--input").arg("filteredClonotypes.parquet").
- arg("--n").arg(string(topClonotypes)).
- arg("--ranking-map").arg(string(json.encode(rankingMap))).
- arg("--out").arg("sampledClonotypes_top.csv").
- arg("--out-parquet").arg("sampledClonotypes_top.parquet").
- saveFile("sampledClonotypes_top.csv").
- saveFile("sampledClonotypes_top.parquet").
- printErrStreamToStdout().
- saveStdoutContent().
- cache(24 * 60 * 60 * 1000).
- run()
+ cloneTable.mem("16GiB")
+ cloneTable.cpu(1)
+ cloneTable = cloneTable.build()
+
+ // Use render.create to call the filter-and-sample template
+ filterSampleResult := render.create(filterAndSampleTpl, {
+ inputAnchor: args.inputAnchor,
+ cloneTable: cloneTable,
+ rankingOrder: args.rankingOrder,
+ rankingOrderDefault: args.rankingOrderDefault,
+ filters: args.filters,
+ filterMap: filterMap,
+ rankingMap: rankingMap,
+ datasetSpec: datasetSpec,
+ topClonotypes: args.topClonotypes
+ })
+
+ // Get the filtered clonotypes from the template result
+ outputs["sampledRows"] = filterSampleResult.output("sampledRows", 24 * 60 * 60 * 1000)
+
+ // Get the filtered and sampled clonotypes P-frame and CSV from the template result
+ finalClonotypesCsv := filterSampleResult.output("finalClonotypesCsv", 24 * 60 * 60 * 1000)
+ // outputs["sampledRows"] = filterSampleResult.output("sampledRows", 24 * 60 * 60 * 1000)
+
+ ////////// CDR3 Length Calculation //////////

- finalClonotypesCsv := sampleClones.getFile("sampledClonotypes_top.csv")
- sampledColumnsPf := xsv.importFile(finalClonotypesCsv, "csv",
- sampledColsConv.getColumns(datasetSpec, true), {cpu: 1, mem: "4GiB"})
- outputs["sampledRows"] = pframes.exportFrame(sampledColumnsPf)
- finalClonotypesParquet = sampleClones.getFile("sampledClonotypes_top.parquet")
- } else {
- // No sampling, use filtered parquet as final output
- finalClonotypesParquet = filterResult.filteredParquet
- outputs["sampledRows"] = pframes.exportFrame(filterResult.pframe)
+ cdr3SeqTable := pframes.tsvFileBuilder()
+ cdr3SeqTable.setAxisHeader(datasetSpec.axesSpec[1].name, "clonotypeKey")
+
+ // Must deal with multiple CDR3 sequences (two for each cell in single-cell data)
+ // The chain will be added to the header as cdr3Sequence.chain and used in the Python script
+ // Note that the chain is in spec.domain for single-cell data and in spec.axesSpec[0].domain for bulk data
+
+ // Helper function to add chain information to the headers dynamically
+ chainMapping := {
+ "IG": { "A": "Heavy", "B": "Light" },
+ "TCRAB": { "A": "TRA", "B": "TRB" },
+ "TCRGD": { "A": "TRG", "B": "TRD" }
  }
- ////////// CDR3 Length Calculation //////////
- cdr3Data := dataUtils.prepareCdr3Data(columns, datasetSpec, isSingleCell)
- cdr3SeqStructuredMap := cdr3Data.structuredMap
- cdr3SeqAxisRenames := cdr3Data.axisRenames
-
- // Build ptabler workflow
- wfCdr3Seq := pt.workflow().cacheInputs(24 * 60 * 60 * 1000)
- cdr3SeqProjection := []
- for origAxis, aliasName in cdr3SeqAxisRenames {
- cdr3SeqProjection = append(cdr3SeqProjection, pt.axis(origAxis).alias(aliasName))
+
+ makeHeaderName := func(col, baseHeaderName, isSingleCell) {
+ if isSingleCell {
+ chain := col.spec.domain["pl7.app/vdj/scClonotypeChain"] // e.g., "A", "B"
+ receptor := col.spec.axesSpec[0].domain["pl7.app/vdj/receptor"] // e.g., "IG", "TCRAB", "TCRGD"
+ chainLabel := chainMapping[receptor][chain]
+ return baseHeaderName + "." + chainLabel // e.g., "cdr3Sequence.Heavy"
+ } else {
+ // For bulk, if chain info is available (e.g. IGH, IGK, IGL)
+ chainFromDomain := col.spec.axesSpec[0].domain["pl7.app/vdj/chain"] // e.g. "IGH", "IGK"
+ if chainFromDomain != undefined {
+ return baseHeaderName + "." + chainFromDomain // e.g., "cdr3Sequence.IGH"
+ }
+ }
+ return baseHeaderName
+ };
+
+ // Process CDR3 sequences
+ cdr3Sequences := columns.getColumns("cdr3Sequences")
+
+ for col in cdr3Sequences {
+ headerName := makeHeaderName(col, "cdr3Sequence", isSingleCell)
+ if isSingleCell {
+ if col.spec.domain["pl7.app/vdj/scClonotypeChain/index"] == "primary" {
+ cdr3SeqTable.add(col, {header: headerName})
+ }
+ } else {
+ cdr3SeqTable.add(col, {header: headerName})
+ }
  }
- for colName, _ in cdr3SeqStructuredMap {
- cdr3SeqProjection = append(cdr3SeqProjection, pt.col(colName))
+
+ // Process V genes
+ vGenes := columns.getColumns("VGenes")
+
+ for col in vGenes {
+ headerName := makeHeaderName(col, "vGene", isSingleCell)
+ cdr3SeqTable.add(col, {header: headerName})
  }

- dfCdr3Seq := wfCdr3Seq.frame(pt.p.full(cdr3SeqStructuredMap)).select(cdr3SeqProjection...)
- dfCdr3Seq.save("cdr3_sequences.parquet")
- cdr3SeqResult := wfCdr3Seq.run()
- cdr3SeqParquet := cdr3SeqResult.getFile("cdr3_sequences.parquet")
+ // Process J genes
+ jGenes := columns.getColumns("JGenes")
+
+ for col in jGenes {
+ headerName := makeHeaderName(col, "jGene", isSingleCell)
+ cdr3SeqTable.add(col, {header: headerName})
+ }
+
+ cdr3SeqTable.mem("16GiB")
+ cdr3SeqTable.cpu(1)
+ cdr3SeqTableBuilt := cdr3SeqTable.build()

  cdr3VspectratypeCmd := exec.builder().
  software(assets.importSoftware("@platforma-open/milaboratories.top-antibodies.spectratype:main")).
  mem("16GiB").
  cpu(1).
- addFile("cdr3_sequences_input.parquet", cdr3SeqParquet).
- arg("--input").arg("cdr3_sequences_input.parquet").
+ addFile("cdr3_sequences_input.tsv", cdr3SeqTableBuilt).
+ arg("--input_tsv").arg("cdr3_sequences_input.tsv").
  arg("--spectratype_tsv").arg("spectratype.tsv").
  arg("--vj_usage_tsv").arg("vj_usage.tsv") // no trailing dot here: the builder chain continues conditionally below

  // Add the top clonotypes argument and file to the builder if provided
- if finalClonotypesParquet != undefined {
+ if finalClonotypesCsv != undefined {
  cdr3VspectratypeCmd = cdr3VspectratypeCmd.
- arg("--final_clonotypes_parquet").arg("finalClonotypes.parquet").
- addFile("finalClonotypes.parquet", finalClonotypesParquet)
+ arg("--final_clonotypes_csv").arg("finalClonotypes.csv").
+ addFile("finalClonotypes.csv", finalClonotypesCsv)
  }

  cdr3VspectratypeCmd = cdr3VspectratypeCmd. // continue building the command
@@ -216,16 +356,18 @@ wf.body(func(args) {
  cache(24 * 60 * 60 * 1000).
  run()

+
  // Spectratype PFrame structure is [chain][cdr3Length][vGene] -> count
+
  cdr3VspectratypePf := xsv.importFile(cdr3VspectratypeCmd.getFile("spectratype.tsv"),
  "tsv", spectratypeConv.getColumns(),
- {cpu: 1, mem: "4GiB"})
+ {cpu: 1, mem: "16GiB"})
  outputs["cdr3VspectratypePf"] = pframes.exportFrame(cdr3VspectratypePf)

  // For vjUsage structure is [chain][vGene][jGene] -> count
  vjUsagePf := xsv.importFile(cdr3VspectratypeCmd.getFile("vj_usage.tsv"),
  "tsv", vjUsageConv.getColumns(),
- {cpu: 1, mem: "4GiB"})
+ {cpu: 1, mem: "16GiB"})
  outputs["vjUsagePf"] = pframes.exportFrame(vjUsagePf)

  if args.kabatNumbering == true {
@@ -236,7 +378,7 @@ wf.body(func(args) {

  seqCols := columns.getColumns("assemblingAaSeqs")
  for col in seqCols {
- headerName := dataUtils.makeHeaderName(col, "assemblingFeature", isSingleCell)
+ headerName := makeHeaderName(col, "assemblingFeature", isSingleCell)
  assemSeqTable.add(col, {header: headerName})
  }

@@ -260,7 +402,7 @@ wf.body(func(args) {
  assem := render.create(assemFastaTpl, {
  inputTsv: assemSeqTableBuilt,
  keyColumn: "clonotypeKey",
- finalClonotypesParquet: finalClonotypesParquet,
+ finalClonotypesCsv: finalClonotypesCsv,
  isSingleCell: isSingleCell,
  bulkChain: bulkChain
  })
@@ -48,4 +48,3 @@ getColumns := func(datasetSpec, addRanking) {
  export ll.toStrict({
  getColumns: getColumns
  })
-