@platforma-open/milaboratories.sequence-properties.workflow 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +17 -0
- package/CHANGELOG.md +40 -0
- package/dist/index.cjs +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +5 -0
- package/dist/tengo/lib/messages.lib.tengo +58 -0
- package/dist/tengo/tpl/main.plj.gz +0 -0
- package/dist/tengo/tpl/process.plj.gz +0 -0
- package/format.el +43 -0
- package/index.d.ts +4 -0
- package/index.js +3 -0
- package/package.json +22 -0
- package/src/main.tpl.tengo +365 -0
- package/src/messages.lib.tengo +58 -0
- package/src/process.tpl.tengo +474 -0
- package/src/wf.test.ts +9 -0
- package/tsconfig.json +16 -0
- package/vitest.config.mts +9 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
WARN Issue while reading "/home/runner/work/sequence-properties/sequence-properties/.npmrc". Failed to replace env in config: ${NPMJS_TOKEN}
|
|
2
|
+
|
|
3
|
+
> @platforma-open/milaboratories.sequence-properties.workflow@1.1.1 build /home/runner/work/sequence-properties/sequence-properties/workflow
|
|
4
|
+
> shx rm -rf dist && pl-tengo check && pl-tengo build
|
|
5
|
+
|
|
6
|
+
info: Skipping unknown file type: wf.test.ts
|
|
7
|
+
Processing "src/main.tpl.tengo"...
|
|
8
|
+
Processing "src/messages.lib.tengo"...
|
|
9
|
+
Processing "src/process.tpl.tengo"...
|
|
10
|
+
No syntax errors found.
|
|
11
|
+
info: Skipping unknown file type: wf.test.ts
|
|
12
|
+
info: Compiling 'dist'...
|
|
13
|
+
info: - writing /home/runner/work/sequence-properties/sequence-properties/workflow/dist/tengo/lib/messages.lib.tengo
|
|
14
|
+
info: - writing /home/runner/work/sequence-properties/sequence-properties/workflow/dist/tengo/tpl/process.plj.gz
|
|
15
|
+
info: - writing /home/runner/work/sequence-properties/sequence-properties/workflow/dist/tengo/tpl/main.plj.gz
|
|
16
|
+
info: Template Pack build done.
|
|
17
|
+
info: Template Pack build done.
|
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# @platforma-open/milaboratories.sequence-properties.workflow
|
|
2
|
+
|
|
3
|
+
## 1.1.1
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- bb07f98: Rename all package scopes from `MiLaboratories.sequence-properties` to `milaboratories.sequence-properties`. npm registry rejects new package names with uppercase letters, which blocked the first publish. Lowercase form aligns with the existing `@platforma-open/milaboratories.*` convention used by sibling blocks. Also corrects the GitHub URL in the block manifest to point at the actual repo (`platforma-open/sequence-properties`).
|
|
8
|
+
|
|
9
|
+
## 1.1.0
|
|
10
|
+
|
|
11
|
+
### Minor Changes
|
|
12
|
+
|
|
13
|
+
- 1059d80: Initial release of the Sequence Properties block.
|
|
14
|
+
|
|
15
|
+
Computes physico-chemical properties (charge, pI, GRAVY, MW, extinction
|
|
16
|
+
coefficients, instability and aliphatic indices, aromaticity, AA composition)
|
|
17
|
+
for peptide and antibody/TCR sequence inputs. The block auto-detects modality
|
|
18
|
+
from the input axes and degrades gracefully on partial coverage: CDR3
|
|
19
|
+
properties when CDR3 is present, full-chain VH/VL when all seven IMGT regions
|
|
20
|
+
are exported, and Fv-level properties when both chains reconstruct. An R11c
|
|
21
|
+
heuristic flags likely VHH/single-domain inputs.
|
|
22
|
+
|
|
23
|
+
Property math uses BioPython ProtParam + IsoelectricPoint with IPC 2.0 pKa
|
|
24
|
+
overrides — peptide set for peptide and CDR3 inputs, protein set for full
|
|
25
|
+
VH/VL. Charge and pI round to 3 decimals at the output boundary; combined
|
|
26
|
+
with sorted Tengo iteration, canonical-JSON resources, and sorted TSV writes,
|
|
27
|
+
output bytes hash identically across runs so the block joins the dedup path.
|
|
28
|
+
|
|
29
|
+
M3 validation is locked down by `tests/unit/test_m3_validation.py` (38 cases:
|
|
30
|
+
≥5 VH pI, ≥2 VL pI, Fv on ≥2 paired chains, ≥10 CDR-H3 charge, ≥3 CDR-L3
|
|
31
|
+
charge, ≥3 VH aliphatic) against pinned IPC 2.0 webserver values and an
|
|
32
|
+
independent Henderson-Hasselbalch reference.
|
|
33
|
+
|
|
34
|
+
Block title is the static "Sequence Properties"; the selected input dataset
|
|
35
|
+
appears as the subtitle.
|
|
36
|
+
|
|
37
|
+
### Patch Changes
|
|
38
|
+
|
|
39
|
+
- Updated dependencies [1059d80]
|
|
40
|
+
- @platforma-open/MiLaboratories.sequence-properties.software@1.1.0
|
package/dist/index.cjs
ADDED
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
9
|
+
|
|
10
|
+
// Opening clause shared by both partial-region messages:
// "<present> of 7 required regions found for <chainLabel>".
partialRegionLead := func(present, chainLabel) {
	return "Partial-region input: " + string(present) + " of 7 required regions found for " + chainLabel
}

// Message for a chain that has CDR3 plus some, but not all, of the other regions.
// `present` is the number of regions found; `chainLabel` is an already-rendered label.
partialChainMissingFullChain := func(present, chainLabel) {
	consequence := " chain — full-chain properties not computed. "
	requirement := "All seven regions (FR1, CDR1, FR2, CDR2, FR3, CDR3, FR4) are required."
	return partialRegionLead(present, chainLabel) + consequence + requirement
}

// Message for a chain that has some regions but no CDR3.
// `present` is the number of regions found; `chainLabel` is an already-rendered label.
partialChainNoCdr3 := func(present, chainLabel) {
	consequence := " chain (CDR3 absent) — no per-chain properties computed. "
	requirement := "CDR3 is required for per-chain charge / hydrophobicity; " +
		"all seven regions are required for full-chain properties."
	return partialRegionLead(present, chainLabel) + consequence + requirement
}

// Message for an input where no usable VDJ region column was found.
noRecognizedColumns := func() {
	text := "No recognized VDJ region columns found in the input dataset."
	return text
}

// Message for an input that only carries CDR3.
cdr3OnlyInput := func() {
	finding := "CDR3-only input detected — full-chain properties not computed. "
	advice := "To enable them, use a MiXCR preset that exports all VDJ regions."
	return finding + advice
}

// Message for γδ TCR input.
gammaDeltaTcr := func() {
	finding := "γδ TCR input detected — displaying with γδ-specific labels; "
	caveat := "Fv columns are not computed for TCR inputs."
	return finding + caveat
}

// Message for an input whose receptor type could not be determined.
receptorNotDetected := func() {
	finding := "Receptor type not detected on the input dataset; defaulting to antibody labels. "
	advice := "Use a MiXCR preset that emits the receptor annotation if this is a TCR dataset."
	return finding + advice
}

// Message for a possible VHH / single-domain antibody input.
vhh := func() {
	finding := "Possible VHH/single-domain antibody input detected (heavy chain only; " +
		"CDR-H3 length distribution consistent with VHH). "
	caveat := "IgG-calibrated CDR-H3 length " +
		"thresholds (>15 aa elevated risk, >20 aa high risk) do not apply to VHH — " +
		"disregard these thresholds for nanobody libraries."
	return finding + caveat
}

export ll.toStrict({
	partialChainMissingFullChain: partialChainMissingFullChain,
	partialChainNoCdr3: partialChainNoCdr3,
	noRecognizedColumns: noRecognizedColumns,
	cdr3OnlyInput: cdr3OnlyInput,
	gammaDeltaTcr: gammaDeltaTcr,
	receptorNotDetected: receptorNotDetected,
	vhh: vhh
})
|
|
Binary file
|
|
Binary file
|
package/format.el
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
;; Formats every .tengo file under the src directory (recursively). Usage: emacs --script ./format.el
|
|
2
|
+
|
|
3
|
+
(defun install-go-mode ()
  "Make `go-mode' available in this Emacs session.

Registers the MELPA Stable archive, initializes the package
system, refreshes the archive contents when none are cached yet,
installs `go-mode' and finally loads it."
  (require 'package)
  (let ((melpa-stable '("melpa-stable" . "https://stable.melpa.org/packages/")))
    (add-to-list 'package-archives melpa-stable))
  (package-initialize)
  ;; Only hit the network for the archive index when nothing is cached.
  (when (null package-archive-contents)
    (package-refresh-contents))
  (package-install 'go-mode t)
  (require 'go-mode))
|
|
14
|
+
|
|
15
|
+
;; Restrict tabify to leading whitespace: spaces become tabs only at the beginning of lines.
|
|
16
|
+
(setq tabify-regexp "^\t* [ \t]+")
|
|
17
|
+
|
|
18
|
+
(defun format-file (file)
  "Reformat FILE in place using slightly modified Go indentation rules.

Trailing whitespace is removed, whitespace is tabified (subject
to `tabify-regexp'), the whole buffer is re-indented under
`go-mode', and the result is saved back to FILE.

The visiting buffer is killed afterwards, even when a step
signals an error, so formatting many files in one run does not
accumulate open buffers."
  (message "Format %s" file)
  (let ((buffer (find-file-noselect file)))
    (unwind-protect
        (with-current-buffer buffer
          (delete-trailing-whitespace)              ; strip trailing whitespace
          (go-mode)                                 ; Go indentation rules
          (tabify (point-min) (point-max))          ; spaces -> tabs, whole buffer
          (indent-region (point-min) (point-max))   ; re-indent whole buffer
          (save-buffer))                            ; write the result to FILE
      (when (buffer-live-p buffer)
        ;; Clear the modified flag first so `kill-buffer' never prompts in
        ;; a non-interactive run if an error left unsaved changes behind.
        (with-current-buffer buffer
          (set-buffer-modified-p nil))
        (kill-buffer buffer)))))
|
|
28
|
+
|
|
29
|
+
(install-go-mode)

;; Adjust stock go-mode: force `go--in-composite-literal-p' to always
;; return t by replacing its return value.
(advice-add 'go--in-composite-literal-p
            :filter-return
            (lambda (&rest _original-result) t))

;; Collect every file under src whose name ends in ".tengo".
(setq files (directory-files-recursively "src" "\\.tengo\\'"))

;; Format each collected file in turn.
(mapc #'format-file files)
|
|
43
|
+
|
package/index.d.ts
ADDED
package/index.js
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@platforma-open/milaboratories.sequence-properties.workflow",
|
|
3
|
+
"version": "1.1.1",
|
|
4
|
+
"description": "Block Workflow",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"dependencies": {
|
|
7
|
+
"@platforma-sdk/workflow-tengo": "5.16.0",
|
|
8
|
+
"@platforma-open/milaboratories.sequence-properties.software": "1.1.0"
|
|
9
|
+
},
|
|
10
|
+
"devDependencies": {
|
|
11
|
+
"@platforma-sdk/tengo-builder": "2.5.17",
|
|
12
|
+
"@platforma-sdk/test": "1.69.0"
|
|
13
|
+
},
|
|
14
|
+
"peerDependencies": {
|
|
15
|
+
"vitest": "*"
|
|
16
|
+
},
|
|
17
|
+
"scripts": {
|
|
18
|
+
"build": "shx rm -rf dist && pl-tengo check && pl-tengo build",
|
|
19
|
+
"test": "vitest",
|
|
20
|
+
"format": "/usr/bin/env emacs --script ./format.el"
|
|
21
|
+
}
|
|
22
|
+
}
|
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
// Sequence Properties — workflow root.
|
|
2
|
+
//
|
|
3
|
+
// Detects modality (peptide vs antibody/TCR), collects amino-acid sequence columns,
|
|
4
|
+
// builds a per-entity TSV, runs the Python computation step, and hands off to
|
|
5
|
+
// process.tpl.tengo for output PColumn construction.
|
|
6
|
+
|
|
7
|
+
wf := import("@platforma-sdk/workflow-tengo:workflow")
|
|
8
|
+
exec := import("@platforma-sdk/workflow-tengo:exec")
|
|
9
|
+
assets := import("@platforma-sdk/workflow-tengo:assets")
|
|
10
|
+
render := import("@platforma-sdk/workflow-tengo:render")
|
|
11
|
+
smart := import("@platforma-sdk/workflow-tengo:smart")
|
|
12
|
+
ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
13
|
+
pframes := import("@platforma-sdk/workflow-tengo:pframes")
|
|
14
|
+
canonical := import("@platforma-sdk/workflow-tengo:canonical")
|
|
15
|
+
maps := import("@platforma-sdk/workflow-tengo:maps")
|
|
16
|
+
constants := import("@platforma-sdk/workflow-tengo:constants")
|
|
17
|
+
messages := import(":messages")
|
|
18
|
+
|
|
19
|
+
processTpl := assets.importTemplate(":process")
|
|
20
|
+
|
|
21
|
+
// JSON resource with sorted-key canonical bytes. smart.createJsonResource uses
|
|
22
|
+
// Tengo's stdlib json.encode, which preserves Go's randomized map iteration —
|
|
23
|
+
// resource bytes vary across runs and the CID becomes non-deterministic,
|
|
24
|
+
// defeating dedup. canonical.encode sorts keys at every level so identical
|
|
25
|
+
// values always produce identical bytes.
|
|
26
|
+
// Wraps `value` in a JSON value resource whose bytes come from canonical.encode.
canonicalJsonResource := func(value) {
	encoded := canonical.encode(value)
	return smart.createValueResource(constants.RTYPE_JSON, encoded)
}
|
|
29
|
+
|
|
30
|
+
REQUIRED_FEATURES := ["FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"]
|
|
31
|
+
|
|
32
|
+
// Maps an axis spec to the processing mode it implies, or "" when the axis is
// not a recognized sequence key axis.
//
//   pl7.app/vdj/scClonotypeKey                          -> "antibody_tcr_legacy_sc"
//   pl7.app/vdj/cloneId, pl7.app/vdj/clonotypeKey       -> "antibody_tcr_legacy_bulk"
//   pl7.app/variantKey + peptide extractionRunId domain -> "peptide"
//   pl7.app/variantKey + vdj clonotypingRunId domain    -> "antibody_tcr_universal"
detectMode := func(axisSpec) {
	axisName := axisSpec.name

	if axisName == "pl7.app/vdj/scClonotypeKey" {
		return "antibody_tcr_legacy_sc"
	}
	if axisName == "pl7.app/vdj/cloneId" || axisName == "pl7.app/vdj/clonotypeKey" {
		return "antibody_tcr_legacy_bulk"
	}
	if axisName != "pl7.app/variantKey" {
		return ""
	}

	// Universal key axis: the domain decides; the peptide key is checked first.
	axisDomain := axisSpec.domain
	if axisDomain == undefined {
		return ""
	}
	if axisDomain["pl7.app/peptide/extractionRunId"] != undefined {
		return "peptide"
	}
	if axisDomain["pl7.app/vdj/clonotypingRunId"] != undefined {
		return "antibody_tcr_universal"
	}
	return ""
}
|
|
53
|
+
|
|
54
|
+
// Reports whether any element of `arr` equals `x`.
contains := func(arr, x) {
	found := false
	for _, item in arr {
		if item == x {
			found = true
			break
		}
	}
	return found
}
|
|
60
|
+
|
|
61
|
+
// Prepare step: registers the input anchor as "main" and requests three groups
// of amino-acid sequence columns keyed on the anchor's axis at index 1.
wf.prepare(func(args) {
	// Fresh axes array per request so the three queries never share one value.
	keyAxes := func() {
		return [{ anchor: "main", idx: 1 }]
	}

	bundleBuilder := wf.createPBundleBuilder()
	bundleBuilder.ignoreMissingDomains()
	bundleBuilder.addAnchor("main", args.inputAnchor)

	// Peptide sequence column (universal naming, post-peptide-extraction).
	peptideQuery := {
		axes: keyAxes(),
		name: "pl7.app/sequence",
		domain: {
			"pl7.app/feature": "peptide",
			"pl7.app/alphabet": "aminoacid"
		}
	}
	bundleBuilder.addMulti(peptideQuery, "peptideSequences")

	// VDJ region columns (legacy MiXCR path).
	// Filter on alphabet only — NOT on pl7.app/vdj/isAssemblingFeature, per spec R4.
	legacyVdjQuery := {
		axes: keyAxes(),
		name: "pl7.app/vdj/sequence",
		domain: {
			"pl7.app/alphabet": "aminoacid"
		}
	}
	bundleBuilder.addMulti(legacyVdjQuery, "vdjSequences")

	// Universal-naming VDJ region columns (forward compatibility, post-MiXCR migration).
	universalQuery := {
		axes: keyAxes(),
		name: "pl7.app/sequence",
		domain: {
			"pl7.app/alphabet": "aminoacid"
		}
	}
	bundleBuilder.addMulti(universalQuery, "universalSequences")

	return { columns: bundleBuilder.build() }
})
|
|
97
|
+
|
|
98
|
+
// Workflow body. Steps, in order:
//   1. find the key axis of the input anchor and derive the mode from it;
//   2. resolve the receptor type (axis domain, then per-column domains);
//   3. assemble the per-entity sequence TSV and record which regions each chain has;
//   4. classify chains (full / CDR3-only / partial) and collect user-facing messages;
//   5. run the Python step on input.tsv + plan.json;
//   6. render process.tpl.tengo and return its outputs plus the stderr log.
wf.body(func(args) {
	blockId := wf.blockId().getDataAsJson()
	bundle := args.columns
	datasetSpec := bundle.getSpec(args.inputAnchor)

	axes := datasetSpec.axesSpec
	if len(axes) == 0 {
		ll.panic("input anchor has no axes")
	}

	// R1a: first axis matching a recognized name + domain. Equivalent to
	// `axes[len-1]` on every observed `[sampleId, key]` input.
	keyAxisIdx := -1
	keyAxisSpec := undefined
	mode := ""
	for i, axisSpec in axes {
		m := detectMode(axisSpec)
		if m != "" {
			keyAxisIdx = i
			keyAxisSpec = axisSpec
			mode = m
			break
		}
	}
	if mode == "" {
		ll.panic("no recognized sequence key axis found; connect a peptide extraction or MiXCR clonotyping dataset")
	}

	infoMessages := []
	receptor := "IG" // R13b: default when receptor key absent
	receptorSeen := false

	// Spec deviation SD-003 — see docs/spec-deviations.md.
	// MiXCR places the receptor key on the clonotypeKey AXIS domain (the input
	// anchor's secondary axis), not on per-region sequence column domains. Read
	// from the axis first. The per-column check inside the loop below covers
	// non-MiXCR producers; note that a recognised value on a column overrides
	// whatever was read here.
	if keyAxisSpec.domain != undefined {
		axisR := keyAxisSpec.domain["pl7.app/vdj/receptor"]
		if axisR == "IG" || axisR == "TCRAB" || axisR == "TCRGD" {
			receptor = axisR
			receptorSeen = true
		}
	}

	chainsFound := {} // chain -> { feature -> 1 }

	seqTb := pframes.tsvFileBuilder()
	seqTb.setAxisHeader(keyAxisSpec, "entity_key")

	if mode == "peptide" {
		peptideCols := bundle.getColumns("peptideSequences")
		if len(peptideCols) == 0 {
			ll.panic("peptide mode detected but no peptide amino-acid sequence column was found in the input dataset")
		}
		// Only the first matching peptide column is used.
		seqTb.add(bundle.getColumn(peptideCols[0].key), { header: "sequence" })

	} else {
		// Legacy MiXCR sequences first; fall back to universal naming for forward
		// compatibility with the post-MiXCR-migration column shape.
		vdjCols := bundle.getColumns("vdjSequences")
		if len(vdjCols) == 0 {
			vdjCols = bundle.getColumns("universalSequences")
		}
		if len(vdjCols) == 0 {
			ll.panic("antibody/TCR mode detected but no amino-acid VDJ sequence columns found")
		}

		for s in vdjCols {
			d := s.spec.domain
			if d == undefined { continue }

			// The universal feature key, when present, takes precedence over the legacy one.
			feat := d["pl7.app/vdj/feature"]
			if d["pl7.app/feature"] != undefined { feat = d["pl7.app/feature"] }

			// Spec deviation SD-002 — see docs/spec-deviations.md.
			// MiXCR emits FR4 only as "FR4InFrame" (in-frame-filtered translation).
			// Normalise to "FR4" so the REQUIRED_FEATURES check and downstream
			// header naming treat it as the canonical FR4 region.
			if feat == "FR4InFrame" { feat = "FR4" }

			if !contains(REQUIRED_FEATURES, feat) { continue }

			// Spec deviation SD-001 — see docs/spec-deviations.md.
			// MiXCR single-cell emits primary + secondary alleles per chain. Spec assumes
			// one allele per chain slot; secondary alleles would collide on the TSV header.
			// Keep primary only.
			idx := d["pl7.app/vdj/scClonotypeChain/index"]
			if idx != undefined && idx != "primary" { continue }

			chain := d["pl7.app/vdj/scClonotypeChain"]
			if chain == undefined || chain == "" {
				// Bulk MiXCR data without chain annotation — assume primary chain "A".
				chain = "A"
			}

			// Same receptor value is expected on every input column; last seen wins.
			r := d["pl7.app/vdj/receptor"]
			if r == "IG" || r == "TCRAB" || r == "TCRGD" {
				receptor = r
				receptorSeen = true
			}

			header := chain + "_" + feat
			seqTb.add(bundle.getColumn(s.key), { header: header })

			if chainsFound[chain] == undefined { chainsFound[chain] = {} }
			chainsFound[chain][feat] = 1
		}
	}

	// Receptor-aware chain label for user-facing messages (R11b).
	// Chains other than "A"/"B" are returned unchanged.
	chainLabel := func(ch) {
		if receptor == "TCRAB" {
			if ch == "A" { return "alpha" }
			if ch == "B" { return "beta" }
		}
		if receptor == "TCRGD" {
			if ch == "A" { return "gamma" }
			if ch == "B" { return "delta" }
		}
		// IG / unknown — antibody convention.
		if ch == "A" { return "heavy" }
		if ch == "B" { return "light" }
		return ch
	}

	fullChain := {}      // chain -> true when all REQUIRED_FEATURES are present
	cdr3Only := {}       // chain -> true when CDR3 is the only region present
	chainsWithCdr3 := [] // every chain that has a CDR3 column
	if mode != "peptide" {
		// Sorted iteration so partial-region messages append in a stable order.
		chainKeys := maps.getKeys(chainsFound)
		for _, chain in chainKeys {
			feats := chainsFound[chain]
			present := 0
			for rf in REQUIRED_FEATURES {
				if feats[rf] { present += 1 }
			}
			if feats["CDR3"] {
				chainsWithCdr3 += [chain]
			}
			if present == len(REQUIRED_FEATURES) {
				fullChain[chain] = true
			} else if feats["CDR3"] && present == 1 {
				cdr3Only[chain] = true
			} else if feats["CDR3"] {
				infoMessages += [messages.partialChainMissingFullChain(present, chainLabel(chain))]
			} else {
				// Chain has 1-6 of 7 regions but lacks CDR3 — neither CDR3-mode nor
				// full-chain mode applies. Surface a message (R11b); otherwise the
				// user would see neither full-chain nor CDR3 columns and get no
				// explanation why.
				infoMessages += [messages.partialChainNoCdr3(present, chainLabel(chain))]
			}
		}
		anyChain := len(chainKeys) > 0
		if !anyChain {
			infoMessages += [messages.noRecognizedColumns()]
		}

		if len(cdr3Only) > 0 && len(fullChain) == 0 {
			infoMessages += [messages.cdr3OnlyInput()]
		}
		if receptor == "TCRGD" {
			infoMessages += [messages.gammaDeltaTcr()]
		}
		// R13b: warn when no recognised receptor was seen — defaults to IG.
		if !receptorSeen && anyChain {
			infoMessages += [messages.receptorNotDetected()]
		}
	}

	seqTb.mem("4GiB")
	seqTb.cpu(1)
	seqTable := seqTb.build()

	// Sorted lists feed both plan.json (Python step input) and the params
	// resource (process template input) — they must hash deterministically so
	// the CIDs land on the dedup path across runs of identical input.
	//
	// hasFv must be a real boolean. Tengo's `&&` yields the deciding operand,
	// so a bare `fullChain["A"]` lookup that misses would leave `undefined`
	// here (and in plan.json / params) instead of `false`; comparing with
	// `true` keeps the value strictly true/false.
	hasFv := mode != "peptide" && receptor == "IG" &&
		fullChain["A"] == true && fullChain["B"] == true
	chainList := maps.getKeys(chainsFound)
	fullChainList := maps.getKeys(fullChain)

	plan := {
		mode: mode,
		receptor: receptor,
		chains: chainList,
		fullChains: fullChainList,
		hasFv: hasFv
	}

	// Python step contract: reads input.tsv + plan.json; writes properties.tsv,
	// plus aa_fraction.tsv in peptide mode (empty body in antibody/TCR mode), plus
	// stats.json (dataset-level scalars consumed by the info layer — e.g. R11c
	// median CDR-H3 length per chain).
	soft := assets.importSoftware("@platforma-open/milaboratories.sequence-properties.software:compute-properties")
	pyRun := exec.builder().
		software(soft).
		mem("4GiB").
		cpu(1).
		addFile("input.tsv", seqTable).
		writeFile("plan.json", canonical.encode(plan)).
		arg("--input").arg("input.tsv").
		arg("--plan").arg("plan.json").
		arg("--output").arg("properties.tsv").
		arg("--aa-fraction").arg("aa_fraction.tsv").
		arg("--stats").arg("stats.json").
		saveFile("properties.tsv").
		saveFile("aa_fraction.tsv").
		saveFileContent("stats.json").
		saveStderrStream().
		run()

	propertiesTsv := pyRun.getFile("properties.tsv")
	aaFractionTsv := pyRun.getFile("aa_fraction.tsv")
	statsResource := pyRun.getFileContent("stats.json")
	processingLog := pyRun.getStderrStream()

	// NOTE(review): when no chain is full, the classification above always adds
	// at least one info message, so the "partial" branch below looks unreachable
	// and every non-full input is reported as "cdr3_only" — confirm this is the
	// intended tiering before relying on "partial" downstream.
	coverageTier := "peptide"
	if mode != "peptide" {
		if len(fullChainList) > 0 {
			coverageTier = "full_chain"
		} else if len(infoMessages) > 0 {
			coverageTier = "cdr3_only"
		} else {
			coverageTier = "partial"
		}
	}

	// Hand off to process template for column specs, pFrame export, and
	// info-blob assembly. The info blob depends on Python's stats output, so
	// it builds inside the render template rather than at workflow body time.
	processResult := render.create(processTpl, {
		blockId: blockId,
		propertiesTsv: propertiesTsv,
		aaFractionTsv: aaFractionTsv,
		stats: statsResource,
		params: canonicalJsonResource({
			datasetSpec: datasetSpec,
			keyAxisIdx: keyAxisIdx,
			mode: mode,
			receptor: receptor,
			chains: chainList,
			chainsWithCdr3: chainsWithCdr3,
			fullChains: fullChainList,
			hasFv: hasFv,
			coverageTier: coverageTier,
			infoMessages: infoMessages
		})
	})

	// Cache outputs for 24 hours (ms) to skip re-running identical work.
	propertiesPf := processResult.output("propertiesPf", 24 * 60 * 60 * 1000)
	exportPframe := processResult.output("exportPframe", 24 * 60 * 60 * 1000)
	infoBlob := processResult.output("info", 24 * 60 * 60 * 1000)

	return {
		outputs: {
			propertiesPf: propertiesPf,
			info: infoBlob,
			processingLog: processingLog
		},
		exports: {
			properties: exportPframe
		}
	}
})
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
// User-facing info messages emitted by the workflow.
|
|
2
|
+
//
|
|
3
|
+
// Centralised so the inventory of UX strings is scannable in one place.
|
|
4
|
+
// Each helper returns a single message string; callers append it to the
|
|
5
|
+
// running info-message list. Receptor/chain rendering is the caller's job —
|
|
6
|
+
// helpers accept already-rendered chain labels.
|
|
7
|
+
|
|
8
|
+
ll := import("@platforma-sdk/workflow-tengo:ll")
|
|
9
|
+
|
|
10
|
+
// Opening clause shared by both partial-region messages:
// "<present> of 7 required regions found for <chainLabel>".
partialRegionLead := func(present, chainLabel) {
	return "Partial-region input: " + string(present) + " of 7 required regions found for " + chainLabel
}

// Message for a chain that has CDR3 plus some, but not all, of the other regions.
// `present` is the number of regions found; `chainLabel` is an already-rendered label.
partialChainMissingFullChain := func(present, chainLabel) {
	consequence := " chain — full-chain properties not computed. "
	requirement := "All seven regions (FR1, CDR1, FR2, CDR2, FR3, CDR3, FR4) are required."
	return partialRegionLead(present, chainLabel) + consequence + requirement
}

// Message for a chain that has some regions but no CDR3.
// `present` is the number of regions found; `chainLabel` is an already-rendered label.
partialChainNoCdr3 := func(present, chainLabel) {
	consequence := " chain (CDR3 absent) — no per-chain properties computed. "
	requirement := "CDR3 is required for per-chain charge / hydrophobicity; " +
		"all seven regions are required for full-chain properties."
	return partialRegionLead(present, chainLabel) + consequence + requirement
}

// Message for an input where no usable VDJ region column was found.
noRecognizedColumns := func() {
	text := "No recognized VDJ region columns found in the input dataset."
	return text
}

// Message for an input that only carries CDR3.
cdr3OnlyInput := func() {
	finding := "CDR3-only input detected — full-chain properties not computed. "
	advice := "To enable them, use a MiXCR preset that exports all VDJ regions."
	return finding + advice
}

// Message for γδ TCR input.
gammaDeltaTcr := func() {
	finding := "γδ TCR input detected — displaying with γδ-specific labels; "
	caveat := "Fv columns are not computed for TCR inputs."
	return finding + caveat
}

// Message for an input whose receptor type could not be determined.
receptorNotDetected := func() {
	finding := "Receptor type not detected on the input dataset; defaulting to antibody labels. "
	advice := "Use a MiXCR preset that emits the receptor annotation if this is a TCR dataset."
	return finding + advice
}

// R11c — single-domain antibody (VHH / nanobody) heuristic.
vhh := func() {
	finding := "Possible VHH/single-domain antibody input detected (heavy chain only; " +
		"CDR-H3 length distribution consistent with VHH). "
	caveat := "IgG-calibrated CDR-H3 length " +
		"thresholds (>15 aa elevated risk, >20 aa high risk) do not apply to VHH — " +
		"disregard these thresholds for nanobody libraries."
	return finding + caveat
}

export ll.toStrict({
	partialChainMissingFullChain: partialChainMissingFullChain,
	partialChainNoCdr3: partialChainNoCdr3,
	noRecognizedColumns: noRecognizedColumns,
	cdr3OnlyInput: cdr3OnlyInput,
	gammaDeltaTcr: gammaDeltaTcr,
	receptorNotDetected: receptorNotDetected,
	vhh: vhh
})
|
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
// Process — receives the property TSV(s) from the Python step and the column-emission
|
|
2
|
+
// plan from main.tpl.tengo, builds output PColumn specs per the project's pcolumn-spec.md,
|
|
3
|
+
// imports the file as a pFrame, and returns the result and a sliced export pFrame.
|
|
4
|
+
|
|
5
|
+
self := import("@platforma-sdk/workflow-tengo:tpl")
|
|
6
|
+
xsv := import("@platforma-sdk/workflow-tengo:pframes.xsv")
|
|
7
|
+
pframes := import("@platforma-sdk/workflow-tengo:pframes")
|
|
8
|
+
pSpec := import("@platforma-sdk/workflow-tengo:pframes.spec")
|
|
9
|
+
maps := import("@platforma-sdk/workflow-tengo:maps")
|
|
10
|
+
smart := import("@platforma-sdk/workflow-tengo:smart")
|
|
11
|
+
canonical := import("@platforma-sdk/workflow-tengo:canonical")
|
|
12
|
+
constants := import("@platforma-sdk/workflow-tengo:constants")
|
|
13
|
+
messages := import(":messages")
|
|
14
|
+
|
|
15
|
+
self.defineOutputs("propertiesPf", "exportPframe", "info")
|
|
16
|
+
|
|
17
|
+
// Receptor + chain → human label fragments (CDR3 / full-chain).
|
|
18
|
+
// Spec R13a: PColumn name and chain domain are unchanged; only the label varies.
|
|
19
|
+
// Returns the label fragments `{ cdr3, fullChain }` for a receptor/chain pair.
// TCRAB and TCRGD get Greek-letter labels for chains "A" and "B"; every other
// combination falls through to the antibody convention, where "A" is heavy
// and anything else is labelled as light.
labelFragments := func(receptor, chain) {
	isAlphaBeta := receptor == "TCRAB"
	isGammaDelta := receptor == "TCRGD"

	if isAlphaBeta && chain == "A" {
		return { cdr3: "CDR-α3", fullChain: "Vα" }
	}
	if isAlphaBeta && chain == "B" {
		return { cdr3: "CDR-β3", fullChain: "Vβ" }
	}
	if isGammaDelta && chain == "A" {
		return { cdr3: "CDR-γ3", fullChain: "Vγ" }
	}
	if isGammaDelta && chain == "B" {
		return { cdr3: "CDR-δ3", fullChain: "Vδ" }
	}

	// IG / unknown — antibody convention.
	if chain != "A" {
		return { cdr3: "CDR-L3", fullChain: "VL" }
	}
	return { cdr3: "CDR-H3", fullChain: "VH" }
}
|
|
32
|
+
|
|
33
|
+
// Build a single output column descriptor consumed by xsv.importFile.
|
|
34
|
+
// `tsvCol` is the TSV column header emitted by Python (e.g. "charge_peptide", "charge_A_CDR3").
|
|
35
|
+
// Clones the caller's `annotations` dict — mutating it in place would stamp
|
|
36
|
+
// the label into shared references if any caller ever reused the literal,
|
|
37
|
+
// the same aliasing footgun that the export-domain clone below already guards.
|
|
38
|
+
// Builds one output column descriptor.
//   tsvCol      - TSV column header; used for both `column` and `id`
//   valName     - spec name of the column
//   valueType   - spec value type
//   label       - stored under the "pl7.app/label" annotation
//   domain      - spec domain, passed through as-is
//   annotations - optional extra annotations; copied, never mutated
makeCol := func(tsvCol, valName, valueType, label, domain, annotations) {
	// Copy into a fresh map so the label never leaks into the caller's map.
	labelled := {}
	if annotations {
		for key, value in annotations {
			labelled[key] = value
		}
	}
	// The explicit label always wins over a label present in `annotations`.
	labelled["pl7.app/label"] = label

	return {
		column: tsvCol,
		id: tsvCol,
		naRegex: "",
		allowNA: true,
		spec: {
			name: valName,
			valueType: valueType,
			domain: domain,
			annotations: labelled
		}
	}
}
|
|
54
|
+
|
|
55
|
+
self.body(func(args) {
|
|
56
|
+
blockId := args.blockId
|
|
57
|
+
propertiesTsv := args.propertiesTsv
|
|
58
|
+
params := args.params
|
|
59
|
+
|
|
60
|
+
datasetSpec := params.datasetSpec
|
|
61
|
+
keyAxisIdx := params.keyAxisIdx
|
|
62
|
+
mode := params.mode
|
|
63
|
+
receptor := params.receptor
|
|
64
|
+
chains := params.chains
|
|
65
|
+
chainsWithCdr3 := params.chainsWithCdr3
|
|
66
|
+
fullChains := params.fullChains
|
|
67
|
+
hasFv := params.hasFv
|
|
68
|
+
coverageTier := params.coverageTier
|
|
69
|
+
infoMessages := params.infoMessages
|
|
70
|
+
|
|
71
|
+
stats := args.stats.getDataAsJson()
|
|
72
|
+
medians := stats.medianCdr3Length
|
|
73
|
+
|
|
74
|
+
// R11c — the CDR-H3 length risk thresholds are calibrated on IgG, so they do
// not fit single-domain antibodies (nanobodies / VHH). When the dataset looks
// like VHH — IG receptor, chain "A" being the only chain with CDR3, and a
// median CDR3 length for that chain of at least 16 aa — an info message is
// added to the list.
heavyOnly := receptor == "IG" && len(chainsWithCdr3) == 1 && chainsWithCdr3[0] == "A"
if heavyOnly {
    medianH3 := medians["A"]
    if medianH3 != undefined && medianH3 >= 16 {
        infoMessages = infoMessages + [messages.vhh()]
    }
}

// Run summary, canonically encoded and stored as a JSON value resource.
infoPayload := {
    mode: mode,
    receptor: receptor,
    coverageTier: coverageTier,
    messages: infoMessages
}
infoBlob := smart.createValueResource(constants.RTYPE_JSON, canonical.encode(infoPayload))
|
|
90
|
+
|
|
91
|
+
// Column descriptors are accumulated here by the mode-specific code below.
columns := []

// One key axis: the dataset axis selected by keyAxisIdx, read from the
// "entity_key" TSV column.
keyAxisSpec := datasetSpec.axesSpec[keyAxisIdx]
axes := [
    { column: "entity_key", spec: keyAxisSpec }
]
|
|
95
|
+
|
|
96
|
+
if mode == "peptide" {
|
|
97
|
+
// Peptide mode — nine scalar property columns, all sharing the domain
// `pl7.app/feature: "peptide"`. They are appended in a single step, in
// descending table order priority.
peptideDomain := { "pl7.app/feature": "peptide" }

columns += [
    makeCol("charge_peptide", "pl7.app/charge", "Double",
        "Net Charge (pH 7)", peptideDomain, {
            "pl7.app/format": ".2f",
            "pl7.app/isScore": "true",
            "pl7.app/description": "Net charge at pH 7 (Henderson-Hasselbalch, IPC 2.0 peptide pKa set). Positive = net basic (Arg, Lys, His dominate); negative = net acidic (Asp, Glu dominate). No universal preferred direction.",
            "pl7.app/table/visibility": "default",
            "pl7.app/table/orderPriority": "70000"
        }),
    makeCol("gravy_peptide", "pl7.app/hydrophobicity", "Double",
        "Hydrophobicity (GRAVY)", peptideDomain, {
            "pl7.app/format": ".3f",
            "pl7.app/isScore": "true",
            "pl7.app/score/rankingOrder": "increasing",
            "pl7.app/description": "rankingOrder: increasing reflects preference for lower hydrophobicity. Invert direction in Lead Selection for hydrophobic-target applications.",
            "pl7.app/table/visibility": "default",
            "pl7.app/table/orderPriority": "69900"
        }),
    makeCol("mw_peptide", "pl7.app/molecularWeight", "Double",
        "Molecular Weight (Da, average masses)", peptideDomain, {
            "pl7.app/format": ".1f",
            "pl7.app/min": "0",
            "pl7.app/table/visibility": "default",
            "pl7.app/table/orderPriority": "69800"
        }),
    makeCol("pi_peptide", "pl7.app/isoelectricPoint", "Double",
        "Isoelectric Point (pI)", peptideDomain, {
            "pl7.app/format": ".2f",
            "pl7.app/min": "0",
            "pl7.app/max": "14",
            "pl7.app/table/visibility": "default",
            "pl7.app/table/orderPriority": "69700"
        }),
    makeCol("eox_peptide", "pl7.app/extinctionCoefficientOx", "Double",
        "Extinction Coeff., Oxidized (M⁻¹cm⁻¹)", peptideDomain, {
            "pl7.app/format": ".0f",
            "pl7.app/min": "0",
            "pl7.app/description": "Assumes all Cys are in disulfide bonds. For unprotected linear peptides use the reduced form.",
            "pl7.app/table/visibility": "optional",
            "pl7.app/table/orderPriority": "69600"
        }),
    makeCol("ered_peptide", "pl7.app/extinctionCoefficientRed", "Double",
        "Extinction Coeff., Reduced (M⁻¹cm⁻¹)", peptideDomain, {
            "pl7.app/format": ".0f",
            "pl7.app/min": "0",
            "pl7.app/description": "Extinction coefficient at 280 nm, disulfide bonds reduced (Cys contribution omitted). A value of 0 means no Tyr or Trp — A280-based quantification is not possible.",
            "pl7.app/table/visibility": "optional",
            "pl7.app/table/orderPriority": "69500"
        }),
    makeCol("instability_peptide", "pl7.app/instabilityIndex", "Double",
        "Instability Index", peptideDomain, {
            "pl7.app/format": ".2f",
            "pl7.app/description": "Guruprasad index — derived from globular proteins. The II > 40 threshold does not apply to short linear peptides; use as a relative composition ranking aid only.",
            "pl7.app/table/visibility": "default",
            "pl7.app/table/orderPriority": "69400"
        }),
    makeCol("aliphatic_peptide", "pl7.app/aliphaticIndex", "Double",
        "Aliphatic Index", peptideDomain, {
            "pl7.app/format": ".1f",
            "pl7.app/min": "0",
            "pl7.app/description": "Measures fraction of nonpolar aliphatic residues (Ala, Val, Ile, Leu). For short linear peptides, thermostability interpretation does not apply — the Ikai index was derived for globular mesophilic enzymes, and thermostability is not a meaningful concept for unstructured peptides. Useful as a composition indicator and a proxy for hydrophobic character alongside GRAVY — both metrics increase with Ala/Val/Ile/Leu content, but neither has a universal preferred direction for therapeutic peptides.",
            "pl7.app/table/visibility": "optional",
            "pl7.app/table/orderPriority": "69300"
        }),
    makeCol("aromaticity_peptide", "pl7.app/aromaticity", "Double",
        "Aromaticity", peptideDomain, {
            "pl7.app/format": ".3f",
            "pl7.app/min": "0",
            "pl7.app/max": "1",
            "pl7.app/description": "Fraction of aromatic residues (Phe, Trp, Tyr).",
            "pl7.app/table/visibility": "optional",
            "pl7.app/table/orderPriority": "69200"
        })
]
|
|
180
|
+
|
|
181
|
+
} else {
|
|
182
|
+
// Antibody/TCR mode — CDR3 columns per chain here; full-chain columns (when
// present) and Fv columns (when paired) follow.
// Chain A (CDR-H3) and chain B (CDR-L3) get different descriptions per
// pcolumn-spec.md, because they carry different developability signals.
chargeNotes := {
    A: "Strongly positive CDR3 charge correlates with polyreactivity via electrostatic interactions. No universal preferred direction in Lead Selection. IPC 2.0 peptide pKa set.",
    B: "Strongly positive CDR-L3 charge contributes to paratope polyreactivity. Strongly negative charge is primarily a PK concern. No universal preferred direction. IPC 2.0 peptide pKa set."
}
gravyNotes := {
    A: "Lower hydrophobicity preferred for developability. CDR3 GRAVY > 0 is an informal aggregation/polyreactivity heuristic.",
    B: "Same aggregation and polyreactivity signal as CDR-H3 hydrophobicity; lower independent predictive weight. The TAP score uses combined 6-CDR GRAVY — CDR-L3 alone has limited independent validation."
}
chainAPriority := 68000
otherChainPriority := 67700

for chain in chains {
    names := labelFragments(receptor, chain)
    // Both columns of a chain share this one domain map.
    cdr3Domain := { "pl7.app/feature": "CDR3", "pl7.app/vdj/scClonotypeChain": chain }

    // Charge takes the chain's base priority, GRAVY comes right after it.
    chargePriority := otherChainPriority
    if chain == "A" {
        chargePriority = chainAPriority
    }
    gravyPriority := chargePriority - 100

    columns += [
        makeCol("charge_" + chain + "_CDR3", "pl7.app/charge", "Double",
            names.cdr3 + " Net Charge (pH 7)", cdr3Domain, {
                "pl7.app/format": ".2f",
                "pl7.app/isScore": "true",
                "pl7.app/description": chargeNotes[chain],
                "pl7.app/table/visibility": "default",
                "pl7.app/table/orderPriority": string(chargePriority)
            }),
        makeCol("gravy_" + chain + "_CDR3", "pl7.app/hydrophobicity", "Double",
            names.cdr3 + " Hydrophobicity (GRAVY)", cdr3Domain, {
                "pl7.app/format": ".3f",
                "pl7.app/isScore": "true",
                "pl7.app/score/rankingOrder": "increasing",
                "pl7.app/description": gravyNotes[chain],
                "pl7.app/table/visibility": "default",
                "pl7.app/table/orderPriority": string(gravyPriority)
            })
    ]
}
|
|
219
|
+
|
|
220
|
+
// Full-chain columns — nine per chain listed in fullChains. Table order
// priorities step down by 100 from a per-chain base (67000 for chain "A",
// 66000 for any other chain).
chainABase := 67000
otherChainBase := 66000

for chain in fullChains {
    chainLabel := labelFragments(receptor, chain).fullChain
    // All nine columns of a chain share this one domain map.
    chainDomain := { "pl7.app/feature": "VDJRegion", "pl7.app/vdj/scClonotypeChain": chain }

    base := otherChainBase
    if chain == "A" {
        base = chainABase
    }

    columns += [
        makeCol("charge_" + chain + "_VDJRegion", "pl7.app/charge", "Double",
            chainLabel + " Net Charge (pH 7)", chainDomain, {
                "pl7.app/format": ".2f",
                "pl7.app/isScore": "true",
                "pl7.app/description": "Non-monotonic vs developability: strongly positive correlates with polyreactivity; strongly negative with rapid clearance.",
                "pl7.app/table/visibility": "default",
                "pl7.app/table/orderPriority": string(base)
            }),
        makeCol("pi_" + chain + "_VDJRegion", "pl7.app/isoelectricPoint", "Double",
            chainLabel + " Isoelectric Point (pI)", chainDomain, {
                "pl7.app/format": ".2f",
                "pl7.app/isScore": "true",
                "pl7.app/min": "0",
                "pl7.app/max": "14",
                "pl7.app/table/visibility": "default",
                "pl7.app/table/orderPriority": string(base - 100)
            }),
        makeCol("gravy_" + chain + "_VDJRegion", "pl7.app/hydrophobicity", "Double",
            chainLabel + " Hydrophobicity (GRAVY)", chainDomain, {
                "pl7.app/format": ".3f",
                "pl7.app/description": "Framework regions dominate; weak developability signal at chain level — CDR3 hydrophobicity is more discriminating.",
                "pl7.app/table/visibility": "default",
                "pl7.app/table/orderPriority": string(base - 200)
            }),
        makeCol("mw_" + chain + "_VDJRegion", "pl7.app/molecularWeight", "Double",
            chainLabel + " Molecular Weight (Da, average masses)", chainDomain, {
                "pl7.app/format": ".1f",
                "pl7.app/min": "0",
                "pl7.app/description": "Unglycosylated sequence mass — does not include N-glycan contributions from any NXS/NXT sequons in the variable region.",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": string(base - 300)
            }),
        makeCol("eox_" + chain + "_VDJRegion", "pl7.app/extinctionCoefficientOx", "Double",
            chainLabel + " Extinction Coeff., Oxidized (M⁻¹cm⁻¹)", chainDomain, {
                "pl7.app/format": ".0f",
                "pl7.app/min": "0",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": string(base - 400)
            }),
        makeCol("ered_" + chain + "_VDJRegion", "pl7.app/extinctionCoefficientRed", "Double",
            chainLabel + " Extinction Coeff., Reduced (M⁻¹cm⁻¹)", chainDomain, {
                "pl7.app/format": ".0f",
                "pl7.app/min": "0",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": string(base - 500)
            }),
        makeCol("instability_" + chain + "_VDJRegion", "pl7.app/instabilityIndex", "Double",
            chainLabel + " Instability Index", chainDomain, {
                "pl7.app/format": ".2f",
                "pl7.app/description": "Guruprasad index, calibrated for in-vitro stability of soluble globular proteins via dipeptide composition. Weak predictor of antibody Tm — use as supplementary ranking aid.",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": string(base - 600)
            }),
        makeCol("aliphatic_" + chain + "_VDJRegion", "pl7.app/aliphaticIndex", "Double",
            chainLabel + " Aliphatic Index", chainDomain, {
                "pl7.app/format": ".1f",
                "pl7.app/min": "0",
                "pl7.app/description": "Ikai aliphatic index, derived from globular mesophilic enzymes. Weak correlation with antibody Tm. No rankingOrder — high values can correlate with aggregation propensity.",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": string(base - 700)
            }),
        makeCol("aromaticity_" + chain + "_VDJRegion", "pl7.app/aromaticity", "Double",
            chainLabel + " Aromaticity", chainDomain, {
                "pl7.app/format": ".3f",
                "pl7.app/min": "0",
                "pl7.app/max": "1",
                "pl7.app/description": "Fraction of aromatic residues (Phe, Trp, Tyr) over the full chain. Framework dominates; CDR-specific aromaticity is a stronger predictor (Phase 2).",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": string(base - 800)
            })
    ]
}
|
|
300
|
+
|
|
301
|
+
// Fv columns — emitted only when hasFv is set, i.e. both the VH and the VL
// full chains were reconstructed (antibody only). Five columns, all sharing
// the domain `pl7.app/feature: "Fv"`.
if hasFv {
    fvDomain := { "pl7.app/feature": "Fv" }

    columns += [
        makeCol("charge_Fv", "pl7.app/charge", "Double",
            "Fv Net Charge (pH 7)", fvDomain, {
                "pl7.app/format": ".2f",
                "pl7.app/isScore": "true",
                "pl7.app/table/visibility": "default",
                "pl7.app/table/orderPriority": "65100"
            }),
        makeCol("pi_Fv", "pl7.app/isoelectricPoint", "Double",
            "Fv Isoelectric Point (pI)", fvDomain, {
                "pl7.app/format": ".2f",
                "pl7.app/isScore": "true",
                "pl7.app/min": "0",
                "pl7.app/max": "14",
                "pl7.app/description": "Variable region (VH+VL) only. Fv pI is typically 2–4 pH units higher than whole-IgG cIEF measurements, which include constant regions (IgG1 Fc pI ≈ 5–6).",
                "pl7.app/table/visibility": "default",
                "pl7.app/table/orderPriority": "65000"
            }),
        makeCol("eox_Fv", "pl7.app/extinctionCoefficientOx", "Double",
            "Fv Extinction Coeff., Oxidized (M⁻¹cm⁻¹)", fvDomain, {
                "pl7.app/format": ".0f",
                "pl7.app/min": "0",
                "pl7.app/description": "Variable region (VH+VL) only — does not include constant regions. For whole-IgG A280 quantification, use the full-antibody ε.",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": "64900"
            }),
        makeCol("ered_Fv", "pl7.app/extinctionCoefficientRed", "Double",
            "Fv Extinction Coeff., Reduced (M⁻¹cm⁻¹)", fvDomain, {
                "pl7.app/format": ".0f",
                "pl7.app/min": "0",
                "pl7.app/description": "Variable region (VH+VL) only, disulfide bonds reduced (Cys contribution omitted). A value of 0 means no Tyr or Trp — A280-based quantification is not possible.",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": "64800"
            }),
        makeCol("mw_Fv", "pl7.app/molecularWeight", "Double",
            "Fv Molecular Weight (Da, average masses)", fvDomain, {
                "pl7.app/format": ".1f",
                "pl7.app/min": "0",
                "pl7.app/description": "Unglycosylated sequence mass (VH + VL).",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": "64700"
            })
    ]
}
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
// Import spec for the full property table: every column built above.
outputSpecs := {
    axes: axes,
    columns: columns,
    storageFormat: "Parquet",
    partitionKeyLength: 0
}

// Export table: only the columns annotated with pl7.app/isScore == "true",
// each with pl7.app/blockId added to its domain so that downstream blocks can
// tell runs apart. The domain is copied into a new map and the descriptor is
// rebuilt; writing into col.spec.domain directly would also change the
// descriptors referenced from outputSpecs.columns.
exportColumns := []
for col in columns {
    ann := col.spec.annotations
    if !ann || ann["pl7.app/isScore"] != "true" {
        continue
    }

    stampedDomain := {}
    if col.spec.domain {
        for key, value in col.spec.domain {
            stampedDomain[key] = value
        }
    }
    stampedDomain["pl7.app/blockId"] = blockId

    exportSpec := {
        name: col.spec.name,
        valueType: col.spec.valueType,
        domain: stampedDomain,
        annotations: ann
    }
    exportColumns += [{
        column: col.column,
        id: col.id,
        naRegex: col.naRegex,
        allowNA: col.allowNA,
        spec: exportSpec
    }]
}

exportSpecs := {
    axes: axes,
    columns: exportColumns,
    storageFormat: "Parquet",
    partitionKeyLength: 0
}

// The same TSV is imported twice, once per spec; each call gets its own
// options map.
importProperties := func(specs) {
    return xsv.importFile(propertiesTsv, "tsv", specs, { splitDataAndSpec: true, cpu: 1, mem: "4GiB" })
}
scalarOut := importProperties(outputSpecs)
exportOut := importProperties(exportSpecs)
|
|
393
|
+
|
|
394
|
+
// Trace built from the dataset spec and this block's step; it is injected
// into every column spec added to the output pFrames.
traceStep := {
    type: "milaboratories.sequence-properties",
    importance: 30,
    label: "Sequence Properties",
    id: blockId
}
trace := pSpec.makeTrace(datasetSpec, traceStep)
|
|
400
|
+
|
|
401
|
+
// AA fraction — peptide mode only (R7). The table has two axes,
// [key axis, aminoAcid], and is read from a long-format TSV with the columns
// entity_key, aminoAcid, value. Per R7 the aminoAcid axis holds the 20
// standard single-letter codes. In any other mode aaOut stays undefined.
aaOut := undefined
if mode == "peptide" {
    aminoAcidAxis := {
        column: "aminoAcid",
        spec: {
            name: "pl7.app/aminoAcid",
            type: "String",
            annotations: { "pl7.app/label": "Amino Acid" }
        }
    }
    fractionColumn := {
        column: "value",
        id: "aaFraction",
        naRegex: "",
        allowNA: true,
        spec: {
            name: "pl7.app/aaFraction",
            valueType: "Double",
            domain: { "pl7.app/feature": "peptide" },
            annotations: {
                "pl7.app/label": "AA Fraction",
                "pl7.app/format": ".3f",
                "pl7.app/min": "0",
                "pl7.app/max": "1",
                "pl7.app/table/visibility": "optional",
                "pl7.app/table/orderPriority": "69000"
            }
        }
    }
    aaOut = xsv.importFile(args.aaFractionTsv, "tsv", {
        axes: [{ column: "entity_key", spec: keyAxisSpec }, aminoAcidAxis],
        columns: [fractionColumn],
        storageFormat: "Parquet",
        partitionKeyLength: 0
    }, { splitDataAndSpec: true, cpu: 1, mem: "4GiB" })
}
|
|
446
|
+
|
|
447
|
+
// Adds every entry of an xsv.importFile result to a pFrame builder, with the
// trace injected into each spec. Keys are visited in the order returned by
// maps.getKeys; the original note says this ordering keeps the pFrame
// resource bytes stable across runs (dedup).
addImported := func(builder, imported) {
    for _, key in maps.getKeys(imported) {
        entry := imported[key]
        builder.add(key, trace.inject(entry.spec), entry.data)
    }
}

// Combined output pFrame: scalar properties plus, in peptide mode only, the
// AA fraction table.
resultBuilder := pframes.pFrameBuilder()
addImported(resultBuilder, scalarOut)
if aaOut != undefined {
    addImported(resultBuilder, aaOut)
}
resultPframe := resultBuilder.build()

// Export pFrame: the score columns carrying the block id in their domain.
exportBuilder := pframes.pFrameBuilder()
addImported(exportBuilder, exportOut)
exportPframe := exportBuilder.build()

return {
    propertiesPf: pframes.exportFrame(resultPframe),
    exportPframe: exportPframe,
    info: infoBlob
}
|
|
474
|
+
})
|
package/src/wf.test.ts
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
// Placeholder suite: it lets vitest discover a test file in this package
// without reporting failures. Real workflow integration tests are to be added
// once the Python property computation is in place.
import { describe, it } from "vitest";

describe("sequence-properties workflow", function () {
  it.skip("computes peptide properties end-to-end", function () {
    // TODO: implement once compute_properties.py is real.
  });
});
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "es2022",
|
|
4
|
+
"module": "commonjs",
|
|
5
|
+
"moduleResolution": "node",
|
|
6
|
+
"esModuleInterop": true,
|
|
7
|
+
"strict": true,
|
|
8
|
+
"outDir": "./dist",
|
|
9
|
+
"rootDir": "./src",
|
|
10
|
+
"sourceMap": true,
|
|
11
|
+
"declaration": true
|
|
12
|
+
},
|
|
13
|
+
"types": [],
|
|
14
|
+
"include": ["src/**/*"],
|
|
15
|
+
"exclude": ["node_modules", "dist"]
|
|
16
|
+
}
|