cradle-sdk 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cradle/sdk/__init__.py +9 -0
- cradle/sdk/auth/__init__.py +0 -0
- cradle/sdk/auth/device.py +478 -0
- cradle/sdk/client.py +937 -0
- cradle/sdk/exceptions.py +11 -0
- cradle/sdk/types/__init__.py +0 -0
- cradle/sdk/types/assembly.py +8 -0
- cradle/sdk/types/common.py +62 -0
- cradle/sdk/types/data.py +353 -0
- cradle/sdk/types/task.py +985 -0
- cradle/sdk/types/workspace.py +48 -0
- cradle/sdk/uploader.py +445 -0
- cradle/sdk/utils.py +40 -0
- cradle_sdk-0.1.1.dist-info/METADATA +17 -0
- cradle_sdk-0.1.1.dist-info/RECORD +16 -0
- cradle_sdk-0.1.1.dist-info/WHEEL +4 -0
cradle/sdk/types/task.py
ADDED
|
@@ -0,0 +1,985 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from enum import StrEnum
|
|
3
|
+
from typing import Annotated, ClassVar, Literal
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from .assembly import AssemblyFeature
|
|
8
|
+
from .common import (
|
|
9
|
+
ArchivableResourceMixin,
|
|
10
|
+
BaseListResponse,
|
|
11
|
+
Context,
|
|
12
|
+
ErrorResponseMixin,
|
|
13
|
+
ResourceResponse,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Shared base class for the *Parameters task models defined below; declares no fields itself.
class ParametersBase(BaseModel):
    ...
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Table source variant tagged kind="TABLE"; carries a single `table` string.
class TableSourceTable(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["TABLE"] = "TABLE"
    # presumably the name of the table to read -- confirm against the API docs
    table: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Table source variant tagged kind="QUERY"; carries a single `query` string.
class TableSourceQuery(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["QUERY"] = "QUERY"
    # presumably the query text that yields the table -- dialect is not visible here
    query: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Table source variant tagged kind="TASK_RESULT"; identifies a table together with one or more task ids.
class TableSourceTaskResult(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["TASK_RESULT"] = "TASK_RESULT"
    table: str
    # Accepts either a single task id or a list of task ids.
    task_id: int | list[int]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# String-valued enum; each member's value equals its name.
class FeatureAnnotation(StrEnum):
    """Annotation to be configured per residue in a domain:

    UNRELIABLE: The evolutionary context is not reliable, eg in CDRs of antibodies.
    IGNORE: A part of the sequence that should not be used for evolutionary search and won't be mutated, this can be tags, linkers or other artificial subsequences.
    """

    # Evolutionary context is not reliable for these residues.
    UNRELIABLE = "UNRELIABLE"
    # Residues excluded from evolutionary search and never mutated.
    IGNORE = "IGNORE"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# One annotation plus the residue ranges it applies to.
class DomainFeatureItem(BaseModel):
    # Required: which FeatureAnnotation applies to the listed ranges.
    annotation: FeatureAnnotation = Field(
        description="Annotation for the residues. `UNRELIABLE`: do not look at the MSA to infer conservation-based features (e.g. CDRs) for the protein `IGNORE`: do not change this position at all (e.g. for linkers) and do not use to infer any conservation-based features."
    )
    # Required: half-open (start, end) pairs, 0-based, as stated in the description.
    ranges: list[tuple[int, int]] = Field(
        description="List of ranges where the annotation applies. Each range is a tuple of (start, end). Indices are 0-based and inclusive for the start and exclusive for the end. For example, (0, 5) blocks positions 0 to 4."
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Selects the base-model type; the field descriptions that use this enum say
# DEFAULT is for single-chain proteins and ANTIBODY for multi-chain antibodies.
class ProteinModelType(StrEnum):
    DEFAULT = "DEFAULT"
    ANTIBODY = "ANTIBODY"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# How assay values compare across batches (semantics as described on Assay.scale_type).
class ScaleType(StrEnum):
    # Comparable across batches after applying a multiplication factor.
    MULTIPLICATIVE = "MULTIPLICATIVE"
    # Comparable in magnitude across batches.
    ADDITIVE = "ADDITIVE"
    # Not comparable across batches; only the within-batch ranking is meaningful.
    RANK = "RANK"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class Assay(BaseModel):
    """Metadata of an assay."""

    # Required identifier and display name.
    assay_id: str = Field(description="ID of the assay")
    name: str = Field(description="Human readable name of the assay")
    # Required; the description below is a multi-line runtime string and is kept verbatim.
    scale_type: ScaleType = Field(
        description="""The scale type defines how the assay behaves.

* **Additive**: Assay values are comparable in magnitude over batches, a delta of eg 5 in one batch is equivalent to 5 in another batch.
* **Multiplicative**: Assay values are comparable over batches by applying a multiplication factor, eg fold improvement: `score_variantB_batch1 = factor(typically starting sequence score) * score_variantB_batch`.
* **Rank**: Assay values are not comparable over batches, we can only assume the ranking within a batch is correct."""
    )
    # Optional; omitted units default to None.
    unit: str | None = Field(
        description="Unit of the assay - used for display purposes only",
        default=None,
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Direction of optimisation; the constraint models in this file also use it to mean
# "above the threshold" (MAXIMIZE) or "below the threshold" (MINIMIZE).
class OptimizationDirection(StrEnum):
    MAXIMIZE = "MAXIMIZE"
    MINIMIZE = "MINIMIZE"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# The assay to optimise and the direction in which to optimise it; both fields are required.
class PrimaryObjective(BaseModel):
    assay_id: str = Field(description="The ID of the assay to be optimized.")
    # NOTE(review): the description reads "Whether the maximize" -- probably meant
    # "Whether to maximize". Left unchanged because it is runtime schema text.
    direction: OptimizationDirection = Field(description="Whether the maximize or minimize the assay value.")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Parameters for the "train/v1" task.
class TrainParameters(ParametersBase):
    # Task identifier, declared as a ClassVar rather than a model field.
    task_type: ClassVar[str] = "train/v1"

    # "TableInput" is a forward reference to a name not defined above this class.
    homologs: "TableInput" = Field(
        description="The table of homologous sequences to be used for training the base model."
    )
    # Keyed by protein sequence (per the outer description); defaults to an empty dict.
    domain_features: dict[
        str,
        Annotated[
            list[DomainFeatureItem],
            Field(
                description="List of domain annotations. A residue may be assigned zero or one annotations. Ranges which are not annotated are considered reliable, i.e. the MSA in those ranges is used to infer conservation-based features."
            ),
        ],
    ] = Field(
        description="Mapping of protein sequences to their respective domain features. Each range in the domain features must be a valid index into the corresponding protein sequence.",
        default_factory=dict,
    )
    protein_model_type: ProteinModelType = Field(
        description="Specifies the type of base model to use for training: use `DEFAULT` for single-chain proteins and `ANTIBODY` for multi-chain antibodies.",
        default=ProteinModelType.DEFAULT,
    )
    dataset: "TableInput" = Field(
        description="The name of the input table to be used for model training."
    )
    # Defaults to an empty list.
    assays: list[Assay] = Field(
        description="List of assay metadata entries",
        default_factory=list,
    )
    primary_objective: PrimaryObjective = Field(
        description="The primary objective on which the sampler model is conditioned. Should correspond to an assay in `dataset`."
    )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Constraint that compares an assay value against a fixed numeric threshold.
class AbsoluteConstraint(BaseModel):
    # Stored as `type_` in Python but aliased to "type" for validation and serialization.
    type_: Literal["ABSOLUTE_CONSTRAINT"] = Field(
        alias="type",
        validation_alias="type",
        serialization_alias="type",
        default="ABSOLUTE_CONSTRAINT",
    )
    assay_id: str = Field(description="The ID of the constrained assay.")
    direction: OptimizationDirection = Field(
        description="Whether the assay value should be above (MAXIMIZE) or below (MINIMIZE) the threshold."
    )
    threshold: float = Field(description="The threshold assay value.")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# Constraint that compares an assay value against the value of a reference sequence.
class RelativeConstraint(BaseModel):
    # Stored as `type_` in Python but aliased to "type" for validation and serialization.
    type_: Literal["RELATIVE_CONSTRAINT"] = Field(
        alias="type",
        validation_alias="type",
        serialization_alias="type",
        default="RELATIVE_CONSTRAINT",
    )
    assay_id: str = Field(description="The ID of the constrained assay.")
    direction: OptimizationDirection = Field(
        description="Whether the assay value of new sequences should be above (MAXIMIZE) or below (MINIMIZE) the assay value of the `relative_to` sequence."
    )
    relative_to: str = Field(
        description="The sequence relative to which new sequences are constrained."
    )
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Reference to an artifact by integer id; other models in this file use it for samplers, scorers and structures.
class ArtifactParam(BaseModel):
    artifact_id: int
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Amino acids that may not be mutated to, plus the ranges where that applies; both fields are required.
class BlockedAAItem(BaseModel):
    # A string of one or more letter codes; '*' blocks the position entirely (per the description).
    blocked_aas: str = Field(
        description="One or more amino acid letter code to be blocked from being mutated to. Use '*' to indicate that a position is fully blocked from being mutated."
    )
    # Half-open (start, end) pairs, 0-based.
    ranges: list[tuple[int, int]] = Field(
        description="List of ranges where the mutations are blocked. Each range is a tuple of (start, end). Indices are 0-based and inclusive for the start and exclusive for the end. For example, (0, 5) blocks positions 0 to 4."
    )
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# A motif that must not appear, plus the ranges in which it is blocked; both fields are required.
class BlockedMotifItem(BaseModel):
    blocked_motif: str = Field(description="Amino acid motif to be blocked from appearing in the final sequence.")
    # Half-open (start, end) pairs, 0-based; the whole motif must fit inside a range to be blocked.
    ranges: list[tuple[int, int]] = Field(
        description="List of ranges where the motif is blocked. Each range is a tuple of (start, end). Indices are 0-based and inclusive for the start and exclusive for the end. For example, (0, 5) blocks positions 0 to 4. The entire motif must fit into the specified positions to be blocked."
    )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class TemplateInputs(BaseModel):
    """Configuration for template-based sequence generation."""

    # Required fields.
    template_id: str = Field(
        description="Identifier for the template used to identify the source of generated sequences in the output."
    )
    sequence: str = Field(description="The template amino acid sequence")
    num_results: int = Field(
        description="Number of sequences generated from this template to be added to the final plate."
    )
    # Mutation-count bounds default to 1 and 8.
    min_mutations: int = Field(
        description="Minimum number of mutations per sequence.",
        default=1,
    )
    max_mutations: int = Field(
        description="Maximum number of mutations per sequence.",
        default=8,
    )
    hit_redundancy: int = Field(
        description="Number of top candidates for which to optimize the average score. A higher number means less diversity.",
        default=10,
    )
    # The remaining fields all default to an empty list.
    blocked_aas: list[BlockedAAItem] = Field(
        description="List of blocked amino acids which may not be mutated to, with their respective ranges.",
        default_factory=list,
    )
    blocked_motifs: list[BlockedMotifItem] = Field(
        description="List of blocked amino acid motifs with their respective ranges.",
        default_factory=list,
    )
    # None is also accepted for the two regexp lists.
    blocked_regexps: list[str] | None = Field(
        description="Regular expressions that the sequence cannot match",
        default_factory=list,
    )
    must_match_regexps: list[str] | None = Field(
        description="Generated sequences must match all of the given regular expressions. Can be used to enforce certain motifs.",
        default_factory=list,
    )
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# Parameters for the "engineer/v1" task.
class EngineerParameters(ParametersBase):
    # Task identifier, declared as a ClassVar rather than a model field.
    task_type: ClassVar[str] = "engineer/v1"

    # "TableInput" and "Constraints" are forward references to names not defined above this class.
    dataset: "TableInput" = Field(
        description="The table of assayed data, to be used for exploitation."
    )
    assays: list[Assay] = Field(
        description="List of assay metadata entries",
        default_factory=list,
    )
    primary_objective: PrimaryObjective = Field(
        description="The primary objective the samplers are trained for."
    )
    constraints: "Constraints" = Field(
        description="List of assay constraints",
        default_factory=list,
    )
    # Required artifact references and templates.
    samplers: list[ArtifactParam] = Field(
        description="The sampling models used to generate candidate sequences."
    )
    scorer: ArtifactParam = Field(
        description="Scoring model used to predict assay values for candidate sequences."
    )
    template_sequences: list[TemplateInputs] = Field(
        description="Initial sequences used as templates for generation"
    )
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# Assembly consisting of a single monomer sequence (required).
class MonomerAssembly(BaseModel):
    monomer: str = Field(description="The monomer sequence")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# Sampling template for monomer assemblies; `kind` is the tag used by ModelBasedGenerator.template.
class MonomerSamplingTemplate(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["MONOMER"] = "MONOMER"
    assembly: MonomerAssembly = Field(
        description="The template sequence to use for generation."
    )
    sampler: ArtifactParam = Field(
        description="The trained sampler artifact used to generate candidate sequences."
    )
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# Assembly made of a heavy chain and a light chain.
# NOTE(review): both fields allow None but declare no default, so under pydantic v2 each
# must still be supplied explicitly (possibly as None) -- confirm this is intended.
class VhVlAssembly(BaseModel):
    vh: str | None = Field(description="The heavy chain sequence")
    vl: str | None = Field(description="The light chain sequence")
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# Sampling template for VH/VL assemblies; `kind` is the tag used by ModelBasedGenerator.template.
class VhVlSamplingTemplate(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["VHVL"] = "VHVL"
    assembly: VhVlAssembly = Field(
        description="The template sequence to use for generation."
    )
    sampler: ArtifactParam = Field(
        description="The trained sampler artifact used to generate candidate sequences."
    )
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# A set of residue ranges grouped per polymer name.
class RangeAnnotation(BaseModel):
    # Required: polymer name -> list of half-open (start, end) pairs, 0-based.
    ranges: dict[str, list[tuple[int, int]]] = Field(
        description="A mapping from polymer names to annotation ranges. Each range is a tuple of (start, end). Indices are 0-based and inclusive for the start and exclusive for the end. For example, (0, 5) refers to positions 0 to 4."
    )
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class Requirement(BaseModel):
    """Assembly generation requirements specifying required and forbidden features on the generated assemblies.

    Each requirement defines constraints for a specific region of the assembly.
    Multiple requirements can target different or overlapping regions.
    """

    # Required: the region(s) this requirement applies to.
    apply_to: RangeAnnotation
    # Required: each entry is a motif or a regular expression.
    motifs: list[Annotated[str, Field(description="A motif or regular expression of amino acids.")]]
    # Required: one of "none", "all" or "any".
    # presumably "none" forbids every motif, "all" requires all, "any" requires at least one -- confirm
    operator: Literal["none", "all", "any"] = Field(
        description="The choice of how to interpret the `motifs` together as a requirement."
    )
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class ModelBasedGenerator(BaseModel):
    """Uses a trained sampler model to generate assemblies."""

    # Tag field: the only permitted value is also the default.
    type: Literal["MODEL_BASED"] = "MODEL_BASED"
    # Union resolved through each member's `kind` tag.
    template: MonomerSamplingTemplate | VhVlSamplingTemplate = Field(
        discriminator="kind",
        description="The template sequence to use for generation.",
    )
    # Defaults to an empty list.
    requirements: list[Requirement] = Field(
        description="Assembly based generation requirements to satisfy. All requirements must be met for a valid assembly.",
        default_factory=list,
    )
    discourage_mutations: RangeAnnotation | None = Field(
        description="Expression defining regions where mutations should be discouraged.",
        default=None,
    )
    # Both bounds are required here.
    min_mutations: int = Field(
        description="Minimum number of mutations from the template assembly."
    )
    max_mutations: int = Field(
        description="Maximum number of mutations from the template assembly."
    )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
# Monomer-flavoured table source; `kind` is the tag used by Explicit.source.
class MonomerSource(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["MONOMER"] = "MONOMER"
    # "TableInput" is a forward reference to a name not defined above this class.
    source: "TableInput"
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# VH/VL-flavoured table source; `kind` is the tag used by Explicit.source.
class VhVlSource(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["VHVL"] = "VHVL"
    # "TableInput" is a forward reference to a name not defined above this class.
    source: "TableInput"
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
class Explicit(BaseModel):
    """A pre-defined table of assemblies to use directly for generation without modification.

    Assemblies will be evaluated in the order provided.
    """

    # Tag field: the only permitted value is also the default.
    type: Literal["EXPLICIT"] = "EXPLICIT"
    # Union resolved through each member's `kind` tag.
    source: MonomerSource | VhVlSource = Field(
        discriminator="kind",
        description="A pre-defined table of assemblies to use directly for generation without modification. Assemblies will be evaluated in the order provided.",
    )
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# Amino acids that may be used at a given set of positions; both fields are required.
class AllowedMutation(BaseModel):
    in_range: RangeAnnotation = Field(
        description="The annotation expression defining the positions at which the amino acids listed in `amino_acids` can be used."
    )
    # Each list entry is described as a single amino acid; no length check is declared here.
    amino_acids: list[Annotated[str, Field(description="A single amino acid.")]]
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
class CombinatorialGenerator(BaseModel):
    """Combinatorial assembly generation from explicit mutation.

    Systematically generates assemblies by combining allowed mutations up to a
    specified maximum number.
    """

    # Tag field: the only permitted value is also the default.
    type: Literal["COMBINATORIAL"] = "COMBINATORIAL"
    # Plain union: no discriminator is declared for this field.
    template: MonomerAssembly | VhVlAssembly = Field(
        description="The assembly that serves as template for generation, including any annotations to be used in `requirements`."
    )
    max_mutations: int = Field(
        description="Maximum number of mutation to combine in generation. Lower values will result in fewer total candidate sequences evaluated for this generator."
    )
    # Defaults to an empty list.
    requirements: list[Requirement] = Field(
        description="Assembly based generation requirements to satisfy. All requirements must be met for a valid sequence.",
        default_factory=list,
    )
    # Required.
    allowed_mutations: list[AllowedMutation] = Field(
        description="The complete set of mutations that can be used in generation."
    )
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
class Parallel(BaseModel):
    """An ensemble of generators to use in parallel.

    All generators in the ensemble run independently and each contribute an equal
    number of candidate assemblies for evaluation. This does not guarantee that each
    generator will contribute an equal number of assemblies to `selected_assemblies`,
    as assemblies from some generators in `ensemble` may perform better than others.
    """

    # Tag field: the only permitted value is also the default.
    type: Literal["PARALLEL"] = "PARALLEL"
    # "Generator" is a forward reference to a name not defined above this class; defaults to an empty list.
    ensemble: list["Generator"] = Field(
        description="The list of generators to run in parallel within this ensemble.",
        default_factory=list,
    )
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
class Sequential(BaseModel):
    """An ensemble of generators to evaluate sequentially.

    Each generator will have the opportunity to evaluate an equal number of candidate
    assemblies. However, if earlier generators are able to satisfy the requirements of
    the `ranker`, later generators may never be used. In this way this method of
    ensembling expresses a strict preference among the generators in `ensemble`.
    """

    # Tag field: the only permitted value is also the default.
    type: Literal["SEQUENTIAL"] = "SEQUENTIAL"
    # "Generator" is a forward reference to a name not defined above this class; defaults to an empty list.
    ensemble: list["Generator"] = Field(
        description="The list of generators to run in sequence within this ensemble.",
        default_factory=list,
    )
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
# An assay paired with a direction; used by PrimaryObjectiveV2 and ConstraintV2. Both fields are required.
class Measure(BaseModel):
    assay_id: str = Field(description="The ID of the constrained assay.")
    direction: OptimizationDirection = Field(
        description="Whether the assay value should be above (MAXIMIZE) or below (MINIMIZE) the threshold."
    )
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
# A reference assembly with an optional margin, used where a value is expressed relative to another assembly.
class RelativeTo(BaseModel):
    # Required; plain union with no discriminator declared.
    reference: MonomerAssembly | VhVlAssembly = Field(
        description="The reference assembly."
    )
    # Optional; defaults to None.
    margin: float | None = Field(
        description="The margin (additive assay) or desired fold improvement (multiplicative assay) relative to the reference assembly.",
        default=None,
    )
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
# Primary objective for the v2 rankers: a measure plus the reference it is defined against.
class PrimaryObjectiveV2(BaseModel):
    # Required.
    measure: Measure
    # Required: either an absolute number or a RelativeTo reference.
    reference: float | RelativeTo = Field(
        description="The reference relative to which the primary objective is defined."
    )
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
# Constraint for the v2 rankers: a measure plus an absolute or relative threshold.
class ConstraintV2(BaseModel):
    # Required.
    measure: Measure
    # Required: either an absolute number or a RelativeTo reference.
    # The scale-type restrictions in the description are not enforced by this model.
    threshold: float | RelativeTo = Field(
        description="The threshold value for the assay, expressed as an absolute value or relative to another sequence. Specify relative constraints where possible. Absolute values cannot be used for assays with a RANK scale type, and must be positive for assay with a MULTIPLICATIVE scale type."
    )
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
# Scorer configuration for monomers; `kind` is the tag used by ScoreBasedRanker.scorer_config.
class MonomerScorerConfig(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["MONOMER_SCORER"] = "MONOMER_SCORER"
    scorer: ArtifactParam = Field(
        description="The trained scoring model artifact used to score sequences."
    )
    # Optional; "TableInput" is a forward reference to a name not defined above this class.
    # NOTE(review): the description contains "include reinclude" and a trailing space;
    # both are kept verbatim because this is runtime schema text.
    controls: "TableInput | None" = Field(
        description="Optional control sequences to include in the final output for experimental validation. These controls should generally be included in the assayed plate, as `selected_sequences` will be intentionally diverse from these sequences. However these sequences will *not* be included in the `selected_sequences` output table, so -- if used -- take care to include reinclude them. ",
        default=None,
    )
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
# Scorer configuration for VH/VL assemblies; `kind` is the tag used by ScoreBasedRanker.scorer_config.
class VhVlScorerConfig(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["VHVL_SCORER"] = "VHVL_SCORER"
    scorer: ArtifactParam = Field(
        description="The trained scoring model artifact used to score sequences."
    )
    # Optional; "TableInput" is a forward reference to a name not defined above this class.
    # NOTE(review): the description contains "include reinclude" and a trailing space;
    # both are kept verbatim because this is runtime schema text.
    controls: "TableInput | None" = Field(
        description="Optional control sequences to include in the final output for experimental validation. These controls should generally be included in the assayed plate, as `selected_sequences` will be intentionally diverse from these sequences. However these sequences will *not* be included in the `selected_sequences` output table, so -- if used -- take care to include reinclude them. ",
        default=None,
    )
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
class ScoreBasedRanker(BaseModel):
    """Configuration for ranking and selecting generated assemblies."""

    # Tag field: the only permitted value is also the default.
    type: Literal["SCORE_BASED"] = "SCORE_BASED"
    primary_objective: PrimaryObjectiveV2 = Field(
        description="The primary optimization objective of `engineer`."
    )
    # "ConstraintsV2" is a forward reference to a name not defined above this class; required.
    constraints: "ConstraintsV2" = Field(
        description="List of constraints `engineer` will attempt to satisfy."
    )
    # Union resolved through each member's `kind` tag.
    scorer_config: MonomerScorerConfig | VhVlScorerConfig = Field(
        discriminator="kind",
        description="The trained scoring model artifact used to score sequences, along with any optional control sequences to include in the final output for experimental validation.",
    )
    # Defaults to 3; the description is a multi-line runtime string and is kept verbatim.
    hit_redundancy: int = Field(
        description="""The maximum number of similar variants within the proposed set.

We say two variants are "similar" when their predicted performances are highly correlated.
At lower values this will mean on average fewer hits, but greater diversity within the set,
while at maximum value (`hit_redundancy=num_selected`) the selection is purely greedy, with
no requirement on diversity within the set.""",
        default=3,
    )
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
# Parameters for the "engineer/v2" task.
class EngineerParametersV2(ParametersBase):
    # Task identifier, declared as a ClassVar rather than a model field.
    task_type: ClassVar[str] = "engineer/v2"

    # "Generator" is a forward reference to a name not defined above this class,
    # resolved through each member's `type` tag.
    generator: "Generator" = Field(
        discriminator="type",
        description="The generator configuration or configurations that define requirements and strategies for generating candidate assemblies.",
    )
    ranker: ScoreBasedRanker = Field(
        description="The configuration defining how candidate assemblies are evaluated for inclusion in `selected_assemblies`."
    )
    # Required target count.
    num_assemblies: int = Field(
        description="The target number of selected assemblies to output. In rare cases of very difficult `constraints` this task may produce a smaller number."
    )
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
class DiversifyTemplateInputs(BaseModel):
    """Configuration for template-based sequence generation."""

    # Required fields.
    template_id: str = Field(
        description="Identifier for the template used to identify the source of generated sequences in the output."
    )
    sequence: str = Field(description="The template amino acid sequence")
    num_results: int = Field(
        description="Number of sequences to generate for the final plate from the template sequence."
    )
    # Mutation-count bounds default to 1 and 4.
    min_mutations: int = Field(
        description="Minimum number of mutations per sequence",
        default=1,
    )
    max_mutations: int = Field(
        description="Maximum number of mutations per sequence",
        default=4,
    )
    # Both block lists default to empty.
    blocked_aas: list[BlockedAAItem] = Field(
        description="List of blocked amino acids which may not be mutated to, with their respective ranges.",
        default_factory=list,
    )
    blocked_motifs: list[BlockedMotifItem] = Field(
        description="List of blocked amino acid motifs with their respective ranges.",
        default_factory=list,
    )
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
class DiversifyParameters(ParametersBase):
    """Parameters for the diversify task."""

    # Task identifier, declared as a ClassVar rather than a model field.
    task_type: ClassVar[str] = "diversify/v1"

    template_sequence: DiversifyTemplateInputs = Field(
        description="The template amino acid sequence to diversify."
    )
    # "TableInput" is a forward reference to a name not defined above this class.
    homologs: "TableInput" = Field(
        description="Table of homologs to use for generator training. In the simplest case, this is just the result of a multiple sequence alignment (MSA) against the sequence to optimize."
    )
    # Defaults to an empty dict; the outer description ends with a trailing space, kept verbatim.
    domain_features: dict[
        str,
        Annotated[
            list[DomainFeatureItem],
            Field(
                description="List of domain annotations. A residue may be assigned zero or one annotations. Ranges which are not annotated are considered reliable, i.e. the MSA in those ranges is used to infer conservation-based features."
            ),
        ],
    ] = Field(
        description="Properties for each domain. A domain is one or more subsequences of the protein sequence, for example the active site of an enzyme, CDRs of an antibody, heavy and light chains of an scFv, etc. ",
        default_factory=dict,
    )
    protein_model_type: ProteinModelType = Field(
        description="Specifies the type of base model to use for training: use `DEFAULT` for single-chain proteins and `ANTIBODY` for multi-chain antibodies.",
        default=ProteinModelType.DEFAULT,
    )
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
# Sampler configuration for monomers; `kind` is the tag used by LikelihoodBasedRanker.sampler_config.
class MonomerSamplerConfig(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["MONOMER_SAMPLER"] = "MONOMER_SAMPLER"
    sampler: ArtifactParam = Field(
        description="The trained sampler artifact used to compute sequence likelihoods."
    )
    reference_assembly: MonomerAssembly = Field(
        description="The reference assembly used as the denominator when computing sequence likelihood ratios."
    )
    # Optional; "TableInput" is a forward reference to a name not defined above this class.
    # NOTE(review): the description contains "include reinclude" and a trailing space;
    # both are kept verbatim because this is runtime schema text.
    controls: "TableInput | None" = Field(
        description="Optional control assemblies to include in the final output for experimental validation. These controls should generally be included in the assayed plate, as `selected_assemblies` will be intentionally diverse from these assemblies. However these assemblies will *not* be included in the `selected_assemblies` output table, so -- if used -- take care to include reinclude them. ",
        default=None,
    )
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
# Sampler configuration for VH/VL assemblies; `kind` is the tag used by LikelihoodBasedRanker.sampler_config.
class VhVlSamplerConfig(BaseModel):
    # Tag field: the only permitted value is also the default.
    kind: Literal["VHVL_SAMPLER"] = "VHVL_SAMPLER"
    sampler: ArtifactParam = Field(
        description="The trained sampler artifact used to compute assembly likelihoods."
    )
    reference_assembly: VhVlAssembly = Field(
        description="The reference assembly used as the denominator when computing assembly likelihood ratios."
    )
    # Optional; "TableInput" is a forward reference to a name not defined above this class.
    # NOTE(review): the description contains "include reinclude" and a trailing space;
    # both are kept verbatim because this is runtime schema text.
    controls: "TableInput | None" = Field(
        description="Optional control assemblies to include in the final output for experimental validation. These controls should generally be included in the assayed plate, as `selected_assemblies` will be intentionally diverse from these assemblies. However these assemblies will *not* be included in the `selected_assemblies` output table, so -- if used -- take care to include reinclude them. ",
        default=None,
    )
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
class LikelihoodBasedRanker(BaseModel):
    """Configuration for ranking and selecting generated assemblies based on likelihoods."""

    # Tag field: the only permitted value is also the default.
    type: Literal["LIKELIHOOD_BASED"] = "LIKELIHOOD_BASED"
    # Union resolved through each member's `kind` tag.
    sampler_config: MonomerSamplerConfig | VhVlSamplerConfig = Field(
        discriminator="kind",
        description="The trained sampler artifact used to compute assembly likelihoods, along with any optional control assemblies to include in the final output for experimental validation.",
    )
    # Defaults to 3; the description is a multi-line runtime string and is kept verbatim.
    hit_redundancy: int = Field(
        description="""The maximum number of similar variants within the proposed set.

We say two variants are "similar" when their predicted performances are highly correlated.
At lower values this will mean on average fewer hits, but greater diversity within the set,
while at maximum value (`hit_redundancy=num_selected`) the selection is purely greedy, with
no requirement on diversity within the set.""",
        default=3,
    )
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
# Parameters for the "diversify/v2" task.
class DiversifyParametersV2(ParametersBase):
    # Task identifier, declared as a ClassVar rather than a model field.
    task_type: ClassVar[str] = "diversify/v2"

    # "Generator" is a forward reference to a name not defined above this class,
    # resolved through each member's `type` tag.
    generator: "Generator" = Field(
        discriminator="type",
        description="The generator configuration or configurations that define requirements and strategies for generating candidate assemblies.",
    )
    ranker: LikelihoodBasedRanker = Field(
        description="The configuration defining how candidate assemblies are evaluated for inclusion in `selected_assemblies`."
    )
    # Required target count.
    num_assemblies: int = Field(
        description="The target number of `selected_assemblies` to output. In rare cases, the actual number of output assemblies may be slightly lower."
    )
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
# Parameters for the "analyze.data/v1" task.
class AnalyzeDataParameters(ParametersBase):
    # Task identifier, declared as a ClassVar rather than a model field.
    task_type: ClassVar[str] = "analyze.data/v1"

    reference_sequence: str = Field(
        description="The sequence to consider as a reference for computing mutations."
    )
    # "TableInput" is a forward reference to a name not defined above this class.
    dataset: "TableInput" = Field(
        description="The assay data to be used for model training."
    )
    # Defaults to an empty list.
    assays: list[Assay] = Field(
        description="List of assay metadata entries",
        default_factory=list,
    )
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
# Parameters for the "analyze.data/v2" task; takes a reference assembly where v1 takes a plain sequence string.
class AnalyzeDataParametersV2(ParametersBase):
    # Task identifier, declared as a ClassVar rather than a model field.
    task_type: ClassVar[str] = "analyze.data/v2"

    # "TableInput" is a forward reference to a name not defined above this class.
    dataset: "TableInput" = Field(
        description="The data to analyze before training."
    )
    # Plain union with no discriminator declared.
    reference: MonomerAssembly | VhVlAssembly = Field(
        description="The reference assembly used for computing mutations."
    )
    # Defaults to an empty list.
    assays: list[Assay] = Field(
        description="The assays present in the dataset.",
        default_factory=list,
    )
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
# Metadata describing a template used for generation; consumed by AnalyzeDiversifyParameters.
class TemplateMetadata(BaseModel):
    template_id: str = Field(
        description="Identifier for the template used to identify the source of generated sequences."
    )
    sequence: str = Field(
        description="Amino acid template sequence used for generation."
    )
    # Defaults to an empty list.
    blocked_aas: list[BlockedAAItem] = Field(
        description="List of blocked amino acids which may not be mutated to, with their respective ranges.",
        default_factory=list,
    )
    # Required here: no defaults are declared for the mutation bounds.
    min_mutations: int
    max_mutations: int
    # Optional artifact reference; defaults to None.
    structure: ArtifactParam | None = None
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
class AnalyzeDiversifyParameters(ParametersBase):
    # Parameters for tasks of type "analyze.diversify/v1".
    task_type: ClassVar[str] = "analyze.diversify/v1"

    selected_sequences: "TableInput" = Field(
        description="The name of the input table with the selected sequences.",
    )
    generated_sequences: "TableInput" = Field(
        description="The name of the input view/table with the generated sequences.",
    )
    # A single TemplateMetadata here (AnalyzeEngineerParameters takes a list).
    template_metadata: TemplateMetadata = Field(
        description="Metadata about templates.",
    )
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
class AnalyzeTrainParameters(ParametersBase):
    # Parameters for tasks of type "analyze.train/v1".
    task_type: ClassVar[str] = "analyze.train/v1"

    prediction_data: "TableInput" = Field(
        description="Table of predicted assay values for the input dataset under the scorer.",
    )
    generator_prediction_data: "TableInput" = Field(
        description="Pseudo log-likelihoods of the sequences in the dataset under the sampler.",
    )
    assays: list[Assay] = Field(
        description="List of assay metadata entries",
        default_factory=list,
    )
    primary_objective: PrimaryObjective = Field(
        description="The primary objective on which the sampler model is conditioned.",
    )
    # "Constraints" is a forward reference to the module-level alias; defaults to an empty list.
    constraints: "Constraints" = Field(
        description="The constraints applied to training.",
        default_factory=list,
    )
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
class AnalyzeEngineerParameters(ParametersBase):
    # Parameters for tasks of type "analyze.engineer/v1".
    task_type: ClassVar[str] = "analyze.engineer/v1"

    selected_sequences: "TableInput" = Field(
        description="The name of the input table with the selected sequences.",
    )
    engineered_sequences: "TableInput" = Field(
        description="The name of the input view/table with the generated sequences.",
    )
    assays: list[Assay] = Field(
        description="The list of objectives that were optimized during training.",
        default_factory=list,
    )
    primary_objective: PrimaryObjective = Field(
        description="The primary objective to be optimized.",
    )
    # "Constraints" is a forward reference to the module-level alias; defaults to an empty list.
    constraints: "Constraints" = Field(
        description="The constraints applied to engineering.",
        default_factory=list,
    )
    template_metadata: list[TemplateMetadata] = Field(
        description="Metadata about templates.",
    )
    new_mutation_ratio: float = Field(
        description="Ratio of new mutations in the generated sequences.",
    )
    scorer: ArtifactParam = Field(
        description="Scoring model used to predict assay values for candidate sequences.",
    )
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
class Homologs(BaseModel):
    # Search source variant tagged "HOMOLOGS"; `type_` is read and written under the key "type".
    type_: Literal["HOMOLOGS"] = Field(
        default="HOMOLOGS",
        alias="type",
        validation_alias="type",
        serialization_alias="type",
    )
    homologs: "TableInput" = Field(
        description="Precomputed homologs table.",
    )
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
class ProteinDatabase(StrEnum):
|
|
622
|
+
UNIREF_30 = "UNIREF_30"
|
|
623
|
+
OAS_90 = "OAS_90"
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
class DatabaseSearch(BaseModel):
    # Search source variant tagged "DATABASE_SEARCH"; `type_` is read and written under the key "type".
    type_: Literal["DATABASE_SEARCH"] = Field(
        default="DATABASE_SEARCH",
        alias="type",
        validation_alias="type",
        serialization_alias="type",
    )
    seed_domains: list[str] = Field(
        description="A list of domain sequences to search for.",
    )
    # Falls back to UNIREF_30 when the caller does not choose a database.
    database: ProteinDatabase = Field(
        description="Type of the protein database to use.",
        default=ProteinDatabase.UNIREF_30,
    )
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
class SearchParameters(ParametersBase):
    # Parameters for tasks of type "search/v1".
    task_type: ClassVar[str] = "search/v1"

    # Tagged union of Homologs / DatabaseSearch, discriminated on each variant's `type_` field
    # (which both variants expose under the alias "type").
    # The description previously read "forsequence"; the missing space is restored here.
    source: Homologs | DatabaseSearch = Field(
        description="Precomputed homologs table or seed sequences to search for. If a table of precomputed sequence homologs is provided, the entries in the `seed_domain` column of this table may be used for sequence feature computation. For example, if the sequence to be optimized is an scFv, the `seed_domain` column would contain the heavy and light chain scaffold sequences of the scFv. These 2 scaffold sequences will be used to search for homologs and infer features such as the CDRs.",
        discriminator="type_",
    )
    # Disabled unless explicitly requested.
    antibody_sequence_features: bool = Field(
        default=False,
        description="Whether to compute sequence features. Only supported for antibodies.",
    )
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
class UnirefDatabaseSearch(BaseModel):
    # Search source variant tagged "UNIREF_SEARCH"; `type_` is read and written under the key "type".
    type_: Literal["UNIREF_SEARCH"] = Field(
        default="UNIREF_SEARCH",
        alias="type",
        validation_alias="type",
        serialization_alias="type",
    )
    seed_assemblies: list[MonomerAssembly] = Field(
        description="A list of monomeric assemblies to search for.",
    )
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
class OASDatabaseSearch(BaseModel):
    # Search source variant tagged "OAS_SEARCH"; `type_` is read and written under the key "type".
    type_: Literal["OAS_SEARCH"] = Field(
        default="OAS_SEARCH",
        alias="type",
        validation_alias="type",
        serialization_alias="type",
    )
    seed_assemblies: list[VhVlAssembly] = Field(
        description="The list of light/heavy chain assemblies to search for.",
    )
    # All three switches below are enabled by default.
    search_unpaired: bool = Field(
        description="Whether to search for unpaired heavy/light chains.",
        default=True,
    )
    search_paired: bool = Field(
        description="Whether to search for paired heavy/light chains.",
        default=True,
    )
    annotate_assembly_features: bool = Field(
        description="Whether to infer CDR features based on OAS annotations.",
        default=True,
    )
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
class SearchParametersV2(ParametersBase):
    # Parameters for tasks of type "search/v2".
    task_type: ClassVar[str] = "search/v2"

    # Tagged union of the two database searches, discriminated on each variant's `type_` field.
    source: UnirefDatabaseSearch | OASDatabaseSearch = Field(
        discriminator="type_",
        description="The source database to search against, along with the assemblies to search for, and database-specific configuration.",
    )
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
class PreRanked(BaseModel):
    """Configuration for selecting from pre-ranked assemblies.

    With this option, the assemblies passed for selection are assumed to be ranked in descending
    order of desirability. Selection is based on this order and subject to diversity constraints that
    can be controlled via the `hit_redundancy` parameter.
    """

    # Tag value identifying this ranker variant.
    type: Literal["PRE_RANKED"] = Field(default="PRE_RANKED")
    # The multi-line description is a runtime string; its continuation lines start at column 0
    # so that no indentation leaks into the text.
    hit_redundancy: int = Field(
        default=3,
        description="""The maximum number of similar variants within the proposed set.

We say two variants are "similar" when their predicted performances are highly correlated.
At lower values this will mean on average fewer hits, but greater diversity within the set,
while at maximum value (`hit_redundancy=num_assemblies`) the selection is purely greedy, with
no requirement on diversity within the set.""",
    )
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
class SelectParametersV2(ParametersBase):
    # Parameters for tasks of type "select/v2".
    task_type: ClassVar[str] = "select/v2"

    # Tagged union discriminated on `kind`.
    assemblies: MonomerSource | VhVlSource = Field(
        discriminator="kind",
        description="The source of assemblies to select from.",
    )
    # Either one of the two model-based rankers (discriminated on `type`) or a PreRanked config.
    ranker: (
        Annotated[
            LikelihoodBasedRanker | ScoreBasedRanker,
            Field(description="The ranking strategy for selecting candidate assemblies.", discriminator="type"),
        ]
        | PreRanked
    ) = Field(
        description="The configuration defining how candidate assemblies are evaluated for inclusion in `selected_assemblies`.",
    )
    # The min/max relationship described below is not validated by this model.
    min_assemblies_to_select: int = Field(
        description="The minimum number of assemblies to select. The number of selected candidates is dynamically determined and may be greater than this number. If an exact number of assemblies is desired, set `min_assemblies_to_select` and `max_assemblies_to_select` to the same value.",
    )
    max_assemblies_to_select: int | None = Field(
        description="An optional (inclusive) upper limit on the number of assemblies to select. If provided, the task will select between `min_assemblies_to_select` and `max_assemblies_to_select` assemblies.",
        default=None,
    )
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
class MonomerData(BaseModel):
    # Training-data variant tagged "MONOMER".
    kind: Literal["MONOMER"] = Field(default="MONOMER")
    homologs: "TableInput" = Field(
        description="The homologous assemblies used for training.",
    )
    # Nullable but without a default, so the key must be supplied explicitly (possibly as None).
    dataset: "TableInput | None" = Field(
        description="The assayed assemblies used for training. If None, predictor finetuning and generator conditioning will be skipped.",
    )
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
class VhVlData(BaseModel):
    # Training-data variant tagged "VHVL".
    # NOTE(review): the tag here is "VHVL" while VhVlModels uses "VH_VL" — confirm both match the API.
    kind: Literal["VHVL"] = Field(default="VHVL")
    homologs: "TableInput" = Field(
        description="The homologous assemblies used for training.",
    )
    # Nullable but without a default, so the key must be supplied explicitly (possibly as None).
    dataset: "TableInput | None" = Field(
        description="The assayed assemblies used for training. If None, predictor finetuning and generator conditioning will be skipped.",
    )
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
class TrainParametersV2(ParametersBase):
    # Parameters for tasks of type "train/v2".
    # The cross-field rules stated in the descriptions below are not enforced by this model.
    task_type: ClassVar[str] = "train/v2"

    # Tagged union discriminated on `kind`.
    data: MonomerData | VhVlData = Field(
        discriminator="kind",
        description="The training data, including homologs and assayed sequences.",
    )
    assays: list[Assay] = Field(
        description="The assays used for sampler conditioning (on the `primary_objective`) and scorer finetuning and evaluation. Must correspond to assays present in `data.dataset`. If `data.dataset` is None, `assays` must be empty. Otherwise, it must contain at least one assay.",
        default_factory=list,
    )
    primary_objective: Measure | None = Field(
        description="The assay on which the sampler model is conditioned. Should correspond to an assay in `dataset`. If `data.dataset` is None, `primary_objective` must not be set. Otherwise, it must be set.",
        default=None,
    )
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
class ResultBase(BaseModel):
    # Common base of the task result models in this module; declares no fields of its own.
    pass
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
class AbstractTableResult(BaseModel):
    # Pointer to a result table, used as the field type for table-valued task outputs.
    table: str = Field(
        description="The reference to the table the result was written to.",
    )
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
class TrainResult(ResultBase):
    # Result model carrying samplers, scorer, prediction tables and a report.
    # The trailing space in the first description is part of the original string.
    base_sampler: ArtifactParam = Field(
        description="A sampler which attempts to produce plausible sequences as understood relative to the passed `homologs`. ",
    )
    # Nullable but required: the key must be present even when the value is None.
    conditioned_sampler: ArtifactParam | None = Field(
        description="A sampler which attempts to produce high quality sequences as understood relative to the passed `primary_objective`.",
    )
    scorer: ArtifactParam = Field(
        description="The scoring model, trained on the assay data provided.",
    )
    prediction_data: AbstractTableResult = Field(
        description="Predicted assay values on all folds of the input dataset.",
    )
    generator_prediction_data: AbstractTableResult = Field(
        description="Pseudo log-likelihoods of the sequences in the dataset under the generator(s).",
    )
    report: ArtifactParam = Field(
        description="A report on the result of the task.",
    )
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
class MonomerModels(BaseModel):
    # Trained-models variant tagged "MONOMER".
    kind: Literal["MONOMER"] = Field(default="MONOMER")
    sampler: ArtifactParam = Field(
        description="The monomer sampler model.",
    )
    # The two fields below are nullable but have no default, so they must be supplied.
    scorer: ArtifactParam | None = Field(
        description="The monomer scorer model.",
    )
    conditioned_sampler: ArtifactParam | None = Field(
        description="The monomer sampler model conditioned on the primary objective.",
    )
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
class VhVlModels(BaseModel):
    # Trained-models variant tagged "VH_VL".
    kind: Literal["VH_VL"] = Field(default="VH_VL")
    sampler: ArtifactParam = Field(
        description="The VhVl sampler model.",
    )
    # The two fields below are nullable but have no default, so they must be supplied.
    scorer: ArtifactParam | None = Field(
        description="The VhVl scorer model.",
    )
    conditioned_sampler: ArtifactParam | None = Field(
        description="The VhVl sampler model conditioned on the primary objective.",
    )
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
class TrainResultV2(ResultBase):
    # Result model holding a `kind`-discriminated models union plus a nullable report.
    models: MonomerModels | VhVlModels = Field(
        discriminator="kind",
        description="The trained models.",
    )
    # Nullable but required: the key must be present even when the value is None.
    report: ArtifactParam | None = Field(
        description="A report on the result of the task.",
    )
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
class EngineerResult(ResultBase):
    # Result model with sequence tables, generation counters, template metadata and a report.
    selected_sequences: AbstractTableResult = Field(
        description="Table containing the selected sequences.",
    )
    engineered_sequences: AbstractTableResult = Field(
        description="Table containing the generated sequences.",
    )
    num_evaluated_seqs: int = Field(
        description="Number of sequences evaluated in silico (only the ones sent to the predictor).",
    )
    num_generated_seqs: int = Field(
        description="Number of sequences generated in silico (this is > num_evaluated_seqs, as it includes generated sequences that were not sent to the predictor, due to lower likelihood).",
    )
    # NOTE(review): the description says "Number of mutations" in `selected_candidates`, while the
    # field is a float named "ratio" and the sibling field is `selected_sequences` — confirm wording.
    new_mutation_ratio: float = Field(
        description="Number of mutations in `selected_candidates` that do not appear in train/test datasets.",
    )
    template_metadata: list[TemplateMetadata] = Field(
        description="Metadata about templates.",
    )
    report: ArtifactParam = Field(
        description="A report on the result of the task.",
    )
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
class EngineerResultV2(ResultBase):
    # Result model with the selected-assemblies table and a nullable report.
    selected_assemblies: AbstractTableResult = Field(
        description="Table containing the selected assemblies output by `engineer`.",
    )
    # Nullable but required: the key must be present even when the value is None.
    report: ArtifactParam | None = Field(
        description="A report on the result of the task.",
    )
|
|
821
|
+
|
|
822
|
+
|
|
823
|
+
class AnalyzeResult(ResultBase):
    # Result model consisting of a single report artifact.
    report: ArtifactParam = Field(
        description="The report data",
    )
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
class AnalyzeResultV2(ResultBase):
    # Result model consisting of a single report artifact.
    report: ArtifactParam = Field(
        description="The analysis report",
    )
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
class SearchResult(ResultBase):
    # Result model with per-sequence domain features and the homologs table.
    # The inner Annotated/Field documents each dict value; the outer Field documents the mapping
    # and makes it default to an empty dict.
    domain_features: dict[
        str,
        Annotated[
            list[DomainFeatureItem],
            Field(
                description="List of domain annotations. A residue may be assigned zero or one annotations. Ranges which are not annotated are considered reliable, i.e. the MSA in those ranges is used to infer conservation-based features.",
            ),
        ],
    ] = Field(
        description="Mapping of protein sequences to their respective domain features. Each range in the domain features must be a valid index into the corresponding protein sequence.",
        default_factory=dict,
    )
    homologs: AbstractTableResult
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
class DiversifyResult(ResultBase):
    """Result of the diversify task."""

    selected_sequences: AbstractTableResult = Field(
        description="Table containing the selected sequences.",
    )
    generated_sequences: AbstractTableResult = Field(
        description="Table containing the generated sequences.",
    )
    # A single TemplateMetadata here (EngineerResult carries a list).
    template_metadata: TemplateMetadata = Field(
        description="Metadata about templates.",
    )
    report: ArtifactParam = Field(
        description="A report on the result of the task.",
    )
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
class DiversifyResultV2(ResultBase):
    # Result model with the selected-assemblies table and a nullable report.
    selected_assemblies: AbstractTableResult = Field(
        description="Table containing the selected assemblies output by `diversify`.",
    )
    # Nullable but required: the key must be present even when the value is None.
    report: ArtifactParam | None = Field(
        description="A report on the result of the task.",
    )
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
class SearchResultV2(ResultBase):
    # Result model with assembly features, the homologs table and an optional report.
    assembly_features: list[dict[str, AssemblyFeature]] = Field(
        description="Parallel array (wrt to `seed_assemblies`) of features, keyed by feature name.",
        default_factory=list,
    )
    homologs: AbstractTableResult
    # Unlike the other V2 results in this module, the report here defaults to None.
    report: ArtifactParam | None = Field(
        description="A report on the result of the task.",
        default=None,
    )
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
class SelectResultV2(ResultBase):
    # Result model holding only the selected-assemblies table.
    selected_assemblies: AbstractTableResult = Field(
        description="Table containing the selected assemblies output by `select` sorted in descending order of preference.",
    )
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
class TaskState(StrEnum):
|
|
879
|
+
INIT = "INIT"
|
|
880
|
+
PREPARING_INPUTS = "PREPARING_INPUTS"
|
|
881
|
+
LAUNCHING = "LAUNCHING"
|
|
882
|
+
EXECUTING = "EXECUTING"
|
|
883
|
+
LOADING_RESULTS = "LOADING_RESULTS"
|
|
884
|
+
CANCELLING = "CANCELLING"
|
|
885
|
+
RECOVERING = "RECOVERING"
|
|
886
|
+
FAILED = "FAILED"
|
|
887
|
+
COMPLETED = "COMPLETED"
|
|
888
|
+
CANCELLED = "CANCELLED"
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
class TaskResponse(ResourceResponse, ArchivableResourceMixin, ErrorResponseMixin):
    # Response model for a task: its type, context, parameters, result and state.
    type: str
    context: "Context" = Field(
        discriminator="kind",
        description="The context in which this task runs. Tasks started in a project or round context will only 'see' table data that belongs to the associated project.",
    )
    # The four fields below are nullable but have no default, so the keys must be present.
    name: str | None = Field(
        description="An optional name of the task. It must be unique within the task's context.",
    )
    description: str | None = Field(
        description="A description of the task.",
    )
    data_version_id: int | None = Field(
        description="The data version at which table inputs are observed.",
    )
    data_load_id: int | None = Field(
        description="ID of the data load for the task's results.",
    )
    # "TaskParameters" is a forward reference to the module-level union alias.
    parameters: "TaskParameters" = Field(
        description="The parameters of the task.",
    )
    # Same members, in the same order, as the module-level `TaskResult` alias, plus None.
    result: (
        TrainResult
        | TrainResultV2
        | EngineerResult
        | EngineerResultV2
        | AnalyzeResult
        | AnalyzeResultV2
        | SearchResult
        | DiversifyResult
        | DiversifyResultV2
        | SearchResultV2
        | SelectResultV2
        | None
    ) = Field(
        description="The result of the completed task.",
    )
    state: TaskState = Field(
        description="The current state of the task's execution.",
    )
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
class TaskUpdateBase(BaseModel, ABC):
    # Optional name/description fields shared by TaskUpdate and TaskCreate.
    name: str | None = Field(
        description="Optional name for the task. If provided it will ensure that only one task with that name exists within the task's context.",
        default=None,
    )
    description: str | None = Field(
        description="Optional description to provide additional notes about the executed task.",
        default=None,
    )
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
class TaskUpdate(TaskUpdateBase):
    # Adds nothing to TaskUpdateBase: only `name` and `description` are available.
    pass
|
|
931
|
+
|
|
932
|
+
|
|
933
|
+
class ListTaskResponse(BaseListResponse):
    # List response whose items are TaskResponse objects.
    items: list[TaskResponse]
|
|
935
|
+
|
|
936
|
+
|
|
937
|
+
class TaskCreate(TaskUpdateBase):
    # Extends TaskUpdateBase with the parameters, context and optional data version of a task.
    # "TaskParameters" is a forward reference to the module-level union alias.
    parameters: "TaskParameters" = Field(
        description="The parameters for the task.",
    )
    context: "Context" = Field(
        discriminator="kind",
        description="The context in which the task is executed.",
    )
    data_version_id: int | None = Field(
        description="For tasks that have parameters referencing data tables, this specifies the version of the table data to use. If no version is specified, it defaults to the most recent data version.",
        default=None,
    )
|
|
944
|
+
|
|
945
|
+
|
|
946
|
+
# Module-level type aliases. They are defined after the models above, which refer to them
# through string annotations ("TaskParameters", "TableInput", "Constraints", "Generator").

# Union of every task parameter model.
TaskParameters = (
    TrainParameters
    | EngineerParameters
    | EngineerParametersV2
    | DiversifyParameters
    | DiversifyParametersV2
    | AnalyzeDataParameters
    | AnalyzeDataParametersV2
    | AnalyzeDiversifyParameters
    | AnalyzeTrainParameters
    | AnalyzeEngineerParameters
    | SearchParameters
    | SearchParametersV2
    | SelectParametersV2
    | TrainParametersV2
)

# Union of every task result model.
TaskResult = (
    TrainResult
    | TrainResultV2
    | EngineerResult
    | EngineerResultV2
    | AnalyzeResult
    | AnalyzeResultV2
    | SearchResult
    | DiversifyResult
    | DiversifyResultV2
    | SearchResultV2
    | SelectResultV2
)

# A table input is either inline rows or one of the three table-source models.
TableInput = (
    list[dict[str, str | int | float | None]]
    | TableSourceTable
    | TableSourceQuery
    | TableSourceTaskResult
)

# A single constraint, discriminated on `type_`.
Constraint = Annotated[
    AbsoluteConstraint | RelativeConstraint,
    Field(discriminator="type_"),
]

# A list of constraints, defaulting to empty; the element type is the `Constraint` alias.
Constraints = Annotated[
    list[Constraint],
    Field(default_factory=list, description="List of assay constraints"),
]

# Generator configurations, discriminated on `type`.
Generator = Annotated[
    Sequential | Parallel | ModelBasedGenerator | Explicit | CombinatorialGenerator,
    Field(description="The generation strategy for creating candidate sequences.", discriminator="type"),
]

ConstraintsV2 = Annotated[
    list[ConstraintV2],
    Field(default_factory=list, description="List of assay constraints"),
]
|