rc-foundry 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- foundry/version.py +2 -2
- {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/METADATA +1 -1
- {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/RECORD +139 -8
- rf3/configs/callbacks/default.yaml +5 -0
- rf3/configs/callbacks/dump_validation_structures.yaml +6 -0
- rf3/configs/callbacks/metrics_logging.yaml +10 -0
- rf3/configs/callbacks/train_logging.yaml +16 -0
- rf3/configs/dataloader/default.yaml +15 -0
- rf3/configs/datasets/base.yaml +31 -0
- rf3/configs/datasets/pdb_and_distillation.yaml +58 -0
- rf3/configs/datasets/pdb_only.yaml +17 -0
- rf3/configs/datasets/train/disorder_distillation.yaml +48 -0
- rf3/configs/datasets/train/domain_distillation.yaml +50 -0
- rf3/configs/datasets/train/monomer_distillation.yaml +49 -0
- rf3/configs/datasets/train/na_complex_distillation.yaml +50 -0
- rf3/configs/datasets/train/pdb/af3_weighted_sampling.yaml +8 -0
- rf3/configs/datasets/train/pdb/base.yaml +32 -0
- rf3/configs/datasets/train/pdb/plinder.yaml +54 -0
- rf3/configs/datasets/train/pdb/train_interface.yaml +51 -0
- rf3/configs/datasets/train/pdb/train_pn_unit.yaml +46 -0
- rf3/configs/datasets/train/rna_monomer_distillation.yaml +56 -0
- rf3/configs/datasets/val/af3_ab_set.yaml +11 -0
- rf3/configs/datasets/val/af3_validation.yaml +11 -0
- rf3/configs/datasets/val/base.yaml +32 -0
- rf3/configs/datasets/val/runs_and_poses.yaml +12 -0
- rf3/configs/debug/default.yaml +66 -0
- rf3/configs/debug/train_specific_examples.yaml +21 -0
- rf3/configs/experiment/pretrained/rf3.yaml +50 -0
- rf3/configs/experiment/pretrained/rf3_with_confidence.yaml +13 -0
- rf3/configs/experiment/quick-rf3-with-confidence.yaml +15 -0
- rf3/configs/experiment/quick-rf3.yaml +61 -0
- rf3/configs/hydra/default.yaml +18 -0
- rf3/configs/hydra/no_logging.yaml +7 -0
- rf3/configs/inference.yaml +7 -0
- rf3/configs/inference_engine/base.yaml +23 -0
- rf3/configs/inference_engine/rf3.yaml +33 -0
- rf3/configs/logger/csv.yaml +6 -0
- rf3/configs/logger/default.yaml +3 -0
- rf3/configs/logger/wandb.yaml +15 -0
- rf3/configs/model/components/ema.yaml +1 -0
- rf3/configs/model/components/rf3_net.yaml +177 -0
- rf3/configs/model/components/rf3_net_with_confidence_head.yaml +45 -0
- rf3/configs/model/optimizers/adam.yaml +5 -0
- rf3/configs/model/rf3.yaml +43 -0
- rf3/configs/model/rf3_with_confidence.yaml +7 -0
- rf3/configs/model/schedulers/af3.yaml +6 -0
- rf3/configs/paths/data/default.yaml +43 -0
- rf3/configs/paths/default.yaml +21 -0
- rf3/configs/train.yaml +42 -0
- rf3/configs/trainer/cpu.yaml +6 -0
- rf3/configs/trainer/ddp.yaml +5 -0
- rf3/configs/trainer/loss/losses/confidence_loss.yaml +29 -0
- rf3/configs/trainer/loss/losses/diffusion_loss.yaml +9 -0
- rf3/configs/trainer/loss/losses/distogram_loss.yaml +2 -0
- rf3/configs/trainer/loss/structure_prediction.yaml +4 -0
- rf3/configs/trainer/loss/structure_prediction_with_confidence.yaml +2 -0
- rf3/configs/trainer/metrics/structure_prediction.yaml +14 -0
- rf3/configs/trainer/rf3.yaml +20 -0
- rf3/configs/trainer/rf3_with_confidence.yaml +13 -0
- rf3/configs/validate.yaml +45 -0
- rfd3/cli.py +10 -4
- rfd3/configs/__init__.py +0 -0
- rfd3/configs/callbacks/design_callbacks.yaml +10 -0
- rfd3/configs/callbacks/metrics_logging.yaml +20 -0
- rfd3/configs/callbacks/train_logging.yaml +24 -0
- rfd3/configs/dataloader/default.yaml +15 -0
- rfd3/configs/dataloader/fast.yaml +11 -0
- rfd3/configs/datasets/conditions/dna_condition.yaml +3 -0
- rfd3/configs/datasets/conditions/island.yaml +28 -0
- rfd3/configs/datasets/conditions/ppi.yaml +2 -0
- rfd3/configs/datasets/conditions/sequence_design.yaml +17 -0
- rfd3/configs/datasets/conditions/tipatom.yaml +28 -0
- rfd3/configs/datasets/conditions/unconditional.yaml +21 -0
- rfd3/configs/datasets/design_base.yaml +97 -0
- rfd3/configs/datasets/train/pdb/af3_train_interface.yaml +46 -0
- rfd3/configs/datasets/train/pdb/af3_train_pn_unit.yaml +42 -0
- rfd3/configs/datasets/train/pdb/base.yaml +14 -0
- rfd3/configs/datasets/train/pdb/base_no_weights.yaml +19 -0
- rfd3/configs/datasets/train/pdb/base_transform_args.yaml +59 -0
- rfd3/configs/datasets/train/pdb/na_complex_distillation.yaml +20 -0
- rfd3/configs/datasets/train/pdb/pdb_base.yaml +11 -0
- rfd3/configs/datasets/train/pdb/rfd3_train_interface.yaml +22 -0
- rfd3/configs/datasets/train/pdb/rfd3_train_pn_unit.yaml +23 -0
- rfd3/configs/datasets/train/rfd3_monomer_distillation.yaml +38 -0
- rfd3/configs/datasets/val/bcov_ppi_easy_medium.yaml +9 -0
- rfd3/configs/datasets/val/design_validation_base.yaml +40 -0
- rfd3/configs/datasets/val/dna_binder_design5.yaml +9 -0
- rfd3/configs/datasets/val/dna_binder_long.yaml +13 -0
- rfd3/configs/datasets/val/dna_binder_short.yaml +13 -0
- rfd3/configs/datasets/val/indexed.yaml +9 -0
- rfd3/configs/datasets/val/mcsa_41.yaml +9 -0
- rfd3/configs/datasets/val/mcsa_41_short_rigid.yaml +10 -0
- rfd3/configs/datasets/val/ppi_inference.yaml +7 -0
- rfd3/configs/datasets/val/sm_binder_hbonds.yaml +13 -0
- rfd3/configs/datasets/val/sm_binder_hbonds_short.yaml +15 -0
- rfd3/configs/datasets/val/unconditional.yaml +9 -0
- rfd3/configs/datasets/val/unconditional_deep.yaml +9 -0
- rfd3/configs/datasets/val/unindexed.yaml +8 -0
- rfd3/configs/datasets/val/val_examples/bcov_ppi_easy_medium_with_ori.yaml +151 -0
- rfd3/configs/datasets/val/val_examples/bcov_ppi_easy_medium_with_ori_spoof_helical_bundle.yaml +7 -0
- rfd3/configs/datasets/val/val_examples/bcov_ppi_easy_medium_with_ori_varying_lengths.yaml +28 -0
- rfd3/configs/datasets/val/val_examples/bpem_ori_hb.yaml +212 -0
- rfd3/configs/debug/default.yaml +64 -0
- rfd3/configs/debug/train_specific_examples.yaml +21 -0
- rfd3/configs/dev.yaml +9 -0
- rfd3/configs/experiment/debug.yaml +14 -0
- rfd3/configs/experiment/pretrain.yaml +31 -0
- rfd3/configs/experiment/test-uncond.yaml +10 -0
- rfd3/configs/experiment/test-unindexed.yaml +21 -0
- rfd3/configs/hydra/default.yaml +18 -0
- rfd3/configs/hydra/no_logging.yaml +7 -0
- rfd3/configs/inference.yaml +9 -0
- rfd3/configs/inference_engine/base.yaml +15 -0
- rfd3/configs/inference_engine/dev.yaml +20 -0
- rfd3/configs/inference_engine/rfdiffusion3.yaml +65 -0
- rfd3/configs/logger/csv.yaml +6 -0
- rfd3/configs/logger/default.yaml +2 -0
- rfd3/configs/logger/wandb.yaml +15 -0
- rfd3/configs/model/components/ema.yaml +1 -0
- rfd3/configs/model/components/rfd3_net.yaml +131 -0
- rfd3/configs/model/optimizers/adam.yaml +5 -0
- rfd3/configs/model/rfd3_base.yaml +8 -0
- rfd3/configs/model/samplers/edm.yaml +21 -0
- rfd3/configs/model/samplers/symmetry.yaml +10 -0
- rfd3/configs/model/schedulers/af3.yaml +6 -0
- rfd3/configs/paths/data/default.yaml +18 -0
- rfd3/configs/paths/default.yaml +22 -0
- rfd3/configs/train.yaml +28 -0
- rfd3/configs/trainer/cpu.yaml +6 -0
- rfd3/configs/trainer/ddp.yaml +5 -0
- rfd3/configs/trainer/loss/losses/diffusion_loss.yaml +12 -0
- rfd3/configs/trainer/loss/losses/sequence_loss.yaml +3 -0
- rfd3/configs/trainer/metrics/design_metrics.yaml +22 -0
- rfd3/configs/trainer/rfd3_base.yaml +35 -0
- rfd3/configs/validate.yaml +34 -0
- rfd3/run_inference.py +3 -7
- {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/WHEEL +0 -0
- {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/entry_points.txt +0 -0
- {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
|
|
2
|
+
_target_: rfd3.transforms.training_conditions.IslandCondition
|
|
3
|
+
frequency: 1.0
|
|
4
|
+
name: island
|
|
5
|
+
|
|
6
|
+
# Island sampling (`is_motif_token` assignment)
|
|
7
|
+
island_sampling_kwargs:
|
|
8
|
+
island_len_min: 1
|
|
9
|
+
island_len_max: 12 # Rec 25, kept lower because unindexed motifs get sampled too and create more tokens.
|
|
10
|
+
n_islands_min: 2
|
|
11
|
+
n_islands_max: 5
|
|
12
|
+
|
|
13
|
+
# Subgraph / within-token sampling (`is_motif_atom` assignment)
|
|
14
|
+
p_diffuse_motif_sidechains: 0.80 # 80% probability of diffusing sidechains
|
|
15
|
+
p_diffuse_subgraph_atoms: 0.0 # 0% probability of sampling subgraph atoms (defaults to fully fixed)
|
|
16
|
+
subgraph_sampling_kwargs: # see tipatom
|
|
17
|
+
residue_p_seed_furthest_from_o: null
|
|
18
|
+
residue_n_bond_expectation: null
|
|
19
|
+
residue_p_fix_all: null
|
|
20
|
+
hetatom_n_bond_expectation: null
|
|
21
|
+
hetatom_p_fix_all: null
|
|
22
|
+
|
|
23
|
+
# Sets `is_motif_atom_with_fixed_seq`
|
|
24
|
+
p_fix_motif_sequence: 0.2 # probability that sequence is fixed for all motifs during training
|
|
25
|
+
# Sets `is_motif_atom_with_fixed_coord`
|
|
26
|
+
p_fix_motif_coordinates: 0.8 # Of the atoms that are sampled, should their coordinates be fixed?
|
|
27
|
+
# Sets `is_motif_atom_with_unindexed`
|
|
28
|
+
p_unindex_motif_tokens: 0.5 # probability of unindexing all motif atoms
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- island
|
|
3
|
+
- _self_
|
|
4
|
+
|
|
5
|
+
frequency: 1.0
|
|
6
|
+
name: sequence_design
|
|
7
|
+
|
|
8
|
+
island_sampling_kwargs:
|
|
9
|
+
island_len_min: 99999
|
|
10
|
+
island_len_max: 999999999
|
|
11
|
+
|
|
12
|
+
p_diffuse_motif_sidechains: 1.0
|
|
13
|
+
p_unindex_motif_tokens: 0.0
|
|
14
|
+
|
|
15
|
+
# For ChemNet-style sampling < 1.0
|
|
16
|
+
p_fix_motif_coordinates: 0.8
|
|
17
|
+
p_fix_motif_sequence: 0.1
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
|
|
2
|
+
defaults:
|
|
3
|
+
- island
|
|
4
|
+
- _self_
|
|
5
|
+
|
|
6
|
+
frequency: 1.0
|
|
7
|
+
name: tipatom
|
|
8
|
+
|
|
9
|
+
# Island sampling (`is_motif_token` assignment)
|
|
10
|
+
island_sampling_kwargs:
|
|
11
|
+
island_len_min: 1
|
|
12
|
+
island_len_max: 1
|
|
13
|
+
n_islands_min: 2
|
|
14
|
+
n_islands_max: 12
|
|
15
|
+
|
|
16
|
+
# Subgraph / within-token sampling (`is_motif_atom` assignment)
|
|
17
|
+
p_diffuse_motif_sidechains: 0.0 # 0% probability of diffusing sidechains
|
|
18
|
+
p_diffuse_subgraph_atoms: 1.0
|
|
19
|
+
subgraph_sampling_kwargs:
|
|
20
|
+
residue_p_seed_furthest_from_o: 0.8
|
|
21
|
+
residue_n_bond_expectation: 3.0
|
|
22
|
+
residue_p_fix_all: 0.05
|
|
23
|
+
hetatom_n_bond_expectation: 8
|
|
24
|
+
hetatom_p_fix_all: 0.5
|
|
25
|
+
|
|
26
|
+
p_fix_motif_sequence: 0.7
|
|
27
|
+
p_fix_motif_coordinates: 1.0
|
|
28
|
+
p_unindex_motif_tokens: 0.5
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Unconditional that fixes non-protein targets
|
|
2
|
+
|
|
3
|
+
defaults:
|
|
4
|
+
- island
|
|
5
|
+
- _self_
|
|
6
|
+
|
|
7
|
+
frequency: 1.0
|
|
8
|
+
name: unconditional
|
|
9
|
+
|
|
10
|
+
island_sampling_kwargs:
|
|
11
|
+
island_len_min: 0
|
|
12
|
+
island_len_max: 0
|
|
13
|
+
n_islands_min: 0
|
|
14
|
+
n_islands_max: 0
|
|
15
|
+
|
|
16
|
+
# Conditional assignments won't matter for protein regions since always diffused:
|
|
17
|
+
p_diffuse_motif_sidechains: 0.0
|
|
18
|
+
p_diffuse_subgraph_atoms: 0.0
|
|
19
|
+
p_fix_motif_sequence: 0.0
|
|
20
|
+
p_fix_motif_coordinates: 0.0
|
|
21
|
+
p_unindex_motif_tokens: 0.0
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# base training dataset for training AF3 design models (atom14 variants):
|
|
2
|
+
# protein subsampling only.
|
|
3
|
+
|
|
4
|
+
defaults:
|
|
5
|
+
# Grab datasets
|
|
6
|
+
- train/pdb/rfd3_train_interface@train.pdb.sub_datasets.interface
|
|
7
|
+
- train/pdb/rfd3_train_pn_unit@train.pdb.sub_datasets.pn_unit
|
|
8
|
+
#- train/rfd3_monomer_distillation@train
|
|
9
|
+
|
|
10
|
+
# Customized validation datasets
|
|
11
|
+
- val/unconditional@val.unconditional
|
|
12
|
+
- val/unconditional_deep@val.unconditional_deep
|
|
13
|
+
- val/indexed@val.indexed
|
|
14
|
+
|
|
15
|
+
# Customized train masks
|
|
16
|
+
- conditions/unconditional@global_transform_args.train_conditions.unconditional
|
|
17
|
+
- conditions/island@global_transform_args.train_conditions.island
|
|
18
|
+
- conditions/tipatom@global_transform_args.train_conditions.tipatom
|
|
19
|
+
- conditions/sequence_design@global_transform_args.train_conditions.sequence_design
|
|
20
|
+
- conditions/ppi@global_transform_args.train_conditions.ppi
|
|
21
|
+
|
|
22
|
+
- _self_
|
|
23
|
+
|
|
24
|
+
# Create a dictionary used for transform arguments
|
|
25
|
+
pipeline_target: rfd3.transforms.pipelines.build_atom14_base_pipeline
|
|
26
|
+
|
|
27
|
+
# Base config overrides:
|
|
28
|
+
diffusion_batch_size_train: 32
|
|
29
|
+
diffusion_batch_size_inference: 8
|
|
30
|
+
crop_size: 384
|
|
31
|
+
n_recycles_train: 2
|
|
32
|
+
n_recycles_validation: 1
|
|
33
|
+
max_atoms_in_crop: 3840 # ~10x crop size.
|
|
34
|
+
|
|
35
|
+
# Global transform arguments are necessary for arguments shared between training and inference
|
|
36
|
+
global_transform_args:
|
|
37
|
+
n_atoms_per_token: 14
|
|
38
|
+
central_atom: CB
|
|
39
|
+
sigma_perturb: 2.0
|
|
40
|
+
sigma_perturb_com: 1.0
|
|
41
|
+
association_scheme: dense
|
|
42
|
+
center_option: diffuse # options are ["all", "motif", "diffuse"]
|
|
43
|
+
|
|
44
|
+
# Reference conformer policy
|
|
45
|
+
generate_conformers: True
|
|
46
|
+
generate_conformers_for_non_protein_only: True
|
|
47
|
+
provide_reference_conformer_when_unmasked: True
|
|
48
|
+
ground_truth_conformer_policy: IGNORE # Other options: REPLACE, ADD, FALLBACK. See atomworks.enums for details
|
|
49
|
+
provide_elements_for_unindexed_components: True
|
|
50
|
+
use_element_for_atom_names_of_atomized_tokens: True # TODO: correct name, implies unindexed do too
|
|
51
|
+
|
|
52
|
+
# PPI Cropping
|
|
53
|
+
keep_full_binder_in_spatial_crop: False
|
|
54
|
+
max_binder_length: 170
|
|
55
|
+
|
|
56
|
+
# PPI Hotspots
|
|
57
|
+
max_ppi_hotspots_frac_to_provide: 0.2
|
|
58
|
+
ppi_hotspot_max_distance: 4.5
|
|
59
|
+
|
|
60
|
+
# Secondary structure features
|
|
61
|
+
max_ss_frac_to_provide: 0.4
|
|
62
|
+
min_ss_island_len: 1
|
|
63
|
+
max_ss_island_len: 10
|
|
64
|
+
|
|
65
|
+
train_conditions:
|
|
66
|
+
unconditional:
|
|
67
|
+
frequency: 5.0
|
|
68
|
+
sequence_design:
|
|
69
|
+
frequency: 2.0
|
|
70
|
+
island:
|
|
71
|
+
frequency: 1.0
|
|
72
|
+
tipatom:
|
|
73
|
+
frequency: 0.0
|
|
74
|
+
ppi:
|
|
75
|
+
frequency: 0.0
|
|
76
|
+
|
|
77
|
+
# Used to create simple boolean flags for downstream conditioning
|
|
78
|
+
meta_conditioning_probabilities:
|
|
79
|
+
calculate_hbonds: 0.2
|
|
80
|
+
calculate_rasa: 0.6
|
|
81
|
+
|
|
82
|
+
keep_protein_motif_rasa: 0.1 # Small to prevent noisy input to model
|
|
83
|
+
hbond_subsample: 0.5
|
|
84
|
+
|
|
85
|
+
# fully indexed training
|
|
86
|
+
unindex_leak_global_index: 0.10
|
|
87
|
+
unindex_insert_random_break: 0.10
|
|
88
|
+
unindex_remove_random_break: 0.10
|
|
89
|
+
|
|
90
|
+
# Probability of adding 1d secondary structure conditioning
|
|
91
|
+
add_1d_ss_features: 0.1
|
|
92
|
+
featurize_plddt: 0.9 # Applied for monomer distillation only
|
|
93
|
+
add_global_is_non_loopy_feature: 0.99
|
|
94
|
+
|
|
95
|
+
# PPI
|
|
96
|
+
add_ppi_hotspots: 0.75
|
|
97
|
+
full_binder_crop: 0.75
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- base
|
|
3
|
+
|
|
4
|
+
dataset:
|
|
5
|
+
dataset_parser:
|
|
6
|
+
_target_: atomworks.ml.datasets.parsers.InterfacesDFParser
|
|
7
|
+
base_dir: ${paths.data.pdb_data_dir}
|
|
8
|
+
dataset:
|
|
9
|
+
name: interface
|
|
10
|
+
data: ${paths.data.pdb_parquet_dir}/interfaces_df_train.parquet
|
|
11
|
+
filters:
|
|
12
|
+
# filters common across all PDB datasets
|
|
13
|
+
- "deposition_date < '2021-09-30'"
|
|
14
|
+
- "resolution < 9.0"
|
|
15
|
+
- "num_polymer_pn_units <= 300"
|
|
16
|
+
- "cluster.notnull()"
|
|
17
|
+
# interface specific filters
|
|
18
|
+
- "~(pn_unit_1_non_polymer_res_names.notnull() and pn_unit_1_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
|
|
19
|
+
- "~(pn_unit_2_non_polymer_res_names.notnull() and pn_unit_2_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
|
|
20
|
+
- "is_inter_molecule"
|
|
21
|
+
columns_to_load:
|
|
22
|
+
# columns common across all PDB datasets
|
|
23
|
+
- example_id
|
|
24
|
+
- pdb_id
|
|
25
|
+
- assembly_id
|
|
26
|
+
- deposition_date
|
|
27
|
+
- resolution
|
|
28
|
+
- num_polymer_pn_units
|
|
29
|
+
- method
|
|
30
|
+
- cluster
|
|
31
|
+
- n_prot
|
|
32
|
+
- n_nuc
|
|
33
|
+
- n_ligand
|
|
34
|
+
- n_peptide
|
|
35
|
+
# interface specific columns
|
|
36
|
+
- pn_unit_1_iid
|
|
37
|
+
- pn_unit_2_iid
|
|
38
|
+
- pn_unit_1_non_polymer_res_names
|
|
39
|
+
- pn_unit_2_non_polymer_res_names
|
|
40
|
+
- is_inter_molecule
|
|
41
|
+
- all_pn_unit_iids_after_processing
|
|
42
|
+
- involves_loi
|
|
43
|
+
transform:
|
|
44
|
+
# interface-specific Transform pipeline parameters
|
|
45
|
+
crop_contiguous_probability: 0.0
|
|
46
|
+
crop_spatial_probability: 1.0
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- base
|
|
3
|
+
|
|
4
|
+
dataset:
|
|
5
|
+
dataset_parser:
|
|
6
|
+
_target_: atomworks.ml.datasets.parsers.PNUnitsDFParser
|
|
7
|
+
base_dir: ${paths.data.pdb_data_dir}
|
|
8
|
+
dataset:
|
|
9
|
+
name: pn_unit
|
|
10
|
+
data: ${paths.data.pdb_parquet_dir}/pn_units_df_train.parquet
|
|
11
|
+
filters:
|
|
12
|
+
# filters common across all PDB datasets
|
|
13
|
+
- "deposition_date < '2021-09-30'"
|
|
14
|
+
- "resolution < 9.0"
|
|
15
|
+
- "num_polymer_pn_units <= 300"
|
|
16
|
+
- "cluster.notnull()"
|
|
17
|
+
# pn_unit specific filters
|
|
18
|
+
- "~(q_pn_unit_non_polymer_res_names.notnull() and q_pn_unit_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
|
|
19
|
+
columns_to_load:
|
|
20
|
+
# columns common across all PDB datasets
|
|
21
|
+
- example_id
|
|
22
|
+
- pdb_id
|
|
23
|
+
- assembly_id
|
|
24
|
+
- deposition_date
|
|
25
|
+
- resolution
|
|
26
|
+
- num_polymer_pn_units
|
|
27
|
+
- method
|
|
28
|
+
- cluster
|
|
29
|
+
- n_prot
|
|
30
|
+
- n_nuc
|
|
31
|
+
- n_ligand
|
|
32
|
+
- n_peptide
|
|
33
|
+
- total_num_atoms_in_unprocessed_assembly
|
|
34
|
+
# pn_unit specific columns
|
|
35
|
+
- q_pn_unit_iid
|
|
36
|
+
- q_pn_unit_non_polymer_res_names
|
|
37
|
+
- all_pn_unit_iids_after_processing
|
|
38
|
+
- q_pn_unit_is_loi
|
|
39
|
+
transform:
|
|
40
|
+
# pn_unit-specific Transform pipeline parameters
|
|
41
|
+
crop_contiguous_probability: 0.3333333333333333
|
|
42
|
+
crop_spatial_probability: 0.6666666666666667
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Adds weights to the sampler
|
|
2
|
+
|
|
3
|
+
defaults:
|
|
4
|
+
- base_no_weights
|
|
5
|
+
- _self_
|
|
6
|
+
|
|
7
|
+
weights:
|
|
8
|
+
_target_: atomworks.ml.samplers.calculate_weights_for_pdb_dataset_df
|
|
9
|
+
beta: 0.5
|
|
10
|
+
alphas:
|
|
11
|
+
a_prot: 3.0 # 3 for AF-3
|
|
12
|
+
a_nuc: 0.0 # 3 for AF-3
|
|
13
|
+
a_ligand: 1.0 # 1 for AF-3
|
|
14
|
+
a_loi: 5.0 # 5 for AF-3
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- base_transform_args
|
|
3
|
+
- _self_
|
|
4
|
+
|
|
5
|
+
dataset:
|
|
6
|
+
_target_: atomworks.ml.datasets.StructuralDatasetWrapper
|
|
7
|
+
save_failed_examples_to_dir: ${paths.data.failed_examples_dir}
|
|
8
|
+
cif_parser_args:
|
|
9
|
+
cache_dir: null
|
|
10
|
+
load_from_cache: false
|
|
11
|
+
save_to_cache: false
|
|
12
|
+
dataset:
|
|
13
|
+
_target_: atomworks.ml.datasets.PandasDataset
|
|
14
|
+
# we will use the example_id as the unique column
|
|
15
|
+
id_column: example_id
|
|
16
|
+
transform:
|
|
17
|
+
# common Transform pipeline components for all PDB datasets
|
|
18
|
+
_target_: ${datasets.pipeline_target}
|
|
19
|
+
is_inference: False
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# All required training args
|
|
2
|
+
|
|
3
|
+
defaults:
|
|
4
|
+
- _self_
|
|
5
|
+
|
|
6
|
+
dataset:
|
|
7
|
+
transform:
|
|
8
|
+
_target_: ${datasets.pipeline_target}
|
|
9
|
+
is_inference: False
|
|
10
|
+
return_atom_array: False
|
|
11
|
+
|
|
12
|
+
# Model
|
|
13
|
+
sigma_perturb: ${datasets.global_transform_args.sigma_perturb}
|
|
14
|
+
sigma_perturb_com: ${datasets.global_transform_args.sigma_perturb_com}
|
|
15
|
+
sigma_data: ${model.net.diffusion_module.sigma_data}
|
|
16
|
+
diffusion_batch_size: ${datasets.diffusion_batch_size_train}
|
|
17
|
+
central_atom: ${datasets.global_transform_args.central_atom}
|
|
18
|
+
n_atoms_per_token: ${datasets.global_transform_args.n_atoms_per_token}
|
|
19
|
+
association_scheme: ${datasets.global_transform_args.association_scheme}
|
|
20
|
+
center_option: ${datasets.global_transform_args.center_option}
|
|
21
|
+
|
|
22
|
+
# Conformers
|
|
23
|
+
generate_conformers: ${datasets.global_transform_args.generate_conformers}
|
|
24
|
+
generate_conformers_for_non_protein_only: ${datasets.global_transform_args.generate_conformers_for_non_protein_only}
|
|
25
|
+
provide_reference_conformer_when_unmasked: ${datasets.global_transform_args.provide_reference_conformer_when_unmasked}
|
|
26
|
+
ground_truth_conformer_policy: ${datasets.global_transform_args.ground_truth_conformer_policy}
|
|
27
|
+
provide_elements_for_unindexed_components: ${datasets.global_transform_args.provide_elements_for_unindexed_components}
|
|
28
|
+
use_element_for_atom_names_of_atomized_tokens: ${datasets.global_transform_args.use_element_for_atom_names_of_atomized_tokens}
|
|
29
|
+
residue_cache_dir: ${paths.data.residue_cache_dir}
|
|
30
|
+
|
|
31
|
+
# Conditions
|
|
32
|
+
train_conditions: ${datasets.global_transform_args.train_conditions}
|
|
33
|
+
meta_conditioning_probabilities: ${datasets.global_transform_args.meta_conditioning_probabilities}
|
|
34
|
+
|
|
35
|
+
# PPI Hypers
|
|
36
|
+
keep_full_binder_in_spatial_crop: ${datasets.global_transform_args.keep_full_binder_in_spatial_crop}
|
|
37
|
+
max_binder_length: ${datasets.global_transform_args.max_binder_length}
|
|
38
|
+
max_ppi_hotspots_frac_to_provide: ${datasets.global_transform_args.max_ppi_hotspots_frac_to_provide}
|
|
39
|
+
ppi_hotspot_max_distance: ${datasets.global_transform_args.ppi_hotspot_max_distance}
|
|
40
|
+
|
|
41
|
+
# 1D SS hypers
|
|
42
|
+
max_ss_frac_to_provide: ${datasets.global_transform_args.max_ss_frac_to_provide}
|
|
43
|
+
min_ss_island_len: ${datasets.global_transform_args.min_ss_island_len}
|
|
44
|
+
max_ss_island_len: ${datasets.global_transform_args.max_ss_island_len}
|
|
45
|
+
|
|
46
|
+
# Cropping
|
|
47
|
+
crop_size: ${datasets.crop_size}
|
|
48
|
+
max_atoms_in_crop: ${datasets.max_atoms_in_crop}
|
|
49
|
+
allowed_types: ALL
|
|
50
|
+
crop_spatial_probability: ???
|
|
51
|
+
crop_contiguous_probability: ???
|
|
52
|
+
dna_contact_crop_probability: 0.0
|
|
53
|
+
crop_center_cutoff_distance: 15.0
|
|
54
|
+
zero_occ_on_exposure_after_cropping: False
|
|
55
|
+
b_factor_min: null
|
|
56
|
+
|
|
57
|
+
# Other dataset-specific parameters
|
|
58
|
+
atom_1d_features: ${model.net.token_initializer.atom_1d_features}
|
|
59
|
+
token_1d_features: ${model.net.token_initializer.token_1d_features}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- base_no_weights
|
|
3
|
+
- _self_
|
|
4
|
+
|
|
5
|
+
dataset:
|
|
6
|
+
dataset_parser:
|
|
7
|
+
_target_: atomworks.ml.datasets.parsers.GenericDFParser
|
|
8
|
+
pn_unit_iid_colnames: null
|
|
9
|
+
|
|
10
|
+
dataset:
|
|
11
|
+
name: tf_distillation
|
|
12
|
+
data: /projects/ml/prot_dna/transcriptionFactor_distillation_rf3.newDL.csv
|
|
13
|
+
columns_to_load:
|
|
14
|
+
- example_id
|
|
15
|
+
- path
|
|
16
|
+
|
|
17
|
+
transform:
|
|
18
|
+
crop_contiguous_probability: 0.4
|
|
19
|
+
crop_spatial_probability: 0.0
|
|
20
|
+
dna_contact_crop_probability: 0.6
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Inherit
|
|
2
|
+
defaults:
|
|
3
|
+
- af3_train_interface
|
|
4
|
+
- pdb_base
|
|
5
|
+
- _self_
|
|
6
|
+
|
|
7
|
+
dataset:
|
|
8
|
+
transform:
|
|
9
|
+
crop_contiguous_probability: 0.0
|
|
10
|
+
crop_spatial_probability: 1.0
|
|
11
|
+
filters:
|
|
12
|
+
# filters common across all PDB datasets
|
|
13
|
+
- 'pdb_id not in ["7rte", "7m5w", "7n5u"]'
|
|
14
|
+
- 'pdb_id not in ["3di3", "5o45", "1z92", "2gy5", "4zxb"]'
|
|
15
|
+
- "deposition_date < '2024-12-16'"
|
|
16
|
+
- "resolution < 9.0"
|
|
17
|
+
- "num_polymer_pn_units <= 300"
|
|
18
|
+
- "cluster.notnull()"
|
|
19
|
+
# interface specific filters
|
|
20
|
+
- "~(pn_unit_1_non_polymer_res_names.notnull() and pn_unit_1_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
|
|
21
|
+
- "~(pn_unit_2_non_polymer_res_names.notnull() and pn_unit_2_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
|
|
22
|
+
- "is_inter_molecule"
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- af3_train_pn_unit
|
|
3
|
+
- pdb_base
|
|
4
|
+
- _self_
|
|
5
|
+
|
|
6
|
+
dataset:
|
|
7
|
+
transform:
|
|
8
|
+
# pn_unit-specific Transform pipeline parameters
|
|
9
|
+
crop_contiguous_probability: 0.25
|
|
10
|
+
crop_spatial_probability: 0.75
|
|
11
|
+
|
|
12
|
+
# Modify: date & clustering parquet
|
|
13
|
+
dataset:
|
|
14
|
+
filters:
|
|
15
|
+
# filters common across all PDB datasets
|
|
16
|
+
- 'pdb_id not in ["7rte", "7m5w", "7n5u"]'
|
|
17
|
+
- 'pdb_id not in ["3di3", "5o45", "1z92", "2gy5", "4zxb"]'
|
|
18
|
+
- "deposition_date < '2024-12-16'"
|
|
19
|
+
- "resolution < 9.0"
|
|
20
|
+
- "num_polymer_pn_units <= 300"
|
|
21
|
+
- "cluster.notnull()"
|
|
22
|
+
# pn_unit specific filters
|
|
23
|
+
- "~(q_pn_unit_non_polymer_res_names.notnull() and q_pn_unit_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- pdb/base_transform_args@monomer_distillation
|
|
3
|
+
- _self_
|
|
4
|
+
|
|
5
|
+
monomer_distillation:
|
|
6
|
+
dataset:
|
|
7
|
+
_target_: atomworks.ml.datasets.StructuralDatasetWrapper
|
|
8
|
+
save_failed_examples_to_dir: ${paths.data.failed_examples_dir}
|
|
9
|
+
# Explicitly do not load from cache.
|
|
10
|
+
# Dataset too big, and structures are small
|
|
11
|
+
cif_parser_args:
|
|
12
|
+
cache_dir: null
|
|
13
|
+
load_from_cache: False
|
|
14
|
+
save_to_cache: False
|
|
15
|
+
|
|
16
|
+
# metadata dataset
|
|
17
|
+
dataset:
|
|
18
|
+
_target_: atomworks.ml.datasets.PandasDataset
|
|
19
|
+
name: af2fb_distillation
|
|
20
|
+
id_column: example_id
|
|
21
|
+
data: ${paths.data.monomer_distillation_parquet_dir}/af2_distillation_facebook.parquet
|
|
22
|
+
columns_to_load:
|
|
23
|
+
- example_id
|
|
24
|
+
- path
|
|
25
|
+
|
|
26
|
+
# metadata parser
|
|
27
|
+
dataset_parser:
|
|
28
|
+
_target_: atomworks.ml.datasets.parsers.GenericDFParser
|
|
29
|
+
pn_unit_iid_colnames: null
|
|
30
|
+
|
|
31
|
+
transform:
|
|
32
|
+
_target_: ${datasets.pipeline_target}
|
|
33
|
+
is_inference: False
|
|
34
|
+
# protein_msa_dirs: [{"dir": "${paths.data.monomer_distillation_data_dir}/msa", "extension": ".a3m", "directory_depth": 2}]
|
|
35
|
+
# rna_msa_dirs: []
|
|
36
|
+
crop_contiguous_probability: 0.25
|
|
37
|
+
crop_spatial_probability: 0.75
|
|
38
|
+
b_factor_min: 70
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
dataset:
|
|
2
|
+
_target_: rfd3.inference.datasets.ContigJsonDataset
|
|
3
|
+
|
|
4
|
+
# Required parameters for each inheriting dataset
|
|
5
|
+
data: ??? # Path to json file
|
|
6
|
+
name: ??? # Name for displaying and saving files
|
|
7
|
+
eval_every_n: ??? # Evaluate on this dataset every n epochs
|
|
8
|
+
subset_to_keys: null # Specific keys in json to keep, ignores all others.
|
|
9
|
+
|
|
10
|
+
# NB: Used for parsing input files (not for atom_array reloading anymore)
|
|
11
|
+
cif_parser_args:
|
|
12
|
+
cache_dir: null
|
|
13
|
+
load_from_cache: False
|
|
14
|
+
save_to_cache: False
|
|
15
|
+
add_missing_atoms: False
|
|
16
|
+
|
|
17
|
+
# Common Transform pipeline components for all PDB datasets
|
|
18
|
+
transform:
|
|
19
|
+
_target_: ${datasets.pipeline_target}
|
|
20
|
+
is_inference: True
|
|
21
|
+
return_atom_array: True
|
|
22
|
+
diffusion_batch_size: ${datasets.diffusion_batch_size_train}
|
|
23
|
+
sigma_data: ${model.net.diffusion_module.sigma_data}
|
|
24
|
+
central_atom: ${datasets.global_transform_args.central_atom}
|
|
25
|
+
n_atoms_per_token: ${datasets.global_transform_args.n_atoms_per_token}
|
|
26
|
+
association_scheme: ${datasets.global_transform_args.association_scheme}
|
|
27
|
+
center_option: ${datasets.global_transform_args.center_option}
|
|
28
|
+
|
|
29
|
+
# Conformers
|
|
30
|
+
generate_conformers: ${datasets.global_transform_args.generate_conformers}
|
|
31
|
+
generate_conformers_for_non_protein_only: ${datasets.global_transform_args.generate_conformers_for_non_protein_only}
|
|
32
|
+
provide_reference_conformer_when_unmasked: ${datasets.global_transform_args.provide_reference_conformer_when_unmasked}
|
|
33
|
+
ground_truth_conformer_policy: ${datasets.global_transform_args.ground_truth_conformer_policy}
|
|
34
|
+
provide_elements_for_unindexed_components: ${datasets.global_transform_args.provide_elements_for_unindexed_components}
|
|
35
|
+
use_element_for_atom_names_of_atomized_tokens: ${datasets.global_transform_args.use_element_for_atom_names_of_atomized_tokens}
|
|
36
|
+
residue_cache_dir: ${paths.data.residue_cache_dir}
|
|
37
|
+
|
|
38
|
+
# Other dataset-specific parameters
|
|
39
|
+
atom_1d_features: ${model.net.token_initializer.atom_1d_features}
|
|
40
|
+
token_1d_features: ${model.net.token_initializer.token_1d_features}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
defaults:
|
|
2
|
+
- sm_binder_hbonds
|
|
3
|
+
- _self_
|
|
4
|
+
|
|
5
|
+
dataset:
|
|
6
|
+
eval_every_n: 1
|
|
7
|
+
data: ${paths.data.design_benchmark_data_dir}/sm_binder_hbonds_sampled.json
|
|
8
|
+
name: sm_binder_hbonds-design-short
|
|
9
|
+
subset_to_keys:
|
|
10
|
+
- FAD_1
|
|
11
|
+
- FAD_2
|
|
12
|
+
- FAD_3
|
|
13
|
+
- IAI_1
|
|
14
|
+
- IAI_2
|
|
15
|
+
- IAI_3
|