rc-foundry 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. foundry/version.py +2 -2
  2. {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/METADATA +1 -1
  3. {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/RECORD +139 -8
  4. rf3/configs/callbacks/default.yaml +5 -0
  5. rf3/configs/callbacks/dump_validation_structures.yaml +6 -0
  6. rf3/configs/callbacks/metrics_logging.yaml +10 -0
  7. rf3/configs/callbacks/train_logging.yaml +16 -0
  8. rf3/configs/dataloader/default.yaml +15 -0
  9. rf3/configs/datasets/base.yaml +31 -0
  10. rf3/configs/datasets/pdb_and_distillation.yaml +58 -0
  11. rf3/configs/datasets/pdb_only.yaml +17 -0
  12. rf3/configs/datasets/train/disorder_distillation.yaml +48 -0
  13. rf3/configs/datasets/train/domain_distillation.yaml +50 -0
  14. rf3/configs/datasets/train/monomer_distillation.yaml +49 -0
  15. rf3/configs/datasets/train/na_complex_distillation.yaml +50 -0
  16. rf3/configs/datasets/train/pdb/af3_weighted_sampling.yaml +8 -0
  17. rf3/configs/datasets/train/pdb/base.yaml +32 -0
  18. rf3/configs/datasets/train/pdb/plinder.yaml +54 -0
  19. rf3/configs/datasets/train/pdb/train_interface.yaml +51 -0
  20. rf3/configs/datasets/train/pdb/train_pn_unit.yaml +46 -0
  21. rf3/configs/datasets/train/rna_monomer_distillation.yaml +56 -0
  22. rf3/configs/datasets/val/af3_ab_set.yaml +11 -0
  23. rf3/configs/datasets/val/af3_validation.yaml +11 -0
  24. rf3/configs/datasets/val/base.yaml +32 -0
  25. rf3/configs/datasets/val/runs_and_poses.yaml +12 -0
  26. rf3/configs/debug/default.yaml +66 -0
  27. rf3/configs/debug/train_specific_examples.yaml +21 -0
  28. rf3/configs/experiment/pretrained/rf3.yaml +50 -0
  29. rf3/configs/experiment/pretrained/rf3_with_confidence.yaml +13 -0
  30. rf3/configs/experiment/quick-rf3-with-confidence.yaml +15 -0
  31. rf3/configs/experiment/quick-rf3.yaml +61 -0
  32. rf3/configs/hydra/default.yaml +18 -0
  33. rf3/configs/hydra/no_logging.yaml +7 -0
  34. rf3/configs/inference.yaml +7 -0
  35. rf3/configs/inference_engine/base.yaml +23 -0
  36. rf3/configs/inference_engine/rf3.yaml +33 -0
  37. rf3/configs/logger/csv.yaml +6 -0
  38. rf3/configs/logger/default.yaml +3 -0
  39. rf3/configs/logger/wandb.yaml +15 -0
  40. rf3/configs/model/components/ema.yaml +1 -0
  41. rf3/configs/model/components/rf3_net.yaml +177 -0
  42. rf3/configs/model/components/rf3_net_with_confidence_head.yaml +45 -0
  43. rf3/configs/model/optimizers/adam.yaml +5 -0
  44. rf3/configs/model/rf3.yaml +43 -0
  45. rf3/configs/model/rf3_with_confidence.yaml +7 -0
  46. rf3/configs/model/schedulers/af3.yaml +6 -0
  47. rf3/configs/paths/data/default.yaml +43 -0
  48. rf3/configs/paths/default.yaml +21 -0
  49. rf3/configs/train.yaml +42 -0
  50. rf3/configs/trainer/cpu.yaml +6 -0
  51. rf3/configs/trainer/ddp.yaml +5 -0
  52. rf3/configs/trainer/loss/losses/confidence_loss.yaml +29 -0
  53. rf3/configs/trainer/loss/losses/diffusion_loss.yaml +9 -0
  54. rf3/configs/trainer/loss/losses/distogram_loss.yaml +2 -0
  55. rf3/configs/trainer/loss/structure_prediction.yaml +4 -0
  56. rf3/configs/trainer/loss/structure_prediction_with_confidence.yaml +2 -0
  57. rf3/configs/trainer/metrics/structure_prediction.yaml +14 -0
  58. rf3/configs/trainer/rf3.yaml +20 -0
  59. rf3/configs/trainer/rf3_with_confidence.yaml +13 -0
  60. rf3/configs/validate.yaml +45 -0
  61. rfd3/cli.py +10 -4
  62. rfd3/configs/__init__.py +0 -0
  63. rfd3/configs/callbacks/design_callbacks.yaml +10 -0
  64. rfd3/configs/callbacks/metrics_logging.yaml +20 -0
  65. rfd3/configs/callbacks/train_logging.yaml +24 -0
  66. rfd3/configs/dataloader/default.yaml +15 -0
  67. rfd3/configs/dataloader/fast.yaml +11 -0
  68. rfd3/configs/datasets/conditions/dna_condition.yaml +3 -0
  69. rfd3/configs/datasets/conditions/island.yaml +28 -0
  70. rfd3/configs/datasets/conditions/ppi.yaml +2 -0
  71. rfd3/configs/datasets/conditions/sequence_design.yaml +17 -0
  72. rfd3/configs/datasets/conditions/tipatom.yaml +28 -0
  73. rfd3/configs/datasets/conditions/unconditional.yaml +21 -0
  74. rfd3/configs/datasets/design_base.yaml +97 -0
  75. rfd3/configs/datasets/train/pdb/af3_train_interface.yaml +46 -0
  76. rfd3/configs/datasets/train/pdb/af3_train_pn_unit.yaml +42 -0
  77. rfd3/configs/datasets/train/pdb/base.yaml +14 -0
  78. rfd3/configs/datasets/train/pdb/base_no_weights.yaml +19 -0
  79. rfd3/configs/datasets/train/pdb/base_transform_args.yaml +59 -0
  80. rfd3/configs/datasets/train/pdb/na_complex_distillation.yaml +20 -0
  81. rfd3/configs/datasets/train/pdb/pdb_base.yaml +11 -0
  82. rfd3/configs/datasets/train/pdb/rfd3_train_interface.yaml +22 -0
  83. rfd3/configs/datasets/train/pdb/rfd3_train_pn_unit.yaml +23 -0
  84. rfd3/configs/datasets/train/rfd3_monomer_distillation.yaml +38 -0
  85. rfd3/configs/datasets/val/bcov_ppi_easy_medium.yaml +9 -0
  86. rfd3/configs/datasets/val/design_validation_base.yaml +40 -0
  87. rfd3/configs/datasets/val/dna_binder_design5.yaml +9 -0
  88. rfd3/configs/datasets/val/dna_binder_long.yaml +13 -0
  89. rfd3/configs/datasets/val/dna_binder_short.yaml +13 -0
  90. rfd3/configs/datasets/val/indexed.yaml +9 -0
  91. rfd3/configs/datasets/val/mcsa_41.yaml +9 -0
  92. rfd3/configs/datasets/val/mcsa_41_short_rigid.yaml +10 -0
  93. rfd3/configs/datasets/val/ppi_inference.yaml +7 -0
  94. rfd3/configs/datasets/val/sm_binder_hbonds.yaml +13 -0
  95. rfd3/configs/datasets/val/sm_binder_hbonds_short.yaml +15 -0
  96. rfd3/configs/datasets/val/unconditional.yaml +9 -0
  97. rfd3/configs/datasets/val/unconditional_deep.yaml +9 -0
  98. rfd3/configs/datasets/val/unindexed.yaml +8 -0
  99. rfd3/configs/datasets/val/val_examples/bcov_ppi_easy_medium_with_ori.yaml +151 -0
  100. rfd3/configs/datasets/val/val_examples/bcov_ppi_easy_medium_with_ori_spoof_helical_bundle.yaml +7 -0
  101. rfd3/configs/datasets/val/val_examples/bcov_ppi_easy_medium_with_ori_varying_lengths.yaml +28 -0
  102. rfd3/configs/datasets/val/val_examples/bpem_ori_hb.yaml +212 -0
  103. rfd3/configs/debug/default.yaml +64 -0
  104. rfd3/configs/debug/train_specific_examples.yaml +21 -0
  105. rfd3/configs/dev.yaml +9 -0
  106. rfd3/configs/experiment/debug.yaml +14 -0
  107. rfd3/configs/experiment/pretrain.yaml +31 -0
  108. rfd3/configs/experiment/test-uncond.yaml +10 -0
  109. rfd3/configs/experiment/test-unindexed.yaml +21 -0
  110. rfd3/configs/hydra/default.yaml +18 -0
  111. rfd3/configs/hydra/no_logging.yaml +7 -0
  112. rfd3/configs/inference.yaml +9 -0
  113. rfd3/configs/inference_engine/base.yaml +15 -0
  114. rfd3/configs/inference_engine/dev.yaml +20 -0
  115. rfd3/configs/inference_engine/rfdiffusion3.yaml +65 -0
  116. rfd3/configs/logger/csv.yaml +6 -0
  117. rfd3/configs/logger/default.yaml +2 -0
  118. rfd3/configs/logger/wandb.yaml +15 -0
  119. rfd3/configs/model/components/ema.yaml +1 -0
  120. rfd3/configs/model/components/rfd3_net.yaml +131 -0
  121. rfd3/configs/model/optimizers/adam.yaml +5 -0
  122. rfd3/configs/model/rfd3_base.yaml +8 -0
  123. rfd3/configs/model/samplers/edm.yaml +21 -0
  124. rfd3/configs/model/samplers/symmetry.yaml +10 -0
  125. rfd3/configs/model/schedulers/af3.yaml +6 -0
  126. rfd3/configs/paths/data/default.yaml +18 -0
  127. rfd3/configs/paths/default.yaml +22 -0
  128. rfd3/configs/train.yaml +28 -0
  129. rfd3/configs/trainer/cpu.yaml +6 -0
  130. rfd3/configs/trainer/ddp.yaml +5 -0
  131. rfd3/configs/trainer/loss/losses/diffusion_loss.yaml +12 -0
  132. rfd3/configs/trainer/loss/losses/sequence_loss.yaml +3 -0
  133. rfd3/configs/trainer/metrics/design_metrics.yaml +22 -0
  134. rfd3/configs/trainer/rfd3_base.yaml +35 -0
  135. rfd3/configs/validate.yaml +34 -0
  136. rfd3/run_inference.py +3 -7
  137. {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/WHEEL +0 -0
  138. {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/entry_points.txt +0 -0
  139. {rc_foundry-0.1.4.dist-info → rc_foundry-0.1.6.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,3 @@
1
+ _target_: rfd3.transforms.training_conditions.SubtypeCondition
2
+ frequency: 1.0
3
+ subtype: ["is_dna", "is_rna"]
@@ -0,0 +1,28 @@
1
+
2
+ _target_: rfd3.transforms.training_conditions.IslandCondition
3
+ frequency: 1.0
4
+ name: island
5
+
6
+ # Island sampling (`is_motif_token` assignment)
7
+ island_sampling_kwargs:
8
+ island_len_min: 1
9
+ island_len_max: 12 # Rec 25, kept lower because unindexed motifs get sampled too and create more tokens.
10
+ n_islands_min: 2
11
+ n_islands_max: 5
12
+
13
+ # Subgraph / within-token sampling (`is_motif_atom` assignment)
14
+ p_diffuse_motif_sidechains: 0.80 # 80% probability of diffusing sidechains
15
+ p_diffuse_subgraph_atoms: 0.0 # 0% probability of sampling subgraph atoms (defaults to fully fixed)
16
+ subgraph_sampling_kwargs: # see tipatom
17
+ residue_p_seed_furthest_from_o: null
18
+ residue_n_bond_expectation: null
19
+ residue_p_fix_all: null
20
+ hetatom_n_bond_expectation: null
21
+ hetatom_p_fix_all: null
22
+
23
+ # Sets `is_motif_atom_with_fixed_seq`
24
+ p_fix_motif_sequence: 0.2 # probability that sequence is fixed for all motifs during training
25
+ # Sets `is_motif_atom_with_fixed_coord`
26
+ p_fix_motif_coordinates: 0.8 # Of the atoms that are sampled, should their coordinates be fixed?
27
+ # Sets `is_motif_atom_with_unindexed`
28
+ p_unindex_motif_tokens: 0.5 # probability of unindexing all motif atoms
@@ -0,0 +1,2 @@
1
+ _target_: rfd3.transforms.training_conditions.PPICondition
2
+ frequency: 1.0
@@ -0,0 +1,17 @@
1
+ defaults:
2
+ - island
3
+ - _self_
4
+
5
+ frequency: 1.0
6
+ name: sequence_design
7
+
8
+ island_sampling_kwargs:
9
+ island_len_min: 99999
10
+ island_len_max: 999999999
11
+
12
+ p_diffuse_motif_sidechains: 1.0
13
+ p_unindex_motif_tokens: 0.0
14
+
15
+ # For ChemNet-style sampling < 1.0
16
+ p_fix_motif_coordinates: 0.8
17
+ p_fix_motif_sequence: 0.1
@@ -0,0 +1,28 @@
1
+
2
+ defaults:
3
+ - island
4
+ - _self_
5
+
6
+ frequency: 1.0
7
+ name: tipatom
8
+
9
+ # Island sampling (`is_motif_token` assignment)
10
+ island_sampling_kwargs:
11
+ island_len_min: 1
12
+ island_len_max: 1
13
+ n_islands_min: 2
14
+ n_islands_max: 12
15
+
16
+ # Subgraph / within-token sampling (`is_motif_atom` assignment)
17
+ p_diffuse_motif_sidechains: 0.0 # 0% probability of diffusing sidechains
18
+ p_diffuse_subgraph_atoms: 1.0
19
+ subgraph_sampling_kwargs:
20
+ residue_p_seed_furthest_from_o: 0.8
21
+ residue_n_bond_expectation: 3.0
22
+ residue_p_fix_all: 0.05
23
+ hetatom_n_bond_expectation: 8
24
+ hetatom_p_fix_all: 0.5
25
+
26
+ p_fix_motif_sequence: 0.7
27
+ p_fix_motif_coordinates: 1.0
28
+ p_unindex_motif_tokens: 0.5
@@ -0,0 +1,21 @@
1
+ # Unconditional that fixes non-protein targets
2
+
3
+ defaults:
4
+ - island
5
+ - _self_
6
+
7
+ frequency: 1.0
8
+ name: unconditional
9
+
10
+ island_sampling_kwargs:
11
+ island_len_min: 0
12
+ island_len_max: 0
13
+ n_islands_min: 0
14
+ n_islands_max: 0
15
+
16
+ # Conditional assignments won't matter for protein regions since they are always diffused:
17
+ p_diffuse_motif_sidechains: 0.0
18
+ p_diffuse_subgraph_atoms: 0.0
19
+ p_fix_motif_sequence: 0.0
20
+ p_fix_motif_coordinates: 0.0
21
+ p_unindex_motif_tokens: 0.0
@@ -0,0 +1,97 @@
1
+ # base training dataset for training AF3 design models (atom14 variants):
2
+ # protein subsampling only.
3
+
4
+ defaults:
5
+ # Grab datasets
6
+ - train/pdb/rfd3_train_interface@train.pdb.sub_datasets.interface
7
+ - train/pdb/rfd3_train_pn_unit@train.pdb.sub_datasets.pn_unit
8
+ #- train/rfd3_monomer_distillation@train
9
+
10
+ # Customized validation datasets
11
+ - val/unconditional@val.unconditional
12
+ - val/unconditional_deep@val.unconditional_deep
13
+ - val/indexed@val.indexed
14
+
15
+ # Customized train masks
16
+ - conditions/unconditional@global_transform_args.train_conditions.unconditional
17
+ - conditions/island@global_transform_args.train_conditions.island
18
+ - conditions/tipatom@global_transform_args.train_conditions.tipatom
19
+ - conditions/sequence_design@global_transform_args.train_conditions.sequence_design
20
+ - conditions/ppi@global_transform_args.train_conditions.ppi
21
+
22
+ - _self_
23
+
24
+ # Create a dictionary used for transform arguments
25
+ pipeline_target: rfd3.transforms.pipelines.build_atom14_base_pipeline
26
+
27
+ # Base config overrides:
28
+ diffusion_batch_size_train: 32
29
+ diffusion_batch_size_inference: 8
30
+ crop_size: 384
31
+ n_recycles_train: 2
32
+ n_recycles_validation: 1
33
+ max_atoms_in_crop: 3840 # ~10x crop size.
34
+
35
+ # Global transform arguments are necessary for arguments shared between training and inference
36
+ global_transform_args:
37
+ n_atoms_per_token: 14
38
+ central_atom: CB
39
+ sigma_perturb: 2.0
40
+ sigma_perturb_com: 1.0
41
+ association_scheme: dense
42
+ center_option: diffuse # options are ["all", "motif", "diffuse"]
43
+
44
+ # Reference conformer policy
45
+ generate_conformers: True
46
+ generate_conformers_for_non_protein_only: True
47
+ provide_reference_conformer_when_unmasked: True
48
+ ground_truth_conformer_policy: IGNORE # Other options: REPLACE, ADD, FALLBACK. See atomworks.enums for details
49
+ provide_elements_for_unindexed_components: True
50
+ use_element_for_atom_names_of_atomized_tokens: True # TODO: correct name, implies unindexed do too
51
+
52
+ # PPI Cropping
53
+ keep_full_binder_in_spatial_crop: False
54
+ max_binder_length: 170
55
+
56
+ # PPI Hotspots
57
+ max_ppi_hotspots_frac_to_provide: 0.2
58
+ ppi_hotspot_max_distance: 4.5
59
+
60
+ # Secondary structure features
61
+ max_ss_frac_to_provide: 0.4
62
+ min_ss_island_len: 1
63
+ max_ss_island_len: 10
64
+
65
+ train_conditions:
66
+ unconditional:
67
+ frequency: 5.0
68
+ sequence_design:
69
+ frequency: 2.0
70
+ island:
71
+ frequency: 1.0
72
+ tipatom:
73
+ frequency: 0.0
74
+ ppi:
75
+ frequency: 0.0
76
+
77
+ # Used to create simple boolean flags for downstream conditioning
78
+ meta_conditioning_probabilities:
79
+ calculate_hbonds: 0.2
80
+ calculate_rasa: 0.6
81
+
82
+ keep_protein_motif_rasa: 0.1 # Small to prevent noisy input to model
83
+ hbond_subsample: 0.5
84
+
85
+ # fully indexed training
86
+ unindex_leak_global_index: 0.10
87
+ unindex_insert_random_break: 0.10
88
+ unindex_remove_random_break: 0.10
89
+
90
+ # Probability of adding 1d secondary structure conditioning
91
+ add_1d_ss_features: 0.1
92
+ featurize_plddt: 0.9 # Applied for monomer distillation only
93
+ add_global_is_non_loopy_feature: 0.99
94
+
95
+ # PPI
96
+ add_ppi_hotspots: 0.75
97
+ full_binder_crop: 0.75
@@ -0,0 +1,46 @@
1
+ defaults:
2
+ - base
3
+
4
+ dataset:
5
+ dataset_parser:
6
+ _target_: atomworks.ml.datasets.parsers.InterfacesDFParser
7
+ base_dir: ${paths.data.pdb_data_dir}
8
+ dataset:
9
+ name: interface
10
+ data: ${paths.data.pdb_parquet_dir}/interfaces_df_train.parquet
11
+ filters:
12
+ # filters common across all PDB datasets
13
+ - "deposition_date < '2021-09-30'"
14
+ - "resolution < 9.0"
15
+ - "num_polymer_pn_units <= 300"
16
+ - "cluster.notnull()"
17
+ # interface specific filters
18
+ - "~(pn_unit_1_non_polymer_res_names.notnull() and pn_unit_1_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
19
+ - "~(pn_unit_2_non_polymer_res_names.notnull() and pn_unit_2_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
20
+ - "is_inter_molecule"
21
+ columns_to_load:
22
+ # columns common across all PDB datasets
23
+ - example_id
24
+ - pdb_id
25
+ - assembly_id
26
+ - deposition_date
27
+ - resolution
28
+ - num_polymer_pn_units
29
+ - method
30
+ - cluster
31
+ - n_prot
32
+ - n_nuc
33
+ - n_ligand
34
+ - n_peptide
35
+ # interface specific columns
36
+ - pn_unit_1_iid
37
+ - pn_unit_2_iid
38
+ - pn_unit_1_non_polymer_res_names
39
+ - pn_unit_2_non_polymer_res_names
40
+ - is_inter_molecule
41
+ - all_pn_unit_iids_after_processing
42
+ - involves_loi
43
+ transform:
44
+ # interface-specific Transform pipeline parameters
45
+ crop_contiguous_probability: 0.0
46
+ crop_spatial_probability: 1.0
@@ -0,0 +1,42 @@
1
+ defaults:
2
+ - base
3
+
4
+ dataset:
5
+ dataset_parser:
6
+ _target_: atomworks.ml.datasets.parsers.PNUnitsDFParser
7
+ base_dir: ${paths.data.pdb_data_dir}
8
+ dataset:
9
+ name: pn_unit
10
+ data: ${paths.data.pdb_parquet_dir}/pn_units_df_train.parquet
11
+ filters:
12
+ # filters common across all PDB datasets
13
+ - "deposition_date < '2021-09-30'"
14
+ - "resolution < 9.0"
15
+ - "num_polymer_pn_units <= 300"
16
+ - "cluster.notnull()"
17
+ # pn_unit specific filters
18
+ - "~(q_pn_unit_non_polymer_res_names.notnull() and q_pn_unit_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
19
+ columns_to_load:
20
+ # columns common across all PDB datasets
21
+ - example_id
22
+ - pdb_id
23
+ - assembly_id
24
+ - deposition_date
25
+ - resolution
26
+ - num_polymer_pn_units
27
+ - method
28
+ - cluster
29
+ - n_prot
30
+ - n_nuc
31
+ - n_ligand
32
+ - n_peptide
33
+ - total_num_atoms_in_unprocessed_assembly
34
+ # pn_unit specific columns
35
+ - q_pn_unit_iid
36
+ - q_pn_unit_non_polymer_res_names
37
+ - all_pn_unit_iids_after_processing
38
+ - q_pn_unit_is_loi
39
+ transform:
40
+ # pn_unit-specific Transform pipeline parameters
41
+ crop_contiguous_probability: 0.3333333333333333
42
+ crop_spatial_probability: 0.6666666666666667
@@ -0,0 +1,14 @@
1
+ # Adds weights to the sampler
2
+
3
+ defaults:
4
+ - base_no_weights
5
+ - _self_
6
+
7
+ weights:
8
+ _target_: atomworks.ml.samplers.calculate_weights_for_pdb_dataset_df
9
+ beta: 0.5
10
+ alphas:
11
+ a_prot: 3.0 # 3 for AF-3
12
+ a_nuc: 0.0 # 3 for AF-3
13
+ a_ligand: 1.0 # 1 for AF-3
14
+ a_loi: 5.0 # 5 for AF-3
@@ -0,0 +1,19 @@
1
+ defaults:
2
+ - base_transform_args
3
+ - _self_
4
+
5
+ dataset:
6
+ _target_: atomworks.ml.datasets.StructuralDatasetWrapper
7
+ save_failed_examples_to_dir: ${paths.data.failed_examples_dir}
8
+ cif_parser_args:
9
+ cache_dir: null
10
+ load_from_cache: false
11
+ save_to_cache: false
12
+ dataset:
13
+ _target_: atomworks.ml.datasets.PandasDataset
14
+ # we will use the example_id as the unique column
15
+ id_column: example_id
16
+ transform:
17
+ # common Transform pipeline components for all PDB datasets
18
+ _target_: ${datasets.pipeline_target}
19
+ is_inference: False
@@ -0,0 +1,59 @@
1
+ # All required training args
2
+
3
+ defaults:
4
+ - _self_
5
+
6
+ dataset:
7
+ transform:
8
+ _target_: ${datasets.pipeline_target}
9
+ is_inference: False
10
+ return_atom_array: False
11
+
12
+ # Model
13
+ sigma_perturb: ${datasets.global_transform_args.sigma_perturb}
14
+ sigma_perturb_com: ${datasets.global_transform_args.sigma_perturb_com}
15
+ sigma_data: ${model.net.diffusion_module.sigma_data}
16
+ diffusion_batch_size: ${datasets.diffusion_batch_size_train}
17
+ central_atom: ${datasets.global_transform_args.central_atom}
18
+ n_atoms_per_token: ${datasets.global_transform_args.n_atoms_per_token}
19
+ association_scheme: ${datasets.global_transform_args.association_scheme}
20
+ center_option: ${datasets.global_transform_args.center_option}
21
+
22
+ # Conformers
23
+ generate_conformers: ${datasets.global_transform_args.generate_conformers}
24
+ generate_conformers_for_non_protein_only: ${datasets.global_transform_args.generate_conformers_for_non_protein_only}
25
+ provide_reference_conformer_when_unmasked: ${datasets.global_transform_args.provide_reference_conformer_when_unmasked}
26
+ ground_truth_conformer_policy: ${datasets.global_transform_args.ground_truth_conformer_policy}
27
+ provide_elements_for_unindexed_components: ${datasets.global_transform_args.provide_elements_for_unindexed_components}
28
+ use_element_for_atom_names_of_atomized_tokens: ${datasets.global_transform_args.use_element_for_atom_names_of_atomized_tokens}
29
+ residue_cache_dir: ${paths.data.residue_cache_dir}
30
+
31
+ # Conditions
32
+ train_conditions: ${datasets.global_transform_args.train_conditions}
33
+ meta_conditioning_probabilities: ${datasets.global_transform_args.meta_conditioning_probabilities}
34
+
35
+ # PPI Hypers
36
+ keep_full_binder_in_spatial_crop: ${datasets.global_transform_args.keep_full_binder_in_spatial_crop}
37
+ max_binder_length: ${datasets.global_transform_args.max_binder_length}
38
+ max_ppi_hotspots_frac_to_provide: ${datasets.global_transform_args.max_ppi_hotspots_frac_to_provide}
39
+ ppi_hotspot_max_distance: ${datasets.global_transform_args.ppi_hotspot_max_distance}
40
+
41
+ # 1D SS hypers
42
+ max_ss_frac_to_provide: ${datasets.global_transform_args.max_ss_frac_to_provide}
43
+ min_ss_island_len: ${datasets.global_transform_args.min_ss_island_len}
44
+ max_ss_island_len: ${datasets.global_transform_args.max_ss_island_len}
45
+
46
+ # Cropping
47
+ crop_size: ${datasets.crop_size}
48
+ max_atoms_in_crop: ${datasets.max_atoms_in_crop}
49
+ allowed_types: ALL
50
+ crop_spatial_probability: ???
51
+ crop_contiguous_probability: ???
52
+ dna_contact_crop_probability: 0.0
53
+ crop_center_cutoff_distance: 15.0
54
+ zero_occ_on_exposure_after_cropping: False
55
+ b_factor_min: null
56
+
57
+ # Other dataset-specific parameters
58
+ atom_1d_features: ${model.net.token_initializer.atom_1d_features}
59
+ token_1d_features: ${model.net.token_initializer.token_1d_features}
@@ -0,0 +1,20 @@
1
+ defaults:
2
+ - base_no_weights
3
+ - _self_
4
+
5
+ dataset:
6
+ dataset_parser:
7
+ _target_: atomworks.ml.datasets.parsers.GenericDFParser
8
+ pn_unit_iid_colnames: null
9
+
10
+ dataset:
11
+ name: tf_distillation
12
+ data: /projects/ml/prot_dna/transcriptionFactor_distillation_rf3.newDL.csv
13
+ columns_to_load:
14
+ - example_id
15
+ - path
16
+
17
+ transform:
18
+ crop_contiguous_probability: 0.4
19
+ crop_spatial_probability: 0.0
20
+ dna_contact_crop_probability: 0.6
@@ -0,0 +1,11 @@
1
+ # Base config for all PDB datasets
2
+ defaults:
3
+ - base
4
+ - _self_
5
+
6
+ dataset:
7
+ # All PDB datasets load from this cache:
8
+ cif_parser_args:
9
+ cache_dir: ${paths.data.cif_cache_dir}
10
+ load_from_cache: True
11
+ save_to_cache: False
@@ -0,0 +1,22 @@
1
+ # Inherit
2
+ defaults:
3
+ - af3_train_interface
4
+ - pdb_base
5
+ - _self_
6
+
7
+ dataset:
8
+ transform:
9
+ crop_contiguous_probability: 0.0
10
+ crop_spatial_probability: 1.0
11
+ filters:
12
+ # filters common across all PDB datasets
13
+ - 'pdb_id not in ["7rte", "7m5w", "7n5u"]'
14
+ - 'pdb_id not in ["3di3", "5o45", "1z92", "2gy5", "4zxb"]'
15
+ - "deposition_date < '2024-12-16'"
16
+ - "resolution < 9.0"
17
+ - "num_polymer_pn_units <= 300"
18
+ - "cluster.notnull()"
19
+ # interface specific filters
20
+ - "~(pn_unit_1_non_polymer_res_names.notnull() and pn_unit_1_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
21
+ - "~(pn_unit_2_non_polymer_res_names.notnull() and pn_unit_2_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
22
+ - "is_inter_molecule"
@@ -0,0 +1,23 @@
1
+ defaults:
2
+ - af3_train_pn_unit
3
+ - pdb_base
4
+ - _self_
5
+
6
+ dataset:
7
+ transform:
8
+ # pn_unit-specific Transform pipeline parameters
9
+ crop_contiguous_probability: 0.25
10
+ crop_spatial_probability: 0.75
11
+
12
+ # Modify: date & clustering parquet
13
+ dataset:
14
+ filters:
15
+ # filters common across all PDB datasets
16
+ - 'pdb_id not in ["7rte", "7m5w", "7n5u"]'
17
+ - 'pdb_id not in ["3di3", "5o45", "1z92", "2gy5", "4zxb"]'
18
+ - "deposition_date < '2024-12-16'"
19
+ - "resolution < 9.0"
20
+ - "num_polymer_pn_units <= 300"
21
+ - "cluster.notnull()"
22
+ # pn_unit specific filters
23
+ - "~(q_pn_unit_non_polymer_res_names.notnull() and q_pn_unit_non_polymer_res_names.str.contains('${resolve_import:atomworks.constants,AF3_EXCLUDED_LIGANDS_REGEX}', regex=True))"
@@ -0,0 +1,38 @@
1
+ defaults:
2
+ - pdb/base_transform_args@monomer_distillation
3
+ - _self_
4
+
5
+ monomer_distillation:
6
+ dataset:
7
+ _target_: atomworks.ml.datasets.StructuralDatasetWrapper
8
+ save_failed_examples_to_dir: ${paths.data.failed_examples_dir}
9
+ # Explicitly do not load from cache.
10
+ # Dataset too big, and structures are small
11
+ cif_parser_args:
12
+ cache_dir: null
13
+ load_from_cache: False
14
+ save_to_cache: False
15
+
16
+ # metadata dataset
17
+ dataset:
18
+ _target_: atomworks.ml.datasets.PandasDataset
19
+ name: af2fb_distillation
20
+ id_column: example_id
21
+ data: ${paths.data.monomer_distillation_parquet_dir}/af2_distillation_facebook.parquet
22
+ columns_to_load:
23
+ - example_id
24
+ - path
25
+
26
+ # metadata parser
27
+ dataset_parser:
28
+ _target_: atomworks.ml.datasets.parsers.GenericDFParser
29
+ pn_unit_iid_colnames: null
30
+
31
+ transform:
32
+ _target_: ${datasets.pipeline_target}
33
+ is_inference: False
34
+ # protein_msa_dirs: [{"dir": "${paths.data.monomer_distillation_data_dir}/msa", "extension": ".a3m", "directory_depth": 2}]
35
+ # rna_msa_dirs: []
36
+ crop_contiguous_probability: 0.25
37
+ crop_spatial_probability: 0.75
38
+ b_factor_min: 70
@@ -0,0 +1,9 @@
1
+
2
+ defaults:
3
+ - design_validation_base
4
+ - val_examples/bcov_ppi_easy_medium_with_ori@dataset.data
5
+ - _self_
6
+
7
+ dataset:
8
+ eval_every_n: 1
9
+ name: bcov-ppi-easy-medium
@@ -0,0 +1,40 @@
1
+ dataset:
2
+ _target_: rfd3.inference.datasets.ContigJsonDataset
3
+
4
+ # Required parameters for each inheriting dataset
5
+ data: ??? # Path to json file
6
+ name: ??? # Name for displaying and saving files
7
+ eval_every_n: ??? # Evaluate on this dataset every n epochs
8
+ subset_to_keys: null # Specific keys in json to keep, ignores all others.
9
+
10
+ # NB: Used for parsing input files (not for atom_array reloading anymore)
11
+ cif_parser_args:
12
+ cache_dir: null
13
+ load_from_cache: False
14
+ save_to_cache: False
15
+ add_missing_atoms: False
16
+
17
+ # Common Transform pipeline components for all PDB datasets
18
+ transform:
19
+ _target_: ${datasets.pipeline_target}
20
+ is_inference: True
21
+ return_atom_array: True
22
+ diffusion_batch_size: ${datasets.diffusion_batch_size_train}
23
+ sigma_data: ${model.net.diffusion_module.sigma_data}
24
+ central_atom: ${datasets.global_transform_args.central_atom}
25
+ n_atoms_per_token: ${datasets.global_transform_args.n_atoms_per_token}
26
+ association_scheme: ${datasets.global_transform_args.association_scheme}
27
+ center_option: ${datasets.global_transform_args.center_option}
28
+
29
+ # Conformers
30
+ generate_conformers: ${datasets.global_transform_args.generate_conformers}
31
+ generate_conformers_for_non_protein_only: ${datasets.global_transform_args.generate_conformers_for_non_protein_only}
32
+ provide_reference_conformer_when_unmasked: ${datasets.global_transform_args.provide_reference_conformer_when_unmasked}
33
+ ground_truth_conformer_policy: ${datasets.global_transform_args.ground_truth_conformer_policy}
34
+ provide_elements_for_unindexed_components: ${datasets.global_transform_args.provide_elements_for_unindexed_components}
35
+ use_element_for_atom_names_of_atomized_tokens: ${datasets.global_transform_args.use_element_for_atom_names_of_atomized_tokens}
36
+ residue_cache_dir: ${paths.data.residue_cache_dir}
37
+
38
+ # Other dataset-specific parameters
39
+ atom_1d_features: ${model.net.token_initializer.atom_1d_features}
40
+ token_1d_features: ${model.net.token_initializer.token_1d_features}
@@ -0,0 +1,9 @@
1
+
2
+ defaults:
3
+ - design_validation_base
4
+ - _self_
5
+
6
+ dataset:
7
+ data: ${paths.data.design_benchmark_data_dir}/dna_binder.json
8
+ name: dna_binder_design
9
+ eval_every_n: 1
@@ -0,0 +1,13 @@
1
+
2
+ defaults:
3
+ - design_validation_base
4
+ - _self_
5
+
6
+ dataset:
7
+ data: ${paths.root_dir}/tests/dna.json
8
+ name: dna_binder_design
9
+ eval_every_n: 10
10
+ subset_to_keys:
11
+ - 7rte_sequence_only
12
+ - 7rte_with_structure
13
+
@@ -0,0 +1,13 @@
1
+
2
+ defaults:
3
+ - design_validation_base
4
+ - _self_
5
+
6
+ dataset:
7
+ data: ${paths.root_dir}/rfd3/tests/test_data/dna.json
8
+ name: dna_binder_design
9
+ eval_every_n: 1
10
+ subset_to_keys:
11
+ - 7rte_sequence_only
12
+ - 7rte_with_structure
13
+
@@ -0,0 +1,9 @@
1
+
2
+ defaults:
3
+ - design_validation_base
4
+ - _self_
5
+
6
+ dataset:
7
+ data: ${paths.data.design_benchmark_data_dir}/indexed.json
8
+ name: indexed-design
9
+ eval_every_n: 8
@@ -0,0 +1,9 @@
1
+
2
+ defaults:
3
+ - design_validation_base
4
+ - _self_
5
+
6
+ dataset:
7
+ data: ${paths.data.design_benchmark_data_dir}/mcsa_41.json
8
+ name: woodys-benchmark
9
+ eval_every_n: 16
@@ -0,0 +1,10 @@
1
+
2
+ defaults:
3
+ - unindexed
4
+ - _self_
5
+
6
+ dataset:
7
+ name: rigid-ligand-enzymes
8
+ eval_every_n: 1
9
+ data: ${paths.data.design_benchmark_data_dir}/mcsa_41_short_rigid_new.json
10
+
@@ -0,0 +1,7 @@
1
+ defaults:
2
+ - unconditional
3
+ - _self_
4
+
5
+ dataset:
6
+ name: ppi_inference
7
+ data: ??? # This is a required override, specifying a path to the dataset json or yaml file.
@@ -0,0 +1,13 @@
1
+ defaults:
2
+ - design_validation_base
3
+ - _self_
4
+
5
+ dataset:
6
+ data: ${paths.data.design_benchmark_data_dir}/sm_binder_hbonds.json
7
+ eval_every_n: 5
8
+ name: sm_binder_hbonds-design
9
+ subset_to_keys:
10
+ - FAD
11
+ - IAI
12
+ - OQO
13
+ - SAM
@@ -0,0 +1,15 @@
1
+ defaults:
2
+ - sm_binder_hbonds
3
+ - _self_
4
+
5
+ dataset:
6
+ eval_every_n: 1
7
+ data: ${paths.data.design_benchmark_data_dir}/sm_binder_hbonds_sampled.json
8
+ name: sm_binder_hbonds-design-short
9
+ subset_to_keys:
10
+ - FAD_1
11
+ - FAD_2
12
+ - FAD_3
13
+ - IAI_1
14
+ - IAI_2
15
+ - IAI_3