dayhoff-tools 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools-1.0.0/PKG-INFO +122 -0
- dayhoff_tools-1.0.0/README.md +55 -0
- dayhoff_tools-1.0.0/dayhoff_tools/__init__.py +0 -0
- dayhoff_tools-1.0.0/dayhoff_tools/chemistry/standardizer.py +297 -0
- dayhoff_tools-1.0.0/dayhoff_tools/chemistry/utils.py +63 -0
- dayhoff_tools-1.0.0/dayhoff_tools/cli/__init__.py +0 -0
- dayhoff_tools-1.0.0/dayhoff_tools/cli/main.py +90 -0
- dayhoff_tools-1.0.0/dayhoff_tools/cli/swarm_commands.py +156 -0
- dayhoff_tools-1.0.0/dayhoff_tools/cli/utility_commands.py +244 -0
- dayhoff_tools-1.0.0/dayhoff_tools/deployment/base.py +434 -0
- dayhoff_tools-1.0.0/dayhoff_tools/deployment/deploy_aws.py +458 -0
- dayhoff_tools-1.0.0/dayhoff_tools/deployment/deploy_gcp.py +176 -0
- dayhoff_tools-1.0.0/dayhoff_tools/deployment/deploy_utils.py +781 -0
- dayhoff_tools-1.0.0/dayhoff_tools/deployment/job_runner.py +153 -0
- dayhoff_tools-1.0.0/dayhoff_tools/deployment/processors.py +125 -0
- dayhoff_tools-1.0.0/dayhoff_tools/deployment/swarm.py +591 -0
- dayhoff_tools-1.0.0/dayhoff_tools/embedders.py +893 -0
- dayhoff_tools-1.0.0/dayhoff_tools/fasta.py +1082 -0
- dayhoff_tools-1.0.0/dayhoff_tools/file_ops.py +261 -0
- dayhoff_tools-1.0.0/dayhoff_tools/gcp.py +85 -0
- dayhoff_tools-1.0.0/dayhoff_tools/h5.py +542 -0
- dayhoff_tools-1.0.0/dayhoff_tools/kegg.py +37 -0
- dayhoff_tools-1.0.0/dayhoff_tools/logs.py +27 -0
- dayhoff_tools-1.0.0/dayhoff_tools/mmseqs.py +164 -0
- dayhoff_tools-1.0.0/dayhoff_tools/sqlite.py +516 -0
- dayhoff_tools-1.0.0/dayhoff_tools/structure.py +751 -0
- dayhoff_tools-1.0.0/dayhoff_tools/uniprot.py +434 -0
- dayhoff_tools-1.0.0/dayhoff_tools/warehouse.py +418 -0
- dayhoff_tools-1.0.0/pyproject.toml +86 -0
@@ -0,0 +1,122 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: dayhoff-tools
|
3
|
+
Version: 1.0.0
|
4
|
+
Summary: Common tools for all the repos at Dayhoff Labs
|
5
|
+
Author: Daniel Martin-Alarcon
|
6
|
+
Author-email: dma@dayhofflabs.com
|
7
|
+
Requires-Python: >=3.10,<4.0
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
13
|
+
Provides-Extra: all
|
14
|
+
Provides-Extra: core
|
15
|
+
Provides-Extra: dev
|
16
|
+
Requires-Dist: biopython (>=1.84) ; extra == "all"
|
17
|
+
Requires-Dist: biopython (>=1.84) ; extra == "core"
|
18
|
+
Requires-Dist: black (>=25.1.0) ; extra == "all"
|
19
|
+
Requires-Dist: black (>=25.1.0) ; extra == "dev"
|
20
|
+
Requires-Dist: boto3 (>=1.36.8) ; extra == "all"
|
21
|
+
Requires-Dist: boto3 (>=1.36.8) ; extra == "core"
|
22
|
+
Requires-Dist: colorlog (>=6.8.2) ; extra == "all"
|
23
|
+
Requires-Dist: colorlog (>=6.8.2) ; extra == "dev"
|
24
|
+
Requires-Dist: docker (>=7.1.0) ; extra == "all"
|
25
|
+
Requires-Dist: docker (>=7.1.0) ; extra == "core"
|
26
|
+
Requires-Dist: dvc (>=3.48.2) ; extra == "all"
|
27
|
+
Requires-Dist: dvc (>=3.48.2) ; extra == "dev"
|
28
|
+
Requires-Dist: dvc-gs (>=3.0.1) ; extra == "all"
|
29
|
+
Requires-Dist: dvc-gs (>=3.0.1) ; extra == "dev"
|
30
|
+
Requires-Dist: fair-esm (>=2.0.0) ; extra == "all"
|
31
|
+
Requires-Dist: fair-esm (>=2.0.0) ; extra == "core"
|
32
|
+
Requires-Dist: firebase-admin (>=6.5.0)
|
33
|
+
Requires-Dist: flake8 (>=7.0.0) ; extra == "all"
|
34
|
+
Requires-Dist: flake8 (>=7.0.0) ; extra == "dev"
|
35
|
+
Requires-Dist: h5py (>=3.11.0) ; extra == "all"
|
36
|
+
Requires-Dist: h5py (>=3.11.0) ; extra == "core"
|
37
|
+
Requires-Dist: isort (>=5.13.2) ; extra == "all"
|
38
|
+
Requires-Dist: isort (>=5.13.2) ; extra == "dev"
|
39
|
+
Requires-Dist: numpy (<2.0.0) ; extra == "all"
|
40
|
+
Requires-Dist: numpy (<2.0.0) ; extra == "dev"
|
41
|
+
Requires-Dist: pandas (>=2.2.3) ; extra == "dev"
|
42
|
+
Requires-Dist: pylance (>=0.10.2) ; extra == "all"
|
43
|
+
Requires-Dist: pylance (>=0.10.2) ; extra == "dev"
|
44
|
+
Requires-Dist: pylint (>=3.1.0) ; extra == "all"
|
45
|
+
Requires-Dist: pylint (>=3.1.0) ; extra == "dev"
|
46
|
+
Requires-Dist: pytest (>=8.0.2) ; extra == "all"
|
47
|
+
Requires-Dist: pytest (>=8.0.2) ; extra == "dev"
|
48
|
+
Requires-Dist: pytest-cov (>=4.1.0) ; extra == "all"
|
49
|
+
Requires-Dist: pytest-cov (>=4.1.0) ; extra == "dev"
|
50
|
+
Requires-Dist: pytest-mock (>=3.12.0) ; extra == "all"
|
51
|
+
Requires-Dist: pytest-mock (>=3.12.0) ; extra == "dev"
|
52
|
+
Requires-Dist: pyyaml (>=6.0)
|
53
|
+
Requires-Dist: questionary (>=2.0.1) ; extra == "all"
|
54
|
+
Requires-Dist: questionary (>=2.0.1) ; extra == "core"
|
55
|
+
Requires-Dist: rdkit-pypi (>=2022.9.5) ; extra == "all"
|
56
|
+
Requires-Dist: rdkit-pypi (>=2022.9.5) ; extra == "dev"
|
57
|
+
Requires-Dist: requests (>=2.31.0)
|
58
|
+
Requires-Dist: torch (>=1.10.0) ; extra == "all"
|
59
|
+
Requires-Dist: torch (>=1.10.0) ; extra == "dev"
|
60
|
+
Requires-Dist: torchvision (>=0.11.0) ; extra == "all"
|
61
|
+
Requires-Dist: torchvision (>=0.11.0) ; extra == "dev"
|
62
|
+
Requires-Dist: transformers (>=4.20.0) ; extra == "all"
|
63
|
+
Requires-Dist: transformers (>=4.36.2) ; extra == "dev"
|
64
|
+
Requires-Dist: typer (>=0.9.0)
|
65
|
+
Description-Content-Type: text/markdown
|
66
|
+
|
67
|
+
# dayhoff-tools
|
68
|
+
|
69
|
+
A set of small, sharp tools for everyone at Dayhoff.
|
70
|
+
|
71
|
+
## Hosting and Auth
|
72
|
+
|
73
|
+
This repo uses Poetry to build and publish a package to GCP Artifact Registry, at `https://us-central1-python.pkg.dev/enzyme-discovery/pypirate/`. This depends on a Poetry plugin that's now in the standard chassis setup (`keyrings.google-artifactregistry-auth`), and also on the active service account having read access to Artifact Registry. That much is set up for the standard dev container service account, but may not be available to other intended users.
|
74
|
+
|
75
|
+
## CLI commands
|
76
|
+
|
77
|
+
Unlike all the repos that use dayhoff-tools, here you have to install the package explicitly before using the CLI:
|
78
|
+
|
79
|
+
```sh
|
80
|
+
poetry install
|
81
|
+
```
|
82
|
+
|
83
|
+
## Publish a new version
|
84
|
+
|
85
|
+
1. Update version number in `pyproject.toml`
|
86
|
+
2. Run `dh wheel`
|
87
|
+
3. In other repos, run `poetry update dayhoff-tools`
|
88
|
+
|
89
|
+
If you want to overwrite an existing wheel, you'll have to manually delete it from the `dist` folder and also the [Artifact Registry repo](https://console.cloud.google.com/artifacts/python/enzyme-discovery/us-central1/pypirate/dayhoff-tools).
|
90
|
+
|
91
|
+
## Install in other repos
|
92
|
+
|
93
|
+
Installing this library is tricky because we need GCP authentication and also a couple of plugins to install it with either Pip or Poetry. These have been incorporated into `chassis`, but it's worth noting here what the various parts are. All this info came from this [Medium post](https://medium.com/google-cloud/python-packages-via-gcps-artifact-registry-ce1714f8e7c1).
|
94
|
+
|
95
|
+
1. Get a Service Account with read access to Artifact Registry (such as `github-actions`, which I made for this purpose).
|
96
|
+
2. Export the SA key file, copy it to your repo, and make it available through this envvar: `export GOOGLE_APPLICATION_CREDENTIALS=github_actions_key.json`
|
97
|
+
|
98
|
+
### ... with Pip
|
99
|
+
|
100
|
+
1. `pip install keyring`
|
101
|
+
2. `pip install keyrings.google-artifactregistry-auth`
|
102
|
+
3. `pip install --upgrade dayhoff-tools --index-url https://us-central1-python.pkg.dev/enzyme-discovery/pypirate/simple/`
|
103
|
+
|
104
|
+
### ... with Poetry
|
105
|
+
|
106
|
+
1. Add this plugin: `poetry self add keyrings.google-artifactregistry-auth`
|
107
|
+
2. Add these sections to `pyproject.toml`. Note that dayhoff-tools is in a separate group `pypirate` that installs separately from the others.
|
108
|
+
|
109
|
+
```toml
|
110
|
+
[tool.poetry.group.pypirate.dependencies]
|
111
|
+
dayhoff-tools = {version = "*", source = "pypirate"}
|
112
|
+
|
113
|
+
[[tool.poetry.source]]
|
114
|
+
name = "pypirate"
|
115
|
+
url = "https://us-central1-python.pkg.dev/enzyme-discovery/pypirate/simple/"
|
116
|
+
priority = "supplemental"
|
117
|
+
```
|
118
|
+
|
119
|
+
3. When building a dev container, or in other circumstances when you can't easily authenticate as above, run `poetry install --without pypirate`.
|
120
|
+
4. Otherwise, just `poetry install`.
|
121
|
+
5. To ensure you have the latest version, run `poetry update dayhoff-tools`.
|
122
|
+
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# dayhoff-tools
|
2
|
+
|
3
|
+
A set of small, sharp tools for everyone at Dayhoff.
|
4
|
+
|
5
|
+
## Hosting and Auth
|
6
|
+
|
7
|
+
This repo uses Poetry to build and publish a package to GCP Artifact Registry, at `https://us-central1-python.pkg.dev/enzyme-discovery/pypirate/`. This depends on a Poetry plugin that's now in the standard chassis setup (`keyrings.google-artifactregistry-auth`), and also on the active service account having read access to Artifact Registry. That much is set up for the standard dev container service account, but may not be available to other intended users.
|
8
|
+
|
9
|
+
## CLI commands
|
10
|
+
|
11
|
+
Unlike all the repos that use dayhoff-tools, here you have to install the package explicitly before using the CLI:
|
12
|
+
|
13
|
+
```sh
|
14
|
+
poetry install
|
15
|
+
```
|
16
|
+
|
17
|
+
## Publish a new version
|
18
|
+
|
19
|
+
1. Update version number in `pyproject.toml`
|
20
|
+
2. Run `dh wheel`
|
21
|
+
3. In other repos, run `poetry update dayhoff-tools`
|
22
|
+
|
23
|
+
If you want to overwrite an existing wheel, you'll have to manually delete it from the `dist` folder and also the [Artifact Registry repo](https://console.cloud.google.com/artifacts/python/enzyme-discovery/us-central1/pypirate/dayhoff-tools).
|
24
|
+
|
25
|
+
## Install in other repos
|
26
|
+
|
27
|
+
Installing this library is tricky because we need GCP authentication and also a couple of plugins to install it with either Pip or Poetry. These have been incorporated into `chassis`, but it's worth noting here what the various parts are. All this info came from this [Medium post](https://medium.com/google-cloud/python-packages-via-gcps-artifact-registry-ce1714f8e7c1).
|
28
|
+
|
29
|
+
1. Get a Service Account with read access to Artifact Registry (such as `github-actions`, which I made for this purpose).
|
30
|
+
2. Export the SA key file, copy it to your repo, and make it available through this envvar: `export GOOGLE_APPLICATION_CREDENTIALS=github_actions_key.json`
|
31
|
+
|
32
|
+
### ... with Pip
|
33
|
+
|
34
|
+
1. `pip install keyring`
|
35
|
+
2. `pip install keyrings.google-artifactregistry-auth`
|
36
|
+
3. `pip install --upgrade dayhoff-tools --index-url https://us-central1-python.pkg.dev/enzyme-discovery/pypirate/simple/`
|
37
|
+
|
38
|
+
### ... with Poetry
|
39
|
+
|
40
|
+
1. Add this plugin: `poetry self add keyrings.google-artifactregistry-auth`
|
41
|
+
2. Add these sections to `pyproject.toml`. Note that dayhoff-tools is in a separate group `pypirate` that installs separately from the others.
|
42
|
+
|
43
|
+
```toml
|
44
|
+
[tool.poetry.group.pypirate.dependencies]
|
45
|
+
dayhoff-tools = {version = "*", source = "pypirate"}
|
46
|
+
|
47
|
+
[[tool.poetry.source]]
|
48
|
+
name = "pypirate"
|
49
|
+
url = "https://us-central1-python.pkg.dev/enzyme-discovery/pypirate/simple/"
|
50
|
+
priority = "supplemental"
|
51
|
+
```
|
52
|
+
|
53
|
+
3. When building a dev container, or in other circumstances when you can't easily authenticate as above, run `poetry install --without pypirate`.
|
54
|
+
4. Otherwise, just `poetry install`.
|
55
|
+
5. To ensure you have the latest version, run `poetry update dayhoff-tools`.
|
File without changes
|
@@ -0,0 +1,297 @@
|
|
1
|
+
"""Normalization classes for molecules and reactions."""
|
2
|
+
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
|
5
|
+
from rdkit import Chem, rdBase
|
6
|
+
from rdkit.Chem import AllChem
|
7
|
+
from rdkit.Chem.MolStandardize import rdMolStandardize # type: ignore
|
8
|
+
|
9
|
+
rdBase.DisableLog("rdApp.error")
|
10
|
+
rdBase.DisableLog("rdApp.warning")
|
11
|
+
|
12
|
+
|
13
|
+
def is_smiles_aromatic(smiles: str) -> bool:
    """Report whether any atom in a SMILES string is flagged aromatic.

    The string is parsed with sanitization switched off, so the answer
    reflects the atoms as parsed from the input rather than the result of
    a full RDKit sanitization pass.

    Args:
        smiles (str): Input SMILES string

    Returns:
        bool: True if at least one aromatic atom is found

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(smiles, sanitize=False)  # type: ignore
    if parsed is None:
        raise ValueError("invalid SMILES string")

    # Stop at the first aromatic atom; one hit is enough.
    for atom in parsed.GetAtoms():
        if atom.GetIsAromatic():
            return True
    return False
|
27
|
+
|
28
|
+
|
29
|
+
class BaseStandardizer(ABC):
    """Abstract base class for normalizing molecules and reactions.

    Subclasses implement :meth:`standardize_molecule`; the default
    :meth:`standardize_reaction` applies it to every reactant and product
    of a reaction SMILES.
    """

    @abstractmethod
    def standardize_molecule(self, smiles: str) -> str:
        """Standardize molecules as SMILES strings.

        Args:
            smiles (str): Input SMILES string

        Returns:
            str: Output SMILES string
        """

    def _standardized_templates(self, rdmol):
        """Standardize one reaction component and return its template molecules.

        The component is written out as non-canonical SMILES, passed through
        :meth:`standardize_molecule`, and re-parsed without sanitization.
        If the standardized molecule consists of exactly one fragment it is
        returned as-is; otherwise each fragment is returned separately so it
        becomes its own reaction template.
        """
        raw_smiles = Chem.MolToSmiles(rdmol, canonical=False)  # type: ignore
        standardized_smiles = self.standardize_molecule(raw_smiles)
        standardized = Chem.MolFromSmiles(standardized_smiles, sanitize=False)  # type: ignore
        fragments = Chem.GetMolFrags(standardized, asMols=True, sanitizeFrags=False)  # type: ignore
        if len(fragments) == 1:
            return (standardized,)
        return fragments

    def standardize_reaction(self, smiles: str) -> str:
        """Standardize reactions as SMILES/SMARTS strings.

        Args:
            smiles (str): Input SMILES/SMARTS string

        Returns:
            str: Output SMILES/SMARTS string
        """
        rdrxn = AllChem.ReactionFromSmarts(smiles, useSmiles=True)  # type: ignore
        # Build a fresh reaction from the standardized components.
        rdrxn1 = AllChem.ChemicalReaction()  # type: ignore
        for rdmol in rdrxn.GetReactants():
            for template in self._standardized_templates(rdmol):
                rdrxn1.AddReactantTemplate(template)
        for rdmol in rdrxn.GetProducts():
            for template in self._standardized_templates(rdmol):
                rdrxn1.AddProductTemplate(template)
        return AllChem.ReactionToSmiles(rdrxn1)  # type: ignore
|
77
|
+
|
78
|
+
|
79
|
+
class HypervalentStandardizer(BaseStandardizer):
    """Standardizer for converting double to single bonds in hypervalent
    compounds."""

    def standardize_molecule(self, smiles: str) -> str:
        """Run RDKit's cleanup sanitization step on a molecule.

        Args:
            smiles (str): Input SMILES string

        Returns:
            str: Output SMILES string

        Raises:
            ValueError: If the SMILES cannot be parsed, or the sanitization
                call reports a failed operation.
        """
        mol = Chem.MolFromSmiles(smiles, sanitize=False)  # type: ignore
        if mol is None:
            raise ValueError(f"Invalid SMILES input '{smiles}'")

        # Only the cleanup operation is requested; the molecule is modified
        # in place and the return value identifies a failed operation, if any.
        failed_op = Chem.SanitizeMol(mol, sanitizeOps=Chem.SANITIZE_CLEANUP)  # type: ignore
        if failed_op > 0:
            raise ValueError(f"Sanitization failed for SMILES input '{smiles}'")

        return Chem.MolToSmiles(mol)  # type: ignore
|
99
|
+
|
100
|
+
|
101
|
+
class RemoveHsStandardizer(BaseStandardizer):
    """Standardizer that strips explicit hydrogens from molecules."""

    def standardize_molecule(self, smiles: str) -> str:
        """Return the canonical SMILES of a molecule with explicit Hs removed.

        Args:
            smiles (str): Input SMILES string

        Returns:
            str: Output SMILES string

        Raises:
            ValueError: If the SMILES cannot be parsed, or the sanitization
                call reports a failed operation.
        """
        parsed = Chem.MolFromSmiles(smiles, sanitize=False)  # type: ignore
        if parsed is None:
            raise ValueError(f"Invalid SMILES input '{smiles}'")

        without_hs = Chem.RemoveHs(parsed, sanitize=False)  # type: ignore

        # Only the radical-detection sanitization step is run on the result.
        failed_op = Chem.SanitizeMol(without_hs, sanitizeOps=Chem.SANITIZE_FINDRADICALS)  # type: ignore
        if failed_op > 0:
            raise ValueError(f"Sanitization failed for SMILES input '{smiles}'")

        return Chem.MolToSmiles(without_hs, canonical=True)  # type: ignore
|
121
|
+
|
122
|
+
|
123
|
+
class KekulizeStandardizer(BaseStandardizer):
    """Standardizer that rewrites aromatic compounds in Kekule form."""

    def standardize_molecule(self, smiles: str) -> str:
        """Return the canonical SMILES of the kekulized molecule.

        Args:
            smiles (str): Input SMILES string

        Returns:
            str: Output SMILES string

        Raises:
            ValueError: If the SMILES cannot be parsed.
        """
        mol = Chem.MolFromSmiles(smiles, sanitize=False)  # type: ignore
        if mol is None:
            raise ValueError(f"Invalid SMILES input '{smiles}'")

        # The molecule was parsed unsanitized, so refresh its property cache
        # (non-strict) before kekulizing in place.
        mol.UpdatePropertyCache(strict=False)
        Chem.Kekulize(mol, clearAromaticFlags=True)  # type: ignore

        return Chem.MolToSmiles(mol, canonical=True)  # type: ignore
|
141
|
+
|
142
|
+
|
143
|
+
class UnchargeStandardizer(BaseStandardizer):
    """Standardizer for removing charges from molecules by protonation/deprotonation."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One Uncharger instance is reused for every molecule.
        self._uncharger = rdMolStandardize.Uncharger()

    def standardize_molecule(self, smiles: str) -> str:
        """Neutralize a molecule with RDKit's ``Uncharger``.

        Args:
            smiles (str): Input SMILES string

        Returns:
            str: Output SMILES string

        Raises:
            ValueError: If the SMILES cannot be parsed.
        """
        parsed = Chem.MolFromSmiles(smiles, sanitize=False)  # type: ignore
        if parsed is None:
            raise ValueError(f"Invalid SMILES input '{smiles}'")
        neutral = self._uncharger.uncharge(parsed)
        return Chem.MolToSmiles(neutral)  # type: ignore

    def _uncharged_components(self, components):
        """Yield the uncharged template for each component, skipping free protons."""
        for component in components:
            component_smiles = Chem.MolToSmiles(component, canonical=False)  # type: ignore
            if component_smiles == "[H+]":
                # Explicit protons are dropped here and re-added when rebalancing.
                continue
            uncharged_smiles = self.standardize_molecule(component_smiles)
            yield Chem.MolFromSmiles(uncharged_smiles, sanitize=False)  # type: ignore

    def standardize_reaction(self, smiles: str) -> str:
        """Uncharge every component of a reaction and rebalance it with protons.

        All explicit ``[H+]`` components are removed, the remaining
        reactants and products are uncharged individually, and ``[H+]``
        templates are then added to whichever side has the lower total
        formal charge until both sides match.

        Args:
            smiles (str): Input SMILES/SMARTS string

        Returns:
            str: Output SMILES/SMARTS string
        """
        source_rxn = AllChem.ReactionFromSmarts(smiles, useSmiles=True)  # type: ignore
        result_rxn = AllChem.ChemicalReaction()  # type: ignore

        # Remove all explicit protons from the reaction
        reactant_charge = 0
        for template in self._uncharged_components(source_rxn.GetReactants()):
            reactant_charge += Chem.GetFormalCharge(template)  # type: ignore
            result_rxn.AddReactantTemplate(template)

        product_charge = 0
        for template in self._uncharged_components(source_rxn.GetProducts()):
            product_charge += Chem.GetFormalCharge(template)  # type: ignore
            result_rxn.AddProductTemplate(template)

        # Rebalance reaction with protons
        imbalance = reactant_charge - product_charge
        if imbalance != 0:
            proton = Chem.MolFromSmiles("[H+]", sanitize=False)  # type: ignore
            # A positive imbalance means the products are short of charge.
            add_proton = (
                result_rxn.AddProductTemplate
                if imbalance > 0
                else result_rxn.AddReactantTemplate
            )
            for _ in range(abs(imbalance)):
                add_proton(proton)

        return AllChem.ReactionToSmiles(result_rxn)  # type: ignore
|
205
|
+
|
206
|
+
|
207
|
+
class MetalStandardizer(BaseStandardizer):
    """Standardizer for disconnecting bonds between metals and N, O, F atoms."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One MetalDisconnector instance is reused for every molecule.
        self._disconnector = rdMolStandardize.MetalDisconnector()

    def standardize_molecule(self, smiles: str) -> str:
        """Partially sanitize a molecule and disconnect its metal bonds.

        Args:
            smiles (str): Input SMILES string

        Returns:
            str: Output SMILES string

        Raises:
            ValueError: If the SMILES cannot be parsed.
        """
        mol = Chem.MolFromSmiles(smiles, sanitize=False)  # type: ignore
        if mol is None:
            raise ValueError(f"Invalid SMILES input '{smiles}'")

        # Run every sanitization step except the property check; aromaticity
        # perception is also switched off when the input has no aromatic atoms.
        sanitize_ops = Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES  # type: ignore
        input_is_aromatic = is_smiles_aromatic(smiles)
        if not input_is_aromatic:
            sanitize_ops ^= Chem.SANITIZE_SETAROMATICITY  # type: ignore
        Chem.SanitizeMol(mol, sanitizeOps=sanitize_ops)  # type: ignore

        disconnected = self._disconnector.Disconnect(mol)
        return Chem.MolToSmiles(disconnected)  # type: ignore
|
233
|
+
|
234
|
+
|
235
|
+
class Standardizer(BaseStandardizer):
    """Aggregate standardizer for molecules and reactions.

    Chains the individual standardizers enabled at construction time and
    applies them one after another, each consuming the previous one's output.
    """

    def __init__(
        self,
        *,
        standardize_hypervalent: bool = True,
        standardize_remove_hs: bool = True,
        standardize_kekulize: bool = False,
        standardize_uncharge: bool = False,
        standardize_metals: bool = True,
    ):
        """Initialize the standardizer.

        Args:
            standardize_hypervalent (bool): Convert double to single bonds in
                hypervalent compounds
            standardize_remove_hs (bool): Remove explicit hydrogen atoms
            standardize_kekulize (bool): Kekulize aromatic compounds
            standardize_uncharge (bool): Remove charges from molecules by
                protonation/deprotonation
            standardize_metals (bool): Disconnect bonds between metals and
                N, O, F atoms
        """
        super().__init__()
        # The tuple order is the order in which the stages are applied.
        stages = (
            (standardize_hypervalent, HypervalentStandardizer),
            (standardize_remove_hs, RemoveHsStandardizer),
            (standardize_kekulize, KekulizeStandardizer),
            (standardize_uncharge, UnchargeStandardizer),
            (standardize_metals, MetalStandardizer),
        )
        self._standardizers: list[BaseStandardizer] = [
            stage_cls() for enabled, stage_cls in stages if enabled
        ]

    def standardize_molecule(self, smiles: str) -> str:
        """Standardize molecules as SMILES strings.

        Args:
            smiles (str): Input SMILES string

        Returns:
            str: Output SMILES string (the input unchanged if no stage is
                enabled)
        """
        smiles1 = smiles
        for standardizer in self._standardizers:
            smiles1 = standardizer.standardize_molecule(smiles1)
        return smiles1

    def standardize_reaction(self, smiles: str) -> str:
        """Standardize reactions as SMILES/SMARTS strings.

        Each enabled stage's own ``standardize_reaction`` is used, so stages
        that override the reaction logic keep their specialised behaviour.

        Args:
            smiles (str): Input SMILES/SMARTS string

        Returns:
            str: Output SMILES/SMARTS string (the input unchanged if no stage
                is enabled)
        """
        smiles1 = smiles
        for standardizer in self._standardizers:
            smiles1 = standardizer.standardize_reaction(smiles1)
        return smiles1
|
@@ -0,0 +1,63 @@
|
|
1
|
+
"""Chemistry utils for refinery."""
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from rdkit import Chem, rdBase
|
6
|
+
|
7
|
+
__all__ = ["generate_inchikey"]
|
8
|
+
|
9
|
+
rdBase.DisableLog("rdApp.warning")
|
10
|
+
rdBase.DisableLog("rdApp.error")
|
11
|
+
|
12
|
+
|
13
|
+
def generate_inchikey(
    s: str, rgroup_smiles: Optional[str] = None, ignore_direction: bool = False
) -> str:
    """Generate an InChI key from a SMILES or reaction SMILES string.

    All failures are reported to the caller as ``ValueError``.

    Args:
        s (str): SMILES or reaction SMILES
        rgroup_smiles (Optional[str]): Replacement SMILES string for R groups (*).
            If None, R groups raise a ValueError.
        ignore_direction (bool, optional): Ignore direction in reaction SMILES
            by always placing the lexicographically smaller side's key first.
            Has no effect on SMILES. Defaults to False.

    Returns:
        str: InChI key of the molecule. For a reaction, the keys of the two
            sides joined as ``reactants>>products`` (possibly swapped when
            ``ignore_direction`` is set).

    Raises:
        ValueError: If the string (or one side of a reaction) is empty,
            contains R groups without ``rgroup_smiles``, cannot be parsed,
            or yields no InChI key.
    """
    if ">>" in s:
        # Reaction: key each side independently, then join them.
        reactants, products = s.split(">>", maxsplit=1)
        reactants_inchikey = generate_inchikey(reactants, rgroup_smiles=rgroup_smiles)
        products_inchikey = generate_inchikey(products, rgroup_smiles=rgroup_smiles)
        if ignore_direction and reactants_inchikey > products_inchikey:
            return products_inchikey + ">>" + reactants_inchikey
        return reactants_inchikey + ">>" + products_inchikey

    if "*" in s:
        if rgroup_smiles is None:
            raise ValueError(
                f"Found R (*) groups in SMILES string {s}. Set rgroup_smiles to replace."
            )
        # Substitute the R groups, then drop any empty branches "()" left behind.
        replaced_smiles = s.replace("*", rgroup_smiles).replace("()", "")
        return generate_inchikey(replaced_smiles)

    if s == "":
        raise ValueError("Empty SMILES string")

    try:
        rdmol = Chem.MolFromSmiles(s, sanitize=True)  # type: ignore
    except Exception as err:
        # Keep the uniform ValueError contract but preserve the real cause.
        raise ValueError(f"Invalid SMILES string {s}") from err
    if rdmol is None:
        raise ValueError(f"Invalid SMILES string {s}")

    inchikey = Chem.MolToInchiKey(rdmol)
    if inchikey == "":
        raise ValueError("Could not generate INChI key")
    return inchikey
|
File without changes
|
@@ -0,0 +1,90 @@
|
|
1
|
+
"""Entry file for the CLI, which aggregates and aliases all commands."""
|
2
|
+
|
3
|
+
import typer
|
4
|
+
from dayhoff_tools.cli.utility_commands import (
|
5
|
+
add_to_warehouse_typer,
|
6
|
+
build_and_upload_wheel,
|
7
|
+
delete_local_branch,
|
8
|
+
get_ancestry,
|
9
|
+
import_from_warehouse_typer,
|
10
|
+
rebuild_devcontainer_file,
|
11
|
+
test_github_actions_locally,
|
12
|
+
)
|
13
|
+
|
14
|
+
app = typer.Typer()
|
15
|
+
|
16
|
+
# Utility commands
|
17
|
+
app.command("clean")(delete_local_branch)
|
18
|
+
app.command("gha")(test_github_actions_locally)
|
19
|
+
app.command("rebuild")(rebuild_devcontainer_file)
|
20
|
+
app.command("wadd")(add_to_warehouse_typer)
|
21
|
+
app.command("wancestry")(get_ancestry)
|
22
|
+
app.command("wheel")(build_and_upload_wheel)
|
23
|
+
app.command("wimport")(import_from_warehouse_typer)
|
24
|
+
|
25
|
+
|
26
|
+
# Use lazy loading for slow-loading swarm commands
|
27
|
+
@app.command("reset")
def reset_wrapper(
    firestore_collection: str = typer.Option(prompt=True),
    old_status: str = typer.Option(default="failed", prompt=True),
    new_status: str = typer.Option(default="available", prompt=True),
    delete_old: bool = typer.Option(default=True, prompt=True),
):
    """Find all the documents in the database with a given status, and
    make a new document with the same name and a new status."""
    # Imported inside the command so the swarm module is only loaded when
    # this command actually runs.
    from dayhoff_tools.cli import swarm_commands

    swarm_commands.reset_failed_cards(
        firestore_collection, old_status, new_status, delete_old
    )
|
39
|
+
|
40
|
+
|
41
|
+
@app.command("zombie")
def zombie_wrapper(
    firestore_collection: str = typer.Option(prompt=True),
    delete_old: bool = typer.Option(default=True, prompt=True),
    minutes_threshold: int = typer.Option(default=60, prompt=True),
):
    """Find all the documents in the database with status "assigned", and "last_updated"
    older than a specified threshold, and make a new "available" document for them."""
    # Imported inside the command so the swarm module is only loaded when
    # this command actually runs.
    from dayhoff_tools.cli import swarm_commands

    swarm_commands.reset_zombie_cards(
        firestore_collection, delete_old, minutes_threshold
    )
|
52
|
+
|
53
|
+
|
54
|
+
@app.command("status")
def status_wrapper(
    firestore_collection: str = typer.Argument(),
):
    """Count the various statuses of items in a given collection."""
    # Imported inside the command so the swarm module is only loaded when
    # this command actually runs.
    from dayhoff_tools.cli import swarm_commands

    swarm_commands.get_firestore_collection_status(firestore_collection)
|
62
|
+
|
63
|
+
|
64
|
+
# Deployment commands - use lazy loading but preserve argument passing
|
65
|
+
@app.command("deploy")
def deploy_command(
    mode: str = typer.Argument(help="Deployment mode. Options: local, shell, batch"),
    config_path: str = typer.Argument(help="Path to the YAML configuration file"),
):
    """Unified deployment command."""
    # Imported inside the command so the deployment module is only loaded
    # when this command actually runs; both arguments are passed through.
    from dayhoff_tools.deployment import base as deployment_base

    deployment_base.deploy(mode, config_path)
|
74
|
+
|
75
|
+
|
76
|
+
@app.command("job")
def run_job_command(
    mode: str = typer.Argument(
        default="setup_and_execute",
        help="Mode to run in: setup (setup only), execute (execute only), or setup_and_execute (both)",
    )
):
    """Run a job."""
    # Imported inside the command so the job runner module is only loaded
    # when this command actually runs.
    from dayhoff_tools.deployment import job_runner

    job_runner.run_job(mode)
|
87
|
+
|
88
|
+
|
89
|
+
if __name__ == "__main__":
|
90
|
+
app()
|