synkit 0.0.16__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. {synkit-0.0.16 → synkit-1.0.1}/PKG-INFO +3 -3
  2. {synkit-0.0.16 → synkit-1.0.1}/README.md +2 -2
  3. {synkit-0.0.16 → synkit-1.0.1}/pyproject.toml +12 -24
  4. synkit-1.0.1/synkit/Chem/Cluster/__init__.py +0 -0
  5. synkit-1.0.1/synkit/Chem/Cluster/butina.py +139 -0
  6. synkit-1.0.1/synkit/Chem/Fingerprint/__init__.py +0 -0
  7. synkit-1.0.1/synkit/Chem/Fingerprint/fp_calculator.py +155 -0
  8. synkit-1.0.1/synkit/Chem/Fingerprint/smiles_featurizer.py +258 -0
  9. synkit-1.0.1/synkit/Chem/Fingerprint/transformation_fp.py +135 -0
  10. synkit-1.0.1/synkit/Chem/Molecule/__init__.py +0 -0
  11. synkit-1.0.1/synkit/Chem/Molecule/standardize.py +167 -0
  12. synkit-1.0.1/synkit/Chem/Reaction/__init__.py +9 -0
  13. synkit-1.0.1/synkit/Chem/Reaction/aam_validator.py +264 -0
  14. synkit-1.0.1/synkit/Chem/Reaction/balance_check.py +138 -0
  15. synkit-1.0.1/synkit/Chem/Reaction/canon_rsmi.py +251 -0
  16. synkit-1.0.1/synkit/Chem/Reaction/cleaning.py +66 -0
  17. synkit-1.0.1/synkit/Chem/Reaction/deionize.py +199 -0
  18. synkit-1.0.1/synkit/Chem/Reaction/fix_aam.py +64 -0
  19. synkit-1.0.1/synkit/Chem/Reaction/neutralize.py +195 -0
  20. synkit-1.0.1/synkit/Chem/Reaction/radical_wildcard.py +223 -0
  21. synkit-1.0.1/synkit/Chem/Reaction/standardize.py +157 -0
  22. synkit-1.0.1/synkit/Chem/Reaction/tautomerize.py +162 -0
  23. synkit-1.0.1/synkit/Chem/__init__.py +1 -0
  24. synkit-1.0.1/synkit/Chem/utils.py +315 -0
  25. synkit-1.0.1/synkit/Data/__init__.py +0 -0
  26. synkit-1.0.1/synkit/Data/gen_partial_aam.py +147 -0
  27. synkit-1.0.1/synkit/Graph/Canon/__init__.py +3 -0
  28. synkit-1.0.1/synkit/Graph/Canon/canon_algs.py +254 -0
  29. synkit-1.0.1/synkit/Graph/Canon/canon_graph.py +530 -0
  30. synkit-1.0.1/synkit/Graph/Canon/nauty.py +320 -0
  31. synkit-1.0.1/synkit/Graph/Context/__init__.py +0 -0
  32. synkit-1.0.1/synkit/Graph/Context/hier_context.py +231 -0
  33. synkit-1.0.1/synkit/Graph/Context/radius_expand.py +242 -0
  34. synkit-1.0.1/synkit/Graph/Feature/Descriptors/topology.py +854 -0
  35. synkit-1.0.1/synkit/Graph/Feature/Fingerprint/__init__.py +0 -0
  36. synkit-1.0.1/synkit/Graph/Feature/Fingerprint/wl_rxn_fps.py +231 -0
  37. synkit-1.0.1/synkit/Graph/Feature/__init__.py +5 -0
  38. synkit-1.0.1/synkit/Graph/Feature/graph_descriptors.py +315 -0
  39. synkit-1.0.1/synkit/Graph/Feature/graph_fps.py +95 -0
  40. synkit-1.0.1/synkit/Graph/Feature/graph_signature.py +237 -0
  41. synkit-1.0.1/synkit/Graph/Feature/hash_fps.py +128 -0
  42. synkit-1.0.1/synkit/Graph/Feature/morgan_fps.py +85 -0
  43. synkit-1.0.1/synkit/Graph/Feature/path_fps.py +79 -0
  44. synkit-1.0.1/synkit/Graph/Feature/wl_hash.py +136 -0
  45. synkit-1.0.1/synkit/Graph/Hyrogen/__init__.py +0 -0
  46. synkit-1.0.1/synkit/Graph/Hyrogen/_misc.py +442 -0
  47. synkit-1.0.1/synkit/Graph/Hyrogen/hcomplete.py +354 -0
  48. synkit-1.0.1/synkit/Graph/Hyrogen/hextend.py +167 -0
  49. synkit-1.0.1/synkit/Graph/ITS/__init__.py +4 -0
  50. synkit-1.0.1/synkit/Graph/ITS/its_builder.py +114 -0
  51. synkit-1.0.1/synkit/Graph/ITS/its_construction.py +316 -0
  52. synkit-1.0.1/synkit/Graph/ITS/its_decompose.py +575 -0
  53. synkit-1.0.1/synkit/Graph/ITS/its_destruction.py +302 -0
  54. synkit-1.0.1/synkit/Graph/ITS/its_expand.py +86 -0
  55. synkit-1.0.1/synkit/Graph/ITS/its_relabel.py +186 -0
  56. synkit-1.0.1/synkit/Graph/ITS/normalize_aam.py +142 -0
  57. synkit-1.0.1/synkit/Graph/ITS/partial_its.py +238 -0
  58. synkit-1.0.1/synkit/Graph/MTG/__init__.py +0 -0
  59. synkit-1.0.1/synkit/Graph/MTG/group_comp.py +157 -0
  60. synkit-1.0.1/synkit/Graph/MTG/groupoid.py +358 -0
  61. synkit-1.0.1/synkit/Graph/MTG/mcs_matcher.py +248 -0
  62. synkit-1.0.1/synkit/Graph/MTG/mtg.py +886 -0
  63. synkit-1.0.1/synkit/Graph/MTG/mtg_explore.py +74 -0
  64. synkit-1.0.1/synkit/Graph/MTG/utils.py +425 -0
  65. synkit-1.0.1/synkit/Graph/Matcher/__init__.py +10 -0
  66. synkit-1.0.1/synkit/Graph/Matcher/automorphism.py +151 -0
  67. synkit-1.0.1/synkit/Graph/Matcher/batch_cluster.py +242 -0
  68. synkit-1.0.1/synkit/Graph/Matcher/graph_cluster.py +197 -0
  69. synkit-1.0.1/synkit/Graph/Matcher/graph_matcher.py +320 -0
  70. synkit-1.0.1/synkit/Graph/Matcher/graph_morphism.py +377 -0
  71. synkit-1.0.1/synkit/Graph/Matcher/mcs_matcher.py +202 -0
  72. synkit-1.0.1/synkit/Graph/Matcher/multi_turbo_iso.py +182 -0
  73. synkit-1.0.1/synkit/Graph/Matcher/partial_matcher.py +214 -0
  74. synkit-1.0.1/synkit/Graph/Matcher/sing.py +216 -0
  75. synkit-1.0.1/synkit/Graph/Matcher/subgraph_matcher.py +1162 -0
  76. synkit-1.0.1/synkit/Graph/Matcher/turbo_iso.py +209 -0
  77. synkit-1.0.1/synkit/Graph/Wildcard/__init__.py +0 -0
  78. synkit-1.0.1/synkit/Graph/Wildcard/fuse_graph.py +156 -0
  79. synkit-1.0.1/synkit/Graph/Wildcard/radwc.py +117 -0
  80. synkit-1.0.1/synkit/Graph/Wildcard/wildcard.py +230 -0
  81. synkit-1.0.1/synkit/Graph/__init__.py +17 -0
  82. synkit-1.0.1/synkit/Graph/canon_graph.py +530 -0
  83. synkit-1.0.1/synkit/Graph/syn_graph.py +155 -0
  84. synkit-1.0.1/synkit/Graph/utils.py +180 -0
  85. synkit-1.0.1/synkit/IO/__init__.py +3 -0
  86. synkit-1.0.1/synkit/IO/chem_converter.py +494 -0
  87. synkit-1.0.1/synkit/IO/combinatorial/__init__.py +8 -0
  88. synkit-1.0.1/synkit/IO/combinatorial/gml_to_graph.py +254 -0
  89. synkit-1.0.1/synkit/IO/combinatorial/graph_to_gml.py +291 -0
  90. synkit-1.0.1/synkit/IO/combinatorial/graph_to_smarts.py +189 -0
  91. synkit-1.0.1/synkit/IO/combinatorial/smarts_expander.py +152 -0
  92. synkit-1.0.1/synkit/IO/combinatorial/smarts_generalizer.py +134 -0
  93. synkit-1.0.1/synkit/IO/combinatorial/smarts_to_graph.py +183 -0
  94. synkit-1.0.1/synkit/IO/data_io.py +314 -0
  95. synkit-1.0.1/synkit/IO/data_process.py +48 -0
  96. synkit-1.0.1/synkit/IO/debug.py +73 -0
  97. synkit-1.0.1/synkit/IO/dg_to_gml.py +133 -0
  98. synkit-1.0.1/synkit/IO/gml_to_nx.py +151 -0
  99. synkit-1.0.1/synkit/IO/graph_to_mol.py +132 -0
  100. synkit-1.0.1/synkit/IO/mol_to_graph.py +354 -0
  101. synkit-1.0.1/synkit/IO/nx_to_gml.py +209 -0
  102. synkit-1.0.1/synkit/IO/smiles_to_id.py +118 -0
  103. synkit-1.0.1/synkit/Rule/Apply/__init__.py +0 -0
  104. synkit-1.0.1/synkit/Rule/Apply/reactor_rule.py +91 -0
  105. synkit-1.0.1/synkit/Rule/Apply/retro_reactor.py +213 -0
  106. synkit-1.0.1/synkit/Rule/Apply/rule_matcher.py +195 -0
  107. synkit-1.0.1/synkit/Rule/Apply/rule_rbl.py +86 -0
  108. synkit-1.0.1/synkit/Rule/Compose/__init__.py +0 -0
  109. synkit-1.0.1/synkit/Rule/Compose/compose_rule.py +226 -0
  110. synkit-1.0.1/synkit/Rule/Compose/rule_compose.py +236 -0
  111. synkit-1.0.1/synkit/Rule/Compose/rule_mapping.py +315 -0
  112. synkit-1.0.1/synkit/Rule/Compose/seq_comp.py +71 -0
  113. synkit-1.0.1/synkit/Rule/Compose/valence_constrain.py +107 -0
  114. synkit-1.0.1/synkit/Rule/Modify/__init__.py +0 -0
  115. synkit-1.0.1/synkit/Rule/Modify/implict_rule.py +65 -0
  116. synkit-1.0.1/synkit/Rule/Modify/longest_path.py +92 -0
  117. synkit-1.0.1/synkit/Rule/Modify/molecule_rule.py +112 -0
  118. synkit-1.0.1/synkit/Rule/Modify/prune_templates.py +75 -0
  119. synkit-1.0.1/synkit/Rule/Modify/rule_utils.py +193 -0
  120. synkit-1.0.1/synkit/Rule/Modify/strip_rule.py +97 -0
  121. synkit-1.0.1/synkit/Rule/__init__.py +1 -0
  122. synkit-1.0.1/synkit/Rule/syn_rule.py +282 -0
  123. synkit-1.0.1/synkit/Synthesis/CRN/__init__.py +0 -0
  124. synkit-1.0.1/synkit/Synthesis/CRN/crn.py +207 -0
  125. synkit-1.0.1/synkit/Synthesis/CRN/dcrn.py +137 -0
  126. synkit-1.0.1/synkit/Synthesis/CRN/mod_crn.py +160 -0
  127. synkit-1.0.1/synkit/Synthesis/MSR/__init__.py +0 -0
  128. synkit-1.0.1/synkit/Synthesis/MSR/multi_steps.py +137 -0
  129. synkit-1.0.1/synkit/Synthesis/MSR/path_finder.py +216 -0
  130. synkit-1.0.1/synkit/Synthesis/Metrics/__init__.py +0 -0
  131. synkit-1.0.1/synkit/Synthesis/Metrics/_base.py +49 -0
  132. synkit-1.0.1/synkit/Synthesis/Metrics/_plot.py +121 -0
  133. synkit-1.0.1/synkit/Synthesis/Metrics/_ranking.py +173 -0
  134. synkit-1.0.1/synkit/Synthesis/Reactor/__init__.py +0 -0
  135. synkit-1.0.1/synkit/Synthesis/Reactor/batch_reactor.py +462 -0
  136. synkit-1.0.1/synkit/Synthesis/Reactor/benchmark.py +152 -0
  137. synkit-1.0.1/synkit/Synthesis/Reactor/imba_engine.py +173 -0
  138. synkit-1.0.1/synkit/Synthesis/Reactor/mod_aam.py +279 -0
  139. synkit-1.0.1/synkit/Synthesis/Reactor/mod_reactor.py +428 -0
  140. synkit-1.0.1/synkit/Synthesis/Reactor/partial_engine.py +70 -0
  141. synkit-1.0.1/synkit/Synthesis/Reactor/post_syn.py +267 -0
  142. synkit-1.0.1/synkit/Synthesis/Reactor/rbl_engine.py +122 -0
  143. synkit-1.0.1/synkit/Synthesis/Reactor/rule_filter.py +195 -0
  144. synkit-1.0.1/synkit/Synthesis/Reactor/single_predictor.py +90 -0
  145. synkit-1.0.1/synkit/Synthesis/Reactor/strategy.py +51 -0
  146. synkit-1.0.1/synkit/Synthesis/Reactor/syn_reactor.py +609 -0
  147. synkit-1.0.1/synkit/Synthesis/__init__.py +0 -0
  148. synkit-1.0.1/synkit/Synthesis/reactor_utils.py +346 -0
  149. synkit-1.0.1/synkit/Utils/__init__.py +0 -0
  150. synkit-1.0.1/synkit/Utils/utils.py +178 -0
  151. synkit-1.0.1/synkit/Vis/__init__.py +5 -0
  152. synkit-1.0.1/synkit/Vis/chemical_space.py +83 -0
  153. synkit-1.0.1/synkit/Vis/embedding.py +85 -0
  154. synkit-1.0.1/synkit/Vis/graph_visualizer.py +382 -0
  155. synkit-1.0.1/synkit/Vis/pdf_writer.py +141 -0
  156. synkit-1.0.1/synkit/Vis/rule_vis.py +179 -0
  157. synkit-1.0.1/synkit/Vis/rxn_vis.py +159 -0
  158. synkit-1.0.1/synkit/__init__.py +0 -0
  159. synkit-1.0.1/synkit/examples.py +50 -0
  160. {synkit-0.0.16 → synkit-1.0.1}/.gitignore +0 -0
  161. {synkit-0.0.16 → synkit-1.0.1}/LICENSE +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: synkit
3
- Version: 0.0.16
3
+ Version: 1.0.1
4
4
  Summary: Utility for reaction modeling using graph grammar
5
5
  Project-URL: homepage, https://github.com/TieuLongPhan/SynKit
6
6
  Project-URL: source, https://github.com/TieuLongPhan/SynKit
@@ -44,7 +44,7 @@ Description-Content-Type: text/markdown
44
44
 
45
45
  **Toolkit for Synthesis Planning**
46
46
 
47
- SynKit is a collection of tools designed to support the planning and execution of chemical synthesis.
47
+ SynKit is a collection of tools designed to support the planning and execution of chemical synthesis. Check out the [documentation](https://tieulongphan.github.io/SynKit/) for a comprehensive description of its features.
48
48
 
49
49
  ![SynKit](https://raw.githubusercontent.com/TieuLongPhan/SynKit/main/Data/Figure/synkit.png)
50
50
 
@@ -96,7 +96,7 @@ For more details on each utility within the repository, please refer to the docu
96
96
  ```bash
97
97
  docker pull tieulongphan/synkit:latest
98
98
  # or a specific version:
99
- docker pull tieulongphan/synkit:0.1.0
99
+ docker pull tieulongphan/synkit:1.0.0
100
100
  ```
101
101
  Run a container (sanity check):
102
102
  ```
@@ -13,7 +13,7 @@
13
13
 
14
14
  **Toolkit for Synthesis Planning**
15
15
 
16
- SynKit is a collection of tools designed to support the planning and execution of chemical synthesis.
16
+ SynKit is a collection of tools designed to support the planning and execution of chemical synthesis. Check out the [documentation](https://tieulongphan.github.io/SynKit/) for a comprehensive description of its features.
17
17
 
18
18
  ![SynKit](https://raw.githubusercontent.com/TieuLongPhan/SynKit/main/Data/Figure/synkit.png)
19
19
 
@@ -65,7 +65,7 @@ For more details on each utility within the repository, please refer to the docu
65
65
  ```bash
66
66
  docker pull tieulongphan/synkit:latest
67
67
  # or a specific version:
68
- docker pull tieulongphan/synkit:0.1.0
68
+ docker pull tieulongphan/synkit:1.0.0
69
69
  ```
70
70
  Run a container (sanity check):
71
71
  ```
@@ -4,17 +4,17 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "synkit"
7
- version = "0.0.16"
8
- license = { text = "MIT" }
9
- license-files = ["LICENSE"]
10
- authors = [
11
- { name = "Tieu Long Phan", email = "tieu@bioinf.uni-leipzig.de" }
12
- ]
7
+ version = "1.0.1"
13
8
  description = "Utility for reaction modeling using graph grammar"
14
9
  readme = "README.md"
15
10
  long-description = { file = "CHANGELOG.md" }
16
11
  long-description-content-type = "text/markdown"
17
12
  requires-python = ">=3.11"
13
+ license = { text = "MIT" }
14
+ license-files = ["LICENSE"]
15
+ authors = [
16
+ { name = "Tieu Long Phan", email = "tieu@bioinf.uni-leipzig.de" }
17
+ ]
18
18
  classifiers = [
19
19
  "Programming Language :: Python :: 3",
20
20
  "License :: OSI Approved :: MIT License",
@@ -39,28 +39,16 @@ docs = [
39
39
  ]
40
40
 
41
41
  [project.urls]
42
- homepage = "https://github.com/TieuLongPhan/SynKit"
43
- source = "https://github.com/TieuLongPhan/SynKit"
44
- issues = "https://github.com/TieuLongPhan/SynKit/issues"
42
+ homepage = "https://github.com/TieuLongPhan/SynKit"
43
+ source = "https://github.com/TieuLongPhan/SynKit"
44
+ issues = "https://github.com/TieuLongPhan/SynKit/issues"
45
45
  documentation = "https://tieulongphan.github.io/SynKit/"
46
46
 
47
47
  [tool.hatch.build]
48
- # include non-Python data files in all build targets
49
- include = [
50
- { path = "synkit/Data/*.json" },
51
- { path = "synkit/Data/*.json.gz" }
52
- ]
48
+ packages = ["synkit"]
53
49
 
54
50
  [tool.hatch.build.targets.wheel]
55
- # wheel also gets the same data files
56
- include = [
57
- "synkit/Data/*.json",
58
- "synkit/Data/*.json.gz"
59
- ]
51
+ include = ["synkit/Data/**"]
60
52
 
61
53
  [tool.hatch.build.targets.sdist]
62
- # source sdist likewise
63
- include = [
64
- "synkit/Data/*.json",
65
- "synkit/Data/*.json.gz"
66
- ]
54
+ include = ["synkit/Data/**"]
File without changes
@@ -0,0 +1,139 @@
1
+ from __future__ import annotations
2
+ from typing import List, Optional
3
+
4
+ import numpy as np
5
+ from rdkit.DataStructs import cDataStructs, CreateFromBitString, BulkTanimotoSimilarity
6
+ from rdkit.ML.Cluster import Butina
7
+ from sklearn.manifold import TSNE
8
+ import matplotlib.pyplot as plt
9
+
10
+
11
class ButinaCluster:
    """Cluster binary fingerprint vectors with RDKit's Butina algorithm and
    plot the resulting clusters on a 2D t-SNE embedding.

    Key features
    ------------
    * **Butina clustering** - clustering driven by a distance cutoff.
    * **t-SNE visualization** - 2D embedding of fingerprints, highlighting
      the top-k largest clusters.
    * **NumPy support** - accepts 2D arrays of 0/1 fingerprint data.
    * **Configurable** - user-defined cutoff, perplexity, and top-k highlight.

    Quick start
    -----------
    >>> from synkit.Chem.Cluster.butina import ButinaCluster
    >>> clusters = ButinaCluster.cluster(arr, cutoff=0.3)
    >>> ButinaCluster.visualize(arr, clusters, k=5)
    """

    @staticmethod
    def cluster(arr: np.ndarray, cutoff: float = 0.2) -> List[List[int]]:
        """Perform Butina clustering on fingerprint bit-vectors.

        :param arr: 2D array of shape (n_samples, n_bits) with 0/1 values.
        :type arr: np.ndarray
        :param cutoff: Distance cutoff (1 - Tanimoto similarity) used to
            form clusters. Defaults to 0.2.
        :type cutoff: float
        :returns: List of clusters, each a list of sample (row) indices.
        :rtype: list of list of int
        """
        # Each row becomes an RDKit bit vector built from its "0101..." string.
        fps = [
            CreateFromBitString("".join(str(int(bit)) for bit in row.tolist()))
            for row in arr
        ]

        n = len(fps)
        if n == 0:
            return []

        # Butina.ClusterData consumes a flattened *lower*-triangle distance
        # list, i.e. pairs in the order (1,0), (2,0), (2,1), (3,0), ...
        # Comparing row i against rows [0, i) produces exactly that order;
        # a row-wise upper-triangle walk does not once n >= 4.
        distances: List[float] = []
        for i in range(1, n):
            sims = BulkTanimotoSimilarity(fps[i], fps[:i])
            distances.extend(1.0 - float(sim) for sim in sims)

        # ClusterData(distanceList, nPts, cutoff, isDistData)
        clusters = Butina.ClusterData(distances, n, cutoff, True)
        # Normalise to the annotated list-of-lists shape.
        return [list(members) for members in clusters]

    @staticmethod
    def _label_samples(
        n_samples: int, clusters: List[List[int]], k: Optional[int] = None
    ) -> np.ndarray:
        """Map every sample index to its cluster index, or -1 for 'Other'.

        :param n_samples: Total number of samples (length of the result).
        :type n_samples: int
        :param clusters: Clusters given as sequences of sample indices.
        :type clusters: list of list of int
        :param k: Keep only the k largest clusters; members of all other
            clusters are labelled -1. ``None`` keeps every cluster.
        :type k: int or None
        :returns: Integer label per sample.
        :rtype: np.ndarray
        """
        labels = np.full(n_samples, -1, dtype=int)
        # Cluster indices ordered from largest to smallest (stable on ties).
        by_size = sorted(
            range(len(clusters)), key=lambda i: len(clusters[i]), reverse=True
        )
        highlighted = set(by_size if k is None else by_size[:k])
        for idx, members in enumerate(clusters):
            label = idx if idx in highlighted else -1
            for sample in members:
                labels[sample] = label
        return labels

    @staticmethod
    def visualize(
        arr: np.ndarray,
        clusters: List[List[int]],
        k: Optional[int] = None,
        perplexity: float = 30.0,
        random_state: int = 42,
    ) -> None:
        """Visualize clusters in 2D via t-SNE embedding.

        :param arr: 2D array of shape (n_samples, n_features) with
            fingerprint data.
        :type arr: np.ndarray
        :param clusters: Clusters as returned by `cluster()`.
        :type clusters: list of list of int
        :param k: If provided, highlight only the top-k largest clusters;
            others are shown as 'Other'.
        :type k: int or None
        :param perplexity: t-SNE perplexity parameter. Defaults to 30.0.
        :type perplexity: float
        :param random_state: Random seed for reproducibility. Defaults to 42.
        :type random_state: int
        :returns: None
        :rtype: NoneType

        :example:
        >>> clusters = ButinaCluster.cluster(arr, cutoff=0.3)
        >>> ButinaCluster.visualize(arr, clusters, k=5)
        """
        labels = ButinaCluster._label_samples(arr.shape[0], clusters, k)

        # compute t-SNE embedding
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state)
        emb = tsne.fit_transform(arr)

        # one scatter series per label so each gets its own legend entry
        plt.figure(figsize=(8, 6))
        for lab in np.unique(labels):
            mask = labels == lab
            if lab == -1:
                plt.scatter(
                    emb[mask, 0], emb[mask, 1], color="gray", alpha=0.3, label="Other"
                )
            else:
                plt.scatter(
                    emb[mask, 0], emb[mask, 1], alpha=0.7, label=f"Cluster {lab}"
                )
        plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.title("t-SNE visualization of Butina clusters")
        plt.xlabel("t-SNE dim 1")
        plt.ylabel("t-SNE dim 2")
        plt.tight_layout()
        plt.show()

    def __str__(self) -> str:
        """Short description of the clusterer.

        :returns: Class name.
        :rtype: str
        """
        return "<ButinaCluster>"

    def help(self) -> None:
        """Print usage summary for clustering and visualization.

        :returns: None
        :rtype: NoneType
        """
        print("ButinaCluster.cluster(arr, cutoff=0.2)")
        print("ButinaCluster.visualize(arr, clusters, k=None, perplexity=30.0)")
File without changes
@@ -0,0 +1,155 @@
1
+ from __future__ import annotations
2
+ from typing import Any, Dict, List
3
+ from joblib import Parallel, delayed
4
+
5
+ from synkit.IO.debug import configure_warnings_and_logs
6
+ from synkit.Chem.Fingerprint.transformation_fp import TransformationFP
7
+
8
+ configure_warnings_and_logs(True, True)
9
+
10
+
11
class FPCalculator:
    """Compute reaction fingerprint vectors from reaction SMILES strings,
    one dictionary at a time or for a whole batch via joblib.

    :cvar fps: Fingerprint engine shared by all instances.
    :vartype fps: TransformationFP
    :cvar VALID_FP_TYPES: Fingerprint type identifiers accepted by
        `parallel_process`.
    :vartype VALID_FP_TYPES: List[str]
    :param n_jobs: Number of parallel jobs used for batch processing.
    :type n_jobs: int
    :param verbose: Verbosity level handed to joblib.
    :type verbose: int
    """

    fps: TransformationFP = TransformationFP()
    VALID_FP_TYPES: List[str] = [
        "drfp", "avalon", "maccs", "torsion", "pharm2D",
        "ecfp2", "ecfp4", "ecfp6",
        "fcfp2", "fcfp4", "fcfp6",
        "rdk5", "rdk6", "rdk7",
        "ap",
    ]

    def __init__(self, n_jobs: int = 1, verbose: int = 0) -> None:
        """Store the parallel-execution settings.

        :param n_jobs: Number of parallel jobs for fingerprint computation.
        :type n_jobs: int
        :param verbose: Verbosity level for the parallel processing.
        :type verbose: int
        """
        self.n_jobs, self.verbose = n_jobs, verbose

    def _validate_fp_type(self, fp_type: str) -> None:
        """Reject fingerprint types that are not listed in VALID_FP_TYPES.

        :param fp_type: Fingerprint type identifier to check.
        :type fp_type: str
        :raises ValueError: If `fp_type` is not in VALID_FP_TYPES.
        """
        if fp_type in self.VALID_FP_TYPES:
            return
        valid = ", ".join(self.VALID_FP_TYPES)
        raise ValueError(
            f"Unsupported fingerprint type '{fp_type}'. Supported types: {valid}."
        )

    @staticmethod
    def dict_process(
        data_dict: Dict[str, Any],
        rsmi_key: str,
        symbol: str = ">>",
        fp_type: str = "ecfp4",
        absolute: bool = True,
    ) -> Dict[str, Any]:
        """Fingerprint one reaction entry and store the vector in the same
        dictionary.

        The input dictionary is modified in place: the vector is stored
        under the key `fp_type` itself (e.g. ``"ecfp4"``).

        :param data_dict: Dictionary containing reaction data.
        :type data_dict: dict
        :param rsmi_key: Key in `data_dict` holding the reaction SMILES.
        :type rsmi_key: str
        :param symbol: Delimiter between reactants and products.
        :type symbol: str
        :param fp_type: Fingerprint type to compute.
        :type fp_type: str
        :param absolute: Forwarded to the engine as ``abs``; whether to take
            absolute values of the fingerprint difference.
        :type absolute: bool
        :returns: The same dictionary, now containing the fingerprint.
        :rtype: dict
        :raises ValueError: If `rsmi_key` is missing in `data_dict`.
        """
        if rsmi_key not in data_dict:
            raise ValueError(f"Key '{rsmi_key}' not found in data dictionary.")
        reaction_smiles = data_dict[rsmi_key]
        data_dict[f"{fp_type}"] = FPCalculator.fps.fit(
            reaction_smiles, symbols=symbol, fp_type=fp_type, abs=absolute
        )
        return data_dict

    def parallel_process(
        self,
        data_dicts: List[Dict[str, Any]],
        rsmi_key: str,
        symbol: str = ">>",
        fp_type: str = "ecfp4",
        absolute: bool = True,
    ) -> List[Dict[str, Any]]:
        """Fingerprint a batch of reaction dictionaries with joblib.

        :param data_dicts: Dictionaries, each containing a reaction SMILES.
        :type data_dicts: list of dict
        :param rsmi_key: Key in each dict holding the reaction SMILES.
        :type rsmi_key: str
        :param symbol: Delimiter between reactants and products.
        :type symbol: str
        :param fp_type: Fingerprint type to compute.
        :type fp_type: str
        :param absolute: Whether to take absolute values of the fingerprint
            difference.
        :type absolute: bool
        :returns: One dictionary per input, each carrying the fingerprint
            under the key `fp_type`.
        :rtype: list of dict
        :raises ValueError: If `fp_type` is unsupported or any dict is
            missing `rsmi_key`.
        """
        # The type is checked once here; dict_process does not re-check it.
        self._validate_fp_type(fp_type)

        job = delayed(self.dict_process)
        tasks = (
            job(entry, rsmi_key, symbol, fp_type, absolute) for entry in data_dicts
        )
        runner = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
        return runner(tasks)

    def __str__(self) -> str:
        """Summarize the calculator configuration.

        :returns: A summary of n_jobs and verbosity.
        :rtype: str
        """
        return f"<FPCalculator n_jobs={self.n_jobs} verbose={self.verbose}>"

    def help(self) -> None:
        """Print the supported fingerprint types and current settings.

        :returns: None
        :rtype: NoneType
        """
        print("FPCalculator supports the following fingerprint types:")
        for name in self.VALID_FP_TYPES:
            print(" -", name)
        print(f"Configured for {self.n_jobs} parallel jobs, verbose={self.verbose}")
@@ -0,0 +1,258 @@
1
+ """smiles_featurizer.py
2
+ =======================
3
+ Utility for converting SMILES strings into various cheminformatics fingerprints,
4
+ with optional NumPy‐array conversion.
5
+
6
+ Key features
7
+ ------------
8
+ * **Multi‐fingerprint support** – MACCS, Avalon, ECFP/FCFP, RDKit, AtomPair, Torsion, Pharm2D
9
+ * **SMILES validation** – raises on invalid input
10
+ * **Array conversion** – output as NumPy arrays for ML pipelines
11
+ * **Extensible** – add new methods or override via subclassing
12
+
13
+ Quick start
14
+ -----------
15
+ >>> from synkit.Chem.Fingerprint.smiles_featurizer import SmilesFeaturizer
16
+ >>> arr = SmilesFeaturizer.featurize_smiles("CCO", "ecfp4", convert_to_array=True)
17
+ """
18
+
19
+ from __future__ import annotations
20
+ from typing import Any
21
+
22
+ import numpy as np
23
+ from rdkit import Chem, DataStructs
24
+ from rdkit.Chem import AllChem, MACCSkeys
25
+ from rdkit.Chem.AtomPairs import Pairs, Torsions
26
+ from rdkit.Avalon import pyAvalonTools as fpAvalon
27
+ from rdkit.Chem.Pharm2D import Gobbi_Pharm2D, Generate
28
+
29
+
30
class SmilesFeaturizer:
    """Convert SMILES strings into chemical fingerprint vectors.

    The class keeps no state; every fingerprint routine is a static or
    class method.

    Supported fingerprint methods:
      - MACCS keys
      - Avalon
      - ECFP/FCFP (Morgan)
      - RDKit topological
      - AtomPair
      - Torsion
      - 2D Pharmacophore

    Use `featurize_smiles` for one-line access.
    """

    def __init__(self) -> None:
        """Initialize SmilesFeaturizer (no instance state is created)."""

    @staticmethod
    def smiles_to_mol(smiles: str) -> Chem.Mol:
        """Convert a SMILES string to an RDKit Mol object.

        :param smiles: The SMILES string to convert.
        :type smiles: str
        :returns: RDKit Mol object corresponding to the SMILES.
        :rtype: Chem.Mol
        :raises ValueError: If RDKit cannot parse the SMILES string.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES string: {smiles!r}")
        return mol

    @staticmethod
    def get_maccs_keys(mol: Chem.Mol) -> Any:
        """Generate the MACCS keys fingerprint for a molecule.

        :param mol: RDKit Mol object.
        :type mol: Chem.Mol
        :returns: MACCS keys fingerprint bit vector.
        :rtype: ExplicitBitVect
        """
        return MACCSkeys.GenMACCSKeys(mol)

    @staticmethod
    def get_avalon_fp(mol: Chem.Mol, nBits: int = 1024) -> Any:
        """Generate the Avalon fingerprint for a molecule.

        :param mol: RDKit Mol object.
        :type mol: Chem.Mol
        :param nBits: Length of the fingerprint vector.
        :type nBits: int
        :returns: Avalon fingerprint bit vector.
        :rtype: ExplicitBitVect
        """
        return fpAvalon.GetAvalonFP(mol, nBits)

    @staticmethod
    def get_ecfp(
        mol: Chem.Mol, radius: int, nBits: int = 2048, useFeatures: bool = False
    ) -> Any:
        """Generate a Morgan fingerprint (ECFP or FCFP) for a molecule.

        :param mol: RDKit Mol object.
        :type mol: Chem.Mol
        :param radius: Radius for the Morgan algorithm.
        :type radius: int
        :param nBits: Length of the fingerprint vector.
        :type nBits: int
        :param useFeatures: If True, generate a feature-class
            fingerprint (FCFP).
        :type useFeatures: bool
        :returns: Morgan fingerprint bit vector.
        :rtype: ExplicitBitVect
        """
        return AllChem.GetMorganFingerprintAsBitVect(
            mol, radius, nBits=nBits, useFeatures=useFeatures
        )

    @staticmethod
    def get_rdk_fp(
        mol: Chem.Mol, maxPath: int, fpSize: int = 2048, nBitsPerHash: int = 2
    ) -> Any:
        """Generate an RDKit topological fingerprint for a molecule.

        :param mol: RDKit Mol object.
        :type mol: Chem.Mol
        :param maxPath: Maximum path length (bonds) to include.
        :type maxPath: int
        :param fpSize: Length of the fingerprint vector.
        :type fpSize: int
        :param nBitsPerHash: Bits per hash for path hashing.
        :type nBitsPerHash: int
        :returns: RDKit topological fingerprint bit vector.
        :rtype: ExplicitBitVect
        """
        return Chem.RDKFingerprint(
            mol, maxPath=maxPath, fpSize=fpSize, nBitsPerHash=nBitsPerHash
        )

    @staticmethod
    def mol_to_ap(mol: Chem.Mol) -> Any:
        """Generate an Atom Pair fingerprint for a molecule.

        :param mol: RDKit Mol object.
        :type mol: Chem.Mol
        :returns: The object produced by ``Pairs.GetAtomPairFingerprint``.
            NOTE(review): presumably a sparse integer count vector rather
            than an ExplicitBitVect - confirm against the RDKit API.
        """
        return Pairs.GetAtomPairFingerprint(mol)

    @staticmethod
    def mol_to_torsion(mol: Chem.Mol) -> Any:
        """Generate a Topological Torsion fingerprint for a molecule.

        :param mol: RDKit Mol object.
        :type mol: Chem.Mol
        :returns: The object produced by
            ``Torsions.GetTopologicalTorsionFingerprintAsIntVect``.
            NOTE(review): presumably a sparse integer count vector rather
            than an ExplicitBitVect - confirm against the RDKit API.
        """
        return Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)

    @staticmethod
    def mol_to_pharm2d(mol: Chem.Mol) -> Any:
        """Generate a 2D Pharmacophore fingerprint for a molecule.

        :param mol: RDKit Mol object.
        :type mol: Chem.Mol
        :returns: 2D pharmacophore fingerprint built with the Gobbi factory.
        """
        return Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory)

    @staticmethod
    def _numeric_suffix(fingerprint_type: str, prefix_len: int) -> int:
        """Parse the integer that follows a fingerprint-family prefix.

        :param fingerprint_type: Full type name, e.g. ``"ecfp4"``.
        :type fingerprint_type: str
        :param prefix_len: Number of leading characters forming the family
            name (4 for ecfp/fcfp, 3 for rdk).
        :type prefix_len: int
        :returns: The integer encoded after the prefix.
        :rtype: int
        :raises ValueError: If the suffix is empty or not a plain decimal
            number.
        """
        suffix = fingerprint_type[prefix_len:]
        # isascii() rules out Unicode digit characters that int() rejects.
        if not (suffix.isascii() and suffix.isdigit()):
            raise ValueError(f"Unsupported fingerprint type: {fingerprint_type!r}")
        return int(suffix)

    @classmethod
    def featurize_smiles(
        cls,
        smiles: str,
        fingerprint_type: str,
        convert_to_array: bool = True,
        **kwargs: Any,
    ) -> Any:
        """Featurize a SMILES string into a chosen fingerprint, optionally
        converting to a NumPy array.

        :param smiles: The SMILES string to featurize.
        :type smiles: str
        :param fingerprint_type: One of 'maccs', 'avalon', 'ecfp#', 'fcfp#',
            'rdk#', 'ap', 'torsion', 'pharm2d' (case-insensitive). For
            'ecfp#'/'fcfp#' the number is used as the Morgan radius; for
            'rdk#' it is used as `maxPath`.
        :type fingerprint_type: str
        :param convert_to_array: If True, convert the result to a NumPy array.
        :type convert_to_array: bool
        :param kwargs: Optional settings read by the chosen method:
            - `nBits` for Avalon (default 1024) and ECFP/FCFP (default 2048)
            - `fpSize` (default 2048) and `nBitsPerHash` (default 2) for
              RDKit FP
        :type kwargs: dict
        :returns: Fingerprint as an int8 NumPy array (if `convert_to_array`)
            or the raw RDKit fingerprint object.
        :rtype: np.ndarray or RDKit fingerprint object
        :raises ValueError: If the SMILES is invalid, or `fingerprint_type`
            is unsupported or lacks a valid numeric suffix.
        """
        mol = cls.smiles_to_mol(smiles)

        ft = fingerprint_type.lower()
        if ft == "maccs":
            fp = cls.get_maccs_keys(mol)
        elif ft == "avalon":
            fp = cls.get_avalon_fp(mol, nBits=kwargs.get("nBits", 1024))
        elif ft.startswith(("ecfp", "fcfp")):
            # NOTE(review): the number is passed straight through as the
            # Morgan radius, whereas the usual ECFP naming encodes the
            # diameter (ECFP4 = radius 2) - confirm this is intended.
            radius = cls._numeric_suffix(fingerprint_type, 4)
            fp = cls.get_ecfp(
                mol,
                radius,
                nBits=kwargs.get("nBits", 2048),
                useFeatures=ft.startswith("fcfp"),
            )
        elif ft.startswith("rdk"):
            max_path = cls._numeric_suffix(fingerprint_type, 3)
            fp = cls.get_rdk_fp(
                mol,
                maxPath=max_path,
                fpSize=kwargs.get("fpSize", 2048),
                nBitsPerHash=kwargs.get("nBitsPerHash", 2),
            )
        elif ft == "ap":
            fp = cls.mol_to_ap(mol)
        elif ft == "torsion":
            fp = cls.mol_to_torsion(mol)
        elif ft == "pharm2d":
            fp = cls.mol_to_pharm2d(mol)
        else:
            raise ValueError(f"Unsupported fingerprint type: {fingerprint_type!r}")

        if not convert_to_array:
            return fp

        if ft == "pharm2d":
            # The pharmacophore fingerprint is expanded through its bit string.
            bitstr = fp.ToBitString()
            return np.array([int(b) for b in bitstr], dtype=np.int8)

        # NOTE(review): this path relies on fp.GetNumBits(); verify that the
        # objects returned for 'ap' and 'torsion' provide it.
        arr = np.zeros((fp.GetNumBits(),), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        return arr

    def __str__(self) -> str:
        """Short description of the featurizer.

        :returns: Class name.
        :rtype: str
        """
        return "<SmilesFeaturizer>"

    def help(self) -> None:
        """Print supported fingerprint types and usage summary.

        :returns: None
        :rtype: NoneType
        """
        print("SmilesFeaturizer supports the following fingerprint types:")
        print(" - maccs, avalon, ecfp#, fcfp#, rdk#, ap, torsion, pharm2d")
        print(
            "Usage: SmilesFeaturizer.featurize_smiles(smiles, fingerprint_type, **kwargs)"
        )