workbench 0.8.219__py3-none-any.whl → 0.8.224__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  2. workbench/algorithms/dataframe/fingerprint_proximity.py +190 -31
  3. workbench/algorithms/dataframe/projection_2d.py +8 -2
  4. workbench/algorithms/dataframe/proximity.py +3 -0
  5. workbench/api/feature_set.py +0 -1
  6. workbench/core/artifacts/feature_set_core.py +183 -228
  7. workbench/core/transforms/features_to_model/features_to_model.py +2 -8
  8. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +2 -0
  9. workbench/model_scripts/chemprop/chemprop.template +193 -68
  10. workbench/model_scripts/chemprop/generated_model_script.py +198 -73
  11. workbench/model_scripts/pytorch_model/generated_model_script.py +3 -3
  12. workbench/model_scripts/xgb_model/generated_model_script.py +3 -3
  13. workbench/scripts/ml_pipeline_sqs.py +71 -2
  14. workbench/themes/light/custom.css +7 -1
  15. workbench/themes/midnight_blue/custom.css +34 -0
  16. workbench/utils/chem_utils/projections.py +16 -6
  17. workbench/utils/model_utils.py +0 -1
  18. workbench/utils/plot_utils.py +146 -28
  19. workbench/utils/theme_manager.py +95 -30
  20. workbench/web_interface/components/plugins/scatter_plot.py +152 -66
  21. workbench/web_interface/components/settings_menu.py +184 -0
  22. {workbench-0.8.219.dist-info → workbench-0.8.224.dist-info}/METADATA +4 -13
  23. {workbench-0.8.219.dist-info → workbench-0.8.224.dist-info}/RECORD +27 -25
  24. {workbench-0.8.219.dist-info → workbench-0.8.224.dist-info}/WHEEL +0 -0
  25. {workbench-0.8.219.dist-info → workbench-0.8.224.dist-info}/entry_points.txt +0 -0
  26. {workbench-0.8.219.dist-info → workbench-0.8.224.dist-info}/licenses/LICENSE +0 -0
  27. {workbench-0.8.219.dist-info → workbench-0.8.224.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,321 @@
1
+ """Compound Dataset Overlap Analysis
2
+
3
+ This module provides utilities for comparing two molecular datasets based on
4
+ Tanimoto similarity in fingerprint space. It helps quantify the "overlap"
5
+ between datasets in chemical space.
6
+
7
+ Use cases:
8
+ - Train/test split validation: Ensure test set isn't too similar to training
9
+ - Dataset comparison: Compare proprietary vs public datasets
10
+ - Novelty assessment: Find compounds in query dataset that are novel vs reference
11
+ """
12
+
13
+ import logging
14
+ from typing import Optional, Tuple
15
+
16
+ import pandas as pd
17
+
18
+ from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity
19
+
20
+ # Set up logging
21
+ log = logging.getLogger("workbench")
22
+
23
+
24
class CompoundDatasetOverlap:
    """Compare two molecular datasets using Tanimoto similarity.

    Builds a FingerprintProximity model on the reference dataset, then queries
    with SMILES from the query dataset to find the nearest neighbor in the
    reference for each query compound. This guarantees cross-dataset matches.

    Attributes:
        prox: FingerprintProximity instance on reference dataset
        overlap_df: Results DataFrame with similarity scores for each query compound
    """

    def __init__(
        self,
        df_reference: pd.DataFrame,
        df_query: pd.DataFrame,
        id_column_reference: str = "id",
        id_column_query: str = "id",
        radius: int = 2,
        n_bits: int = 2048,
    ) -> None:
        """
        Initialize the CompoundDatasetOverlap analysis.

        Args:
            df_reference: Reference dataset (DataFrame with SMILES)
            df_query: Query dataset (DataFrame with SMILES)
            id_column_reference: ID column name in df_reference
            id_column_query: ID column name in df_query
            radius: Morgan fingerprint radius (default: 2 = ECFP4)
            n_bits: Number of fingerprint bits (default: 2048)

        Raises:
            ValueError: If either dataset has no SMILES column.
        """
        self.id_column_reference = id_column_reference
        self.id_column_query = id_column_query
        self._radius = radius
        self._n_bits = n_bits

        # Work on copies so the caller's DataFrames are never mutated
        self.df_reference = df_reference.copy()
        self.df_query = df_query.copy()

        # Find SMILES columns (case-insensitive match on "smiles")
        self._smiles_col_reference = self._find_smiles_column(self.df_reference)
        self._smiles_col_query = self._find_smiles_column(self.df_query)

        if self._smiles_col_reference is None:
            raise ValueError("Reference dataset must have a SMILES column")
        if self._smiles_col_query is None:
            raise ValueError("Query dataset must have a SMILES column")

        log.info(f"Reference dataset: {len(self.df_reference)} compounds")
        log.info(f"Query dataset: {len(self.df_query)} compounds")

        # Build FingerprintProximity on reference dataset only
        self.prox = FingerprintProximity(
            self.df_reference,
            id_column=id_column_reference,
            radius=radius,
            n_bits=n_bits,
        )

        # Compute cross-dataset overlap
        self.overlap_df = self._compute_cross_dataset_overlap()

    @staticmethod
    def _find_smiles_column(df: pd.DataFrame) -> Optional[str]:
        """Find the SMILES column in a DataFrame (case-insensitive)."""
        for col in df.columns:
            if col.lower() == "smiles":
                return col
        return None

    def _compute_cross_dataset_overlap(self) -> pd.DataFrame:
        """For each query compound, find nearest neighbor in reference using neighbors_from_smiles."""
        log.info(f"Computing nearest neighbors in reference for {len(self.df_query)} query compounds")

        # Get SMILES list from query dataset
        query_smiles = self.df_query[self._smiles_col_query].tolist()
        query_ids = self.df_query[self.id_column_query].tolist()

        # Query all compounds against reference (get only nearest neighbor)
        neighbors_df = self.prox.neighbors_from_smiles(query_smiles, n_neighbors=1)

        # Index the nearest match per query SMILES once, instead of scanning
        # neighbors_df for every query compound (avoids an O(Q*N) loop).
        # neighbors_from_smiles sorts by similarity descending within each
        # query_id, so the first occurrence is the nearest neighbor.
        nearest_by_smiles = neighbors_df.drop_duplicates(subset="query_id").set_index("query_id")

        # Build results with query IDs
        results = []
        for q_id, q_smi in zip(query_ids, query_smiles):
            if q_smi in nearest_by_smiles.index:
                row = nearest_by_smiles.loc[q_smi]
                results.append(
                    {
                        "id": q_id,
                        "smiles": q_smi,
                        "nearest_neighbor_id": row["neighbor_id"],
                        "tanimoto_similarity": row["similarity"],
                    }
                )
            else:
                # No neighbor returned for this SMILES (should not happen,
                # but handle gracefully with zero similarity)
                results.append(
                    {
                        "id": q_id,
                        "smiles": q_smi,
                        "nearest_neighbor_id": None,
                        "tanimoto_similarity": 0.0,
                    }
                )

        result_df = pd.DataFrame(results)

        # Add nearest neighbor SMILES from reference
        ref_smiles_map = self.df_reference.set_index(self.id_column_reference)[self._smiles_col_reference]
        result_df["nearest_neighbor_smiles"] = result_df["nearest_neighbor_id"].map(ref_smiles_map)

        return result_df.sort_values("tanimoto_similarity", ascending=False).reset_index(drop=True)

    def summary_stats(self) -> pd.DataFrame:
        """Return distribution statistics for nearest-neighbor Tanimoto similarities."""
        return (
            self.overlap_df["tanimoto_similarity"]
            .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
            .to_frame()
        )

    def novel_compounds(self, threshold: float = 0.4) -> pd.DataFrame:
        """Return query compounds that are novel (low similarity to reference).

        Args:
            threshold: Maximum Tanimoto similarity to consider "novel" (default: 0.4)

        Returns:
            DataFrame of query compounds with similarity below threshold
        """
        novel = self.overlap_df[self.overlap_df["tanimoto_similarity"] < threshold].copy()
        return novel.sort_values("tanimoto_similarity", ascending=True).reset_index(drop=True)

    def similar_compounds(self, threshold: float = 0.7) -> pd.DataFrame:
        """Return query compounds that are similar to reference (high overlap).

        Args:
            threshold: Minimum Tanimoto similarity to consider "similar" (default: 0.7)

        Returns:
            DataFrame of query compounds with similarity above threshold
        """
        similar = self.overlap_df[self.overlap_df["tanimoto_similarity"] >= threshold].copy()
        return similar.sort_values("tanimoto_similarity", ascending=False).reset_index(drop=True)

    def overlap_fraction(self, threshold: float = 0.7) -> float:
        """Return fraction of query compounds that overlap with reference above similarity threshold.

        Args:
            threshold: Minimum Tanimoto similarity to consider "overlapping"

        Returns:
            Fraction of query compounds with nearest neighbor similarity >= threshold
            (0.0 if there are no query compounds).
        """
        # Guard against an empty query set (avoids division by zero)
        if len(self.overlap_df) == 0:
            return 0.0
        n_overlapping = (self.overlap_df["tanimoto_similarity"] >= threshold).sum()
        return n_overlapping / len(self.overlap_df)

    def plot_histogram(self, bins: int = 50, figsize: Tuple[int, int] = (10, 6)) -> None:
        """Plot histogram of nearest-neighbor Tanimoto similarities.

        Args:
            bins: Number of histogram bins
            figsize: Figure size (width, height)
        """
        # Imported lazily so matplotlib is only required when plotting
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots(figsize=figsize)
        ax.hist(self.overlap_df["tanimoto_similarity"], bins=bins, edgecolor="black", alpha=0.7)
        ax.set_xlabel("Tanimoto Similarity (query → nearest in reference)")
        ax.set_ylabel("Count")
        ax.set_title(f"Dataset Overlap: {len(self.overlap_df)} query compounds")
        ax.axvline(x=0.4, color="red", linestyle="--", label="Novel threshold (0.4)")
        ax.axvline(x=0.7, color="green", linestyle="--", label="Similar threshold (0.7)")
        ax.legend()

        # Add summary stats as text
        stats = self.overlap_df["tanimoto_similarity"]
        textstr = f"Mean: {stats.mean():.3f}\nMedian: {stats.median():.3f}\nStd: {stats.std():.3f}"
        ax.text(
            0.02,
            0.98,
            textstr,
            transform=ax.transAxes,
            verticalalignment="top",
            bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5),
        )

        plt.tight_layout()
        plt.show()
217
+
218
+
219
+ # =============================================================================
220
+ # Testing
221
+ # =============================================================================
222
+ if __name__ == "__main__":
223
+ print("=" * 80)
224
+ print("Testing CompoundDatasetOverlap")
225
+ print("=" * 80)
226
+
227
+ # Test 1: Basic functionality with SMILES data
228
+ print("\n1. Testing with SMILES data...")
229
+
230
+ # Reference dataset: Known drug-like compounds
231
+ reference_data = {
232
+ "id": ["aspirin", "caffeine", "glucose", "ibuprofen", "naproxen", "ethanol", "methanol", "propanol"],
233
+ "smiles": [
234
+ "CC(=O)OC1=CC=CC=C1C(=O)O", # aspirin
235
+ "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", # caffeine
236
+ "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O", # glucose
237
+ "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", # ibuprofen
238
+ "COC1=CC2=CC(C(C)C(O)=O)=CC=C2C=C1", # naproxen
239
+ "CCO", # ethanol
240
+ "CO", # methanol
241
+ "CCCO", # propanol
242
+ ],
243
+ }
244
+
245
+ # Query dataset: Compounds to compare against reference
246
+ query_data = {
247
+ "id": ["acetaminophen", "theophylline", "benzene", "toluene", "phenol", "aniline"],
248
+ "smiles": [
249
+ "CC(=O)NC1=CC=C(C=C1)O", # acetaminophen - similar to aspirin
250
+ "CN1C=NC2=C1C(=O)NC(=O)N2", # theophylline - similar to caffeine
251
+ "c1ccccc1", # benzene - simple aromatic
252
+ "Cc1ccccc1", # toluene - similar to benzene
253
+ "Oc1ccccc1", # phenol - hydroxyl benzene
254
+ "Nc1ccccc1", # aniline - amino benzene
255
+ ],
256
+ }
257
+
258
+ df_reference = pd.DataFrame(reference_data)
259
+ df_query = pd.DataFrame(query_data)
260
+
261
+ print(f" Reference: {len(df_reference)} compounds, Query: {len(df_query)} compounds")
262
+
263
+ overlap = CompoundDatasetOverlap(
264
+ df_reference, df_query, id_column_reference="id", id_column_query="id", radius=2, n_bits=1024
265
+ )
266
+
267
+ print("\n Overlap results:")
268
+ print(overlap.overlap_df[["id", "nearest_neighbor_id", "tanimoto_similarity"]].to_string(index=False))
269
+
270
+ print("\n Summary statistics:")
271
+ print(overlap.summary_stats())
272
+
273
+ # Test 2: Novel and similar compound identification
274
+ print("\n2. Testing novel/similar compound identification...")
275
+
276
+ similar = overlap.similar_compounds(threshold=0.3)
277
+ print(f" Similar compounds (sim >= 0.3): {len(similar)}")
278
+ if len(similar) > 0:
279
+ print(similar[["id", "nearest_neighbor_id", "tanimoto_similarity"]].to_string(index=False))
280
+
281
+ novel = overlap.novel_compounds(threshold=0.3)
282
+ print(f"\n Novel compounds (sim < 0.3): {len(novel)}")
283
+ if len(novel) > 0:
284
+ print(novel[["id", "nearest_neighbor_id", "tanimoto_similarity"]].to_string(index=False))
285
+
286
+ # Test 3: With Workbench data (if available)
287
+ print("\n3. Testing with Workbench FeatureSet (if available)...")
288
+
289
+ try:
290
+ from workbench.api import FeatureSet
291
+
292
+ fs = FeatureSet("aqsol_features")
293
+ full_df = fs.pull_dataframe()[:1000] # Limit to first 1000 for testing
294
+
295
+ # Split into reference and query sets
296
+ df_reference = full_df.sample(frac=0.8, random_state=42)
297
+ df_query = full_df.drop(df_reference.index)
298
+
299
+ print(f" Reference set: {len(df_reference)} compounds")
300
+ print(f" Query set: {len(df_query)} compounds")
301
+
302
+ overlap = CompoundDatasetOverlap(
303
+ df_reference, df_query, id_column_reference=fs.id_column, id_column_query=fs.id_column
304
+ )
305
+
306
+ print("\n Summary statistics:")
307
+ print(overlap.summary_stats())
308
+
309
+ print(f"\n Overlap fraction (sim >= 0.7): {overlap.overlap_fraction(0.7):.2%}")
310
+ print(f" Overlap fraction (sim >= 0.5): {overlap.overlap_fraction(0.5):.2%}")
311
+ print(f" Novel compounds (sim < 0.4): {len(overlap.novel_compounds(0.4))}")
312
+
313
+ # Uncomment to show histogram
314
+ overlap.plot_histogram()
315
+
316
+ except Exception as e:
317
+ print(f" Skipping Workbench test: {e}")
318
+
319
+ print("\n" + "=" * 80)
320
+ print("✅ All CompoundDatasetOverlap tests completed!")
321
+ print("=" * 80)
@@ -29,7 +29,6 @@ class FingerprintProximity(Proximity):
29
29
  include_all_columns: bool = False,
30
30
  radius: int = 2,
31
31
  n_bits: int = 1024,
32
- counts: bool = False,
33
32
  ) -> None:
34
33
  """
35
34
  Initialize the FingerprintProximity class for binary fingerprint similarity.
@@ -43,12 +42,10 @@ class FingerprintProximity(Proximity):
43
42
  include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
44
43
  radius: Radius for Morgan fingerprint computation (default: 2).
45
44
  n_bits: Number of bits for fingerprint (default: 1024).
46
- counts: Whether to use count simulation (default: False).
47
45
  """
48
46
  # Store fingerprint computation parameters
49
47
  self._fp_radius = radius
50
48
  self._fp_n_bits = n_bits
51
- self._fp_counts = counts
52
49
 
53
50
  # Store the requested fingerprint column (may be None)
54
51
  self._fingerprint_column_arg = fingerprint_column
@@ -107,54 +104,77 @@ class FingerprintProximity(Proximity):
107
104
  # If fingerprint column doesn't exist yet, compute it
108
105
  if self.fingerprint_column not in self.df.columns:
109
106
  log.info(f"Computing Morgan fingerprints (radius={self._fp_radius}, n_bits={self._fp_n_bits})...")
110
- self.df = compute_morgan_fingerprints(
111
- self.df, radius=self._fp_radius, n_bits=self._fp_n_bits, counts=self._fp_counts
112
- )
107
+ self.df = compute_morgan_fingerprints(self.df, radius=self._fp_radius, n_bits=self._fp_n_bits)
113
108
 
114
109
  def _build_model(self) -> None:
115
110
  """
116
111
  Build the fingerprint proximity model for Tanimoto similarity.
117
- Converts fingerprint strings to binary arrays and initializes NearestNeighbors.
118
112
 
119
- Note: sklearn uses Jaccard distance internally (1 - Tanimoto similarity).
120
- We convert back to Tanimoto similarity in the output methods.
113
+ For binary fingerprints: uses Jaccard distance (1 - Tanimoto)
114
+ For count fingerprints: uses weighted Tanimoto (Ruzicka) distance
121
115
  """
122
- log.info("Converting fingerprints to binary feature matrix...")
123
-
124
- # Convert fingerprint strings to binary arrays and store for later use
125
- self.X = self._fingerprints_to_matrix(self.df)
126
-
127
- # sklearn uses Jaccard distance = 1 - Tanimoto similarity
128
- # We convert to Tanimoto similarity in neighbors() and _precompute_metrics()
129
- log.info("Building NearestNeighbors model (Jaccard/Tanimoto metric, BallTree)...")
130
- self.nn = NearestNeighbors(metric="jaccard", algorithm="ball_tree").fit(self.X)
116
+ # Convert fingerprint strings to matrix and detect format
117
+ self.X, self._is_count_fp = self._fingerprints_to_matrix(self.df)
118
+
119
+ if self._is_count_fp:
120
+ # Weighted Tanimoto (Ruzicka) for count vectors: 1 - Σmin(A,B)/Σmax(A,B)
121
+ log.info("Building NearestNeighbors model (weighted Tanimoto for count fingerprints)...")
122
+
123
+ def ruzicka_distance(a, b):
124
+ """Ruzicka distance = 1 - weighted Tanimoto similarity."""
125
+ min_sum = np.minimum(a, b).sum()
126
+ max_sum = np.maximum(a, b).sum()
127
+ if max_sum == 0:
128
+ return 0.0
129
+ return 1.0 - (min_sum / max_sum)
130
+
131
+ self.nn = NearestNeighbors(metric=ruzicka_distance, algorithm="ball_tree").fit(self.X)
132
+ else:
133
+ # Standard Jaccard for binary fingerprints
134
+ log.info("Building NearestNeighbors model (Jaccard/Tanimoto for binary fingerprints)...")
135
+ self.nn = NearestNeighbors(metric="jaccard", algorithm="ball_tree").fit(self.X)
131
136
 
132
137
  def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
133
138
  """
134
- Transform fingerprints to binary matrix for querying.
139
+ Transform fingerprints to matrix for querying.
135
140
 
136
141
  Args:
137
142
  df: DataFrame containing fingerprints to transform.
138
143
 
139
144
  Returns:
140
- Binary feature matrix for the fingerprints.
145
+ Feature matrix for the fingerprints (binary or count based on self._is_count_fp).
141
146
  """
142
- return self._fingerprints_to_matrix(df)
147
+ matrix, _ = self._fingerprints_to_matrix(df)
148
+ return matrix
143
149
 
144
- def _fingerprints_to_matrix(self, df: pd.DataFrame) -> np.ndarray:
150
+ def _fingerprints_to_matrix(self, df: pd.DataFrame) -> tuple[np.ndarray, bool]:
145
151
  """
146
- Convert fingerprint strings to a binary numpy matrix.
152
+ Convert fingerprint strings to a numpy matrix.
153
+
154
+ Supports two formats (auto-detected):
155
+ - Bitstrings: "10110010..." → binary matrix (bool), is_count=False
156
+ - Count vectors: "0,3,0,1,5,..." → count matrix (uint8), is_count=True
147
157
 
148
158
  Args:
149
159
  df: DataFrame containing fingerprint column.
150
160
 
151
161
  Returns:
152
- 2D numpy array of binary fingerprint bits.
162
+ Tuple of (2D numpy array, is_count_fingerprint boolean)
153
163
  """
154
- fingerprint_bits = df[self.fingerprint_column].apply(
155
- lambda fp: np.array([int(bit) for bit in fp], dtype=np.bool_)
156
- )
157
- return np.vstack(fingerprint_bits)
164
+ # Auto-detect format based on first fingerprint
165
+ sample = str(df[self.fingerprint_column].iloc[0])
166
+ if "," in sample:
167
+ # Count vector format: preserve counts for weighted Tanimoto
168
+ fingerprint_values = df[self.fingerprint_column].apply(
169
+ lambda fp: np.array([int(x) for x in fp.split(",")], dtype=np.uint8)
170
+ )
171
+ return np.vstack(fingerprint_values), True
172
+ else:
173
+ # Bitstring format: binary values
174
+ fingerprint_bits = df[self.fingerprint_column].apply(
175
+ lambda fp: np.array([int(bit) for bit in fp], dtype=np.bool_)
176
+ )
177
+ return np.vstack(fingerprint_bits), False
158
178
 
159
179
  def _precompute_metrics(self) -> None:
160
180
  """Precompute metrics, adding Tanimoto similarity alongside distance."""
@@ -171,8 +191,13 @@ class FingerprintProximity(Proximity):
171
191
  self.core_columns.extend([self.target, "nn_target", "nn_target_diff"])
172
192
 
173
193
  def _project_2d(self) -> None:
174
- """Project the fingerprint matrix to 2D for visualization using UMAP with Jaccard metric."""
175
- self.df = Projection2D().fit_transform(self.df, feature_matrix=self.X, metric="jaccard")
194
+ """Project the fingerprint matrix to 2D for visualization using UMAP."""
195
+ if self._is_count_fp:
196
+ # For count fingerprints, convert to binary for UMAP projection (Jaccard needs binary)
197
+ X_binary = (self.X > 0).astype(np.bool_)
198
+ self.df = Projection2D().fit_transform(self.df, feature_matrix=X_binary, metric="jaccard")
199
+ else:
200
+ self.df = Projection2D().fit_transform(self.df, feature_matrix=self.X, metric="jaccard")
176
201
 
177
202
  def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
178
203
  """
@@ -240,6 +265,81 @@ class FingerprintProximity(Proximity):
240
265
 
241
266
  return neighbors_df
242
267
 
268
+ def neighbors_from_smiles(
269
+ self,
270
+ smiles: Union[str, List[str]],
271
+ n_neighbors: int = 5,
272
+ min_similarity: Optional[float] = None,
273
+ ) -> pd.DataFrame:
274
+ """
275
+ Find neighbors for SMILES strings not in the reference dataset.
276
+
277
+ Args:
278
+ smiles: Single SMILES string or list of SMILES to query
279
+ n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
280
+ min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)
281
+
282
+ Returns:
283
+ DataFrame containing neighbors with Tanimoto similarity scores.
284
+ The 'query_id' column contains the SMILES string (or index if list).
285
+ """
286
+ # Normalize to list
287
+ smiles_list = [smiles] if isinstance(smiles, str) else smiles
288
+
289
+ # Build a temporary DataFrame with the query SMILES
290
+ query_df = pd.DataFrame({"smiles": smiles_list})
291
+
292
+ # Compute fingerprints using same parameters as the reference dataset
293
+ query_df = compute_morgan_fingerprints(query_df, radius=self._fp_radius, n_bits=self._fp_n_bits)
294
+
295
+ # Transform to matrix (use same format detection as reference)
296
+ X_query, _ = self._fingerprints_to_matrix(query_df)
297
+
298
+ # Query the model
299
+ if min_similarity is not None:
300
+ radius = 1 - min_similarity
301
+ distances, indices = self.nn.radius_neighbors(X_query, radius=radius)
302
+ else:
303
+ distances, indices = self.nn.kneighbors(X_query, n_neighbors=n_neighbors)
304
+
305
+ # Build results
306
+ results = []
307
+ for i, (dists, nbrs) in enumerate(zip(distances, indices)):
308
+ query_id = smiles_list[i]
309
+
310
+ for neighbor_idx, dist in zip(nbrs, dists):
311
+ neighbor_row = self.df.iloc[neighbor_idx]
312
+ neighbor_id = neighbor_row[self.id_column]
313
+ similarity = 1.0 - dist if dist > 1e-6 else 1.0
314
+
315
+ result = {
316
+ "query_id": query_id,
317
+ "neighbor_id": neighbor_id,
318
+ "similarity": similarity,
319
+ }
320
+
321
+ # Add target if present
322
+ if self.target and self.target in self.df.columns:
323
+ result[self.target] = neighbor_row[self.target]
324
+
325
+ # Include all columns if requested
326
+ if self.include_all_columns:
327
+ for col in self.df.columns:
328
+ if col not in [self.id_column, "query_id", "neighbor_id", "similarity"]:
329
+ result[f"neighbor_{col}"] = neighbor_row[col]
330
+
331
+ results.append(result)
332
+
333
+ df_results = pd.DataFrame(results)
334
+
335
+ # Sort by query_id then similarity descending
336
+ if len(df_results) > 0:
337
+ df_results = df_results.sort_values(["query_id", "similarity"], ascending=[True, False]).reset_index(
338
+ drop=True
339
+ )
340
+
341
+ return df_results
342
+
243
343
 
244
344
  # Testing the FingerprintProximity class
245
345
  if __name__ == "__main__":
@@ -273,12 +373,71 @@ if __name__ == "__main__":
273
373
  )
274
374
  print(prox.neighbors(["a", "b"]))
275
375
 
376
+ # Regression test: include_all_columns should not break neighbor sorting
377
+ print("\n" + "=" * 80)
378
+ print("Regression test: include_all_columns neighbor sorting...")
379
+ print("=" * 80)
380
+ neighbors_all_cols = prox.neighbors("a", n_neighbors=4)
381
+ # Verify neighbors are sorted by similarity (descending), not alphabetically by neighbor_id
382
+ similarities = neighbors_all_cols["similarity"].tolist()
383
+ assert similarities == sorted(
384
+ similarities, reverse=True
385
+ ), f"Neighbors not sorted by similarity! Got: {similarities}"
386
+ # Verify query_id column has correct value (the query, not the neighbor)
387
+ assert all(
388
+ neighbors_all_cols["id"] == "a"
389
+ ), f"Query ID column corrupted! Expected all 'a', got: {neighbors_all_cols['id'].tolist()}"
390
+ print("PASSED: Neighbors correctly sorted by similarity with include_all_columns=True")
391
+
392
+ # Test neighbors_from_smiles with synthetic data
393
+ print("\n" + "=" * 80)
394
+ print("Testing neighbors_from_smiles...")
395
+ print("=" * 80)
396
+
397
+ # Create reference dataset with known SMILES
398
+ ref_data = {
399
+ "id": ["aspirin", "ibuprofen", "naproxen", "caffeine", "ethanol"],
400
+ "smiles": [
401
+ "CC(=O)OC1=CC=CC=C1C(=O)O", # aspirin
402
+ "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", # ibuprofen
403
+ "COC1=CC2=CC(C(C)C(O)=O)=CC=C2C=C1", # naproxen
404
+ "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", # caffeine
405
+ "CCO", # ethanol
406
+ ],
407
+ "activity": [1.0, 2.0, 2.5, 3.0, 0.5],
408
+ }
409
+ ref_df = pd.DataFrame(ref_data)
410
+
411
+ prox_ref = FingerprintProximity(ref_df, id_column="id", target="activity", radius=2, n_bits=1024)
412
+
413
+ # Query with a single SMILES (acetaminophen - similar to aspirin)
414
+ query_smiles = "CC(=O)NC1=CC=C(C=C1)O" # acetaminophen
415
+ print(f"\nQuery: acetaminophen ({query_smiles})")
416
+ neighbors = prox_ref.neighbors_from_smiles(query_smiles, n_neighbors=3)
417
+ print(neighbors)
418
+
419
+ # Query with multiple SMILES
420
+ print("\nQuery: multiple SMILES (theophylline, methanol)")
421
+ multi_query = [
422
+ "CN1C=NC2=C1C(=O)NC(=O)N2", # theophylline - similar to caffeine
423
+ "CO", # methanol - similar to ethanol
424
+ ]
425
+ neighbors_multi = prox_ref.neighbors_from_smiles(multi_query, n_neighbors=2)
426
+ print(neighbors_multi)
427
+
428
+ # Test with min_similarity threshold
429
+ print("\nQuery with min_similarity=0.3:")
430
+ neighbors_thresh = prox_ref.neighbors_from_smiles(query_smiles, min_similarity=0.3)
431
+ print(neighbors_thresh)
432
+
433
+ print("PASSED: neighbors_from_smiles working correctly")
434
+
276
435
  # Test on real data from Workbench
277
436
  from workbench.api import FeatureSet, Model
278
437
 
279
438
  fs = FeatureSet("aqsol_features")
280
439
  model = Model("aqsol-regression")
281
- df = fs.pull_dataframe()
440
+ df = fs.pull_dataframe()[:1000] # Limit to 1000 for testing
282
441
  prox = FingerprintProximity(df, id_column=fs.id_column, target=model.target())
283
442
 
284
443
  print("\n" + "=" * 80)
@@ -106,8 +106,14 @@ class Projection2D:
106
106
  return PCA(n_components=2)
107
107
 
108
108
  if projection == "UMAP" and UMAP_AVAILABLE:
109
- self.log.info(f"Projection: UMAP with metric={metric}")
110
- return umap.UMAP(n_components=2, metric=metric)
109
+ # UMAP default n_neighbors=15, adjust if dataset is smaller
110
+ n_neighbors = min(15, len(df) - 1)
111
+ if n_neighbors < 15:
112
+ self.log.warning(
113
+ f"Dataset size ({len(df)}) smaller than default n_neighbors, using n_neighbors={n_neighbors}"
114
+ )
115
+ self.log.info(f"Projection: UMAP with metric={metric}, n_neighbors={n_neighbors}")
116
+ return umap.UMAP(n_components=2, metric=metric, n_neighbors=n_neighbors)
111
117
 
112
118
  self.log.warning(
113
119
  f"Projection method '{projection}' not recognized or UMAP not available. Falling back to TSNE."
@@ -331,5 +331,8 @@ class Proximity(ABC):
331
331
  # Include all columns if requested
332
332
  if self.include_all_columns:
333
333
  result.update(neighbor_row.to_dict())
334
+ # Restore query_id after update (neighbor_row may have overwritten id column)
335
+ result[self.id_column] = query_id
336
+ result["neighbor_id"] = neighbor_id
334
337
 
335
338
  return result
@@ -214,7 +214,6 @@ class FeatureSet(FeatureSetCore):
214
214
  include_all_columns=include_all_columns,
215
215
  radius=radius,
216
216
  n_bits=n_bits,
217
- counts=counts,
218
217
  )
219
218
 
220
219
  def cleanlab_model(