isage-tooluse 0.1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,270 @@
1
+ """
2
+ Keyword-based tool selector.
3
+
4
+ Implements TF-IDF and token overlap strategies for tool selection.
5
+ """
6
+
7
+ import logging
8
+ import re
9
+ from collections import Counter
10
+
11
+ import numpy as np
12
+
13
+ from .base import BaseToolSelector, SelectorResources
14
+ from .schemas import KeywordSelectorConfig, ToolPrediction, ToolSelectionQuery
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
# Common English stopwords excluded from keyword matching.
STOPWORDS = set(
    "a an and are as at be by for from has he in is it its of on that "
    "the to was will with this but they have".split()
)
50
+
51
+
52
class KeywordSelector(BaseToolSelector):
    """
    Keyword-based tool selector using TF-IDF, token overlap, or BM25.

    Fast baseline selector: tool texts are tokenized once at construction
    time, and each query is scored against its candidates with O(N) set
    operations.
    """

    def __init__(self, config: KeywordSelectorConfig, resources: SelectorResources):
        """
        Initialize keyword selector.

        Args:
            config: Keyword selector configuration
            resources: Shared resources
        """
        super().__init__(config, resources)
        self.config: KeywordSelectorConfig = config

        # Precomputed tool text representations.
        self._tool_texts: dict[str, str] = {}
        self._tool_tokens: dict[str, set[str]] = {}
        self._idf_scores: dict[str, float] = {}
        # Mean token-set size across all tools, used by BM25 length
        # normalization. Precomputed once here instead of recomputing the
        # corpus mean on every scoring call (which made scoring O(N^2)).
        self._avg_doc_len: float = 0.0

        self._preprocess_tools()

    @classmethod
    def from_config(
        cls, config: KeywordSelectorConfig, resources: SelectorResources
    ) -> "KeywordSelector":
        """Create keyword selector from config."""
        return cls(config, resources)

    def _preprocess_tools(self) -> None:
        """Preprocess all tools: build texts/token sets, IDF, corpus stats.

        Raises:
            Exception: re-raises any error from the tools loader after logging.
        """
        try:
            tools_loader = self.resources.tools_loader

            # Build searchable text and token set for every tool.
            for tool in tools_loader.iter_all():
                text = self._build_tool_text(tool)
                self._tool_texts[tool.tool_id] = text
                self._tool_tokens[tool.tool_id] = self._tokenize(text)

            # IDF scores are needed for both TF-IDF and BM25 scoring.
            if self.config.method in ("tfidf", "bm25"):
                self._compute_idf()

            # Average document (token-set) length for BM25, guarded against
            # an empty corpus.
            if self._tool_tokens:
                self._avg_doc_len = float(
                    np.mean([len(tokens) for tokens in self._tool_tokens.values()])
                )

            self.logger.info(f"Preprocessed {len(self._tool_texts)} tools")

        except Exception as e:
            self.logger.error(f"Error preprocessing tools: {e}")
            raise

    def _build_tool_text(self, tool) -> str:
        """Build searchable text from tool metadata (name, description,
        capabilities, category — whichever attributes are present)."""
        parts = [tool.name]

        if hasattr(tool, "description") and tool.description:
            parts.append(tool.description)

        if hasattr(tool, "capabilities") and tool.capabilities:
            if isinstance(tool.capabilities, list):
                parts.extend(tool.capabilities)
            else:
                parts.append(str(tool.capabilities))

        if hasattr(tool, "category") and tool.category:
            parts.append(tool.category)

        return " ".join(parts)

    def _tokenize(self, text: str) -> set[str]:
        """Tokenize text into a set of tokens (optionally with n-grams).

        Returns a set, so term frequency within a document is discarded.
        """
        if self.config.lowercase:
            text = text.lower()

        # Split on non-alphanumeric; underscore is kept so snake_case
        # identifiers survive as single tokens.
        tokens = re.findall(r"\b[a-z0-9_]+\b", text, re.IGNORECASE)

        # Remove stopwords if enabled.
        if self.config.remove_stopwords:
            tokens = [t for t in tokens if t.lower() not in STOPWORDS]

        # Generate n-grams if configured. Start at n=2: 1-grams are the
        # tokens themselves and would only be duplicated in the result set.
        if self.config.ngram_range[1] > 1:
            ngrams = []
            lo = max(2, self.config.ngram_range[0])
            for n in range(lo, self.config.ngram_range[1] + 1):
                for i in range(len(tokens) - n + 1):
                    ngrams.append("_".join(tokens[i : i + n]))
            tokens.extend(ngrams)

        return set(tokens)

    def _compute_idf(self) -> None:
        """Compute IDF scores for all tokens: idf(t) = log(N / df(t))."""
        # Document frequency for each token across the tool corpus.
        df = Counter()
        total_docs = len(self._tool_tokens)

        for tokens in self._tool_tokens.values():
            df.update(tokens)

        for token, freq in df.items():
            self._idf_scores[token] = np.log(total_docs / freq)

    def _select_impl(self, query: ToolSelectionQuery, top_k: int) -> list[ToolPrediction]:
        """
        Select tools using keyword matching.

        Args:
            query: Tool selection query
            top_k: Number of tools to select

        Returns:
            List of tool predictions, highest score first.

        Raises:
            ValueError: if config.method is not tfidf/overlap/bm25.
        """
        query_tokens = self._tokenize(query.instruction)

        if not query_tokens:
            self.logger.warning(f"No tokens in query {query.sample_id}")
            return []

        # Restrict to the query's candidate pool when given, otherwise
        # score the full corpus.
        candidate_ids = (
            set(query.candidate_tools) if query.candidate_tools else set(self._tool_texts.keys())
        )

        # Score each known candidate; unknown IDs are silently skipped.
        scores = []
        for tool_id in candidate_ids:
            if tool_id not in self._tool_tokens:
                continue

            if self.config.method == "tfidf":
                score = self._tfidf_score(query_tokens, tool_id)
            elif self.config.method == "overlap":
                score = self._overlap_score(query_tokens, tool_id)
            elif self.config.method == "bm25":
                score = self._bm25_score(query_tokens, tool_id)
            else:
                raise ValueError(f"Unknown method: {self.config.method}")

            scores.append((tool_id, score))

        # Sort by score and take top-k.
        # NOTE(review): config.min_score is not applied here — confirm
        # whether thresholding happens in the base class.
        scores.sort(key=lambda x: x[1], reverse=True)
        scores = scores[:top_k]

        return [
            ToolPrediction(
                tool_id=tool_id,
                score=min(score, 1.0),  # Clamp: tfidf/bm25 sums can exceed 1
                metadata={"method": self.config.method},
            )
            for tool_id, score in scores
        ]

    def _tfidf_score(self, query_tokens: set[str], tool_id: str) -> float:
        """Compute TF-IDF score: summed IDF of matched tokens, normalized
        by query length."""
        tool_tokens = self._tool_tokens[tool_id]
        common = query_tokens & tool_tokens

        if not common:
            return 0.0

        score = sum(self._idf_scores.get(token, 0.0) for token in common)

        # Normalize by query length so long queries are not favored.
        score /= len(query_tokens)

        return score

    def _overlap_score(self, query_tokens: set[str], tool_id: str) -> float:
        """Compute token overlap score (Jaccard similarity)."""
        tool_tokens = self._tool_tokens[tool_id]

        if not query_tokens or not tool_tokens:
            return 0.0

        intersection = len(query_tokens & tool_tokens)
        union = len(query_tokens | tool_tokens)

        return intersection / union if union > 0 else 0.0

    def _bm25_score(self, query_tokens: set[str], tool_id: str) -> float:
        """Compute BM25 score (simplified: binary term frequency, since
        token sets carry no counts)."""
        tool_tokens = self._tool_tokens[tool_id]
        common = query_tokens & tool_tokens

        if not common:
            return 0.0

        # Standard BM25 parameters.
        k1 = 1.5
        b = 0.75

        # Corpus mean precomputed in _preprocess_tools; fall back to this
        # document's length so the normalization term stays well-defined.
        avg_len = self._avg_doc_len or float(len(tool_tokens))
        doc_len = len(tool_tokens)

        score = 0.0
        for token in common:
            idf = self._idf_scores.get(token, 0.0)
            tf = 1  # Binary TF

            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * doc_len / avg_len)

            score += idf * (numerator / denominator)

        return score
@@ -0,0 +1,185 @@
1
+ """
2
+ Registry for tool selector strategies.
3
+
4
+ Provides registration, lookup, and factory creation of selectors.
5
+ """
6
+
7
+ import logging
8
+ from typing import Any, Optional
9
+
10
+ from .base import BaseToolSelector, SelectorResources
11
+ from .schemas import SelectorConfig, create_selector_config
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class SelectorRegistry:
    """
    Registry for tool selector strategies.

    Supports registration, lookup, and factory creation of selectors.
    Use get_instance() for the process-wide singleton.
    """

    # Lazily created singleton; see get_instance().
    _instance: Optional["SelectorRegistry"] = None

    def __init__(self):
        """Initialize an empty registry."""
        # Registered classes and cached instances are per-instance state.
        # (A mutable class-level _selectors dict previously declared here
        # was always shadowed by this instance dict and has been removed.)
        self._selectors: dict[str, type[BaseToolSelector]] = {}
        self._instances: dict[str, BaseToolSelector] = {}

    @classmethod
    def get_instance(cls) -> "SelectorRegistry":
        """Get singleton registry instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def register(self, name: str, selector_class: type[BaseToolSelector]) -> None:
        """
        Register a selector class.

        Re-registering an existing name overwrites it (with a warning).

        Args:
            name: Selector strategy name
            selector_class: Selector class to register
        """
        if name in self._selectors:
            logger.warning(f"Overwriting existing selector: {name}")

        self._selectors[name] = selector_class
        logger.info(f"Registered selector: {name}")

    def get_class(self, name: str) -> Optional[type[BaseToolSelector]]:
        """
        Get selector class by name.

        Args:
            name: Selector strategy name

        Returns:
            Selector class or None if not found
        """
        return self._selectors.get(name)

    def get(
        self,
        name: str,
        config: Optional[SelectorConfig] = None,
        resources: Optional[SelectorResources] = None,
        cache: bool = True,
    ) -> BaseToolSelector:
        """
        Get or create selector instance.

        Args:
            name: Selector strategy name
            config: Optional selector configuration (defaults to a
                minimal config built from the name)
            resources: Optional resources (required for new instances)
            cache: Whether to cache and reuse instances

        Returns:
            Selector instance

        Raises:
            ValueError: If selector not registered or resources missing
        """
        # Cache hit short-circuits everything: the supplied config and
        # resources are ignored in that case.
        if cache and name in self._instances:
            return self._instances[name]

        selector_class = self.get_class(name)
        if selector_class is None:
            raise ValueError(f"Unknown selector: {name}. Available: {list(self._selectors.keys())}")

        # Fall back to a default config for this strategy name.
        if config is None:
            config = create_selector_config({"name": name})

        # Resources are mandatory when building a new instance.
        if resources is None:
            raise ValueError(f"Resources required to create selector: {name}")

        instance = selector_class.from_config(config, resources)

        if cache:
            self._instances[name] = instance

        return instance

    def create_from_config(
        self, config_dict: dict[str, Any], resources: SelectorResources
    ) -> BaseToolSelector:
        """
        Create selector from configuration dictionary.

        Always builds a fresh instance (bypasses the cache).

        Args:
            config_dict: Configuration dictionary
            resources: Shared resources

        Returns:
            Initialized selector instance
        """
        config = create_selector_config(config_dict)
        return self.get(config.name, config, resources, cache=False)

    def list_selectors(self) -> list[str]:
        """List all registered selector names."""
        return list(self._selectors.keys())

    def clear_cache(self) -> None:
        """Clear cached selector instances (registered classes are kept)."""
        self._instances.clear()
        logger.info("Cleared selector instance cache")
136
+
137
+
138
# Global registry instance backing the module-level convenience functions.
_registry = SelectorRegistry.get_instance()
140
+
141
+
142
def register_selector(name: str, selector_class: type[BaseToolSelector]) -> None:
    """
    Register a selector class with the global singleton registry.

    Thin module-level convenience wrapper around SelectorRegistry.register.

    Args:
        name: Selector strategy name
        selector_class: Selector class to register
    """
    _registry.register(name, selector_class)
151
+
152
+
153
def get_selector(
    name: str,
    config: Optional[SelectorConfig] = None,
    resources: Optional[SelectorResources] = None,
) -> BaseToolSelector:
    """
    Fetch (or lazily build) a selector from the global registry.

    Delegates to SelectorRegistry.get with instance caching enabled.

    Args:
        name: Selector strategy name
        config: Optional selector configuration
        resources: Optional resources

    Returns:
        Selector instance
    """
    return _registry.get(name, config, resources)
170
+
171
+
172
def create_selector_from_config(
    config_dict: dict[str, Any], resources: SelectorResources
) -> BaseToolSelector:
    """
    Build a fresh selector from a config dictionary via the global registry.

    Delegates to SelectorRegistry.create_from_config (no caching).

    Args:
        config_dict: Configuration dictionary
        resources: Shared resources

    Returns:
        Initialized selector instance
    """
    return _registry.create_from_config(config_dict, resources)
@@ -0,0 +1,196 @@
1
+ """
2
+ Data schemas for tool selection.
3
+
4
+ Defines Pydantic models for queries, predictions, and configurations.
5
+ """
6
+
7
+ from typing import Any, Optional
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+
12
class ToolSelectionQuery(BaseModel):
    """Query for tool selection.

    A single selection request: the user instruction plus the pool of
    candidate tool IDs to rank.
    """

    sample_id: str = Field(..., description="Unique identifier for the query")
    instruction: str = Field(..., description="User instruction or task description")
    context: dict[str, Any] = Field(default_factory=dict, description="Additional context")
    candidate_tools: list[str] = Field(..., description="List of candidate tool IDs")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Optional metadata")

    class Config:
        # Accept extra fields so datasets carrying additional keys
        # validate without errors.
        extra = "allow"
23
+
24
+
25
class ToolPrediction(BaseModel):
    """Prediction result for a single tool.

    Scores are constrained to [0, 1]; selectors must clamp before
    constructing a prediction.
    """

    tool_id: str = Field(..., description="Tool identifier")
    score: float = Field(..., ge=0.0, le=1.0, description="Relevance score (0-1)")
    explanation: Optional[str] = Field(default=None, description="Optional explanation")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    class Config:
        frozen = True  # Make immutable for caching
35
+
36
+
37
class SelectorConfig(BaseModel):
    """Base configuration for tool selectors.

    Subclasses set a fixed `name` default and add strategy-specific
    fields; see CONFIG_TYPES for the name -> class mapping.
    """

    name: str = Field(..., description="Selector strategy name")
    top_k: int = Field(default=5, ge=1, description="Number of tools to select")
    min_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Minimum score threshold")
    cache_enabled: bool = Field(default=True, description="Enable result caching")
    params: dict[str, Any] = Field(default_factory=dict, description="Strategy-specific parameters")

    class Config:
        # Tolerate unknown keys so forward-compatible configs still load.
        extra = "allow"
48
+
49
+
50
class KeywordSelectorConfig(SelectorConfig):
    """Configuration for keyword-based selector."""

    name: str = "keyword"
    method: str = Field(
        default="tfidf", description="Keyword matching method: tfidf, overlap, bm25"
    )
    lowercase: bool = Field(default=True, description="Convert to lowercase")
    remove_stopwords: bool = Field(default=True, description="Remove stopwords")
    # Interpreted as (min_n, max_n) inclusive by the selector's tokenizer.
    ngram_range: tuple = Field(default=(1, 2), description="N-gram range for features")
60
+
61
+
62
class EmbeddingSelectorConfig(SelectorConfig):
    """Configuration for embedding-based selector."""

    name: str = "embedding"
    embedding_model: str = Field(default="default", description="Embedding model identifier")
    similarity_metric: str = Field(
        default="cosine", description="Similarity metric: cosine, dot, euclidean"
    )
    use_cache: bool = Field(default=True, description="Cache embedding vectors")
    batch_size: int = Field(default=32, ge=1, description="Batch size for embedding")
72
+
73
+
74
class TwoStageSelectorConfig(SelectorConfig):
    """Configuration for two-stage selector.

    A coarse retrieval stage narrows to `coarse_k` candidates which are
    then reranked; the two stages' scores are fused via `fusion_weight`.
    """

    name: str = "two_stage"
    coarse_k: int = Field(
        default=20, ge=1, description="Number of candidates from coarse retrieval"
    )
    coarse_selector: str = Field(default="keyword", description="Coarse retrieval selector")
    rerank_selector: str = Field(default="embedding", description="Reranking selector")
    fusion_weight: float = Field(default=0.5, ge=0.0, le=1.0, description="Weight for score fusion")
84
+
85
+
86
class AdaptiveSelectorConfig(SelectorConfig):
    """Configuration for adaptive selector.

    Chooses among multiple underlying strategies at runtime
    (bandit / ensemble / threshold based).
    """

    name: str = "adaptive"
    strategies: list[str] = Field(
        default_factory=lambda: ["keyword", "embedding"], description="List of strategies"
    )
    selection_method: str = Field(
        default="bandit", description="Selection method: bandit, ensemble, threshold"
    )
    exploration_rate: float = Field(
        default=0.1, ge=0.0, le=1.0, description="Exploration rate for bandit"
    )
    update_interval: int = Field(default=100, ge=1, description="Update interval for adaptation")
100
+
101
+
102
class DFSDTSelectorConfig(SelectorConfig):
    """
    Configuration for DFSDT (Depth-First Search-based Decision Tree) selector.

    Based on ToolLLM paper (Qin et al., 2023):
    "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs"
    """

    name: str = "dfsdt"
    max_depth: int = Field(default=3, ge=1, le=10, description="Maximum search depth")
    beam_width: int = Field(default=5, ge=1, le=20, description="Number of candidates per level")
    llm_model: str = Field(
        default="auto", description="LLM model for scoring (auto uses UnifiedInferenceClient)"
    )
    temperature: float = Field(default=0.1, ge=0.0, le=2.0, description="LLM sampling temperature")
    use_diversity_prompt: bool = Field(
        default=True, description="Use diversity prompting for exploration"
    )
    score_threshold: float = Field(
        default=0.3, ge=0.0, le=1.0, description="Minimum score threshold for pruning"
    )
    # Pre-filtering with the cheap keyword selector bounds the number of
    # candidates the (expensive) LLM search must explore.
    use_keyword_prefilter: bool = Field(
        default=True, description="Use keyword matching to pre-filter candidates"
    )
    prefilter_k: int = Field(
        default=20, ge=5, le=100, description="Number of candidates after pre-filtering"
    )
129
+
130
+
131
class GorillaSelectorConfig(SelectorConfig):
    """
    Configuration for Gorilla-style retrieval-augmented selector.

    Based on Gorilla paper (Patil et al., 2023):
    "Gorilla: Large Language Model Connected with Massive APIs"

    Two-stage approach: embedding retrieval + LLM selection.
    """

    name: str = "gorilla"
    top_k_retrieve: int = Field(
        default=20, ge=1, description="Number of tools to retrieve in first stage"
    )
    top_k_select: int = Field(
        default=5, ge=1, description="Number of tools to select in final output"
    )
    embedding_model: str = Field(default="default", description="Embedding model for retrieval")
    llm_model: str = Field(
        default="auto", description="LLM model for selection (auto uses UnifiedInferenceClient)"
    )
    similarity_metric: str = Field(
        default="cosine", description="Similarity metric: cosine, dot, euclidean"
    )
    temperature: float = Field(
        default=0.1, ge=0.0, le=2.0, description="LLM temperature for selection"
    )
    use_detailed_docs: bool = Field(
        default=True, description="Include detailed parameter docs in context"
    )
    # Caps prompt size: retrieved tools beyond this are not shown to the LLM.
    max_context_tools: int = Field(
        default=15, ge=1, description="Max tools to include in LLM context"
    )
164
+
165
+
166
# Config type registry: maps selector strategy name to the typed
# SelectorConfig subclass instantiated by create_selector_config().
CONFIG_TYPES = {
    "keyword": KeywordSelectorConfig,
    "embedding": EmbeddingSelectorConfig,
    "two_stage": TwoStageSelectorConfig,
    "adaptive": AdaptiveSelectorConfig,
    "dfsdt": DFSDTSelectorConfig,
    "gorilla": GorillaSelectorConfig,
}
175
+
176
+
177
def create_selector_config(config_dict: dict[str, Any]) -> SelectorConfig:
    """
    Create appropriate selector config from dictionary.

    The "name" key (default: "keyword") picks the config class from
    CONFIG_TYPES; the remaining keys are passed to its constructor.

    Args:
        config_dict: Configuration dictionary

    Returns:
        Typed SelectorConfig subclass instance

    Raises:
        ValueError: If selector name not recognized
    """
    selector_name = config_dict.get("name", "keyword")

    if selector_name not in CONFIG_TYPES:
        # List the valid names, mirroring SelectorRegistry's error style.
        raise ValueError(
            f"Unknown selector type: {selector_name}. "
            f"Available: {list(CONFIG_TYPES.keys())}"
        )

    config_class = CONFIG_TYPES[selector_name]
    return config_class(**config_dict)