intentkit 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Note: this release of intentkit has been flagged as potentially problematic.
- intentkit/__init__.py +17 -0
- {abstracts → intentkit/abstracts}/skill.py +12 -0
- intentkit/clients/cdp.py +151 -0
- {config → intentkit/config}/config.py +12 -4
- {core → intentkit/core}/engine.py +39 -31
- {core → intentkit/core}/node.py +8 -4
- {core → intentkit/core}/prompt.py +5 -6
- {core → intentkit/core}/skill.py +11 -0
- {models → intentkit/models}/agent.py +2 -9
- {models → intentkit/models}/agent_data.py +18 -0
- {models → intentkit/models}/agent_schema.json +12 -0
- {models → intentkit/models}/chat.py +50 -0
- {models → intentkit/models}/skill.py +19 -0
- {skills → intentkit/skills}/base.py +37 -17
- {skills → intentkit/skills}/cdp/__init__.py +6 -14
- intentkit/skills/cdp/get_balance.py +133 -0
- {skills → intentkit/skills}/cdp/schema.json +0 -64
- {skills → intentkit/skills}/cryptocompare/fetch_news.py +2 -2
- {skills → intentkit/skills}/cryptocompare/fetch_price.py +2 -2
- {skills → intentkit/skills}/cryptocompare/fetch_top_exchanges.py +2 -2
- {skills → intentkit/skills}/cryptocompare/fetch_top_market_cap.py +2 -2
- {skills → intentkit/skills}/cryptocompare/fetch_top_volume.py +2 -2
- {skills → intentkit/skills}/cryptocompare/fetch_trading_signals.py +2 -2
- {skills → intentkit/skills}/defillama/base.py +3 -3
- {skills → intentkit/skills}/enso/base.py +27 -4
- {skills → intentkit/skills}/enso/networks.py +1 -1
- {skills → intentkit/skills}/enso/route.py +24 -23
- {skills → intentkit/skills}/enso/tokens.py +1 -1
- {skills → intentkit/skills}/enso/wallet.py +27 -23
- intentkit/skills/firecrawl/README.md +211 -0
- intentkit/skills/firecrawl/__init__.py +107 -0
- intentkit/skills/firecrawl/base.py +28 -0
- intentkit/skills/firecrawl/clear.py +87 -0
- intentkit/skills/firecrawl/crawl.py +399 -0
- intentkit/skills/firecrawl/firecrawl.png +0 -0
- intentkit/skills/firecrawl/query.py +123 -0
- intentkit/skills/firecrawl/schema.json +153 -0
- intentkit/skills/firecrawl/scrape.py +318 -0
- intentkit/skills/firecrawl/utils.py +306 -0
- {skills → intentkit/skills}/heurist/image_generation_animagine_xl.py +1 -1
- {skills → intentkit/skills}/heurist/image_generation_arthemy_comics.py +1 -1
- {skills → intentkit/skills}/heurist/image_generation_arthemy_real.py +1 -1
- {skills → intentkit/skills}/heurist/image_generation_braindance.py +1 -1
- {skills → intentkit/skills}/heurist/image_generation_cyber_realistic_xl.py +1 -1
- {skills → intentkit/skills}/heurist/image_generation_flux_1_dev.py +1 -1
- {skills → intentkit/skills}/heurist/image_generation_sdxl.py +1 -1
- intentkit/skills/http/README.md +78 -0
- intentkit/skills/http/__init__.py +100 -0
- intentkit/skills/http/base.py +21 -0
- intentkit/skills/http/get.py +96 -0
- intentkit/skills/http/http.svg +15 -0
- intentkit/skills/http/post.py +113 -0
- intentkit/skills/http/put.py +113 -0
- intentkit/skills/http/schema.json +80 -0
- {skills → intentkit/skills}/lifi/token_execute.py +1 -1
- {skills → intentkit/skills}/openai/dalle_image_generation.py +1 -1
- {skills → intentkit/skills}/openai/gpt_image_generation.py +1 -1
- {skills → intentkit/skills}/openai/gpt_image_to_image.py +1 -1
- intentkit/skills/supabase/__init__.py +116 -0
- intentkit/skills/supabase/base.py +72 -0
- intentkit/skills/supabase/delete_data.py +102 -0
- intentkit/skills/supabase/fetch_data.py +120 -0
- intentkit/skills/supabase/insert_data.py +70 -0
- intentkit/skills/supabase/invoke_function.py +74 -0
- intentkit/skills/supabase/schema.json +170 -0
- intentkit/skills/supabase/supabase.svg +15 -0
- intentkit/skills/supabase/update_data.py +105 -0
- intentkit/skills/supabase/upsert_data.py +77 -0
- {skills → intentkit/skills}/system/read_agent_api_key.py +1 -1
- {skills → intentkit/skills}/system/regenerate_agent_api_key.py +1 -1
- {skills → intentkit/skills}/token/base.py +1 -39
- {skills → intentkit/skills}/twitter/follow_user.py +3 -3
- {skills → intentkit/skills}/twitter/get_mentions.py +6 -6
- {skills → intentkit/skills}/twitter/get_timeline.py +5 -5
- {skills → intentkit/skills}/twitter/get_user_by_username.py +3 -3
- {skills → intentkit/skills}/twitter/get_user_tweets.py +5 -5
- {skills → intentkit/skills}/twitter/like_tweet.py +3 -3
- {skills → intentkit/skills}/twitter/post_tweet.py +4 -4
- {skills → intentkit/skills}/twitter/reply_tweet.py +4 -4
- {skills → intentkit/skills}/twitter/retweet.py +3 -3
- {skills → intentkit/skills}/twitter/search_tweets.py +5 -5
- {skills → intentkit/skills}/unrealspeech/text_to_speech.py +1 -1
- {skills → intentkit/skills}/web_scraper/README.md +35 -4
- {skills → intentkit/skills}/web_scraper/__init__.py +16 -0
- intentkit/skills/web_scraper/document_indexer.py +143 -0
- {skills → intentkit/skills}/web_scraper/schema.json +28 -0
- intentkit/skills/web_scraper/scrape_and_index.py +262 -0
- intentkit/skills/web_scraper/utils.py +684 -0
- intentkit/skills/web_scraper/website_indexer.py +456 -0
- {utils → intentkit/utils}/logging.py +1 -1
- {intentkit-0.5.1.dist-info → intentkit-0.6.0.dist-info}/METADATA +1 -1
- intentkit-0.6.0.dist-info/RECORD +396 -0
- clients/cdp.py +0 -53
- intentkit-0.5.1.dist-info/RECORD +0 -364
- skills/cdp/get_balance.py +0 -81
- skills/web_scraper/scrape_and_index.py +0 -327
- {abstracts → intentkit/abstracts}/__init__.py +0 -0
- {abstracts → intentkit/abstracts}/agent.py +0 -0
- {abstracts → intentkit/abstracts}/api.py +0 -0
- {abstracts → intentkit/abstracts}/engine.py +0 -0
- {abstracts → intentkit/abstracts}/exception.py +0 -0
- {abstracts → intentkit/abstracts}/graph.py +0 -0
- {abstracts → intentkit/abstracts}/twitter.py +0 -0
- {clients → intentkit/clients}/__init__.py +0 -0
- {clients → intentkit/clients}/twitter.py +0 -0
- {config → intentkit/config}/__init__.py +0 -0
- {core → intentkit/core}/__init__.py +0 -0
- {core → intentkit/core}/agent.py +0 -0
- {core → intentkit/core}/api.py +0 -0
- {core → intentkit/core}/client.py +0 -0
- {core → intentkit/core}/credit.py +0 -0
- {models → intentkit/models}/app_setting.py +0 -0
- {models → intentkit/models}/base.py +0 -0
- {models → intentkit/models}/conversation.py +0 -0
- {models → intentkit/models}/credit.py +0 -0
- {models → intentkit/models}/db.py +0 -0
- {models → intentkit/models}/db_mig.py +0 -0
- {models → intentkit/models}/generator.py +0 -0
- {models → intentkit/models}/llm.py +0 -0
- {models → intentkit/models}/redis.py +0 -0
- {models → intentkit/models}/user.py +0 -0
- {skills → intentkit/skills}/__init__.py +0 -0
- {skills → intentkit/skills}/acolyt/__init__.py +0 -0
- {skills → intentkit/skills}/acolyt/acolyt.jpg +0 -0
- {skills → intentkit/skills}/acolyt/ask.py +0 -0
- {skills → intentkit/skills}/acolyt/base.py +0 -0
- {skills → intentkit/skills}/acolyt/schema.json +0 -0
- {skills → intentkit/skills}/aixbt/README.md +0 -0
- {skills → intentkit/skills}/aixbt/__init__.py +0 -0
- {skills → intentkit/skills}/aixbt/aixbt.jpg +0 -0
- {skills → intentkit/skills}/aixbt/base.py +0 -0
- {skills → intentkit/skills}/aixbt/projects.py +0 -0
- {skills → intentkit/skills}/aixbt/schema.json +0 -0
- {skills → intentkit/skills}/allora/__init__.py +0 -0
- {skills → intentkit/skills}/allora/allora.jpeg +0 -0
- {skills → intentkit/skills}/allora/base.py +0 -0
- {skills → intentkit/skills}/allora/price.py +0 -0
- {skills → intentkit/skills}/allora/schema.json +0 -0
- {skills → intentkit/skills}/carv/README.md +0 -0
- {skills → intentkit/skills}/carv/__init__.py +0 -0
- {skills → intentkit/skills}/carv/base.py +0 -0
- {skills → intentkit/skills}/carv/carv.webp +0 -0
- {skills → intentkit/skills}/carv/fetch_news.py +0 -0
- {skills → intentkit/skills}/carv/onchain_query.py +0 -0
- {skills → intentkit/skills}/carv/schema.json +0 -0
- {skills → intentkit/skills}/carv/token_info_and_price.py +0 -0
- {skills → intentkit/skills}/cdp/base.py +0 -0
- {skills → intentkit/skills}/cdp/cdp.png +0 -0
- {skills → intentkit/skills}/chainlist/README.md +0 -0
- {skills → intentkit/skills}/chainlist/__init__.py +0 -0
- {skills → intentkit/skills}/chainlist/base.py +0 -0
- {skills → intentkit/skills}/chainlist/chain_lookup.py +0 -0
- {skills → intentkit/skills}/chainlist/chainlist.png +0 -0
- {skills → intentkit/skills}/chainlist/schema.json +0 -0
- {skills → intentkit/skills}/common/__init__.py +0 -0
- {skills → intentkit/skills}/common/base.py +0 -0
- {skills → intentkit/skills}/common/common.jpg +0 -0
- {skills → intentkit/skills}/common/current_time.py +0 -0
- {skills → intentkit/skills}/common/schema.json +0 -0
- {skills → intentkit/skills}/cookiefun/README.md +0 -0
- {skills → intentkit/skills}/cookiefun/__init__.py +0 -0
- {skills → intentkit/skills}/cookiefun/base.py +0 -0
- {skills → intentkit/skills}/cookiefun/constants.py +0 -0
- {skills → intentkit/skills}/cookiefun/cookiefun.png +0 -0
- {skills → intentkit/skills}/cookiefun/get_account_details.py +0 -0
- {skills → intentkit/skills}/cookiefun/get_account_feed.py +0 -0
- {skills → intentkit/skills}/cookiefun/get_account_smart_followers.py +0 -0
- {skills → intentkit/skills}/cookiefun/get_sectors.py +0 -0
- {skills → intentkit/skills}/cookiefun/schema.json +0 -0
- {skills → intentkit/skills}/cookiefun/search_accounts.py +0 -0
- {skills → intentkit/skills}/cryptocompare/__init__.py +0 -0
- {skills → intentkit/skills}/cryptocompare/api.py +0 -0
- {skills → intentkit/skills}/cryptocompare/base.py +0 -0
- {skills → intentkit/skills}/cryptocompare/cryptocompare.png +0 -0
- {skills → intentkit/skills}/cryptocompare/schema.json +0 -0
- {skills → intentkit/skills}/cryptopanic/__init__.py +0 -0
- {skills → intentkit/skills}/cryptopanic/base.py +0 -0
- {skills → intentkit/skills}/cryptopanic/cryptopanic.png +0 -0
- {skills → intentkit/skills}/cryptopanic/fetch_crypto_news.py +0 -0
- {skills → intentkit/skills}/cryptopanic/fetch_crypto_sentiment.py +0 -0
- {skills → intentkit/skills}/cryptopanic/schema.json +0 -0
- {skills → intentkit/skills}/dapplooker/README.md +0 -0
- {skills → intentkit/skills}/dapplooker/__init__.py +0 -0
- {skills → intentkit/skills}/dapplooker/base.py +0 -0
- {skills → intentkit/skills}/dapplooker/dapplooker.jpg +0 -0
- {skills → intentkit/skills}/dapplooker/dapplooker_token_data.py +0 -0
- {skills → intentkit/skills}/dapplooker/schema.json +0 -0
- {skills → intentkit/skills}/defillama/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/api.py +0 -0
- {skills → intentkit/skills}/defillama/coins/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/coins/fetch_batch_historical_prices.py +0 -0
- {skills → intentkit/skills}/defillama/coins/fetch_block.py +0 -0
- {skills → intentkit/skills}/defillama/coins/fetch_current_prices.py +0 -0
- {skills → intentkit/skills}/defillama/coins/fetch_first_price.py +0 -0
- {skills → intentkit/skills}/defillama/coins/fetch_historical_prices.py +0 -0
- {skills → intentkit/skills}/defillama/coins/fetch_price_chart.py +0 -0
- {skills → intentkit/skills}/defillama/coins/fetch_price_percentage.py +0 -0
- {skills → intentkit/skills}/defillama/config/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/config/chains.py +0 -0
- {skills → intentkit/skills}/defillama/defillama.jpeg +0 -0
- {skills → intentkit/skills}/defillama/fees/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/fees/fetch_fees_overview.py +0 -0
- {skills → intentkit/skills}/defillama/schema.json +0 -0
- {skills → intentkit/skills}/defillama/stablecoins/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/stablecoins/fetch_stablecoin_chains.py +0 -0
- {skills → intentkit/skills}/defillama/stablecoins/fetch_stablecoin_charts.py +0 -0
- {skills → intentkit/skills}/defillama/stablecoins/fetch_stablecoin_prices.py +0 -0
- {skills → intentkit/skills}/defillama/stablecoins/fetch_stablecoins.py +0 -0
- {skills → intentkit/skills}/defillama/tests/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/tests/api_integration.test.py +0 -0
- {skills → intentkit/skills}/defillama/tests/api_unit.test.py +0 -0
- {skills → intentkit/skills}/defillama/tvl/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/tvl/fetch_chain_historical_tvl.py +0 -0
- {skills → intentkit/skills}/defillama/tvl/fetch_chains.py +0 -0
- {skills → intentkit/skills}/defillama/tvl/fetch_historical_tvl.py +0 -0
- {skills → intentkit/skills}/defillama/tvl/fetch_protocol.py +0 -0
- {skills → intentkit/skills}/defillama/tvl/fetch_protocol_current_tvl.py +0 -0
- {skills → intentkit/skills}/defillama/tvl/fetch_protocols.py +0 -0
- {skills → intentkit/skills}/defillama/volumes/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/volumes/fetch_dex_overview.py +0 -0
- {skills → intentkit/skills}/defillama/volumes/fetch_dex_summary.py +0 -0
- {skills → intentkit/skills}/defillama/volumes/fetch_options_overview.py +0 -0
- {skills → intentkit/skills}/defillama/yields/__init__.py +0 -0
- {skills → intentkit/skills}/defillama/yields/fetch_pool_chart.py +0 -0
- {skills → intentkit/skills}/defillama/yields/fetch_pools.py +0 -0
- {skills → intentkit/skills}/dexscreener/__init__.py +0 -0
- {skills → intentkit/skills}/dexscreener/base.py +0 -0
- {skills → intentkit/skills}/dexscreener/dexscreener.png +0 -0
- {skills → intentkit/skills}/dexscreener/model/__init__.py +0 -0
- {skills → intentkit/skills}/dexscreener/model/search_token_response.py +0 -0
- {skills → intentkit/skills}/dexscreener/schema.json +0 -0
- {skills → intentkit/skills}/dexscreener/search_token.py +0 -0
- {skills → intentkit/skills}/dune_analytics/__init__.py +0 -0
- {skills → intentkit/skills}/dune_analytics/base.py +0 -0
- {skills → intentkit/skills}/dune_analytics/dune.png +0 -0
- {skills → intentkit/skills}/dune_analytics/fetch_kol_buys.py +0 -0
- {skills → intentkit/skills}/dune_analytics/fetch_nation_metrics.py +0 -0
- {skills → intentkit/skills}/dune_analytics/schema.json +0 -0
- {skills → intentkit/skills}/elfa/README.md +0 -0
- {skills → intentkit/skills}/elfa/__init__.py +0 -0
- {skills → intentkit/skills}/elfa/base.py +0 -0
- {skills → intentkit/skills}/elfa/elfa.jpg +0 -0
- {skills → intentkit/skills}/elfa/mention.py +0 -0
- {skills → intentkit/skills}/elfa/schema.json +0 -0
- {skills → intentkit/skills}/elfa/stats.py +0 -0
- {skills → intentkit/skills}/elfa/tokens.py +0 -0
- {skills → intentkit/skills}/enso/README.md +0 -0
- {skills → intentkit/skills}/enso/__init__.py +0 -0
- {skills → intentkit/skills}/enso/abi/__init__.py +0 -0
- {skills → intentkit/skills}/enso/abi/approval.py +0 -0
- {skills → intentkit/skills}/enso/abi/erc20.py +0 -0
- {skills → intentkit/skills}/enso/abi/route.py +0 -0
- {skills → intentkit/skills}/enso/best_yield.py +0 -0
- {skills → intentkit/skills}/enso/enso.jpg +0 -0
- {skills → intentkit/skills}/enso/prices.py +0 -0
- {skills → intentkit/skills}/enso/schema.json +0 -0
- {skills → intentkit/skills}/github/README.md +0 -0
- {skills → intentkit/skills}/github/__init__.py +0 -0
- {skills → intentkit/skills}/github/base.py +0 -0
- {skills → intentkit/skills}/github/github.jpg +0 -0
- {skills → intentkit/skills}/github/github_search.py +0 -0
- {skills → intentkit/skills}/github/schema.json +0 -0
- {skills → intentkit/skills}/heurist/__init__.py +0 -0
- {skills → intentkit/skills}/heurist/base.py +0 -0
- {skills → intentkit/skills}/heurist/heurist.png +0 -0
- {skills → intentkit/skills}/heurist/schema.json +0 -0
- {skills → intentkit/skills}/lifi/README.md +0 -0
- {skills → intentkit/skills}/lifi/__init__.py +0 -0
- {skills → intentkit/skills}/lifi/base.py +0 -0
- {skills → intentkit/skills}/lifi/lifi.png +0 -0
- {skills → intentkit/skills}/lifi/schema.json +0 -0
- {skills → intentkit/skills}/lifi/token_quote.py +0 -0
- {skills → intentkit/skills}/lifi/utils.py +0 -0
- {skills → intentkit/skills}/moralis/README.md +0 -0
- {skills → intentkit/skills}/moralis/__init__.py +0 -0
- {skills → intentkit/skills}/moralis/api.py +0 -0
- {skills → intentkit/skills}/moralis/base.py +0 -0
- {skills → intentkit/skills}/moralis/fetch_chain_portfolio.py +0 -0
- {skills → intentkit/skills}/moralis/fetch_nft_portfolio.py +0 -0
- {skills → intentkit/skills}/moralis/fetch_solana_portfolio.py +0 -0
- {skills → intentkit/skills}/moralis/fetch_wallet_portfolio.py +0 -0
- {skills → intentkit/skills}/moralis/moralis.png +0 -0
- {skills → intentkit/skills}/moralis/schema.json +0 -0
- {skills → intentkit/skills}/moralis/tests/__init__.py +0 -0
- {skills → intentkit/skills}/moralis/tests/test_wallet.py +0 -0
- {skills → intentkit/skills}/nation/__init__.py +0 -0
- {skills → intentkit/skills}/nation/base.py +0 -0
- {skills → intentkit/skills}/nation/nation.png +0 -0
- {skills → intentkit/skills}/nation/nft_check.py +0 -0
- {skills → intentkit/skills}/nation/schema.json +0 -0
- {skills → intentkit/skills}/openai/__init__.py +0 -0
- {skills → intentkit/skills}/openai/base.py +0 -0
- {skills → intentkit/skills}/openai/image_to_text.py +0 -0
- {skills → intentkit/skills}/openai/openai.png +0 -0
- {skills → intentkit/skills}/openai/schema.json +0 -0
- {skills → intentkit/skills}/portfolio/README.md +0 -0
- {skills → intentkit/skills}/portfolio/__init__.py +0 -0
- {skills → intentkit/skills}/portfolio/base.py +0 -0
- {skills → intentkit/skills}/portfolio/constants.py +0 -0
- {skills → intentkit/skills}/portfolio/moralis.png +0 -0
- {skills → intentkit/skills}/portfolio/schema.json +0 -0
- {skills → intentkit/skills}/portfolio/token_balances.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_approvals.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_defi_positions.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_history.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_net_worth.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_nfts.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_profitability.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_profitability_summary.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_stats.py +0 -0
- {skills → intentkit/skills}/portfolio/wallet_swaps.py +0 -0
- {skills → intentkit/skills}/skills.toml +0 -0
- {skills → intentkit/skills}/slack/__init__.py +0 -0
- {skills → intentkit/skills}/slack/base.py +0 -0
- {skills → intentkit/skills}/slack/get_channel.py +0 -0
- {skills → intentkit/skills}/slack/get_message.py +0 -0
- {skills → intentkit/skills}/slack/schedule_message.py +0 -0
- {skills → intentkit/skills}/slack/schema.json +0 -0
- {skills → intentkit/skills}/slack/send_message.py +0 -0
- {skills → intentkit/skills}/slack/slack.jpg +0 -0
- {skills → intentkit/skills}/system/__init__.py +0 -0
- {skills → intentkit/skills}/system/base.py +0 -0
- {skills → intentkit/skills}/system/schema.json +0 -0
- {skills → intentkit/skills}/system/system.svg +0 -0
- {skills → intentkit/skills}/tavily/README.md +0 -0
- {skills → intentkit/skills}/tavily/__init__.py +0 -0
- {skills → intentkit/skills}/tavily/base.py +0 -0
- {skills → intentkit/skills}/tavily/schema.json +0 -0
- {skills → intentkit/skills}/tavily/tavily.jpg +0 -0
- {skills → intentkit/skills}/tavily/tavily_extract.py +0 -0
- {skills → intentkit/skills}/tavily/tavily_search.py +0 -0
- {skills → intentkit/skills}/token/README.md +0 -0
- {skills → intentkit/skills}/token/__init__.py +0 -0
- {skills → intentkit/skills}/token/constants.py +0 -0
- {skills → intentkit/skills}/token/erc20_transfers.py +0 -0
- {skills → intentkit/skills}/token/moralis.png +0 -0
- {skills → intentkit/skills}/token/schema.json +0 -0
- {skills → intentkit/skills}/token/token_analytics.py +0 -0
- {skills → intentkit/skills}/token/token_price.py +0 -0
- {skills → intentkit/skills}/token/token_search.py +0 -0
- {skills → intentkit/skills}/twitter/__init__.py +0 -0
- {skills → intentkit/skills}/twitter/base.py +0 -0
- {skills → intentkit/skills}/twitter/schema.json +0 -0
- {skills → intentkit/skills}/twitter/twitter.png +0 -0
- {skills → intentkit/skills}/unrealspeech/__init__.py +0 -0
- {skills → intentkit/skills}/unrealspeech/base.py +0 -0
- {skills → intentkit/skills}/unrealspeech/schema.json +0 -0
- {skills → intentkit/skills}/unrealspeech/unrealspeech.jpg +0 -0
- {skills → intentkit/skills}/venice_audio/__init__.py +0 -0
- {skills → intentkit/skills}/venice_audio/base.py +0 -0
- {skills → intentkit/skills}/venice_audio/input.py +0 -0
- {skills → intentkit/skills}/venice_audio/schema.json +0 -0
- {skills → intentkit/skills}/venice_audio/venice_audio.py +0 -0
- {skills → intentkit/skills}/venice_audio/venice_logo.jpg +0 -0
- {skills → intentkit/skills}/venice_image/README.md +0 -0
- {skills → intentkit/skills}/venice_image/__init__.py +0 -0
- {skills → intentkit/skills}/venice_image/api.py +0 -0
- {skills → intentkit/skills}/venice_image/base.py +0 -0
- {skills → intentkit/skills}/venice_image/config.py +0 -0
- {skills → intentkit/skills}/venice_image/image_enhance/README.md +0 -0
- {skills → intentkit/skills}/venice_image/image_enhance/__init__.py +0 -0
- {skills → intentkit/skills}/venice_image/image_enhance/image_enhance.py +0 -0
- {skills → intentkit/skills}/venice_image/image_enhance/image_enhance_base.py +0 -0
- {skills → intentkit/skills}/venice_image/image_enhance/image_enhance_input.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/README.md +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/__init__.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_base.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_fluently_xl.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_flux_dev.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_flux_dev_uncensored.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_input.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_lustify_sdxl.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_pony_realism.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_stable_diffusion_3_5.py +0 -0
- {skills → intentkit/skills}/venice_image/image_generation/image_generation_venice_sd35.py +0 -0
- {skills → intentkit/skills}/venice_image/image_upscale/README.md +0 -0
- {skills → intentkit/skills}/venice_image/image_upscale/__init__.py +0 -0
- {skills → intentkit/skills}/venice_image/image_upscale/image_upscale.py +0 -0
- {skills → intentkit/skills}/venice_image/image_upscale/image_upscale_base.py +0 -0
- {skills → intentkit/skills}/venice_image/image_upscale/image_upscale_input.py +0 -0
- {skills → intentkit/skills}/venice_image/image_vision/README.md +0 -0
- {skills → intentkit/skills}/venice_image/image_vision/__init__.py +0 -0
- {skills → intentkit/skills}/venice_image/image_vision/image_vision.py +0 -0
- {skills → intentkit/skills}/venice_image/image_vision/image_vision_base.py +0 -0
- {skills → intentkit/skills}/venice_image/image_vision/image_vision_input.py +0 -0
- {skills → intentkit/skills}/venice_image/schema.json +0 -0
- {skills → intentkit/skills}/venice_image/utils.py +0 -0
- {skills → intentkit/skills}/venice_image/venice_image.jpg +0 -0
- {skills → intentkit/skills}/web_scraper/base.py +0 -0
- {skills → intentkit/skills}/web_scraper/langchain.png +0 -0
- {utils → intentkit/utils}/__init__.py +0 -0
- {utils → intentkit/utils}/chain.py +0 -0
- {utils → intentkit/utils}/error.py +0 -0
- {utils → intentkit/utils}/middleware.py +0 -0
- {utils → intentkit/utils}/random.py +0 -0
- {utils → intentkit/utils}/s3.py +0 -0
- {utils → intentkit/utils}/slack_alert.py +0 -0
- {utils → intentkit/utils}/tx.py +0 -0
- {intentkit-0.5.1.dist-info → intentkit-0.6.0.dist-info}/WHEEL +0 -0
- {intentkit-0.5.1.dist-info → intentkit-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,318 @@ intentkit/skills/firecrawl/scrape.py (new file)

```python
import logging
from typing import List, Optional, Type

import httpx
from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig
from pydantic import BaseModel, Field

from intentkit.skills.firecrawl.base import FirecrawlBaseTool

logger = logging.getLogger(__name__)


class FirecrawlScrapeInput(BaseModel):
    """Input for Firecrawl scrape tool."""

    url: str = Field(
        description="The URL to scrape. Must be a valid HTTP or HTTPS URL."
    )
    formats: List[str] = Field(
        description="Output formats to include in the response. Options: 'markdown', 'html', 'rawHtml', 'screenshot', 'links', 'json'",
        default=["markdown"],
    )
    only_main_content: bool = Field(
        description="Whether to extract only the main content (excluding headers, footers, navigation, etc.)",
        default=True,
    )
    include_tags: Optional[List[str]] = Field(
        description="HTML tags, classes, or IDs to include in the response (e.g., ['h1', 'p', '.main-content'])",
        default=None,
    )
    exclude_tags: Optional[List[str]] = Field(
        description="HTML tags, classes, or IDs to exclude from the response (e.g., ['#ad', '#footer'])",
        default=None,
    )
    wait_for: int = Field(
        description="Wait time in milliseconds before scraping (use only as last resort)",
        default=0,
        ge=0,
    )
    timeout: int = Field(
        description="Maximum timeout in milliseconds for the scraping operation",
        default=30000,
        ge=1000,
        le=120000,
    )
    index_content: bool = Field(
        description="Whether to index the scraped content for later querying (default: True)",
        default=True,
    )
    chunk_size: int = Field(
        description="Size of text chunks for indexing (default: 1000)",
        default=1000,
        ge=100,
        le=4000,
    )
    chunk_overlap: int = Field(
        description="Overlap between chunks (default: 200)",
        default=200,
        ge=0,
        le=1000,
    )


class FirecrawlScrape(FirecrawlBaseTool):
    """Tool for scraping web pages using Firecrawl.

    This tool uses Firecrawl's API to scrape web pages and convert them into clean,
    LLM-ready formats like markdown, HTML, or structured JSON data.

    Attributes:
        name: The name of the tool.
        description: A description of what the tool does.
        args_schema: The schema for the tool's input arguments.
    """

    name: str = "firecrawl_scrape"
    description: str = (
        "Scrape a single web page and extract its content in various formats (markdown, HTML, JSON, etc.). "
        "This tool can handle JavaScript-rendered content, PDFs, and dynamic websites. "
        "Optionally indexes the content for later querying using the firecrawl_query_indexed_content tool. "
        "Use this when you need to extract clean, structured content from a specific URL."
    )
    args_schema: Type[BaseModel] = FirecrawlScrapeInput

    async def _arun(
        self,
        url: str,
        formats: List[str] = None,
        only_main_content: bool = True,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        wait_for: int = 0,
        timeout: int = 30000,
        index_content: bool = True,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        config: RunnableConfig = None,
        **kwargs,
    ) -> str:
        """Implementation of the Firecrawl scrape tool.

        Args:
            url: The URL to scrape.
            formats: Output formats to include in the response.
            only_main_content: Whether to extract only main content.
            include_tags: HTML tags/classes/IDs to include.
            exclude_tags: HTML tags/classes/IDs to exclude.
            wait_for: Wait time in milliseconds before scraping.
            timeout: Maximum timeout in milliseconds.
            index_content: Whether to index the content for later querying.
            chunk_size: Size of text chunks for indexing.
            chunk_overlap: Overlap between chunks.
            config: The configuration for the tool call.

        Returns:
            str: Formatted scraped content based on the requested formats.
        """
        context = self.context_from_config(config)
        logger.debug(f"firecrawl_scrape: Running scrape with context {context}")

        if context.config.get("api_key_provider") == "agent_owner":
            if context.config.get("rate_limit_number") and context.config.get(
                "rate_limit_minutes"
            ):
                await self.user_rate_limit_by_category(
                    context.user_id,
                    context.config["rate_limit_number"],
                    context.config["rate_limit_minutes"],
                )

        # Get the API key from the agent's configuration
        api_key = self.get_api_key(context)
        if not api_key:
            return "Error: No Firecrawl API key provided in the configuration."

        # Validate and set defaults
        if formats is None:
            formats = ["markdown"]

        # Validate formats
        valid_formats = ["markdown", "html", "rawHtml", "screenshot", "links", "json"]
        formats = [f for f in formats if f in valid_formats]
        if not formats:
            formats = ["markdown"]

        # Prepare the request payload
        payload = {
            "url": url,
            "formats": formats,
            "onlyMainContent": only_main_content,
            "timeout": timeout,
        }

        if include_tags:
            payload["includeTags"] = include_tags
        if exclude_tags:
            payload["excludeTags"] = exclude_tags
        if wait_for > 0:
            payload["waitFor"] = wait_for

        # Call Firecrawl scrape API
        try:
            async with httpx.AsyncClient(timeout=timeout / 1000 + 10) as client:
                response = await client.post(
                    "https://api.firecrawl.dev/v1/scrape",
                    json=payload,
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                    },
                )

                if response.status_code != 200:
                    logger.error(
                        f"firecrawl_scrape: Error from Firecrawl API: {response.status_code} - {response.text}"
                    )
                    return (
                        f"Error scraping URL: {response.status_code} - {response.text}"
                    )

                data = response.json()

                if not data.get("success"):
                    error_msg = data.get("error", "Unknown error occurred")
                    return f"Error scraping URL: {error_msg}"

                result_data = data.get("data", {})

                # Format the results based on requested formats
                formatted_result = f"Successfully scraped: {url}\n\n"

                if "markdown" in formats and result_data.get("markdown"):
                    formatted_result += "## Markdown Content\n"
                    formatted_result += result_data["markdown"][:2000]  # Limit length
                    if len(result_data["markdown"]) > 2000:
                        formatted_result += "... (content truncated)"
                    formatted_result += "\n\n"

                if "html" in formats and result_data.get("html"):
                    formatted_result += "## HTML Content\n"
                    formatted_result += f"HTML content available ({len(result_data['html'])} characters)\n\n"

                if "links" in formats and result_data.get("links"):
                    formatted_result += "## Extracted Links\n"
                    links = result_data["links"][:10]  # Limit to first 10 links
                    for link in links:
                        formatted_result += f"- {link}\n"
                    if len(result_data["links"]) > 10:
                        formatted_result += (
                            f"... and {len(result_data['links']) - 10} more links\n"
                        )
                    formatted_result += "\n"

                if "json" in formats and result_data.get("json"):
                    formatted_result += "## Structured Data (JSON)\n"
                    formatted_result += str(result_data["json"])[:1000]  # Limit length
                    if len(str(result_data["json"])) > 1000:
                        formatted_result += "... (data truncated)"
                    formatted_result += "\n\n"

                if "screenshot" in formats and result_data.get("screenshot"):
                    formatted_result += "## Screenshot\n"
                    formatted_result += (
                        f"Screenshot available at: {result_data['screenshot']}\n\n"
                    )

                # Add metadata information
                metadata = result_data.get("metadata", {})
                if metadata:
                    formatted_result += "## Page Metadata\n"
                    if metadata.get("title"):
                        formatted_result += f"Title: {metadata['title']}\n"
                    if metadata.get("description"):
                        formatted_result += f"Description: {metadata['description']}\n"
                    if metadata.get("language"):
                        formatted_result += f"Language: {metadata['language']}\n"
                    formatted_result += "\n"

                # Index content if requested
                if index_content and result_data.get("markdown"):
                    try:
                        # Import indexing utilities from firecrawl utils
                        from intentkit.skills.firecrawl.utils import (
                            FirecrawlMetadataManager,
                            index_documents,
                        )

                        # Create document from scraped content
                        document = Document(
                            page_content=result_data["markdown"],
                            metadata={
                                "source": url,
                                "title": metadata.get("title", ""),
                                "description": metadata.get("description", ""),
                                "language": metadata.get("language", ""),
                                "source_type": "firecrawl_scrape",
                                "indexed_at": str(context.agent_id),
                            },
                        )

                        # Get agent ID for indexing
                        agent_id = context.agent_id
                        if agent_id:
                            # Index the document
                            total_chunks, was_merged = await index_documents(
                                [document],
                                agent_id,
                                self.skill_store,
                                chunk_size,
                                chunk_overlap,
                            )

                            # Update metadata
                            metadata_manager = FirecrawlMetadataManager(
                                self.skill_store
                            )
                            new_metadata = metadata_manager.create_url_metadata(
                                [url], [document], "firecrawl_scrape"
                            )
                            await metadata_manager.update_metadata(
                                agent_id, new_metadata
                            )

                            formatted_result += "\n## Content Indexing\n"
                            formatted_result += (
                                "Successfully indexed content into vector store:\n"
                            )
                            formatted_result += f"- Chunks created: {total_chunks}\n"
                            formatted_result += f"- Chunk size: {chunk_size}\n"
                            formatted_result += f"- Chunk overlap: {chunk_overlap}\n"
                            formatted_result += f"- Content merged with existing: {'Yes' if was_merged else 'No'}\n"
                            formatted_result += "Use the 'firecrawl_query_indexed_content' skill to search this content.\n"

                            logger.info(
                                f"firecrawl_scrape: Successfully indexed {url} with {total_chunks} chunks"
                            )
                        else:
                            formatted_result += "\n## Content Indexing\n"
                            formatted_result += "Warning: Could not index content - agent ID not available.\n"

                    except Exception as index_error:
                        logger.error(
                            f"firecrawl_scrape: Error indexing content: {index_error}"
                        )
                        formatted_result += "\n## Content Indexing\n"
                        formatted_result += f"Warning: Failed to index content for later querying: {str(index_error)}\n"

                return formatted_result.strip()

        except httpx.TimeoutException:
            logger.error(f"firecrawl_scrape: Timeout scraping URL: {url}")
            return (
                f"Timeout error: The request to scrape {url} took too long to complete."
            )
        except Exception as e:
            logger.error(f"firecrawl_scrape: Error scraping URL: {e}", exc_info=True)
            return f"An error occurred while scraping the URL: {str(e)}"
```
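For reference, the request this skill builds is a plain POST to Firecrawl's v1 scrape endpoint. A minimal standalone sketch of the same call follows; the target URL and the `fc-...` API key are placeholders, and only payload keys the skill above actually sends are used:

```python
import asyncio

import httpx


async def scrape_once() -> None:
    # Mirrors the payload the skill assembles: formats, onlyMainContent, timeout.
    payload = {
        "url": "https://example.com",  # placeholder target
        "formats": ["markdown", "links"],
        "onlyMainContent": True,
        "timeout": 30000,
    }
    async with httpx.AsyncClient(timeout=40) as client:
        response = await client.post(
            "https://api.firecrawl.dev/v1/scrape",
            json=payload,
            headers={
                "Authorization": "Bearer fc-YOUR-KEY",  # placeholder key
                "Content-Type": "application/json",
            },
        )
    data = response.json()
    if data.get("success"):
        # The scraped content lives under data["data"], keyed by format.
        print(data["data"]["markdown"][:200])
    else:
        print(data.get("error", "Unknown error occurred"))


asyncio.run(scrape_once())
```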
@@ -0,0 +1,306 @@ intentkit/skills/firecrawl/utils.py (new file)

```python
"""Utilities for Firecrawl skill content indexing and querying."""

import logging
import re
from typing import Any, Dict, List, Optional, Tuple

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

from intentkit.abstracts.skill import SkillStoreABC

logger = logging.getLogger(__name__)


class FirecrawlDocumentProcessor:
    """Handles document processing and sanitization for Firecrawl content."""

    @staticmethod
    def sanitize_for_database(text: str) -> str:
        """Sanitize text content to prevent database storage errors."""
        if not text:
            return ""

        # Remove null bytes and other problematic characters
        text = text.replace("\x00", "")
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text)

        # Normalize whitespace
        text = re.sub(r"\s+", " ", text)
        text = text.strip()

        return text

    @staticmethod
    def split_documents(
        documents: List[Document], chunk_size: int = 1000, chunk_overlap: int = 200
    ) -> List[Document]:
        """Split documents into smaller chunks for better indexing."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

        split_docs = []
        for doc in documents:
            # Sanitize content before splitting
            sanitized_content = FirecrawlDocumentProcessor.sanitize_for_database(
                doc.page_content
            )
            doc.page_content = sanitized_content

            # Split the document
            chunks = text_splitter.split_documents([doc])
            split_docs.extend(chunks)

        return split_docs


class FirecrawlVectorStoreManager:
    """Manages vector store operations for Firecrawl content."""

    def __init__(self, skill_store: SkillStoreABC):
        self.skill_store = skill_store

    def create_embeddings(self) -> OpenAIEmbeddings:
        """Create OpenAI embeddings instance."""
        openai_api_key = self.skill_store.get_system_config("openai_api_key")
        if not openai_api_key:
            raise ValueError("OpenAI API key not found in system configuration")

        return OpenAIEmbeddings(
            openai_api_key=openai_api_key, model="text-embedding-3-small"
        )

    def encode_vector_store(self, vector_store: FAISS) -> Dict[str, str]:
        """Encode FAISS vector store to base64 for storage (compatible with web_scraper)."""
        import base64
        import os
        import tempfile

        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                vector_store.save_local(temp_dir)

                encoded_files = {}
                for filename in os.listdir(temp_dir):
                    file_path = os.path.join(temp_dir, filename)
                    if os.path.isfile(file_path):
                        with open(file_path, "rb") as f:
                            encoded_files[filename] = base64.b64encode(f.read()).decode(
                                "utf-8"
                            )

                return encoded_files
        except Exception as e:
            logger.error(f"Error encoding vector store: {e}")
            raise

    def decode_vector_store(
        self, encoded_files: Dict[str, str], embeddings: OpenAIEmbeddings
    ) -> FAISS:
        """Decode base64 files back to FAISS vector store (compatible with web_scraper)."""
        import base64
        import os
        import tempfile

        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                # Decode and write files
                for filename, encoded_content in encoded_files.items():
                    file_path = os.path.join(temp_dir, filename)
                    with open(file_path, "wb") as f:
                        f.write(base64.b64decode(encoded_content))

                # Load vector store
                return FAISS.load_local(
                    temp_dir,
                    embeddings,
                    allow_dangerous_deserialization=True,
                )
        except Exception as e:
            logger.error(f"Error decoding vector store: {e}")
            raise

    async def load_vector_store(self, agent_id: str) -> Optional[FAISS]:
        """Load existing vector store for an agent."""
        try:
            vector_store_key = f"vector_store_{agent_id}"
            stored_data = await self.skill_store.get_agent_skill_data(
                agent_id, "web_scraper", vector_store_key
            )

            if not stored_data or "faiss_files" not in stored_data:
                return None

            embeddings = self.create_embeddings()
            return self.decode_vector_store(stored_data["faiss_files"], embeddings)

        except Exception as e:
            logger.error(f"Error loading vector store for agent {agent_id}: {e}")
            return None

    async def save_vector_store(
        self,
        agent_id: str,
        vector_store: FAISS,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        """Save vector store for an agent (compatible with web_scraper format)."""
        try:
            vector_store_key = f"vector_store_{agent_id}"
            encoded_files = self.encode_vector_store(vector_store)

            # Use the same data structure as web_scraper
            storage_data = {
                "faiss_files": encoded_files,
                "chunk_size": chunk_size,
                "chunk_overlap": chunk_overlap,
            }

            await self.skill_store.save_agent_skill_data(
                agent_id, "web_scraper", vector_store_key, storage_data
            )

        except Exception as e:
            logger.error(f"Error saving vector store for agent {agent_id}: {e}")
            raise


class FirecrawlMetadataManager:
    """Manages metadata for Firecrawl indexed content."""

    def __init__(self, skill_store: SkillStoreABC):
        self.skill_store = skill_store

    def create_url_metadata(
        self, urls: List[str], documents: List[Document], source_type: str
    ) -> Dict[str, Any]:
        """Create metadata for indexed URLs."""
        return {
            "urls": urls,
            "document_count": len(documents),
            "source_type": source_type,
            "indexed_at": str(len(urls)),  # Simple counter
        }

    async def update_metadata(
        self, agent_id: str, new_metadata: Dict[str, Any]
    ) -> None:
        """Update metadata for an agent."""
        try:
            metadata_key = f"indexed_urls_{agent_id}"
            await self.skill_store.save_agent_skill_data(
                agent_id, "web_scraper", metadata_key, new_metadata
            )
        except Exception as e:
            logger.error(f"Error updating metadata for agent {agent_id}: {e}")
            raise


async def index_documents(
    documents: List[Document],
    agent_id: str,
    skill_store: SkillStoreABC,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> Tuple[int, bool]:
    """
    Index documents into the Firecrawl vector store.

    Args:
        documents: List of documents to index
        agent_id: Agent ID for storage
        skill_store: Skill store for persistence
        chunk_size: Size of text chunks
        chunk_overlap: Overlap between chunks

    Returns:
        Tuple of (total_chunks, was_merged_with_existing)
    """
    try:
        # Initialize managers
        vs_manager = FirecrawlVectorStoreManager(skill_store)

        # Split documents into chunks
        split_docs = FirecrawlDocumentProcessor.split_documents(
            documents, chunk_size, chunk_overlap
        )

        if not split_docs:
            logger.warning("No documents to index after splitting")
            return 0, False

        # Create embeddings
        embeddings = vs_manager.create_embeddings()

        # Try to load existing vector store
        existing_vector_store = await vs_manager.load_vector_store(agent_id)

        if existing_vector_store:
            # Add to existing vector store
            existing_vector_store.add_documents(split_docs)
            vector_store = existing_vector_store
            was_merged = True
        else:
            # Create new vector store
            vector_store = FAISS.from_documents(split_docs, embeddings)
            was_merged = False

        # Save the vector store
        await vs_manager.save_vector_store(
            agent_id, vector_store, chunk_size, chunk_overlap
        )

        logger.info(
            f"Successfully indexed {len(split_docs)} chunks for agent {agent_id}"
        )
        return len(split_docs), was_merged

    except Exception as e:
        logger.error(f"Error indexing documents for agent {agent_id}: {e}")
        raise


async def query_indexed_content(
    query: str,
    agent_id: str,
    skill_store: SkillStoreABC,
    max_results: int = 4,
) -> List[Document]:
    """
    Query the Firecrawl indexed content.

    Args:
        query: Search query
        agent_id: Agent ID
        skill_store: Skill store for persistence
        max_results: Maximum number of results to return

    Returns:
        List of relevant documents
    """
    try:
        # Initialize vector store manager
        vs_manager = FirecrawlVectorStoreManager(skill_store)

        # Load vector store
        vector_store = await vs_manager.load_vector_store(agent_id)

        if not vector_store:
            logger.warning(f"No vector store found for agent {agent_id}")
            return []

        # Perform similarity search
        docs = vector_store.similarity_search(query, k=max_results)

        logger.info(f"Found {len(docs)} documents for query: {query}")
        return docs

    except Exception as e:
        logger.error(f"Error querying indexed content for agent {agent_id}: {e}")
        raise
```
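Taken together, a scrape-then-query round trip through these helpers looks roughly like the sketch below. `InMemorySkillStore` is a hypothetical stub implementing only the three `SkillStoreABC` methods this module actually calls (`get_system_config`, `get_agent_skill_data`, `save_agent_skill_data`); a real agent would use intentkit's own store, and the OpenAI key is a placeholder:

```python
import asyncio

from langchain_core.documents import Document

from intentkit.skills.firecrawl.utils import index_documents, query_indexed_content


class InMemorySkillStore:
    """Hypothetical stub covering only the SkillStoreABC methods utils.py uses."""

    def __init__(self, openai_api_key: str):
        self._system = {"openai_api_key": openai_api_key}
        self._data = {}

    def get_system_config(self, key: str):
        return self._system.get(key)

    async def get_agent_skill_data(self, agent_id: str, skill: str, key: str):
        return self._data.get((agent_id, skill, key))

    async def save_agent_skill_data(self, agent_id: str, skill: str, key: str, data):
        self._data[(agent_id, skill, key)] = data


async def main() -> None:
    store = InMemorySkillStore(openai_api_key="sk-...")  # placeholder key
    doc = Document(
        page_content="Firecrawl turns web pages into LLM-ready markdown.",
        metadata={"source": "https://example.com"},
    )

    # First call creates a new FAISS index; subsequent calls merge into it.
    total_chunks, was_merged = await index_documents([doc], "agent-1", store)
    print(f"indexed {total_chunks} chunks, merged={was_merged}")

    # Similarity search over everything indexed for this agent.
    hits = await query_indexed_content("What does Firecrawl do?", "agent-1", store)
    for hit in hits:
        print(hit.page_content)


asyncio.run(main())
```

Note that both the vector store and the URL metadata are saved under the "web_scraper" skill namespace, so content indexed here is shared with the web_scraper skills.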
The six Heurist image-generation skills receive the same one-line change to the stored-image key:

```diff
--- a/skills/heurist/image_generation_animagine_xl.py
+++ b/intentkit/skills/heurist/image_generation_animagine_xl.py
@@ -137,7 +137,7 @@ class ImageGenerationAnimagineXL(HeuristBaseTool):
         # Store the image URL
         image_url = response.text.strip('"')
         # Generate a key with agent ID as prefix
-        image_key = f"{context.
+        image_key = f"{context.agent_id}/heurist/{job_id}"
         # Store the image and get the CDN URL
         stored_url = await store_image(image_url, image_key)
 
--- a/skills/heurist/image_generation_arthemy_comics.py
+++ b/intentkit/skills/heurist/image_generation_arthemy_comics.py
@@ -137,7 +137,7 @@ class ImageGenerationArthemyComics(HeuristBaseTool):
         # Store the image URL
         image_url = response.text.strip('"')
         # Generate a key with agent ID as prefix
-        image_key = f"{context.
+        image_key = f"{context.agent_id}/heurist/{job_id}"
         # Store the image and get the CDN URL
         stored_url = await store_image(image_url, image_key)
 
--- a/skills/heurist/image_generation_arthemy_real.py
+++ b/intentkit/skills/heurist/image_generation_arthemy_real.py
@@ -137,7 +137,7 @@ class ImageGenerationArthemyReal(HeuristBaseTool):
         # Store the image URL
         image_url = response.text.strip('"')
         # Generate a key with agent ID as prefix
-        image_key = f"{context.
+        image_key = f"{context.agent_id}/heurist/{job_id}"
         # Store the image and get the CDN URL
         stored_url = await store_image(image_url, image_key)
 
--- a/skills/heurist/image_generation_braindance.py
+++ b/intentkit/skills/heurist/image_generation_braindance.py
@@ -137,7 +137,7 @@ class ImageGenerationBrainDance(HeuristBaseTool):
         # Store the image URL
         image_url = response.text.strip('"')
         # Generate a key with agent ID as prefix
-        image_key = f"{context.
+        image_key = f"{context.agent_id}/heurist/{job_id}"
         # Store the image and get the CDN URL
         stored_url = await store_image(image_url, image_key)
 
--- a/skills/heurist/image_generation_cyber_realistic_xl.py
+++ b/intentkit/skills/heurist/image_generation_cyber_realistic_xl.py
@@ -137,7 +137,7 @@ class ImageGenerationCyberRealisticXL(HeuristBaseTool):
         # Store the image URL
         image_url = response.text.strip('"')
         # Generate a key with agent ID as prefix
-        image_key = f"{context.
+        image_key = f"{context.agent_id}/heurist/{job_id}"
         # Store the image and get the CDN URL
         stored_url = await store_image(image_url, image_key)
 
--- a/skills/heurist/image_generation_flux_1_dev.py
+++ b/intentkit/skills/heurist/image_generation_flux_1_dev.py
@@ -137,7 +137,7 @@ class ImageGenerationFlux1Dev(HeuristBaseTool):
         # Store the image URL
         image_url = response.text.strip('"')
         # Generate a key with agent ID as prefix
-        image_key = f"{context.
+        image_key = f"{context.agent_id}/heurist/{job_id}"
         # Store the image and get the CDN URL
         stored_url = await store_image(image_url, image_key)
```