intentkit 0.6.0.dev7__py3-none-any.whl → 0.6.0.dev8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


intentkit/__init__.py CHANGED
@@ -3,7 +3,7 @@
  A powerful platform for building AI agents with blockchain and cryptocurrency capabilities.
  """
 
- __version__ = "0.6.0-dev.7"
+ __version__ = "0.6.0-dev.8"
  __author__ = "hyacinthus"
  __email__ = "hyacinthus@gmail.com"
 
@@ -10,12 +10,22 @@ Scrape content from URLs and index into a searchable vector store with configura
  ### 🔎 `query_indexed_content`
  Search indexed content using semantic similarity to answer questions and retrieve relevant information.
 
+ ### `website_indexer`
+ Index entire websites by discovering and scraping all pages using sitemaps. Automatically finds sitemaps from robots.txt, extracts all URLs, and comprehensively indexes website content.
+
+ ### `document_indexer`
+ Import and index document content directly to the vector database. Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources by copy-pasting.
+
  ## Key Features
 
- - **Multi-URL Support**: Scrape up to 10 URLs simultaneously
+ - **Multi-URL Support**: Scrape up to 10 URLs simultaneously
+ - **Sitemap Discovery**: Automatic sitemap detection from robots.txt with common patterns
+ - **Direct Text Input**: Add content directly without web scraping
  - **Smart Chunking**: Configurable text splitting (100-4000 chars) with overlap
  - **Vector Search**: FAISS + OpenAI embeddings for semantic retrieval
  - **Agent Storage**: Persistent, per-agent content indexing
+ - **Content Filtering**: Include/exclude URL patterns for targeted scraping
+ - **Tagging System**: Organize content with custom tags
  - **Rate Limiting**: Respectful scraping (0.1-10 req/sec)
 
  ## Testing Examples
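The **Smart Chunking** entry in the feature list above refers to configurable text splitting before embedding. A minimal sketch of what such splitting looks like with `langchain-text-splitters` (one of the packages listed under Dependencies further down); this is illustrative only, not the skill's own code:

```python
# Illustrative only - not the skill's actual implementation.
# Assumes langchain-text-splitters is installed, as listed under Dependencies.
from langchain_text_splitters import RecursiveCharacterTextSplitter

sample_text = "IntentKit agents index scraped pages for semantic retrieval. " * 80

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # configurable between 100 and 4000 characters
    chunk_overlap=200, # characters shared between consecutive chunks
)
chunks = splitter.split_text(sample_text)
print(f"{len(chunks)} chunks, first chunk {len(chunks[0])} characters")
```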
@@ -39,7 +49,27 @@ Please scrape and index this URL: https://docs.crestal.network/introduction
  Scrape and index https://docs.crestal.network/introduction with chunk size 500 and overlap 100.
  ```
 
- ### 3. Content Querying
+ ### 3. Complete Website Indexing
+
+ **Agent Prompt:**
+ ```
+ Index the entire documentation site at https://docs.crestal.network using its sitemap. Include only pages with '/docs/' and '/guides/' in the URL, exclude '/admin/' pages, and limit to 50 URLs.
+ ```
+
+ ### 4. Document Content Import
+
+ **Agent Prompt:**
+ ```
+ I'm going to paste some content from my Google Doc. Please add it to the knowledge base:
+
+ Title: "Meeting Notes - Q4 Strategy"
+ Source: "Google Docs"
+ Tags: "meeting, strategy, q4, planning"
+
+ [Paste your document content here...]
+ ```
+
+ ### 5. Content Querying
 
  **Agent Prompt (after indexing):**
  ```
@@ -75,8 +105,9 @@ curl -X POST "http://localhost:8000/agents/your-agent-id/chat" \
  ## Dependencies
 
  Required packages (add to `pyproject.toml` if missing):
- - `langchain-community` - WebBaseLoader
+ - `langchain-community` - WebBaseLoader and document processing
  - `langchain-openai` - Embeddings
  - `langchain-text-splitters` - Document chunking
  - `faiss-cpu` - Vector storage
- - `beautifulsoup4` - HTML parsing
+ - `beautifulsoup4` - HTML parsing
+ - `httpx` - Async HTTP client for sitemap discovery
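The new `httpx` dependency backs the sitemap discovery described for `website_indexer`. The sketch below shows the general robots.txt-based approach in isolation; `discover_sitemaps` and the fallback URLs are hypothetical, not helpers exported by the package:

```python
# Hypothetical sketch of robots.txt-based sitemap discovery using httpx.
# The helper name and fallback paths are assumptions, not intentkit internals.
import asyncio

import httpx


async def discover_sitemaps(base_url: str) -> list[str]:
    """Return sitemap URLs advertised in robots.txt, else common fallback paths."""
    base = base_url.rstrip("/")
    async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
        try:
            resp = await client.get(f"{base}/robots.txt")
            resp.raise_for_status()
            sitemaps = [
                line.split(":", 1)[1].strip()
                for line in resp.text.splitlines()
                if line.lower().startswith("sitemap:")
            ]
            if sitemaps:
                return sitemaps
        except httpx.HTTPError:
            pass  # robots.txt missing or unreachable; fall back to common patterns
    return [f"{base}/sitemap.xml", f"{base}/sitemap_index.xml"]


if __name__ == "__main__":
    print(asyncio.run(discover_sitemaps("https://docs.crestal.network")))
```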
@@ -6,10 +6,12 @@ from typing import TypedDict
  from intentkit.abstracts.skill import SkillStoreABC
  from intentkit.skills.base import SkillConfig, SkillOwnerState, SkillState
  from intentkit.skills.web_scraper.base import WebScraperBaseTool
+ from intentkit.skills.web_scraper.document_indexer import DocumentIndexer
  from intentkit.skills.web_scraper.scrape_and_index import (
      QueryIndexedContent,
      ScrapeAndIndex,
  )
+ from intentkit.skills.web_scraper.website_indexer import WebsiteIndexer
 
  # Cache skills at the system level, because they are stateless
  _cache: dict[str, WebScraperBaseTool] = {}
@@ -20,6 +22,8 @@ logger = logging.getLogger(__name__)
  class SkillStates(TypedDict):
      scrape_and_index: SkillOwnerState
      query_indexed_content: SkillState
+     website_indexer: SkillOwnerState
+     document_indexer: SkillOwnerState
 
 
  class Config(SkillConfig):
@@ -87,6 +91,18 @@ def get_web_scraper_skill(
                  skill_store=store,
              )
          return _cache[name]
+     elif name == "website_indexer":
+         if name not in _cache:
+             _cache[name] = WebsiteIndexer(
+                 skill_store=store,
+             )
+         return _cache[name]
+     elif name == "document_indexer":
+         if name not in _cache:
+             _cache[name] = DocumentIndexer(
+                 skill_store=store,
+             )
+         return _cache[name]
      else:
          logger.warning(f"Unknown web scraper skill: {name}")
          return None
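The factory above builds each skill once and then reuses it from the module-level `_cache`, which is safe because the skills are stateless. A standalone illustration of that lazy-caching pattern; the names here are generic placeholders, not intentkit APIs:

```python
# Generic illustration of the lazy, module-level cache used by get_web_scraper_skill.
# Factory names and types are placeholders, not intentkit APIs.
from typing import Callable, Optional

_cache: dict[str, object] = {}


def get_cached_skill(
    name: str, factories: dict[str, Callable[[], object]]
) -> Optional[object]:
    factory = factories.get(name)
    if factory is None:
        return None  # unknown skill name, mirroring the logger.warning branch above
    if name not in _cache:
        _cache[name] = factory()  # construct on first request only
    return _cache[name]  # later calls return the same stateless instance
```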
@@ -0,0 +1,143 @@
+ import logging
+ from typing import Type
+
+ from langchain_core.runnables import RunnableConfig
+ from pydantic import BaseModel, Field
+
+ from intentkit.skills.web_scraper.base import WebScraperBaseTool
+ from intentkit.skills.web_scraper.utils import (
+     DocumentProcessor,
+     MetadataManager,
+     ResponseFormatter,
+     VectorStoreManager,
+     index_documents,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class DocumentIndexerInput(BaseModel):
+     """Input for DocumentIndexer tool."""
+
+     text_content: str = Field(
+         description="The text content to add to the vector database. Can be content from Google Docs, Notion, or any other text source",
+         min_length=10,
+         max_length=100000,
+     )
+     title: str = Field(
+         description="Title or name for this text content (will be used as metadata)",
+         max_length=200,
+     )
+     source: str = Field(
+         description="Source of the text content (e.g., 'Google Doc', 'Notion Page', 'Manual Entry')",
+         default="Manual Entry",
+         max_length=100,
+     )
+     chunk_size: int = Field(
+         description="Size of text chunks for indexing (default: 1000)",
+         default=1000,
+         ge=100,
+         le=4000,
+     )
+     chunk_overlap: int = Field(
+         description="Overlap between chunks (default: 200)",
+         default=200,
+         ge=0,
+         le=1000,
+     )
+     tags: str = Field(
+         description="Optional tags for categorizing the content (comma-separated)",
+         default="",
+         max_length=500,
+     )
+
+
+ class DocumentIndexer(WebScraperBaseTool):
+     """Tool for importing and indexing document content to the vector database.
+
+     This tool allows users to copy and paste document content from various sources
+     (like Google Docs, Notion, PDFs, etc.) and index it directly into the vector store
+     for later querying and retrieval.
+     """
+
+     name: str = "web_scraper_document_indexer"
+     description: str = (
+         "Import and index document content directly to the vector database. "
+         "Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources. "
+         "The indexed content can then be queried using the query_indexed_content tool."
+     )
+     args_schema: Type[BaseModel] = DocumentIndexerInput
+
+     async def _arun(
+         self,
+         text_content: str,
+         title: str,
+         source: str = "Manual Entry",
+         chunk_size: int = 1000,
+         chunk_overlap: int = 200,
+         tags: str = "",
+         config: RunnableConfig = None,
+         **kwargs,
+     ) -> str:
+         """Add text content to the vector database."""
+         # Get agent context - throw error if not available
+         if not config:
+             raise ValueError("Configuration is required but not provided")
+
+         context = self.context_from_config(config)
+         if not context or not context.agent or not context.agent.id:
+             raise ValueError("Agent ID is required but not found in configuration")
+
+         agent_id = context.agent.id
+
+         logger.info(f"[{agent_id}] Starting document indexing for title: '{title}'")
+
+         # Validate content
+         if not DocumentProcessor.validate_content(text_content):
+             logger.error(f"[{agent_id}] Content validation failed - too short")
+             return "Error: Text content is too short. Please provide at least 10 characters of content."
+
+         # Create document with metadata
+         document = DocumentProcessor.create_document(
+             text_content,
+             title,
+             source,
+             tags,
+             extra_metadata={"source_type": "document_indexer"},
+         )
+
+         logger.info(
+             f"[{agent_id}] Document created, length: {len(document.page_content)} chars"
+         )
+
+         # Index the document
+         total_chunks, was_merged = await index_documents(
+             [document], agent_id, self.skill_store, chunk_size, chunk_overlap
+         )
+
+         # Get current storage size for response
+         vs_manager = VectorStoreManager(self.skill_store)
+         current_size = await vs_manager.get_content_size(agent_id)
+
+         # Update metadata
+         metadata_manager = MetadataManager(self.skill_store)
+         new_metadata = metadata_manager.create_document_metadata(
+             title, source, tags, [document], len(text_content)
+         )
+         await metadata_manager.update_metadata(agent_id, new_metadata)
+
+         logger.info(f"[{agent_id}] Document indexing completed successfully")
+
+         # Format response
+         response = ResponseFormatter.format_indexing_response(
+             "indexed",
+             f"Document: {title}",
+             total_chunks,
+             chunk_size,
+             chunk_overlap,
+             was_merged,
+             current_size_bytes=current_size,
+         )
+
+         logger.info(f"[{agent_id}] Document indexing completed successfully")
+         return response
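Because `DocumentIndexerInput` is an ordinary Pydantic model, its constraints (minimum content length, chunk-size bounds, defaults) can be exercised directly. A small sketch of how the schema above behaves; the import path follows the diff, while the sample values are invented:

```python
# Exercises the DocumentIndexerInput constraints defined above; sample data is invented.
from pydantic import ValidationError

from intentkit.skills.web_scraper.document_indexer import DocumentIndexerInput

ok = DocumentIndexerInput(
    text_content="Q4 strategy notes: expand agent skills and document indexing.",
    title="Meeting Notes - Q4 Strategy",
    source="Google Docs",
    tags="meeting, strategy, q4",
)
print(ok.chunk_size, ok.chunk_overlap)  # 1000 200 - the declared defaults

try:
    DocumentIndexerInput(text_content="too short", title="x", chunk_size=50)
except ValidationError as exc:
    # min_length=10 on text_content and ge=100 on chunk_size both reject this input
    print(len(exc.errors()), "validation errors")
```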
@@ -50,6 +50,34 @@
            ],
            "description": "Search and retrieve relevant information from previously indexed web content using semantic similarity. Perfect for answering questions based on scraped documents.",
            "default": "private"
+         },
+         "website_indexer": {
+           "type": "string",
+           "title": "Complete Website Indexer",
+           "enum": [
+             "disabled",
+             "private"
+           ],
+           "x-enum-title": [
+             "Disabled",
+             "Agent Owner Only"
+           ],
+           "description": "Index entire websites by discovering and scraping all pages using sitemaps. Automatically finds sitemaps from robots.txt, extracts all URLs, and comprehensively indexes website content.",
+           "default": "private"
+         },
+         "document_indexer": {
+           "type": "string",
+           "title": "Document Content Indexer",
+           "enum": [
+             "disabled",
+             "private"
+           ],
+           "x-enum-title": [
+             "Disabled",
+             "Agent Owner Only"
+           ],
+           "description": "Import and index document content directly to the vector database. Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources by copy-pasting.",
+           "default": "private"
          }
        },
        "description": "Configure the availability of each web scraper skill (disabled, public, or private)"