intentkit 0.6.0.dev7__py3-none-any.whl → 0.6.0.dev8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


intentkit/__init__.py CHANGED
@@ -3,7 +3,7 @@
  A powerful platform for building AI agents with blockchain and cryptocurrency capabilities.
  """
 
- __version__ = "0.6.0-dev.7"
+ __version__ = "0.6.0-dev.8"
  __author__ = "hyacinthus"
  __email__ = "hyacinthus@gmail.com"
 
@@ -10,12 +10,22 @@ Scrape content from URLs and index into a searchable vector store with configura
  ### 🔎 `query_indexed_content`
  Search indexed content using semantic similarity to answer questions and retrieve relevant information.
 
+ ### `website_indexer`
+ Index entire websites by discovering and scraping all pages using sitemaps. Automatically finds sitemaps from robots.txt, extracts all URLs, and comprehensively indexes website content.
+
+ ### `document_indexer`
+ Import and index document content directly to the vector database. Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources by copy-pasting.
+
  ## Key Features
 
- - **Multi-URL Support**: Scrape up to 10 URLs simultaneously
+ - **Multi-URL Support**: Scrape up to 10 URLs simultaneously
+ - **Sitemap Discovery**: Automatic sitemap detection from robots.txt with common patterns
+ - **Direct Text Input**: Add content directly without web scraping
  - **Smart Chunking**: Configurable text splitting (100-4000 chars) with overlap
  - **Vector Search**: FAISS + OpenAI embeddings for semantic retrieval
  - **Agent Storage**: Persistent, per-agent content indexing
+ - **Content Filtering**: Include/exclude URL patterns for targeted scraping
+ - **Tagging System**: Organize content with custom tags
  - **Rate Limiting**: Respectful scraping (0.1-10 req/sec)
 
  ## Testing Examples
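The **Smart Chunking** entry in the feature list above refers to configurable text splitting before embedding. A minimal sketch of what such splitting looks like with `langchain-text-splitters` (one of the packages listed under Dependencies further down); this is illustrative only, not the skill's own code:

```python
# Illustrative only - not the skill's actual implementation.
# Assumes langchain-text-splitters is installed, as listed under Dependencies.
from langchain_text_splitters import RecursiveCharacterTextSplitter

sample_text = "IntentKit agents index scraped pages for semantic retrieval. " * 80

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # configurable between 100 and 4000 characters
    chunk_overlap=200, # characters shared between consecutive chunks
)
chunks = splitter.split_text(sample_text)
print(f"{len(chunks)} chunks, first chunk {len(chunks[0])} characters")
```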
@@ -39,7 +49,27 @@ Please scrape and index this URL: https://docs.crestal.network/introduction
  Scrape and index https://docs.crestal.network/introduction with chunk size 500 and overlap 100.
  ```
 
- ### 3. Content Querying
+ ### 3. Complete Website Indexing
+
+ **Agent Prompt:**
+ ```
+ Index the entire documentation site at https://docs.crestal.network using its sitemap. Include only pages with '/docs/' and '/guides/' in the URL, exclude '/admin/' pages, and limit to 50 URLs.
+ ```
+
+ ### 4. Document Content Import
+
+ **Agent Prompt:**
+ ```
+ I'm going to paste some content from my Google Doc. Please add it to the knowledge base:
+
+ Title: "Meeting Notes - Q4 Strategy"
+ Source: "Google Docs"
+ Tags: "meeting, strategy, q4, planning"
+
+ [Paste your document content here...]
+ ```
+
+ ### 5. Content Querying
 
  **Agent Prompt (after indexing):**
  ```
@@ -75,8 +105,9 @@ curl -X POST "http://localhost:8000/agents/your-agent-id/chat" \
  ## Dependencies
 
  Required packages (add to `pyproject.toml` if missing):
- - `langchain-community` - WebBaseLoader
+ - `langchain-community` - WebBaseLoader and document processing
  - `langchain-openai` - Embeddings
  - `langchain-text-splitters` - Document chunking
  - `faiss-cpu` - Vector storage
- - `beautifulsoup4` - HTML parsing
+ - `beautifulsoup4` - HTML parsing
+ - `httpx` - Async HTTP client for sitemap discovery
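The new `httpx` dependency backs the sitemap discovery described for `website_indexer`. The sketch below shows the general robots.txt-based approach in isolation; `discover_sitemaps` and the fallback URLs are hypothetical, not helpers exported by the package:

```python
# Hypothetical sketch of robots.txt-based sitemap discovery using httpx.
# The helper name and fallback paths are assumptions, not intentkit internals.
import asyncio

import httpx


async def discover_sitemaps(base_url: str) -> list[str]:
    """Return sitemap URLs advertised in robots.txt, else common fallback paths."""
    base = base_url.rstrip("/")
    async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
        try:
            resp = await client.get(f"{base}/robots.txt")
            resp.raise_for_status()
            sitemaps = [
                line.split(":", 1)[1].strip()
                for line in resp.text.splitlines()
                if line.lower().startswith("sitemap:")
            ]
            if sitemaps:
                return sitemaps
        except httpx.HTTPError:
            pass  # robots.txt missing or unreachable; fall back to common patterns
    return [f"{base}/sitemap.xml", f"{base}/sitemap_index.xml"]


if __name__ == "__main__":
    print(asyncio.run(discover_sitemaps("https://docs.crestal.network")))
```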
@@ -6,10 +6,12 @@ from typing import TypedDict
  from intentkit.abstracts.skill import SkillStoreABC
  from intentkit.skills.base import SkillConfig, SkillOwnerState, SkillState
  from intentkit.skills.web_scraper.base import WebScraperBaseTool
+ from intentkit.skills.web_scraper.document_indexer import DocumentIndexer
  from intentkit.skills.web_scraper.scrape_and_index import (
      QueryIndexedContent,
      ScrapeAndIndex,
  )
+ from intentkit.skills.web_scraper.website_indexer import WebsiteIndexer
 
  # Cache skills at the system level, because they are stateless
  _cache: dict[str, WebScraperBaseTool] = {}
@@ -20,6 +22,8 @@ logger = logging.getLogger(__name__)
  class SkillStates(TypedDict):
      scrape_and_index: SkillOwnerState
      query_indexed_content: SkillState
+     website_indexer: SkillOwnerState
+     document_indexer: SkillOwnerState
 
 
  class Config(SkillConfig):
@@ -87,6 +91,18 @@ def get_web_scraper_skill(
                  skill_store=store,
              )
          return _cache[name]
+     elif name == "website_indexer":
+         if name not in _cache:
+             _cache[name] = WebsiteIndexer(
+                 skill_store=store,
+             )
+         return _cache[name]
+     elif name == "document_indexer":
+         if name not in _cache:
+             _cache[name] = DocumentIndexer(
+                 skill_store=store,
+             )
+         return _cache[name]
      else:
          logger.warning(f"Unknown web scraper skill: {name}")
          return None
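The factory above builds each skill once and then reuses it from the module-level `_cache`, which is safe because the skills are stateless. A standalone illustration of that lazy-caching pattern; the names here are generic placeholders, not intentkit APIs:

```python
# Generic illustration of the lazy, module-level cache used by get_web_scraper_skill.
# Factory names and types are placeholders, not intentkit APIs.
from typing import Callable, Optional

_cache: dict[str, object] = {}


def get_cached_skill(
    name: str, factories: dict[str, Callable[[], object]]
) -> Optional[object]:
    factory = factories.get(name)
    if factory is None:
        return None  # unknown skill name, mirroring the logger.warning branch above
    if name not in _cache:
        _cache[name] = factory()  # construct on first request only
    return _cache[name]  # later calls return the same stateless instance
```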
@@ -0,0 +1,143 @@
+ import logging
+ from typing import Type
+
+ from langchain_core.runnables import RunnableConfig
+ from pydantic import BaseModel, Field
+
+ from intentkit.skills.web_scraper.base import WebScraperBaseTool
+ from intentkit.skills.web_scraper.utils import (
+     DocumentProcessor,
+     MetadataManager,
+     ResponseFormatter,
+     VectorStoreManager,
+     index_documents,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class DocumentIndexerInput(BaseModel):
+     """Input for DocumentIndexer tool."""
+
+     text_content: str = Field(
+         description="The text content to add to the vector database. Can be content from Google Docs, Notion, or any other text source",
+         min_length=10,
+         max_length=100000,
+     )
+     title: str = Field(
+         description="Title or name for this text content (will be used as metadata)",
+         max_length=200,
+     )
+     source: str = Field(
+         description="Source of the text content (e.g., 'Google Doc', 'Notion Page', 'Manual Entry')",
+         default="Manual Entry",
+         max_length=100,
+     )
+     chunk_size: int = Field(
+         description="Size of text chunks for indexing (default: 1000)",
+         default=1000,
+         ge=100,
+         le=4000,
+     )
+     chunk_overlap: int = Field(
+         description="Overlap between chunks (default: 200)",
+         default=200,
+         ge=0,
+         le=1000,
+     )
+     tags: str = Field(
+         description="Optional tags for categorizing the content (comma-separated)",
+         default="",
+         max_length=500,
+     )
+
+
+ class DocumentIndexer(WebScraperBaseTool):
+     """Tool for importing and indexing document content to the vector database.
+
+     This tool allows users to copy and paste document content from various sources
+     (like Google Docs, Notion, PDFs, etc.) and index it directly into the vector store
+     for later querying and retrieval.
+     """
+
+     name: str = "web_scraper_document_indexer"
+     description: str = (
+         "Import and index document content directly to the vector database. "
+         "Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources. "
+         "The indexed content can then be queried using the query_indexed_content tool."
+     )
+     args_schema: Type[BaseModel] = DocumentIndexerInput
+
+     async def _arun(
+         self,
+         text_content: str,
+         title: str,
+         source: str = "Manual Entry",
+         chunk_size: int = 1000,
+         chunk_overlap: int = 200,
+         tags: str = "",
+         config: RunnableConfig = None,
+         **kwargs,
+     ) -> str:
+         """Add text content to the vector database."""
+         # Get agent context - throw error if not available
+         if not config:
+             raise ValueError("Configuration is required but not provided")
+
+         context = self.context_from_config(config)
+         if not context or not context.agent or not context.agent.id:
+             raise ValueError("Agent ID is required but not found in configuration")
+
+         agent_id = context.agent.id
+
+         logger.info(f"[{agent_id}] Starting document indexing for title: '{title}'")
+
+         # Validate content
+         if not DocumentProcessor.validate_content(text_content):
+             logger.error(f"[{agent_id}] Content validation failed - too short")
+             return "Error: Text content is too short. Please provide at least 10 characters of content."
+
+         # Create document with metadata
+         document = DocumentProcessor.create_document(
+             text_content,
+             title,
+             source,
+             tags,
+             extra_metadata={"source_type": "document_indexer"},
+         )
+
+         logger.info(
+             f"[{agent_id}] Document created, length: {len(document.page_content)} chars"
+         )
+
+         # Index the document
+         total_chunks, was_merged = await index_documents(
+             [document], agent_id, self.skill_store, chunk_size, chunk_overlap
+         )
+
+         # Get current storage size for response
+         vs_manager = VectorStoreManager(self.skill_store)
+         current_size = await vs_manager.get_content_size(agent_id)
+
+         # Update metadata
+         metadata_manager = MetadataManager(self.skill_store)
+         new_metadata = metadata_manager.create_document_metadata(
+             title, source, tags, [document], len(text_content)
+         )
+         await metadata_manager.update_metadata(agent_id, new_metadata)
+
+         logger.info(f"[{agent_id}] Document indexing completed successfully")
+
+         # Format response
+         response = ResponseFormatter.format_indexing_response(
+             "indexed",
+             f"Document: {title}",
+             total_chunks,
+             chunk_size,
+             chunk_overlap,
+             was_merged,
+             current_size_bytes=current_size,
+         )
+
+         logger.info(f"[{agent_id}] Document indexing completed successfully")
+         return response
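Because `DocumentIndexerInput` is an ordinary Pydantic model, its constraints (minimum content length, chunk-size bounds, defaults) can be exercised directly. A small sketch of how the schema above behaves; the import path follows the diff, while the sample values are invented:

```python
# Exercises the DocumentIndexerInput constraints defined above; sample data is invented.
from pydantic import ValidationError

from intentkit.skills.web_scraper.document_indexer import DocumentIndexerInput

ok = DocumentIndexerInput(
    text_content="Q4 strategy notes: expand agent skills and document indexing.",
    title="Meeting Notes - Q4 Strategy",
    source="Google Docs",
    tags="meeting, strategy, q4",
)
print(ok.chunk_size, ok.chunk_overlap)  # 1000 200 - the declared defaults

try:
    DocumentIndexerInput(text_content="too short", title="x", chunk_size=50)
except ValidationError as exc:
    # min_length=10 on text_content and ge=100 on chunk_size both reject this input
    print(len(exc.errors()), "validation errors")
```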
@@ -50,6 +50,34 @@
            ],
            "description": "Search and retrieve relevant information from previously indexed web content using semantic similarity. Perfect for answering questions based on scraped documents.",
            "default": "private"
+         },
+         "website_indexer": {
+           "type": "string",
+           "title": "Complete Website Indexer",
+           "enum": [
+             "disabled",
+             "private"
+           ],
+           "x-enum-title": [
+             "Disabled",
+             "Agent Owner Only"
+           ],
+           "description": "Index entire websites by discovering and scraping all pages using sitemaps. Automatically finds sitemaps from robots.txt, extracts all URLs, and comprehensively indexes website content.",
+           "default": "private"
+         },
+         "document_indexer": {
+           "type": "string",
+           "title": "Document Content Indexer",
+           "enum": [
+             "disabled",
+             "private"
+           ],
+           "x-enum-title": [
+             "Disabled",
+             "Agent Owner Only"
+           ],
+           "description": "Import and index document content directly to the vector database. Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources by copy-pasting.",
+           "default": "private"
          }
        },
        "description": "Configure the availability of each web scraper skill (disabled, public, or private)"