intentkit 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of intentkit has been flagged as possibly problematic.
- intentkit/__init__.py +1 -1
- intentkit/abstracts/skill.py +12 -0
- intentkit/clients/cdp.py +114 -16
- intentkit/config/config.py +12 -4
- intentkit/core/engine.py +39 -31
- intentkit/core/node.py +8 -4
- intentkit/core/prompt.py +5 -6
- intentkit/core/skill.py +11 -0
- intentkit/models/agent.py +2 -9
- intentkit/models/agent_data.py +18 -0
- intentkit/models/agent_schema.json +12 -0
- intentkit/models/chat.py +50 -0
- intentkit/models/skill.py +19 -0
- intentkit/skills/base.py +37 -17
- intentkit/skills/cdp/__init__.py +6 -14
- intentkit/skills/cdp/get_balance.py +77 -25
- intentkit/skills/cdp/schema.json +0 -64
- intentkit/skills/cryptocompare/fetch_news.py +2 -2
- intentkit/skills/cryptocompare/fetch_price.py +2 -2
- intentkit/skills/cryptocompare/fetch_top_exchanges.py +2 -2
- intentkit/skills/cryptocompare/fetch_top_market_cap.py +2 -2
- intentkit/skills/cryptocompare/fetch_top_volume.py +2 -2
- intentkit/skills/cryptocompare/fetch_trading_signals.py +2 -2
- intentkit/skills/defillama/base.py +3 -3
- intentkit/skills/enso/base.py +27 -4
- intentkit/skills/enso/networks.py +1 -1
- intentkit/skills/enso/route.py +24 -23
- intentkit/skills/enso/tokens.py +1 -1
- intentkit/skills/enso/wallet.py +27 -23
- intentkit/skills/firecrawl/README.md +211 -0
- intentkit/skills/firecrawl/__init__.py +107 -0
- intentkit/skills/firecrawl/base.py +28 -0
- intentkit/skills/firecrawl/clear.py +87 -0
- intentkit/skills/firecrawl/crawl.py +399 -0
- intentkit/skills/firecrawl/firecrawl.png +0 -0
- intentkit/skills/firecrawl/query.py +123 -0
- intentkit/skills/firecrawl/schema.json +153 -0
- intentkit/skills/firecrawl/scrape.py +318 -0
- intentkit/skills/firecrawl/utils.py +306 -0
- intentkit/skills/heurist/image_generation_animagine_xl.py +1 -1
- intentkit/skills/heurist/image_generation_arthemy_comics.py +1 -1
- intentkit/skills/heurist/image_generation_arthemy_real.py +1 -1
- intentkit/skills/heurist/image_generation_braindance.py +1 -1
- intentkit/skills/heurist/image_generation_cyber_realistic_xl.py +1 -1
- intentkit/skills/heurist/image_generation_flux_1_dev.py +1 -1
- intentkit/skills/heurist/image_generation_sdxl.py +1 -1
- intentkit/skills/http/README.md +78 -0
- intentkit/skills/http/__init__.py +100 -0
- intentkit/skills/http/base.py +21 -0
- intentkit/skills/http/get.py +96 -0
- intentkit/skills/http/http.svg +15 -0
- intentkit/skills/http/post.py +113 -0
- intentkit/skills/http/put.py +113 -0
- intentkit/skills/http/schema.json +80 -0
- intentkit/skills/lifi/token_execute.py +1 -1
- intentkit/skills/openai/dalle_image_generation.py +1 -1
- intentkit/skills/openai/gpt_image_generation.py +1 -1
- intentkit/skills/openai/gpt_image_to_image.py +1 -1
- intentkit/skills/supabase/__init__.py +116 -0
- intentkit/skills/supabase/base.py +72 -0
- intentkit/skills/supabase/delete_data.py +102 -0
- intentkit/skills/supabase/fetch_data.py +120 -0
- intentkit/skills/supabase/insert_data.py +70 -0
- intentkit/skills/supabase/invoke_function.py +74 -0
- intentkit/skills/supabase/schema.json +170 -0
- intentkit/skills/supabase/supabase.svg +15 -0
- intentkit/skills/supabase/update_data.py +105 -0
- intentkit/skills/supabase/upsert_data.py +77 -0
- intentkit/skills/system/read_agent_api_key.py +1 -1
- intentkit/skills/system/regenerate_agent_api_key.py +1 -1
- intentkit/skills/token/base.py +1 -39
- intentkit/skills/twitter/follow_user.py +3 -3
- intentkit/skills/twitter/get_mentions.py +6 -6
- intentkit/skills/twitter/get_timeline.py +5 -5
- intentkit/skills/twitter/get_user_by_username.py +3 -3
- intentkit/skills/twitter/get_user_tweets.py +5 -5
- intentkit/skills/twitter/like_tweet.py +3 -3
- intentkit/skills/twitter/post_tweet.py +4 -4
- intentkit/skills/twitter/reply_tweet.py +4 -4
- intentkit/skills/twitter/retweet.py +3 -3
- intentkit/skills/twitter/search_tweets.py +5 -5
- intentkit/skills/unrealspeech/text_to_speech.py +1 -1
- intentkit/skills/web_scraper/README.md +35 -4
- intentkit/skills/web_scraper/__init__.py +16 -0
- intentkit/skills/web_scraper/document_indexer.py +143 -0
- intentkit/skills/web_scraper/schema.json +28 -0
- intentkit/skills/web_scraper/scrape_and_index.py +135 -200
- intentkit/skills/web_scraper/utils.py +684 -0
- intentkit/skills/web_scraper/website_indexer.py +456 -0
- intentkit/utils/logging.py +1 -1
- {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/METADATA +1 -1
- {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/RECORD +94 -63
- {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/WHEEL +0 -0
- {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/licenses/LICENSE +0 -0
intentkit/skills/twitter/reply_tweet.py

@@ -56,7 +56,7 @@ class TwitterReplyTweet(TwitterBaseTool):
         try:
             context = self.context_from_config(config)
             twitter = get_twitter_client(
-                agent_id=context.
+                agent_id=context.agent_id,
                 skill_store=self.skill_store,
                 config=context.config,
             )
@@ -65,7 +65,7 @@ class TwitterReplyTweet(TwitterBaseTool):
             # Check rate limit only when not using OAuth
             if not twitter.use_key:
                 await self.check_rate_limit(
-                    context.
+                    context.agent_id, max_requests=48, interval=1440
                 )
 
             media_ids = []
@@ -73,7 +73,7 @@ class TwitterReplyTweet(TwitterBaseTool):
             # Handle image upload if provided
             if image:
                 # Use the TwitterClient method to upload the image
-                media_ids = await twitter.upload_media(context.
+                media_ids = await twitter.upload_media(context.agent_id, image)
 
             # Post reply tweet using tweepy client
             tweet_params = {
@@ -95,4 +95,4 @@ class TwitterReplyTweet(TwitterBaseTool):
 
         except Exception as e:
             logger.error(f"Error replying to tweet: {str(e)}")
-            raise type(e)(f"[agent:{context.
+            raise type(e)(f"[agent:{context.agent_id}]: {e}") from e
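Every hunk in the Twitter skills makes the same two-part change: the agent identifier is now read from `context.agent_id`, and that id is threaded into rate limiting, media upload, and error messages. The sketch below illustrates the kind of per-agent sliding-window bookkeeping a call like `check_rate_limit(agent_id, max_requests=48, interval=1440)` implies; the class and method names are hypothetical, not IntentKit's implementation.

```python
import datetime


class RateLimitExceeded(Exception):
    pass


class SlidingWindowRateLimiter:
    """Track call timestamps per (agent_id, key) and reject calls over the limit."""

    def __init__(self) -> None:
        self._calls: dict[tuple[str, str], list[datetime.datetime]] = {}

    def check(self, agent_id: str, key: str, max_requests: int, interval: int) -> None:
        # interval is in minutes, mirroring check_rate_limit(..., interval=1440)
        now = datetime.datetime.now()
        window_start = now - datetime.timedelta(minutes=interval)
        recent = [t for t in self._calls.get((agent_id, key), []) if t > window_start]
        if len(recent) >= max_requests:
            raise RateLimitExceeded(
                f"[agent:{agent_id}] {key}: more than {max_requests} requests in {interval} minutes"
            )
        recent.append(now)
        self._calls[(agent_id, key)] = recent
```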
intentkit/skills/twitter/retweet.py

@@ -39,7 +39,7 @@ class TwitterRetweet(TwitterBaseTool):
         try:
             context = self.context_from_config(config)
             twitter = get_twitter_client(
-                agent_id=context.
+                agent_id=context.agent_id,
                 skill_store=self.skill_store,
                 config=context.config,
             )
@@ -48,7 +48,7 @@ class TwitterRetweet(TwitterBaseTool):
             # Check rate limit only when not using OAuth
             if not twitter.use_key:
                 await self.check_rate_limit(
-                    context.
+                    context.agent_id, max_requests=5, interval=15
                 )
 
             # Get authenticated user's ID
@@ -73,4 +73,4 @@ class TwitterRetweet(TwitterBaseTool):
 
         except Exception as e:
             logger.error(f"Error retweeting: {str(e)}")
-            raise type(e)(f"[agent:{context.
+            raise type(e)(f"[agent:{context.agent_id}]: {e}") from e
intentkit/skills/twitter/search_tweets.py

@@ -41,7 +41,7 @@ class TwitterSearchTweets(TwitterBaseTool):
         try:
             context = self.context_from_config(config)
             twitter = get_twitter_client(
-                agent_id=context.
+                agent_id=context.agent_id,
                 skill_store=self.skill_store,
                 config=context.config,
             )
@@ -50,12 +50,12 @@ class TwitterSearchTweets(TwitterBaseTool):
             # Check rate limit only when not using OAuth
             if not twitter.use_key:
                 await self.check_rate_limit(
-                    context.
+                    context.agent_id, max_requests=3, interval=60 * 24
                 )
 
             # Get since_id from store to avoid duplicate results
             last = await self.skill_store.get_agent_skill_data(
-                context.
+                context.agent_id, self.name, query
             )
             last = last or {}
             since_id = last.get("since_id")
@@ -105,11 +105,11 @@ class TwitterSearchTweets(TwitterBaseTool):
                 last["since_id"] = tweets["meta"]["newest_id"]
                 last["timestamp"] = datetime.datetime.now().isoformat()
                 await self.skill_store.save_agent_skill_data(
-                    context.
+                    context.agent_id, self.name, query, last
                 )
 
             return tweets
 
         except Exception as e:
             logger.error(f"Error searching tweets: {str(e)}")
-            raise type(e)(f"[agent:{context.
+            raise type(e)(f"[agent:{context.agent_id}]: {e}") from e
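The search skill additionally keeps searches incremental: the newest tweet id is stored per agent, skill name, and query, then passed back as `since_id` on the next call. A minimal sketch of that bookkeeping against an in-memory stand-in store (the store class below is hypothetical, not `SkillStoreABC`):

```python
import datetime
from typing import Any, Optional


class InMemorySkillData:
    """Stand-in for per-agent skill data keyed by (agent_id, skill_name, key)."""

    def __init__(self) -> None:
        self._data: dict[tuple[str, str, str], dict[str, Any]] = {}

    async def get_agent_skill_data(
        self, agent_id: str, skill_name: str, key: str
    ) -> Optional[dict[str, Any]]:
        return self._data.get((agent_id, skill_name, key))

    async def save_agent_skill_data(
        self, agent_id: str, skill_name: str, key: str, value: dict[str, Any]
    ) -> None:
        self._data[(agent_id, skill_name, key)] = value


async def remember_newest_id(
    store: InMemorySkillData, agent_id: str, skill_name: str, query: str, tweets: dict
) -> None:
    # Persist the newest tweet id plus a timestamp so the next search for the same
    # query can pass since_id and skip tweets that were already returned.
    last = await store.get_agent_skill_data(agent_id, skill_name, query) or {}
    last["since_id"] = tweets["meta"]["newest_id"]
    last["timestamp"] = datetime.datetime.now().isoformat()
    await store.save_agent_skill_data(agent_id, skill_name, query, last)
```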
intentkit/skills/unrealspeech/text_to_speech.py

@@ -81,7 +81,7 @@ class TextToSpeech(UnrealSpeechBaseTool):
         # If no API key in config, try to get it from skill store
         if not api_key:
             try:
-                agent_id = context.
+                agent_id = context.agent_id
                 api_key_data = await self.skill_store.get_agent_data(
                     agent_id, "unrealspeech_api_key"
                 )
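The UnrealSpeech change is the same `agent_id` rename, and the surrounding hunk documents a key-resolution fallback: use the API key from the skill config when present, otherwise read the per-agent `unrealspeech_api_key` entry from the skill store. A minimal sketch of that order, with the config field name and the shape of `get_agent_data`'s return value assumed for illustration:

```python
async def resolve_api_key(context, skill_store):
    # Prefer a key supplied in the skill config; otherwise fall back to the
    # per-agent value stored under "unrealspeech_api_key". The "api_key" field
    # name and the returned value's shape are assumptions, not from the diff.
    api_key = (context.config or {}).get("api_key")
    if not api_key:
        api_key = await skill_store.get_agent_data(
            context.agent_id, "unrealspeech_api_key"
        )
    return api_key
```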
intentkit/skills/web_scraper/README.md

@@ -10,12 +10,22 @@ Scrape content from URLs and index into a searchable vector store with configura
 ### 🔎 `query_indexed_content`
 Search indexed content using semantic similarity to answer questions and retrieve relevant information.
 
+### `website_indexer`
+Index entire websites by discovering and scraping all pages using sitemaps. Automatically finds sitemaps from robots.txt, extracts all URLs, and comprehensively indexes website content.
+
+### `document_indexer`
+Import and index document content directly to the vector database. Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources by copy-pasting.
+
 ## Key Features
 
-- **Multi-URL Support**: Scrape up to 10 URLs simultaneously
+- **Multi-URL Support**: Scrape up to 10 URLs simultaneously
+- **Sitemap Discovery**: Automatic sitemap detection from robots.txt with common patterns
+- **Direct Text Input**: Add content directly without web scraping
 - **Smart Chunking**: Configurable text splitting (100-4000 chars) with overlap
 - **Vector Search**: FAISS + OpenAI embeddings for semantic retrieval
 - **Agent Storage**: Persistent, per-agent content indexing
+- **Content Filtering**: Include/exclude URL patterns for targeted scraping
+- **Tagging System**: Organize content with custom tags
 - **Rate Limiting**: Respectful scraping (0.1-10 req/sec)
 
 ## Testing Examples
@@ -39,7 +49,27 @@ Please scrape and index this URL: https://docs.crestal.network/introduction
 Scrape and index https://docs.crestal.network/introduction with chunk size 500 and overlap 100.
 ```
 
-### 3.
+### 3. Complete Website Indexing
+
+**Agent Prompt:**
+```
+Index the entire documentation site at https://docs.crestal.network using its sitemap. Include only pages with '/docs/' and '/guides/' in the URL, exclude '/admin/' pages, and limit to 50 URLs.
+```
+
+### 4. Document Content Import
+
+**Agent Prompt:**
+```
+I'm going to paste some content from my Google Doc. Please add it to the knowledge base:
+
+Title: "Meeting Notes - Q4 Strategy"
+Source: "Google Docs"
+Tags: "meeting, strategy, q4, planning"
+
+[Paste your document content here...]
+```
+
+### 5. Content Querying
 
 **Agent Prompt (after indexing):**
 ```
@@ -75,8 +105,9 @@ curl -X POST "http://localhost:8000/agents/your-agent-id/chat" \
 ## Dependencies
 
 Required packages (add to `pyproject.toml` if missing):
-- `langchain-community` - WebBaseLoader
+- `langchain-community` - WebBaseLoader and document processing
 - `langchain-openai` - Embeddings
 - `langchain-text-splitters` - Document chunking
 - `faiss-cpu` - Vector storage
-- `beautifulsoup4` - HTML parsing
+- `beautifulsoup4` - HTML parsing
+- `httpx` - Async HTTP client for sitemap discovery
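The README changes describe sitemap discovery driven by robots.txt, with `httpx` listed as the client for it. A rough sketch of what such discovery can look like; the function name and fallback paths are illustrative, and the actual logic lives in `website_indexer.py`, which is not shown in this diff:

```python
import httpx


async def discover_sitemaps(base_url: str) -> list[str]:
    """Collect sitemap URLs advertised in robots.txt, with common fallbacks."""
    sitemaps: list[str] = []
    async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
        # robots.txt usually advertises sitemaps via "Sitemap:" lines.
        resp = await client.get(f"{base_url.rstrip('/')}/robots.txt")
        if resp.status_code == 200:
            for line in resp.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    sitemaps.append(line.split(":", 1)[1].strip())
        # Fall back to common locations if robots.txt lists none.
        if not sitemaps:
            for path in ("/sitemap.xml", "/sitemap_index.xml"):
                resp = await client.get(f"{base_url.rstrip('/')}{path}")
                if resp.status_code == 200:
                    sitemaps.append(str(resp.url))
    return sitemaps
```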
intentkit/skills/web_scraper/__init__.py

@@ -6,10 +6,12 @@ from typing import TypedDict
 from intentkit.abstracts.skill import SkillStoreABC
 from intentkit.skills.base import SkillConfig, SkillOwnerState, SkillState
 from intentkit.skills.web_scraper.base import WebScraperBaseTool
+from intentkit.skills.web_scraper.document_indexer import DocumentIndexer
 from intentkit.skills.web_scraper.scrape_and_index import (
     QueryIndexedContent,
     ScrapeAndIndex,
 )
+from intentkit.skills.web_scraper.website_indexer import WebsiteIndexer
 
 # Cache skills at the system level, because they are stateless
 _cache: dict[str, WebScraperBaseTool] = {}
@@ -20,6 +22,8 @@ logger = logging.getLogger(__name__)
 class SkillStates(TypedDict):
     scrape_and_index: SkillOwnerState
     query_indexed_content: SkillState
+    website_indexer: SkillOwnerState
+    document_indexer: SkillOwnerState
 
 
 class Config(SkillConfig):
@@ -87,6 +91,18 @@ def get_web_scraper_skill(
                 skill_store=store,
             )
         return _cache[name]
+    elif name == "website_indexer":
+        if name not in _cache:
+            _cache[name] = WebsiteIndexer(
+                skill_store=store,
+            )
+        return _cache[name]
+    elif name == "document_indexer":
+        if name not in _cache:
+            _cache[name] = DocumentIndexer(
+                skill_store=store,
+            )
+        return _cache[name]
     else:
         logger.warning(f"Unknown web scraper skill: {name}")
         return None
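The factory keeps one instance per skill name because the tools are stateless (see the `_cache` comment above). The same dispatch can be read as a name-to-class registry; this condensed version is a sketch of the idea, not the actual `get_web_scraper_skill` body:

```python
from intentkit.skills.web_scraper.document_indexer import DocumentIndexer
from intentkit.skills.web_scraper.scrape_and_index import (
    QueryIndexedContent,
    ScrapeAndIndex,
)
from intentkit.skills.web_scraper.website_indexer import WebsiteIndexer

_cache: dict[str, object] = {}

_REGISTRY = {
    "scrape_and_index": ScrapeAndIndex,
    "query_indexed_content": QueryIndexedContent,
    "website_indexer": WebsiteIndexer,
    "document_indexer": DocumentIndexer,
}


def get_skill(name: str, store):
    """Return the cached skill for name, constructing it on first use."""
    cls = _REGISTRY.get(name)
    if cls is None:
        return None
    if name not in _cache:
        _cache[name] = cls(skill_store=store)
    return _cache[name]
```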
intentkit/skills/web_scraper/document_indexer.py

@@ -0,0 +1,143 @@
+import logging
+from typing import Type
+
+from langchain_core.runnables import RunnableConfig
+from pydantic import BaseModel, Field
+
+from intentkit.skills.web_scraper.base import WebScraperBaseTool
+from intentkit.skills.web_scraper.utils import (
+    DocumentProcessor,
+    MetadataManager,
+    ResponseFormatter,
+    VectorStoreManager,
+    index_documents,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentIndexerInput(BaseModel):
+    """Input for DocumentIndexer tool."""
+
+    text_content: str = Field(
+        description="The text content to add to the vector database. Can be content from Google Docs, Notion, or any other text source",
+        min_length=10,
+        max_length=100000,
+    )
+    title: str = Field(
+        description="Title or name for this text content (will be used as metadata)",
+        max_length=200,
+    )
+    source: str = Field(
+        description="Source of the text content (e.g., 'Google Doc', 'Notion Page', 'Manual Entry')",
+        default="Manual Entry",
+        max_length=100,
+    )
+    chunk_size: int = Field(
+        description="Size of text chunks for indexing (default: 1000)",
+        default=1000,
+        ge=100,
+        le=4000,
+    )
+    chunk_overlap: int = Field(
+        description="Overlap between chunks (default: 200)",
+        default=200,
+        ge=0,
+        le=1000,
+    )
+    tags: str = Field(
+        description="Optional tags for categorizing the content (comma-separated)",
+        default="",
+        max_length=500,
+    )
+
+
+class DocumentIndexer(WebScraperBaseTool):
+    """Tool for importing and indexing document content to the vector database.
+
+    This tool allows users to copy and paste document content from various sources
+    (like Google Docs, Notion, PDFs, etc.) and index it directly into the vector store
+    for later querying and retrieval.
+    """
+
+    name: str = "web_scraper_document_indexer"
+    description: str = (
+        "Import and index document content directly to the vector database. "
+        "Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources. "
+        "The indexed content can then be queried using the query_indexed_content tool."
+    )
+    args_schema: Type[BaseModel] = DocumentIndexerInput
+
+    async def _arun(
+        self,
+        text_content: str,
+        title: str,
+        source: str = "Manual Entry",
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        tags: str = "",
+        config: RunnableConfig = None,
+        **kwargs,
+    ) -> str:
+        """Add text content to the vector database."""
+        # Get agent context - throw error if not available
+        if not config:
+            raise ValueError("Configuration is required but not provided")
+
+        context = self.context_from_config(config)
+        if not context or not context.agent_id:
+            raise ValueError("Agent ID is required but not found in configuration")
+
+        agent_id = context.agent_id
+
+        logger.info(f"[{agent_id}] Starting document indexing for title: '{title}'")
+
+        # Validate content
+        if not DocumentProcessor.validate_content(text_content):
+            logger.error(f"[{agent_id}] Content validation failed - too short")
+            return "Error: Text content is too short. Please provide at least 10 characters of content."
+
+        # Create document with metadata
+        document = DocumentProcessor.create_document(
+            text_content,
+            title,
+            source,
+            tags,
+            extra_metadata={"source_type": "document_indexer"},
+        )
+
+        logger.info(
+            f"[{agent_id}] Document created, length: {len(document.page_content)} chars"
+        )
+
+        # Index the document
+        total_chunks, was_merged = await index_documents(
+            [document], agent_id, self.skill_store, chunk_size, chunk_overlap
+        )
+
+        # Get current storage size for response
+        vs_manager = VectorStoreManager(self.skill_store)
+        current_size = await vs_manager.get_content_size(agent_id)
+
+        # Update metadata
+        metadata_manager = MetadataManager(self.skill_store)
+        new_metadata = metadata_manager.create_document_metadata(
+            title, source, tags, [document], len(text_content)
+        )
+        await metadata_manager.update_metadata(agent_id, new_metadata)
+
+        logger.info(f"[{agent_id}] Document indexing completed successfully")
+
+        # Format response
+        response = ResponseFormatter.format_indexing_response(
+            "indexed",
+            f"Document: {title}",
+            total_chunks,
+            chunk_size,
+            chunk_overlap,
+            was_merged,
+            current_size_bytes=current_size,
+        )
+
+        logger.info(f"[{agent_id}] Document indexing completed successfully")
+        return response
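Since `DocumentIndexerInput` is a plain Pydantic model, its bounds (`min_length=10` on the text, `chunk_size` between 100 and 4000, `chunk_overlap` up to 1000) can be exercised directly. The values below are made up for illustration; the field names and constraints come from the class in the diff:

```python
from pydantic import ValidationError

from intentkit.skills.web_scraper.document_indexer import DocumentIndexerInput

args = DocumentIndexerInput(
    text_content="Q4 strategy: focus on retention, expand the docs, ship weekly.",
    title="Meeting Notes - Q4 Strategy",
    source="Google Docs",
    tags="meeting, strategy, q4, planning",
    chunk_size=1000,    # allowed range: 100-4000
    chunk_overlap=200,  # allowed range: 0-1000
)
print(args.title, args.chunk_size)

try:
    DocumentIndexerInput(text_content="too short", title="x", chunk_size=50)
except ValidationError as exc:
    # text_content is under min_length=10 and chunk_size is under ge=100
    print(len(exc.errors()), "validation errors")
```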
intentkit/skills/web_scraper/schema.json

@@ -50,6 +50,34 @@
       ],
       "description": "Search and retrieve relevant information from previously indexed web content using semantic similarity. Perfect for answering questions based on scraped documents.",
       "default": "private"
+    },
+    "website_indexer": {
+      "type": "string",
+      "title": "Complete Website Indexer",
+      "enum": [
+        "disabled",
+        "private"
+      ],
+      "x-enum-title": [
+        "Disabled",
+        "Agent Owner Only"
+      ],
+      "description": "Index entire websites by discovering and scraping all pages using sitemaps. Automatically finds sitemaps from robots.txt, extracts all URLs, and comprehensively indexes website content.",
+      "default": "private"
+    },
+    "document_indexer": {
+      "type": "string",
+      "title": "Document Content Indexer",
+      "enum": [
+        "disabled",
+        "private"
+      ],
+      "x-enum-title": [
+        "Disabled",
+        "Agent Owner Only"
+      ],
+      "description": "Import and index document content directly to the vector database. Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources by copy-pasting.",
+      "default": "private"
     }
   },
   "description": "Configure the availability of each web scraper skill (disabled, public, or private)"