intentkit 0.6.0.dev11__py3-none-any.whl → 0.6.0.dev12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of intentkit might be problematic. Click here for more details.
- intentkit/__init__.py +1 -1
- intentkit/config/config.py +1 -0
- intentkit/models/agent_schema.json +4 -0
- intentkit/skills/firecrawl/README.md +178 -0
- intentkit/skills/firecrawl/__init__.py +99 -0
- intentkit/skills/firecrawl/base.py +28 -0
- intentkit/skills/firecrawl/crawl.py +407 -0
- intentkit/skills/firecrawl/firecrawl.png +0 -0
- intentkit/skills/firecrawl/query.py +123 -0
- intentkit/skills/firecrawl/schema.json +137 -0
- intentkit/skills/firecrawl/scrape.py +324 -0
- intentkit/skills/firecrawl/utils.py +287 -0
- {intentkit-0.6.0.dev11.dist-info → intentkit-0.6.0.dev12.dist-info}/METADATA +1 -1
- {intentkit-0.6.0.dev11.dist-info → intentkit-0.6.0.dev12.dist-info}/RECORD +16 -7
- {intentkit-0.6.0.dev11.dist-info → intentkit-0.6.0.dev12.dist-info}/WHEEL +0 -0
- {intentkit-0.6.0.dev11.dist-info → intentkit-0.6.0.dev12.dist-info}/licenses/LICENSE +0 -0
intentkit/__init__.py
CHANGED
intentkit/config/config.py
CHANGED
|
@@ -121,6 +121,7 @@ class Config:
|
|
|
121
121
|
self.moralis_api_key = self.load("MORALIS_API_KEY")
|
|
122
122
|
self.tavily_api_key = self.load("TAVILY_API_KEY")
|
|
123
123
|
self.cookiefun_api_key = self.load("COOKIEFUN_API_KEY")
|
|
124
|
+
self.firecrawl_api_key = self.load("FIRECRAWL_API_KEY")
|
|
124
125
|
# Sentry
|
|
125
126
|
self.sentry_dsn = self.load("SENTRY_DSN")
|
|
126
127
|
self.sentry_sample_rate = float(self.load("SENTRY_SAMPLE_RATE", "0.1"))
|
|
@@ -539,6 +539,10 @@
|
|
|
539
539
|
"title": "Web Scraper & Content Indexing",
|
|
540
540
|
"$ref": "../skills/web_scraper/schema.json"
|
|
541
541
|
},
|
|
542
|
+
"firecrawl": {
|
|
543
|
+
"title": "Firecrawl Web Scraping",
|
|
544
|
+
"$ref": "../skills/firecrawl/schema.json"
|
|
545
|
+
},
|
|
542
546
|
"aixbt": {
|
|
543
547
|
"title": "AIXBT",
|
|
544
548
|
"$ref": "../skills/aixbt/schema.json"
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# Firecrawl Skills
|
|
2
|
+
|
|
3
|
+
The Firecrawl skills provide advanced web scraping and content indexing capabilities using the Firecrawl API. These skills can handle JavaScript-heavy websites, PDFs, and provide automatic content indexing for intelligent querying.
|
|
4
|
+
|
|
5
|
+
## Skills Overview
|
|
6
|
+
|
|
7
|
+
### 1. firecrawl_scrape
|
|
8
|
+
Scrapes a single webpage and optionally indexes the content for future querying.
|
|
9
|
+
|
|
10
|
+
**Parameters:**
|
|
11
|
+
- `url` (required): The URL to scrape
|
|
12
|
+
- `formats` (optional): Output formats - markdown, html, rawHtml, screenshot, links, extract (default: ["markdown"])
|
|
13
|
+
- `include_tags` (optional): HTML tags to include (e.g., ["h1", "h2", "p"])
|
|
14
|
+
- `exclude_tags` (optional): HTML tags to exclude
|
|
15
|
+
- `only_main_content` (optional): Extract only main content (default: true)
|
|
16
|
+
- `index_content` (optional): Whether to index content for querying (default: true)
|
|
17
|
+
- `chunk_size` (optional): Size of text chunks for indexing (default: 1000)
|
|
18
|
+
- `chunk_overlap` (optional): Overlap between chunks (default: 200)
|
|
19
|
+
|
|
20
|
+
### 2. firecrawl_crawl
|
|
21
|
+
Crawls multiple pages from a website and indexes all content.
|
|
22
|
+
|
|
23
|
+
**Parameters:**
|
|
24
|
+
- `url` (required): The base URL to start crawling
|
|
25
|
+
- `include_paths` (optional): URL patterns to include (e.g., ["/docs/*"])
|
|
26
|
+
- `exclude_paths` (optional): URL patterns to exclude
|
|
27
|
+
- `max_depth` (optional): Maximum crawl depth (default: 2)
|
|
28
|
+
- `limit` (optional): Maximum number of pages to crawl (default: 5)
|
|
29
|
+
- `index_content` (optional): Whether to index content for querying (default: true)
|
|
30
|
+
- `chunk_size` (optional): Size of text chunks for indexing (default: 1000)
|
|
31
|
+
- `chunk_overlap` (optional): Overlap between chunks (default: 200)
|
|
32
|
+
|
|
33
|
+
### 3. firecrawl_query_indexed_content
|
|
34
|
+
Queries previously indexed Firecrawl content using semantic search.
|
|
35
|
+
|
|
36
|
+
**Parameters:**
|
|
37
|
+
- `query` (required): The search query
|
|
38
|
+
- `limit` (optional): Maximum number of results to return (1-10, default: 4)
|
|
39
|
+
|
|
40
|
+
## API Key Configuration
|
|
41
|
+
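When the skill's `api_key_provider` is `agent_owner` (the default), the key is read from the `api_key` field of the agent's Firecrawl skill configuration. For any other provider, the system-level key is used; set it as an environment variable: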
|
|
42
|
+
```bash
|
|
43
|
+
export FIRECRAWL_API_KEY=fc-your-api-key-here
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Testing Instructions
|
|
47
|
+
|
|
48
|
+
### Step 1: Create an Agent with Firecrawl Skills
|
|
49
|
+
|
|
50
|
+
1. **Create a new agent** via the API or UI with the following skills:
|
|
51
|
+
```json
|
|
52
|
+
{
|
|
53
|
+
"skills": [
|
|
54
|
+
"firecrawl_scrape",
|
|
55
|
+
"firecrawl_crawl",
|
|
56
|
+
"firecrawl_query_indexed_content"
|
|
57
|
+
]
|
|
58
|
+
}
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
2. **Note the agent ID** for testing
|
|
62
|
+
|
|
63
|
+
### Step 2: Test Single Page Scraping
|
|
64
|
+
|
|
65
|
+
**Test scraping a documentation homepage:**
|
|
66
|
+
```
|
|
67
|
+
Prompt: "Use firecrawl_scrape to scrape https://docs.joincommonwealth.xyz/ and index the content for future querying"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**Expected Result:**
|
|
71
|
+
- Content successfully scraped
|
|
72
|
+
- Content automatically indexed with metadata
|
|
73
|
+
- Confirmation of chunk creation and indexing
|
|
74
|
+
|
|
75
|
+
### Step 3: Test Content Crawling
|
|
76
|
+
|
|
77
|
+
**Test crawling multiple pages:**
|
|
78
|
+
```
|
|
79
|
+
Prompt: "Use firecrawl_crawl to crawl https://docs.joincommonwealth.xyz/ with max_depth=2 and limit=3 to index multiple documentation pages"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Expected Result:**
|
|
83
|
+
- Multiple pages crawled and scraped
|
|
84
|
+
- Each page indexed separately
|
|
85
|
+
- Batch processing confirmation
|
|
86
|
+
|
|
87
|
+
### Step 4: Test Content Querying
|
|
88
|
+
|
|
89
|
+
**Test querying indexed content:**
|
|
90
|
+
```
|
|
91
|
+
Prompt: "Use firecrawl_query_indexed_content to search for 'What is All Street and what is its purpose?' in the indexed content"
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Expected Result:**
|
|
95
|
+
- Relevant content retrieved from indexed documents
|
|
96
|
+
- Results tagged with [Firecrawl Scrape] or [Firecrawl Crawl]
|
|
97
|
+
- Source URLs and metadata included
|
|
98
|
+
|
|
99
|
+
### Step 5: Test Advanced Scraping Options
|
|
100
|
+
|
|
101
|
+
**Test with specific formatting:**
|
|
102
|
+
```
|
|
103
|
+
Prompt: "Use firecrawl_scrape to scrape https://docs.joincommonwealth.xyz/all-street-manifesto with formats=['markdown', 'html'] and include_tags=['h1', 'h2', 'p'] and index_content=true"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Expected Result:**
|
|
107
|
+
- Content in both markdown and HTML formats
|
|
108
|
+
- Only specified HTML tags included
|
|
109
|
+
- Content indexed for querying
|
|
110
|
+
|
|
111
|
+
### Step 6: Test Multiple Queries
|
|
112
|
+
|
|
113
|
+
**Test different query types:**
|
|
114
|
+
```
|
|
115
|
+
Prompt: "Use firecrawl_query_indexed_content to search for 'democratize finance' in the indexed content"
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
**Expected Result:**
|
|
119
|
+
- Relevant content retrieved from Firecrawl's independent vector store
|
|
120
|
+
- Results tagged with [Firecrawl Scrape] or [Firecrawl Crawl]
|
|
121
|
+
- Source URLs and metadata included
|
|
122
|
+
|
|
123
|
+
## Common Use Cases
|
|
124
|
+
|
|
125
|
+
### Documentation Indexing
|
|
126
|
+
```
|
|
127
|
+
1. Scrape main documentation page
|
|
128
|
+
2. Crawl related documentation sections
|
|
129
|
+
3. Query for specific technical information
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Competitive Analysis
|
|
133
|
+
```
|
|
134
|
+
1. Scrape competitor websites
|
|
135
|
+
2. Index product information and features
|
|
136
|
+
3. Query for specific comparisons
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Research and Knowledge Base
|
|
140
|
+
```
|
|
141
|
+
1. Crawl research papers or articles
|
|
142
|
+
2. Index academic or technical content
|
|
143
|
+
3. Query for specific concepts or methodologies
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Troubleshooting
|
|
147
|
+
|
|
148
|
+
### Common Issues
|
|
149
|
+
|
|
150
|
+
1. **API Key Not Found**
|
|
151
|
+
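- Ensure the `FIRECRAWL_API_KEY` environment variable is set (or, when `api_key_provider` is `agent_owner`, that `api_key` is set in the agent's skill configuration)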
|
|
152
|
+
- Restart the IntentKit server after setting the key
|
|
153
|
+
|
|
154
|
+
2. **Scraping Failures**
|
|
155
|
+
- Check if the URL is accessible
|
|
156
|
+
- Verify Firecrawl API quota and limits
|
|
157
|
+
- Some websites may block scraping
|
|
158
|
+
|
|
159
|
+
3. **Indexing Errors**
|
|
160
|
+
- Ensure OpenAI API key is configured for embeddings
|
|
161
|
+
- Check if content is too large for processing
|
|
162
|
+
- Verify vector store permissions
|
|
163
|
+
|
|
164
|
+
4. **Query Returns No Results**
|
|
165
|
+
- Ensure content was successfully indexed
|
|
166
|
+
- Try broader or different search terms
|
|
167
|
+
- Check if vector store contains data
|
|
168
|
+
|
|
169
|
+
## Features and Benefits
|
|
170
|
+
|
|
171
|
+
- **JavaScript Rendering**: Handles SPAs and dynamic content
|
|
172
|
+
- **PDF Support**: Can scrape and index PDF documents
|
|
173
|
+
- **Intelligent Chunking**: Optimized text splitting for better search
|
|
174
|
+
- **Independent Storage**: Uses its own dedicated vector store for Firecrawl content
|
|
175
|
+
- **Metadata Rich**: Includes source URLs, timestamps, and content types
|
|
176
|
+
- **Semantic Search**: Uses OpenAI embeddings for intelligent querying
|
|
177
|
+
- **Batch Processing**: Efficient handling of multiple pages
|
|
178
|
+
- **Content Filtering**: Flexible include/exclude options for targeted scraping
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Firecrawl skills for web scraping and crawling."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import TypedDict
|
|
5
|
+
|
|
6
|
+
from intentkit.abstracts.skill import SkillStoreABC
|
|
7
|
+
from intentkit.skills.base import SkillConfig, SkillState
|
|
8
|
+
from intentkit.skills.firecrawl.base import FirecrawlBaseTool
|
|
9
|
+
from intentkit.skills.firecrawl.crawl import FirecrawlCrawl
|
|
10
|
+
from intentkit.skills.firecrawl.query import FirecrawlQueryIndexedContent
|
|
11
|
+
from intentkit.skills.firecrawl.scrape import FirecrawlScrape
|
|
12
|
+
|
|
13
|
+
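# Process-wide cache of skill instances, keyed by skill name. Instances are
# shared because they keep no per-agent state; note that each one retains the
# `store` it was first constructed with.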
|
|
14
|
+
_cache: dict[str, FirecrawlBaseTool] = {}
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class SkillStates(TypedDict):
    """Per-skill enablement state for each Firecrawl skill.

    Each key is a skill name accepted by ``get_firecrawl_skill``; the value
    is compared against "disabled", "public" and "private" in ``get_skills``.
    """

    firecrawl_scrape: SkillState
    firecrawl_crawl: SkillState
    firecrawl_query_indexed_content: SkillState
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Config(SkillConfig):
    """Settings accepted by the Firecrawl skill category."""

    # Enablement state of every individual Firecrawl skill.
    states: SkillStates
    # Firecrawl API key supplied with the skill configuration.
    api_key: str = ""
    # Who supplies the key; "agent_owner" selects `api_key` above.
    api_key_provider: str = "agent_owner"
    # NOTE(review): presumably "at most N calls per M minutes" - confirm
    # against whatever consumes these two values.
    rate_limit_number: int = 100
    rate_limit_minutes: int = 60
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def get_skills(
    config: "Config",
    is_private: bool,
    store: SkillStoreABC,
    **_,
) -> list[FirecrawlBaseTool]:
    """Return the Firecrawl skills that are enabled for the caller.

    A skill is selected when its state in ``config["states"]`` is "public",
    or when it is "private" and ``is_private`` is true. Any other state
    (including "disabled") leaves the skill out.

    Args:
        config: The Firecrawl skill configuration; only ``states`` is read.
        is_private: Whether skills marked "private" should be included.
        store: The skill store handed to each skill on construction.
        **_: Extra keyword arguments, accepted and ignored.

    Returns:
        The selected skill instances, in the iteration order of
        ``config["states"]``.
    """

    def _is_enabled(state) -> bool:
        # "public" is always exposed; "private" only to private callers.
        if state == "public":
            return True
        return state == "private" and is_private

    enabled_names = [
        skill_name
        for skill_name, state in config["states"].items()
        if _is_enabled(state)
    ]

    # Instances come from the module-level cache via the shared getter.
    return [get_firecrawl_skill(skill_name, store) for skill_name in enabled_names]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_firecrawl_skill(
    name: str,
    store: SkillStoreABC,
) -> FirecrawlBaseTool:
    """Look up a Firecrawl skill instance by name, creating it on first use.

    Instances are kept in the module-level ``_cache``; ``store`` is only
    used when the named skill has not been constructed yet.

    Args:
        name: The name of the skill to get.
        store: The skill store passed to a newly constructed skill.

    Returns:
        The cached instance of the requested Firecrawl skill.

    Raises:
        ValueError: If the skill name is unknown.
    """
    skill_classes = {
        "firecrawl_scrape": FirecrawlScrape,
        "firecrawl_crawl": FirecrawlCrawl,
        "firecrawl_query_indexed_content": FirecrawlQueryIndexedContent,
    }

    if name not in skill_classes:
        raise ValueError(f"Unknown Firecrawl skill: {name}")

    # Construct lazily so each skill is instantiated at most once.
    if name not in _cache:
        _cache[name] = skill_classes[name](skill_store=store)
    return _cache[name]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from typing import Type
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from intentkit.abstracts.skill import SkillStoreABC
|
|
6
|
+
from intentkit.skills.base import IntentKitSkill, SkillContext
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FirecrawlBaseTool(IntentKitSkill):
    """Common base for every Firecrawl skill.

    Declares the shared tool fields and the API-key lookup used by the
    concrete scrape, crawl and query skills.
    """

    name: str = Field(description="The name of the tool")
    description: str = Field(description="A description of what the tool does")
    args_schema: Type[BaseModel]
    skill_store: SkillStoreABC = Field(
        description="The skill store for persisting data"
    )

    def get_api_key(self, context: SkillContext) -> str:
        """Resolve the Firecrawl API key for this invocation.

        Args:
            context: The skill context; its ``config`` mapping is consulted.

        Returns:
            The ``api_key`` entry of the skill config when
            ``api_key_provider`` is "agent_owner"; otherwise the
            "firecrawl_api_key" system config value from the skill store.
        """
        settings = context.config
        owner_supplies_key = settings.get("api_key_provider") == "agent_owner"
        if not owner_supplies_key:
            # Fall back to the key configured at the system level.
            return self.skill_store.get_system_config("firecrawl_api_key")
        return settings.get("api_key")

    @property
    def category(self) -> str:
        """Category identifier shared by all Firecrawl skills."""
        return "firecrawl"
|