intentkit 0.6.0.dev11__py3-none-any.whl → 0.6.0.dev12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of intentkit might be problematic.

@@ -0,0 +1,407 @@
+ import asyncio
+ import logging
+ from typing import List, Optional, Type
+
+ import httpx
+ from langchain_core.documents import Document
+ from langchain_core.runnables import RunnableConfig
+ from pydantic import BaseModel, Field
+
+ from intentkit.skills.firecrawl.base import FirecrawlBaseTool
+
+ logger = logging.getLogger(__name__)
+
+
+ class FirecrawlCrawlInput(BaseModel):
+     """Input for Firecrawl crawl tool."""
+
+     url: str = Field(
+         description="The base URL to crawl. All accessible subpages will be crawled."
+     )
+     limit: int = Field(
+         description="Maximum number of pages to crawl", default=10, ge=1, le=1000
+     )
+     formats: List[str] = Field(
+         description="Output formats to include in the response. Options: 'markdown', 'html', 'rawHtml', 'screenshot', 'links', 'json'",
+         default=["markdown"],
+     )
+     include_paths: Optional[List[str]] = Field(
+         description="Regex patterns to include in the crawl (e.g., ['^/blog/.*$'])",
+         default=None,
+     )
+     exclude_paths: Optional[List[str]] = Field(
+         description="Regex patterns to exclude from the crawl (e.g., ['^/admin/.*$'])",
+         default=None,
+     )
+     max_depth: Optional[int] = Field(
+         description="Maximum depth to crawl from the base URL",
+         default=None,
+         ge=1,
+         le=10,
+     )
+     allow_backward_links: bool = Field(
+         description="Allow crawling parent and sibling URLs, not just child paths",
+         default=False,
+     )
+     allow_external_links: bool = Field(
+         description="Allow crawling external domains (use with caution)", default=False
+     )
+     allow_subdomains: bool = Field(
+         description="Allow crawling subdomains of the main domain", default=False
+     )
+     only_main_content: bool = Field(
+         description="Whether to extract only the main content (excluding headers, footers, navigation, etc.)",
+         default=True,
+     )
+     index_content: bool = Field(
+         description="Whether to index the crawled content for later querying (default: True)",
+         default=True,
+     )
+     chunk_size: int = Field(
+         description="Size of text chunks for indexing (default: 1000)",
+         default=1000,
+         ge=100,
+         le=4000,
+     )
+     chunk_overlap: int = Field(
+         description="Overlap between chunks (default: 200)",
+         default=200,
+         ge=0,
+         le=1000,
+     )
+
+
+ class FirecrawlCrawl(FirecrawlBaseTool):
+     """Tool for crawling entire websites using Firecrawl.
+
+     This tool uses Firecrawl's API to crawl websites and extract content from multiple pages.
+     It can handle JavaScript-rendered content, follow links, and extract structured data
+     from entire websites.
+
+     Attributes:
+         name: The name of the tool.
+         description: A description of what the tool does.
+         args_schema: The schema for the tool's input arguments.
+     """
+
+     name: str = "firecrawl_crawl"
+     description: str = (
+         "Crawl an entire website and extract content from multiple pages. "
+         "This tool can follow links, handle JavaScript-rendered content, and extract "
+         "structured data from all accessible pages on a website. "
+         "Optionally indexes all crawled content for later querying using the firecrawl_query_indexed_content tool. "
+         "Use this when you need to gather comprehensive information from a website."
+     )
+     args_schema: Type[BaseModel] = FirecrawlCrawlInput
+
+     async def _arun(
+         self,
+         url: str,
+         limit: int = 10,
+         formats: List[str] = None,
+         include_paths: Optional[List[str]] = None,
+         exclude_paths: Optional[List[str]] = None,
+         max_depth: Optional[int] = None,
+         allow_backward_links: bool = False,
+         allow_external_links: bool = False,
+         allow_subdomains: bool = False,
+         only_main_content: bool = True,
+         index_content: bool = True,
+         chunk_size: int = 1000,
+         chunk_overlap: int = 200,
+         config: RunnableConfig = None,
+         **kwargs,
+     ) -> str:
+         """Implementation of the Firecrawl crawl tool.
+
+         Args:
+             url: The base URL to crawl.
+             limit: Maximum number of pages to crawl.
+             formats: Output formats to include in the response.
+             include_paths: Regex patterns to include in the crawl.
+             exclude_paths: Regex patterns to exclude from the crawl.
+             max_depth: Maximum depth to crawl from the base URL.
+             allow_backward_links: Allow crawling parent and sibling URLs.
+             allow_external_links: Allow crawling external domains.
+             allow_subdomains: Allow crawling subdomains.
+             only_main_content: Whether to extract only main content.
+             config: The configuration for the tool call.
+
+         Returns:
+             str: Formatted crawled content from all pages.
+         """
+         context = self.context_from_config(config)
+         logger.debug(f"firecrawl_crawl: Running crawl with context {context}")
+
+         if context.config.get("api_key_provider") == "agent_owner":
+             if context.config.get("rate_limit_number") and context.config.get(
+                 "rate_limit_minutes"
+             ):
+                 await self.user_rate_limit_by_category(
+                     context.user_id,
+                     context.config["rate_limit_number"],
+                     context.config["rate_limit_minutes"],
+                 )
+
+         # Get the API key from the agent's configuration
+         api_key = self.get_api_key(context)
+         if not api_key:
+             return "Error: No Firecrawl API key provided in the configuration."
+
+         # Validate and set defaults
+         if formats is None:
+             formats = ["markdown"]
+
+         # Validate formats
+         valid_formats = ["markdown", "html", "rawHtml", "screenshot", "links", "json"]
+         formats = [f for f in formats if f in valid_formats]
+         if not formats:
+             formats = ["markdown"]
+
+         # Prepare the request payload
+         payload = {
+             "url": url,
+             "limit": min(limit, 1000),  # Cap at 1000 for safety
+             "scrapeOptions": {"formats": formats, "onlyMainContent": only_main_content},
+         }
+
+         if include_paths:
+             payload["includePaths"] = include_paths
+         if exclude_paths:
+             payload["excludePaths"] = exclude_paths
+         if max_depth:
+             payload["maxDepth"] = max_depth
+         if allow_backward_links:
+             payload["allowBackwardLinks"] = allow_backward_links
+         if allow_external_links:
+             payload["allowExternalLinks"] = allow_external_links
+         if allow_subdomains:
+             payload["allowSubdomains"] = allow_subdomains
+
+         # Call Firecrawl crawl API
+         try:
+             async with httpx.AsyncClient(timeout=120.0) as client:
+                 # Start the crawl
+                 response = await client.post(
+                     "https://api.firecrawl.dev/v1/crawl",
+                     json=payload,
+                     headers={
+                         "Authorization": f"Bearer {api_key}",
+                         "Content-Type": "application/json",
+                     },
+                 )
+
+                 if response.status_code != 200:
+                     logger.error(
+                         f"firecrawl_crawl: Error from Firecrawl API: {response.status_code} - {response.text}"
+                     )
+                     return f"Error starting crawl: {response.status_code} - {response.text}"
+
+                 crawl_data = response.json()
+
+                 if not crawl_data.get("success"):
+                     error_msg = crawl_data.get("error", "Unknown error occurred")
+                     return f"Error starting crawl: {error_msg}"
+
+                 crawl_id = crawl_data.get("id")
+                 if not crawl_id:
+                     return "Error: No crawl ID returned from Firecrawl API"
+
+                 # Poll for crawl completion
+                 max_polls = 60  # Maximum 5 minutes of polling (60 * 5 seconds)
+                 poll_count = 0
+
+                 while poll_count < max_polls:
+                     # Check crawl status
+                     status_response = await client.get(
+                         f"https://api.firecrawl.dev/v1/crawl/{crawl_id}",
+                         headers={
+                             "Authorization": f"Bearer {api_key}",
+                             "Content-Type": "application/json",
+                         },
+                     )
+
+                     if status_response.status_code != 200:
+                         logger.error(
+                             f"firecrawl_crawl: Error checking crawl status: {status_response.status_code} - {status_response.text}"
+                         )
+                         return f"Error checking crawl status: {status_response.status_code} - {status_response.text}"
+
+                     status_data = status_response.json()
+                     status = status_data.get("status")
+
+                     if status == "completed":
+                         # Crawl completed successfully
+                         pages_data = status_data.get("data", [])
+                         total_pages = status_data.get("total", 0)
+                         completed_pages = status_data.get("completed", 0)
+
+                         # Format the results
+                         formatted_result = f"Successfully crawled: {url}\n"
+                         formatted_result += f"Total pages found: {total_pages}\n"
+                         formatted_result += f"Pages completed: {completed_pages}\n\n"
+
+                         # Process each page
+                         for i, page_data in enumerate(
+                             pages_data[:10], 1
+                         ):  # Limit to first 10 pages for output
+                             page_url = page_data.get("metadata", {}).get(
+                                 "sourceURL", "Unknown URL"
+                             )
+                             formatted_result += f"## Page {i}: {page_url}\n"
+
+                             if "markdown" in formats and page_data.get("markdown"):
+                                 content = page_data["markdown"][
+                                     :500
+                                 ]  # Limit content length
+                                 formatted_result += f"{content}"
+                                 if len(page_data["markdown"]) > 500:
+                                     formatted_result += "... (content truncated)"
+                                 formatted_result += "\n\n"
+
+                             # Add page metadata
+                             metadata = page_data.get("metadata", {})
+                             if metadata.get("title"):
+                                 formatted_result += f"Title: {metadata['title']}\n"
+                             if metadata.get("description"):
+                                 formatted_result += (
+                                     f"Description: {metadata['description']}\n"
+                                 )
+                             formatted_result += "\n"
+
+                         if len(pages_data) > 10:
+                             formatted_result += (
+                                 f"... and {len(pages_data) - 10} more pages\n"
+                             )
+
+                         # Index content if requested
+                         if index_content and pages_data:
+                             try:
+                                 # Import indexing utilities from firecrawl utils
+                                 from intentkit.skills.firecrawl.utils import (
+                                     FirecrawlMetadataManager,
+                                     index_documents,
+                                 )
+
+                                 # Create documents from crawled content
+                                 documents = []
+                                 for page_data in pages_data:
+                                     if page_data.get("markdown"):
+                                         metadata = page_data.get("metadata", {})
+                                         document = Document(
+                                             page_content=page_data["markdown"],
+                                             metadata={
+                                                 "source": metadata.get(
+                                                     "sourceURL", "Unknown URL"
+                                                 ),
+                                                 "title": metadata.get("title", ""),
+                                                 "description": metadata.get(
+                                                     "description", ""
+                                                 ),
+                                                 "language": metadata.get(
+                                                     "language", ""
+                                                 ),
+                                                 "source_type": "firecrawl_crawl",
+                                                 "indexed_at": str(
+                                                     context.agent.id
+                                                     if context and context.agent
+                                                     else "unknown"
+                                                 ),
+                                             },
+                                         )
+                                         documents.append(document)
+
+                                 # Get agent ID for indexing
+                                 agent_id = (
+                                     context.agent.id
+                                     if context and context.agent
+                                     else None
+                                 )
+                                 if agent_id and documents:
+                                     # Index all documents
+                                     total_chunks, was_merged = await index_documents(
+                                         documents,
+                                         agent_id,
+                                         self.skill_store,
+                                         chunk_size,
+                                         chunk_overlap,
+                                     )
+
+                                     # Update metadata
+                                     metadata_manager = FirecrawlMetadataManager(
+                                         self.skill_store
+                                     )
+                                     urls = [doc.metadata["source"] for doc in documents]
+                                     new_metadata = metadata_manager.create_url_metadata(
+                                         urls, documents, "firecrawl_crawl"
+                                     )
+                                     await metadata_manager.update_metadata(
+                                         agent_id, new_metadata
+                                     )
+
+                                     formatted_result += "\n## Content Indexing\n"
+                                     formatted_result += "Successfully indexed crawled content into vector store:\n"
+                                     formatted_result += (
+                                         f"- Pages indexed: {len(documents)}\n"
+                                     )
+                                     formatted_result += (
+                                         f"- Total chunks created: {total_chunks}\n"
+                                     )
+                                     formatted_result += f"- Chunk size: {chunk_size}\n"
+                                     formatted_result += (
+                                         f"- Chunk overlap: {chunk_overlap}\n"
+                                     )
+                                     formatted_result += f"- Content merged with existing: {'Yes' if was_merged else 'No'}\n"
+                                     formatted_result += "Use the 'firecrawl_query_indexed_content' skill to search this content.\n"
+
+                                     logger.info(
+                                         f"firecrawl_crawl: Successfully indexed {len(documents)} pages with {total_chunks} total chunks"
+                                     )
+                                 else:
+                                     formatted_result += "\n## Content Indexing\n"
+                                     formatted_result += "Warning: Could not index content - agent ID not available or no content to index.\n"
+
+                             except Exception as index_error:
+                                 logger.error(
+                                     f"firecrawl_crawl: Error indexing content: {index_error}"
+                                 )
+                                 formatted_result += "\n## Content Indexing\n"
+                                 formatted_result += f"Warning: Failed to index content for later querying: {str(index_error)}\n"
+
+                         return formatted_result.strip()
+
+                     elif status == "failed":
+                         error_msg = status_data.get("error", "Crawl failed")
+                         return f"Crawl failed: {error_msg}"
+
+                     elif status in ["scraping", "active"]:
+                         # Still in progress, wait and poll again
+                         completed = status_data.get("completed", 0)
+                         total = status_data.get("total", 0)
+                         logger.debug(
+                             f"firecrawl_crawl: Crawl in progress: {completed}/{total} pages"
+                         )
+
+                         # Wait 5 seconds before next poll
+                         await asyncio.sleep(5)
+                         poll_count += 1
+
+                     else:
+                         # Unknown status
+                         logger.warning(
+                             f"firecrawl_crawl: Unknown crawl status: {status}"
+                         )
+                         await asyncio.sleep(5)
+                         poll_count += 1
+
+                 # If we've exceeded max polls, return partial results
+                 return f"Crawl timeout: The crawl of {url} is taking longer than expected. Please try again later or reduce the crawl limit."
+
+         except httpx.TimeoutException:
+             logger.error(f"firecrawl_crawl: Timeout crawling URL: {url}")
+             return (
+                 f"Timeout error: The request to crawl {url} took too long to complete."
+             )
+         except Exception as e:
+             logger.error(f"firecrawl_crawl: Error crawling URL: {e}", exc_info=True)
+             return f"An error occurred while crawling the URL: {str(e)}"
Binary file
@@ -0,0 +1,123 @@
+ import logging
+ from typing import Type
+
+ from langchain_core.runnables import RunnableConfig
+ from pydantic import BaseModel, Field
+
+ from intentkit.skills.firecrawl.base import FirecrawlBaseTool
+
+ logger = logging.getLogger(__name__)
+
+
+ class FirecrawlQueryInput(BaseModel):
+     """Input for Firecrawl query tool."""
+
+     query: str = Field(
+         description="Question or query to search in the indexed content",
+         min_length=1,
+         max_length=500,
+     )
+     max_results: int = Field(
+         description="Maximum number of relevant documents to return (default: 4)",
+         default=4,
+         ge=1,
+         le=10,
+     )
+
+
+ class FirecrawlQueryIndexedContent(FirecrawlBaseTool):
+     """Tool for querying previously indexed Firecrawl content.
+
+     This tool searches through content that was previously scraped and indexed
+     using the firecrawl_scrape or firecrawl_crawl tools to answer questions or find relevant information.
+     """
+
+     name: str = "firecrawl_query_indexed_content"
+     description: str = (
+         "Query previously indexed Firecrawl content to find relevant information and answer questions.\n"
+         "Use this tool to search through content that was previously scraped and indexed using Firecrawl tools.\n"
+         "This tool can help answer questions based on the indexed web content from Firecrawl scraping/crawling."
+     )
+     args_schema: Type[BaseModel] = FirecrawlQueryInput
+
+     async def _arun(
+         self,
+         query: str,
+         max_results: int = 4,
+         config: RunnableConfig = None,
+         **kwargs,
+     ) -> str:
+         """Query the indexed Firecrawl content."""
+         try:
+             # Get agent context - throw error if not available
+             if not config:
+                 raise ValueError("Configuration is required but not provided")
+
+             context = self.context_from_config(config)
+             if not context or not context.agent or not context.agent.id:
+                 raise ValueError("Agent ID is required but not found in configuration")
+
+             agent_id = context.agent.id
+
+             logger.info(f"[{agent_id}] Starting Firecrawl query operation: '{query}'")
+
+             # Import query utilities from firecrawl utils
+             from intentkit.skills.firecrawl.utils import (
+                 FirecrawlDocumentProcessor,
+                 query_indexed_content,
+             )
+
+             # Query the indexed content
+             docs = await query_indexed_content(
+                 query, agent_id, self.skill_store, max_results
+             )
+
+             if not docs:
+                 logger.info(f"[{agent_id}] No relevant documents found for query")
+                 return f"No relevant information found for your query: '{query}'. The indexed content may not contain information related to your search."
+
+             # Format results
+             results = []
+             for i, doc in enumerate(docs, 1):
+                 # Sanitize content to prevent database storage errors
+                 content = FirecrawlDocumentProcessor.sanitize_for_database(
+                     doc.page_content.strip()
+                 )
+                 source = doc.metadata.get("source", "Unknown")
+                 source_type = doc.metadata.get("source_type", "unknown")
+
+                 # Add source type indicator for Firecrawl content
+                 if source_type.startswith("firecrawl"):
+                     source_indicator = (
+                         f"[Firecrawl {source_type.replace('firecrawl_', '').title()}]"
+                     )
+                 else:
+                     source_indicator = ""
+
+                 results.append(
+                     f"**Source {i}:** {source} {source_indicator}\n{content}"
+                 )
+
+             response = "\n\n".join(results)
+             logger.info(
+                 f"[{agent_id}] Firecrawl query completed successfully, returning {len(response)} chars"
+             )
+
+             return response
+
+         except Exception as e:
+             # Extract agent_id for error logging if possible
+             agent_id = "UNKNOWN"
+             try:
+                 if config:
+                     context = self.context_from_config(config)
+                     if context and context.agent and context.agent.id:
+                         agent_id = context.agent.id
+             except Exception:
+                 pass
+
+             logger.error(
+                 f"[{agent_id}] Error in FirecrawlQueryIndexedContent: {e}",
+                 exc_info=True,
+             )
+             raise type(e)(f"[agent:{agent_id}]: {e}") from e
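
The query skill returns matches in a plain "**Source N:** url [Firecrawl ...]" layout. The helper below reproduces only that formatting step for documents shaped the way firecrawl_crawl stores them (metadata keys "source" and "source_type"); vector-store retrieval and the database sanitization step are omitted, so this is a sketch of the output format rather than the skill itself.

from langchain_core.documents import Document


def format_results(docs: list[Document]) -> str:
    # Mirrors the result layout built in FirecrawlQueryIndexedContent._arun
    # (retrieval and content sanitization are omitted here).
    results = []
    for i, doc in enumerate(docs, 1):
        source = doc.metadata.get("source", "Unknown")
        source_type = doc.metadata.get("source_type", "unknown")
        indicator = (
            f"[Firecrawl {source_type.replace('firecrawl_', '').title()}]"
            if source_type.startswith("firecrawl")
            else ""
        )
        results.append(
            f"**Source {i}:** {source} {indicator}\n{doc.page_content.strip()}"
        )
    return "\n\n".join(results)


# Documents shaped like those produced by firecrawl_crawl indexing:
docs = [
    Document(
        page_content="Example page text...",
        metadata={
            "source": "https://example.com/blog/post",
            "source_type": "firecrawl_crawl",
        },
    )
]
print(format_results(docs))
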
@@ -0,0 +1,137 @@
+ {
+   "$schema": "http://json-schema.org/draft-07/schema#",
+   "type": "object",
+   "title": "Firecrawl Web Scraping and Crawling",
+   "description": "AI-powered web scraping and crawling capabilities using Firecrawl",
+   "x-icon": "https://ai.service.crestal.dev/skills/firecrawl/firecrawl.png",
+   "x-tags": [
+     "Web Scraping",
+     "Crawling",
+     "Content Extraction",
+     "Data Mining",
+     "Website Analysis"
+   ],
+   "properties": {
+     "enabled": {
+       "type": "boolean",
+       "title": "Enabled",
+       "description": "Whether this skill is enabled",
+       "default": true
+     },
+     "states": {
+       "type": "object",
+       "properties": {
+         "firecrawl_scrape": {
+           "type": "string",
+           "title": "Firecrawl Scrape",
+           "enum": [
+             "disabled",
+             "public",
+             "private"
+           ],
+           "x-enum-title": [
+             "Disabled",
+             "Agent Owner + All Users",
+             "Agent Owner Only"
+           ],
+           "description": "Scrape single web pages and extract content in various formats (markdown, HTML, JSON, etc.). Handles JavaScript-rendered content, PDFs, and dynamic websites.",
+           "default": "private"
+         },
+         "firecrawl_crawl": {
+           "type": "string",
+           "title": "Firecrawl Crawl",
+           "enum": [
+             "disabled",
+             "public",
+             "private"
+           ],
+           "x-enum-title": [
+             "Disabled",
+             "Agent Owner + All Users",
+             "Agent Owner Only"
+           ],
+           "description": "Crawl entire websites and extract content from multiple pages. Can follow links, handle JavaScript-rendered content, and extract structured data from entire websites.",
+           "default": "private"
+         },
+         "firecrawl_query_indexed_content": {
+           "type": "string",
+           "title": "Query Indexed Content",
+           "enum": [
+             "disabled",
+             "public",
+             "private"
+           ],
+           "x-enum-title": [
+             "Disabled",
+             "Agent Owner + All Users",
+             "Agent Owner Only"
+           ],
+           "description": "Query previously indexed Firecrawl content to find relevant information and answer questions. Use this to search through content that was scraped and indexed using Firecrawl tools.",
+           "default": "private"
+         }
+       },
+       "description": "States for each Firecrawl skill (disabled, public, or private)"
+     },
+     "api_key_provider": {
+       "type": "string",
+       "title": "API Key Provider",
+       "description": "Provider of the API key",
+       "enum": [
+         "agent_owner"
+       ],
+       "x-enum-title": [
+         "Owner Provided"
+       ],
+       "default": "agent_owner"
+     },
+     "api_key": {
+       "type": "string",
+       "title": "Firecrawl API Key",
+       "description": "API key for Firecrawl services",
+       "x-link": "[Get your API key](https://firecrawl.dev/)",
+       "x-sensitive": true
+     },
+     "rate_limit_number": {
+       "type": "integer",
+       "title": "Rate Limit Number",
+       "description": "Number of requests allowed per time window",
+       "minimum": 1,
+       "maximum": 1000,
+       "default": 100
+     },
+     "rate_limit_minutes": {
+       "type": "integer",
+       "title": "Rate Limit Minutes",
+       "description": "Time window in minutes for rate limiting",
+       "minimum": 1,
+       "maximum": 1440,
+       "default": 60
+     }
+   },
+   "required": [
+     "states",
+     "enabled"
+   ],
+   "if": {
+     "properties": {
+       "api_key_provider": {
+         "const": "agent_owner"
+       }
+     }
+   },
+   "then": {
+     "if": {
+       "properties": {
+         "enabled": {
+           "const": true
+         }
+       }
+     },
+     "then": {
+       "required": [
+         "api_key"
+       ]
+     }
+   },
+   "additionalProperties": true
+ }
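
Per the schema above, a minimal owner-provided configuration must set "enabled", "states", and, because "api_key_provider" is "agent_owner" and the skill is enabled, an "api_key". The sketch below checks such a config with the third-party jsonschema package purely for illustration (intentkit may validate skill configs differently); the key value and file path are placeholders.

import json

from jsonschema import validate  # assumption: using the `jsonschema` package only to illustrate

# Load the schema shown above (path is a placeholder).
with open("schema.json") as f:
    schema = json.load(f)

config = {
    "enabled": True,
    "api_key_provider": "agent_owner",
    "api_key": "YOUR_FIRECRAWL_API_KEY",  # placeholder; required because the skill is enabled
    "states": {
        "firecrawl_scrape": "private",
        "firecrawl_crawl": "private",
        "firecrawl_query_indexed_content": "public",
    },
    "rate_limit_number": 100,
    "rate_limit_minutes": 60,
}

validate(instance=config, schema=schema)  # raises jsonschema.ValidationError if the config is invalid
print("config is valid")
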