intentkit 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of intentkit might be problematic.

Files changed (94)
  1. intentkit/__init__.py +1 -1
  2. intentkit/abstracts/skill.py +12 -0
  3. intentkit/clients/cdp.py +114 -16
  4. intentkit/config/config.py +12 -4
  5. intentkit/core/engine.py +39 -31
  6. intentkit/core/node.py +8 -4
  7. intentkit/core/prompt.py +5 -6
  8. intentkit/core/skill.py +11 -0
  9. intentkit/models/agent.py +2 -9
  10. intentkit/models/agent_data.py +18 -0
  11. intentkit/models/agent_schema.json +12 -0
  12. intentkit/models/chat.py +50 -0
  13. intentkit/models/skill.py +19 -0
  14. intentkit/skills/base.py +37 -17
  15. intentkit/skills/cdp/__init__.py +6 -14
  16. intentkit/skills/cdp/get_balance.py +77 -25
  17. intentkit/skills/cdp/schema.json +0 -64
  18. intentkit/skills/cryptocompare/fetch_news.py +2 -2
  19. intentkit/skills/cryptocompare/fetch_price.py +2 -2
  20. intentkit/skills/cryptocompare/fetch_top_exchanges.py +2 -2
  21. intentkit/skills/cryptocompare/fetch_top_market_cap.py +2 -2
  22. intentkit/skills/cryptocompare/fetch_top_volume.py +2 -2
  23. intentkit/skills/cryptocompare/fetch_trading_signals.py +2 -2
  24. intentkit/skills/defillama/base.py +3 -3
  25. intentkit/skills/enso/base.py +27 -4
  26. intentkit/skills/enso/networks.py +1 -1
  27. intentkit/skills/enso/route.py +24 -23
  28. intentkit/skills/enso/tokens.py +1 -1
  29. intentkit/skills/enso/wallet.py +27 -23
  30. intentkit/skills/firecrawl/README.md +211 -0
  31. intentkit/skills/firecrawl/__init__.py +107 -0
  32. intentkit/skills/firecrawl/base.py +28 -0
  33. intentkit/skills/firecrawl/clear.py +87 -0
  34. intentkit/skills/firecrawl/crawl.py +399 -0
  35. intentkit/skills/firecrawl/firecrawl.png +0 -0
  36. intentkit/skills/firecrawl/query.py +123 -0
  37. intentkit/skills/firecrawl/schema.json +153 -0
  38. intentkit/skills/firecrawl/scrape.py +318 -0
  39. intentkit/skills/firecrawl/utils.py +306 -0
  40. intentkit/skills/heurist/image_generation_animagine_xl.py +1 -1
  41. intentkit/skills/heurist/image_generation_arthemy_comics.py +1 -1
  42. intentkit/skills/heurist/image_generation_arthemy_real.py +1 -1
  43. intentkit/skills/heurist/image_generation_braindance.py +1 -1
  44. intentkit/skills/heurist/image_generation_cyber_realistic_xl.py +1 -1
  45. intentkit/skills/heurist/image_generation_flux_1_dev.py +1 -1
  46. intentkit/skills/heurist/image_generation_sdxl.py +1 -1
  47. intentkit/skills/http/README.md +78 -0
  48. intentkit/skills/http/__init__.py +100 -0
  49. intentkit/skills/http/base.py +21 -0
  50. intentkit/skills/http/get.py +96 -0
  51. intentkit/skills/http/http.svg +15 -0
  52. intentkit/skills/http/post.py +113 -0
  53. intentkit/skills/http/put.py +113 -0
  54. intentkit/skills/http/schema.json +80 -0
  55. intentkit/skills/lifi/token_execute.py +1 -1
  56. intentkit/skills/openai/dalle_image_generation.py +1 -1
  57. intentkit/skills/openai/gpt_image_generation.py +1 -1
  58. intentkit/skills/openai/gpt_image_to_image.py +1 -1
  59. intentkit/skills/supabase/__init__.py +116 -0
  60. intentkit/skills/supabase/base.py +72 -0
  61. intentkit/skills/supabase/delete_data.py +102 -0
  62. intentkit/skills/supabase/fetch_data.py +120 -0
  63. intentkit/skills/supabase/insert_data.py +70 -0
  64. intentkit/skills/supabase/invoke_function.py +74 -0
  65. intentkit/skills/supabase/schema.json +170 -0
  66. intentkit/skills/supabase/supabase.svg +15 -0
  67. intentkit/skills/supabase/update_data.py +105 -0
  68. intentkit/skills/supabase/upsert_data.py +77 -0
  69. intentkit/skills/system/read_agent_api_key.py +1 -1
  70. intentkit/skills/system/regenerate_agent_api_key.py +1 -1
  71. intentkit/skills/token/base.py +1 -39
  72. intentkit/skills/twitter/follow_user.py +3 -3
  73. intentkit/skills/twitter/get_mentions.py +6 -6
  74. intentkit/skills/twitter/get_timeline.py +5 -5
  75. intentkit/skills/twitter/get_user_by_username.py +3 -3
  76. intentkit/skills/twitter/get_user_tweets.py +5 -5
  77. intentkit/skills/twitter/like_tweet.py +3 -3
  78. intentkit/skills/twitter/post_tweet.py +4 -4
  79. intentkit/skills/twitter/reply_tweet.py +4 -4
  80. intentkit/skills/twitter/retweet.py +3 -3
  81. intentkit/skills/twitter/search_tweets.py +5 -5
  82. intentkit/skills/unrealspeech/text_to_speech.py +1 -1
  83. intentkit/skills/web_scraper/README.md +35 -4
  84. intentkit/skills/web_scraper/__init__.py +16 -0
  85. intentkit/skills/web_scraper/document_indexer.py +143 -0
  86. intentkit/skills/web_scraper/schema.json +28 -0
  87. intentkit/skills/web_scraper/scrape_and_index.py +135 -200
  88. intentkit/skills/web_scraper/utils.py +684 -0
  89. intentkit/skills/web_scraper/website_indexer.py +456 -0
  90. intentkit/utils/logging.py +1 -1
  91. {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/METADATA +1 -1
  92. {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/RECORD +94 -63
  93. {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/WHEEL +0 -0
  94. {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/licenses/LICENSE +0 -0

intentkit/skills/firecrawl/scrape.py
@@ -0,0 +1,318 @@
+ import logging
+ from typing import List, Optional, Type
+
+ import httpx
+ from langchain_core.documents import Document
+ from langchain_core.runnables import RunnableConfig
+ from pydantic import BaseModel, Field
+
+ from intentkit.skills.firecrawl.base import FirecrawlBaseTool
+
+ logger = logging.getLogger(__name__)
+
+
+ class FirecrawlScrapeInput(BaseModel):
+     """Input for Firecrawl scrape tool."""
+
+     url: str = Field(
+         description="The URL to scrape. Must be a valid HTTP or HTTPS URL."
+     )
+     formats: List[str] = Field(
+         description="Output formats to include in the response. Options: 'markdown', 'html', 'rawHtml', 'screenshot', 'links', 'json'",
+         default=["markdown"],
+     )
+     only_main_content: bool = Field(
+         description="Whether to extract only the main content (excluding headers, footers, navigation, etc.)",
+         default=True,
+     )
+     include_tags: Optional[List[str]] = Field(
+         description="HTML tags, classes, or IDs to include in the response (e.g., ['h1', 'p', '.main-content'])",
+         default=None,
+     )
+     exclude_tags: Optional[List[str]] = Field(
+         description="HTML tags, classes, or IDs to exclude from the response (e.g., ['#ad', '#footer'])",
+         default=None,
+     )
+     wait_for: int = Field(
+         description="Wait time in milliseconds before scraping (use only as last resort)",
+         default=0,
+         ge=0,
+     )
+     timeout: int = Field(
+         description="Maximum timeout in milliseconds for the scraping operation",
+         default=30000,
+         ge=1000,
+         le=120000,
+     )
+     index_content: bool = Field(
+         description="Whether to index the scraped content for later querying (default: True)",
+         default=True,
+     )
+     chunk_size: int = Field(
+         description="Size of text chunks for indexing (default: 1000)",
+         default=1000,
+         ge=100,
+         le=4000,
+     )
+     chunk_overlap: int = Field(
+         description="Overlap between chunks (default: 200)",
+         default=200,
+         ge=0,
+         le=1000,
+     )
+
+
+ class FirecrawlScrape(FirecrawlBaseTool):
+     """Tool for scraping web pages using Firecrawl.
+
+     This tool uses Firecrawl's API to scrape web pages and convert them into clean,
+     LLM-ready formats like markdown, HTML, or structured JSON data.
+
+     Attributes:
+         name: The name of the tool.
+         description: A description of what the tool does.
+         args_schema: The schema for the tool's input arguments.
+     """
+
+     name: str = "firecrawl_scrape"
+     description: str = (
+         "Scrape a single web page and extract its content in various formats (markdown, HTML, JSON, etc.). "
+         "This tool can handle JavaScript-rendered content, PDFs, and dynamic websites. "
+         "Optionally indexes the content for later querying using the firecrawl_query_indexed_content tool. "
+         "Use this when you need to extract clean, structured content from a specific URL."
+     )
+     args_schema: Type[BaseModel] = FirecrawlScrapeInput
+
+     async def _arun(
+         self,
+         url: str,
+         formats: List[str] = None,
+         only_main_content: bool = True,
+         include_tags: Optional[List[str]] = None,
+         exclude_tags: Optional[List[str]] = None,
+         wait_for: int = 0,
+         timeout: int = 30000,
+         index_content: bool = True,
+         chunk_size: int = 1000,
+         chunk_overlap: int = 200,
+         config: RunnableConfig = None,
+         **kwargs,
+     ) -> str:
+         """Implementation of the Firecrawl scrape tool.
+
+         Args:
+             url: The URL to scrape.
+             formats: Output formats to include in the response.
+             only_main_content: Whether to extract only main content.
+             include_tags: HTML tags/classes/IDs to include.
+             exclude_tags: HTML tags/classes/IDs to exclude.
+             wait_for: Wait time in milliseconds before scraping.
+             timeout: Maximum timeout in milliseconds.
+             index_content: Whether to index the content for later querying.
+             chunk_size: Size of text chunks for indexing.
+             chunk_overlap: Overlap between chunks.
+             config: The configuration for the tool call.
+
+         Returns:
+             str: Formatted scraped content based on the requested formats.
+         """
+         context = self.context_from_config(config)
+         logger.debug(f"firecrawl_scrape: Running scrape with context {context}")
+
+         if context.config.get("api_key_provider") == "agent_owner":
+             if context.config.get("rate_limit_number") and context.config.get(
+                 "rate_limit_minutes"
+             ):
+                 await self.user_rate_limit_by_category(
+                     context.user_id,
+                     context.config["rate_limit_number"],
+                     context.config["rate_limit_minutes"],
+                 )
+
+         # Get the API key from the agent's configuration
+         api_key = self.get_api_key(context)
+         if not api_key:
+             return "Error: No Firecrawl API key provided in the configuration."
+
+         # Validate and set defaults
+         if formats is None:
+             formats = ["markdown"]
+
+         # Validate formats
+         valid_formats = ["markdown", "html", "rawHtml", "screenshot", "links", "json"]
+         formats = [f for f in formats if f in valid_formats]
+         if not formats:
+             formats = ["markdown"]
+
+         # Prepare the request payload
+         payload = {
+             "url": url,
+             "formats": formats,
+             "onlyMainContent": only_main_content,
+             "timeout": timeout,
+         }
+
+         if include_tags:
+             payload["includeTags"] = include_tags
+         if exclude_tags:
+             payload["excludeTags"] = exclude_tags
+         if wait_for > 0:
+             payload["waitFor"] = wait_for
+
+         # Call Firecrawl scrape API
+         try:
+             async with httpx.AsyncClient(timeout=timeout / 1000 + 10) as client:
+                 response = await client.post(
+                     "https://api.firecrawl.dev/v1/scrape",
+                     json=payload,
+                     headers={
+                         "Authorization": f"Bearer {api_key}",
+                         "Content-Type": "application/json",
+                     },
+                 )
+
+                 if response.status_code != 200:
+                     logger.error(
+                         f"firecrawl_scrape: Error from Firecrawl API: {response.status_code} - {response.text}"
+                     )
+                     return (
+                         f"Error scraping URL: {response.status_code} - {response.text}"
+                     )
+
+                 data = response.json()
+
+                 if not data.get("success"):
+                     error_msg = data.get("error", "Unknown error occurred")
+                     return f"Error scraping URL: {error_msg}"
+
+                 result_data = data.get("data", {})
+
+                 # Format the results based on requested formats
+                 formatted_result = f"Successfully scraped: {url}\n\n"
+
+                 if "markdown" in formats and result_data.get("markdown"):
+                     formatted_result += "## Markdown Content\n"
+                     formatted_result += result_data["markdown"][:2000]  # Limit length
+                     if len(result_data["markdown"]) > 2000:
+                         formatted_result += "... (content truncated)"
+                     formatted_result += "\n\n"
+
+                 if "html" in formats and result_data.get("html"):
+                     formatted_result += "## HTML Content\n"
+                     formatted_result += f"HTML content available ({len(result_data['html'])} characters)\n\n"
+
+                 if "links" in formats and result_data.get("links"):
+                     formatted_result += "## Extracted Links\n"
+                     links = result_data["links"][:10]  # Limit to first 10 links
+                     for link in links:
+                         formatted_result += f"- {link}\n"
+                     if len(result_data["links"]) > 10:
+                         formatted_result += (
+                             f"... and {len(result_data['links']) - 10} more links\n"
+                         )
+                     formatted_result += "\n"
+
+                 if "json" in formats and result_data.get("json"):
+                     formatted_result += "## Structured Data (JSON)\n"
+                     formatted_result += str(result_data["json"])[:1000]  # Limit length
+                     if len(str(result_data["json"])) > 1000:
+                         formatted_result += "... (data truncated)"
+                     formatted_result += "\n\n"
+
+                 if "screenshot" in formats and result_data.get("screenshot"):
+                     formatted_result += "## Screenshot\n"
+                     formatted_result += (
+                         f"Screenshot available at: {result_data['screenshot']}\n\n"
+                     )
+
+                 # Add metadata information
+                 metadata = result_data.get("metadata", {})
+                 if metadata:
+                     formatted_result += "## Page Metadata\n"
+                     if metadata.get("title"):
+                         formatted_result += f"Title: {metadata['title']}\n"
+                     if metadata.get("description"):
+                         formatted_result += f"Description: {metadata['description']}\n"
+                     if metadata.get("language"):
+                         formatted_result += f"Language: {metadata['language']}\n"
+                     formatted_result += "\n"
+
+                 # Index content if requested
+                 if index_content and result_data.get("markdown"):
+                     try:
+                         # Import indexing utilities from firecrawl utils
+                         from intentkit.skills.firecrawl.utils import (
+                             FirecrawlMetadataManager,
+                             index_documents,
+                         )
+
+                         # Create document from scraped content
+                         document = Document(
+                             page_content=result_data["markdown"],
+                             metadata={
+                                 "source": url,
+                                 "title": metadata.get("title", ""),
+                                 "description": metadata.get("description", ""),
+                                 "language": metadata.get("language", ""),
+                                 "source_type": "firecrawl_scrape",
+                                 "indexed_at": str(context.agent_id),
+                             },
+                         )
+
+                         # Get agent ID for indexing
+                         agent_id = context.agent_id
+                         if agent_id:
+                             # Index the document
+                             total_chunks, was_merged = await index_documents(
+                                 [document],
+                                 agent_id,
+                                 self.skill_store,
+                                 chunk_size,
+                                 chunk_overlap,
+                             )
+
+                             # Update metadata
+                             metadata_manager = FirecrawlMetadataManager(
+                                 self.skill_store
+                             )
+                             new_metadata = metadata_manager.create_url_metadata(
+                                 [url], [document], "firecrawl_scrape"
+                             )
+                             await metadata_manager.update_metadata(
+                                 agent_id, new_metadata
+                             )
+
+                             formatted_result += "\n## Content Indexing\n"
+                             formatted_result += (
+                                 "Successfully indexed content into vector store:\n"
+                             )
+                             formatted_result += f"- Chunks created: {total_chunks}\n"
+                             formatted_result += f"- Chunk size: {chunk_size}\n"
+                             formatted_result += f"- Chunk overlap: {chunk_overlap}\n"
+                             formatted_result += f"- Content merged with existing: {'Yes' if was_merged else 'No'}\n"
+                             formatted_result += "Use the 'firecrawl_query_indexed_content' skill to search this content.\n"
+
+                             logger.info(
+                                 f"firecrawl_scrape: Successfully indexed {url} with {total_chunks} chunks"
+                             )
+                         else:
+                             formatted_result += "\n## Content Indexing\n"
+                             formatted_result += "Warning: Could not index content - agent ID not available.\n"
+
+                     except Exception as index_error:
+                         logger.error(
+                             f"firecrawl_scrape: Error indexing content: {index_error}"
+                         )
+                         formatted_result += "\n## Content Indexing\n"
+                         formatted_result += f"Warning: Failed to index content for later querying: {str(index_error)}\n"
+
+                 return formatted_result.strip()
+
+         except httpx.TimeoutException:
+             logger.error(f"firecrawl_scrape: Timeout scraping URL: {url}")
+             return (
+                 f"Timeout error: The request to scrape {url} took too long to complete."
+             )
+         except Exception as e:
+             logger.error(f"firecrawl_scrape: Error scraping URL: {e}", exc_info=True)
+             return f"An error occurred while scraping the URL: {str(e)}"

intentkit/skills/firecrawl/utils.py
@@ -0,0 +1,306 @@
+ """Utilities for Firecrawl skill content indexing and querying."""
+
+ import logging
+ import re
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.documents import Document
+ from langchain_openai import OpenAIEmbeddings
+
+ from intentkit.abstracts.skill import SkillStoreABC
+
+ logger = logging.getLogger(__name__)
+
+
+ class FirecrawlDocumentProcessor:
+     """Handles document processing and sanitization for Firecrawl content."""
+
+     @staticmethod
+     def sanitize_for_database(text: str) -> str:
+         """Sanitize text content to prevent database storage errors."""
+         if not text:
+             return ""
+
+         # Remove null bytes and other problematic characters
+         text = text.replace("\x00", "")
+         text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text)
+
+         # Normalize whitespace
+         text = re.sub(r"\s+", " ", text)
+         text = text.strip()
+
+         return text
+
+     @staticmethod
+     def split_documents(
+         documents: List[Document], chunk_size: int = 1000, chunk_overlap: int = 200
+     ) -> List[Document]:
+         """Split documents into smaller chunks for better indexing."""
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             length_function=len,
+         )
+
+         split_docs = []
+         for doc in documents:
+             # Sanitize content before splitting
+             sanitized_content = FirecrawlDocumentProcessor.sanitize_for_database(
+                 doc.page_content
+             )
+             doc.page_content = sanitized_content
+
+             # Split the document
+             chunks = text_splitter.split_documents([doc])
+             split_docs.extend(chunks)
+
+         return split_docs
+
+
+ class FirecrawlVectorStoreManager:
+     """Manages vector store operations for Firecrawl content."""
+
+     def __init__(self, skill_store: SkillStoreABC):
+         self.skill_store = skill_store
+
+     def create_embeddings(self) -> OpenAIEmbeddings:
+         """Create OpenAI embeddings instance."""
+         openai_api_key = self.skill_store.get_system_config("openai_api_key")
+         if not openai_api_key:
+             raise ValueError("OpenAI API key not found in system configuration")
+
+         return OpenAIEmbeddings(
+             openai_api_key=openai_api_key, model="text-embedding-3-small"
+         )
+
+     def encode_vector_store(self, vector_store: FAISS) -> Dict[str, str]:
+         """Encode FAISS vector store to base64 for storage (compatible with web_scraper)."""
+         import base64
+         import os
+         import tempfile
+
+         try:
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 vector_store.save_local(temp_dir)
+
+                 encoded_files = {}
+                 for filename in os.listdir(temp_dir):
+                     file_path = os.path.join(temp_dir, filename)
+                     if os.path.isfile(file_path):
+                         with open(file_path, "rb") as f:
+                             encoded_files[filename] = base64.b64encode(f.read()).decode(
+                                 "utf-8"
+                             )
+
+                 return encoded_files
+         except Exception as e:
+             logger.error(f"Error encoding vector store: {e}")
+             raise
+
+     def decode_vector_store(
+         self, encoded_files: Dict[str, str], embeddings: OpenAIEmbeddings
+     ) -> FAISS:
+         """Decode base64 files back to FAISS vector store (compatible with web_scraper)."""
+         import base64
+         import os
+         import tempfile
+
+         try:
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 # Decode and write files
+                 for filename, encoded_content in encoded_files.items():
+                     file_path = os.path.join(temp_dir, filename)
+                     with open(file_path, "wb") as f:
+                         f.write(base64.b64decode(encoded_content))
+
+                 # Load vector store
+                 return FAISS.load_local(
+                     temp_dir,
+                     embeddings,
+                     allow_dangerous_deserialization=True,
+                 )
+         except Exception as e:
+             logger.error(f"Error decoding vector store: {e}")
+             raise
+
+     async def load_vector_store(self, agent_id: str) -> Optional[FAISS]:
+         """Load existing vector store for an agent."""
+         try:
+             vector_store_key = f"vector_store_{agent_id}"
+             stored_data = await self.skill_store.get_agent_skill_data(
+                 agent_id, "web_scraper", vector_store_key
+             )
+
+             if not stored_data or "faiss_files" not in stored_data:
+                 return None
+
+             embeddings = self.create_embeddings()
+             return self.decode_vector_store(stored_data["faiss_files"], embeddings)
+
+         except Exception as e:
+             logger.error(f"Error loading vector store for agent {agent_id}: {e}")
+             return None
+
+     async def save_vector_store(
+         self,
+         agent_id: str,
+         vector_store: FAISS,
+         chunk_size: int = 1000,
+         chunk_overlap: int = 200,
+     ) -> None:
+         """Save vector store for an agent (compatible with web_scraper format)."""
+         try:
+             vector_store_key = f"vector_store_{agent_id}"
+             encoded_files = self.encode_vector_store(vector_store)
+
+             # Use the same data structure as web_scraper
+             storage_data = {
+                 "faiss_files": encoded_files,
+                 "chunk_size": chunk_size,
+                 "chunk_overlap": chunk_overlap,
+             }
+
+             await self.skill_store.save_agent_skill_data(
+                 agent_id, "web_scraper", vector_store_key, storage_data
+             )
+
+         except Exception as e:
+             logger.error(f"Error saving vector store for agent {agent_id}: {e}")
+             raise
+
+
+ class FirecrawlMetadataManager:
+     """Manages metadata for Firecrawl indexed content."""
+
+     def __init__(self, skill_store: SkillStoreABC):
+         self.skill_store = skill_store
+
+     def create_url_metadata(
+         self, urls: List[str], documents: List[Document], source_type: str
+     ) -> Dict[str, Any]:
+         """Create metadata for indexed URLs."""
+         return {
+             "urls": urls,
+             "document_count": len(documents),
+             "source_type": source_type,
+             "indexed_at": str(len(urls)),  # Simple counter
+         }
+
+     async def update_metadata(
+         self, agent_id: str, new_metadata: Dict[str, Any]
+     ) -> None:
+         """Update metadata for an agent."""
+         try:
+             metadata_key = f"indexed_urls_{agent_id}"
+             await self.skill_store.save_agent_skill_data(
+                 agent_id, "web_scraper", metadata_key, new_metadata
+             )
+         except Exception as e:
+             logger.error(f"Error updating metadata for agent {agent_id}: {e}")
+             raise
+
+
+ async def index_documents(
+     documents: List[Document],
+     agent_id: str,
+     skill_store: SkillStoreABC,
+     chunk_size: int = 1000,
+     chunk_overlap: int = 200,
+ ) -> Tuple[int, bool]:
+     """
+     Index documents into the Firecrawl vector store.
+
+     Args:
+         documents: List of documents to index
+         agent_id: Agent ID for storage
+         skill_store: Skill store for persistence
+         chunk_size: Size of text chunks
+         chunk_overlap: Overlap between chunks
+
+     Returns:
+         Tuple of (total_chunks, was_merged_with_existing)
+     """
+     try:
+         # Initialize managers
+         vs_manager = FirecrawlVectorStoreManager(skill_store)
+
+         # Split documents into chunks
+         split_docs = FirecrawlDocumentProcessor.split_documents(
+             documents, chunk_size, chunk_overlap
+         )
+
+         if not split_docs:
+             logger.warning("No documents to index after splitting")
+             return 0, False
+
+         # Create embeddings
+         embeddings = vs_manager.create_embeddings()
+
+         # Try to load existing vector store
+         existing_vector_store = await vs_manager.load_vector_store(agent_id)
+
+         if existing_vector_store:
+             # Add to existing vector store
+             existing_vector_store.add_documents(split_docs)
+             vector_store = existing_vector_store
+             was_merged = True
+         else:
+             # Create new vector store
+             vector_store = FAISS.from_documents(split_docs, embeddings)
+             was_merged = False
+
+         # Save the vector store
+         await vs_manager.save_vector_store(
+             agent_id, vector_store, chunk_size, chunk_overlap
+         )
+
+         logger.info(
+             f"Successfully indexed {len(split_docs)} chunks for agent {agent_id}"
+         )
+         return len(split_docs), was_merged
+
+     except Exception as e:
+         logger.error(f"Error indexing documents for agent {agent_id}: {e}")
+         raise
+
+
+ async def query_indexed_content(
+     query: str,
+     agent_id: str,
+     skill_store: SkillStoreABC,
+     max_results: int = 4,
+ ) -> List[Document]:
+     """
+     Query the Firecrawl indexed content.
+
+     Args:
+         query: Search query
+         agent_id: Agent ID
+         skill_store: Skill store for persistence
+         max_results: Maximum number of results to return
+
+     Returns:
+         List of relevant documents
+     """
+     try:
+         # Initialize vector store manager
+         vs_manager = FirecrawlVectorStoreManager(skill_store)
+
+         # Load vector store
+         vector_store = await vs_manager.load_vector_store(agent_id)
+
+         if not vector_store:
+             logger.warning(f"No vector store found for agent {agent_id}")
+             return []
+
+         # Perform similarity search
+         docs = vector_store.similarity_search(query, k=max_results)
+
+         logger.info(f"Found {len(docs)} documents for query: {query}")
+         return docs
+
+     except Exception as e:
+         logger.error(f"Error querying indexed content for agent {agent_id}: {e}")
+         raise
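
The module-level helpers above are what scrape.py calls when index_content is enabled. The sketch below shows the intended round trip under stated assumptions: the function names, argument order, and the shared "web_scraper" storage namespace come from the diff, while the skill_store argument stands in for whatever SkillStoreABC implementation the intentkit runtime injects and is only a placeholder here.

# Hedged usage sketch of index_documents / query_indexed_content from utils.py above.
# `skill_store` is a placeholder for the runtime-provided SkillStoreABC implementation.
from langchain_core.documents import Document

from intentkit.abstracts.skill import SkillStoreABC
from intentkit.skills.firecrawl.utils import index_documents, query_indexed_content


async def index_then_query(skill_store: SkillStoreABC, agent_id: str) -> None:
    doc = Document(
        page_content="Firecrawl converts pages into LLM-ready markdown.",
        metadata={"source": "https://example.com", "source_type": "firecrawl_scrape"},
    )

    # Chunk, embed, and persist in the agent-scoped vector store
    # (saved under the shared "web_scraper" skill data keys).
    total_chunks, was_merged = await index_documents(
        [doc], agent_id, skill_store, chunk_size=1000, chunk_overlap=200
    )
    print(f"indexed {total_chunks} chunks (merged with existing store: {was_merged})")

    # Later, similarity-search the same store.
    hits = await query_indexed_content(
        "What does Firecrawl produce?", agent_id, skill_store, max_results=4
    )
    for hit in hits:
        print(hit.metadata.get("source"), hit.page_content[:80])

Because both the firecrawl and web_scraper skills write to the same vector_store_{agent_id} key, content indexed here should also be visible to the web_scraper query tools.
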

intentkit/skills/heurist/image_generation_animagine_xl.py
@@ -137,7 +137,7 @@ class ImageGenerationAnimagineXL(HeuristBaseTool):
  # Store the image URL
  image_url = response.text.strip('"')
  # Generate a key with agent ID as prefix
- image_key = f"{context.agent.id}/heurist/{job_id}"
+ image_key = f"{context.agent_id}/heurist/{job_id}"
  # Store the image and get the CDN URL
  stored_url = await store_image(image_url, image_key)

intentkit/skills/heurist/image_generation_arthemy_comics.py
@@ -137,7 +137,7 @@ class ImageGenerationArthemyComics(HeuristBaseTool):
  # Store the image URL
  image_url = response.text.strip('"')
  # Generate a key with agent ID as prefix
- image_key = f"{context.agent.id}/heurist/{job_id}"
+ image_key = f"{context.agent_id}/heurist/{job_id}"
  # Store the image and get the CDN URL
  stored_url = await store_image(image_url, image_key)

intentkit/skills/heurist/image_generation_arthemy_real.py
@@ -137,7 +137,7 @@ class ImageGenerationArthemyReal(HeuristBaseTool):
  # Store the image URL
  image_url = response.text.strip('"')
  # Generate a key with agent ID as prefix
- image_key = f"{context.agent.id}/heurist/{job_id}"
+ image_key = f"{context.agent_id}/heurist/{job_id}"
  # Store the image and get the CDN URL
  stored_url = await store_image(image_url, image_key)

intentkit/skills/heurist/image_generation_braindance.py
@@ -137,7 +137,7 @@ class ImageGenerationBrainDance(HeuristBaseTool):
  # Store the image URL
  image_url = response.text.strip('"')
  # Generate a key with agent ID as prefix
- image_key = f"{context.agent.id}/heurist/{job_id}"
+ image_key = f"{context.agent_id}/heurist/{job_id}"
  # Store the image and get the CDN URL
  stored_url = await store_image(image_url, image_key)

intentkit/skills/heurist/image_generation_cyber_realistic_xl.py
@@ -137,7 +137,7 @@ class ImageGenerationCyberRealisticXL(HeuristBaseTool):
  # Store the image URL
  image_url = response.text.strip('"')
  # Generate a key with agent ID as prefix
- image_key = f"{context.agent.id}/heurist/{job_id}"
+ image_key = f"{context.agent_id}/heurist/{job_id}"
  # Store the image and get the CDN URL
  stored_url = await store_image(image_url, image_key)

intentkit/skills/heurist/image_generation_flux_1_dev.py
@@ -137,7 +137,7 @@ class ImageGenerationFlux1Dev(HeuristBaseTool):
  # Store the image URL
  image_url = response.text.strip('"')
  # Generate a key with agent ID as prefix
- image_key = f"{context.agent.id}/heurist/{job_id}"
+ image_key = f"{context.agent_id}/heurist/{job_id}"
  # Store the image and get the CDN URL
  stored_url = await store_image(image_url, image_key)