intentkit 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of intentkit might be problematic.

Files changed (94)
  1. intentkit/__init__.py +1 -1
  2. intentkit/abstracts/skill.py +12 -0
  3. intentkit/clients/cdp.py +114 -16
  4. intentkit/config/config.py +12 -4
  5. intentkit/core/engine.py +39 -31
  6. intentkit/core/node.py +8 -4
  7. intentkit/core/prompt.py +5 -6
  8. intentkit/core/skill.py +11 -0
  9. intentkit/models/agent.py +2 -9
  10. intentkit/models/agent_data.py +18 -0
  11. intentkit/models/agent_schema.json +12 -0
  12. intentkit/models/chat.py +50 -0
  13. intentkit/models/skill.py +19 -0
  14. intentkit/skills/base.py +37 -17
  15. intentkit/skills/cdp/__init__.py +6 -14
  16. intentkit/skills/cdp/get_balance.py +77 -25
  17. intentkit/skills/cdp/schema.json +0 -64
  18. intentkit/skills/cryptocompare/fetch_news.py +2 -2
  19. intentkit/skills/cryptocompare/fetch_price.py +2 -2
  20. intentkit/skills/cryptocompare/fetch_top_exchanges.py +2 -2
  21. intentkit/skills/cryptocompare/fetch_top_market_cap.py +2 -2
  22. intentkit/skills/cryptocompare/fetch_top_volume.py +2 -2
  23. intentkit/skills/cryptocompare/fetch_trading_signals.py +2 -2
  24. intentkit/skills/defillama/base.py +3 -3
  25. intentkit/skills/enso/base.py +27 -4
  26. intentkit/skills/enso/networks.py +1 -1
  27. intentkit/skills/enso/route.py +24 -23
  28. intentkit/skills/enso/tokens.py +1 -1
  29. intentkit/skills/enso/wallet.py +27 -23
  30. intentkit/skills/firecrawl/README.md +211 -0
  31. intentkit/skills/firecrawl/__init__.py +107 -0
  32. intentkit/skills/firecrawl/base.py +28 -0
  33. intentkit/skills/firecrawl/clear.py +87 -0
  34. intentkit/skills/firecrawl/crawl.py +399 -0
  35. intentkit/skills/firecrawl/firecrawl.png +0 -0
  36. intentkit/skills/firecrawl/query.py +123 -0
  37. intentkit/skills/firecrawl/schema.json +153 -0
  38. intentkit/skills/firecrawl/scrape.py +318 -0
  39. intentkit/skills/firecrawl/utils.py +306 -0
  40. intentkit/skills/heurist/image_generation_animagine_xl.py +1 -1
  41. intentkit/skills/heurist/image_generation_arthemy_comics.py +1 -1
  42. intentkit/skills/heurist/image_generation_arthemy_real.py +1 -1
  43. intentkit/skills/heurist/image_generation_braindance.py +1 -1
  44. intentkit/skills/heurist/image_generation_cyber_realistic_xl.py +1 -1
  45. intentkit/skills/heurist/image_generation_flux_1_dev.py +1 -1
  46. intentkit/skills/heurist/image_generation_sdxl.py +1 -1
  47. intentkit/skills/http/README.md +78 -0
  48. intentkit/skills/http/__init__.py +100 -0
  49. intentkit/skills/http/base.py +21 -0
  50. intentkit/skills/http/get.py +96 -0
  51. intentkit/skills/http/http.svg +15 -0
  52. intentkit/skills/http/post.py +113 -0
  53. intentkit/skills/http/put.py +113 -0
  54. intentkit/skills/http/schema.json +80 -0
  55. intentkit/skills/lifi/token_execute.py +1 -1
  56. intentkit/skills/openai/dalle_image_generation.py +1 -1
  57. intentkit/skills/openai/gpt_image_generation.py +1 -1
  58. intentkit/skills/openai/gpt_image_to_image.py +1 -1
  59. intentkit/skills/supabase/__init__.py +116 -0
  60. intentkit/skills/supabase/base.py +72 -0
  61. intentkit/skills/supabase/delete_data.py +102 -0
  62. intentkit/skills/supabase/fetch_data.py +120 -0
  63. intentkit/skills/supabase/insert_data.py +70 -0
  64. intentkit/skills/supabase/invoke_function.py +74 -0
  65. intentkit/skills/supabase/schema.json +170 -0
  66. intentkit/skills/supabase/supabase.svg +15 -0
  67. intentkit/skills/supabase/update_data.py +105 -0
  68. intentkit/skills/supabase/upsert_data.py +77 -0
  69. intentkit/skills/system/read_agent_api_key.py +1 -1
  70. intentkit/skills/system/regenerate_agent_api_key.py +1 -1
  71. intentkit/skills/token/base.py +1 -39
  72. intentkit/skills/twitter/follow_user.py +3 -3
  73. intentkit/skills/twitter/get_mentions.py +6 -6
  74. intentkit/skills/twitter/get_timeline.py +5 -5
  75. intentkit/skills/twitter/get_user_by_username.py +3 -3
  76. intentkit/skills/twitter/get_user_tweets.py +5 -5
  77. intentkit/skills/twitter/like_tweet.py +3 -3
  78. intentkit/skills/twitter/post_tweet.py +4 -4
  79. intentkit/skills/twitter/reply_tweet.py +4 -4
  80. intentkit/skills/twitter/retweet.py +3 -3
  81. intentkit/skills/twitter/search_tweets.py +5 -5
  82. intentkit/skills/unrealspeech/text_to_speech.py +1 -1
  83. intentkit/skills/web_scraper/README.md +35 -4
  84. intentkit/skills/web_scraper/__init__.py +16 -0
  85. intentkit/skills/web_scraper/document_indexer.py +143 -0
  86. intentkit/skills/web_scraper/schema.json +28 -0
  87. intentkit/skills/web_scraper/scrape_and_index.py +135 -200
  88. intentkit/skills/web_scraper/utils.py +684 -0
  89. intentkit/skills/web_scraper/website_indexer.py +456 -0
  90. intentkit/utils/logging.py +1 -1
  91. {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/METADATA +1 -1
  92. {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/RECORD +94 -63
  93. {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/WHEEL +0 -0
  94. {intentkit-0.5.2.dist-info → intentkit-0.6.0.dist-info}/licenses/LICENSE +0 -0
intentkit/skills/web_scraper/utils.py (new file)
@@ -0,0 +1,684 @@
+ """
+ Utility functions for web scraper skills.
+ 
+ This module contains common functionality used across all web scraper skills
+ to reduce code duplication and improve maintainability.
+ """
+ 
+ import asyncio
+ import base64
+ import logging
+ import os
+ import tempfile
+ from typing import Dict, List, Optional, Tuple
+ 
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.documents import Document
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ 
+ from intentkit.abstracts.skill import SkillStoreABC
+ 
+ logger = logging.getLogger(__name__)
+ 
+ # Constants
+ DEFAULT_CHUNK_SIZE = 1000
+ DEFAULT_CHUNK_OVERLAP = 200
+ DEFAULT_REQUEST_TIMEOUT = 30
+ DEFAULT_REQUESTS_PER_SECOND = 2
+ MAX_CONTENT_SIZE_MB = 10  # 10 MB limit
+ MAX_CONTENT_SIZE_BYTES = MAX_CONTENT_SIZE_MB * 1024 * 1024
+ 
+ # HTTP Headers to bypass Cloudflare and other bot protection
+ DEFAULT_HEADERS = {
+     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+     "Accept-Language": "en-US,en;q=0.9",
+     "Accept-Encoding": "gzip, deflate, br",
+     "DNT": "1",
+     "Connection": "keep-alive",
+     "Upgrade-Insecure-Requests": "1",
+     "Sec-Fetch-Dest": "document",
+     "Sec-Fetch-Mode": "navigate",
+     "Sec-Fetch-Site": "none",
+     "Sec-Fetch-User": "?1",
+     "Cache-Control": "max-age=0",
+ }
+ 
+ # Alternative headers for fallback when primary headers fail
+ FALLBACK_HEADERS = {
+     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+     "Accept": "*/*",
+     "Accept-Language": "en-US,en;q=0.5",
+     "Accept-Encoding": "gzip, deflate",
+     "Connection": "keep-alive",
+ }
+ 
+ # Storage keys
+ VECTOR_STORE_KEY_PREFIX = "vector_store"
+ METADATA_KEY_PREFIX = "indexed_urls"
+ 
+ 
+ class VectorStoreManager:
+     """Manages vector store operations including creation, saving, loading, and merging."""
+ 
+     def __init__(self, skill_store: SkillStoreABC):
+         self.skill_store = skill_store
+ 
+     def create_embeddings(self) -> OpenAIEmbeddings:
+         """Create OpenAI embeddings using system API key."""
+         api_key = self.skill_store.get_system_config("openai_api_key")
+         return OpenAIEmbeddings(api_key=api_key)
+ 
+     def get_storage_keys(self, agent_id: str) -> Tuple[str, str]:
+         """Get storage keys for vector store and metadata."""
+         vector_store_key = f"{VECTOR_STORE_KEY_PREFIX}_{agent_id}"
+         metadata_key = f"{METADATA_KEY_PREFIX}_{agent_id}"
+         return vector_store_key, metadata_key
+ 
+     def encode_vector_store(self, vector_store: FAISS) -> Dict[str, str]:
+         """Encode FAISS vector store to base64 for storage."""
+         with tempfile.TemporaryDirectory() as temp_dir:
+             vector_store.save_local(temp_dir)
+ 
+             encoded_files = {}
+             for filename in os.listdir(temp_dir):
+                 file_path = os.path.join(temp_dir, filename)
+                 if os.path.isfile(file_path):
+                     with open(file_path, "rb") as f:
+                         encoded_files[filename] = base64.b64encode(f.read()).decode(
+                             "utf-8"
+                         )
+ 
+             return encoded_files
+ 
+     def decode_vector_store(
+         self, encoded_files: Dict[str, str], embeddings: OpenAIEmbeddings
+     ) -> FAISS:
+         """Decode base64 files back to FAISS vector store."""
+         with tempfile.TemporaryDirectory() as temp_dir:
+             # Decode and write files
+             for filename, encoded_content in encoded_files.items():
+                 file_path = os.path.join(temp_dir, filename)
+                 with open(file_path, "wb") as f:
+                     f.write(base64.b64decode(encoded_content))
+ 
+             # Load vector store
+             return FAISS.load_local(
+                 temp_dir,
+                 embeddings,
+                 allow_dangerous_deserialization=True,
+             )
+ 
+     async def get_existing_vector_store(self, agent_id: str) -> Optional[Dict]:
+         """Get existing vector store data if it exists."""
+         vector_store_key, _ = self.get_storage_keys(agent_id)
+         return await self.skill_store.get_agent_skill_data(
+             agent_id, "web_scraper", vector_store_key
+         )
+ 
+     async def merge_with_existing(
+         self,
+         new_documents: List[Document],
+         agent_id: str,
+         chunk_size: int = DEFAULT_CHUNK_SIZE,
+         chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+     ) -> Tuple[FAISS, bool]:
+         """
+         Merge new documents with existing vector store or create new one.
+ 
+         Returns:
+             Tuple of (vector_store, was_merged)
+         """
+         embeddings = self.create_embeddings()
+         existing_data = await self.get_existing_vector_store(agent_id)
+ 
+         if existing_data and "faiss_files" in existing_data:
+             try:
+                 logger.info(f"[{agent_id}] Merging content with existing vector store")
+ 
+                 # Create new vector store from new documents
+                 new_vector_store = FAISS.from_documents(new_documents, embeddings)
+ 
+                 # Load existing vector store
+                 existing_vector_store = self.decode_vector_store(
+                     existing_data["faiss_files"], embeddings
+                 )
+ 
+                 # Merge stores
+                 existing_vector_store.merge_from(new_vector_store)
+                 return existing_vector_store, True
+ 
+             except Exception as e:
+                 logger.warning(
+                     f"[{agent_id}] Merge failed, creating new vector store: {e}"
+                 )
+                 logger.info(f"[{agent_id}] Creating new vector store")
+ 
+         # Create new vector store
+         logger.info(f"[{agent_id}] Creating new vector store")
+         vector_store = FAISS.from_documents(new_documents, embeddings)
+         return vector_store, False
+ 
+     async def save_vector_store(
+         self,
+         vector_store: FAISS,
+         agent_id: str,
+         chunk_size: int = DEFAULT_CHUNK_SIZE,
+         chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+     ) -> None:
+         """Save vector store to agent skill data."""
+         vector_store_key, _ = self.get_storage_keys(agent_id)
+ 
+         logger.info(f"[{agent_id}] Saving vector store")
+ 
+         # Encode vector store
+         encoded_files = self.encode_vector_store(vector_store)
+ 
+         # Prepare data for storage
+         storage_data = {
+             "faiss_files": encoded_files,
+             "chunk_size": chunk_size,
+             "chunk_overlap": chunk_overlap,
+         }
+ 
+         try:
+             # Save to storage
+             await self.skill_store.save_agent_skill_data(
+                 agent_id=agent_id,
+                 skill="web_scraper",
+                 key=vector_store_key,
+                 data=storage_data,
+             )
+ 
+             logger.info(f"[{agent_id}] Successfully saved vector store")
+ 
+         except Exception as e:
+             logger.error(f"[{agent_id}] Failed to save vector store: {e}")
+             raise
+ 
+     async def load_vector_store(self, agent_id: str) -> Optional[FAISS]:
+         """Load vector store for an agent."""
+         stored_data = await self.get_existing_vector_store(agent_id)
+ 
+         if not stored_data or "faiss_files" not in stored_data:
+             return None
+ 
+         try:
+             embeddings = self.create_embeddings()
+             return self.decode_vector_store(stored_data["faiss_files"], embeddings)
+         except Exception as e:
+             logger.error(f"Error loading vector store for agent {agent_id}: {e}")
+             return None
+ 
+     async def get_content_size(self, agent_id: str) -> int:
+         """Get the current content size in bytes for an agent."""
+         stored_data = await self.get_existing_vector_store(agent_id)
+         if not stored_data:
+             return 0
+ 
+         # Calculate size from stored FAISS files
+         total_size = 0
+         if "faiss_files" in stored_data:
+             for encoded_content in stored_data["faiss_files"].values():
+                 # Base64 encoded content size (approximate original size)
+                 total_size += len(base64.b64decode(encoded_content))
+ 
+         return total_size
+ 
+     def format_size(self, size_bytes: int) -> str:
+         """Format size in bytes to human readable format."""
+         if size_bytes < 1024:
+             return f"{size_bytes} B"
+         elif size_bytes < 1024 * 1024:
+             return f"{size_bytes / 1024:.1f} KB"
+         else:
+             return f"{size_bytes / (1024 * 1024):.1f} MB"
+ 
+ 
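For orientation, a minimal sketch of how a skill might persist and reload an index through this manager; skill_store and agent_id stand in for a concrete SkillStoreABC implementation and a real agent id, and the sample document text is made up:

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

from intentkit.skills.web_scraper.utils import VectorStoreManager

async def demo(skill_store, agent_id: str) -> None:
    manager = VectorStoreManager(skill_store)
    docs = [Document(page_content="IntentKit is an agent framework.")]
    # Build an in-memory FAISS index, persist it as base64-encoded files,
    # then load it back and run a similarity search against it.
    store = FAISS.from_documents(docs, manager.create_embeddings())
    await manager.save_vector_store(store, agent_id)
    loaded = await manager.load_vector_store(agent_id)
    if loaded is not None:
        print(loaded.similarity_search("agent framework", k=1)[0].page_content)
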
+ class DocumentProcessor:
+     """Handles document processing operations."""
+ 
+     @staticmethod
+     def create_chunks(
+         documents: List[Document],
+         chunk_size: int = DEFAULT_CHUNK_SIZE,
+         chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+     ) -> List[Document]:
+         """Split documents into chunks."""
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             length_function=len,
+         )
+         return text_splitter.split_documents(documents)
+ 
+     @staticmethod
+     def clean_text(text: str) -> str:
+         """Clean and normalize text content."""
+         lines = text.split("\n")
+         cleaned_lines = []
+ 
+         for line in lines:
+             cleaned_line = line.strip()
+             if cleaned_line:
+                 cleaned_lines.append(cleaned_line)
+ 
+         cleaned_text = "\n".join(cleaned_lines)
+ 
+         # Remove excessive consecutive newlines
+         while "\n\n\n" in cleaned_text:
+             cleaned_text = cleaned_text.replace("\n\n\n", "\n\n")
+ 
+         return cleaned_text.strip()
+ 
+     @staticmethod
+     def validate_content(content: str, min_length: int = 10) -> bool:
+         """Validate content meets minimum requirements."""
+         return len(content.strip()) >= min_length
+ 
+     @staticmethod
+     def create_document(
+         content: str,
+         title: str,
+         source: str,
+         tags: str = "",
+         extra_metadata: Optional[Dict] = None,
+     ) -> Document:
+         """Create a Document with standardized metadata."""
+         cleaned_content = DocumentProcessor.clean_text(content)
+ 
+         # Parse tags
+         tag_list = (
+             [tag.strip() for tag in tags.split(",") if tag.strip()] if tags else []
+         )
+ 
+         metadata = {
+             "title": title,
+             "source": source,
+             "source_type": "manual",
+             "tags": tag_list,
+             "length": len(cleaned_content),
+             "indexed_at": str(asyncio.get_event_loop().time()),
+         }
+ 
+         # Add extra metadata if provided
+         if extra_metadata:
+             metadata.update(extra_metadata)
+ 
+         return Document(page_content=cleaned_content, metadata=metadata)
+ 
+ 
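The static helpers can also be used on their own; a short illustrative sketch (the sample text, title, source URL, and tags are invented):

import asyncio

from intentkit.skills.web_scraper.utils import DocumentProcessor

async def demo() -> None:
    raw_text = "  IntentKit docs  \n\n  Skills extend agents with tools.  \n"
    if DocumentProcessor.validate_content(raw_text):
        # clean_text strips and re-joins the lines; create_document adds standard metadata
        doc = DocumentProcessor.create_document(
            content=raw_text,
            title="IntentKit docs",
            source="https://example.com/docs",
            tags="docs, skills",
        )
        chunks = DocumentProcessor.create_chunks([doc], chunk_size=500, chunk_overlap=50)
        print(len(chunks), doc.metadata["tags"])  # 1 ['docs', 'skills']

asyncio.run(demo())

The sketch runs inside an event loop because create_document stamps indexed_at with asyncio.get_event_loop().time().
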
+ class MetadataManager:
+     """Manages metadata for indexed content."""
+ 
+     def __init__(self, skill_store: SkillStoreABC):
+         self.skill_store = skill_store
+ 
+     async def get_existing_metadata(self, agent_id: str) -> Dict:
+         """Get existing metadata for an agent."""
+         vs_manager = VectorStoreManager(self.skill_store)
+         _, metadata_key = vs_manager.get_storage_keys(agent_id)
+         return (
+             await self.skill_store.get_agent_skill_data(
+                 agent_id, "web_scraper", metadata_key
+             )
+             or {}
+         )
+ 
+     def create_url_metadata(
+         self,
+         urls: List[str],
+         split_docs: List[Document],
+         source_type: str = "web_scraper",
+         extra_fields: Optional[Dict] = None,
+     ) -> Dict:
+         """Create metadata for a list of URLs."""
+         metadata = {}
+         current_time = str(asyncio.get_event_loop().time())
+ 
+         for url in urls:
+             url_metadata = {
+                 "indexed_at": current_time,
+                 "chunks": len(
+                     [doc for doc in split_docs if doc.metadata.get("source") == url]
+                 ),
+                 "source_type": source_type,
+             }
+ 
+             if extra_fields:
+                 url_metadata.update(extra_fields)
+ 
+             metadata[url] = url_metadata
+ 
+         return metadata
+ 
+     def create_document_metadata(
+         self,
+         title: str,
+         source: str,
+         tags: str,
+         split_docs: List[Document],
+         document_length: int,
+     ) -> Dict:
+         """Create metadata for a document."""
+         # Generate unique key
+         key = f"document_{title.lower().replace(' ', '_')}"
+ 
+         return {
+             key: {
+                 "title": title,
+                 "source": source,
+                 "source_type": "document_indexer",
+                 "tags": [tag.strip() for tag in tags.split(",") if tag.strip()]
+                 if tags
+                 else [],
+                 "indexed_at": str(asyncio.get_event_loop().time()),
+                 "chunks": len(split_docs),
+                 "length": document_length,
+             }
+         }
+ 
+     async def update_metadata(self, agent_id: str, new_metadata: Dict) -> None:
+         """Update metadata for an agent."""
+         vs_manager = VectorStoreManager(self.skill_store)
+         _, metadata_key = vs_manager.get_storage_keys(agent_id)
+ 
+         # Get existing metadata
+         existing_metadata = await self.get_existing_metadata(agent_id)
+ 
+         # Update with new metadata
+         existing_metadata.update(new_metadata)
+ 
+         # Save updated metadata
+         await self.skill_store.save_agent_skill_data(
+             agent_id=agent_id,
+             skill="web_scraper",
+             key=metadata_key,
+             data=existing_metadata,
+         )
+ 
+ 
+ class ResponseFormatter:
+     """Formats consistent responses for web scraper skills."""
+ 
+     @staticmethod
+     def format_indexing_response(
+         operation_type: str,
+         urls_or_content: List[str] | str,
+         total_chunks: int,
+         chunk_size: int,
+         chunk_overlap: int,
+         was_merged: bool,
+         extra_info: Optional[Dict] = None,
+         current_size_bytes: int = 0,
+         size_limit_reached: bool = False,
+         total_requested_urls: int = 0,
+     ) -> str:
+         """Format a consistent response for indexing operations."""
+ 
+         # Handle both URL lists and single content
+         if isinstance(urls_or_content, list):
+             urls = urls_or_content
+             processed_count = len(urls)
+ 
+             if size_limit_reached and total_requested_urls > 0:
+                 content_summary = f"Processed {processed_count} of {total_requested_urls} URLs (size limit reached)"
+             else:
+                 content_summary = (
+                     f"Successfully {operation_type} {processed_count} URLs"
+                 )
+ 
+             if len(urls) <= 5:
+                 url_list = "\n".join([f"- {url}" for url in urls])
+             else:
+                 displayed_urls = urls[:5]
+                 remaining_count = len(urls) - 5
+                 url_list = "\n".join([f"- {url}" for url in displayed_urls])
+                 url_list += f"\n... and {remaining_count} more"
+         else:
+             content_summary = f"Successfully {operation_type} content"
+             url_list = ""
+ 
+         # Build response
+         response_parts = [content_summary]
+ 
+         if url_list:
+             response_parts.append(url_list)
+ 
+         response_parts.extend(
+             [
+                 f"Total chunks created: {total_chunks}",
+                 f"Chunk size: {chunk_size} characters",
+                 f"Chunk overlap: {chunk_overlap} characters",
+                 f"Vector store: {'merged with existing content' if was_merged else 'created new index'}",
+             ]
+         )
+ 
+         # Add size information
+         if current_size_bytes > 0:
+             vs_manager = VectorStoreManager(None)  # Just for formatting
+             formatted_size = vs_manager.format_size(current_size_bytes)
+             max_size = vs_manager.format_size(MAX_CONTENT_SIZE_BYTES)
+             response_parts.append(
+                 f"Current storage size: {formatted_size} / {max_size}"
+             )
+ 
+         if size_limit_reached:
+             response_parts.append("Size limit reached - some URLs were not processed")
+ 
+         if extra_info:
+             for key, value in extra_info.items():
+                 response_parts.append(f"{key}: {value}")
+ 
+         response_parts.append(
+             "All content has been indexed and can be queried using the query_indexed_content tool."
+         )
+ 
+         return "\n".join(response_parts)
+ 
+ 
+ async def scrape_and_index_urls(
+     urls: List[str],
+     agent_id: str,
+     skill_store: SkillStoreABC,
+     chunk_size: int = DEFAULT_CHUNK_SIZE,
+     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+     requests_per_second: int = DEFAULT_REQUESTS_PER_SECOND,
+ ) -> Tuple[int, bool, List[str]]:
+     """
+     Scrape URLs and index their content into vector store with size limits.
+ 
+     Args:
+         urls: List of URLs to scrape
+         agent_id: Agent identifier for storage
+         skill_store: Skill store instance
+         chunk_size: Size of text chunks
+         chunk_overlap: Overlap between chunks
+         requests_per_second: Rate limiting for requests
+ 
+     Returns:
+         Tuple of (total_chunks, was_merged, valid_urls)
+     """
+     from urllib.parse import urlparse
+ 
+     from langchain_community.document_loaders import WebBaseLoader
+ 
+     # Validate URLs
+     valid_urls = []
+     for url in urls:
+         try:
+             parsed = urlparse(url)
+             if parsed.scheme in ["http", "https"] and parsed.netloc:
+                 valid_urls.append(url)
+             else:
+                 logger.warning(f"Invalid URL format: {url}")
+         except Exception as e:
+             logger.warning(f"Error parsing URL {url}: {e}")
+ 
+     if not valid_urls:
+         return 0, False, []
+ 
+     # Check existing content size
+     vs_manager = VectorStoreManager(skill_store)
+     current_size = await vs_manager.get_content_size(agent_id)
+ 
+     logger.info(
+         f"[{agent_id}] Current storage size: {vs_manager.format_size(current_size)}"
+     )
+ 
+     if current_size >= MAX_CONTENT_SIZE_BYTES:
+         logger.warning(
+             f"[{agent_id}] Storage limit already reached: {vs_manager.format_size(current_size)}"
+         )
+         return 0, False, []
+ 
+     # Process URLs one by one with size checking
+     processed_urls = []
+     total_chunks = 0
+     was_merged = False
+     size_limit_reached = False
+ 
+     for i, url in enumerate(valid_urls):
+         if current_size >= MAX_CONTENT_SIZE_BYTES:
+             size_limit_reached = True
+             logger.warning(f"[{agent_id}] Size limit reached after processing {i} URLs")
+             break
+ 
+         try:
+             logger.info(f"[{agent_id}] Processing URL {i + 1}/{len(valid_urls)}: {url}")
+ 
+             # Load single URL with enhanced headers
+             loader = WebBaseLoader(
+                 web_paths=[url],
+                 requests_per_second=requests_per_second,
+             )
+ 
+             # Configure loader with enhanced headers to bypass bot protection
+             loader.requests_kwargs = {
+                 "verify": True,
+                 "timeout": DEFAULT_REQUEST_TIMEOUT,
+                 "headers": DEFAULT_HEADERS,
+             }
+ 
+             # Scrape the URL with retry logic
+             documents = None
+             try:
+                 documents = await asyncio.to_thread(loader.load)
+             except Exception as primary_error:
+                 # If primary headers fail, try fallback headers
+                 logger.warning(
+                     f"[{agent_id}] Primary headers failed for {url}, trying fallback: {primary_error}"
+                 )
+ 
+                 loader.requests_kwargs["headers"] = FALLBACK_HEADERS
+                 try:
+                     documents = await asyncio.to_thread(loader.load)
+                     logger.info(f"[{agent_id}] Fallback headers succeeded for {url}")
+                 except Exception as fallback_error:
+                     logger.error(
+                         f"[{agent_id}] Both header sets failed for {url}: {fallback_error}"
+                     )
+                     raise fallback_error
+ 
+             if not documents:
+                 logger.warning(f"[{agent_id}] No content extracted from {url}")
+                 continue
+ 
+             # Check content size before processing
+             content_size = sum(
+                 len(doc.page_content.encode("utf-8")) for doc in documents
+             )
+ 
+             if current_size + content_size > MAX_CONTENT_SIZE_BYTES:
+                 logger.warning(
+                     f"[{agent_id}] Adding {url} would exceed size limit. Skipping."
+                 )
+                 size_limit_reached = True
+                 break
+ 
+             # Process and index this URL's content
+             chunks, merged = await index_documents(
+                 documents, agent_id, skill_store, chunk_size, chunk_overlap
+             )
+ 
+             if chunks > 0:
+                 processed_urls.append(url)
+                 total_chunks += chunks
+                 was_merged = merged or was_merged
+                 current_size += content_size
+ 
+                 logger.info(
+                     f"[{agent_id}] Processed {url}: {chunks} chunks, current size: {vs_manager.format_size(current_size)}"
+                 )
+ 
+             # Add delay for rate limiting
+             if i < len(valid_urls) - 1:  # Don't delay after the last URL
+                 await asyncio.sleep(1.0 / requests_per_second)
+ 
+         except Exception as e:
+             logger.error(f"[{agent_id}] Error processing {url}: {e}")
+             continue
+ 
+     # Log final results
+     if size_limit_reached:
+         logger.warning(
+             f"[{agent_id}] Size limit reached. Processed {len(processed_urls)}/{len(valid_urls)} URLs"
+         )
+     else:
+         logger.info(
+             f"[{agent_id}] Successfully processed all {len(processed_urls)} URLs"
+         )
+ 
+     return total_chunks, was_merged, processed_urls
+ 
+ 
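A rough usage sketch for the scraping entry point, assuming a concrete SkillStoreABC instance is on hand; the URLs and agent id are placeholders:

from intentkit.skills.web_scraper.utils import scrape_and_index_urls

async def demo(skill_store) -> None:
    total_chunks, was_merged, processed = await scrape_and_index_urls(
        urls=["https://example.com", "not-a-url"],  # the malformed entry is logged and skipped
        agent_id="agent-123",
        skill_store=skill_store,
        requests_per_second=2,
    )
    print(f"{total_chunks} chunks from {len(processed)} URLs (merged={was_merged})")

Each URL is fetched and indexed individually, so a failed fetch or the 10 MB storage cap only affects the URLs that have not yet been processed.
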
+ # Convenience function that combines all operations
+ async def index_documents(
+     documents: List[Document],
+     agent_id: str,
+     skill_store: SkillStoreABC,
+     chunk_size: int = DEFAULT_CHUNK_SIZE,
+     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+ ) -> Tuple[int, bool]:
+     """
+     Complete document indexing workflow.
+ 
+     Returns:
+         Tuple of (total_chunks, was_merged)
+     """
+     # Process documents
+     split_docs = DocumentProcessor.create_chunks(documents, chunk_size, chunk_overlap)
+ 
+     if not split_docs:
+         raise ValueError("No content could be processed into chunks")
+ 
+     # Handle vector store
+     vs_manager = VectorStoreManager(skill_store)
+     vector_store, was_merged = await vs_manager.merge_with_existing(
+         split_docs, agent_id, chunk_size, chunk_overlap
+     )
+ 
+     # Save vector store
+     await vs_manager.save_vector_store(
+         vector_store, agent_id, chunk_size, chunk_overlap
+     )
+ 
+     return len(split_docs), was_merged
+ 
+ 
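As a sketch, a skill that already holds Document objects (for example, built with DocumentProcessor.create_document) could index them directly and format the user-facing summary with ResponseFormatter; skill_store, agent_id, and the chunk settings shown are placeholders:

from typing import List

from langchain_core.documents import Document

from intentkit.skills.web_scraper.utils import ResponseFormatter, index_documents

async def index_and_report(skill_store, agent_id: str, docs: List[Document]) -> str:
    # Chunk, merge into the agent's vector store, persist, then summarize
    total_chunks, was_merged = await index_documents(docs, agent_id, skill_store)
    return ResponseFormatter.format_indexing_response(
        operation_type="indexed",
        urls_or_content=[doc.metadata.get("source", "unknown") for doc in docs],
        total_chunks=total_chunks,
        chunk_size=1000,
        chunk_overlap=200,
        was_merged=was_merged,
    )
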
+ # Error handling decorator
+ def handle_skill_errors(operation_name: str):
+     """Decorator for consistent error handling in skills."""
+ 
+     def decorator(func):
+         async def wrapper(*args, **kwargs):
+             try:
+                 return await func(*args, **kwargs)
+             except Exception as e:
+                 logger.error(f"Error in {operation_name}: {e}")
+                 return f"Error {operation_name}: {str(e)}"
+ 
+         return wrapper
+ 
+     return decorator
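
Finally, a brief sketch of how the decorator might wrap a skill coroutine (the decorated function is hypothetical):

from intentkit.skills.web_scraper.utils import handle_skill_errors

@handle_skill_errors("querying indexed content")
async def run_query(question: str) -> str:
    # Any exception raised here is logged and returned as an error string
    raise RuntimeError("vector store not initialized")

# await run_query("...") returns:
# "Error querying indexed content: vector store not initialized"

Note that the wrapper does not apply functools.wraps, so the decorated coroutine's __name__ and docstring are replaced by the wrapper's.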