intentkit 0.6.0.dev7__py3-none-any.whl → 0.6.0.dev8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of intentkit might be problematic.

@@ -0,0 +1,641 @@
+ """
+ Utility functions for web scraper skills.
+
+ This module contains common functionality used across all web scraper skills
+ to reduce code duplication and improve maintainability.
+ """
+
+ import asyncio
+ import base64
+ import logging
+ import os
+ import tempfile
+ from typing import Dict, List, Optional, Tuple
+
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.documents import Document
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ from intentkit.abstracts.skill import SkillStoreABC
+
+ logger = logging.getLogger(__name__)
+
+ # Constants
+ DEFAULT_CHUNK_SIZE = 1000
+ DEFAULT_CHUNK_OVERLAP = 200
+ DEFAULT_REQUEST_TIMEOUT = 30
+ DEFAULT_REQUESTS_PER_SECOND = 2
+ MAX_CONTENT_SIZE_MB = 10  # 10 MB limit
+ MAX_CONTENT_SIZE_BYTES = MAX_CONTENT_SIZE_MB * 1024 * 1024
+
+ # Storage keys
+ VECTOR_STORE_KEY_PREFIX = "vector_store"
+ METADATA_KEY_PREFIX = "indexed_urls"
+
+
+ class VectorStoreManager:
+     """Manages vector store operations including creation, saving, loading, and merging."""
+
+     def __init__(self, skill_store: SkillStoreABC):
+         self.skill_store = skill_store
+
+     def create_embeddings(self) -> OpenAIEmbeddings:
+         """Create OpenAI embeddings using system API key."""
+         api_key = self.skill_store.get_system_config("openai_api_key")
+         return OpenAIEmbeddings(api_key=api_key)
+
+     def get_storage_keys(self, agent_id: str) -> Tuple[str, str]:
+         """Get storage keys for vector store and metadata."""
+         vector_store_key = f"{VECTOR_STORE_KEY_PREFIX}_{agent_id}"
+         metadata_key = f"{METADATA_KEY_PREFIX}_{agent_id}"
+         return vector_store_key, metadata_key
+
+     def encode_vector_store(self, vector_store: FAISS) -> Dict[str, str]:
+         """Encode FAISS vector store to base64 for storage."""
+         with tempfile.TemporaryDirectory() as temp_dir:
+             vector_store.save_local(temp_dir)
+
+             encoded_files = {}
+             for filename in os.listdir(temp_dir):
+                 file_path = os.path.join(temp_dir, filename)
+                 if os.path.isfile(file_path):
+                     with open(file_path, "rb") as f:
+                         encoded_files[filename] = base64.b64encode(f.read()).decode(
+                             "utf-8"
+                         )
+
+             return encoded_files
+
+     def decode_vector_store(
+         self, encoded_files: Dict[str, str], embeddings: OpenAIEmbeddings
+     ) -> FAISS:
+         """Decode base64 files back to FAISS vector store."""
+         with tempfile.TemporaryDirectory() as temp_dir:
+             # Decode and write files
+             for filename, encoded_content in encoded_files.items():
+                 file_path = os.path.join(temp_dir, filename)
+                 with open(file_path, "wb") as f:
+                     f.write(base64.b64decode(encoded_content))
+
+             # Load vector store
+             return FAISS.load_local(
+                 temp_dir,
+                 embeddings,
+                 allow_dangerous_deserialization=True,
+             )
+
+     async def get_existing_vector_store(self, agent_id: str) -> Optional[Dict]:
+         """Get existing vector store data if it exists."""
+         vector_store_key, _ = self.get_storage_keys(agent_id)
+         return await self.skill_store.get_agent_skill_data(
+             agent_id, "web_scraper", vector_store_key
+         )
+
+     async def merge_with_existing(
+         self,
+         new_documents: List[Document],
+         agent_id: str,
+         chunk_size: int = DEFAULT_CHUNK_SIZE,
+         chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+     ) -> Tuple[FAISS, bool]:
+         """
+         Merge new documents with existing vector store or create new one.
+
+         Returns:
+             Tuple of (vector_store, was_merged)
+         """
+         embeddings = self.create_embeddings()
+         existing_data = await self.get_existing_vector_store(agent_id)
+
+         if existing_data and "faiss_files" in existing_data:
+             try:
+                 logger.info(f"[{agent_id}] Merging content with existing vector store")
+
+                 # Create new vector store from new documents
+                 new_vector_store = FAISS.from_documents(new_documents, embeddings)
+
+                 # Load existing vector store
+                 existing_vector_store = self.decode_vector_store(
+                     existing_data["faiss_files"], embeddings
+                 )
+
+                 # Merge stores
+                 existing_vector_store.merge_from(new_vector_store)
+                 return existing_vector_store, True
+
+             except Exception as e:
+                 logger.warning(
+                     f"[{agent_id}] Merge failed, creating new vector store: {e}"
+                 )
+                 logger.info(f"[{agent_id}] Creating new vector store")
+
+         # Create new vector store
+         logger.info(f"[{agent_id}] Creating new vector store")
+         vector_store = FAISS.from_documents(new_documents, embeddings)
+         return vector_store, False
+
+     async def save_vector_store(
+         self,
+         vector_store: FAISS,
+         agent_id: str,
+         chunk_size: int = DEFAULT_CHUNK_SIZE,
+         chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+     ) -> None:
+         """Save vector store to agent skill data."""
+         vector_store_key, _ = self.get_storage_keys(agent_id)
+
+         logger.info(f"[{agent_id}] Saving vector store")
+
+         # Encode vector store
+         encoded_files = self.encode_vector_store(vector_store)
+
+         # Prepare data for storage
+         storage_data = {
+             "faiss_files": encoded_files,
+             "chunk_size": chunk_size,
+             "chunk_overlap": chunk_overlap,
+         }
+
+         try:
+             # Save to storage
+             await self.skill_store.save_agent_skill_data(
+                 agent_id=agent_id,
+                 skill="web_scraper",
+                 key=vector_store_key,
+                 data=storage_data,
+             )
+
+             logger.info(f"[{agent_id}] Successfully saved vector store")
+
+         except Exception as e:
+             logger.error(f"[{agent_id}] Failed to save vector store: {e}")
+             raise
+
+     async def load_vector_store(self, agent_id: str) -> Optional[FAISS]:
+         """Load vector store for an agent."""
+         stored_data = await self.get_existing_vector_store(agent_id)
+
+         if not stored_data or "faiss_files" not in stored_data:
+             return None
+
+         try:
+             embeddings = self.create_embeddings()
+             return self.decode_vector_store(stored_data["faiss_files"], embeddings)
+         except Exception as e:
+             logger.error(f"Error loading vector store for agent {agent_id}: {e}")
+             return None
+
+     async def get_content_size(self, agent_id: str) -> int:
+         """Get the current content size in bytes for an agent."""
+         stored_data = await self.get_existing_vector_store(agent_id)
+         if not stored_data:
+             return 0
+
+         # Calculate size from stored FAISS files
+         total_size = 0
+         if "faiss_files" in stored_data:
+             for encoded_content in stored_data["faiss_files"].values():
+                 # Base64 encoded content size (approximate original size)
+                 total_size += len(base64.b64decode(encoded_content))
+
+         return total_size
+
+     def format_size(self, size_bytes: int) -> str:
+         """Format size in bytes to human readable format."""
+         if size_bytes < 1024:
+             return f"{size_bytes} B"
+         elif size_bytes < 1024 * 1024:
+             return f"{size_bytes / 1024:.1f} KB"
+         else:
+             return f"{size_bytes / (1024 * 1024):.1f} MB"
+
+
+ class DocumentProcessor:
+     """Handles document processing operations."""
+
+     @staticmethod
+     def create_chunks(
+         documents: List[Document],
+         chunk_size: int = DEFAULT_CHUNK_SIZE,
+         chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+     ) -> List[Document]:
+         """Split documents into chunks."""
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             length_function=len,
+         )
+         return text_splitter.split_documents(documents)
+
+     @staticmethod
+     def clean_text(text: str) -> str:
+         """Clean and normalize text content."""
+         lines = text.split("\n")
+         cleaned_lines = []
+
+         for line in lines:
+             cleaned_line = line.strip()
+             if cleaned_line:
+                 cleaned_lines.append(cleaned_line)
+
+         cleaned_text = "\n".join(cleaned_lines)
+
+         # Remove excessive consecutive newlines
+         while "\n\n\n" in cleaned_text:
+             cleaned_text = cleaned_text.replace("\n\n\n", "\n\n")
+
+         return cleaned_text.strip()
+
+     @staticmethod
+     def validate_content(content: str, min_length: int = 10) -> bool:
+         """Validate content meets minimum requirements."""
+         return len(content.strip()) >= min_length
+
+     @staticmethod
+     def create_document(
+         content: str,
+         title: str,
+         source: str,
+         tags: str = "",
+         extra_metadata: Optional[Dict] = None,
+     ) -> Document:
+         """Create a Document with standardized metadata."""
+         cleaned_content = DocumentProcessor.clean_text(content)
+
+         # Parse tags
+         tag_list = (
+             [tag.strip() for tag in tags.split(",") if tag.strip()] if tags else []
+         )
+
+         metadata = {
+             "title": title,
+             "source": source,
+             "source_type": "manual",
+             "tags": tag_list,
+             "length": len(cleaned_content),
+             "indexed_at": str(asyncio.get_event_loop().time()),
+         }
+
+         # Add extra metadata if provided
+         if extra_metadata:
+             metadata.update(extra_metadata)
+
+         return Document(page_content=cleaned_content, metadata=metadata)
+
+
+ class MetadataManager:
+     """Manages metadata for indexed content."""
+
+     def __init__(self, skill_store: SkillStoreABC):
+         self.skill_store = skill_store
+
+     async def get_existing_metadata(self, agent_id: str) -> Dict:
+         """Get existing metadata for an agent."""
+         vs_manager = VectorStoreManager(self.skill_store)
+         _, metadata_key = vs_manager.get_storage_keys(agent_id)
+         return (
+             await self.skill_store.get_agent_skill_data(
+                 agent_id, "web_scraper", metadata_key
+             )
+             or {}
+         )
+
+     def create_url_metadata(
+         self,
+         urls: List[str],
+         split_docs: List[Document],
+         source_type: str = "web_scraper",
+         extra_fields: Optional[Dict] = None,
+     ) -> Dict:
+         """Create metadata for a list of URLs."""
+         metadata = {}
+         current_time = str(asyncio.get_event_loop().time())
+
+         for url in urls:
+             url_metadata = {
+                 "indexed_at": current_time,
+                 "chunks": len(
+                     [doc for doc in split_docs if doc.metadata.get("source") == url]
+                 ),
+                 "source_type": source_type,
+             }
+
+             if extra_fields:
+                 url_metadata.update(extra_fields)
+
+             metadata[url] = url_metadata
+
+         return metadata
+
+     def create_document_metadata(
+         self,
+         title: str,
+         source: str,
+         tags: str,
+         split_docs: List[Document],
+         document_length: int,
+     ) -> Dict:
+         """Create metadata for a document."""
+         # Generate unique key
+         key = f"document_{title.lower().replace(' ', '_')}"
+
+         return {
+             key: {
+                 "title": title,
+                 "source": source,
+                 "source_type": "document_indexer",
+                 "tags": [tag.strip() for tag in tags.split(",") if tag.strip()]
+                 if tags
+                 else [],
+                 "indexed_at": str(asyncio.get_event_loop().time()),
+                 "chunks": len(split_docs),
+                 "length": document_length,
+             }
+         }
+
+     async def update_metadata(self, agent_id: str, new_metadata: Dict) -> None:
+         """Update metadata for an agent."""
+         vs_manager = VectorStoreManager(self.skill_store)
+         _, metadata_key = vs_manager.get_storage_keys(agent_id)
+
+         # Get existing metadata
+         existing_metadata = await self.get_existing_metadata(agent_id)
+
+         # Update with new metadata
+         existing_metadata.update(new_metadata)
+
+         # Save updated metadata
+         await self.skill_store.save_agent_skill_data(
+             agent_id=agent_id,
+             skill="web_scraper",
+             key=metadata_key,
+             data=existing_metadata,
+         )
+
+
+ class ResponseFormatter:
+     """Formats consistent responses for web scraper skills."""
+
+     @staticmethod
+     def format_indexing_response(
+         operation_type: str,
+         urls_or_content: List[str] | str,
+         total_chunks: int,
+         chunk_size: int,
+         chunk_overlap: int,
+         was_merged: bool,
+         extra_info: Optional[Dict] = None,
+         current_size_bytes: int = 0,
+         size_limit_reached: bool = False,
+         total_requested_urls: int = 0,
+     ) -> str:
+         """Format a consistent response for indexing operations."""
+
+         # Handle both URL lists and single content
+         if isinstance(urls_or_content, list):
+             urls = urls_or_content
+             processed_count = len(urls)
+
+             if size_limit_reached and total_requested_urls > 0:
+                 content_summary = f"Processed {processed_count} of {total_requested_urls} URLs (size limit reached)"
+             else:
+                 content_summary = (
+                     f"Successfully {operation_type} {processed_count} URLs"
+                 )
+
+             if len(urls) <= 5:
+                 url_list = "\n".join([f"- {url}" for url in urls])
+             else:
+                 displayed_urls = urls[:5]
+                 remaining_count = len(urls) - 5
+                 url_list = "\n".join([f"- {url}" for url in displayed_urls])
+                 url_list += f"\n... and {remaining_count} more"
+         else:
+             content_summary = f"Successfully {operation_type} content"
+             url_list = ""
+
+         # Build response
+         response_parts = [content_summary]
+
+         if url_list:
+             response_parts.append(url_list)
+
+         response_parts.extend(
+             [
+                 f"Total chunks created: {total_chunks}",
+                 f"Chunk size: {chunk_size} characters",
+                 f"Chunk overlap: {chunk_overlap} characters",
+                 f"Vector store: {'merged with existing content' if was_merged else 'created new index'}",
+             ]
+         )
+
+         # Add size information
+         if current_size_bytes > 0:
+             vs_manager = VectorStoreManager(None)  # Just for formatting
+             formatted_size = vs_manager.format_size(current_size_bytes)
+             max_size = vs_manager.format_size(MAX_CONTENT_SIZE_BYTES)
+             response_parts.append(
+                 f"Current storage size: {formatted_size} / {max_size}"
+             )
+
+         if size_limit_reached:
+             response_parts.append("Size limit reached - some URLs were not processed")
+
+         if extra_info:
+             for key, value in extra_info.items():
+                 response_parts.append(f"{key}: {value}")
+
+         response_parts.append(
+             "All content has been indexed and can be queried using the query_indexed_content tool."
+         )
+
+         return "\n".join(response_parts)
+
+
+ async def scrape_and_index_urls(
+     urls: List[str],
+     agent_id: str,
+     skill_store: SkillStoreABC,
+     chunk_size: int = DEFAULT_CHUNK_SIZE,
+     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+     requests_per_second: int = DEFAULT_REQUESTS_PER_SECOND,
+ ) -> Tuple[int, bool, List[str]]:
+     """
+     Scrape URLs and index their content into vector store with size limits.
+
+     Args:
+         urls: List of URLs to scrape
+         agent_id: Agent identifier for storage
+         skill_store: Skill store instance
+         chunk_size: Size of text chunks
+         chunk_overlap: Overlap between chunks
+         requests_per_second: Rate limiting for requests
+
+     Returns:
+         Tuple of (total_chunks, was_merged, valid_urls)
+     """
+     from urllib.parse import urlparse
+
+     from langchain_community.document_loaders import WebBaseLoader
+
+     # Validate URLs
+     valid_urls = []
+     for url in urls:
+         try:
+             parsed = urlparse(url)
+             if parsed.scheme in ["http", "https"] and parsed.netloc:
+                 valid_urls.append(url)
+             else:
+                 logger.warning(f"Invalid URL format: {url}")
+         except Exception as e:
+             logger.warning(f"Error parsing URL {url}: {e}")
+
+     if not valid_urls:
+         return 0, False, []
+
+     # Check existing content size
+     vs_manager = VectorStoreManager(skill_store)
+     current_size = await vs_manager.get_content_size(agent_id)
+
+     logger.info(
+         f"[{agent_id}] Current storage size: {vs_manager.format_size(current_size)}"
+     )
+
+     if current_size >= MAX_CONTENT_SIZE_BYTES:
+         logger.warning(
+             f"[{agent_id}] Storage limit already reached: {vs_manager.format_size(current_size)}"
+         )
+         return 0, False, []
+
+     # Process URLs one by one with size checking
+     processed_urls = []
+     total_chunks = 0
+     was_merged = False
+     size_limit_reached = False
+
+     for i, url in enumerate(valid_urls):
+         if current_size >= MAX_CONTENT_SIZE_BYTES:
+             size_limit_reached = True
+             logger.warning(f"[{agent_id}] Size limit reached after processing {i} URLs")
+             break
+
+         try:
+             logger.info(f"[{agent_id}] Processing URL {i + 1}/{len(valid_urls)}: {url}")
+
+             # Load single URL
+             loader = WebBaseLoader(
+                 web_paths=[url],
+                 requests_per_second=requests_per_second,
+             )
+
+             # Configure loader
+             loader.requests_kwargs = {
+                 "verify": True,
+                 "timeout": DEFAULT_REQUEST_TIMEOUT,
+             }
+
+             # Scrape the URL
+             documents = await asyncio.to_thread(loader.load)
+
+             if not documents:
+                 logger.warning(f"[{agent_id}] No content extracted from {url}")
+                 continue
+
+             # Check content size before processing
+             content_size = sum(
+                 len(doc.page_content.encode("utf-8")) for doc in documents
+             )
+
+             if current_size + content_size > MAX_CONTENT_SIZE_BYTES:
+                 logger.warning(
+                     f"[{agent_id}] Adding {url} would exceed size limit. Skipping."
+                 )
+                 size_limit_reached = True
+                 break
+
+             # Process and index this URL's content
+             chunks, merged = await index_documents(
+                 documents, agent_id, skill_store, chunk_size, chunk_overlap
+             )
+
+             if chunks > 0:
+                 processed_urls.append(url)
+                 total_chunks += chunks
+                 was_merged = merged or was_merged
+                 current_size += content_size
+
+                 logger.info(
+                     f"[{agent_id}] Processed {url}: {chunks} chunks, current size: {vs_manager.format_size(current_size)}"
+                 )
+
+             # Add delay for rate limiting
+             if i < len(valid_urls) - 1:  # Don't delay after the last URL
+                 await asyncio.sleep(1.0 / requests_per_second)
+
+         except Exception as e:
+             logger.error(f"[{agent_id}] Error processing {url}: {e}")
+             continue
+
+     # Log final results
+     if size_limit_reached:
+         logger.warning(
+             f"[{agent_id}] Size limit reached. Processed {len(processed_urls)}/{len(valid_urls)} URLs"
+         )
+     else:
+         logger.info(
+             f"[{agent_id}] Successfully processed all {len(processed_urls)} URLs"
+         )
+
+     return total_chunks, was_merged, processed_urls
+
+
+ # Convenience function that combines all operations
+ async def index_documents(
+     documents: List[Document],
+     agent_id: str,
+     skill_store: SkillStoreABC,
+     chunk_size: int = DEFAULT_CHUNK_SIZE,
+     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+ ) -> Tuple[int, bool]:
+     """
+     Complete document indexing workflow.
+
+     Returns:
+         Tuple of (total_chunks, was_merged)
+     """
+     # Process documents
+     split_docs = DocumentProcessor.create_chunks(documents, chunk_size, chunk_overlap)
+
+     if not split_docs:
+         raise ValueError("No content could be processed into chunks")
+
+     # Handle vector store
+     vs_manager = VectorStoreManager(skill_store)
+     vector_store, was_merged = await vs_manager.merge_with_existing(
+         split_docs, agent_id, chunk_size, chunk_overlap
+     )
+
+     # Save vector store
+     await vs_manager.save_vector_store(
+         vector_store, agent_id, chunk_size, chunk_overlap
+     )
+
+     return len(split_docs), was_merged
+
+
+ # Error handling decorator
+ def handle_skill_errors(operation_name: str):
+     """Decorator for consistent error handling in skills."""
+
+     def decorator(func):
+         async def wrapper(*args, **kwargs):
+             try:
+                 return await func(*args, **kwargs)
+             except Exception as e:
+                 logger.error(f"Error in {operation_name}: {e}")
+                 return f"Error {operation_name}: {str(e)}"
+
+         return wrapper
+
+     return decorator
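
For illustration only (this is not part of the package diff), here is a minimal sketch of how the helpers in this new module might be exercised end to end: index a few URLs with scrape_and_index_urls, then reload the persisted FAISS index through VectorStoreManager and run a similarity search. The import path and the InMemorySkillStore stub are assumptions made for the sketch; in the real package the module lives somewhere inside intentkit and a concrete SkillStoreABC implementation with a configured openai_api_key is supplied by the framework.

import asyncio

# Hypothetical import path for the module shown in this diff; adjust to its
# actual location inside the intentkit package.
from intentkit.skills.web_scraper.utils import VectorStoreManager, scrape_and_index_urls


class InMemorySkillStore:
    """Minimal stand-in for SkillStoreABC, exposing only the methods this module calls."""

    def __init__(self, openai_api_key: str):
        self._config = {"openai_api_key": openai_api_key}
        self._data = {}

    def get_system_config(self, key):
        return self._config.get(key)

    async def get_agent_skill_data(self, agent_id, skill, key):
        return self._data.get((agent_id, skill, key))

    async def save_agent_skill_data(self, agent_id, skill, key, data):
        self._data[(agent_id, skill, key)] = data


async def main() -> None:
    store = InMemorySkillStore(openai_api_key="sk-...")  # placeholder credential
    agent_id = "demo-agent"

    # Scrape and index two pages; the helper enforces the 10 MB storage cap.
    total_chunks, was_merged, processed_urls = await scrape_and_index_urls(
        urls=["https://example.com", "https://example.org"],
        agent_id=agent_id,
        skill_store=store,
    )
    print(f"Indexed {total_chunks} chunks from {len(processed_urls)} URLs (merged={was_merged})")

    # Reload the persisted FAISS index and query it.
    vector_store = await VectorStoreManager(store).load_vector_store(agent_id)
    if vector_store is not None:
        for doc in vector_store.similarity_search("example domain", k=3):
            print(doc.metadata.get("source"), doc.page_content[:80])


if __name__ == "__main__":
    asyncio.run(main())

Note that the embedding and scraping calls require a valid OpenAI key and network access at run time; the stub above only demonstrates the storage contract (get_system_config, get_agent_skill_data, save_agent_skill_data) that the module relies on.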