intentkit 0.6.0.dev6__py3-none-any.whl → 0.6.0.dev8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of intentkit has been flagged; see the registry listing for details.
- intentkit/__init__.py +1 -1
- intentkit/clients/cdp.py +68 -16
- intentkit/skills/cdp/__init__.py +8 -11
- intentkit/skills/cdp/get_balance.py +46 -18
- intentkit/skills/cdp/schema.json +0 -64
- intentkit/skills/enso/base.py +26 -3
- intentkit/skills/enso/route.py +23 -22
- intentkit/skills/enso/wallet.py +27 -23
- intentkit/skills/web_scraper/README.md +35 -4
- intentkit/skills/web_scraper/__init__.py +16 -0
- intentkit/skills/web_scraper/document_indexer.py +143 -0
- intentkit/skills/web_scraper/schema.json +28 -0
- intentkit/skills/web_scraper/scrape_and_index.py +134 -199
- intentkit/skills/web_scraper/utils.py +641 -0
- intentkit/skills/web_scraper/website_indexer.py +426 -0
- {intentkit-0.6.0.dev6.dist-info → intentkit-0.6.0.dev8.dist-info}/METADATA +1 -1
- {intentkit-0.6.0.dev6.dist-info → intentkit-0.6.0.dev8.dist-info}/RECORD +19 -16
- {intentkit-0.6.0.dev6.dist-info → intentkit-0.6.0.dev8.dist-info}/WHEEL +0 -0
- {intentkit-0.6.0.dev6.dist-info → intentkit-0.6.0.dev8.dist-info}/licenses/LICENSE +0 -0
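Most of the refactor moves shared logic into the new intentkit/skills/web_scraper/utils.py (+641 lines), whose body is not included in this diff. The sketch below is a hypothetical interface reconstructed only from the call sites visible in the hunks that follow; the names match the imports, but the signatures, types, and stub bodies are inferred guesses, not the module's actual code.

# Hypothetical interface sketch for intentkit/skills/web_scraper/utils.py,
# inferred solely from how the tools below call it. Signatures are assumptions.

DEFAULT_CHUNK_SIZE = 1000      # matches the "(default: 1000)" field descriptions
DEFAULT_CHUNK_OVERLAP = 200    # matches the "(default: 200)" field descriptions


class DocumentProcessor:
    @staticmethod
    def validate_content(text: str) -> bool: ...
    @staticmethod
    def create_document(text, title, source, tags, extra_metadata=None): ...


class VectorStoreManager:
    def __init__(self, skill_store): ...
    def create_embeddings(self): ...
    def decode_vector_store(self, faiss_files: dict, embeddings): ...
    async def get_content_size(self, agent_id: str) -> int: ...


class MetadataManager:
    def __init__(self, skill_store): ...
    def create_document_metadata(self, title, source, tags, documents, content_length): ...
    def create_url_metadata(self, urls, extra, operation): ...
    async def update_metadata(self, agent_id: str, new_metadata: dict) -> None: ...


class ResponseFormatter:
    @staticmethod
    def format_indexing_response(action, target, total_chunks, chunk_size,
                                 chunk_overlap, was_merged, **kwargs) -> str: ...


async def index_documents(documents, agent_id, skill_store,
                          chunk_size, chunk_overlap) -> tuple[int, bool]: ...


async def scrape_and_index_urls(urls, agent_id, skill_store,
                                chunk_size, chunk_overlap) -> tuple[int, bool, list[str]]: ...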
intentkit/skills/web_scraper/document_indexer.py (new file)
@@ -0,0 +1,143 @@
+import logging
+from typing import Type
+
+from langchain_core.runnables import RunnableConfig
+from pydantic import BaseModel, Field
+
+from intentkit.skills.web_scraper.base import WebScraperBaseTool
+from intentkit.skills.web_scraper.utils import (
+    DocumentProcessor,
+    MetadataManager,
+    ResponseFormatter,
+    VectorStoreManager,
+    index_documents,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentIndexerInput(BaseModel):
+    """Input for DocumentIndexer tool."""
+
+    text_content: str = Field(
+        description="The text content to add to the vector database. Can be content from Google Docs, Notion, or any other text source",
+        min_length=10,
+        max_length=100000,
+    )
+    title: str = Field(
+        description="Title or name for this text content (will be used as metadata)",
+        max_length=200,
+    )
+    source: str = Field(
+        description="Source of the text content (e.g., 'Google Doc', 'Notion Page', 'Manual Entry')",
+        default="Manual Entry",
+        max_length=100,
+    )
+    chunk_size: int = Field(
+        description="Size of text chunks for indexing (default: 1000)",
+        default=1000,
+        ge=100,
+        le=4000,
+    )
+    chunk_overlap: int = Field(
+        description="Overlap between chunks (default: 200)",
+        default=200,
+        ge=0,
+        le=1000,
+    )
+    tags: str = Field(
+        description="Optional tags for categorizing the content (comma-separated)",
+        default="",
+        max_length=500,
+    )
+
+
+class DocumentIndexer(WebScraperBaseTool):
+    """Tool for importing and indexing document content to the vector database.
+
+    This tool allows users to copy and paste document content from various sources
+    (like Google Docs, Notion, PDFs, etc.) and index it directly into the vector store
+    for later querying and retrieval.
+    """
+
+    name: str = "web_scraper_document_indexer"
+    description: str = (
+        "Import and index document content directly to the vector database. "
+        "Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources. "
+        "The indexed content can then be queried using the query_indexed_content tool."
+    )
+    args_schema: Type[BaseModel] = DocumentIndexerInput
+
+    async def _arun(
+        self,
+        text_content: str,
+        title: str,
+        source: str = "Manual Entry",
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        tags: str = "",
+        config: RunnableConfig = None,
+        **kwargs,
+    ) -> str:
+        """Add text content to the vector database."""
+        # Get agent context - throw error if not available
+        if not config:
+            raise ValueError("Configuration is required but not provided")
+
+        context = self.context_from_config(config)
+        if not context or not context.agent or not context.agent.id:
+            raise ValueError("Agent ID is required but not found in configuration")
+
+        agent_id = context.agent.id
+
+        logger.info(f"[{agent_id}] Starting document indexing for title: '{title}'")
+
+        # Validate content
+        if not DocumentProcessor.validate_content(text_content):
+            logger.error(f"[{agent_id}] Content validation failed - too short")
+            return "Error: Text content is too short. Please provide at least 10 characters of content."
+
+        # Create document with metadata
+        document = DocumentProcessor.create_document(
+            text_content,
+            title,
+            source,
+            tags,
+            extra_metadata={"source_type": "document_indexer"},
+        )
+
+        logger.info(
+            f"[{agent_id}] Document created, length: {len(document.page_content)} chars"
+        )
+
+        # Index the document
+        total_chunks, was_merged = await index_documents(
+            [document], agent_id, self.skill_store, chunk_size, chunk_overlap
+        )
+
+        # Get current storage size for response
+        vs_manager = VectorStoreManager(self.skill_store)
+        current_size = await vs_manager.get_content_size(agent_id)
+
+        # Update metadata
+        metadata_manager = MetadataManager(self.skill_store)
+        new_metadata = metadata_manager.create_document_metadata(
+            title, source, tags, [document], len(text_content)
+        )
+        await metadata_manager.update_metadata(agent_id, new_metadata)
+
+        logger.info(f"[{agent_id}] Document indexing completed successfully")
+
+        # Format response
+        response = ResponseFormatter.format_indexing_response(
+            "indexed",
+            f"Document: {title}",
+            total_chunks,
+            chunk_size,
+            chunk_overlap,
+            was_merged,
+            current_size_bytes=current_size,
+        )
+
+        logger.info(f"[{agent_id}] Document indexing completed successfully")
+        return response
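For orientation, here is a rough sketch of how the new DocumentIndexer tool might be driven by an agent runtime. The constructor keyword and the RunnableConfig payload are assumptions; the diff shows neither how WebScraperBaseTool is instantiated nor what context_from_config expects, only that the config must resolve to an agent id.

# Hypothetical invocation sketch; skill_store wiring and constructor kwarg are assumed.
from intentkit.skills.web_scraper.document_indexer import DocumentIndexer


async def index_pasted_doc(skill_store, runnable_config) -> str:
    tool = DocumentIndexer(skill_store=skill_store)  # assumed constructor kwarg
    return await tool._arun(
        text_content="Quarterly planning notes pasted from a Google Doc ...",
        title="Q3 Planning Notes",
        source="Google Doc",
        tags="planning, q3",
        config=runnable_config,  # must carry the agent context, else ValueError is raised
    )

On success the tool returns the summary built by ResponseFormatter.format_indexing_response, and the content can then be searched with the query_indexed_content tool, per the description above.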
intentkit/skills/web_scraper/schema.json
@@ -50,6 +50,34 @@
         ],
         "description": "Search and retrieve relevant information from previously indexed web content using semantic similarity. Perfect for answering questions based on scraped documents.",
         "default": "private"
+      },
+      "website_indexer": {
+        "type": "string",
+        "title": "Complete Website Indexer",
+        "enum": [
+          "disabled",
+          "private"
+        ],
+        "x-enum-title": [
+          "Disabled",
+          "Agent Owner Only"
+        ],
+        "description": "Index entire websites by discovering and scraping all pages using sitemaps. Automatically finds sitemaps from robots.txt, extracts all URLs, and comprehensively indexes website content.",
+        "default": "private"
+      },
+      "document_indexer": {
+        "type": "string",
+        "title": "Document Content Indexer",
+        "enum": [
+          "disabled",
+          "private"
+        ],
+        "x-enum-title": [
+          "Disabled",
+          "Agent Owner Only"
+        ],
+        "description": "Import and index document content directly to the vector database. Perfect for adding content from Google Docs, Notion pages, PDFs, or any other document sources by copy-pasting.",
+        "default": "private"
       }
     },
     "description": "Configure the availability of each web scraper skill (disabled, public, or private)"
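The new website_indexer skill (website_indexer.py, +426 lines, not shown in this diff) is described above as discovering sitemaps from robots.txt and indexing every listed URL. The standalone snippet below only illustrates that discovery step with the standard library; it is an assumption-laden sketch, not the skill's actual implementation.

# Illustrative sketch of sitemap discovery as described in the schema text;
# NOT the website_indexer.py implementation, which this diff does not include.
import urllib.request
import xml.etree.ElementTree as ET
from urllib.parse import urljoin


def discover_sitemaps(base_url: str) -> list[str]:
    """Read robots.txt and collect 'Sitemap:' entries; fall back to /sitemap.xml."""
    robots_url = urljoin(base_url, "/robots.txt")
    try:
        with urllib.request.urlopen(robots_url, timeout=10) as resp:
            lines = resp.read().decode("utf-8", errors="ignore").splitlines()
        sitemaps = [
            line.split(":", 1)[1].strip()
            for line in lines
            if line.lower().startswith("sitemap:")
        ]
    except Exception:
        sitemaps = []
    return sitemaps or [urljoin(base_url, "/sitemap.xml")]


def extract_urls(sitemap_url: str) -> list[str]:
    """Collect <loc> entries from a sitemap (or sitemap index) XML document."""
    with urllib.request.urlopen(sitemap_url, timeout=10) as resp:
        root = ET.fromstring(resp.read())
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    return [loc.text.strip() for loc in root.findall(".//sm:loc", ns) if loc.text]

The real skill presumably layers rate limiting, sitemap-index recursion, and the storage-size bookkeeping seen in the other tools on top of this basic discovery step.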
intentkit/skills/web_scraper/scrape_and_index.py
@@ -1,19 +1,18 @@
-import asyncio
-import base64
 import logging
-import os
-import tempfile
 from typing import List, Type
-from urllib.parse import urlparse
 
-from langchain_community.document_loaders import WebBaseLoader
-from langchain_community.vectorstores import FAISS
 from langchain_core.runnables import RunnableConfig
-from langchain_openai import OpenAIEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
 from pydantic import BaseModel, Field
 
 from intentkit.skills.web_scraper.base import WebScraperBaseTool
+from intentkit.skills.web_scraper.utils import (
+    DEFAULT_CHUNK_OVERLAP,
+    DEFAULT_CHUNK_SIZE,
+    MetadataManager,
+    ResponseFormatter,
+    VectorStoreManager,
+    scrape_and_index_urls,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -28,13 +27,13 @@ class ScrapeAndIndexInput(BaseModel):
     )
     chunk_size: int = Field(
         description="Size of text chunks for indexing (default: 1000)",
-        default=
+        default=DEFAULT_CHUNK_SIZE,
         ge=100,
         le=4000,
     )
     chunk_overlap: int = Field(
         description="Overlap between chunks (default: 200)",
-        default=
+        default=DEFAULT_CHUNK_OVERLAP,
         ge=0,
         le=1000,
     )
@@ -71,151 +70,92 @@ class ScrapeAndIndex(WebScraperBaseTool):
     )
     args_schema: Type[BaseModel] = ScrapeAndIndexInput
 
-    def _validate_urls(self, urls: List[str]) -> List[str]:
-        """Validate and filter URLs."""
-        valid_urls = []
-        for url in urls:
-            try:
-                parsed = urlparse(url)
-                if parsed.scheme in ["http", "https"] and parsed.netloc:
-                    valid_urls.append(url)
-                else:
-                    logger.warning(f"Invalid URL format: {url}")
-            except Exception as e:
-                logger.warning(f"Error parsing URL {url}: {e}")
-        return valid_urls
-
     async def _arun(
         self,
         urls: List[str],
-        chunk_size: int =
-        chunk_overlap: int =
+        chunk_size: int = DEFAULT_CHUNK_SIZE,
+        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
         config: RunnableConfig = None,
         **kwargs,
     ) -> str:
         """Scrape URLs and index content into vector store."""
         try:
-            #
-
-
-                return "Error: No valid URLs provided. URLs must start with http:// or https://"
+            # Get agent context - throw error if not available
+            if not config:
+                raise ValueError("Configuration is required but not provided")
 
-
-            context
-
+            context = self.context_from_config(config)
+            if not context or not context.agent or not context.agent.id:
+                raise ValueError("Agent ID is required but not found in configuration")
 
-
-            logger.info(f"Scraping {len(valid_urls)} URLs...")
-            loader = WebBaseLoader(
-                web_paths=valid_urls,
-                requests_per_second=2,  # Be respectful to servers
-                show_progress=True,
-            )
+            agent_id = context.agent.id
 
-
-
-
-                "timeout": 30,
-            }
-
-            documents = await asyncio.to_thread(loader.load)
+            logger.info(
+                f"[{agent_id}] Starting scrape and index operation with {len(urls)} URLs"
+            )
 
-
-
+            # Use the utility function to scrape and index URLs
+            total_chunks, was_merged, valid_urls = await scrape_and_index_urls(
+                urls, agent_id, self.skill_store, chunk_size, chunk_overlap
+            )
 
-
-
-                chunk_size=chunk_size,
-                chunk_overlap=chunk_overlap,
-                length_function=len,
+            logger.info(
+                f"[{agent_id}] Scraping completed: {total_chunks} chunks indexed, merged: {was_merged}"
             )
-            split_docs = text_splitter.split_documents(documents)
 
-            if not
-
+            if not valid_urls:
+                logger.error(f"[{agent_id}] No valid URLs provided")
+                return "Error: No valid URLs provided. URLs must start with http:// or https://"
 
-
-
-
+            if total_chunks == 0:
+                logger.error(f"[{agent_id}] No content extracted from URLs")
+                return "Error: No content could be extracted from the provided URLs."
 
-            #
-
+            # Get current storage size for response
+            vs_manager = VectorStoreManager(self.skill_store)
+            current_size = await vs_manager.get_content_size(agent_id)
+            size_limit_reached = len(valid_urls) < len(urls)
 
-            #
-
-
-
-            # Save vector store to temporary directory and encode to base64
-            with tempfile.TemporaryDirectory() as temp_dir:
-                vector_store.save_local(temp_dir)
-
-                # Read and encode all files in the temporary directory
-                encoded_files = {}
-                for filename in os.listdir(temp_dir):
-                    file_path = os.path.join(temp_dir, filename)
-                    if os.path.isfile(file_path):
-                        with open(file_path, "rb") as f:
-                            encoded_files[filename] = base64.b64encode(f.read()).decode(
-                                "utf-8"
-                            )
-
-                # Store vector store data
-                await self.skill_store.save_agent_skill_data(
-                    agent_id=agent_id,
-                    skill="web_scraper",
-                    key=vector_store_key,
-                    data={
-                        "faiss_files": encoded_files,
-                        "chunk_size": chunk_size,
-                        "chunk_overlap": chunk_overlap,
-                    },
+            # Update metadata
+            metadata_manager = MetadataManager(self.skill_store)
+            new_metadata = metadata_manager.create_url_metadata(
+                valid_urls, [], "scrape_and_index"
             )
+            await metadata_manager.update_metadata(agent_id, new_metadata)
 
-
-            existing_metadata = (
-                await self.skill_store.get_agent_skill_data(
-                    agent_id, "web_scraper", metadata_key
-                )
-                or {}
-            )
-            existing_metadata.update(
-                {
-                    url: {
-                        "indexed_at": str(asyncio.get_event_loop().time()),
-                        "chunks": len(
-                            [
-                                doc
-                                for doc in split_docs
-                                if doc.metadata.get("source") == url
-                            ]
-                        ),
-                    }
-                    for url in valid_urls
-                }
-            )
+            logger.info(f"[{agent_id}] Metadata updated successfully")
 
-
-
-
-
-
+            # Format response
+            response = ResponseFormatter.format_indexing_response(
+                "scraped and indexed",
+                valid_urls,
+                total_chunks,
+                chunk_size,
+                chunk_overlap,
+                was_merged,
+                current_size_bytes=current_size,
+                size_limit_reached=size_limit_reached,
+                total_requested_urls=len(urls),
             )
 
-
-
-
-            return (
-                f"Successfully scraped and indexed {successful_urls} URLs:\n"
-                f"{'• ' + chr(10) + '• '.join(valid_urls)}\n\n"
-                f"Total chunks created: {total_chunks}\n"
-                f"Chunk size: {chunk_size} characters\n"
-                f"Chunk overlap: {chunk_overlap} characters\n\n"
-                f"The content is now indexed and can be queried using the query_indexed_content tool."
+            logger.info(
+                f"[{agent_id}] Scrape and index operation completed successfully"
             )
+            return response
 
         except Exception as e:
-
-
+            # Extract agent_id for error logging if possible
+            agent_id = "UNKNOWN"
+            try:
+                if config:
+                    context = self.context_from_config(config)
+                    if context and context.agent and context.agent.id:
+                        agent_id = context.agent.id
+            except Exception:
+                pass
+
+            logger.error(f"[{agent_id}] Error in ScrapeAndIndex: {e}", exc_info=True)
+            raise type(e)(f"[agent:{agent_id}]: {e}") from e
 
 
 class QueryIndexedContent(WebScraperBaseTool):
@@ -242,86 +182,81 @@ class QueryIndexedContent(WebScraperBaseTool):
     ) -> str:
         """Query the indexed content."""
         try:
-            # Get agent context
-
-
+            # Get agent context - throw error if not available
+            if not config:
+                raise ValueError("Configuration is required but not provided")
+
+            context = self.context_from_config(config)
+            if not context or not context.agent or not context.agent.id:
+                raise ValueError("Agent ID is required but not found in configuration")
+
+            agent_id = context.agent.id
+
+            logger.info(f"[{agent_id}] Starting query operation: '{query}'")
 
             # Retrieve vector store
             vector_store_key = f"vector_store_{agent_id}"
-
+
+            logger.info(f"[{agent_id}] Looking for vector store: {vector_store_key}")
 
             stored_data = await self.skill_store.get_agent_skill_data(
                 agent_id, "web_scraper", vector_store_key
             )
+
+            if not stored_data:
+                logger.warning(f"[{agent_id}] No vector store found")
+                return "No indexed content found. Please use the scrape_and_index tool first to scrape and index some web content before querying."
+
             if not stored_data or "faiss_files" not in stored_data:
-
-
-
-
-
-
-
-
-
-            with tempfile.TemporaryDirectory() as temp_dir:
-                # Decode and write files to temporary directory
-                for filename, encoded_content in stored_data["faiss_files"].items():
-                    file_path = os.path.join(temp_dir, filename)
-                    with open(file_path, "wb") as f:
-                        f.write(base64.b64decode(encoded_content))
-
-                # Load the vector store from the temporary directory
-                vector_store = FAISS.load_local(
-                    temp_dir,
-                    embeddings,
-                    allow_dangerous_deserialization=True,  # Safe since we control the serialization
-                )
-
-                # Perform similarity search
-                relevant_docs = vector_store.similarity_search(query, k=max_results)
-
-                if not relevant_docs:
-                    return f"No relevant content found for query: '{query}'"
-
-                # Get metadata about indexed URLs
-                metadata = (
-                    await self.skill_store.get_agent_skill_data(
-                        agent_id, "web_scraper", metadata_key
-                    )
-                    or {}
+                logger.warning(f"[{agent_id}] Invalid stored data structure")
+                return "No indexed content found. Please use the scrape_and_index tool first to scrape and index some web content before querying."
+
+            # Create embeddings and decode vector store
+            logger.info(f"[{agent_id}] Decoding vector store")
+            vs_manager = VectorStoreManager(self.skill_store)
+            embeddings = vs_manager.create_embeddings()
+            vector_store = vs_manager.decode_vector_store(
+                stored_data["faiss_files"], embeddings
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-                [
-                    "\n" + "=" * 50,
-                    f"Total indexed URLs: {len(metadata)}",
-                    "Indexed sources:",
-                    *[f"• {url}" for url in metadata.keys()],
-                ]
+            logger.info(
+                f"[{agent_id}] Vector store loaded, index count: {vector_store.index.ntotal}"
+            )
+
+            # Perform similarity search
+            docs = vector_store.similarity_search(query, k=max_results)
+            logger.info(f"[{agent_id}] Found {len(docs)} similar documents")
+
+            if not docs:
+                logger.info(f"[{agent_id}] No relevant documents found for query")
+                return f"No relevant information found for your query: '{query}'. The indexed content may not contain information related to your search."
+
+            # Format results
+            results = []
+            for i, doc in enumerate(docs, 1):
+                content = doc.page_content.strip()
+                source = doc.metadata.get("source", "Unknown")
+                results.append(f"**Source {i}:** {source}\n{content}")
+
+            response = "\n\n".join(results)
+            logger.info(
+                f"[{agent_id}] Query completed successfully, returning {len(response)} chars"
             )
 
-            return
+            return response
 
         except Exception as e:
-
-
+            # Extract agent_id for error logging if possible
+            agent_id = "UNKNOWN"
+            try:
+                if config:
+                    context = self.context_from_config(config)
+                    if context and context.agent and context.agent.id:
+                        agent_id = context.agent.id
+            except Exception:
+                pass
+
+            logger.error(
+                f"[{agent_id}] Error in QueryIndexedContent: {e}", exc_info=True
+            )
+            raise type(e)(f"[agent:{agent_id}]: {e}") from e
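The removed code in ScrapeAndIndex and QueryIndexedContent shows how the FAISS store is persisted through the skill store: files written by save_local are base64-encoded into a dict, then later decoded back into a temporary directory for FAISS.load_local. The new VectorStoreManager presumably wraps this round-trip inside utils.py (not shown in this diff); the sketch below only condenses the pattern that was previously inlined.

# Condensed sketch of the persistence round-trip visible in the removed code above;
# VectorStoreManager in utils.py presumably wraps something equivalent.
import base64
import os
import tempfile

from langchain_community.vectorstores import FAISS


def encode_vector_store(vector_store: FAISS) -> dict[str, str]:
    """Serialize a FAISS store to a dict of base64-encoded files."""
    encoded = {}
    with tempfile.TemporaryDirectory() as temp_dir:
        vector_store.save_local(temp_dir)
        for filename in os.listdir(temp_dir):
            path = os.path.join(temp_dir, filename)
            if os.path.isfile(path):
                with open(path, "rb") as f:
                    encoded[filename] = base64.b64encode(f.read()).decode("utf-8")
    return encoded


def decode_vector_store(faiss_files: dict[str, str], embeddings) -> FAISS:
    """Rebuild the FAISS store from the base64-encoded files."""
    with tempfile.TemporaryDirectory() as temp_dir:
        for filename, content in faiss_files.items():
            with open(os.path.join(temp_dir, filename), "wb") as f:
                f.write(base64.b64decode(content))
        return FAISS.load_local(
            temp_dir,
            embeddings,
            allow_dangerous_deserialization=True,  # data was serialized by this same code path
        )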