intentkit 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of intentkit might be problematic. Click here for more details.
- intentkit/__init__.py +17 -0
- intentkit/abstracts/__init__.py +0 -0
- intentkit/abstracts/agent.py +60 -0
- intentkit/abstracts/api.py +4 -0
- intentkit/abstracts/engine.py +38 -0
- intentkit/abstracts/exception.py +9 -0
- intentkit/abstracts/graph.py +25 -0
- intentkit/abstracts/skill.py +129 -0
- intentkit/abstracts/twitter.py +54 -0
- intentkit/clients/__init__.py +14 -0
- intentkit/clients/cdp.py +53 -0
- intentkit/clients/twitter.py +445 -0
- intentkit/config/__init__.py +0 -0
- intentkit/config/config.py +164 -0
- intentkit/core/__init__.py +0 -0
- intentkit/core/agent.py +191 -0
- intentkit/core/api.py +40 -0
- intentkit/core/client.py +45 -0
- intentkit/core/credit.py +1767 -0
- intentkit/core/engine.py +1018 -0
- intentkit/core/node.py +223 -0
- intentkit/core/prompt.py +58 -0
- intentkit/core/skill.py +124 -0
- intentkit/models/agent.py +1689 -0
- intentkit/models/agent_data.py +810 -0
- intentkit/models/agent_schema.json +733 -0
- intentkit/models/app_setting.py +156 -0
- intentkit/models/base.py +9 -0
- intentkit/models/chat.py +581 -0
- intentkit/models/conversation.py +286 -0
- intentkit/models/credit.py +1406 -0
- intentkit/models/db.py +120 -0
- intentkit/models/db_mig.py +102 -0
- intentkit/models/generator.py +347 -0
- intentkit/models/llm.py +746 -0
- intentkit/models/redis.py +132 -0
- intentkit/models/skill.py +466 -0
- intentkit/models/user.py +243 -0
- intentkit/skills/__init__.py +12 -0
- intentkit/skills/acolyt/__init__.py +83 -0
- intentkit/skills/acolyt/acolyt.jpg +0 -0
- intentkit/skills/acolyt/ask.py +128 -0
- intentkit/skills/acolyt/base.py +28 -0
- intentkit/skills/acolyt/schema.json +89 -0
- intentkit/skills/aixbt/README.md +71 -0
- intentkit/skills/aixbt/__init__.py +73 -0
- intentkit/skills/aixbt/aixbt.jpg +0 -0
- intentkit/skills/aixbt/base.py +21 -0
- intentkit/skills/aixbt/projects.py +153 -0
- intentkit/skills/aixbt/schema.json +99 -0
- intentkit/skills/allora/__init__.py +83 -0
- intentkit/skills/allora/allora.jpeg +0 -0
- intentkit/skills/allora/base.py +28 -0
- intentkit/skills/allora/price.py +130 -0
- intentkit/skills/allora/schema.json +89 -0
- intentkit/skills/base.py +174 -0
- intentkit/skills/carv/README.md +95 -0
- intentkit/skills/carv/__init__.py +121 -0
- intentkit/skills/carv/base.py +183 -0
- intentkit/skills/carv/carv.webp +0 -0
- intentkit/skills/carv/fetch_news.py +92 -0
- intentkit/skills/carv/onchain_query.py +164 -0
- intentkit/skills/carv/schema.json +137 -0
- intentkit/skills/carv/token_info_and_price.py +110 -0
- intentkit/skills/cdp/__init__.py +137 -0
- intentkit/skills/cdp/base.py +21 -0
- intentkit/skills/cdp/cdp.png +0 -0
- intentkit/skills/cdp/get_balance.py +81 -0
- intentkit/skills/cdp/schema.json +473 -0
- intentkit/skills/chainlist/README.md +38 -0
- intentkit/skills/chainlist/__init__.py +54 -0
- intentkit/skills/chainlist/base.py +21 -0
- intentkit/skills/chainlist/chain_lookup.py +208 -0
- intentkit/skills/chainlist/chainlist.png +0 -0
- intentkit/skills/chainlist/schema.json +47 -0
- intentkit/skills/common/__init__.py +82 -0
- intentkit/skills/common/base.py +21 -0
- intentkit/skills/common/common.jpg +0 -0
- intentkit/skills/common/current_time.py +84 -0
- intentkit/skills/common/schema.json +57 -0
- intentkit/skills/cookiefun/README.md +121 -0
- intentkit/skills/cookiefun/__init__.py +78 -0
- intentkit/skills/cookiefun/base.py +41 -0
- intentkit/skills/cookiefun/constants.py +18 -0
- intentkit/skills/cookiefun/cookiefun.png +0 -0
- intentkit/skills/cookiefun/get_account_details.py +171 -0
- intentkit/skills/cookiefun/get_account_feed.py +282 -0
- intentkit/skills/cookiefun/get_account_smart_followers.py +181 -0
- intentkit/skills/cookiefun/get_sectors.py +128 -0
- intentkit/skills/cookiefun/schema.json +155 -0
- intentkit/skills/cookiefun/search_accounts.py +225 -0
- intentkit/skills/cryptocompare/__init__.py +130 -0
- intentkit/skills/cryptocompare/api.py +159 -0
- intentkit/skills/cryptocompare/base.py +303 -0
- intentkit/skills/cryptocompare/cryptocompare.png +0 -0
- intentkit/skills/cryptocompare/fetch_news.py +96 -0
- intentkit/skills/cryptocompare/fetch_price.py +99 -0
- intentkit/skills/cryptocompare/fetch_top_exchanges.py +113 -0
- intentkit/skills/cryptocompare/fetch_top_market_cap.py +109 -0
- intentkit/skills/cryptocompare/fetch_top_volume.py +108 -0
- intentkit/skills/cryptocompare/fetch_trading_signals.py +107 -0
- intentkit/skills/cryptocompare/schema.json +168 -0
- intentkit/skills/cryptopanic/__init__.py +108 -0
- intentkit/skills/cryptopanic/base.py +51 -0
- intentkit/skills/cryptopanic/cryptopanic.png +0 -0
- intentkit/skills/cryptopanic/fetch_crypto_news.py +153 -0
- intentkit/skills/cryptopanic/fetch_crypto_sentiment.py +136 -0
- intentkit/skills/cryptopanic/schema.json +103 -0
- intentkit/skills/dapplooker/README.md +92 -0
- intentkit/skills/dapplooker/__init__.py +83 -0
- intentkit/skills/dapplooker/base.py +26 -0
- intentkit/skills/dapplooker/dapplooker.jpg +0 -0
- intentkit/skills/dapplooker/dapplooker_token_data.py +476 -0
- intentkit/skills/dapplooker/schema.json +91 -0
- intentkit/skills/defillama/__init__.py +323 -0
- intentkit/skills/defillama/api.py +315 -0
- intentkit/skills/defillama/base.py +135 -0
- intentkit/skills/defillama/coins/__init__.py +0 -0
- intentkit/skills/defillama/coins/fetch_batch_historical_prices.py +116 -0
- intentkit/skills/defillama/coins/fetch_block.py +98 -0
- intentkit/skills/defillama/coins/fetch_current_prices.py +105 -0
- intentkit/skills/defillama/coins/fetch_first_price.py +100 -0
- intentkit/skills/defillama/coins/fetch_historical_prices.py +110 -0
- intentkit/skills/defillama/coins/fetch_price_chart.py +109 -0
- intentkit/skills/defillama/coins/fetch_price_percentage.py +93 -0
- intentkit/skills/defillama/config/__init__.py +0 -0
- intentkit/skills/defillama/config/chains.py +433 -0
- intentkit/skills/defillama/defillama.jpeg +0 -0
- intentkit/skills/defillama/fees/__init__.py +0 -0
- intentkit/skills/defillama/fees/fetch_fees_overview.py +130 -0
- intentkit/skills/defillama/schema.json +383 -0
- intentkit/skills/defillama/stablecoins/__init__.py +0 -0
- intentkit/skills/defillama/stablecoins/fetch_stablecoin_chains.py +100 -0
- intentkit/skills/defillama/stablecoins/fetch_stablecoin_charts.py +129 -0
- intentkit/skills/defillama/stablecoins/fetch_stablecoin_prices.py +83 -0
- intentkit/skills/defillama/stablecoins/fetch_stablecoins.py +126 -0
- intentkit/skills/defillama/tests/__init__.py +0 -0
- intentkit/skills/defillama/tests/api_integration.test.py +192 -0
- intentkit/skills/defillama/tests/api_unit.test.py +583 -0
- intentkit/skills/defillama/tvl/__init__.py +0 -0
- intentkit/skills/defillama/tvl/fetch_chain_historical_tvl.py +106 -0
- intentkit/skills/defillama/tvl/fetch_chains.py +107 -0
- intentkit/skills/defillama/tvl/fetch_historical_tvl.py +91 -0
- intentkit/skills/defillama/tvl/fetch_protocol.py +207 -0
- intentkit/skills/defillama/tvl/fetch_protocol_current_tvl.py +93 -0
- intentkit/skills/defillama/tvl/fetch_protocols.py +196 -0
- intentkit/skills/defillama/volumes/__init__.py +0 -0
- intentkit/skills/defillama/volumes/fetch_dex_overview.py +157 -0
- intentkit/skills/defillama/volumes/fetch_dex_summary.py +123 -0
- intentkit/skills/defillama/volumes/fetch_options_overview.py +131 -0
- intentkit/skills/defillama/yields/__init__.py +0 -0
- intentkit/skills/defillama/yields/fetch_pool_chart.py +100 -0
- intentkit/skills/defillama/yields/fetch_pools.py +126 -0
- intentkit/skills/dexscreener/__init__.py +93 -0
- intentkit/skills/dexscreener/base.py +133 -0
- intentkit/skills/dexscreener/dexscreener.png +0 -0
- intentkit/skills/dexscreener/model/__init__.py +0 -0
- intentkit/skills/dexscreener/model/search_token_response.py +82 -0
- intentkit/skills/dexscreener/schema.json +48 -0
- intentkit/skills/dexscreener/search_token.py +321 -0
- intentkit/skills/dune_analytics/__init__.py +103 -0
- intentkit/skills/dune_analytics/base.py +46 -0
- intentkit/skills/dune_analytics/dune.png +0 -0
- intentkit/skills/dune_analytics/fetch_kol_buys.py +128 -0
- intentkit/skills/dune_analytics/fetch_nation_metrics.py +237 -0
- intentkit/skills/dune_analytics/schema.json +99 -0
- intentkit/skills/elfa/README.md +100 -0
- intentkit/skills/elfa/__init__.py +123 -0
- intentkit/skills/elfa/base.py +28 -0
- intentkit/skills/elfa/elfa.jpg +0 -0
- intentkit/skills/elfa/mention.py +504 -0
- intentkit/skills/elfa/schema.json +153 -0
- intentkit/skills/elfa/stats.py +118 -0
- intentkit/skills/elfa/tokens.py +126 -0
- intentkit/skills/enso/README.md +75 -0
- intentkit/skills/enso/__init__.py +114 -0
- intentkit/skills/enso/abi/__init__.py +0 -0
- intentkit/skills/enso/abi/approval.py +279 -0
- intentkit/skills/enso/abi/erc20.py +14 -0
- intentkit/skills/enso/abi/route.py +129 -0
- intentkit/skills/enso/base.py +44 -0
- intentkit/skills/enso/best_yield.py +286 -0
- intentkit/skills/enso/enso.jpg +0 -0
- intentkit/skills/enso/networks.py +105 -0
- intentkit/skills/enso/prices.py +93 -0
- intentkit/skills/enso/route.py +300 -0
- intentkit/skills/enso/schema.json +212 -0
- intentkit/skills/enso/tokens.py +223 -0
- intentkit/skills/enso/wallet.py +381 -0
- intentkit/skills/github/README.md +63 -0
- intentkit/skills/github/__init__.py +54 -0
- intentkit/skills/github/base.py +21 -0
- intentkit/skills/github/github.jpg +0 -0
- intentkit/skills/github/github_search.py +183 -0
- intentkit/skills/github/schema.json +59 -0
- intentkit/skills/heurist/__init__.py +143 -0
- intentkit/skills/heurist/base.py +26 -0
- intentkit/skills/heurist/heurist.png +0 -0
- intentkit/skills/heurist/image_generation_animagine_xl.py +162 -0
- intentkit/skills/heurist/image_generation_arthemy_comics.py +162 -0
- intentkit/skills/heurist/image_generation_arthemy_real.py +162 -0
- intentkit/skills/heurist/image_generation_braindance.py +162 -0
- intentkit/skills/heurist/image_generation_cyber_realistic_xl.py +162 -0
- intentkit/skills/heurist/image_generation_flux_1_dev.py +162 -0
- intentkit/skills/heurist/image_generation_sdxl.py +161 -0
- intentkit/skills/heurist/schema.json +196 -0
- intentkit/skills/lifi/README.md +294 -0
- intentkit/skills/lifi/__init__.py +141 -0
- intentkit/skills/lifi/base.py +21 -0
- intentkit/skills/lifi/lifi.png +0 -0
- intentkit/skills/lifi/schema.json +89 -0
- intentkit/skills/lifi/token_execute.py +472 -0
- intentkit/skills/lifi/token_quote.py +190 -0
- intentkit/skills/lifi/utils.py +656 -0
- intentkit/skills/moralis/README.md +490 -0
- intentkit/skills/moralis/__init__.py +110 -0
- intentkit/skills/moralis/api.py +281 -0
- intentkit/skills/moralis/base.py +55 -0
- intentkit/skills/moralis/fetch_chain_portfolio.py +191 -0
- intentkit/skills/moralis/fetch_nft_portfolio.py +284 -0
- intentkit/skills/moralis/fetch_solana_portfolio.py +331 -0
- intentkit/skills/moralis/fetch_wallet_portfolio.py +301 -0
- intentkit/skills/moralis/moralis.png +0 -0
- intentkit/skills/moralis/schema.json +156 -0
- intentkit/skills/moralis/tests/__init__.py +0 -0
- intentkit/skills/moralis/tests/test_wallet.py +511 -0
- intentkit/skills/nation/__init__.py +62 -0
- intentkit/skills/nation/base.py +31 -0
- intentkit/skills/nation/nation.png +0 -0
- intentkit/skills/nation/nft_check.py +106 -0
- intentkit/skills/nation/schema.json +58 -0
- intentkit/skills/openai/__init__.py +107 -0
- intentkit/skills/openai/base.py +32 -0
- intentkit/skills/openai/dalle_image_generation.py +128 -0
- intentkit/skills/openai/gpt_image_generation.py +152 -0
- intentkit/skills/openai/gpt_image_to_image.py +186 -0
- intentkit/skills/openai/image_to_text.py +126 -0
- intentkit/skills/openai/openai.png +0 -0
- intentkit/skills/openai/schema.json +139 -0
- intentkit/skills/portfolio/README.md +55 -0
- intentkit/skills/portfolio/__init__.py +151 -0
- intentkit/skills/portfolio/base.py +107 -0
- intentkit/skills/portfolio/constants.py +9 -0
- intentkit/skills/portfolio/moralis.png +0 -0
- intentkit/skills/portfolio/schema.json +237 -0
- intentkit/skills/portfolio/token_balances.py +155 -0
- intentkit/skills/portfolio/wallet_approvals.py +102 -0
- intentkit/skills/portfolio/wallet_defi_positions.py +80 -0
- intentkit/skills/portfolio/wallet_history.py +155 -0
- intentkit/skills/portfolio/wallet_net_worth.py +112 -0
- intentkit/skills/portfolio/wallet_nfts.py +139 -0
- intentkit/skills/portfolio/wallet_profitability.py +101 -0
- intentkit/skills/portfolio/wallet_profitability_summary.py +91 -0
- intentkit/skills/portfolio/wallet_stats.py +79 -0
- intentkit/skills/portfolio/wallet_swaps.py +147 -0
- intentkit/skills/skills.toml +103 -0
- intentkit/skills/slack/__init__.py +98 -0
- intentkit/skills/slack/base.py +55 -0
- intentkit/skills/slack/get_channel.py +109 -0
- intentkit/skills/slack/get_message.py +136 -0
- intentkit/skills/slack/schedule_message.py +92 -0
- intentkit/skills/slack/schema.json +135 -0
- intentkit/skills/slack/send_message.py +81 -0
- intentkit/skills/slack/slack.jpg +0 -0
- intentkit/skills/system/__init__.py +90 -0
- intentkit/skills/system/base.py +22 -0
- intentkit/skills/system/read_agent_api_key.py +87 -0
- intentkit/skills/system/regenerate_agent_api_key.py +77 -0
- intentkit/skills/system/schema.json +53 -0
- intentkit/skills/system/system.svg +76 -0
- intentkit/skills/tavily/README.md +86 -0
- intentkit/skills/tavily/__init__.py +91 -0
- intentkit/skills/tavily/base.py +27 -0
- intentkit/skills/tavily/schema.json +119 -0
- intentkit/skills/tavily/tavily.jpg +0 -0
- intentkit/skills/tavily/tavily_extract.py +147 -0
- intentkit/skills/tavily/tavily_search.py +139 -0
- intentkit/skills/token/README.md +89 -0
- intentkit/skills/token/__init__.py +107 -0
- intentkit/skills/token/base.py +154 -0
- intentkit/skills/token/constants.py +9 -0
- intentkit/skills/token/erc20_transfers.py +145 -0
- intentkit/skills/token/moralis.png +0 -0
- intentkit/skills/token/schema.json +141 -0
- intentkit/skills/token/token_analytics.py +81 -0
- intentkit/skills/token/token_price.py +132 -0
- intentkit/skills/token/token_search.py +121 -0
- intentkit/skills/twitter/__init__.py +146 -0
- intentkit/skills/twitter/base.py +68 -0
- intentkit/skills/twitter/follow_user.py +69 -0
- intentkit/skills/twitter/get_mentions.py +124 -0
- intentkit/skills/twitter/get_timeline.py +111 -0
- intentkit/skills/twitter/get_user_by_username.py +84 -0
- intentkit/skills/twitter/get_user_tweets.py +123 -0
- intentkit/skills/twitter/like_tweet.py +65 -0
- intentkit/skills/twitter/post_tweet.py +90 -0
- intentkit/skills/twitter/reply_tweet.py +98 -0
- intentkit/skills/twitter/retweet.py +76 -0
- intentkit/skills/twitter/schema.json +258 -0
- intentkit/skills/twitter/search_tweets.py +115 -0
- intentkit/skills/twitter/twitter.png +0 -0
- intentkit/skills/unrealspeech/__init__.py +55 -0
- intentkit/skills/unrealspeech/base.py +21 -0
- intentkit/skills/unrealspeech/schema.json +100 -0
- intentkit/skills/unrealspeech/text_to_speech.py +177 -0
- intentkit/skills/unrealspeech/unrealspeech.jpg +0 -0
- intentkit/skills/venice_audio/__init__.py +106 -0
- intentkit/skills/venice_audio/base.py +119 -0
- intentkit/skills/venice_audio/input.py +41 -0
- intentkit/skills/venice_audio/schema.json +152 -0
- intentkit/skills/venice_audio/venice_audio.py +240 -0
- intentkit/skills/venice_audio/venice_logo.jpg +0 -0
- intentkit/skills/venice_image/README.md +119 -0
- intentkit/skills/venice_image/__init__.py +154 -0
- intentkit/skills/venice_image/api.py +138 -0
- intentkit/skills/venice_image/base.py +188 -0
- intentkit/skills/venice_image/config.py +35 -0
- intentkit/skills/venice_image/image_enhance/README.md +119 -0
- intentkit/skills/venice_image/image_enhance/__init__.py +0 -0
- intentkit/skills/venice_image/image_enhance/image_enhance.py +80 -0
- intentkit/skills/venice_image/image_enhance/image_enhance_base.py +23 -0
- intentkit/skills/venice_image/image_enhance/image_enhance_input.py +40 -0
- intentkit/skills/venice_image/image_generation/README.md +144 -0
- intentkit/skills/venice_image/image_generation/__init__.py +0 -0
- intentkit/skills/venice_image/image_generation/image_generation_base.py +117 -0
- intentkit/skills/venice_image/image_generation/image_generation_fluently_xl.py +26 -0
- intentkit/skills/venice_image/image_generation/image_generation_flux_dev.py +27 -0
- intentkit/skills/venice_image/image_generation/image_generation_flux_dev_uncensored.py +26 -0
- intentkit/skills/venice_image/image_generation/image_generation_input.py +158 -0
- intentkit/skills/venice_image/image_generation/image_generation_lustify_sdxl.py +26 -0
- intentkit/skills/venice_image/image_generation/image_generation_pony_realism.py +26 -0
- intentkit/skills/venice_image/image_generation/image_generation_stable_diffusion_3_5.py +28 -0
- intentkit/skills/venice_image/image_generation/image_generation_venice_sd35.py +28 -0
- intentkit/skills/venice_image/image_upscale/README.md +111 -0
- intentkit/skills/venice_image/image_upscale/__init__.py +0 -0
- intentkit/skills/venice_image/image_upscale/image_upscale.py +90 -0
- intentkit/skills/venice_image/image_upscale/image_upscale_base.py +23 -0
- intentkit/skills/venice_image/image_upscale/image_upscale_input.py +22 -0
- intentkit/skills/venice_image/image_vision/README.md +112 -0
- intentkit/skills/venice_image/image_vision/__init__.py +0 -0
- intentkit/skills/venice_image/image_vision/image_vision.py +100 -0
- intentkit/skills/venice_image/image_vision/image_vision_base.py +17 -0
- intentkit/skills/venice_image/image_vision/image_vision_input.py +9 -0
- intentkit/skills/venice_image/schema.json +267 -0
- intentkit/skills/venice_image/utils.py +78 -0
- intentkit/skills/venice_image/venice_image.jpg +0 -0
- intentkit/skills/web_scraper/README.md +82 -0
- intentkit/skills/web_scraper/__init__.py +92 -0
- intentkit/skills/web_scraper/base.py +21 -0
- intentkit/skills/web_scraper/langchain.png +0 -0
- intentkit/skills/web_scraper/schema.json +115 -0
- intentkit/skills/web_scraper/scrape_and_index.py +327 -0
- intentkit/utils/__init__.py +1 -0
- intentkit/utils/chain.py +436 -0
- intentkit/utils/error.py +134 -0
- intentkit/utils/logging.py +70 -0
- intentkit/utils/middleware.py +61 -0
- intentkit/utils/random.py +16 -0
- intentkit/utils/s3.py +267 -0
- intentkit/utils/slack_alert.py +79 -0
- intentkit/utils/tx.py +37 -0
- {intentkit-0.5.0.dist-info → intentkit-0.5.2.dist-info}/METADATA +1 -1
- intentkit-0.5.2.dist-info/RECORD +365 -0
- intentkit-0.5.0.dist-info/RECORD +0 -4
- {intentkit-0.5.0.dist-info → intentkit-0.5.2.dist-info}/WHEEL +0 -0
- {intentkit-0.5.0.dist-info → intentkit-0.5.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Web Scraper & Content Indexing Skills
|
|
2
|
+
|
|
3
|
+
Intelligent web scraping and content indexing using LangChain's WebBaseLoader with vector search capabilities.
|
|
4
|
+
|
|
5
|
+
## Skills
|
|
6
|
+
|
|
7
|
+
### 🔍 `scrape_and_index`
|
|
8
|
+
Scrape content from URLs and index into a searchable vector store with configurable chunking and persistent storage.
|
|
9
|
+
|
|
10
|
+
### 🔎 `query_indexed_content`
|
|
11
|
+
Search indexed content using semantic similarity to answer questions and retrieve relevant information.
|
|
12
|
+
|
|
13
|
+
## Key Features
|
|
14
|
+
|
|
15
|
+
- **Multi-URL Support**: Scrape up to 10 URLs simultaneously
|
|
16
|
+
- **Smart Chunking**: Configurable text splitting (100-4000 chars) with overlap
|
|
17
|
+
- **Vector Search**: FAISS + OpenAI embeddings for semantic retrieval
|
|
18
|
+
- **Agent Storage**: Persistent, per-agent content indexing
|
|
19
|
+
- **Rate Limiting**: Respectful scraping (0.1-10 req/sec)
|
|
20
|
+
|
|
21
|
+
## Testing Examples
|
|
22
|
+
|
|
23
|
+
### 1. Basic Scraping & Indexing
|
|
24
|
+
|
|
25
|
+
**Agent Prompt:**
|
|
26
|
+
```
|
|
27
|
+
Please scrape and index this URL: https://docs.crestal.network/introduction
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
**Expected Response:**
|
|
31
|
+
- Confirmation of successful scraping
|
|
32
|
+
- Number of URLs processed and chunks created
|
|
33
|
+
- Storage confirmation message
|
|
34
|
+
|
|
35
|
+
### 2. Custom Chunking
|
|
36
|
+
|
|
37
|
+
**Agent Prompt:**
|
|
38
|
+
```
|
|
39
|
+
Scrape and index https://docs.crestal.network/introduction with chunk size 500 and overlap 100.
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### 3. Content Querying
|
|
43
|
+
|
|
44
|
+
**Agent Prompt (after indexing):**
|
|
45
|
+
```
|
|
46
|
+
Based on the indexed documentation, what are the main topics it covers?
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
## Testing Workflow
|
|
51
|
+
|
|
52
|
+
1. **Setup**: Configure the skill in your agent
|
|
53
|
+
2. **Index Content**: Use `scrape_and_index` with test URLs
|
|
54
|
+
3. **Query Content**: Use `query_indexed_content` with questions
|
|
55
|
+
4. **Verify**: Check responses include source attribution and relevant content
|
|
56
|
+
|
|
57
|
+
## API Testing
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Test scraping via API
|
|
61
|
+
curl -X POST "http://localhost:8000/agents/your-agent-id/chat" \
|
|
62
|
+
-H "Content-Type: application/json" \
|
|
63
|
+
-d '{
|
|
64
|
+
"message": "Scrape and index https://docs.crestal.network/introduction"
|
|
65
|
+
}'
|
|
66
|
+
|
|
67
|
+
# Test querying via API
|
|
68
|
+
curl -X POST "http://localhost:8000/agents/your-agent-id/chat" \
|
|
69
|
+
-H "Content-Type: application/json" \
|
|
70
|
+
-d '{
|
|
71
|
+
"message": "What information did you find?"
|
|
72
|
+
}'
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Dependencies
|
|
76
|
+
|
|
77
|
+
Required packages (add to `pyproject.toml` if missing):
|
|
78
|
+
- `langchain-community` - WebBaseLoader
|
|
79
|
+
- `langchain-openai` - Embeddings
|
|
80
|
+
- `langchain-text-splitters` - Document chunking
|
|
81
|
+
- `faiss-cpu` - Vector storage
|
|
82
|
+
- `beautifulsoup4` - HTML parsing
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Web scraper skills for content indexing and retrieval."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import TypedDict
|
|
5
|
+
|
|
6
|
+
from intentkit.abstracts.skill import SkillStoreABC
|
|
7
|
+
from intentkit.skills.base import SkillConfig, SkillOwnerState, SkillState
|
|
8
|
+
from intentkit.skills.web_scraper.base import WebScraperBaseTool
|
|
9
|
+
from intentkit.skills.web_scraper.scrape_and_index import (
|
|
10
|
+
QueryIndexedContent,
|
|
11
|
+
ScrapeAndIndex,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Cache skills at the system level, because they are stateless
# (instances are shared across all agents; populated lazily by
# get_web_scraper_skill on first request per skill name).
_cache: dict[str, WebScraperBaseTool] = {}

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SkillStates(TypedDict):
    # Per-skill availability states for this skill category.
    # scrape_and_index uses SkillOwnerState (owner-only levels), while
    # query_indexed_content uses SkillState, which may also be "public".
    scrape_and_index: SkillOwnerState
    query_indexed_content: SkillState
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Config(SkillConfig):
    """Configuration for web scraper skills.

    Extends the shared SkillConfig with the per-skill availability map
    declared by SkillStates.
    """

    # Availability state for each web scraper skill.
    states: SkillStates
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
async def get_skills(
    config: "Config",
    is_private: bool,
    store: SkillStoreABC,
    **_,
) -> list[WebScraperBaseTool]:
    """Resolve the enabled web scraper skills for an agent.

    Args:
        config: The configuration for web scraper skills.
        is_private: Whether to include private skills.
        store: The skill store for persisting data.

    Returns:
        A list of web scraper skills.
    """

    # A skill is visible when it is public, or private and the caller is
    # the agent owner; "disabled" skills are never included.
    def _is_visible(state) -> bool:
        return state == "public" or (state == "private" and is_private)

    enabled_names = [
        name for name, state in config["states"].items() if _is_visible(state)
    ]

    # Resolve each name through the cached factory; unknown names yield
    # None (with a warning logged by the factory) and are dropped here.
    resolved = (get_web_scraper_skill(name, store) for name in enabled_names)
    return [skill for skill in resolved if skill]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_web_scraper_skill(
    name: str,
    store: SkillStoreABC,
) -> WebScraperBaseTool | None:
    """Get a web scraper skill by name, creating and caching it on first use.

    Args:
        name: The name of the skill to get
        store: The skill store for persisting data

    Returns:
        The requested web scraper skill, or None if the name is unknown.
    """
    # Skills are stateless, so a single instance per name is shared
    # process-wide via the module-level cache.
    if name not in _cache:
        if name == "scrape_and_index":
            _cache[name] = ScrapeAndIndex(skill_store=store)
        elif name == "query_indexed_content":
            _cache[name] = QueryIndexedContent(skill_store=store)
        else:
            # Unknown names are a caller/config error: warn and return None
            # (annotated as Optional, unlike the original's bare return type).
            logger.warning("Unknown web scraper skill: %s", name)
            return None
    return _cache[name]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Type
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from intentkit.abstracts.skill import SkillStoreABC
|
|
6
|
+
from intentkit.skills.base import IntentKitSkill
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class WebScraperBaseTool(IntentKitSkill):
    """Base class for web scraper tools.

    Concrete tools subclass this and supply their own name, description,
    and args_schema; the skill_store is injected at construction time.
    """

    # Tool name exposed to the agent/tooling layer.
    name: str = Field(description="The name of the tool")
    # Human-readable summary of what the tool does.
    description: str = Field(description="A description of what the tool does")
    # Pydantic model describing the tool's input arguments.
    args_schema: Type[BaseModel]
    # Store used to persist and retrieve skill data across calls.
    skill_store: SkillStoreABC = Field(
        description="The skill store for persisting data"
    )

    @property
    def category(self) -> str:
        # Category identifier for this skill family.
        return "web_scraper"
|
|
Binary file
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"type": "object",
|
|
4
|
+
"title": "Web Scraper & Content Indexing",
|
|
5
|
+
"description": "Scrape web content and index it for intelligent querying and retrieval",
|
|
6
|
+
"x-icon": "https://ai.service.crestal.dev/skills/web_scraper/langchain.png",
|
|
7
|
+
"x-tags": [
|
|
8
|
+
"Web Scraping",
|
|
9
|
+
"Content Indexing",
|
|
10
|
+
"Vector Search",
|
|
11
|
+
"LangChain",
|
|
12
|
+
"Document Retrieval"
|
|
13
|
+
],
|
|
14
|
+
"properties": {
|
|
15
|
+
"enabled": {
|
|
16
|
+
"type": "boolean",
|
|
17
|
+
"title": "Enabled",
|
|
18
|
+
"description": "Whether this skill is enabled",
|
|
19
|
+
"default": false
|
|
20
|
+
},
|
|
21
|
+
"states": {
|
|
22
|
+
"type": "object",
|
|
23
|
+
"properties": {
|
|
24
|
+
"scrape_and_index": {
|
|
25
|
+
"type": "string",
|
|
26
|
+
"title": "Scrape & Index Content",
|
|
27
|
+
"enum": [
|
|
28
|
+
"disabled",
|
|
29
|
+
"private"
|
|
30
|
+
],
|
|
31
|
+
"x-enum-title": [
|
|
32
|
+
"Disabled",
|
|
33
|
+
"Agent Owner Only"
|
|
34
|
+
],
|
|
35
|
+
"description": "Scrape content from web URLs and index it into a searchable vector store for later retrieval. Supports multiple URLs, customizable chunking, and persistent storage.",
|
|
36
|
+
"default": "private"
|
|
37
|
+
},
|
|
38
|
+
"query_indexed_content": {
|
|
39
|
+
"type": "string",
|
|
40
|
+
"title": "Query Indexed Content",
|
|
41
|
+
"enum": [
|
|
42
|
+
"disabled",
|
|
43
|
+
"public",
|
|
44
|
+
"private"
|
|
45
|
+
],
|
|
46
|
+
"x-enum-title": [
|
|
47
|
+
"Disabled",
|
|
48
|
+
"Agent Owner + All Users",
|
|
49
|
+
"Agent Owner Only"
|
|
50
|
+
],
|
|
51
|
+
"description": "Search and retrieve relevant information from previously indexed web content using semantic similarity. Perfect for answering questions based on scraped documents.",
|
|
52
|
+
"default": "private"
|
|
53
|
+
}
|
|
54
|
+
},
|
|
55
|
+
"description": "Configure the availability of each web scraper skill (disabled, public, or private)"
|
|
56
|
+
},
|
|
57
|
+
"max_urls_per_request": {
|
|
58
|
+
"type": "integer",
|
|
59
|
+
"title": "Max URLs per Request",
|
|
60
|
+
"description": "Maximum number of URLs that can be scraped in a single request",
|
|
61
|
+
"default": 10,
|
|
62
|
+
"minimum": 1,
|
|
63
|
+
"maximum": 20
|
|
64
|
+
},
|
|
65
|
+
"default_chunk_size": {
|
|
66
|
+
"type": "integer",
|
|
67
|
+
"title": "Default Chunk Size",
|
|
68
|
+
"description": "Default size of text chunks for document indexing (characters)",
|
|
69
|
+
"default": 1000,
|
|
70
|
+
"minimum": 100,
|
|
71
|
+
"maximum": 4000
|
|
72
|
+
},
|
|
73
|
+
"default_chunk_overlap": {
|
|
74
|
+
"type": "integer",
|
|
75
|
+
"title": "Default Chunk Overlap",
|
|
76
|
+
"description": "Default overlap between chunks to maintain context (characters)",
|
|
77
|
+
"default": 200,
|
|
78
|
+
"minimum": 0,
|
|
79
|
+
"maximum": 1000
|
|
80
|
+
},
|
|
81
|
+
"requests_per_second": {
|
|
82
|
+
"type": "number",
|
|
83
|
+
"title": "Requests per Second",
|
|
84
|
+
"description": "Rate limit for web scraping to be respectful to target servers",
|
|
85
|
+
"default": 2,
|
|
86
|
+
"minimum": 0.1,
|
|
87
|
+
"maximum": 10
|
|
88
|
+
},
|
|
89
|
+
"request_timeout": {
|
|
90
|
+
"type": "integer",
|
|
91
|
+
"title": "Request Timeout",
|
|
92
|
+
"description": "Timeout for web requests in seconds",
|
|
93
|
+
"default": 30,
|
|
94
|
+
"minimum": 5,
|
|
95
|
+
"maximum": 120
|
|
96
|
+
},
|
|
97
|
+
"api_key_provider": {
|
|
98
|
+
"type": "string",
|
|
99
|
+
"title": "API Key Provider",
|
|
100
|
+
"description": "Who provides the API key for embeddings",
|
|
101
|
+
"enum": [
|
|
102
|
+
"platform"
|
|
103
|
+
],
|
|
104
|
+
"x-enum-title": [
|
|
105
|
+
"Platform Hosted"
|
|
106
|
+
],
|
|
107
|
+
"default": "platform"
|
|
108
|
+
}
|
|
109
|
+
},
|
|
110
|
+
"required": [
|
|
111
|
+
"states",
|
|
112
|
+
"enabled"
|
|
113
|
+
],
|
|
114
|
+
"additionalProperties": true
|
|
115
|
+
}
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import base64
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
from typing import List, Type
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from langchain_community.document_loaders import WebBaseLoader
|
|
10
|
+
from langchain_community.vectorstores import FAISS
|
|
11
|
+
from langchain_core.runnables import RunnableConfig
|
|
12
|
+
from langchain_openai import OpenAIEmbeddings
|
|
13
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
|
|
16
|
+
from intentkit.skills.web_scraper.base import WebScraperBaseTool
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ScrapeAndIndexInput(BaseModel):
    """Input for ScrapeAndIndex tool.

    Attributes:
        urls: Web addresses to scrape; constrained to 1-10 entries.
        chunk_size: Target size (characters) of each indexed text chunk.
        chunk_overlap: Characters shared between consecutive chunks.
    """

    urls: List[str] = Field(
        description="List of URLs to scrape and index. Each URL should be a valid web address starting with http:// or https://",
        # min_length/max_length are the Pydantic v2 names for the
        # deprecated v1 min_items/max_items list constraints.
        min_length=1,
        max_length=10,
    )
    chunk_size: int = Field(
        description="Size of text chunks for indexing (default: 1000)",
        default=1000,
        ge=100,
        le=4000,
    )
    chunk_overlap: int = Field(
        description="Overlap between chunks (default: 200)",
        default=200,
        ge=0,
        le=1000,
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class QueryIndexInput(BaseModel):
    """Input for QueryIndex tool."""

    # Natural-language question to run against the indexed content.
    query: str = Field(
        description="Question or query to search in the indexed content",
        min_length=1,
        max_length=500,
    )
    # Upper bound on how many matching documents are returned.
    max_results: int = Field(
        description="Maximum number of relevant documents to return (default: 4)",
        default=4,
        ge=1,
        le=10,
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ScrapeAndIndex(WebScraperBaseTool):
    """Tool for scraping web content and indexing it into a searchable vector store.

    Scrapes one or more URLs, splits the text into overlapping chunks,
    embeds them with OpenAI embeddings, and persists the resulting FAISS
    index (base64-encoded, keyed per agent) in skill storage so it can be
    queried later by the query_indexed_content tool.
    """

    name: str = "web_scraper_scrape_and_index"
    description: str = (
        "Scrape content from one or more web URLs and index them into a vector store for later querying.\n"
        "Use this tool to collect and index web content that you want to reference later.\n"
        "The indexed content can then be queried using the query_indexed_content tool."
    )
    args_schema: Type[BaseModel] = ScrapeAndIndexInput

    def _validate_urls(self, urls: List[str]) -> List[str]:
        """Return only well-formed http(s) URLs, logging any that are dropped."""
        valid_urls: List[str] = []
        for url in urls:
            try:
                parsed = urlparse(url)
                if parsed.scheme in ["http", "https"] and parsed.netloc:
                    valid_urls.append(url)
                else:
                    logger.warning(f"Invalid URL format: {url}")
            except Exception as e:
                logger.warning(f"Error parsing URL {url}: {e}")
        return valid_urls

    @staticmethod
    def _encode_faiss_dir(temp_dir: str) -> dict:
        """Base64-encode every file in a saved FAISS index directory."""
        encoded_files = {}
        for filename in os.listdir(temp_dir):
            file_path = os.path.join(temp_dir, filename)
            if os.path.isfile(file_path):
                with open(file_path, "rb") as f:
                    encoded_files[filename] = base64.b64encode(f.read()).decode(
                        "utf-8"
                    )
        return encoded_files

    async def _arun(
        self,
        urls: List[str],
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        config: RunnableConfig = None,
        **kwargs,
    ) -> str:
        """Scrape URLs and index content into vector store.

        Args:
            urls: Candidate URLs; invalid ones are skipped with a warning.
            chunk_size: Character size of each text chunk.
            chunk_overlap: Characters shared between adjacent chunks.
            config: Runnable config carrying the agent context (optional).

        Returns:
            A human-readable summary of what was indexed, or an error string.
        """
        try:
            # Validate URLs
            valid_urls = self._validate_urls(urls)
            if not valid_urls:
                return "Error: No valid URLs provided. URLs must start with http:// or https://"

            # Get agent context for storage; without a context, data lands in
            # a shared "default" namespace.
            context = self.context_from_config(config) if config else None
            agent_id = context.agent.id if context else "default"

            # Load documents from URLs
            logger.info(f"Scraping {len(valid_urls)} URLs...")
            loader = WebBaseLoader(
                web_paths=valid_urls,
                requests_per_second=2,  # Be respectful to servers
                show_progress=True,
            )

            # Configure loader for better content extraction
            loader.requests_kwargs = {
                "verify": True,
                "timeout": 30,
            }

            # WebBaseLoader.load is blocking; keep the event loop responsive.
            documents = await asyncio.to_thread(loader.load)

            if not documents:
                return "Error: No content could be extracted from the provided URLs."

            # Split documents into chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
            )
            split_docs = text_splitter.split_documents(documents)

            if not split_docs:
                return "Error: No content could be processed into chunks."

            # Create embeddings and vector store. FAISS.from_documents calls
            # the embeddings API synchronously, so run it off the event loop
            # (same pattern as the loader.load call above).
            api_key = self.skill_store.get_system_config("openai_api_key")
            embeddings = OpenAIEmbeddings(api_key=api_key)
            vector_store = await asyncio.to_thread(
                FAISS.from_documents, split_docs, embeddings
            )

            vector_store_key = f"vector_store_{agent_id}"
            metadata_key = f"indexed_urls_{agent_id}"

            # FAISS only persists to disk: save into a temp dir, then encode
            # the files to base64 so they fit in skill storage.
            with tempfile.TemporaryDirectory() as temp_dir:
                vector_store.save_local(temp_dir)
                encoded_files = self._encode_faiss_dir(temp_dir)

            # Store vector store data
            await self.skill_store.save_agent_skill_data(
                agent_id=agent_id,
                skill="web_scraper",
                key=vector_store_key,
                data={
                    "faiss_files": encoded_files,
                    "chunk_size": chunk_size,
                    "chunk_overlap": chunk_overlap,
                },
            )

            # Merge per-URL metadata with anything indexed previously.
            existing_metadata = (
                await self.skill_store.get_agent_skill_data(
                    agent_id, "web_scraper", metadata_key
                )
                or {}
            )
            # Wall-clock timestamp: the event-loop clock has an arbitrary
            # epoch and is meaningless once persisted.
            indexed_at = str(time.time())
            existing_metadata.update(
                {
                    url: {
                        "indexed_at": indexed_at,
                        "chunks": sum(
                            1
                            for doc in split_docs
                            if doc.metadata.get("source") == url
                        ),
                    }
                    for url in valid_urls
                }
            )

            await self.skill_store.save_agent_skill_data(
                agent_id=agent_id,
                skill="web_scraper",
                key=metadata_key,
                data=existing_metadata,
            )

            total_chunks = len(split_docs)
            successful_urls = len(valid_urls)

            # Bullet each URL on its own line. (The previous expression
            # joined URLs with "• " instead of "\n• " due to operator
            # precedence, producing a single run-on line.)
            url_bullets = "\n".join(f"• {url}" for url in valid_urls)
            return (
                f"Successfully scraped and indexed {successful_urls} URLs:\n"
                f"{url_bullets}\n\n"
                f"Total chunks created: {total_chunks}\n"
                f"Chunk size: {chunk_size} characters\n"
                f"Chunk overlap: {chunk_overlap} characters\n\n"
                f"The content is now indexed and can be queried using the query_indexed_content tool."
            )

        except Exception as e:
            logger.error(f"Error in scrape_and_index: {e}")
            return f"Error scraping and indexing URLs: {str(e)}"
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class QueryIndexedContent(WebScraperBaseTool):
    """Tool for querying previously indexed web content.

    Restores the agent's persisted FAISS index from skill storage and runs a
    similarity search over it to answer questions about content previously
    scraped and indexed by the scrape_and_index tool.
    """

    name: str = "web_scraper_query_indexed_content"
    description: str = (
        "Query previously indexed web content to find relevant information and answer questions.\n"
        "Use this tool to search through content that was previously scraped and indexed.\n"
        "This tool can help answer questions based on the indexed web content."
    )
    args_schema: Type[BaseModel] = QueryIndexInput

    async def _arun(
        self,
        query: str,
        max_results: int = 4,
        config: RunnableConfig = None,
        **kwargs,
    ) -> str:
        """Query the indexed content.

        Args:
            query: Natural-language question or search phrase.
            max_results: Number of chunks returned (the search k).
            config: Runnable config carrying the agent context (optional).

        Returns:
            Formatted matches plus a summary of indexed sources, or an
            explanatory error string.
        """
        try:
            # Get agent context for storage — must resolve to the same
            # namespace scrape_and_index wrote to.
            context = self.context_from_config(config) if config else None
            agent_id = context.agent.id if context else "default"

            # Retrieve vector store
            vector_store_key = f"vector_store_{agent_id}"
            metadata_key = f"indexed_urls_{agent_id}"

            stored_data = await self.skill_store.get_agent_skill_data(
                agent_id, "web_scraper", vector_store_key
            )
            if not stored_data or "faiss_files" not in stored_data:
                return (
                    "No indexed content found. Please use the scrape_and_index tool first "
                    "to scrape and index some web content before querying."
                )

            # Restore vector store from base64 encoded files
            api_key = self.skill_store.get_system_config("openai_api_key")
            embeddings = OpenAIEmbeddings(api_key=api_key)

            with tempfile.TemporaryDirectory() as temp_dir:
                # Decode and write files to temporary directory
                for filename, encoded_content in stored_data["faiss_files"].items():
                    file_path = os.path.join(temp_dir, filename)
                    with open(file_path, "wb") as f:
                        f.write(base64.b64decode(encoded_content))

                # load_local reads from disk and similarity_search makes a
                # synchronous embeddings API call for the query — run both
                # off the event loop so it stays responsive.
                vector_store = await asyncio.to_thread(
                    FAISS.load_local,
                    temp_dir,
                    embeddings,
                    allow_dangerous_deserialization=True,  # Safe since we control the serialization
                )
                relevant_docs = await asyncio.to_thread(
                    vector_store.similarity_search, query, k=max_results
                )

            if not relevant_docs:
                return f"No relevant content found for query: '{query}'"

            # Get metadata about indexed URLs
            metadata = (
                await self.skill_store.get_agent_skill_data(
                    agent_id, "web_scraper", metadata_key
                )
                or {}
            )

            # Format response
            response_parts = [
                f"Found {len(relevant_docs)} relevant pieces of content for: '{query}'\n",
                "=" * 50,
            ]

            for i, doc in enumerate(relevant_docs, 1):
                source_url = doc.metadata.get("source", "Unknown source")
                title = doc.metadata.get("title", "No title")

                response_parts.extend(
                    [
                        f"\n{i}. Source: {source_url}",
                        f" Title: {title}",
                        f" Content:\n {doc.page_content[:500]}{'...' if len(doc.page_content) > 500 else ''}",
                        "",
                    ]
                )

            # Add summary of indexed content
            response_parts.extend(
                [
                    "\n" + "=" * 50,
                    f"Total indexed URLs: {len(metadata)}",
                    "Indexed sources:",
                    *[f"• {url}" for url in metadata.keys()],
                ]
            )

            return "\n".join(response_parts)

        except Exception as e:
            logger.error(f"Error in query_indexed_content: {e}")
            return f"Error querying indexed content: {str(e)}"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|