intentkit 0.6.19.dev2__py3-none-any.whl → 0.6.20.dev1__py3-none-any.whl

This diff compares the contents of two package versions as publicly released to a supported registry; it is provided for informational purposes only.

Potentially problematic release.

intentkit/__init__.py CHANGED
@@ -3,7 +3,7 @@
  A powerful platform for building AI agents with blockchain and cryptocurrency capabilities.
  """
 
- __version__ = "0.6.19-dev2"
+ __version__ = "0.6.20-dev.1"
  __author__ = "hyacinthus"
  __email__ = "hyacinthus@gmail.com"
 
intentkit/skills/firecrawl/README.md CHANGED
@@ -5,18 +5,22 @@ The Firecrawl skills provide advanced web scraping and content indexing capabili
  ## Skills Overview
 
  ### 1. firecrawl_scrape
- Scrapes a single webpage and optionally indexes the content for future querying.
+ Scrapes a single webpage and REPLACES any existing indexed content for that URL, preventing duplicates.
 
  **Parameters:**
  - `url` (required): The URL to scrape
- - `formats` (optional): Output formats - markdown, html, rawHtml, screenshot, links, extract (default: ["markdown"])
+ - `formats` (optional): Output formats - markdown, html, rawHtml, screenshot, links, json (default: ["markdown"])
+ - `only_main_content` (optional): Extract only main content (default: true)
  - `include_tags` (optional): HTML tags to include (e.g., ["h1", "h2", "p"])
  - `exclude_tags` (optional): HTML tags to exclude
- - `only_main_content` (optional): Extract only main content (default: true)
+ - `wait_for` (optional): Wait time in milliseconds before scraping
+ - `timeout` (optional): Maximum timeout in milliseconds (default: 30000)
  - `index_content` (optional): Whether to index content for querying (default: true)
  - `chunk_size` (optional): Size of text chunks for indexing (default: 1000)
  - `chunk_overlap` (optional): Overlap between chunks (default: 200)
 
+ **Use Case:** Use this when you want to refresh/update content from a URL that was previously scraped, ensuring no duplicate or stale content remains.
+
  ### 2. firecrawl_crawl
  Crawls multiple pages from a website and indexes all content.
 
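Taken together, the parameter list in the hunk above describes the skill's full argument surface. A minimal sketch of what a complete payload might look like, using only the documented fields (the values and the plain-dict framing are illustrative, not IntentKit's actual call format):

```python
# Hypothetical firecrawl_scrape argument payload built from the documented
# fields above; every value here is an example, not a recommended setting.
scrape_args = {
    "url": "https://example.com/docs",  # required
    "formats": ["markdown", "links"],   # default: ["markdown"]
    "only_main_content": True,          # default: true
    "include_tags": ["h1", "h2", "p"],
    "exclude_tags": ["nav", "footer"],
    "wait_for": 2000,                   # milliseconds to wait before scraping
    "timeout": 30000,                   # milliseconds; default 30000
    "index_content": True,              # default: true
    "chunk_size": 1000,                 # default: 1000
    "chunk_overlap": 200,               # default: 200
}
```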
@@ -158,8 +162,9 @@ Prompt: "Use firecrawl_scrape to scrape https://example.com and index the conten
  ### Documentation Indexing
  ```
  1. Scrape main documentation page
- 2. Crawl related documentation sections
- 3. Query for specific technical information
+ 2. Crawl related documentation sections
+ 3. Use scrape again to update changed pages (replaces old content)
+ 4. Query for specific technical information
  ```
 
  ### Competitive Analysis
@@ -205,6 +210,7 @@ Prompt: "Use firecrawl_scrape to scrape https://example.com and index the conten
  - **PDF Support**: Can scrape and index PDF documents
  - **Intelligent Chunking**: Optimized text splitting for better search
  - **Independent Storage**: Uses its own dedicated vector store for Firecrawl content
+ - **Content Replacement**: Replace mode prevents duplicate/stale content
  - **Metadata Rich**: Includes source URLs, timestamps, and content types
  - **Semantic Search**: Uses OpenAI embeddings for intelligent querying
  - **Batch Processing**: Efficient handling of multiple pages
intentkit/skills/firecrawl/schema.json CHANGED
@@ -34,7 +34,7 @@
            "Agent Owner + All Users",
            "Agent Owner Only"
          ],
-         "description": "Scrape single web pages and extract content in various formats (markdown, HTML, JSON, etc.). Handles JavaScript-rendered content, PDFs, and dynamic websites.",
+         "description": "Scrape single web pages and REPLACE any existing indexed content for that URL. Unlike regular scrape, this prevents duplicate content when re-scraping the same page. Use this to refresh/update content from a previously scraped URL.",
          "default": "private"
        },
        "firecrawl_crawl": {
intentkit/skills/firecrawl/scrape.py CHANGED
@@ -62,10 +62,11 @@ class FirecrawlScrapeInput(BaseModel):
 
 
  class FirecrawlScrape(FirecrawlBaseTool):
-     """Tool for scraping web pages using Firecrawl.
+     """Tool for scraping web pages using Firecrawl with REPLACE behavior.
 
-     This tool uses Firecrawl's API to scrape web pages and convert them into clean,
-     LLM-ready formats like markdown, HTML, or structured JSON data.
+     This tool uses Firecrawl's API to scrape web pages and REPLACES any existing
+     indexed content for the same URL instead of appending to it. This prevents
+     duplicate content when re-scraping the same page.
 
      Attributes:
          name: The name of the tool.
@@ -75,10 +76,10 @@ class FirecrawlScrape(FirecrawlBaseTool):
 
      name: str = "firecrawl_scrape"
      description: str = (
-         "Scrape a single web page and extract its content in various formats (markdown, HTML, JSON, etc.). "
+         "Scrape a single web page and REPLACE any existing indexed content for that URL. "
+         "Unlike regular scrape, this tool removes old content before adding new content, preventing duplicates. "
          "This tool can handle JavaScript-rendered content, PDFs, and dynamic websites. "
-         "Optionally indexes the content for later querying using the firecrawl_query_indexed_content tool. "
-         "Use this when you need to extract clean, structured content from a specific URL."
+         "Use this when you want to refresh/update content from a URL that was previously scraped."
      )
      args_schema: Type[BaseModel] = FirecrawlScrapeInput
 
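The `name`/`description`/`args_schema` triple in the hunk above is the LangChain-style tool surface: the description text is what the agent's model reads when choosing a tool, which is why this release rewrites it to advertise the REPLACE behavior. A self-contained stand-in for that shape (the real `FirecrawlBaseTool` and `FirecrawlScrapeInput` live in the package; the classes below are illustrative only):

```python
# Illustrative stand-ins mirroring the name/description/args_schema shape
# from the hunk above; not the package's actual base classes.
from typing import Type

from pydantic import BaseModel, Field


class ScrapeInput(BaseModel):
    """Stand-in for FirecrawlScrapeInput."""

    url: str = Field(description="The URL to scrape")


class ScrapeTool:
    """Stand-in for FirecrawlScrape(FirecrawlBaseTool)."""

    name: str = "firecrawl_scrape"
    description: str = (
        "Scrape a single web page and REPLACE any existing indexed content "
        "for that URL, preventing duplicates when re-scraping."
    )
    args_schema: Type[BaseModel] = ScrapeInput
```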
@@ -187,7 +188,7 @@ class FirecrawlScrape(FirecrawlBaseTool):
          result_data = data.get("data", {})
 
          # Format the results based on requested formats
-         formatted_result = f"Successfully scraped: {url}\n\n"
+         formatted_result = f"Successfully scraped (REPLACE mode): {url}\n\n"
 
          if "markdown" in formats and result_data.get("markdown"):
              formatted_result += "## Markdown Content\n"
@@ -236,13 +237,16 @@
                  formatted_result += f"Language: {metadata['language']}\n"
              formatted_result += "\n"
 
-         # Index content if requested
+         # Index content if requested - REPLACE MODE
          if index_content and result_data.get("markdown"):
              try:
-                 # Import indexing utilities from firecrawl utils
+                 # Import indexing utilities
+                 from langchain_community.vectorstores import FAISS
+
                  from intentkit.skills.firecrawl.utils import (
+                     FirecrawlDocumentProcessor,
                      FirecrawlMetadataManager,
-                     index_documents,
+                     FirecrawlVectorStoreManager,
                  )
 
                  # Create document from scraped content
@@ -261,38 +265,149 @@
                  # Get agent ID for indexing
                  agent_id = context.agent_id
                  if agent_id:
-                     # Index the document
-                     total_chunks, was_merged = await index_documents(
-                         [document],
-                         agent_id,
-                         self.skill_store,
-                         chunk_size,
-                         chunk_overlap,
-                     )
-
-                     # Update metadata
+                     # Initialize managers
+                     vs_manager = FirecrawlVectorStoreManager(self.skill_store)
                      metadata_manager = FirecrawlMetadataManager(
                          self.skill_store
                      )
-                     new_metadata = metadata_manager.create_url_metadata(
-                         [url], [document], "firecrawl_scrape"
+
+                     # Load existing vector store
+                     existing_vector_store = await vs_manager.load_vector_store(
+                         agent_id
                      )
-                     await metadata_manager.update_metadata(
-                         agent_id, new_metadata
+
+                     # Split the new document into chunks
+                     split_docs = FirecrawlDocumentProcessor.split_documents(
+                         [document], chunk_size, chunk_overlap
                      )
 
-                     formatted_result += "\n## Content Indexing\n"
-                     formatted_result += (
-                         "Successfully indexed content into vector store:\n"
+                     # Create embeddings
+                     embeddings = vs_manager.create_embeddings()
+
+                     if existing_vector_store:
+                         # Get all existing documents and filter out those from the same URL
+                         try:
+                             # Try to access documents directly if available
+                             if hasattr(
+                                 existing_vector_store, "docstore"
+                             ) and hasattr(
+                                 existing_vector_store.docstore, "_dict"
+                             ):
+                                 # Access FAISS documents directly
+                                 all_docs = list(
+                                     existing_vector_store.docstore._dict.values()
+                                 )
+                             else:
+                                 # Fallback: use a reasonable k value for similarity search
+                                 # Use a dummy query to retrieve documents
+                                 all_docs = existing_vector_store.similarity_search(
+                                     "dummy",  # Use a dummy query instead of empty string
+                                     k=1000,  # Use reasonable upper bound
+                                 )
+
+                             # Filter out documents from the same URL
+                             preserved_docs = [
+                                 doc
+                                 for doc in all_docs
+                                 if doc.metadata.get("source") != url
+                             ]
+
+                             logger.info(
+                                 f"firecrawl_scrape: Preserving {len(preserved_docs)} docs from other URLs, "
+                                 f"replacing content from {url}"
+                             )
+
+                             # Create new vector store with preserved docs + new docs
+                             if preserved_docs:
+                                 # Combine preserved and new documents
+                                 all_documents = preserved_docs + split_docs
+                                 new_vector_store = FAISS.from_documents(
+                                     all_documents, embeddings
+                                 )
+                                 formatted_result += "\n## Content Replacement\n"
+                                 formatted_result += f"Replaced existing content for URL: {url}\n"
+                                 num_preserved_urls = len(
+                                     set(
+                                         doc.metadata.get("source", "")
+                                         for doc in preserved_docs
+                                     )
+                                 )
+                                 formatted_result += f"Preserved content from {num_preserved_urls} other URLs\n"
+                             else:
+                                 # No other documents to preserve, just create from new docs
+                                 new_vector_store = FAISS.from_documents(
+                                     split_docs, embeddings
+                                 )
+                                 formatted_result += "\n## Content Replacement\n"
+                                 formatted_result += f"Created new index with content from: {url}\n"
+                         except Exception as e:
+                             logger.warning(
+                                 f"Could not preserve other URLs, creating fresh index: {e}"
+                             )
+                             # Fallback: create new store with just the new documents
+                             new_vector_store = FAISS.from_documents(
+                                 split_docs, embeddings
+                             )
+                             formatted_result += "\n## Content Replacement\n"
+                             formatted_result += f"Created fresh index with content from: {url}\n"
+                     else:
+                         # No existing store, create new one
+                         new_vector_store = FAISS.from_documents(
+                             split_docs, embeddings
+                         )
+                         formatted_result += "\n## Content Indexing\n"
+                         formatted_result += (
+                             f"Created new index with content from: {url}\n"
+                         )
+
+                     # Save the new vector store
+                     await vs_manager.save_vector_store(
+                         agent_id, new_vector_store, chunk_size, chunk_overlap
                      )
-                     formatted_result += f"- Chunks created: {total_chunks}\n"
+
+                     # Update metadata to track all URLs
+                     # Get existing metadata to preserve other URLs
+                     metadata_key = f"indexed_urls_{agent_id}"
+                     existing_metadata = (
+                         await self.skill_store.get_agent_skill_data(
+                             agent_id, "firecrawl", metadata_key
+                         )
+                     )
+
+                     if existing_metadata and existing_metadata.get("urls"):
+                         # Remove the current URL and add it back (to update timestamp)
+                         existing_urls = [
+                             u for u in existing_metadata["urls"] if u != url
+                         ]
+                         existing_urls.append(url)
+                         updated_metadata = {
+                             "urls": existing_urls,
+                             "document_count": len(existing_urls),
+                             "source_type": "firecrawl_mixed",
+                             "indexed_at": str(len(existing_urls)),
+                         }
+                     else:
+                         # Create new metadata
+                         updated_metadata = metadata_manager.create_url_metadata(
+                             [url], [document], "firecrawl_scrape"
+                         )
+
+                     await metadata_manager.update_metadata(
+                         agent_id, updated_metadata
+                     )
+
+                     formatted_result += "\n## Content Indexing (REPLACE MODE)\n"
+                     formatted_result += "Successfully REPLACED indexed content in vector store:\n"
+                     formatted_result += f"- Chunks created: {len(split_docs)}\n"
                      formatted_result += f"- Chunk size: {chunk_size}\n"
                      formatted_result += f"- Chunk overlap: {chunk_overlap}\n"
-                     formatted_result += f"- Content merged with existing: {'Yes' if was_merged else 'No'}\n"
+                     formatted_result += (
+                         "- Previous content for this URL: REPLACED\n"
+                     )
                      formatted_result += "Use the 'firecrawl_query_indexed_content' skill to search this content.\n"
 
                      logger.info(
-                         f"firecrawl_scrape: Successfully indexed {url} with {total_chunks} chunks"
+                         f"firecrawl_scrape: Successfully replaced content for {url} with {len(split_docs)} chunks"
                      )
                  else:
                      formatted_result += "\n## Content Indexing\n"
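The core of the new logic is a replace-by-source pass over the FAISS store: read every chunk out of the store, drop the ones whose `source` metadata matches the URL being re-scraped, and rebuild the index from the survivors plus the fresh chunks. A condensed sketch of that pattern with the IntentKit manager classes abstracted away (only `FAISS` and its in-memory docstore are real `langchain_community` APIs here; the helper function is illustrative):

```python
# Sketch of the replace-by-URL pattern implemented in the hunk above:
# drop every chunk whose metadata["source"] matches the refreshed URL,
# then rebuild the store from preserved chunks plus the new ones.
from langchain_community.vectorstores import FAISS


def replace_url_content(store, new_chunks, url, embeddings):
    """Return a FAISS store where content for `url` is replaced, not appended."""
    if store is None:
        # No existing index yet: build one from the new chunks alone.
        return FAISS.from_documents(new_chunks, embeddings)

    # FAISS keeps its documents in an in-memory docstore; list them and
    # filter out the chunks that came from the URL being refreshed.
    all_docs = list(store.docstore._dict.values())
    preserved = [d for d in all_docs if d.metadata.get("source") != url]

    # Rebuilding re-embeds every preserved chunk, which the diff accepts as
    # the price of a clean, duplicate-free index.
    return FAISS.from_documents(preserved + new_chunks, embeddings)
```

Note the trade-off the fallback path encodes: if the docstore cannot be read directly, the `similarity_search("dummy", k=1000)` branch retrieves at most 1000 chunks, so content from other URLs beyond that bound can silently be lost; the direct `docstore._dict` path is the one doing the real work.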
intentkit-0.6.19.dev2.dist-info/METADATA → intentkit-0.6.20.dev1.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: intentkit
- Version: 0.6.19.dev2
+ Version: 0.6.20.dev1
  Summary: Intent-based AI Agent Platform - Core Package
  Project-URL: Homepage, https://github.com/crestal-network/intentkit
  Project-URL: Repository, https://github.com/crestal-network/intentkit
intentkit-0.6.19.dev2.dist-info/RECORD → intentkit-0.6.20.dev1.dist-info/RECORD
@@ -1,4 +1,4 @@
- intentkit/__init__.py,sha256=URY46LF0PzcfF5ekBoFEG5w1RPHfC3Ht-Gw_t45X9Sk,384
+ intentkit/__init__.py,sha256=L5N8UBhOoj8vD0NB2G81lATWypQbespoMFUeoZRNITg,385
  intentkit/abstracts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  intentkit/abstracts/agent.py,sha256=108gb5W8Q1Sy4G55F2_ZFv2-_CnY76qrBtpIr0Oxxqk,1489
  intentkit/abstracts/api.py,sha256=ZUc24vaQvQVbbjznx7bV0lbbQxdQPfEV8ZxM2R6wZWo,166
@@ -198,15 +198,15 @@ intentkit/skills/enso/abi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
  intentkit/skills/enso/abi/approval.py,sha256=IsyQLFxzAttocrtCB2PhbgprA7Vqujzpxvg0hJbeJ00,9867
  intentkit/skills/enso/abi/erc20.py,sha256=IScqZhHpMt_eFfYtMXw0-w5jptkAK0xsqqUDjbWdb2s,439
  intentkit/skills/enso/abi/route.py,sha256=ng9U2RSyS5R3d-b0m5ELa4rFpaUDO9HcgSoX9P_wWZo,4746
- intentkit/skills/firecrawl/README.md,sha256=LCi6ju-QO0nXti4y9-ltcF-bwrgXGT7NJpz67vFUcCo,6912
+ intentkit/skills/firecrawl/README.md,sha256=OP5rCC5aNx9A4YjgotZB-JFdBR_0qHiWmYLuA52a8Tw,7366
  intentkit/skills/firecrawl/__init__.py,sha256=QQ0I5vlUgsLRFqHO17vbq-3ERKL3nzoo2B4MFGH0Igg,3160
  intentkit/skills/firecrawl/base.py,sha256=8BqD3X6RK0RedWU-qsa5qPMpuXWTZ6NbYLSpppFK_EU,1334
  intentkit/skills/firecrawl/clear.py,sha256=mfzQg8e6sbCwSzJGN_Lqfgxt-0pvtH_dBtNSJpMQA5A,2830
  intentkit/skills/firecrawl/crawl.py,sha256=lhySK1TbxGcLAXQi1zvrp4Zdo5ghhBFvxc4mFMl5LoI,18278
  intentkit/skills/firecrawl/firecrawl.png,sha256=6GoGlIMYuIDo-TqMlZbD4QYkmxvQ7krqAa5MANumJqk,5065
  intentkit/skills/firecrawl/query.py,sha256=LZzIy-LmqyEa8cZoBm-Eoen6GRy3NJxfuQcGi54Hwp0,4364
- intentkit/skills/firecrawl/schema.json,sha256=3LfZPS-mdKNh8r7IQ-oAMFAq_xS5dVs9sV8PXeEUh6o,4439
- intentkit/skills/firecrawl/scrape.py,sha256=P2Pwbi5l6bbN1S8akwwr9dhtUHw20UBHdN0c2B5J9Rs,13642
+ intentkit/skills/firecrawl/schema.json,sha256=q3ynbCO1NDidHZd3Nh7TNZ6lCv6y26XW7WBrYlj-JM0,4513
+ intentkit/skills/firecrawl/scrape.py,sha256=2axmz5hZVnNGvTPTi0r0WAN4MoYNQZzOFtMZd5pRgcg,20704
  intentkit/skills/firecrawl/utils.py,sha256=Ot_vEg4Z30_BY3Xbh59gb_Tu17tSCmytRw49RGAzZ88,10093
  intentkit/skills/github/README.md,sha256=SzYGJ9qSPaZl68iD8AQJGKTMLv0keQZesnSK-VhrAfs,1802
  intentkit/skills/github/__init__.py,sha256=Vva9jMtACSM_cZXy5JY0h6Q1ejR1jm-Xu3Q6PwyB72o,1471
@@ -411,7 +411,7 @@ intentkit/utils/random.py,sha256=DymMxu9g0kuQLgJUqalvgksnIeLdS-v0aRk5nQU0mLI,452
  intentkit/utils/s3.py,sha256=9trQNkKQ5VgxWsewVsV8Y0q_pXzGRvsCYP8xauyUYkg,8549
  intentkit/utils/slack_alert.py,sha256=s7UpRgyzLW7Pbmt8cKzTJgMA9bm4EP-1rQ5KXayHu6E,2264
  intentkit/utils/tx.py,sha256=2yLLGuhvfBEY5n_GJ8wmIWLCzn0FsYKv5kRNzw_sLUI,1454
- intentkit-0.6.19.dev2.dist-info/METADATA,sha256=yH0g5MnOWthCWld7D-Xu--mKeaavKQdWxj7gLSinejo,6414
- intentkit-0.6.19.dev2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- intentkit-0.6.19.dev2.dist-info/licenses/LICENSE,sha256=Bln6DhK-LtcO4aXy-PBcdZv2f24MlJFm_qn222biJtE,1071
- intentkit-0.6.19.dev2.dist-info/RECORD,,
+ intentkit-0.6.20.dev1.dist-info/METADATA,sha256=oGwdu4cAD3dMnV6di-S4CTtXCr8vJH37NZNXn3yRqEA,6414
+ intentkit-0.6.20.dev1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ intentkit-0.6.20.dev1.dist-info/licenses/LICENSE,sha256=Bln6DhK-LtcO4aXy-PBcdZv2f24MlJFm_qn222biJtE,1071
+ intentkit-0.6.20.dev1.dist-info/RECORD,,