intentkit 0.6.19.dev2__py3-none-any.whl → 0.6.20.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of intentkit might be problematic.
- intentkit/__init__.py +1 -1
- intentkit/skills/firecrawl/README.md +11 -5
- intentkit/skills/firecrawl/schema.json +1 -1
- intentkit/skills/firecrawl/scrape.py +145 -30
- {intentkit-0.6.19.dev2.dist-info → intentkit-0.6.20.dev1.dist-info}/METADATA +1 -1
- {intentkit-0.6.19.dev2.dist-info → intentkit-0.6.20.dev1.dist-info}/RECORD +8 -8
- {intentkit-0.6.19.dev2.dist-info → intentkit-0.6.20.dev1.dist-info}/WHEEL +0 -0
- {intentkit-0.6.19.dev2.dist-info → intentkit-0.6.20.dev1.dist-info}/licenses/LICENSE +0 -0
intentkit/__init__.py
CHANGED

intentkit/skills/firecrawl/README.md
CHANGED

@@ -5,18 +5,22 @@ The Firecrawl skills provide advanced web scraping and content indexing capabili
 ## Skills Overview
 
 ### 1. firecrawl_scrape
-Scrapes a single webpage and
+Scrapes a single webpage and REPLACES any existing indexed content for that URL, preventing duplicates.
 
 **Parameters:**
 - `url` (required): The URL to scrape
-- `formats` (optional): Output formats - markdown, html, rawHtml, screenshot, links,
+- `formats` (optional): Output formats - markdown, html, rawHtml, screenshot, links, json (default: ["markdown"])
+- `only_main_content` (optional): Extract only main content (default: true)
 - `include_tags` (optional): HTML tags to include (e.g., ["h1", "h2", "p"])
 - `exclude_tags` (optional): HTML tags to exclude
-- `
+- `wait_for` (optional): Wait time in milliseconds before scraping
+- `timeout` (optional): Maximum timeout in milliseconds (default: 30000)
 - `index_content` (optional): Whether to index content for querying (default: true)
 - `chunk_size` (optional): Size of text chunks for indexing (default: 1000)
 - `chunk_overlap` (optional): Overlap between chunks (default: 200)
 
+**Use Case:** Use this when you want to refresh/update content from a URL that was previously scraped, ensuring no duplicate or stale content remains.
+
 ### 2. firecrawl_crawl
 Crawls multiple pages from a website and indexes all content.
 
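For orientation, the parameters documented above combine into a single scrape request along these lines (an illustrative sketch; names and defaults are taken from the README's parameter list, and the plain-dict shape is not the package's actual call signature):

```python
# Illustrative firecrawl_scrape parameter set, using the defaults documented
# in the README hunk above. Only "url" is required; everything else is optional.
scrape_params = {
    "url": "https://example.com/docs",  # required: the URL to scrape
    "formats": ["markdown"],            # default: ["markdown"]
    "only_main_content": True,          # default: true (new in this release)
    "include_tags": ["h1", "h2", "p"],  # optional tag whitelist
    "wait_for": 2000,                   # optional: ms to wait before scraping
    "timeout": 30000,                   # default: 30000 ms
    "index_content": True,              # default: true, index for later querying
    "chunk_size": 1000,                 # default: 1000
    "chunk_overlap": 200,               # default: 200
}
```

With this release, re-running the skill against the same `url` replaces the previously indexed chunks instead of appending duplicates.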
@@ -158,8 +162,9 @@ Prompt: "Use firecrawl_scrape to scrape https://example.com and index the conten
 ### Documentation Indexing
 ```
 1. Scrape main documentation page
-2. Crawl related documentation sections
-3.
+2. Crawl related documentation sections
+3. Use scrape again to update changed pages (replaces old content)
+4. Query for specific technical information
 ```
 
 ### Competitive Analysis
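Mapped onto the skills this README lists, the updated workflow might run as follows (a hypothetical sequence; the skill names are the README's own, while the tuple shape and URLs are illustrative only):

```python
# Hypothetical call sequence for the documentation-indexing workflow above.
# Skill names come from this README; the (skill, params) form is illustrative.
workflow = [
    ("firecrawl_scrape", {"url": "https://docs.example.com"}),        # 1. main page
    ("firecrawl_crawl", {"url": "https://docs.example.com/guides"}),  # 2. related sections
    ("firecrawl_scrape", {"url": "https://docs.example.com"}),        # 3. re-scrape: stale chunks replaced
    ("firecrawl_query_indexed_content", {"query": "rate limits"}),    # 4. semantic query
]
```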
@@ -205,6 +210,7 @@ Prompt: "Use firecrawl_scrape to scrape https://example.com and index the conten
 - **PDF Support**: Can scrape and index PDF documents
 - **Intelligent Chunking**: Optimized text splitting for better search
 - **Independent Storage**: Uses its own dedicated vector store for Firecrawl content
+- **Content Replacement**: Replace mode prevents duplicate/stale content
 - **Metadata Rich**: Includes source URLs, timestamps, and content types
 - **Semantic Search**: Uses OpenAI embeddings for intelligent querying
 - **Batch Processing**: Efficient handling of multiple pages
intentkit/skills/firecrawl/schema.json
CHANGED

@@ -34,7 +34,7 @@
         "Agent Owner + All Users",
         "Agent Owner Only"
       ],
-      "description": "Scrape single web pages and
+      "description": "Scrape single web pages and REPLACE any existing indexed content for that URL. Unlike regular scrape, this prevents duplicate content when re-scraping the same page. Use this to refresh/update content from a previously scraped URL.",
       "default": "private"
     },
     "firecrawl_crawl": {
intentkit/skills/firecrawl/scrape.py
CHANGED

@@ -62,10 +62,11 @@ class FirecrawlScrapeInput(BaseModel):
 
 
 class FirecrawlScrape(FirecrawlBaseTool):
-    """Tool for scraping web pages using Firecrawl.
+    """Tool for scraping web pages using Firecrawl with REPLACE behavior.
 
-    This tool uses Firecrawl's API to scrape web pages and
-
+    This tool uses Firecrawl's API to scrape web pages and REPLACES any existing
+    indexed content for the same URL instead of appending to it. This prevents
+    duplicate content when re-scraping the same page.
 
     Attributes:
         name: The name of the tool.
@@ -75,10 +76,10 @@ class FirecrawlScrape(FirecrawlBaseTool):
 
     name: str = "firecrawl_scrape"
     description: str = (
-        "Scrape a single web page and
+        "Scrape a single web page and REPLACE any existing indexed content for that URL. "
+        "Unlike regular scrape, this tool removes old content before adding new content, preventing duplicates. "
         "This tool can handle JavaScript-rendered content, PDFs, and dynamic websites. "
-        "
-        "Use this when you need to extract clean, structured content from a specific URL."
+        "Use this when you want to refresh/update content from a URL that was previously scraped."
     )
     args_schema: Type[BaseModel] = FirecrawlScrapeInput
 
@@ -187,7 +188,7 @@ class FirecrawlScrape(FirecrawlBaseTool):
             result_data = data.get("data", {})
 
             # Format the results based on requested formats
-            formatted_result = f"Successfully scraped: {url}\n\n"
+            formatted_result = f"Successfully scraped (REPLACE mode): {url}\n\n"
 
             if "markdown" in formats and result_data.get("markdown"):
                 formatted_result += "## Markdown Content\n"
@@ -236,13 +237,16 @@ class FirecrawlScrape(FirecrawlBaseTool):
                     formatted_result += f"Language: {metadata['language']}\n"
                 formatted_result += "\n"
 
-            # Index content if requested
+            # Index content if requested - REPLACE MODE
             if index_content and result_data.get("markdown"):
                 try:
-                    # Import indexing utilities
+                    # Import indexing utilities
+                    from langchain_community.vectorstores import FAISS
+
                     from intentkit.skills.firecrawl.utils import (
+                        FirecrawlDocumentProcessor,
                         FirecrawlMetadataManager,
-
+                        FirecrawlVectorStoreManager,
                     )
 
                     # Create document from scraped content
@@ -261,38 +265,149 @@ class FirecrawlScrape(FirecrawlBaseTool):
                     # Get agent ID for indexing
                     agent_id = context.agent_id
                     if agent_id:
-                        #
-
-                            [document],
-                            agent_id,
-                            self.skill_store,
-                            chunk_size,
-                            chunk_overlap,
-                        )
-
-                        # Update metadata
+                        # Initialize managers
+                        vs_manager = FirecrawlVectorStoreManager(self.skill_store)
                         metadata_manager = FirecrawlMetadataManager(
                             self.skill_store
                         )
-
-
+
+                        # Load existing vector store
+                        existing_vector_store = await vs_manager.load_vector_store(
+                            agent_id
                         )
-
-
+
+                        # Split the new document into chunks
+                        split_docs = FirecrawlDocumentProcessor.split_documents(
+                            [document], chunk_size, chunk_overlap
                         )
 
-
-
-
+                        # Create embeddings
+                        embeddings = vs_manager.create_embeddings()
+
+                        if existing_vector_store:
+                            # Get all existing documents and filter out those from the same URL
+                            try:
+                                # Try to access documents directly if available
+                                if hasattr(
+                                    existing_vector_store, "docstore"
+                                ) and hasattr(
+                                    existing_vector_store.docstore, "_dict"
+                                ):
+                                    # Access FAISS documents directly
+                                    all_docs = list(
+                                        existing_vector_store.docstore._dict.values()
+                                    )
+                                else:
+                                    # Fallback: use a reasonable k value for similarity search
+                                    # Use a dummy query to retrieve documents
+                                    all_docs = existing_vector_store.similarity_search(
+                                        "dummy",  # Use a dummy query instead of empty string
+                                        k=1000,  # Use reasonable upper bound
+                                    )
+
+                                # Filter out documents from the same URL
+                                preserved_docs = [
+                                    doc
+                                    for doc in all_docs
+                                    if doc.metadata.get("source") != url
+                                ]
+
+                                logger.info(
+                                    f"firecrawl_scrape: Preserving {len(preserved_docs)} docs from other URLs, "
+                                    f"replacing content from {url}"
+                                )
+
+                                # Create new vector store with preserved docs + new docs
+                                if preserved_docs:
+                                    # Combine preserved and new documents
+                                    all_documents = preserved_docs + split_docs
+                                    new_vector_store = FAISS.from_documents(
+                                        all_documents, embeddings
+                                    )
+                                    formatted_result += "\n## Content Replacement\n"
+                                    formatted_result += f"Replaced existing content for URL: {url}\n"
+                                    num_preserved_urls = len(
+                                        set(
+                                            doc.metadata.get("source", "")
+                                            for doc in preserved_docs
+                                        )
+                                    )
+                                    formatted_result += f"Preserved content from {num_preserved_urls} other URLs\n"
+                                else:
+                                    # No other documents to preserve, just create from new docs
+                                    new_vector_store = FAISS.from_documents(
+                                        split_docs, embeddings
+                                    )
+                                    formatted_result += "\n## Content Replacement\n"
+                                    formatted_result += f"Created new index with content from: {url}\n"
+                            except Exception as e:
+                                logger.warning(
+                                    f"Could not preserve other URLs, creating fresh index: {e}"
+                                )
+                                # Fallback: create new store with just the new documents
+                                new_vector_store = FAISS.from_documents(
+                                    split_docs, embeddings
+                                )
+                                formatted_result += "\n## Content Replacement\n"
+                                formatted_result += f"Created fresh index with content from: {url}\n"
+                        else:
+                            # No existing store, create new one
+                            new_vector_store = FAISS.from_documents(
+                                split_docs, embeddings
+                            )
+                            formatted_result += "\n## Content Indexing\n"
+                            formatted_result += (
+                                f"Created new index with content from: {url}\n"
+                            )
+
+                        # Save the new vector store
+                        await vs_manager.save_vector_store(
+                            agent_id, new_vector_store, chunk_size, chunk_overlap
                         )
-
+
+                        # Update metadata to track all URLs
+                        # Get existing metadata to preserve other URLs
+                        metadata_key = f"indexed_urls_{agent_id}"
+                        existing_metadata = (
+                            await self.skill_store.get_agent_skill_data(
+                                agent_id, "firecrawl", metadata_key
+                            )
+                        )
+
+                        if existing_metadata and existing_metadata.get("urls"):
+                            # Remove the current URL and add it back (to update timestamp)
+                            existing_urls = [
+                                u for u in existing_metadata["urls"] if u != url
+                            ]
+                            existing_urls.append(url)
+                            updated_metadata = {
+                                "urls": existing_urls,
+                                "document_count": len(existing_urls),
+                                "source_type": "firecrawl_mixed",
+                                "indexed_at": str(len(existing_urls)),
+                            }
+                        else:
+                            # Create new metadata
+                            updated_metadata = metadata_manager.create_url_metadata(
+                                [url], [document], "firecrawl_scrape"
+                            )
+
+                        await metadata_manager.update_metadata(
+                            agent_id, updated_metadata
+                        )
+
+                        formatted_result += "\n## Content Indexing (REPLACE MODE)\n"
+                        formatted_result += "Successfully REPLACED indexed content in vector store:\n"
+                        formatted_result += f"- Chunks created: {len(split_docs)}\n"
                         formatted_result += f"- Chunk size: {chunk_size}\n"
                         formatted_result += f"- Chunk overlap: {chunk_overlap}\n"
-                        formatted_result +=
+                        formatted_result += (
+                            "- Previous content for this URL: REPLACED\n"
+                        )
                         formatted_result += "Use the 'firecrawl_query_indexed_content' skill to search this content.\n"
 
                         logger.info(
-                            f"firecrawl_scrape: Successfully
+                            f"firecrawl_scrape: Successfully replaced content for {url} with {len(split_docs)} chunks"
                         )
                     else:
                         formatted_result += "\n## Content Indexing\n"
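The heart of this hunk is a replace-on-re-scrape pattern: read every chunk out of the existing FAISS store, drop those whose `source` metadata matches the URL being re-scraped, and rebuild the index from the survivors plus the new chunks. A minimal standalone sketch of that pattern (assuming LangChain's FAISS wrapper, as the diff does; `docstore._dict` is a private attribute, which is why the released code guards it with `hasattr` and falls back to a capped `similarity_search`):

```python
from langchain_community.vectorstores import FAISS


def replace_url_content(existing_store, new_chunks, url, embeddings):
    """Rebuild a FAISS index so old chunks for `url` are replaced by `new_chunks`."""
    if existing_store is None:
        # First scrape for this agent: nothing to replace, just index.
        return FAISS.from_documents(new_chunks, embeddings)

    if hasattr(existing_store, "docstore") and hasattr(existing_store.docstore, "_dict"):
        # Private FAISS internals: yields every stored Document regardless of count.
        all_docs = list(existing_store.docstore._dict.values())
    else:
        # Fallback taken by the diff: a dummy query with a fixed k. Note that this
        # silently caps recovery at k documents (k=1000 in the released code).
        all_docs = existing_store.similarity_search("dummy", k=1000)

    # Keep chunks from other URLs; drop the stale ones for this URL.
    preserved = [d for d in all_docs if d.metadata.get("source") != url]
    return FAISS.from_documents(preserved + new_chunks, embeddings)
```

Two trade-offs are visible in the released code: `FAISS.from_documents` re-embeds the preserved chunks along with the new ones, so each re-scrape pays embedding cost proportional to the whole store; and the `except` branch falls back to a fresh index, discarding other URLs' content when document recovery fails.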
{intentkit-0.6.19.dev2.dist-info → intentkit-0.6.20.dev1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: intentkit
-Version: 0.6.19.dev2
+Version: 0.6.20.dev1
 Summary: Intent-based AI Agent Platform - Core Package
 Project-URL: Homepage, https://github.com/crestal-network/intentkit
 Project-URL: Repository, https://github.com/crestal-network/intentkit
{intentkit-0.6.19.dev2.dist-info → intentkit-0.6.20.dev1.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-intentkit/__init__.py,sha256=
+intentkit/__init__.py,sha256=L5N8UBhOoj8vD0NB2G81lATWypQbespoMFUeoZRNITg,385
 intentkit/abstracts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 intentkit/abstracts/agent.py,sha256=108gb5W8Q1Sy4G55F2_ZFv2-_CnY76qrBtpIr0Oxxqk,1489
 intentkit/abstracts/api.py,sha256=ZUc24vaQvQVbbjznx7bV0lbbQxdQPfEV8ZxM2R6wZWo,166
@@ -198,15 +198,15 @@ intentkit/skills/enso/abi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 intentkit/skills/enso/abi/approval.py,sha256=IsyQLFxzAttocrtCB2PhbgprA7Vqujzpxvg0hJbeJ00,9867
 intentkit/skills/enso/abi/erc20.py,sha256=IScqZhHpMt_eFfYtMXw0-w5jptkAK0xsqqUDjbWdb2s,439
 intentkit/skills/enso/abi/route.py,sha256=ng9U2RSyS5R3d-b0m5ELa4rFpaUDO9HcgSoX9P_wWZo,4746
-intentkit/skills/firecrawl/README.md,sha256=
+intentkit/skills/firecrawl/README.md,sha256=OP5rCC5aNx9A4YjgotZB-JFdBR_0qHiWmYLuA52a8Tw,7366
 intentkit/skills/firecrawl/__init__.py,sha256=QQ0I5vlUgsLRFqHO17vbq-3ERKL3nzoo2B4MFGH0Igg,3160
 intentkit/skills/firecrawl/base.py,sha256=8BqD3X6RK0RedWU-qsa5qPMpuXWTZ6NbYLSpppFK_EU,1334
 intentkit/skills/firecrawl/clear.py,sha256=mfzQg8e6sbCwSzJGN_Lqfgxt-0pvtH_dBtNSJpMQA5A,2830
 intentkit/skills/firecrawl/crawl.py,sha256=lhySK1TbxGcLAXQi1zvrp4Zdo5ghhBFvxc4mFMl5LoI,18278
 intentkit/skills/firecrawl/firecrawl.png,sha256=6GoGlIMYuIDo-TqMlZbD4QYkmxvQ7krqAa5MANumJqk,5065
 intentkit/skills/firecrawl/query.py,sha256=LZzIy-LmqyEa8cZoBm-Eoen6GRy3NJxfuQcGi54Hwp0,4364
-intentkit/skills/firecrawl/schema.json,sha256=
-intentkit/skills/firecrawl/scrape.py,sha256=
+intentkit/skills/firecrawl/schema.json,sha256=q3ynbCO1NDidHZd3Nh7TNZ6lCv6y26XW7WBrYlj-JM0,4513
+intentkit/skills/firecrawl/scrape.py,sha256=2axmz5hZVnNGvTPTi0r0WAN4MoYNQZzOFtMZd5pRgcg,20704
 intentkit/skills/firecrawl/utils.py,sha256=Ot_vEg4Z30_BY3Xbh59gb_Tu17tSCmytRw49RGAzZ88,10093
 intentkit/skills/github/README.md,sha256=SzYGJ9qSPaZl68iD8AQJGKTMLv0keQZesnSK-VhrAfs,1802
 intentkit/skills/github/__init__.py,sha256=Vva9jMtACSM_cZXy5JY0h6Q1ejR1jm-Xu3Q6PwyB72o,1471
@@ -411,7 +411,7 @@ intentkit/utils/random.py,sha256=DymMxu9g0kuQLgJUqalvgksnIeLdS-v0aRk5nQU0mLI,452
 intentkit/utils/s3.py,sha256=9trQNkKQ5VgxWsewVsV8Y0q_pXzGRvsCYP8xauyUYkg,8549
 intentkit/utils/slack_alert.py,sha256=s7UpRgyzLW7Pbmt8cKzTJgMA9bm4EP-1rQ5KXayHu6E,2264
 intentkit/utils/tx.py,sha256=2yLLGuhvfBEY5n_GJ8wmIWLCzn0FsYKv5kRNzw_sLUI,1454
-intentkit-0.6.
-intentkit-0.6.
-intentkit-0.6.
-intentkit-0.6.
+intentkit-0.6.20.dev1.dist-info/METADATA,sha256=oGwdu4cAD3dMnV6di-S4CTtXCr8vJH37NZNXn3yRqEA,6414
+intentkit-0.6.20.dev1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+intentkit-0.6.20.dev1.dist-info/licenses/LICENSE,sha256=Bln6DhK-LtcO4aXy-PBcdZv2f24MlJFm_qn222biJtE,1071
+intentkit-0.6.20.dev1.dist-info/RECORD,,

{intentkit-0.6.19.dev2.dist-info → intentkit-0.6.20.dev1.dist-info}/WHEEL
File without changes

{intentkit-0.6.19.dev2.dist-info → intentkit-0.6.20.dev1.dist-info}/licenses/LICENSE
File without changes