intentkit 0.6.0.dev8__py3-none-any.whl → 0.6.0.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of intentkit has been flagged as potentially problematic.

intentkit/__init__.py CHANGED
@@ -3,7 +3,7 @@
 A powerful platform for building AI agents with blockchain and cryptocurrency capabilities.
 """
 
-__version__ = "0.6.0-dev.8"
+__version__ = "0.6.0-dev.9"
 __author__ = "hyacinthus"
 __email__ = "hyacinthus@gmail.com"
 

intentkit/skills/web_scraper/scrape_and_index.py CHANGED
@@ -23,7 +23,7 @@ class ScrapeAndIndexInput(BaseModel):
     urls: List[str] = Field(
         description="List of URLs to scrape and index. Each URL should be a valid web address starting with http:// or https://",
         min_items=1,
-        max_items=10,
+        max_items=25,
     )
     chunk_size: int = Field(
         description="Size of text chunks for indexing (default: 1000)",

intentkit/skills/web_scraper/utils.py CHANGED
@@ -29,6 +29,31 @@ DEFAULT_REQUESTS_PER_SECOND = 2
 MAX_CONTENT_SIZE_MB = 10  # 10 MB limit
 MAX_CONTENT_SIZE_BYTES = MAX_CONTENT_SIZE_MB * 1024 * 1024
 
+# HTTP Headers to bypass Cloudflare and other bot protection
+DEFAULT_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.9",
+    "Accept-Encoding": "gzip, deflate, br",
+    "DNT": "1",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+    "Sec-Fetch-Dest": "document",
+    "Sec-Fetch-Mode": "navigate",
+    "Sec-Fetch-Site": "none",
+    "Sec-Fetch-User": "?1",
+    "Cache-Control": "max-age=0",
+}
+
+# Alternative headers for fallback when primary headers fail
+FALLBACK_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "Accept": "*/*",
+    "Accept-Language": "en-US,en;q=0.5",
+    "Accept-Encoding": "gzip, deflate",
+    "Connection": "keep-alive",
+}
+
 # Storage keys
 VECTOR_STORE_KEY_PREFIX = "vector_store"
 METADATA_KEY_PREFIX = "indexed_urls"
@@ -523,20 +548,38 @@ async def scrape_and_index_urls(
         try:
             logger.info(f"[{agent_id}] Processing URL {i + 1}/{len(valid_urls)}: {url}")
 
-            # Load single URL
+            # Load single URL with enhanced headers
             loader = WebBaseLoader(
                 web_paths=[url],
                 requests_per_second=requests_per_second,
             )
 
-            # Configure loader
+            # Configure loader with enhanced headers to bypass bot protection
             loader.requests_kwargs = {
                 "verify": True,
                 "timeout": DEFAULT_REQUEST_TIMEOUT,
+                "headers": DEFAULT_HEADERS,
             }
 
-            # Scrape the URL
-            documents = await asyncio.to_thread(loader.load)
+            # Scrape the URL with retry logic
+            documents = None
+            try:
+                documents = await asyncio.to_thread(loader.load)
+            except Exception as primary_error:
+                # If primary headers fail, try fallback headers
+                logger.warning(
+                    f"[{agent_id}] Primary headers failed for {url}, trying fallback: {primary_error}"
+                )
+
+                loader.requests_kwargs["headers"] = FALLBACK_HEADERS
+                try:
+                    documents = await asyncio.to_thread(loader.load)
+                    logger.info(f"[{agent_id}] Fallback headers succeeded for {url}")
+                except Exception as fallback_error:
+                    logger.error(
+                        f"[{agent_id}] Both header sets failed for {url}: {fallback_error}"
+                    )
+                    raise fallback_error
 
             if not documents:
                 logger.warning(f"[{agent_id}] No content extracted from {url}")

intentkit/skills/web_scraper/website_indexer.py CHANGED
@@ -81,7 +81,22 @@ class WebsiteIndexer(WebScraperBaseTool):
         """Fetch robots.txt content."""
         robots_url = urljoin(base_url, "/robots.txt")
 
-        async with httpx.AsyncClient(timeout=30) as client:
+        # Import headers from utils
+        from intentkit.skills.web_scraper.utils import DEFAULT_HEADERS, FALLBACK_HEADERS
+
+        # Try with primary headers first
+        async with httpx.AsyncClient(timeout=30, headers=DEFAULT_HEADERS) as client:
+            try:
+                response = await client.get(robots_url)
+                if response.status_code == 200:
+                    return response.text
+            except Exception as e:
+                logger.warning(
+                    f"Primary headers failed for robots.txt from {robots_url}: {e}"
+                )
+
+        # Try with fallback headers
+        async with httpx.AsyncClient(timeout=30, headers=FALLBACK_HEADERS) as client:
             try:
                 response = await client.get(robots_url)
                 if response.status_code == 200:
@@ -119,7 +134,22 @@ class WebsiteIndexer(WebScraperBaseTool):
 
     async def _fetch_sitemap_content(self, sitemap_url: str) -> str:
         """Fetch sitemap XML content."""
-        async with httpx.AsyncClient(timeout=30) as client:
+        # Import headers from utils
+        from intentkit.skills.web_scraper.utils import DEFAULT_HEADERS, FALLBACK_HEADERS
+
+        # Try with primary headers first
+        async with httpx.AsyncClient(timeout=30, headers=DEFAULT_HEADERS) as client:
+            try:
+                response = await client.get(sitemap_url)
+                if response.status_code == 200:
+                    return response.text
+            except Exception as e:
+                logger.warning(
+                    f"Primary headers failed for sitemap from {sitemap_url}: {e}"
+                )
+
+        # Try with fallback headers
+        async with httpx.AsyncClient(timeout=30, headers=FALLBACK_HEADERS) as client:
             try:
                 response = await client.get(sitemap_url)
                 if response.status_code == 200:
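
The robots.txt and sitemap fetchers above repeat the same two-client pattern with httpx. A minimal standalone sketch of that pattern follows; it likewise assumes the header constants from intentkit/skills/web_scraper/utils.py, and fetch_text_with_fallback is a hypothetical helper name.

```python
# Minimal sketch of the primary/fallback header fetch used above; assumes the
# header constants are importable from utils. Not package API.
import httpx

from intentkit.skills.web_scraper.utils import DEFAULT_HEADERS, FALLBACK_HEADERS


async def fetch_text_with_fallback(url: str, timeout: int = 30) -> str | None:
    for headers in (DEFAULT_HEADERS, FALLBACK_HEADERS):
        try:
            async with httpx.AsyncClient(timeout=timeout, headers=headers) as client:
                response = await client.get(url)
                if response.status_code == 200:
                    return response.text
        except Exception:
            # Swallow the error and try the next header set, mirroring the
            # warn-and-continue behaviour in the hunks above
            continue
    return None
```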

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: intentkit
-Version: 0.6.0.dev8
+Version: 0.6.0.dev9
 Summary: Intent-based AI Agent Platform - Core Package
 Project-URL: Homepage, https://github.com/crestal-network/intentkit
 Project-URL: Repository, https://github.com/crestal-network/intentkit

@@ -1,4 +1,4 @@
-intentkit/__init__.py,sha256=pGwXI_gV0Ck_IbiHjG9Pu7X2pcEipglnkuAd64veXhk,384
+intentkit/__init__.py,sha256=36psS49qk6ZnE5Y1kAewHaeiYnMBG5a6ukVF9pPQmt8,384
 intentkit/abstracts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 intentkit/abstracts/agent.py,sha256=108gb5W8Q1Sy4G55F2_ZFv2-_CnY76qrBtpIr0Oxxqk,1489
 intentkit/abstracts/api.py,sha256=ZUc24vaQvQVbbjznx7bV0lbbQxdQPfEV8ZxM2R6wZWo,166
@@ -350,9 +350,9 @@ intentkit/skills/web_scraper/base.py,sha256=R3P1H8PaJ-xk3wpEAWGtDvPVxdRpCXeKoLGk
 intentkit/skills/web_scraper/document_indexer.py,sha256=sZyz2WoXPc3g3SNMq4Fxe_U2q4sc9jcVwn9ZHUoVTtY,4860
 intentkit/skills/web_scraper/langchain.png,sha256=G_FHqaY0wmbJlqvcK_ndDcRfw42MDyGcdxwN7v4CsGE,9204
 intentkit/skills/web_scraper/schema.json,sha256=gSXihGJh8uG7b2Xbj8jZUYKP-_uMckB2n3Jx8NpZuZU,4514
-intentkit/skills/web_scraper/scrape_and_index.py,sha256=zaePi0EiHXkZx2d7AxFsZSQL3QfoUDizRRXfkbg9-O8,9903
-intentkit/skills/web_scraper/utils.py,sha256=pf5JtMu_ZdFc0XQxHSnyPcpwyLFjno88Mcv415yFnn4,21409
-intentkit/skills/web_scraper/website_indexer.py,sha256=T2Vi6VywICeqKP4qDiYCGeVdIsA8SNSX8pHTv85YwxI,16771
+intentkit/skills/web_scraper/scrape_and_index.py,sha256=j8hi9xEedNt8aAhlqN8HL75hjranCIkgPjoVHg-cfEs,9903
+intentkit/skills/web_scraper/utils.py,sha256=feGBTMWqpkoY7RFy2xDHVs5y8c2h8-XZ111jRo6cC3k,23349
+intentkit/skills/web_scraper/website_indexer.py,sha256=UmpOJ3BOJidzubOXtFbCtbEmoDTSNg_5VCVQ6l25Xis,18062
 intentkit/utils/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 intentkit/utils/chain.py,sha256=3GBHuAbXxQr_HlOvkbB2kruYSkweucfxI5u-swXzY40,15135
 intentkit/utils/error.py,sha256=JxnzDdKjwZX6Pa-bt_qaibcrWAeF6QAoHsu2uYPmqu4,4483
@@ -362,7 +362,7 @@ intentkit/utils/random.py,sha256=DymMxu9g0kuQLgJUqalvgksnIeLdS-v0aRk5nQU0mLI,452
 intentkit/utils/s3.py,sha256=9trQNkKQ5VgxWsewVsV8Y0q_pXzGRvsCYP8xauyUYkg,8549
 intentkit/utils/slack_alert.py,sha256=s7UpRgyzLW7Pbmt8cKzTJgMA9bm4EP-1rQ5KXayHu6E,2264
 intentkit/utils/tx.py,sha256=2yLLGuhvfBEY5n_GJ8wmIWLCzn0FsYKv5kRNzw_sLUI,1454
-intentkit-0.6.0.dev8.dist-info/METADATA,sha256=iHC3s23ay4ipDf_BjcHaR7kH60EmZqajtR3ruG2kDZk,7285
-intentkit-0.6.0.dev8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-intentkit-0.6.0.dev8.dist-info/licenses/LICENSE,sha256=Bln6DhK-LtcO4aXy-PBcdZv2f24MlJFm_qn222biJtE,1071
-intentkit-0.6.0.dev8.dist-info/RECORD,,
+intentkit-0.6.0.dev9.dist-info/METADATA,sha256=5xhzxy_Yw4pFLa4NgEnMLs6-DY5N8hyOind6QCLOfX4,7285
+intentkit-0.6.0.dev9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+intentkit-0.6.0.dev9.dist-info/licenses/LICENSE,sha256=Bln6DhK-LtcO4aXy-PBcdZv2f24MlJFm_qn222biJtE,1071
+intentkit-0.6.0.dev9.dist-info/RECORD,,