spiderforce4ai-0.1.0-py3-none-any.whl

@@ -0,0 +1,303 @@
+ """
+ SpiderForce4AI Python Wrapper
+ A Python package for interacting with the SpiderForce4AI HTML-to-Markdown conversion service.
+ """
+
+ import asyncio
+ import aiohttp
+ import json
+ import logging
+ from typing import List, Dict, Union, Optional
+ from dataclasses import dataclass, asdict
+ from urllib.parse import urljoin, urlparse
+ from pathlib import Path
+ import time
+ import xml.etree.ElementTree as ET
+ from concurrent.futures import ThreadPoolExecutor
+ from datetime import datetime
+ import re
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
+ from rich.console import Console
+ import aiofiles
+ import httpx
+
+ console = Console()
+
+ def slugify(url: str) -> str:
+     """Convert URL to a valid filename."""
+     parsed = urlparse(url)
+     # Combine domain and path, remove scheme and special characters
+     slug = f"{parsed.netloc}{parsed.path}"
+     slug = re.sub(r'[^\w\-]', '_', slug)
+     slug = re.sub(r'_+', '_', slug)  # Replace multiple underscores with single
+     return slug.strip('_')
+
+ @dataclass
+ class CrawlResult:
+     """Store results of a crawl operation."""
+     url: str
+     status: str  # 'success' or 'failed'
+     markdown: Optional[str] = None
+     error: Optional[str] = None
+     timestamp: Optional[str] = None
+     config: Optional[Dict] = None
+
+     def __post_init__(self):
+         if not self.timestamp:
+             self.timestamp = datetime.now().isoformat()
+
+ @dataclass
+ class CrawlConfig:
+     """Configuration for crawling settings."""
+     target_selector: Optional[str] = None  # Optional - specific element to target
+     remove_selectors: Optional[List[str]] = None  # Optional - elements to remove
+     remove_selectors_regex: Optional[List[str]] = None  # Optional - regex patterns for removal
+     max_concurrent_requests: int = 1  # Default to one request at a time
+     request_delay: float = 0.5  # Delay between requests in seconds
+     timeout: int = 30  # Request timeout in seconds
+     output_dir: Path = Path("spiderforce_reports")  # Default to spiderforce_reports in the current directory
+     webhook_url: Optional[str] = None  # Optional webhook endpoint
+     webhook_timeout: int = 10  # Webhook timeout in seconds
+     report_file: Optional[Path] = None  # Optional report file location
+
+     def __post_init__(self):
+         # Initialize empty lists for selectors if None
+         self.remove_selectors = self.remove_selectors or []
+         self.remove_selectors_regex = self.remove_selectors_regex or []
+
+         # Ensure output_dir is a Path and exists
+         self.output_dir = Path(self.output_dir)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         # If report_file is not specified, create it in output_dir
+         if self.report_file is None:
+             self.report_file = self.output_dir / "crawl_report.json"
+         else:
+             self.report_file = Path(self.report_file)
+
+     def to_dict(self) -> Dict:
+         """Convert config to dictionary for API requests."""
+         payload = {}
+         # Only include selectors if they are set
+         if self.target_selector:
+             payload["target_selector"] = self.target_selector
+         if self.remove_selectors:
+             payload["remove_selectors"] = self.remove_selectors
+         if self.remove_selectors_regex:
+             payload["remove_selectors_regex"] = self.remove_selectors_regex
+         return payload
+
+ class SpiderForce4AI:
+     """Main class for interacting with SpiderForce4AI service."""
+
+     def __init__(self, base_url: str):
+         self.base_url = base_url.rstrip('/')
+         self.session = None
+         self._executor = ThreadPoolExecutor()
+         self.crawl_results: List[CrawlResult] = []
+
+     async def _ensure_session(self):
+         """Ensure aiohttp session exists."""
+         if self.session is None or self.session.closed:
+             self.session = aiohttp.ClientSession()
+
+     async def _close_session(self):
+         """Close aiohttp session."""
+         if self.session and not self.session.closed:
+             await self.session.close()
+
+     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
+         """Save markdown content to file."""
+         filename = f"{slugify(url)}.md"
+         filepath = output_dir / filename
+         async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+             await f.write(markdown)
+         return filepath
+
+     async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
+         """Send webhook with crawl results."""
+         if not config.webhook_url:
+             return
+
+         payload = {
+             "url": result.url,
+             "status": result.status,
+             "markdown": result.markdown if result.status == "success" else None,
+             "error": result.error if result.status == "failed" else None,
+             "timestamp": result.timestamp,
+             "config": config.to_dict()
+         }
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.post(
+                     config.webhook_url,
+                     json=payload,
+                     timeout=config.webhook_timeout
+                 )
+                 response.raise_for_status()
+         except Exception as e:
+             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+     async def _save_report(self, config: CrawlConfig):
+         """Save crawl report to JSON file."""
+         if not config.report_file:
+             return
+
+         report = {
+             "timestamp": datetime.now().isoformat(),
+             "config": config.to_dict(),
+             "results": {
+                 "successful": [asdict(r) for r in self.crawl_results if r.status == "success"],
+                 "failed": [asdict(r) for r in self.crawl_results if r.status == "failed"]
+             },
+             "summary": {
+                 "total": len(self.crawl_results),
+                 "successful": len([r for r in self.crawl_results if r.status == "success"]),
+                 "failed": len([r for r in self.crawl_results if r.status == "failed"])
+             }
+         }
+
+         async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+             await f.write(json.dumps(report, indent=2))
+
+     async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
+         """Crawl a single URL asynchronously."""
+         await self._ensure_session()
+
+         try:
+             endpoint = f"{self.base_url}/convert"
+             payload = {
+                 "url": url,
+                 **config.to_dict()
+             }
+
+             async with self.session.post(endpoint, json=payload, timeout=config.timeout) as response:
+                 if response.status != 200:
+                     error_text = await response.text()
+                     result = CrawlResult(
+                         url=url,
+                         status="failed",
+                         error=f"HTTP {response.status}: {error_text}",
+                         config=config.to_dict()
+                     )
+                 else:
+                     markdown = await response.text()
+                     result = CrawlResult(
+                         url=url,
+                         status="success",
+                         markdown=markdown,
+                         config=config.to_dict()
+                     )
+
+                     if config.output_dir:
+                         await self._save_markdown(url, markdown, config.output_dir)
+
+             await self._send_webhook(result, config)
+
+             self.crawl_results.append(result)
+             return result
+
+         except Exception as e:
+             result = CrawlResult(
+                 url=url,
+                 status="failed",
+                 error=str(e),
+                 config=config.to_dict()
+             )
+             self.crawl_results.append(result)
+             return result
+
+     def crawl_url(self, url: str, config: CrawlConfig) -> CrawlResult:
+         """Synchronous version of crawl_url_async."""
+         return asyncio.run(self.crawl_url_async(url, config))
+
+     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+         """Crawl multiple URLs asynchronously with progress bar."""
+         await self._ensure_session()
+
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[progress.description]{task.description}"),
+             BarColumn(),
+             TaskProgressColumn(),
+             console=console
+         ) as progress:
+             task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
+
+             async def crawl_with_progress(url):
+                 result = await self.crawl_url_async(url, config)
+                 progress.update(task, advance=1, description=f"[cyan]Crawled: {url}")
+                 return result
+
+             semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+             async def crawl_with_semaphore(url):
+                 async with semaphore:
+                     result = await crawl_with_progress(url)
+                     await asyncio.sleep(config.request_delay)
+                     return result
+
+             results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+         # Save final report
+         await self._save_report(config)
+
+         # Print summary
+         successful = len([r for r in results if r.status == "success"])
+         failed = len([r for r in results if r.status == "failed"])
+         console.print(f"\n[green]Crawling completed:[/green]")
+         console.print(f"✓ Successful: {successful}")
+         console.print(f"✗ Failed: {failed}")
+
+         if config.report_file:
+             console.print(f"📊 Report saved to: {config.report_file}")
+
+         return results
+
+     def crawl_urls(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+         """Synchronous version of crawl_urls_async."""
+         return asyncio.run(self.crawl_urls_async(urls, config))
+
+     async def crawl_sitemap_async(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+         """Crawl URLs from a sitemap asynchronously."""
+         await self._ensure_session()
+
+         try:
+             console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+             async with self.session.get(sitemap_url, timeout=config.timeout) as response:
+                 sitemap_text = await response.text()
+         except Exception as e:
+             console.print(f"[red]Error fetching sitemap: {str(e)}[/red]")
+             raise
+
+         try:
+             root = ET.fromstring(sitemap_text)
+             namespace = {'ns': root.tag.split('}')[0].strip('{')}
+             urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+             console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+         except Exception as e:
+             console.print(f"[red]Error parsing sitemap: {str(e)}[/red]")
+             raise
+
+         return await self.crawl_urls_async(urls, config)
+
+     def crawl_sitemap(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+         """Synchronous version of crawl_sitemap_async."""
+         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
+
+     async def __aenter__(self):
+         """Async context manager entry."""
+         await self._ensure_session()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit."""
+         await self._close_session()
+
+     def __enter__(self):
+         """Sync context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Sync context manager exit."""
+         self._executor.shutdown(wait=True)
@@ -0,0 +1,239 @@
+ Metadata-Version: 2.2
+ Name: spiderforce4ai
+ Version: 0.1.0
+ Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+ Home-page: https://petertam.pro
+ Author: Piotr Tamulewicz
+ Author-email: Piotr Tamulewicz <pt@petertam.pro>
+ License: MIT
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.8.0
+ Requires-Dist: asyncio>=3.4.3
+ Requires-Dist: rich>=10.0.0
+ Requires-Dist: aiofiles>=0.8.0
+ Requires-Dist: httpx>=0.24.0
+ Dynamic: author
+ Dynamic: home-page
+ Dynamic: requires-python
+
+ # SpiderForce4AI Python Wrapper
+
+ A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
+
+ ## Features
+
+ - 🔄 Simple synchronous and asynchronous APIs
+ - 📁 Automatic Markdown file saving with URL-based filenames
+ - 📊 Real-time progress tracking in console
+ - 🪝 Webhook support for real-time notifications
+ - 📝 Detailed crawl reports in JSON format
+ - ⚡ Concurrent crawling with rate limiting
+ - 🔍 Support for sitemap.xml crawling
+ - 🛡️ Comprehensive error handling
+
+ ## Installation
+
+ ```bash
+ pip install spiderforce4ai
+ ```
+
+ ## Quick Start
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ # Initialize the client
+ spider = SpiderForce4AI("http://localhost:3004")
+
+ # Use default configuration
+ config = CrawlConfig()
+
+ # Crawl a single URL
+ result = spider.crawl_url("https://example.com", config)
+
+ # Crawl multiple URLs
+ urls = [
+     "https://example.com/page1",
+     "https://example.com/page2"
+ ]
+ results = spider.crawl_urls(urls, config)
+
+ # Crawl from sitemap
+ results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
+ ```
+
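+ Each call returns `CrawlResult` objects (a single result for `crawl_url`, a list for `crawl_urls` and `crawl_sitemap`), so the converted Markdown and the crawl status are available directly. A short sketch using only the fields defined on `CrawlResult`:
+
+ ```python
+ result = spider.crawl_url("https://example.com", config)
+
+ if result.status == "success":
+     print(result.markdown[:200])  # the Markdown is also saved under config.output_dir
+ else:
+     print(f"Conversion failed: {result.error}")
+ ```
+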
+ ## Configuration
+
+ The `CrawlConfig` class provides various configuration options. All parameters are optional with sensible defaults:
+
+ ```python
+ config = CrawlConfig(
+     # Content Selection (all optional)
+     target_selector="article",              # Specific element to target
+     remove_selectors=[".ads", "#popup"],    # Elements to remove
+     remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+     # Processing Settings
+     max_concurrent_requests=1,              # Default: 1
+     request_delay=0.5,                      # Delay between requests in seconds
+     timeout=30,                             # Request timeout in seconds
+
+     # Output Settings
+     output_dir="spiderforce_reports",       # Default output directory
+     webhook_url="https://your-webhook.com", # Optional webhook endpoint
+     webhook_timeout=10,                     # Webhook timeout in seconds
+     report_file=None                        # Optional custom report location
+ )
+ ```
+
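+ Note that only the content-selection options are forwarded to the conversion service; concurrency, delays, and output settings are applied on the client side. This mirrors `CrawlConfig.to_dict()` in the package source:
+
+ ```python
+ config = CrawlConfig(target_selector="article", request_delay=1.0)
+ print(config.to_dict())  # {'target_selector': 'article'} - only selector options are sent
+ ```
+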
+ ### Default Directory Structure
+
+ ```
+ ./
+ └── spiderforce_reports/
+     ├── example_com_page1.md
+     ├── example_com_page2.md
+     └── crawl_report.json
+ ```
+
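+ Output filenames are derived from the crawled URL: the scheme is dropped and every character other than letters, digits, underscores, and hyphens is replaced with an underscore (see `slugify()` in the package source). If you need to predict a filename, the helper can be imported directly:
+
+ ```python
+ from spiderforce4ai import slugify
+
+ print(slugify("https://example.com/page1"))  # example_com_page1 -> saved as example_com_page1.md
+ ```
+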
+ ## Webhook Notifications
+
+ If `webhook_url` is configured, the crawler sends a POST request for every crawled URL with the following JSON structure:
+
+ ```json
+ {
+   "url": "https://example.com/page1",
+   "status": "success",
+   "markdown": "# Page Title\n\nContent...",
+   "error": null,
+   "timestamp": "2025-02-15T10:30:00.123456",
+   "config": {
+     "target_selector": "article",
+     "remove_selectors": [".ads", "#popup"],
+     "remove_selectors_regex": ["modal-\\d+"]
+   }
+ }
+ ```
+
+ ## Crawl Report
+
+ A comprehensive JSON report is automatically generated in the output directory:
+
+ ```json
+ {
+   "timestamp": "2025-02-15T10:30:00.123456",
+   "config": {
+     "target_selector": "article",
+     "remove_selectors": [".ads", "#popup"],
+     "remove_selectors_regex": ["modal-\\d+"]
+   },
+   "results": {
+     "successful": [
+       {
+         "url": "https://example.com/page1",
+         "status": "success",
+         "markdown": "# Page Title\n\nContent...",
+         "timestamp": "2025-02-15T10:30:00.123456"
+       }
+     ],
+     "failed": [
+       {
+         "url": "https://example.com/page2",
+         "status": "failed",
+         "error": "HTTP 404: Not Found",
+         "timestamp": "2025-02-15T10:30:01.123456"
+       }
+     ]
+   },
+   "summary": {
+     "total": 2,
+     "successful": 1,
+     "failed": 1
+   }
+ }
+ ```
+
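+ Because the report is plain JSON, it can be post-processed with the standard library; for example (the path assumes the default configuration):
+
+ ```python
+ import json
+ from pathlib import Path
+
+ report = json.loads(Path("spiderforce_reports/crawl_report.json").read_text(encoding="utf-8"))
+ print(report["summary"])  # e.g. {'total': 2, 'successful': 1, 'failed': 1}
+ for failure in report["results"]["failed"]:
+     print(failure["url"], "->", failure["error"])
+ ```
+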
+ ## Async Usage
+
+ ```python
+ import asyncio
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ async def main():
+     config = CrawlConfig()
+     spider = SpiderForce4AI("http://localhost:3004")
+
+     async with spider:
+         results = await spider.crawl_urls_async(
+             ["https://example.com/page1", "https://example.com/page2"],
+             config
+         )
+
+     return results
+
+ if __name__ == "__main__":
+     results = asyncio.run(main())
+ ```
+
+ ## Error Handling
+
+ The crawler is designed to be resilient:
+ - Continues processing even if some URLs fail
+ - Records all errors in the crawl report and in the returned results (see the example below)
+ - Sends error notifications via webhook if configured
+ - Provides clear error messages in console output
+
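+ For example, failed URLs can be separated from successful ones after a crawl, using only the fields defined on `CrawlResult`:
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ spider = SpiderForce4AI("http://localhost:3004")
+ results = spider.crawl_urls(
+     ["https://example.com/page1", "https://example.com/missing-page"],  # illustrative URLs
+     CrawlConfig()
+ )
+
+ failures = [r for r in results if r.status == "failed"]
+ for r in failures:
+     print(f"{r.url}: {r.error}")  # e.g. "HTTP 404: Not Found"
+ ```
+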
+ ## Progress Tracking
+
+ The crawler provides real-time progress tracking in the console:
+
+ ```
+ 🔄 Crawling URLs... [####################] 100%
+ ✓ Successful: 95
+ ✗ Failed: 5
+ 📊 Report saved to: ./spiderforce_reports/crawl_report.json
+ ```
+
+ ## Usage with AI Agents
+
+ The package is designed to be easily integrated with AI agents and chat systems:
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ def fetch_content_for_ai(urls):
+     spider = SpiderForce4AI("http://localhost:3004")
+     config = CrawlConfig()
+
+     # Crawl content
+     results = spider.crawl_urls(urls, config)
+
+     # Return successful results
+     return {
+         result.url: result.markdown
+         for result in results
+         if result.status == "success"
+     }
+
+ # Use with AI agent
+ urls = ["https://example.com/article1", "https://example.com/article2"]
+ content = fetch_content_for_ai(urls)
+ ```
+
+ ## Requirements
+
+ - Python 3.11 or later
+ - Docker (for running the SpiderForce4AI service)
+
+ ## License
+
+ MIT License
+
+ ## Credits
+
+ Created by [Peter Tam](https://petertam.pro)
@@ -0,0 +1,5 @@
+ spiderforce4ai/__init__.py,sha256=TTUtXHp4QvFLhh4vgh0bCvYAyJEAZ-8xguoBNVcQUZI,11815
+ spiderforce4ai-0.1.0.dist-info/METADATA,sha256=X2Y8tb-sgJ_8fnilV9yHA_qM3xE1OQmTZPtXohT2nsg,6174
+ spiderforce4ai-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ spiderforce4ai-0.1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+ spiderforce4ai-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (75.8.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1 @@
+ spiderforce4ai