spiderforce4ai-0.1.4-py3-none-any.whl → spiderforce4ai-0.1.6-py3-none-any.whl

--- a/spiderforce4ai/__init__.py
+++ b/spiderforce4ai/__init__.py
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py
 
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool
 
 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
 
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")
 
         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise
 
-        def _crawl_single(url: str) -> CrawlResult:
-            try:
-                endpoint = f"{self.base_url}/convert"
-                payload = {
-                    "url": url,
-                    **config.to_dict()
-                }
-
-                response = requests.post(endpoint, json=payload, timeout=config.timeout)
-                if response.status_code != 200:
-                    return CrawlResult(
-                        url=url,
-                        status="failed",
-                        error=f"HTTP {response.status_code}: {response.text}",
-                        config=config.to_dict()
-                    )
-
-                markdown = response.text
-
-                # Save markdown if output directory is configured
-                if config.output_dir:
-                    filepath = config.output_dir / f"{slugify(url)}.md"
-                    with open(filepath, 'w', encoding='utf-8') as f:
-                        f.write(markdown)
-
-                # Add delay if configured
-                if config.request_delay:
-                    time.sleep(config.request_delay)
-
-                return CrawlResult(
-                    url=url,
-                    status="success",
-                    markdown=markdown,
-                    config=config.to_dict()
-                )
-
-            except Exception as e:
-                return CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=str(e),
-                    config=config.to_dict()
-                )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]
 
         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))
 
-                for result in pool.imap_unordered(_crawl_single, urls):
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
@@ -405,21 +409,19 @@ class SpiderForce4AI:
 
         return results
 
-    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
-        """Save crawl report synchronously."""
-        report = {
-            "timestamp": datetime.now().isoformat(),
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self
 
-        with open(config.report_file, 'w', encoding='utf-8') as f:
-            json.dump(report, f, indent=2)
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
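
The main change to `__init__.py` above is that the per-URL worker moved out of `crawl_sitemap_parallel` (where it was the nested `_crawl_single` function) into the module-level `_process_url_parallel`, which now receives everything it needs as a `(url, base_url, config)` tuple. The likely motivation: `multiprocessing.Pool` pickles the task function and its arguments to ship them to worker processes, and a function defined inside a method cannot be pickled. Below is a minimal sketch of that pattern using only the standard library; the names `fetch_one` and `run_parallel` are illustrative and not part of the package.

```python
# Minimal sketch of the multiprocessing pattern adopted in 0.1.6: the worker must be
# a module-level function (a closure defined inside a method cannot be pickled by
# multiprocessing), and all per-call state travels through the argument tuple.
# `fetch_one` and the fake return value are illustrative only, not spiderforce4ai API.
from multiprocessing import Pool
from typing import List, Tuple


def fetch_one(args: Tuple[str, str]) -> str:
    """Module-level worker: picklable, receives everything it needs as arguments."""
    url, base_url = args
    # The real worker would POST to f"{base_url}/convert" here.
    return f"converted {url} via {base_url}"


def run_parallel(urls: List[str], base_url: str, workers: int = 4) -> List[str]:
    process_args = [(url, base_url) for url in urls]
    results = []
    with Pool(processes=workers) as pool:
        # imap_unordered yields results as workers finish, like the crawler above.
        for result in pool.imap_unordered(fetch_one, process_args):
            results.append(result)
    return results


if __name__ == "__main__":
    print(run_parallel(["https://example.com/a", "https://example.com/b"],
                       "http://localhost:3004"))
```
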
--- /dev/null
+++ b/spiderforce4ai-0.1.6.dist-info/METADATA
@@ -0,0 +1,278 @@
+Metadata-Version: 2.2
+Name: spiderforce4ai
+Version: 0.1.6
+Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+Home-page: https://petertam.pro
+Author: Piotr Tamulewicz
+Author-email: Piotr Tamulewicz <pt@petertam.pro>
+License: MIT
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.0
+Requires-Dist: asyncio>=3.4.3
+Requires-Dist: rich>=10.0.0
+Requires-Dist: aiofiles>=0.8.0
+Requires-Dist: httpx>=0.24.0
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+
+# SpiderForce4AI Python Wrapper
+
+A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
+
+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
+## Quick Start (Minimal Setup)
+
+```python
+from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+# Initialize with your SpiderForce4AI service URL
+spider = SpiderForce4AI("http://localhost:3004")
+
+# Use default configuration (will save in ./spiderforce_reports)
+config = CrawlConfig()
+
+# Crawl a single URL
+result = spider.crawl_url("https://example.com", config)
+```
+
+## Crawling Methods
+
+### 1. Single URL Crawling
+
+```python
+# Synchronous
+result = spider.crawl_url("https://example.com", config)
+
+# Asynchronous
+async def crawl():
+    result = await spider.crawl_url_async("https://example.com", config)
+```
+
+### 2. Multiple URLs Crawling
+
+```python
+# List of URLs
+urls = [
+    "https://example.com/page1",
+    "https://example.com/page2",
+    "https://example.com/page3"
+]
+
+# Synchronous
+results = spider.crawl_urls(urls, config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_urls_async(urls, config)
+
+# Parallel (using multiprocessing)
+results = spider.crawl_urls_parallel(urls, config)
+```
+
+### 3. Sitemap Crawling
+
+```python
+# Synchronous
+results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
+
+# Parallel (using multiprocessing)
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
+```
+
+## Configuration Options
+
+All configuration options are optional with sensible defaults:
+
+```python
+config = CrawlConfig(
+    # Content Selection (all optional)
+    target_selector="article",              # Specific element to target
+    remove_selectors=[                      # Elements to remove
+        ".ads",
+        "#popup",
+        ".navigation",
+        ".footer"
+    ],
+    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+    # Processing Settings
+    max_concurrent_requests=1,              # Default: 1 (parallel processing)
+    request_delay=0.5,                      # Delay between requests in seconds
+    timeout=30,                             # Request timeout in seconds
+
+    # Output Settings
+    output_dir="custom_output",             # Default: "spiderforce_reports"
+    report_file="custom_report.json",       # Default: "crawl_report.json"
+    webhook_url="https://your-webhook.com", # Optional webhook endpoint
+    webhook_timeout=10                      # Webhook timeout in seconds
+)
+```
+
+## Real-World Examples
+
+### 1. Basic Website Crawling
+
+```python
+from spiderforce4ai import SpiderForce4AI, CrawlConfig
+from pathlib import Path
+
+spider = SpiderForce4AI("http://localhost:3004")
+config = CrawlConfig(
+    output_dir=Path("blog_content")
+)
+
+result = spider.crawl_url("https://example.com/blog", config)
+print(f"Content saved to: {result.url}.md")
+```
+
+### 2. Advanced Parallel Sitemap Crawling
+
+```python
+config = CrawlConfig(
+    max_concurrent_requests=5,
+    output_dir=Path("website_content"),
+    remove_selectors=[
+        ".navigation",
+        ".footer",
+        ".ads",
+        "#cookie-notice"
+    ],
+    webhook_url="https://your-webhook.com/endpoint"
+)
+
+results = spider.crawl_sitemap_parallel(
+    "https://example.com/sitemap.xml",
+    config
+)
+```
+
+### 3. Async Crawling with Progress
+
+```python
+import asyncio
+
+async def main():
+    config = CrawlConfig(
+        max_concurrent_requests=3,
+        request_delay=1.0
+    )
+
+    async with spider:
+        results = await spider.crawl_urls_async([
+            "https://example.com/1",
+            "https://example.com/2",
+            "https://example.com/3"
+        ], config)
+
+    return results
+
+results = asyncio.run(main())
+```
+
+## Output Structure
+
+### 1. File Organization
+```
+output_dir/
+├── example-com-page1.md
+├── example-com-page2.md
+└── crawl_report.json
+```
+
+### 2. Markdown Files
+Each markdown file is named using a slugified version of the URL and contains the converted content.
+
+### 3. Report JSON Structure
+```json
+{
+  "timestamp": "2025-02-15T10:30:00.123456",
+  "config": {
+    "target_selector": "article",
+    "remove_selectors": [".ads", "#popup"],
+    "remove_selectors_regex": ["modal-\\d+"]
+  },
+  "results": {
+    "successful": [
+      {
+        "url": "https://example.com/page1",
+        "status": "success",
+        "markdown": "# Page Title\n\nContent...",
+        "timestamp": "2025-02-15T10:30:00.123456"
+      }
+    ],
+    "failed": [
+      {
+        "url": "https://example.com/page2",
+        "status": "failed",
+        "error": "HTTP 404: Not Found",
+        "timestamp": "2025-02-15T10:30:01.123456"
+      }
+    ]
+  },
+  "summary": {
+    "total": 2,
+    "successful": 1,
+    "failed": 1
+  }
+}
+```
+
+### 4. Webhook Notifications
+If configured, webhooks receive real-time updates in JSON format:
+```json
+{
+  "url": "https://example.com/page1",
+  "status": "success",
+  "markdown": "# Page Title\n\nContent...",
+  "timestamp": "2025-02-15T10:30:00.123456",
+  "config": {
+    "target_selector": "article",
+    "remove_selectors": [".ads", "#popup"]
+  }
+}
+```
+
+## Error Handling
+
+The package handles various types of errors:
+- Network errors
+- Timeout errors
+- Invalid URLs
+- Missing content
+- Service errors
+
+All errors are:
+1. Logged in the console
+2. Included in the JSON report
+3. Sent via webhook (if configured)
+4. Available in the results list
+
+## Requirements
+
+- Python 3.11 or later
+- Running SpiderForce4AI service
+- Internet connection
+
+## License
+
+MIT License
+
+## Credits
+
+Created by [Peter Tam](https://petertam.pro)
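
The Error Handling section of the new README notes that failures end up both in the returned results list and in the JSON report. The sketch below (not taken from the package docs) shows one way to consume both, assuming the default `output_dir` and `report_file` described in the README; the `CrawlResult` fields used (`url`, `status`, `error`) match those in `__init__.py`.

```python
# A short consumer-side sketch, assuming the README's default output settings
# ("spiderforce_reports" / "crawl_report.json"). Field names match the CrawlResult
# usage visible in the __init__.py diff above.
import json
from pathlib import Path

from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig()

results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

# Errors are available directly on each result...
failed = [r for r in results if r.status == "failed"]
for r in failed:
    print(f"{r.url}: {r.error}")

# ...and summarized in the JSON report written next to the markdown files.
report_path = Path("spiderforce_reports") / "crawl_report.json"
if report_path.exists():
    report = json.loads(report_path.read_text(encoding="utf-8"))
    print(report["summary"])
```
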
--- /dev/null
+++ b/spiderforce4ai-0.1.6.dist-info/RECORD
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
+spiderforce4ai-0.1.6.dist-info/METADATA,sha256=7rcL1OGqYeF1QHWUIB9xHaKYxGGegs2zHNz0UTu-ego,6575
+spiderforce4ai-0.1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.6.dist-info/RECORD,,
--- a/spiderforce4ai-0.1.4.dist-info/METADATA
+++ /dev/null
@@ -1,239 +0,0 @@
-Metadata-Version: 2.2
-Name: spiderforce4ai
-Version: 0.1.4
-Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
-Home-page: https://petertam.pro
-Author: Piotr Tamulewicz
-Author-email: Piotr Tamulewicz <pt@petertam.pro>
-License: MIT
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: asyncio>=3.4.3
-Requires-Dist: rich>=10.0.0
-Requires-Dist: aiofiles>=0.8.0
-Requires-Dist: httpx>=0.24.0
-Dynamic: author
-Dynamic: home-page
-Dynamic: requires-python
-
-# SpiderForce4AI Python Wrapper (Jina ai reader, fFrecrawl alternative)
-
-A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
-
-## Features
-
-- 🔄 Simple synchronous and asynchronous APIs
-- 📁 Automatic Markdown file saving with URL-based filenames
-- 📊 Real-time progress tracking in console
-- 🪝 Webhook support for real-time notifications
-- 📝 Detailed crawl reports in JSON format
-- ⚡ Concurrent crawling with rate limiting
-- 🔍 Support for sitemap.xml crawling
-- 🛡️ Comprehensive error handling
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
-
-## Quick Start
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-# Initialize the client
-spider = SpiderForce4AI("http://localhost:3004")
-
-# Use default configuration
-config = CrawlConfig()
-
-# Crawl a single URL
-result = spider.crawl_url("https://example.com", config)
-
-# Crawl multiple URLs
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2"
-]
-results = spider.crawl_urls(urls, config)
-
-# Crawl from sitemap
-results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
-```
-
-## Configuration
-
-The `CrawlConfig` class provides various configuration options. All parameters are optional with sensible defaults:
-
-```python
-config = CrawlConfig(
-    # Content Selection (all optional)
-    target_selector="article",              # Specific element to target
-    remove_selectors=[".ads", "#popup"],    # Elements to remove
-    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
-
-    # Processing Settings
-    max_concurrent_requests=1,              # Default: 1
-    request_delay=0.5,                      # Delay between requests in seconds
-    timeout=30,                             # Request timeout in seconds
-
-    # Output Settings
-    output_dir="spiderforce_reports",       # Default output directory
-    webhook_url="https://your-webhook.com", # Optional webhook endpoint
-    webhook_timeout=10,                     # Webhook timeout in seconds
-    report_file=None                        # Optional custom report location
-)
-```
-
-### Default Directory Structure
-
-```
-./
-└── spiderforce_reports/
-    ├── example-com-page1.md
-    ├── example-com-page2.md
-    └── crawl_report.json
-```
-
-## Webhook Notifications
-
-If `webhook_url` is configured, the crawler sends POST requests with the following JSON structure:
-
-```json
-{
-  "url": "https://example.com/page1",
-  "status": "success",
-  "markdown": "# Page Title\n\nContent...",
-  "timestamp": "2025-02-15T10:30:00.123456",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"],
-    "remove_selectors_regex": ["modal-\\d+"]
-  }
-}
-```
-
-## Crawl Report
-
-A comprehensive JSON report is automatically generated in the output directory:
-
-```json
-{
-  "timestamp": "2025-02-15T10:30:00.123456",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"],
-    "remove_selectors_regex": ["modal-\\d+"]
-  },
-  "results": {
-    "successful": [
-      {
-        "url": "https://example.com/page1",
-        "status": "success",
-        "markdown": "# Page Title\n\nContent...",
-        "timestamp": "2025-02-15T10:30:00.123456"
-      }
-    ],
-    "failed": [
-      {
-        "url": "https://example.com/page2",
-        "status": "failed",
-        "error": "HTTP 404: Not Found",
-        "timestamp": "2025-02-15T10:30:01.123456"
-      }
-    ]
-  },
-  "summary": {
-    "total": 2,
-    "successful": 1,
-    "failed": 1
-  }
-}
-```
-
-## Async Usage
-
-```python
-import asyncio
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-async def main():
-    config = CrawlConfig()
-    spider = SpiderForce4AI("http://localhost:3004")
-
-    async with spider:
-        results = await spider.crawl_urls_async(
-            ["https://example.com/page1", "https://example.com/page2"],
-            config
-        )
-
-    return results
-
-if __name__ == "__main__":
-    results = asyncio.run(main())
-```
-
-## Error Handling
-
-The crawler is designed to be resilient:
-- Continues processing even if some URLs fail
-- Records all errors in the crawl report
-- Sends error notifications via webhook if configured
-- Provides clear error messages in console output
-
-## Progress Tracking
-
-The crawler provides real-time progress tracking in the console:
-
-```
-🔄 Crawling URLs... [####################] 100%
-✓ Successful: 95
-✗ Failed: 5
-📊 Report saved to: ./spiderforce_reports/crawl_report.json
-```
-
-## Usage with AI Agents
-
-The package is designed to be easily integrated with AI agents and chat systems:
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-def fetch_content_for_ai(urls):
-    spider = SpiderForce4AI("http://localhost:3004")
-    config = CrawlConfig()
-
-    # Crawl content
-    results = spider.crawl_urls(urls, config)
-
-    # Return successful results
-    return {
-        result.url: result.markdown
-        for result in results
-        if result.status == "success"
-    }
-
-# Use with AI agent
-urls = ["https://example.com/article1", "https://example.com/article2"]
-content = fetch_content_for_ai(urls)
-```
-
-## Requirements
-
-- Python 3.11 or later
-- Docker (for running SpiderForce4AI service)
-
-## License
-
-MIT License
-
-## Credits
-
-Created by [Peter Tam](https://petertam.pro)
--- a/spiderforce4ai-0.1.4.dist-info/RECORD
+++ /dev/null
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
-spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
-spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.4.dist-info/RECORD,,