spiderforce4ai 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff shows the changes between publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
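spiderforce4ai/__init__.py (modified)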
@@ -1,13 +1,10 @@
- """
- SpiderForce4AI Python Wrapper
- A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
- """
+ # spiderforce4ai/__init__.py
  
  import asyncio
  import aiohttp
  import json
  import logging
- from typing import List, Dict, Union, Optional
+ from typing import List, Dict, Union, Optional, Tuple
  from dataclasses import dataclass, asdict
  from urllib.parse import urljoin, urlparse
  from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
  from rich.console import Console
  import aiofiles
  import httpx
+ import requests
  from multiprocessing import Pool
  
  console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
              payload["remove_selectors_regex"] = self.remove_selectors_regex
          return payload
  
+ # Module level function for multiprocessing
+ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+     """Process a single URL for parallel processing."""
+     url, base_url, config = args
+     try:
+         endpoint = f"{base_url}/convert"
+         payload = {
+             "url": url,
+             **config.to_dict()
+         }
+ 
+         response = requests.post(endpoint, json=payload, timeout=config.timeout)
+         if response.status_code != 200:
+             return CrawlResult(
+                 url=url,
+                 status="failed",
+                 error=f"HTTP {response.status_code}: {response.text}",
+                 config=config.to_dict()
+             )
+ 
+         markdown = response.text
+ 
+         # Save markdown if output directory is configured
+         if config.output_dir:
+             filepath = config.output_dir / f"{slugify(url)}.md"
+             with open(filepath, 'w', encoding='utf-8') as f:
+                 f.write(markdown)
+ 
+         # Add delay if configured
+         if config.request_delay:
+             time.sleep(config.request_delay)
+ 
+         return CrawlResult(
+             url=url,
+             status="success",
+             markdown=markdown,
+             config=config.to_dict()
+         )
+ 
+     except Exception as e:
+         return CrawlResult(
+             url=url,
+             status="failed",
+             error=str(e),
+             config=config.to_dict()
+         )
+ 
  class SpiderForce4AI:
      """Main class for interacting with SpiderForce4AI service."""
  
@@ -140,6 +185,25 @@ class SpiderForce4AI:
          except Exception as e:
              console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
  
+     def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+         """Save crawl report synchronously."""
+         report = {
+             "timestamp": datetime.now().isoformat(),
+             "config": config.to_dict(),
+             "results": {
+                 "successful": [asdict(r) for r in results if r.status == "success"],
+                 "failed": [asdict(r) for r in results if r.status == "failed"]
+             },
+             "summary": {
+                 "total": len(results),
+                 "successful": len([r for r in results if r.status == "success"]),
+                 "failed": len([r for r in results if r.status == "failed"])
+             }
+         }
+ 
+         with open(config.report_file, 'w', encoding='utf-8') as f:
+             json.dump(report, f, indent=2)
+ 
      async def _save_report(self, config: CrawlConfig):
          """Save crawl report to JSON file."""
          if not config.report_file:
@@ -286,28 +350,8 @@ class SpiderForce4AI:
          """Synchronous version of crawl_sitemap_async."""
          return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
  
-     async def __aenter__(self):
-         """Async context manager entry."""
-         await self._ensure_session()
-         return self
- 
-     async def __aexit__(self, exc_type, exc_val, exc_tb):
-         """Async context manager exit."""
-         await self._close_session()
- 
-     def __enter__(self):
-         """Sync context manager entry."""
-         return self
- 
-     def __exit__(self, exc_type, exc_val, exc_tb):
-         """Sync context manager exit."""
-         self._executor.shutdown(wait=True)
- 
- 
      def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-         """
-         Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-         """
+         """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
          print(f"Fetching sitemap from {sitemap_url}...")
  
          # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
              print(f"Error parsing sitemap: {str(e)}")
              raise
  
-         def _crawl_single(url: str) -> CrawlResult:
-             try:
-                 endpoint = f"{self.base_url}/convert"
-                 payload = {
-                     "url": url,
-                     **config.to_dict()
-                 }
- 
-                 response = requests.post(endpoint, json=payload, timeout=config.timeout)
-                 if response.status_code != 200:
-                     return CrawlResult(
-                         url=url,
-                         status="failed",
-                         error=f"HTTP {response.status_code}: {response.text}",
-                         config=config.to_dict()
-                     )
- 
-                 markdown = response.text
- 
-                 # Save markdown if output directory is configured
-                 if config.output_dir:
-                     filepath = config.output_dir / f"{slugify(url)}.md"
-                     with open(filepath, 'w', encoding='utf-8') as f:
-                         f.write(markdown)
- 
-                 # Add delay if configured
-                 if config.request_delay:
-                     time.sleep(config.request_delay)
- 
-                 return CrawlResult(
-                     url=url,
-                     status="success",
-                     markdown=markdown,
-                     config=config.to_dict()
-                 )
- 
-             except Exception as e:
-                 return CrawlResult(
-                     url=url,
-                     status="failed",
-                     error=str(e),
-                     config=config.to_dict()
-                 )
+         # Prepare arguments for parallel processing
+         process_args = [(url, self.base_url, config) for url in urls]
  
          # Create process pool and execute crawls
          results = []
+ 
          with Pool(processes=config.max_concurrent_requests) as pool:
              with Progress(
                  SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
              ) as progress:
                  task = progress.add_task("Crawling URLs...", total=len(urls))
  
-                 for result in pool.imap_unordered(_crawl_single, urls):
+                 for result in pool.imap_unordered(_process_url_parallel, process_args):
                      results.append(result)
                      progress.update(task, advance=1)
                      status = "✓" if result.status == "success" else "✗"
@@ -405,21 +409,19 @@ class SpiderForce4AI:
  
          return results
  
-     def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
-         """Save crawl report synchronously."""
-         report = {
-             "timestamp": datetime.now().isoformat(),
-             "config": config.to_dict(),
-             "results": {
-                 "successful": [asdict(r) for r in results if r.status == "success"],
-                 "failed": [asdict(r) for r in results if r.status == "failed"]
-             },
-             "summary": {
-                 "total": len(results),
-                 "successful": len([r for r in results if r.status == "success"]),
-                 "failed": len([r for r in results if r.status == "failed"])
-             }
-         }
+     async def __aenter__(self):
+         """Async context manager entry."""
+         await self._ensure_session()
+         return self
  
-         with open(config.report_file, 'w', encoding='utf-8') as f:
-             json.dump(report, f, indent=2)
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit."""
+         await self._close_session()
+ 
+     def __enter__(self):
+         """Sync context manager entry."""
+         return self
+ 
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Sync context manager exit."""
+         self._executor.shutdown(wait=True)
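In 0.1.6 the per-URL worker used by `crawl_sitemap_parallel` is hoisted from the nested `_crawl_single` closure to the module-level `_process_url_parallel`, which takes everything it needs as an argument tuple. `multiprocessing.Pool` pickles the callable and its arguments before dispatching them to worker processes, and locally defined functions cannot be pickled, which is presumably what motivated the refactor. A minimal, self-contained sketch of the same pattern follows; the worker name `fetch_one` and the URLs are illustrative only, not code from the package, and the service URL is the example value used in the README.

```python
from multiprocessing import Pool
from typing import Tuple

# Module-level worker: picklable, so Pool can ship it to child processes.
# All state arrives through the argument tuple instead of an enclosing scope.
def fetch_one(args: Tuple[str, str]) -> str:
    url, base_url = args
    return f"would POST {base_url}/convert for {url}"  # stand-in for the real HTTP call

if __name__ == "__main__":
    base_url = "http://localhost:3004"  # example service URL from the README
    urls = ["https://example.com/page1", "https://example.com/page2"]
    work = [(url, base_url) for url in urls]  # bundle per-URL arguments, as the diff does
    with Pool(processes=2) as pool:
        # imap_unordered yields results as workers finish, matching the progress loop above
        for result in pool.imap_unordered(fetch_one, work):
            print(result)
```

The same constraint explains why the configuration object is passed inside each tuple rather than captured from the enclosing method.

spiderforce4ai-0.1.6.dist-info/METADATA (added)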
@@ -0,0 +1,278 @@
+ Metadata-Version: 2.2
+ Name: spiderforce4ai
+ Version: 0.1.6
+ Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+ Home-page: https://petertam.pro
+ Author: Piotr Tamulewicz
+ Author-email: Piotr Tamulewicz <pt@petertam.pro>
+ License: MIT
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.8.0
+ Requires-Dist: asyncio>=3.4.3
+ Requires-Dist: rich>=10.0.0
+ Requires-Dist: aiofiles>=0.8.0
+ Requires-Dist: httpx>=0.24.0
+ Dynamic: author
+ Dynamic: home-page
+ Dynamic: requires-python
+
+ # SpiderForce4AI Python Wrapper
+
+ A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
+
+ ## Installation
+
+ ```bash
+ pip install spiderforce4ai
+ ```
+
+ ## Quick Start (Minimal Setup)
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ # Initialize with your SpiderForce4AI service URL
+ spider = SpiderForce4AI("http://localhost:3004")
+
+ # Use default configuration (will save in ./spiderforce_reports)
+ config = CrawlConfig()
+
+ # Crawl a single URL
+ result = spider.crawl_url("https://example.com", config)
+ ```
+
+ ## Crawling Methods
+
+ ### 1. Single URL Crawling
+
+ ```python
+ # Synchronous
+ result = spider.crawl_url("https://example.com", config)
+
+ # Asynchronous
+ async def crawl():
+     result = await spider.crawl_url_async("https://example.com", config)
+ ```
+
+ ### 2. Multiple URLs Crawling
+
+ ```python
+ # List of URLs
+ urls = [
+     "https://example.com/page1",
+     "https://example.com/page2",
+     "https://example.com/page3"
+ ]
+
+ # Synchronous
+ results = spider.crawl_urls(urls, config)
+
+ # Asynchronous
+ async def crawl():
+     results = await spider.crawl_urls_async(urls, config)
+
+ # Parallel (using multiprocessing)
+ results = spider.crawl_urls_parallel(urls, config)
+ ```
+
+ ### 3. Sitemap Crawling
+
+ ```python
+ # Synchronous
+ results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
+
+ # Asynchronous
+ async def crawl():
+     results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
+
+ # Parallel (using multiprocessing)
+ results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
+ ```
+
+ ## Configuration Options
+
+ All configuration options are optional with sensible defaults:
+
+ ```python
+ config = CrawlConfig(
+     # Content Selection (all optional)
+     target_selector="article",              # Specific element to target
+     remove_selectors=[                      # Elements to remove
+         ".ads",
+         "#popup",
+         ".navigation",
+         ".footer"
+     ],
+     remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+     # Processing Settings
+     max_concurrent_requests=1,              # Default: 1 (parallel processing)
+     request_delay=0.5,                      # Delay between requests in seconds
+     timeout=30,                             # Request timeout in seconds
+
+     # Output Settings
+     output_dir="custom_output",             # Default: "spiderforce_reports"
+     report_file="custom_report.json",       # Default: "crawl_report.json"
+     webhook_url="https://your-webhook.com", # Optional webhook endpoint
+     webhook_timeout=10                      # Webhook timeout in seconds
+ )
+ ```
+
+ ## Real-World Examples
+
+ ### 1. Basic Website Crawling
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+ from pathlib import Path
+
+ spider = SpiderForce4AI("http://localhost:3004")
+ config = CrawlConfig(
+     output_dir=Path("blog_content")
+ )
+
+ result = spider.crawl_url("https://example.com/blog", config)
+ print(f"Content saved to: {result.url}.md")
+ ```
+
+ ### 2. Advanced Parallel Sitemap Crawling
+
+ ```python
+ config = CrawlConfig(
+     max_concurrent_requests=5,
+     output_dir=Path("website_content"),
+     remove_selectors=[
+         ".navigation",
+         ".footer",
+         ".ads",
+         "#cookie-notice"
+     ],
+     webhook_url="https://your-webhook.com/endpoint"
+ )
+
+ results = spider.crawl_sitemap_parallel(
+     "https://example.com/sitemap.xml",
+     config
+ )
+ ```
+
+ ### 3. Async Crawling with Progress
+
+ ```python
+ import asyncio
+
+ async def main():
+     config = CrawlConfig(
+         max_concurrent_requests=3,
+         request_delay=1.0
+     )
+
+     async with spider:
+         results = await spider.crawl_urls_async([
+             "https://example.com/1",
+             "https://example.com/2",
+             "https://example.com/3"
+         ], config)
+
+     return results
+
+ results = asyncio.run(main())
+ ```
+
+ ## Output Structure
+
+ ### 1. File Organization
+ ```
+ output_dir/
+ ├── example-com-page1.md
+ ├── example-com-page2.md
+ └── crawl_report.json
+ ```
+
+ ### 2. Markdown Files
+ Each markdown file is named using a slugified version of the URL and contains the converted content.
+
+ ### 3. Report JSON Structure
+ ```json
+ {
+   "timestamp": "2025-02-15T10:30:00.123456",
+   "config": {
+     "target_selector": "article",
+     "remove_selectors": [".ads", "#popup"],
+     "remove_selectors_regex": ["modal-\\d+"]
+   },
+   "results": {
+     "successful": [
+       {
+         "url": "https://example.com/page1",
+         "status": "success",
+         "markdown": "# Page Title\n\nContent...",
+         "timestamp": "2025-02-15T10:30:00.123456"
+       }
+     ],
+     "failed": [
+       {
+         "url": "https://example.com/page2",
+         "status": "failed",
+         "error": "HTTP 404: Not Found",
+         "timestamp": "2025-02-15T10:30:01.123456"
+       }
+     ]
+   },
+   "summary": {
+     "total": 2,
+     "successful": 1,
+     "failed": 1
+   }
+ }
+ ```
+
+ ### 4. Webhook Notifications
+ If configured, webhooks receive real-time updates in JSON format:
+ ```json
+ {
+   "url": "https://example.com/page1",
+   "status": "success",
+   "markdown": "# Page Title\n\nContent...",
+   "timestamp": "2025-02-15T10:30:00.123456",
+   "config": {
+     "target_selector": "article",
+     "remove_selectors": [".ads", "#popup"]
+   }
+ }
+ ```
+
+ ## Error Handling
+
+ The package handles various types of errors:
+ - Network errors
+ - Timeout errors
+ - Invalid URLs
+ - Missing content
+ - Service errors
+
+ All errors are:
+ 1. Logged in the console
+ 2. Included in the JSON report
+ 3. Sent via webhook (if configured)
+ 4. Available in the results list
+
+ ## Requirements
+
+ - Python 3.11 or later
+ - Running SpiderForce4AI service
+ - Internet connection
+
+ ## License
+
+ MIT License
+
+ ## Credits
+
+ Created by [Peter Tam](https://petertam.pro)
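spiderforce4ai-0.1.6.dist-info/RECORD (added)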
@@ -0,0 +1,5 @@
+ spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
+ spiderforce4ai-0.1.6.dist-info/METADATA,sha256=7rcL1OGqYeF1QHWUIB9xHaKYxGGegs2zHNz0UTu-ego,6575
+ spiderforce4ai-0.1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ spiderforce4ai-0.1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+ spiderforce4ai-0.1.6.dist-info/RECORD,,
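spiderforce4ai-0.1.4.dist-info/METADATA (removed)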
@@ -1,239 +0,0 @@
- Metadata-Version: 2.2
- Name: spiderforce4ai
- Version: 0.1.4
- Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
- Home-page: https://petertam.pro
- Author: Piotr Tamulewicz
- Author-email: Piotr Tamulewicz <pt@petertam.pro>
- License: MIT
- Classifier: Development Status :: 4 - Beta
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Requires-Python: >=3.11
- Description-Content-Type: text/markdown
- Requires-Dist: aiohttp>=3.8.0
- Requires-Dist: asyncio>=3.4.3
- Requires-Dist: rich>=10.0.0
- Requires-Dist: aiofiles>=0.8.0
- Requires-Dist: httpx>=0.24.0
- Dynamic: author
- Dynamic: home-page
- Dynamic: requires-python
-
- # SpiderForce4AI Python Wrapper (Jina ai reader, fFrecrawl alternative)
-
- A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
-
- ## Features
-
- - 🔄 Simple synchronous and asynchronous APIs
- - 📁 Automatic Markdown file saving with URL-based filenames
- - 📊 Real-time progress tracking in console
- - 🪝 Webhook support for real-time notifications
- - 📝 Detailed crawl reports in JSON format
- - ⚡ Concurrent crawling with rate limiting
- - 🔍 Support for sitemap.xml crawling
- - 🛡️ Comprehensive error handling
-
- ## Installation
-
- ```bash
- pip install spiderforce4ai
- ```
-
- ## Quick Start
-
- ```python
- from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
- # Initialize the client
- spider = SpiderForce4AI("http://localhost:3004")
-
- # Use default configuration
- config = CrawlConfig()
-
- # Crawl a single URL
- result = spider.crawl_url("https://example.com", config)
-
- # Crawl multiple URLs
- urls = [
-     "https://example.com/page1",
-     "https://example.com/page2"
- ]
- results = spider.crawl_urls(urls, config)
-
- # Crawl from sitemap
- results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
- ```
-
- ## Configuration
-
- The `CrawlConfig` class provides various configuration options. All parameters are optional with sensible defaults:
-
- ```python
- config = CrawlConfig(
-     # Content Selection (all optional)
-     target_selector="article",              # Specific element to target
-     remove_selectors=[".ads", "#popup"],    # Elements to remove
-     remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
-
-     # Processing Settings
-     max_concurrent_requests=1,              # Default: 1
-     request_delay=0.5,                      # Delay between requests in seconds
-     timeout=30,                             # Request timeout in seconds
-
-     # Output Settings
-     output_dir="spiderforce_reports",       # Default output directory
-     webhook_url="https://your-webhook.com", # Optional webhook endpoint
-     webhook_timeout=10,                     # Webhook timeout in seconds
-     report_file=None                        # Optional custom report location
- )
- ```
-
- ### Default Directory Structure
-
- ```
- ./
- └── spiderforce_reports/
-     ├── example-com-page1.md
-     ├── example-com-page2.md
-     └── crawl_report.json
- ```
-
- ## Webhook Notifications
-
- If `webhook_url` is configured, the crawler sends POST requests with the following JSON structure:
-
- ```json
- {
-   "url": "https://example.com/page1",
-   "status": "success",
-   "markdown": "# Page Title\n\nContent...",
-   "timestamp": "2025-02-15T10:30:00.123456",
-   "config": {
-     "target_selector": "article",
-     "remove_selectors": [".ads", "#popup"],
-     "remove_selectors_regex": ["modal-\\d+"]
-   }
- }
- ```
-
- ## Crawl Report
-
- A comprehensive JSON report is automatically generated in the output directory:
-
- ```json
- {
-   "timestamp": "2025-02-15T10:30:00.123456",
-   "config": {
-     "target_selector": "article",
-     "remove_selectors": [".ads", "#popup"],
-     "remove_selectors_regex": ["modal-\\d+"]
-   },
-   "results": {
-     "successful": [
-       {
-         "url": "https://example.com/page1",
-         "status": "success",
-         "markdown": "# Page Title\n\nContent...",
-         "timestamp": "2025-02-15T10:30:00.123456"
-       }
-     ],
-     "failed": [
-       {
-         "url": "https://example.com/page2",
-         "status": "failed",
-         "error": "HTTP 404: Not Found",
-         "timestamp": "2025-02-15T10:30:01.123456"
-       }
-     ]
-   },
-   "summary": {
-     "total": 2,
-     "successful": 1,
-     "failed": 1
-   }
- }
- ```
-
- ## Async Usage
-
- ```python
- import asyncio
- from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
- async def main():
-     config = CrawlConfig()
-     spider = SpiderForce4AI("http://localhost:3004")
-
-     async with spider:
-         results = await spider.crawl_urls_async(
-             ["https://example.com/page1", "https://example.com/page2"],
-             config
-         )
-
-     return results
-
- if __name__ == "__main__":
-     results = asyncio.run(main())
- ```
-
- ## Error Handling
-
- The crawler is designed to be resilient:
- - Continues processing even if some URLs fail
- - Records all errors in the crawl report
- - Sends error notifications via webhook if configured
- - Provides clear error messages in console output
-
- ## Progress Tracking
-
- The crawler provides real-time progress tracking in the console:
-
- ```
- 🔄 Crawling URLs... [####################] 100%
- ✓ Successful: 95
- ✗ Failed: 5
- 📊 Report saved to: ./spiderforce_reports/crawl_report.json
- ```
-
- ## Usage with AI Agents
-
- The package is designed to be easily integrated with AI agents and chat systems:
-
- ```python
- from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
- def fetch_content_for_ai(urls):
-     spider = SpiderForce4AI("http://localhost:3004")
-     config = CrawlConfig()
-
-     # Crawl content
-     results = spider.crawl_urls(urls, config)
-
-     # Return successful results
-     return {
-         result.url: result.markdown
-         for result in results
-         if result.status == "success"
-     }
-
- # Use with AI agent
- urls = ["https://example.com/article1", "https://example.com/article2"]
- content = fetch_content_for_ai(urls)
- ```
-
- ## Requirements
-
- - Python 3.11 or later
- - Docker (for running SpiderForce4AI service)
-
- ## License
-
- MIT License
-
- ## Credits
-
- Created by [Peter Tam](https://petertam.pro)
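spiderforce4ai-0.1.4.dist-info/RECORD (removed)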
@@ -1,5 +0,0 @@
- spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
- spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
- spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
- spiderforce4ai-0.1.4.dist-info/RECORD,,