spiderforce4ai 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
- spiderforce4ai/__init__.py +89 -87
- spiderforce4ai-0.1.6.dist-info/METADATA +278 -0
- spiderforce4ai-0.1.6.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.4.dist-info/METADATA +0 -239
- spiderforce4ai-0.1.4.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py

 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool

 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload

+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""

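The hunk above moves the per-URL conversion logic into a module-level `_process_url_parallel` function: `multiprocessing.Pool` has to pickle the callable and its arguments before handing them to worker processes, and a top-level function taking a plain tuple pickles cleanly where a bound method or closure generally does not. A minimal, self-contained sketch of the same pattern (the names here are illustrative, not the package's API):

```python
from multiprocessing import Pool
from typing import Tuple

# Module-level worker: picklable, so Pool can ship it to child processes.
def _convert_one(args: Tuple[str, str]) -> str:
    url, base_url = args
    # Stand-in for the real HTTP call to f"{base_url}/convert".
    return f"converted {url} via {base_url}"

if __name__ == "__main__":
    base_url = "http://localhost:3004"
    jobs = [(f"https://example.com/page{i}", base_url) for i in range(3)]
    with Pool(processes=2) as pool:
        # imap_unordered yields each result as soon as its worker finishes,
        # which is how the crawler updates its progress bar per URL.
        for result in pool.imap_unordered(_convert_one, jobs):
            print(result)
```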
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")

+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
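The new `_save_report_sync` in the hunk above gives the multiprocessing code path a way to persist the crawl report without an event loop. A small sketch of how the resulting report file might be inspected afterwards, assuming the documented default location `spiderforce_reports/crawl_report.json`:

```python
import json
from pathlib import Path

# Default location per the package README; adjust if report_file was customized.
report_path = Path("spiderforce_reports") / "crawl_report.json"
report = json.loads(report_path.read_text(encoding="utf-8"))

summary = report["summary"]
print(f"Crawled {summary['total']} URLs: "
      f"{summary['successful']} succeeded, {summary['failed']} failed")

# Each failed entry carries the URL and the recorded error message.
for failure in report["results"]["failed"]:
    print(f"  ✗ {failure['url']}: {failure['error']}")
```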
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))

-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")

         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise

-
-
-            endpoint = f"{self.base_url}/convert"
-            payload = {
-                "url": url,
-                **config.to_dict()
-            }
-
-            response = requests.post(endpoint, json=payload, timeout=config.timeout)
-            if response.status_code != 200:
-                return CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=f"HTTP {response.status_code}: {response.text}",
-                    config=config.to_dict()
-                )
-
-            markdown = response.text
-
-            # Save markdown if output directory is configured
-            if config.output_dir:
-                filepath = config.output_dir / f"{slugify(url)}.md"
-                with open(filepath, 'w', encoding='utf-8') as f:
-                    f.write(markdown)
-
-            # Add delay if configured
-            if config.request_delay:
-                time.sleep(config.request_delay)
-
-            return CrawlResult(
-                url=url,
-                status="success",
-                markdown=markdown,
-                config=config.to_dict()
-            )
-
-        except Exception as e:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=str(e),
-                config=config.to_dict()
-            )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]

         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))

-                for result in pool.imap_unordered(
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
@@ -405,21 +409,19 @@ class SpiderForce4AI:

         return results

-    def
-        """
-
-
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self

-
-
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
spiderforce4ai-0.1.6.dist-info/METADATA
ADDED
@@ -0,0 +1,278 @@
+Metadata-Version: 2.2
+Name: spiderforce4ai
+Version: 0.1.6
+Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+Home-page: https://petertam.pro
+Author: Piotr Tamulewicz
+Author-email: Piotr Tamulewicz <pt@petertam.pro>
+License: MIT
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.0
+Requires-Dist: asyncio>=3.4.3
+Requires-Dist: rich>=10.0.0
+Requires-Dist: aiofiles>=0.8.0
+Requires-Dist: httpx>=0.24.0
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+
+# SpiderForce4AI Python Wrapper
+
+A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
+
+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
+## Quick Start (Minimal Setup)
+
+```python
+from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+# Initialize with your SpiderForce4AI service URL
+spider = SpiderForce4AI("http://localhost:3004")
+
+# Use default configuration (will save in ./spiderforce_reports)
+config = CrawlConfig()
+
+# Crawl a single URL
+result = spider.crawl_url("https://example.com", config)
+```
+
+## Crawling Methods
+
+### 1. Single URL Crawling
+
+```python
+# Synchronous
+result = spider.crawl_url("https://example.com", config)
+
+# Asynchronous
+async def crawl():
+    result = await spider.crawl_url_async("https://example.com", config)
+```
+
+### 2. Multiple URLs Crawling
+
+```python
+# List of URLs
+urls = [
+    "https://example.com/page1",
+    "https://example.com/page2",
+    "https://example.com/page3"
+]
+
+# Synchronous
+results = spider.crawl_urls(urls, config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_urls_async(urls, config)
+
+# Parallel (using multiprocessing)
+results = spider.crawl_urls_parallel(urls, config)
+```
+
+### 3. Sitemap Crawling
+
+```python
+# Synchronous
+results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
+
+# Parallel (using multiprocessing)
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
+```
+
+## Configuration Options
+
+All configuration options are optional with sensible defaults:
+
+```python
+config = CrawlConfig(
+    # Content Selection (all optional)
+    target_selector="article",  # Specific element to target
+    remove_selectors=[  # Elements to remove
+        ".ads",
+        "#popup",
+        ".navigation",
+        ".footer"
+    ],
+    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+    # Processing Settings
+    max_concurrent_requests=1,  # Default: 1 (parallel processing)
+    request_delay=0.5,  # Delay between requests in seconds
+    timeout=30,  # Request timeout in seconds
+
+    # Output Settings
+    output_dir="custom_output",  # Default: "spiderforce_reports"
+    report_file="custom_report.json",  # Default: "crawl_report.json"
+    webhook_url="https://your-webhook.com",  # Optional webhook endpoint
+    webhook_timeout=10  # Webhook timeout in seconds
+)
+```
+
+## Real-World Examples
+
+### 1. Basic Website Crawling
+
+```python
+from spiderforce4ai import SpiderForce4AI, CrawlConfig
+from pathlib import Path
+
+spider = SpiderForce4AI("http://localhost:3004")
+config = CrawlConfig(
+    output_dir=Path("blog_content")
+)
+
+result = spider.crawl_url("https://example.com/blog", config)
+print(f"Content saved to: {result.url}.md")
+```
+
+### 2. Advanced Parallel Sitemap Crawling
+
+```python
+config = CrawlConfig(
+    max_concurrent_requests=5,
+    output_dir=Path("website_content"),
+    remove_selectors=[
+        ".navigation",
+        ".footer",
+        ".ads",
+        "#cookie-notice"
+    ],
+    webhook_url="https://your-webhook.com/endpoint"
+)
+
+results = spider.crawl_sitemap_parallel(
+    "https://example.com/sitemap.xml",
+    config
+)
+```
+
+### 3. Async Crawling with Progress
+
+```python
+import asyncio
+
+async def main():
+    config = CrawlConfig(
+        max_concurrent_requests=3,
+        request_delay=1.0
+    )
+
+    async with spider:
+        results = await spider.crawl_urls_async([
+            "https://example.com/1",
+            "https://example.com/2",
+            "https://example.com/3"
+        ], config)
+
+    return results
+
+results = asyncio.run(main())
+```
+
+## Output Structure
+
+### 1. File Organization
+```
+output_dir/
+├── example-com-page1.md
+├── example-com-page2.md
+└── crawl_report.json
+```
+
+### 2. Markdown Files
+Each markdown file is named using a slugified version of the URL and contains the converted content.
+
+### 3. Report JSON Structure
+```json
+{
+  "timestamp": "2025-02-15T10:30:00.123456",
+  "config": {
+    "target_selector": "article",
+    "remove_selectors": [".ads", "#popup"],
+    "remove_selectors_regex": ["modal-\\d+"]
+  },
+  "results": {
+    "successful": [
+      {
+        "url": "https://example.com/page1",
+        "status": "success",
+        "markdown": "# Page Title\n\nContent...",
+        "timestamp": "2025-02-15T10:30:00.123456"
+      }
+    ],
+    "failed": [
+      {
+        "url": "https://example.com/page2",
+        "status": "failed",
+        "error": "HTTP 404: Not Found",
+        "timestamp": "2025-02-15T10:30:01.123456"
+      }
+    ]
+  },
+  "summary": {
+    "total": 2,
+    "successful": 1,
+    "failed": 1
+  }
+}
+```
+
+### 4. Webhook Notifications
+If configured, webhooks receive real-time updates in JSON format:
+```json
+{
+  "url": "https://example.com/page1",
+  "status": "success",
+  "markdown": "# Page Title\n\nContent...",
+  "timestamp": "2025-02-15T10:30:00.123456",
+  "config": {
+    "target_selector": "article",
+    "remove_selectors": [".ads", "#popup"]
+  }
+}
+```
+
+## Error Handling
+
+The package handles various types of errors:
+- Network errors
+- Timeout errors
+- Invalid URLs
+- Missing content
+- Service errors
+
+All errors are:
+1. Logged in the console
+2. Included in the JSON report
+3. Sent via webhook (if configured)
+4. Available in the results list
+
+## Requirements
+
+- Python 3.11 or later
+- Running SpiderForce4AI service
+- Internet connection
+
+## License
+
+MIT License
+
+## Credits
+
+Created by [Peter Tam](https://petertam.pro)
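The README added above says each output file is named with a slugified version of the URL (e.g. `example-com-page1.md`). A rough sketch of how such a slug could be derived; this is an illustration only, not the package's actual slugify implementation:

```python
import re
from urllib.parse import urlparse

def slugify_url(url: str) -> str:
    """Turn a URL into a filesystem-friendly slug (illustrative only)."""
    parsed = urlparse(url)
    raw = f"{parsed.netloc}{parsed.path}"
    # Collapse any run of non-alphanumeric characters into a single dash.
    slug = re.sub(r"[^a-zA-Z0-9]+", "-", raw).strip("-").lower()
    return slug or "index"

print(slugify_url("https://example.com/page1"))  # example-com-page1
```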
spiderforce4ai-0.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
+spiderforce4ai-0.1.6.dist-info/METADATA,sha256=7rcL1OGqYeF1QHWUIB9xHaKYxGGegs2zHNz0UTu-ego,6575
+spiderforce4ai-0.1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.6.dist-info/RECORD,,
spiderforce4ai-0.1.4.dist-info/METADATA
DELETED
@@ -1,239 +0,0 @@
-Metadata-Version: 2.2
-Name: spiderforce4ai
-Version: 0.1.4
-Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
-Home-page: https://petertam.pro
-Author: Piotr Tamulewicz
-Author-email: Piotr Tamulewicz <pt@petertam.pro>
-License: MIT
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: asyncio>=3.4.3
-Requires-Dist: rich>=10.0.0
-Requires-Dist: aiofiles>=0.8.0
-Requires-Dist: httpx>=0.24.0
-Dynamic: author
-Dynamic: home-page
-Dynamic: requires-python
-
-# SpiderForce4AI Python Wrapper (Jina ai reader, fFrecrawl alternative)
-
-A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
-
-## Features
-
-- 🔄 Simple synchronous and asynchronous APIs
-- 📁 Automatic Markdown file saving with URL-based filenames
-- 📊 Real-time progress tracking in console
-- 🪝 Webhook support for real-time notifications
-- 📝 Detailed crawl reports in JSON format
-- ⚡ Concurrent crawling with rate limiting
-- 🔍 Support for sitemap.xml crawling
-- 🛡️ Comprehensive error handling
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
-
-## Quick Start
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-# Initialize the client
-spider = SpiderForce4AI("http://localhost:3004")
-
-# Use default configuration
-config = CrawlConfig()
-
-# Crawl a single URL
-result = spider.crawl_url("https://example.com", config)
-
-# Crawl multiple URLs
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2"
-]
-results = spider.crawl_urls(urls, config)
-
-# Crawl from sitemap
-results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
-```
-
-## Configuration
-
-The `CrawlConfig` class provides various configuration options. All parameters are optional with sensible defaults:
-
-```python
-config = CrawlConfig(
-    # Content Selection (all optional)
-    target_selector="article",  # Specific element to target
-    remove_selectors=[".ads", "#popup"],  # Elements to remove
-    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
-
-    # Processing Settings
-    max_concurrent_requests=1,  # Default: 1
-    request_delay=0.5,  # Delay between requests in seconds
-    timeout=30,  # Request timeout in seconds
-
-    # Output Settings
-    output_dir="spiderforce_reports",  # Default output directory
-    webhook_url="https://your-webhook.com",  # Optional webhook endpoint
-    webhook_timeout=10,  # Webhook timeout in seconds
-    report_file=None  # Optional custom report location
-)
-```
-
-### Default Directory Structure
-
-```
-./
-└── spiderforce_reports/
-    ├── example-com-page1.md
-    ├── example-com-page2.md
-    └── crawl_report.json
-```
-
-## Webhook Notifications
-
-If `webhook_url` is configured, the crawler sends POST requests with the following JSON structure:
-
-```json
-{
-  "url": "https://example.com/page1",
-  "status": "success",
-  "markdown": "# Page Title\n\nContent...",
-  "timestamp": "2025-02-15T10:30:00.123456",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"],
-    "remove_selectors_regex": ["modal-\\d+"]
-  }
-}
-```
-
-## Crawl Report
-
-A comprehensive JSON report is automatically generated in the output directory:
-
-```json
-{
-  "timestamp": "2025-02-15T10:30:00.123456",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"],
-    "remove_selectors_regex": ["modal-\\d+"]
-  },
-  "results": {
-    "successful": [
-      {
-        "url": "https://example.com/page1",
-        "status": "success",
-        "markdown": "# Page Title\n\nContent...",
-        "timestamp": "2025-02-15T10:30:00.123456"
-      }
-    ],
-    "failed": [
-      {
-        "url": "https://example.com/page2",
-        "status": "failed",
-        "error": "HTTP 404: Not Found",
-        "timestamp": "2025-02-15T10:30:01.123456"
-      }
-    ]
-  },
-  "summary": {
-    "total": 2,
-    "successful": 1,
-    "failed": 1
-  }
-}
-```
-
-## Async Usage
-
-```python
-import asyncio
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-async def main():
-    config = CrawlConfig()
-    spider = SpiderForce4AI("http://localhost:3004")
-
-    async with spider:
-        results = await spider.crawl_urls_async(
-            ["https://example.com/page1", "https://example.com/page2"],
-            config
-        )
-
-    return results
-
-if __name__ == "__main__":
-    results = asyncio.run(main())
-```
-
-## Error Handling
-
-The crawler is designed to be resilient:
-- Continues processing even if some URLs fail
-- Records all errors in the crawl report
-- Sends error notifications via webhook if configured
-- Provides clear error messages in console output
-
-## Progress Tracking
-
-The crawler provides real-time progress tracking in the console:
-
-```
-🔄 Crawling URLs... [####################] 100%
-✓ Successful: 95
-✗ Failed: 5
-📊 Report saved to: ./spiderforce_reports/crawl_report.json
-```
-
-## Usage with AI Agents
-
-The package is designed to be easily integrated with AI agents and chat systems:
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-def fetch_content_for_ai(urls):
-    spider = SpiderForce4AI("http://localhost:3004")
-    config = CrawlConfig()
-
-    # Crawl content
-    results = spider.crawl_urls(urls, config)
-
-    # Return successful results
-    return {
-        result.url: result.markdown
-        for result in results
-        if result.status == "success"
-    }
-
-# Use with AI agent
-urls = ["https://example.com/article1", "https://example.com/article2"]
-content = fetch_content_for_ai(urls)
-```
-
-## Requirements
-
-- Python 3.11 or later
-- Docker (for running SpiderForce4AI service)
-
-## License
-
-MIT License
-
-## Credits
-
-Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-0.1.4.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
-spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
-spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.4.dist-info/RECORD,,
{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/top_level.txt
File without changes