spiderforce4ai 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +89 -87
- spiderforce4ai-0.1.6.dist-info/METADATA +278 -0
- spiderforce4ai-0.1.6.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.4.dist-info/METADATA +0 -239
- spiderforce4ai-0.1.4.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
```diff
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py
 
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool
 
 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
```
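For context on the pattern used above: `multiprocessing.Pool` pickles the callable and its arguments before shipping them to worker processes, so a top-level function that takes a plain tuple serializes cleanly, whereas dispatching a method would also drag the whole client object (and any open sessions or executors it holds) through the pickler. A minimal, self-contained sketch of the same dispatch pattern; the URLs, port, and `fake_convert` name are placeholders for illustration, not code from the package:

```python
from multiprocessing import Pool
from typing import Tuple


def fake_convert(args: Tuple[str, str]) -> str:
    """Top-level worker: picklable, so Pool can send it to child processes."""
    url, base_url = args
    # A real worker would POST to f"{base_url}/convert"; here we only build the endpoint string.
    return f"{base_url}/convert <- {url}"


if __name__ == "__main__":
    jobs = [(u, "http://localhost:3004")
            for u in ("https://example.com/a", "https://example.com/b")]
    with Pool(processes=2) as pool:
        # imap_unordered yields results as workers finish, like the crawler's progress loop.
        for line in pool.imap_unordered(fake_convert, jobs):
            print(line)
```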
```diff
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
 
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")
 
         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise
 
-
-
-            endpoint = f"{self.base_url}/convert"
-            payload = {
-                "url": url,
-                **config.to_dict()
-            }
-
-            response = requests.post(endpoint, json=payload, timeout=config.timeout)
-            if response.status_code != 200:
-                return CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=f"HTTP {response.status_code}: {response.text}",
-                    config=config.to_dict()
-                )
-
-            markdown = response.text
-
-            # Save markdown if output directory is configured
-            if config.output_dir:
-                filepath = config.output_dir / f"{slugify(url)}.md"
-                with open(filepath, 'w', encoding='utf-8') as f:
-                    f.write(markdown)
-
-            # Add delay if configured
-            if config.request_delay:
-                time.sleep(config.request_delay)
-
-            return CrawlResult(
-                url=url,
-                status="success",
-                markdown=markdown,
-                config=config.to_dict()
-            )
-
-        except Exception as e:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=str(e),
-                config=config.to_dict()
-            )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]
 
         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))
 
-                for result in pool.imap_unordered(
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
@@ -405,21 +409,19 @@ class SpiderForce4AI:
 
         return results
 
-    def
-        """
-
-
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self
 
-
-
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
```
spiderforce4ai-0.1.6.dist-info/METADATA
ADDED

```
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 0.1.6
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python
```

# SpiderForce4AI Python Wrapper

A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start (Minimal Setup)

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize with your SpiderForce4AI service URL
spider = SpiderForce4AI("http://localhost:3004")

# Use default configuration (will save in ./spiderforce_reports)
config = CrawlConfig()

# Crawl a single URL
result = spider.crawl_url("https://example.com", config)
```

## Crawling Methods

### 1. Single URL Crawling

```python
# Synchronous
result = spider.crawl_url("https://example.com", config)

# Asynchronous
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs Crawling

```python
# List of URLs
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

# Synchronous
results = spider.crawl_urls(urls, config)

# Asynchronous
async def crawl():
    results = await spider.crawl_urls_async(urls, config)

# Parallel (using multiprocessing)
results = spider.crawl_urls_parallel(urls, config)
```

### 3. Sitemap Crawling

```python
# Synchronous
results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)

# Asynchronous
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)

# Parallel (using multiprocessing)
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
```

## Configuration Options

All configuration options are optional with sensible defaults:

```python
config = CrawlConfig(
    # Content Selection (all optional)
    target_selector="article",              # Specific element to target
    remove_selectors=[                      # Elements to remove
        ".ads",
        "#popup",
        ".navigation",
        ".footer"
    ],
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing Settings
    max_concurrent_requests=1,              # Default: 1 (parallel processing)
    request_delay=0.5,                      # Delay between requests in seconds
    timeout=30,                             # Request timeout in seconds

    # Output Settings
    output_dir="custom_output",             # Default: "spiderforce_reports"
    report_file="custom_report.json",       # Default: "crawl_report.json"
    webhook_url="https://your-webhook.com", # Optional webhook endpoint
    webhook_timeout=10                      # Webhook timeout in seconds
)
```

## Real-World Examples

### 1. Basic Website Crawling

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(
    output_dir=Path("blog_content")
)

result = spider.crawl_url("https://example.com/blog", config)
print(f"Content saved to: {result.url}.md")
```

### 2. Advanced Parallel Sitemap Crawling

```python
config = CrawlConfig(
    max_concurrent_requests=5,
    output_dir=Path("website_content"),
    remove_selectors=[
        ".navigation",
        ".footer",
        ".ads",
        "#cookie-notice"
    ],
    webhook_url="https://your-webhook.com/endpoint"
)

results = spider.crawl_sitemap_parallel(
    "https://example.com/sitemap.xml",
    config
)
```

### 3. Async Crawling with Progress

```python
import asyncio

async def main():
    config = CrawlConfig(
        max_concurrent_requests=3,
        request_delay=1.0
    )

    async with spider:
        results = await spider.crawl_urls_async([
            "https://example.com/1",
            "https://example.com/2",
            "https://example.com/3"
        ], config)

    return results

results = asyncio.run(main())
```

## Output Structure

### 1. File Organization
```
output_dir/
├── example-com-page1.md
├── example-com-page2.md
└── crawl_report.json
```

### 2. Markdown Files
Each markdown file is named using a slugified version of the URL and contains the converted content.
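The package's `slugify` helper itself is not shown in this diff; for illustration only, a rough equivalent that maps a URL to a filesystem-safe name could look like the following (an assumption, not the published implementation):

```python
import re

def slugify(url: str) -> str:
    """Reduce a URL to a filesystem-safe slug, e.g. 'https://example.com/page1' -> 'example-com-page1'."""
    slug = re.sub(r"^https?://", "", url.lower())  # drop the scheme
    slug = re.sub(r"[^a-z0-9]+", "-", slug)        # collapse everything else to hyphens
    return slug.strip("-") or "index"

print(slugify("https://example.com/page1"))  # example-com-page1
```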
### 3. Report JSON Structure
```json
{
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"],
    "remove_selectors_regex": ["modal-\\d+"]
  },
  "results": {
    "successful": [
      {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page Title\n\nContent...",
        "timestamp": "2025-02-15T10:30:00.123456"
      }
    ],
    "failed": [
      {
        "url": "https://example.com/page2",
        "status": "failed",
        "error": "HTTP 404: Not Found",
        "timestamp": "2025-02-15T10:30:01.123456"
      }
    ]
  },
  "summary": {
    "total": 2,
    "successful": 1,
    "failed": 1
  }
}
```

### 4. Webhook Notifications
If configured, webhooks receive real-time updates in JSON format:
```json
{
  "url": "https://example.com/page1",
  "status": "success",
  "markdown": "# Page Title\n\nContent...",
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  }
}
```
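To watch these notifications during local testing, any HTTP endpoint that accepts JSON will do; a minimal stand-in receiver might look like this (the port and printed fields are illustrative assumptions):

```python
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class WebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Read the JSON payload the crawler POSTs and echo a short summary.
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        print(f"{payload.get('status', '?'):7} {payload.get('url', '')}")
        self.send_response(200)
        self.end_headers()

if __name__ == "__main__":
    # Point webhook_url at http://localhost:9999 in CrawlConfig to see notifications here.
    HTTPServer(("localhost", 9999), WebhookHandler).serve_forever()
```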
## Error Handling

The package handles various types of errors:
- Network errors
- Timeout errors
- Invalid URLs
- Missing content
- Service errors

All errors are:
1. Logged in the console
2. Included in the JSON report
3. Sent via webhook (if configured)
4. Available in the results list
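Since failed URLs still come back as regular results (status `"failed"` plus an error message), they can be filtered and, if desired, retried straight from the returned list; a brief sketch under those assumptions (the retry pass is illustrative, not a built-in feature):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig()
urls = ["https://example.com/page1", "https://example.com/page2"]

results = spider.crawl_urls(urls, config)

# Failures are ordinary results carrying status "failed" and an error message.
for r in results:
    if r.status == "failed":
        print(f"✗ {r.url}: {r.error}")

# Retrying just the failed URLs is a matter of filtering and calling crawl_urls again.
failed_urls = [r.url for r in results if r.status == "failed"]
if failed_urls:
    retry_results = spider.crawl_urls(failed_urls, CrawlConfig(request_delay=2.0, timeout=60))
```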
## Requirements

- Python 3.11 or later
- Running SpiderForce4AI service
- Internet connection

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-0.1.6.dist-info/RECORD
ADDED

```
spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
spiderforce4ai-0.1.6.dist-info/METADATA,sha256=7rcL1OGqYeF1QHWUIB9xHaKYxGGegs2zHNz0UTu-ego,6575
spiderforce4ai-0.1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-0.1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-0.1.6.dist-info/RECORD,,
```
spiderforce4ai-0.1.4.dist-info/METADATA
REMOVED

```
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 0.1.4
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python
```

# SpiderForce4AI Python Wrapper (Jina ai reader, fFrecrawl alternative)

A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.

## Features

- 🔄 Simple synchronous and asynchronous APIs
- 📁 Automatic Markdown file saving with URL-based filenames
- 📊 Real-time progress tracking in console
- 🪝 Webhook support for real-time notifications
- 📝 Detailed crawl reports in JSON format
- ⚡ Concurrent crawling with rate limiting
- 🔍 Support for sitemap.xml crawling
- 🛡️ Comprehensive error handling

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize the client
spider = SpiderForce4AI("http://localhost:3004")

# Use default configuration
config = CrawlConfig()

# Crawl a single URL
result = spider.crawl_url("https://example.com", config)

# Crawl multiple URLs
urls = [
    "https://example.com/page1",
    "https://example.com/page2"
]
results = spider.crawl_urls(urls, config)

# Crawl from sitemap
results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
```

## Configuration

The `CrawlConfig` class provides various configuration options. All parameters are optional with sensible defaults:

```python
config = CrawlConfig(
    # Content Selection (all optional)
    target_selector="article",              # Specific element to target
    remove_selectors=[".ads", "#popup"],    # Elements to remove
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing Settings
    max_concurrent_requests=1,              # Default: 1
    request_delay=0.5,                      # Delay between requests in seconds
    timeout=30,                             # Request timeout in seconds

    # Output Settings
    output_dir="spiderforce_reports",       # Default output directory
    webhook_url="https://your-webhook.com", # Optional webhook endpoint
    webhook_timeout=10,                     # Webhook timeout in seconds
    report_file=None                        # Optional custom report location
)
```

### Default Directory Structure

```
./
└── spiderforce_reports/
    ├── example-com-page1.md
    ├── example-com-page2.md
    └── crawl_report.json
```

## Webhook Notifications

If `webhook_url` is configured, the crawler sends POST requests with the following JSON structure:

```json
{
  "url": "https://example.com/page1",
  "status": "success",
  "markdown": "# Page Title\n\nContent...",
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"],
    "remove_selectors_regex": ["modal-\\d+"]
  }
}
```

## Crawl Report

A comprehensive JSON report is automatically generated in the output directory:

```json
{
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"],
    "remove_selectors_regex": ["modal-\\d+"]
  },
  "results": {
    "successful": [
      {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page Title\n\nContent...",
        "timestamp": "2025-02-15T10:30:00.123456"
      }
    ],
    "failed": [
      {
        "url": "https://example.com/page2",
        "status": "failed",
        "error": "HTTP 404: Not Found",
        "timestamp": "2025-02-15T10:30:01.123456"
      }
    ]
  },
  "summary": {
    "total": 2,
    "successful": 1,
    "failed": 1
  }
}
```

## Async Usage

```python
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig

async def main():
    config = CrawlConfig()
    spider = SpiderForce4AI("http://localhost:3004")

    async with spider:
        results = await spider.crawl_urls_async(
            ["https://example.com/page1", "https://example.com/page2"],
            config
        )

    return results

if __name__ == "__main__":
    results = asyncio.run(main())
```

## Error Handling

The crawler is designed to be resilient:
- Continues processing even if some URLs fail
- Records all errors in the crawl report
- Sends error notifications via webhook if configured
- Provides clear error messages in console output

## Progress Tracking

The crawler provides real-time progress tracking in the console:

```
🔄 Crawling URLs... [####################] 100%
✓ Successful: 95
✗ Failed: 5
📊 Report saved to: ./spiderforce_reports/crawl_report.json
```

## Usage with AI Agents

The package is designed to be easily integrated with AI agents and chat systems:

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

def fetch_content_for_ai(urls):
    spider = SpiderForce4AI("http://localhost:3004")
    config = CrawlConfig()

    # Crawl content
    results = spider.crawl_urls(urls, config)

    # Return successful results
    return {
        result.url: result.markdown
        for result in results
        if result.status == "success"
    }

# Use with AI agent
urls = ["https://example.com/article1", "https://example.com/article2"]
content = fetch_content_for_ai(urls)
```

## Requirements

- Python 3.11 or later
- Docker (for running SpiderForce4AI service)

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-0.1.4.dist-info/RECORD
REMOVED

```
spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-0.1.4.dist-info/RECORD,,
```
{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/top_level.txt
File without changes