spiderforce4ai 2.6.7__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai-2.6.8.dist-info/METADATA +789 -0
- spiderforce4ai-2.6.8.dist-info/RECORD +7 -0
- {spiderforce4ai-2.6.7.dist-info → spiderforce4ai-2.6.8.dist-info}/WHEEL +1 -1
- spiderforce4ai-2.6.7.dist-info/METADATA +0 -336
- spiderforce4ai-2.6.7.dist-info/RECORD +0 -7
- {spiderforce4ai-2.6.7.dist-info → spiderforce4ai-2.6.8.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.6.7.dist-info → spiderforce4ai-2.6.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,789 @@
Metadata-Version: 2.1
Name: spiderforce4ai
Version: 2.6.8
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
Project-URL: Homepage, https://petertam.pro
Project-URL: Documentation, https://petertam.pro/docs/spiderforce4ai
Project-URL: Repository, https://github.com/yourusername/spiderforce4ai
Project-URL: Bug Tracker, https://github.com/yourusername/spiderforce4ai/issues
Keywords: web-scraping,markdown,html-to-markdown,llm,ai,content-extraction,async,parallel-processing
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Classifier: Topic :: Text Processing :: Markup :: Markdown
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Requires-Dist: litellm>=1.26.0
Requires-Dist: pydantic>=2.6.0
Requires-Dist: requests>=2.31.0
Requires-Dist: aiofiles>=23.2.1
Requires-Dist: et-xmlfile>=1.1.0
Requires-Dist: multidict>=6.0.4
Requires-Dist: openai>=1.12.0
Requires-Dist: pandas>=2.2.0
Requires-Dist: numpy>=1.26.0
Requires-Dist: yarl>=1.9.4
Requires-Dist: typing-extensions>=4.9.0
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.1; extra == "dev"
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
Requires-Dist: black>=23.7.0; extra == "dev"
Requires-Dist: isort>=5.12.0; extra == "dev"
Requires-Dist: mypy>=1.4.1; extra == "dev"
Requires-Dist: ruff>=0.1.8; extra == "dev"
Requires-Dist: pre-commit>=3.5.0; extra == "dev"
Provides-Extra: docs
Requires-Dist: sphinx>=7.1.0; extra == "docs"
Requires-Dist: sphinx-rtd-theme>=1.3.0; extra == "docs"
Requires-Dist: myst-parser>=2.0.0; extra == "docs"
Provides-Extra: test
Requires-Dist: pytest>=7.4.0; extra == "test"
Requires-Dist: pytest-asyncio>=0.21.1; extra == "test"
Requires-Dist: pytest-cov>=4.1.0; extra == "test"
Requires-Dist: pytest-mock>=3.12.0; extra == "test"
Requires-Dist: coverage>=7.4.0; extra == "test"

# SpiderForce4AI Python Wrapper

A comprehensive Python package for web content crawling, HTML-to-Markdown conversion, and AI-powered post-processing with real-time webhook notifications. Built for seamless integration with the SpiderForce4AI service.

## Prerequisites

**Important:** To use this wrapper, you must have the SpiderForce4AI service running. For full installation and deployment instructions, visit:
[https://github.com/petertamai/SpiderForce4AI](https://github.com/petertamai/SpiderForce4AI)

## Features

- HTML to Markdown conversion
- Advanced content extraction with custom selectors
- Parallel and async crawling support
- Sitemap processing
- Automatic retry mechanism
- Real-time webhook notifications for each processed URL
- AI-powered post-extraction processing
- Post-extraction webhook integration
- Detailed progress tracking
- Customizable reporting

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start with Webhooks

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

# Initialize crawler with your SpiderForce4AI service URL (required)
spider = SpiderForce4AI("http://localhost:3004")

# Configure crawling options with webhook support
config = CrawlConfig(
    target_selector="article",                 # Optional: target element to extract
    remove_selectors=[".ads", ".navigation"],  # Optional: elements to remove
    max_concurrent_requests=5,                 # Optional: default is 1
    save_reports=True,                         # Optional: default is False

    # Webhook configuration for real-time notifications
    webhook_url="https://webhook.site/your-unique-webhook-id",  # Required for webhooks
    webhook_headers={  # Optional custom headers
        "Authorization": "Bearer your-token",
        "Content-Type": "application/json"
    }
)

# Crawl a sitemap with webhook notifications
results = spider.crawl_sitemap_server_parallel("https://petertam.pro/sitemap.xml", config)
```

## Core Components

### Configuration Options

The `CrawlConfig` class accepts the following parameters (a combined example follows the lists below):

#### Mandatory Parameters for Webhook Integration
- `webhook_url`: (str) Webhook endpoint URL (required only if you want webhook notifications)

#### Optional Parameters

**Content Selection:**
- `target_selector`: (str) CSS selector for the main content to extract
- `remove_selectors`: (List[str]) List of CSS selectors to remove from content
- `remove_selectors_regex`: (List[str]) List of regex patterns for element removal

**Processing:**
- `max_concurrent_requests`: (int) Number of parallel requests (default: 1)
- `request_delay`: (float) Delay between requests in seconds (default: 0.5)
- `timeout`: (int) Request timeout in seconds (default: 30)

**Output:**
- `output_dir`: (Path) Directory to save output files (default: "spiderforce_reports")
- `save_reports`: (bool) Whether to save crawl reports (default: False)
- `report_file`: (Path) Custom report file location (generated if None)
- `combine_to_one_markdown`: (str) `'full'` or `'metadata_headers'` (useful for SEO) to combine crawled content into one file
- `combined_markdown_file`: (Path) Custom combined file location (generated if None)

**Webhook:**
- `webhook_url`: (str) Webhook endpoint URL
- `webhook_timeout`: (int) Webhook timeout in seconds (default: 10)
- `webhook_headers`: (Dict[str, str]) Custom webhook headers
- `webhook_payload_template`: (str) Custom webhook payload template

**Post-Extraction Processing:**
- `post_extraction_agent`: (Dict) Configuration for LLM post-processing
- `post_extraction_agent_save_to_file`: (str) Path to save extraction results
- `post_agent_transformer_function`: (Callable) Custom transformer function for webhook payloads

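For orientation, here is a minimal sketch that pulls several of the options above into one configuration. The values (service URL, selectors, webhook endpoint) are placeholders, and only parameters documented in the lists above are used:

```python
from pathlib import Path

from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Illustrative values only -- each parameter is described in the lists above.
config = CrawlConfig(
    # Content selection
    target_selector="article",
    remove_selectors=[".ads", ".navigation"],

    # Processing
    max_concurrent_requests=5,
    request_delay=0.5,
    timeout=30,

    # Output
    output_dir=Path("spiderforce_reports"),
    save_reports=True,
    combine_to_one_markdown="full",

    # Webhook
    webhook_url="https://webhook.site/your-unique-webhook-id",
    webhook_timeout=10,
)

spider = SpiderForce4AI("http://localhost:3004")
```
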
## When Are Webhooks Triggered?

SpiderForce4AI provides webhook notifications at multiple points in the crawling process:

1. **URL Processing Completion**: Triggered after each URL is processed (whether successful or failed)
2. **Post-Extraction Completion**: Triggered after LLM post-processing of each URL (if configured)
3. **Custom Transformation**: You can implement your own webhook logic in the transformer function

The webhook payload contains detailed information about the processed URL (a minimal receiver sketch follows the list below), including:
- The URL that was processed
- Processing status (success/failed)
- Extracted markdown content (for successful requests)
- Error details (for failed requests)
- Timestamp of processing
- Post-extraction results (if LLM processing is enabled)

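For illustration, here is a minimal sketch of an endpoint that could receive these notifications, using only the Python standard library. The field names follow the payload structure described above (and shown in full under "Webhook Payload Structure"); the host and port are placeholders:

```python
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class WebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Read and parse the JSON payload posted by the crawler
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")

        url = payload.get("url")
        if payload.get("status") == "success":
            markdown = payload.get("markdown") or ""
            print(f"OK  {url} ({len(markdown)} chars of markdown)")
        else:
            print(f"ERR {url}: {payload.get('error')}")

        # Acknowledge receipt so the notification is not treated as failed
        self.send_response(200)
        self.end_headers()

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8000), WebhookHandler).serve_forever()
```

Any HTTP endpoint that accepts a JSON POST and answers with a 2xx status works the same way; webhook.site URLs in the examples below are simply hosted placeholders.
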
## Complete Webhook Integration Examples

### Basic Webhook Integration

This example sends a webhook notification after each URL is processed:

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

# Initialize crawler
spider = SpiderForce4AI("http://localhost:3004")

# Configure with basic webhook support
config = CrawlConfig(
    # Content selection
    target_selector="article",
    remove_selectors=[".ads", ".navigation"],

    # Processing
    max_concurrent_requests=5,
    request_delay=0.5,

    # Output
    output_dir=Path("content_output"),
    save_reports=True,

    # Webhook configuration
    webhook_url="https://webhook.site/your-unique-id",
    webhook_headers={
        "Authorization": "Bearer your-token",
        "Content-Type": "application/json"
    }
)

# Crawl URLs with webhook notifications
urls = ["https://petertam.pro/about", "https://petertam.pro/contact"]
results = spider.crawl_urls_server_parallel(urls, config)
```

### Custom Webhook Payload Template

You can customize the webhook payload format:

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize crawler
spider = SpiderForce4AI("http://localhost:3004")

# Configure with custom webhook payload
config = CrawlConfig(
    target_selector="article",
    max_concurrent_requests=3,

    # Webhook with custom payload template
    webhook_url="https://webhook.site/your-unique-id",
    webhook_payload_template='''{
        "page": {
            "url": "{url}",
            "status": "{status}",
            "processed_at": "{timestamp}"
        },
        "content": {
            "markdown": "{markdown}",
            "error": "{error}"
        },
        "metadata": {
            "service": "SpiderForce4AI",
            "version": "2.6.7",
            "client_id": "your-client-id"
        }
    }'''
)

# Crawl with custom webhook payload
results = spider.crawl_url("https://petertam.pro", config)
```

### Advanced: Post-Extraction Webhooks

This example demonstrates how to use webhooks with the AI post-processing feature:

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
import requests
from pathlib import Path

# Define extraction template structure
extraction_template = """
{
    "Title": "Extract the main title of the content",
    "MetaDescription": "Extract a search-friendly meta description (under 160 characters)",
    "KeyPoints": ["Extract 3-5 key points from the content"],
    "Categories": ["Extract relevant categories for this content"],
    "ReadingTimeMinutes": "Estimate reading time in minutes"
}
"""

# Define a custom webhook function
def post_extraction_webhook(extraction_result):
    """Send extraction results to a webhook and return transformed data."""
    # Add custom fields or transform the data as needed
    payload = {
        "url": extraction_result.get("url", ""),
        "title": extraction_result.get("Title", ""),
        "description": extraction_result.get("MetaDescription", ""),
        "key_points": extraction_result.get("KeyPoints", []),
        "categories": extraction_result.get("Categories", []),
        "reading_time": extraction_result.get("ReadingTimeMinutes", ""),
        "processed_at": extraction_result.get("timestamp", "")
    }

    # Send to webhook (example using a different webhook than the main one)
    try:
        response = requests.post(
            "https://webhook.site/your-extraction-webhook-id",
            json=payload,
            headers={
                "Authorization": "Bearer extraction-token",
                "Content-Type": "application/json"
            },
            timeout=10
        )
        print(f"Extraction webhook sent: Status {response.status_code}")
    except Exception as e:
        print(f"Extraction webhook error: {str(e)}")

    # Return the transformed data (will be stored in result.extraction_result)
    return payload

# Initialize crawler
spider = SpiderForce4AI("http://localhost:3004")

# Configure with post-extraction and webhooks
config = CrawlConfig(
    # Basic crawling settings
    target_selector="article",
    remove_selectors=[".ads", ".navigation", ".comments"],
    max_concurrent_requests=5,

    # Regular webhook for crawl results
    webhook_url="https://webhook.site/your-crawl-webhook-id",
    webhook_headers={
        "Authorization": "Bearer crawl-token",
        "Content-Type": "application/json"
    },

    # Post-extraction LLM processing
    post_extraction_agent={
        "model": "gpt-4-turbo",  # Or another compatible model
        "api_key": "your-api-key-here",
        "max_tokens": 1000,
        "temperature": 0.3,
        "response_format": "json_object",  # Request JSON response format
        "messages": [
            {
                "role": "system",
                "content": f"Extract the following information from the content. Return ONLY valid JSON, no explanations:\n\n{extraction_template}"
            },
            {
                "role": "user",
                "content": "{here_markdown_content}"  # Will be replaced with actual content
            }
        ]
    },
    # Save combined extraction results
    post_extraction_agent_save_to_file="extraction_results.json",
    # Custom function to transform and send extraction webhook
    post_agent_transformer_function=post_extraction_webhook
)

# Crawl a sitemap with both regular and post-extraction webhooks
results = spider.crawl_sitemap_server_parallel("https://petertam.pro/sitemap.xml", config)
```

### Real-World Example: SEO Analyzer with Webhooks

This complete example crawls a site and uses Mistral AI to extract SEO-focused data, sending custom webhooks at each step:

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path
import requests

# Define your extraction template
extraction_template = """
{
    "Title": "Extract the main title of the content (ideally under 60 characters for SEO)",
    "MetaDescription": "Extract the meta description of the content (ideally under 160 characters for SEO)",
    "CanonicalUrl": "Extract the canonical URL of the content",
    "Headings": {
        "H1": "Extract the main H1 heading",
        "H2": ["Extract all H2 headings (for better content structure)"],
        "H3": ["Extract all H3 headings (for better content structure)"]
    },
    "KeyPoints": [
        "Point 1 (focus on primary keywords)",
        "Point 2 (focus on secondary keywords)",
        "Point 3 (focus on user intent)"
    ],
    "CallToAction": "Extract current call to actions (e.g., Sign up now for exclusive offers!)"
}
"""

# Define custom transformer function
def post_webhook(post_extraction_agent_response):
    """Transform and send extracted SEO-focused data to webhook."""
    payload = {
        "url": post_extraction_agent_response.get("url", ""),
        "Title": post_extraction_agent_response.get("Title", ""),
        "MetaDescription": post_extraction_agent_response.get("MetaDescription", ""),
        "CanonicalUrl": post_extraction_agent_response.get("CanonicalUrl", ""),
        "Headings": post_extraction_agent_response.get("Headings", {}),
        "KeyPoints": post_extraction_agent_response.get("KeyPoints", []),
        "CallToAction": post_extraction_agent_response.get("CallToAction", "")
    }

    # Send webhook with custom headers
    try:
        response = requests.post(
            "https://webhook.site/your-extraction-webhook-id",
            json=payload,
            headers={
                "Authorization": "Bearer token",
                "X-Custom-Header": "value",
                "Content-Type": "application/json"
            },
            timeout=10
        )

        # Log the response for debugging
        if response.status_code == 200:
            print(f"✅ Webhook sent successfully for {payload['url']}")
        else:
            print(f"❌ Failed to send webhook. Status code: {response.status_code}")
    except Exception as e:
        print(f"❌ Webhook error: {str(e)}")

    return payload  # Return transformed data

# Initialize crawler
spider = SpiderForce4AI("http://localhost:3004")

# Configure post-extraction agent
post_extraction_agent = {
    "model": "mistral/mistral-large-latest",  # Or any model compatible with litellm
    "messages": [
        {
            "role": "system",
            "content": f"Based on the provided markdown content, extract the following information. Return ONLY valid JSON, no comments or explanations:\n\n{extraction_template}"
        },
        {
            "role": "user",
            "content": "{here_markdown_content}"  # Placeholder for markdown content
        }
    ],
    "api_key": "your-mistral-api-key",  # Replace with your actual API key
    "max_tokens": 8000,
    "response_format": "json_object"  # Request JSON format from the API
}

# Create crawl configuration
config = CrawlConfig(
    # Basic crawl settings
    save_reports=True,
    max_concurrent_requests=5,
    remove_selectors=[
        ".header-bottom",
        "#mainmenu",
        ".headerfix",
        ".header",
        ".menu-item",
        ".wpcf7",
        ".followus-section"
    ],
    output_dir=Path("reports"),

    # Crawl results webhook
    webhook_url="https://webhook.site/your-crawl-webhook-id",
    webhook_headers={
        "Authorization": "Bearer crawl-token",
        "Content-Type": "application/json"
    },

    # Add post-extraction configuration
    post_extraction_agent=post_extraction_agent,
    post_extraction_agent_save_to_file="combined_extraction.json",
    post_agent_transformer_function=post_webhook
)

# Run the crawler with parallel processing
results = spider.crawl_sitemap_parallel(
    "https://petertam.pro/sitemap.xml",
    config
)

# Print summary
successful = len([r for r in results if r.status == "success"])
extracted = len([r for r in results if hasattr(r, 'extraction_result') and r.extraction_result])
print(f"\n📊 Crawling complete:")
print(f"  - {len(results)} URLs processed")
print(f"  - {successful} successfully crawled")
print(f"  - {extracted} with AI extraction")
print(f"  - Reports saved to: {config.report_file}")
print(f"  - Extraction data saved to: {config.post_extraction_agent_save_to_file}")
```

## AI-Powered Post-Processing with Webhook Integration

The package includes a powerful AI post-processing system through the `PostExtractionAgent` class with integrated webhook capabilities.

### Post-Extraction Configuration

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig, PostExtractionConfig, PostExtractionAgent
import requests

# Define a custom webhook transformer
def transform_and_notify(result):
    """Process extraction results and send to external system."""
    # Send to external system
    requests.post(
        "https://your-api.com/analyze",
        json=result,
        headers={"Authorization": "Bearer token"}
    )
    return result  # Return data for storage

# Initialize crawler (service URL as in the other examples)
spider = SpiderForce4AI("http://localhost:3004")

# Configure post-extraction processing with webhooks
config = CrawlConfig(
    # Basic crawl settings
    target_selector="article",
    max_concurrent_requests=5,

    # Standard crawl webhook
    webhook_url="https://webhook.site/your-crawl-webhook-id",

    # Post-extraction LLM configuration
    post_extraction_agent={
        "model": "gpt-4-turbo",
        "api_key": "your-api-key-here",
        "max_tokens": 1000,
        "temperature": 0.7,
        "base_url": "https://api.openai.com/v1",
        "messages": [
            {
                "role": "system",
                "content": "Extract key information from the following content."
            },
            {
                "role": "user",
                "content": "Please analyze the following content:\n\n{here_markdown_content}"
            }
        ]
    },
    # Save extraction results to file
    post_extraction_agent_save_to_file="extraction_results.json",
    # Custom webhook transformer
    post_agent_transformer_function=transform_and_notify
)

# Crawl with post-processing and webhooks
results = spider.crawl_urls_server_parallel(["https://petertam.pro"], config)
```

## Crawling Methods

All methods support webhook integration automatically when configured:

### 1. Single URL Processing

```python
# Synchronous with webhook
result = spider.crawl_url("https://petertam.pro", config)

# Asynchronous with webhook
async def crawl():
    result = await spider.crawl_url_async("https://petertam.pro", config)
```

### 2. Multiple URLs

```python
urls = ["https://petertam.pro/about", "https://petertam.pro/contact"]

# Server-side parallel with webhooks (recommended)
results = spider.crawl_urls_server_parallel(urls, config)

# Client-side parallel with webhooks
results = spider.crawl_urls_parallel(urls, config)

# Asynchronous with webhooks
async def crawl():
    results = await spider.crawl_urls_async(urls, config)
```

### 3. Sitemap Processing

```python
# Server-side parallel with webhooks (recommended)
results = spider.crawl_sitemap_server_parallel("https://petertam.pro/sitemap.xml", config)

# Client-side parallel with webhooks
results = spider.crawl_sitemap_parallel("https://petertam.pro/sitemap.xml", config)

# Asynchronous with webhooks
async def crawl():
    results = await spider.crawl_sitemap_async("https://petertam.pro/sitemap.xml", config)
```

## Webhook Payload Structure

The default webhook payload includes:

```json
{
  "url": "https://petertam.pro/about",
  "status": "success",
  "markdown": "# About\n\nThis is the about page content...",
  "error": null,
  "timestamp": "2025-02-27T12:30:45.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", ".navigation"]
  },
  "extraction_result": {
    "Title": "About Peter Tam",
    "MetaDescription": "Professional developer with expertise in AI and web technologies",
    "KeyPoints": ["Over 10 years experience", "Specializes in AI integration", "Full-stack development"]
  }
}
```

For failed requests:

```json
{
  "url": "https://petertam.pro/missing-page",
  "status": "failed",
  "markdown": null,
  "error": "HTTP 404: Not Found",
  "timestamp": "2025-02-27T12:31:23.456789",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", ".navigation"]
  },
  "extraction_result": null
}
```

## Custom Webhook Transformation

You can transform webhook data with a custom function:

```python
import requests

def transform_webhook_data(result):
    """Custom transformer for webhook data."""
    # Extract only needed fields
    transformed = {
        "url": result.get("url"),
        "title": result.get("Title"),
        "description": result.get("MetaDescription"),
        "processed_at": result.get("timestamp")
    }

    # Add custom calculations
    if "raw_content" in result:
        transformed["word_count"] = len(result["raw_content"].split())
        transformed["reading_time_min"] = max(1, transformed["word_count"] // 200)

    # Send to external systems if needed
    requests.post("https://your-analytics-api.com/log", json=transformed)

    return transformed  # This will be stored in result.extraction_result
```

## Smart Retry Mechanism

The package provides a sophisticated retry system with webhook notifications:

```python
# Retry behavior with webhook notifications
config = CrawlConfig(
    max_concurrent_requests=5,
    request_delay=1.0,
    webhook_url="https://webhook.site/your-webhook-id",
    # Webhook will be called for both initial attempts and retries
)
results = spider.crawl_urls_async(urls, config)
```

## Report Generation

The package can generate detailed reports of crawling operations (an example of reading the saved report follows the configuration below):

```python
config = CrawlConfig(
    save_reports=True,
    report_file=Path("custom_report.json"),
    output_dir=Path("content"),
    webhook_url="https://webhook.site/your-webhook-id"
)
```

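As a rough companion sketch, the saved report can be inspected after a run. This assumes the report layout documented for earlier releases (a top-level `summary` block with `total`, `successful` and `failed`, plus `results.successful` / `results.failed` lists), so treat the field names as indicative rather than guaranteed:

```python
import json
from pathlib import Path

# Assumes the report layout documented for earlier releases of this package.
report = json.loads(Path("custom_report.json").read_text())

summary = report.get("summary", {})
print(f"Total:      {summary.get('total')}")
print(f"Successful: {summary.get('successful')}")
print(f"Failed:     {summary.get('failed')}")

# List entries that still failed after retries
for item in report.get("results", {}).get("failed", []):
    print("Failed entry:", item)
```
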
## Combined Content Output

You can combine multiple pages into a single Markdown file:

```python
config = CrawlConfig(
    combine_to_one_markdown="full",
    combined_markdown_file=Path("all_pages.md"),
    webhook_url="https://webhook.site/your-webhook-id"
)
```

## Progress Tracking

The package provides rich progress tracking with detailed statistics:

```
Fetching sitemap from https://petertam.pro/sitemap.xml...
Found 23 URLs in sitemap
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 23/23 URLs

Retrying failed URLs: 3 (13.0% failed)
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 3/3 retries

Starting post-extraction processing...
[⠋] Post-extraction processing... • 0/20 0% • 00:00:00

Crawling Summary:
Total URLs processed: 23
Initial failures: 3 (13.0%)
Final results:
  ✓ Successful: 22
  ✗ Failed: 1
Retry success rate: 2/3 (66.7%)
```

## Advanced Error Handling with Webhooks

```python
import logging
import requests
from datetime import datetime

from spiderforce4ai import CrawlConfig

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    filename='spider_log.txt'
)

# Define error handling webhook function
def error_handler(extraction_result):
    """Handle errors and send notifications."""
    url = extraction_result.get('url', 'unknown')

    # Check for errors
    if 'error' in extraction_result and extraction_result['error']:
        # Log the error
        logging.error(f"Error processing {url}: {extraction_result['error']}")

        # Send error notification webhook
        try:
            requests.post(
                "https://webhook.site/your-error-webhook-id",
                json={
                    "url": url,
                    "error": extraction_result['error'],
                    "timestamp": datetime.now().isoformat(),
                    "severity": "high" if "404" in extraction_result['error'] else "medium"
                },
                headers={"X-Error-Alert": "true"}
            )
        except Exception as e:
            logging.error(f"Failed to send error webhook: {e}")

    # Always return the original result to ensure data is preserved
    return extraction_result

# Add to config
config = CrawlConfig(
    # Regular webhook for all crawl results
    webhook_url="https://webhook.site/your-regular-webhook-id",

    # Error handling webhook through the transformer function
    post_agent_transformer_function=error_handler
)
```

## Requirements

- Python 3.11+
- Running SpiderForce4AI service
- Internet connection

## Dependencies

- aiohttp
- asyncio
- rich
- aiofiles
- httpx
- litellm
- pydantic
- requests
- pandas
- numpy
- openai

## License

MIT License

## Credits

Created by [Piotr Tamulewicz](https://petertam.pro)

@@ -0,0 +1,7 @@
spiderforce4ai/__init__.py,sha256=p_ybuwvTD7bTelORBzAkomUQrc69WvOmu3owHKlzp0A,42231
spiderforce4ai/post_extraction_agent.py,sha256=7N2VYCfsfIh-my-Sc0_lnhmsfb3nyIbDOpnI007M1DM,19075
spiderforce4ai-2.6.8.dist-info/METADATA,sha256=QXMvOkWgOgNb4HL3RKgyPMlsSrOeleQlT-9ma0FRzQs,25726
spiderforce4ai-2.6.8.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
spiderforce4ai-2.6.8.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
spiderforce4ai-2.6.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-2.6.8.dist-info/RECORD,,

@@ -1,336 +0,0 @@
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 2.6.7
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
Project-URL: Homepage, https://petertam.pro
Project-URL: Documentation, https://petertam.pro/docs/spiderforce4ai
Project-URL: Repository, https://github.com/yourusername/spiderforce4ai
Project-URL: Bug Tracker, https://github.com/yourusername/spiderforce4ai/issues
Keywords: web-scraping,markdown,html-to-markdown,llm,ai,content-extraction,async,parallel-processing
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Classifier: Topic :: Text Processing :: Markup :: Markdown
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Requires-Dist: litellm>=1.26.0
Requires-Dist: pydantic>=2.6.0
Requires-Dist: requests>=2.31.0
Requires-Dist: aiofiles>=23.2.1
Requires-Dist: et-xmlfile>=1.1.0
Requires-Dist: multidict>=6.0.4
Requires-Dist: openai>=1.12.0
Requires-Dist: pandas>=2.2.0
Requires-Dist: numpy>=1.26.0
Requires-Dist: yarl>=1.9.4
Requires-Dist: typing_extensions>=4.9.0
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.1; extra == "dev"
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
Requires-Dist: black>=23.7.0; extra == "dev"
Requires-Dist: isort>=5.12.0; extra == "dev"
Requires-Dist: mypy>=1.4.1; extra == "dev"
Requires-Dist: ruff>=0.1.8; extra == "dev"
Requires-Dist: pre-commit>=3.5.0; extra == "dev"
Provides-Extra: test
Requires-Dist: pytest>=7.4.0; extra == "test"
Requires-Dist: pytest-asyncio>=0.21.1; extra == "test"
Requires-Dist: pytest-cov>=4.1.0; extra == "test"
Requires-Dist: pytest-mock>=3.12.0; extra == "test"
Requires-Dist: coverage>=7.4.0; extra == "test"
Provides-Extra: docs
Requires-Dist: sphinx>=7.1.0; extra == "docs"
Requires-Dist: sphinx-rtd-theme>=1.3.0; extra == "docs"
Requires-Dist: myst-parser>=2.0.0; extra == "docs"
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# SpiderForce4AI Python Wrapper

A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

## Features

- HTML to Markdown conversion
- Parallel and async crawling support
- Sitemap processing
- Custom content selection
- Automatic retry mechanism
- Detailed progress tracking
- Webhook notifications
- Customizable reporting

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

# Initialize crawler
spider = SpiderForce4AI("http://localhost:3004")

# Configure crawling options
config = CrawlConfig(
    target_selector="article",
    remove_selectors=[".ads", ".navigation"],
    max_concurrent_requests=5,
    save_reports=True
)

# Crawl a sitemap
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
```

## Key Features

### 1. Smart Retry Mechanism
- Automatically retries failed URLs
- Monitors failure ratio to prevent server overload
- Detailed retry statistics and progress tracking
- Aborts retries if failure rate exceeds 20%

```python
# Retry behavior is automatic
config = CrawlConfig(
    max_concurrent_requests=5,
    request_delay=1.0  # Delay between retries
)
results = spider.crawl_urls_async(urls, config)
```

### 2. Custom Webhook Integration
- Flexible payload formatting
- Custom headers support
- Variable substitution in templates

```python
config = CrawlConfig(
    webhook_url="https://your-webhook.com",
    webhook_headers={
        "Authorization": "Bearer token",
        "X-Custom-Header": "value"
    },
    webhook_payload_template='''{
        "url": "{url}",
        "content": "{markdown}",
        "status": "{status}",
        "custom_field": "value"
    }'''
)
```

### 3. Flexible Report Generation
- Optional report saving
- Customizable report location
- Detailed success/failure statistics

```python
config = CrawlConfig(
    save_reports=True,
    report_file=Path("custom_report.json"),
    output_dir=Path("content")
)
```

## Crawling Methods

### 1. Single URL Processing

```python
# Synchronous
result = spider.crawl_url("https://example.com", config)

# Asynchronous
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs

```python
urls = ["https://example.com/page1", "https://example.com/page2"]

# Server-side parallel (recommended)
results = spider.crawl_urls_server_parallel(urls, config)

# Client-side parallel
results = spider.crawl_urls_parallel(urls, config)

# Asynchronous
async def crawl():
    results = await spider.crawl_urls_async(urls, config)
```

### 3. Sitemap Processing

```python
# Server-side parallel (recommended)
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)

# Client-side parallel
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

# Asynchronous
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
```

## Configuration Options

```python
config = CrawlConfig(
    # Content Selection
    target_selector="article",              # Target element to extract
    remove_selectors=[".ads", "#popup"],    # Elements to remove
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing
    max_concurrent_requests=5,              # Parallel processing limit
    request_delay=0.5,                      # Delay between requests
    timeout=30,                             # Request timeout

    # Output
    output_dir=Path("content"),             # Output directory
    save_reports=False,                     # Enable/disable report saving
    report_file=Path("report.json"),        # Report location

    # Webhook
    webhook_url="https://webhook.com",      # Webhook endpoint
    webhook_timeout=10,                     # Webhook timeout
    webhook_headers={                       # Custom headers
        "Authorization": "Bearer token"
    },
    webhook_payload_template='''            # Custom payload format
    {
        "url": "{url}",
        "content": "{markdown}",
        "status": "{status}",
        "error": "{error}",
        "time": "{timestamp}"
    }'''
)
```

## Progress Tracking

The package provides detailed progress information:

```
Fetching sitemap from https://example.com/sitemap.xml...
Found 156 URLs in sitemap
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs

Retrying failed URLs: 18 (11.5% failed)
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries

Crawling Summary:
Total URLs processed: 156
Initial failures: 18 (11.5%)
Final results:
  ✓ Successful: 150
  ✗ Failed: 6
Retry success rate: 12/18 (66.7%)
```

## Output Structure

### 1. Directory Layout
```
content/                    # Output directory
├── example-com-page1.md    # Markdown files
├── example-com-page2.md
└── report.json             # Crawl report
```

### 2. Report Format
```json
{
  "timestamp": "2025-02-15T10:30:00",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads"]
  },
  "results": {
    "successful": [...],
    "failed": [...]
  },
  "summary": {
    "total": 156,
    "successful": 150,
    "failed": 6
  }
}
```

## Performance Optimization

1. Server-side Parallel Processing
   - Recommended for most cases
   - Single HTTP request
   - Reduced network overhead
   - Built-in load balancing

2. Client-side Parallel Processing
   - Better control over processing
   - Customizable concurrency
   - Progress tracking per URL
   - Automatic retry handling

3. Asynchronous Processing
   - Ideal for async applications
   - Non-blocking operation
   - Real-time progress updates
   - Efficient resource usage

## Error Handling

The package provides comprehensive error handling:

- Automatic retry for failed URLs
- Failure ratio monitoring
- Detailed error reporting
- Webhook error notifications
- Progress tracking during retries

## Requirements

- Python 3.11+
- Running SpiderForce4AI service
- Internet connection

## Dependencies

- aiohttp
- asyncio
- rich
- aiofiles
- httpx

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)

@@ -1,7 +0,0 @@
spiderforce4ai/__init__.py,sha256=p_ybuwvTD7bTelORBzAkomUQrc69WvOmu3owHKlzp0A,42231
spiderforce4ai/post_extraction_agent.py,sha256=7N2VYCfsfIh-my-Sc0_lnhmsfb3nyIbDOpnI007M1DM,19075
spiderforce4ai-2.6.7.dist-info/METADATA,sha256=5Tjsk-VHFD81TDxfh2LAClnoWa99BwBbNja9-681rZI,9012
spiderforce4ai-2.6.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-2.6.7.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
spiderforce4ai-2.6.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-2.6.7.dist-info/RECORD,,