spiderforce4ai 2.3.1__py3-none-any.whl → 2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +411 -349
- spiderforce4ai/post_extraction_agent.py +259 -0
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/METADATA +41 -3
- spiderforce4ai-2.4.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/entry_points.txt +2 -0
- spiderforce4ai-2.3.1.dist-info/RECORD +0 -5
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -1,10 +1,11 @@
 # spiderforce4ai/__init__.py
 
+from .post_extraction_agent import PostExtractionAgent, PostExtractionConfig, ExtractionTemplate
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional, Tuple
+from typing import List, Dict, Union, Optional, Tuple, Callable, Any
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -65,13 +66,14 @@ def extract_metadata_headers(markdown: str, url: str = '') -> str:
     output = []
     output.append(f"URL: {url}")
     output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
-    output.append(f"Description: {metadata.get('Description', '')}")
+    output.append(f"Description: {metadata.get('Description', '')}")
     output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
     output.append(f"Language: {metadata.get('Language', 'en')}")
     output.append("") # Empty line
     output.extend(headers)
 
     return '\n'.join(output)
+
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
@@ -90,6 +92,7 @@ class CrawlResult:
     error: Optional[str] = None
     timestamp: str = None
     config: Dict = None
+    extraction_result: Optional[Dict] = None # Store post-extraction results
 
     def __post_init__(self):
         if not self.timestamp:
@@ -110,9 +113,14 @@ class CrawlConfig:
     webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
     webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
     save_reports: bool = False # Whether to save crawl reports
-    report_file: Optional[Path] = None # Optional report file location
-    combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers'
-    combined_markdown_file: Optional[Path] = None # Optional path for combined
+    report_file: Optional[Path] = None # Optional report file location
+    combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers'
+    combined_markdown_file: Optional[Path] = None # Optional path for combined file
+
+    # Post-extraction settings
+    post_extraction_agent: Optional[Dict[str, Any]] = None # LLM configuration
+    post_extraction_agent_save_to_file: Optional[str] = None # Extraction output file
+    post_agent_transformer_function: Optional[Callable] = None # Custom transformer
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
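The three new post-extraction fields above are read as a plain dict, an output path and a callable. A minimal sketch of the dict, using only keys that this diff reads or validates (model, messages and api_key are required; max_tokens, temperature and base_url are optional) — the values shown are placeholders, not package defaults beyond the ones visible in the code:

    # illustrative values only; required keys mirror the __post_init__ validation added below
    post_extraction_agent = {
        "model": "gpt-4o-mini",     # required (placeholder model name)
        "messages": [{"role": "user", "content": "Return the page summary as JSON."}],  # required
        "api_key": "sk-...",        # required
        "max_tokens": 1000,         # optional; the crawl paths default to 1000
        "temperature": 0.7,         # optional; defaults to 0.7
        "base_url": None,           # optional, e.g. an OpenAI-compatible endpoint
    }
    # passed as CrawlConfig(post_extraction_agent=post_extraction_agent,
    #                       post_extraction_agent_save_to_file="extraction.json",
    #                       post_agent_transformer_function=None, ...)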
@@ -140,6 +148,15 @@ class CrawlConfig:
             # Create or clear the combined file
             self.combined_markdown_file.write_text('')
 
+        # Validate post-extraction agent configuration if provided
+        if self.post_extraction_agent:
+            if "messages" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'messages'")
+            if "model" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'model'")
+            if "api_key" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'api_key'")
+
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
         payload = {}
@@ -151,52 +168,120 @@ class CrawlConfig:
         if self.remove_selectors_regex:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
-
-
+
 def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     """Synchronous version of webhook sender for parallel processing."""
     if not config.webhook_url:
         return
 
-
-
-
-
-
-
-
-
-
-
+    try:
+        # Use custom payload template if provided, otherwise use default
+        if config.webhook_payload_template:
+            # Replace variables in the template
+            payload_str = config.webhook_payload_template.format(
+                url=result.url,
+                status=result.status,
+                markdown=result.markdown if result.status == "success" else None,
+                error=result.error if result.status == "failed" else None,
+                timestamp=result.timestamp,
+                config=config.to_dict(),
+                extraction_result=result.extraction_result if result.extraction_result else None
+            )
+            payload = json.loads(payload_str) # Parse the formatted JSON string
+        else:
+            # Use default payload format
+            payload = {
+                "url": result.url,
+                "status": result.status,
+                "markdown": result.markdown if result.status == "success" else None,
+                "error": result.error if result.status == "failed" else None,
+                "timestamp": result.timestamp,
+                "config": config.to_dict(),
+                "extraction_result": result.extraction_result if result.extraction_result else None
+            }
+
+        response = requests.post(
+            config.webhook_url,
+            json=payload,
+            headers=config.webhook_headers,
+            timeout=config.webhook_timeout
         )
-
-
-
+        response.raise_for_status()
+    except Exception as e:
+        console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+async def _send_webhook_async(result: CrawlResult, config: CrawlConfig):
+    """Asynchronous webhook sender."""
+    if not config.webhook_url:
+        return
+
+    try:
+        # Prepare payload similar to sync version
         payload = {
             "url": result.url,
             "status": result.status,
             "markdown": result.markdown if result.status == "success" else None,
             "error": result.error if result.status == "failed" else None,
             "timestamp": result.timestamp,
-            "config": config.to_dict()
+            "config": config.to_dict(),
+            "extraction_result": result.extraction_result if result.extraction_result else None
         }
 
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                config.webhook_url,
+                json=payload,
+                headers=config.webhook_headers,
+                timeout=config.webhook_timeout
+            )
+            response.raise_for_status()
+    except Exception as e:
+        console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+async def _save_markdown_async(url: str, markdown: str, config: CrawlConfig):
+    """Save markdown content to file and/or append to combined file asynchronously."""
     try:
-
-
-
-
-
-
-
+        # Save individual file if not combining or if combining in full mode
+        if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = config.output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if config.combine_to_one_markdown:
+            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
     except Exception as e:
-        print(f"
+        console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
+
+def _save_markdown_sync(url: str, markdown: str, config: CrawlConfig) -> None:
+    """Synchronous version of markdown saver for parallel processing."""
+    try:
+        # Save individual file if not combining or if combining in full mode
+        if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Handle combined markdown file
+        if config.combine_to_one_markdown:
+            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                f.write(combined_content)
+    except Exception as e:
+        console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
 
-# Module level function for multiprocessing
 def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
     """Process a single URL for parallel processing."""
     url, base_url, config = args
     try:
+        # Make the conversion request
         endpoint = f"{base_url}/convert"
         payload = {
             "url": url,
@@ -211,7 +296,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
                 error=f"HTTP {response.status_code}: {response.text}",
                 config=config.to_dict()
             )
-            # Send webhook for failed result
             _send_webhook_sync(result, config)
             return result
 
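The synchronous webhook sender above fills webhook_payload_template with str.format() and then parses the result with json.loads, so a custom template has to escape literal JSON braces as doubled braces and may only reference the names passed to format() (url, status, markdown, error, timestamp, config, extraction_result). A rough sketch, not taken from the package documentation, that sticks to the string-valued fields:

    # hypothetical template; doubled braces survive .format() as literal JSON braces
    webhook_payload_template = '''{{
        "page": "{url}",
        "crawl_status": "{status}",
        "finished_at": "{timestamp}"
    }}'''
    # after .format(url=..., status=..., timestamp=..., ...) this parses with json.loads,
    # provided the substituted values contain no unescaped quotes or newlines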
@@ -219,19 +303,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         # Save markdown if output directory is configured
         if config.output_dir:
-
-            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
-                filepath = config.output_dir / f"{slugify(url)}.md"
-                with open(filepath, 'w', encoding='utf-8') as f:
-                    f.write(markdown)
-
-            # Handle combined markdown file
-            if config.combine_to_one_markdown:
-                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
-                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
-                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
-                    f.write(combined_content)
+            _save_markdown_sync(url, markdown, config)
 
         result = CrawlResult(
             url=url,
@@ -240,6 +312,28 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             config=config.to_dict()
         )
 
+        # Handle post-extraction if configured
+        if config.post_extraction_agent:
+            try:
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+
+                agent = PostExtractionAgent(post_config)
+                extraction_result = asyncio.run(agent.process_content(url, markdown))
+                if extraction_result:
+                    result.extraction_result = extraction_result
+            except Exception as e:
+                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
         # Send webhook for successful result
         _send_webhook_sync(result, config)
 
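The transformer forwarded through custom_transform_function is defined by the new post_extraction_agent.py, which this hunk does not show. Assuming it is handed the parsed extraction result for one page and that its return value is what gets stored, a transformer could be as small as this sketch:

    # assumed contract: called per page with the parsed extraction result (a dict)
    def tag_source(extraction_result: dict) -> dict:
        extraction_result["source"] = "spiderforce4ai"  # hypothetical enrichment
        return extraction_result

    # wired in via CrawlConfig(post_agent_transformer_function=tag_source, ...)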
@@ -260,6 +354,60 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
         _send_webhook_sync(result, config)
         return result
 
+async def _save_report_async(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None):
+    """Save crawl report to JSON file asynchronously."""
+    if not config.report_file:
+        return
+
+    # Separate successful and failed results
+    successful_results = [r for r in results if r.status == "success"]
+    failed_results = [r for r in results if r.status == "failed"]
+
+    report = {
+        "timestamp": datetime.now().isoformat(),
+        "config": config.to_dict(),
+        "results": {
+            "successful": [asdict(r) for r in successful_results],
+            "failed": [asdict(r) for r in failed_results]
+        },
+        "summary": {
+            "total": len(results),
+            "successful": len(successful_results),
+            "failed": len(failed_results),
+            "retry_info": retry_stats or {}
+        }
+    }
+
+    async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+        await f.write(json.dumps(report, indent=2))
+
+def _save_report_sync(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None) -> None:
+    """Synchronous version of report saver."""
+    if not config.report_file:
+        return
+
+    # Create report similar to async version
+    successful_results = [r for r in results if r.status == "success"]
+    failed_results = [r for r in results if r.status == "failed"]
+
+    report = {
+        "timestamp": datetime.now().isoformat(),
+        "config": config.to_dict(),
+        "results": {
+            "successful": [asdict(r) for r in successful_results],
+            "failed": [asdict(r) for r in failed_results]
+        },
+        "summary": {
+            "total": len(results),
+            "successful": len(successful_results),
+            "failed": len(failed_results),
+            "retry_info": retry_stats or {}
+        }
+    }
+
+    with open(config.report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2)
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
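Both savers write the same JSON structure, so a consumer can read either output identically; reading it back relies only on keys built above:

    import json

    # "crawl_report.json" stands in for wherever CrawlConfig.report_file points
    with open("crawl_report.json", encoding="utf-8") as f:
        report = json.load(f)

    summary = report["summary"]
    print(summary["total"], summary["successful"], summary["failed"])
    print(summary["retry_info"])  # the retry_stats dict passed in
    failed_urls = [r["url"] for r in report["results"]["failed"]]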
@@ -268,6 +416,7 @@ class SpiderForce4AI:
         self.session = None
         self._executor = ThreadPoolExecutor()
         self.crawl_results: List[CrawlResult] = []
+        self._retry_stats = {}
 
     async def _ensure_session(self):
         """Ensure aiohttp session exists."""
@@ -279,215 +428,6 @@ class SpiderForce4AI:
         if self.session and not self.session.closed:
             await self.session.close()
 
-    async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file and/or append to combined file."""
-        # Save individual file if not combining or if combining in full mode
-        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
-            filename = f"{slugify(url)}.md"
-            filepath = output_dir / filename
-            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-                await f.write(markdown)
-
-        # Handle combined markdown file
-        if self.config.combine_to_one_markdown:
-            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
-            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
-            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
-                await f.write(combined_content)
-
-
-
-    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs using server-side parallel processing.
-        """
-        print(f"Fetching sitemap from {sitemap_url}...")
-
-        # Fetch sitemap
-        try:
-            response = requests.get(sitemap_url, timeout=config.timeout)
-            response.raise_for_status()
-            sitemap_text = response.text
-        except Exception as e:
-            print(f"Error fetching sitemap: {str(e)}")
-            raise
-
-        # Parse sitemap
-        try:
-            root = ET.fromstring(sitemap_text)
-            namespace = {'ns': root.tag.split('}')[0].strip('{')}
-            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
-            print(f"Found {len(urls)} URLs in sitemap")
-        except Exception as e:
-            print(f"Error parsing sitemap: {str(e)}")
-            raise
-
-        # Process URLs using server-side parallel endpoint
-        return self.crawl_urls_server_parallel(urls, config)
-
-
-    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl multiple URLs using server-side parallel processing.
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        print(f"Sending {len(urls)} URLs for parallel processing...")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json() # Assuming server returns JSON array of results
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    filepath = config.output_dir / f"{slugify(result.url)}.md"
-                    with open(filepath, 'w', encoding='utf-8') as f:
-                        f.write(result.markdown)
-
-                # Send webhook if configured
-                if config.webhook_url:
-                    _send_webhook_sync(result, config)
-
-                results.append(result)
-
-            # Calculate statistics
-            successful = len([r for r in results if r.status == "success"])
-            failed = len([r for r in results if r.status == "failed"])
-
-            # Print summary
-            print(f"\nParallel processing completed:")
-            print(f"✓ Successful: {successful}")
-            print(f"✗ Failed: {failed}")
-
-            # Save report if enabled
-            if config.save_reports and config.report_file:
-                self._retry_stats = {
-                    "initial_failures": failed,
-                    "failure_ratio": (failed / len(urls)) * 100,
-                    "retry_successful": 0, # No retries in server parallel mode
-                    "retry_failed": failed
-                }
-                self._save_report_sync(results, config)
-                console.print(f"📊 Report saved to: {config.report_file}")
-
-            return results
-
-        except Exception as e:
-            print(f"Error during parallel processing: {str(e)}")
-            # Create failed results for all URLs
-            return [
-                CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=str(e),
-                    config=config.to_dict()
-                ) for url in urls
-            ]
-
-
-    async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
-        """Send webhook with crawl results."""
-        if not config.webhook_url:
-            return
-
-        payload = {
-            "url": result.url,
-            "status": result.status,
-            "markdown": result.markdown if result.status == "success" else None,
-            "error": result.error if result.status == "failed" else None,
-            "timestamp": result.timestamp,
-            "config": config.to_dict()
-        }
-
-        try:
-            async with httpx.AsyncClient() as client:
-                response = await client.post(
-                    config.webhook_url,
-                    json=payload,
-                    timeout=config.webhook_timeout
-                )
-                response.raise_for_status()
-        except Exception as e:
-            console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
-
-    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
-        """Save crawl report synchronously."""
-        # Separate successful and failed results
-        successful_results = [r for r in results if r.status == "success"]
-        failed_results = [r for r in results if r.status == "failed"]
-
-        # Create report with only final state
-        report = {
-            "timestamp": datetime.now().isoformat(),
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in successful_results],
-                "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len(successful_results),
-                "failed": len(failed_results),
-                "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
-            }
-        }
-
-        with open(config.report_file, 'w', encoding='utf-8') as f:
-            json.dump(report, f, indent=2)
-
-    async def _save_report(self, config: CrawlConfig):
-        """Save crawl report to JSON file."""
-        if not config.report_file:
-            return
-
-        # Separate successful and failed results
-        successful_results = [r for r in self.crawl_results if r.status == "success"]
-        failed_results = [r for r in self.crawl_results if r.status == "failed"]
-
-        report = {
-            "timestamp": datetime.now().isoformat(),
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in successful_results],
-                "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
-            },
-            "summary": {
-                "total": len(self.crawl_results),
-                "successful": len(successful_results),
-                "failed": len(failed_results),
-                "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
-            }
-        }
-
-        async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
-            await f.write(json.dumps(report, indent=2))
-
     async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
         """Crawl a single URL asynchronously."""
         await self._ensure_session()
@@ -518,9 +458,31 @@ class SpiderForce4AI:
             )
 
             if config.output_dir:
-                await
+                await _save_markdown_async(url, markdown, config)
+
+            # Handle post-extraction if configured
+            if config.post_extraction_agent and result.status == "success":
+                try:
+                    post_config = PostExtractionConfig(
+                        model=config.post_extraction_agent["model"],
+                        messages=config.post_extraction_agent["messages"],
+                        api_key=config.post_extraction_agent["api_key"],
+                        max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                        temperature=config.post_extraction_agent.get("temperature", 0.7),
+                        base_url=config.post_extraction_agent.get("base_url"),
+                        combine_output=bool(config.post_extraction_agent_save_to_file),
+                        output_file=config.post_extraction_agent_save_to_file,
+                        custom_transform_function=config.post_agent_transformer_function
+                    )
+
+                    agent = PostExtractionAgent(post_config)
+                    extraction_result = await agent.process_content(url, markdown)
+                    if extraction_result:
+                        result.extraction_result = extraction_result
+                except Exception as e:
+                    console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
 
-            await
+            await _send_webhook_async(result, config)
 
             self.crawl_results.append(result)
             return result
@@ -540,18 +502,18 @@ class SpiderForce4AI:
         return asyncio.run(self.crawl_url_async(url, config))
 
     async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
-        """Retry failed URLs
+        """Retry failed URLs with optional progress tracking."""
         if not failed_results:
             return []
 
         failed_count = len(failed_results)
-        total_count = len(
+        total_count = len(self.crawl_results)
         failure_ratio = (failed_count / total_count) * 100
 
         console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
         retry_results = []
 
-        # Create
+        # Create or use provided progress bar
         should_close_progress = progress is None
         if progress is None:
             progress = Progress(
@@ -595,6 +557,7 @@ class SpiderForce4AI:
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
        await self._ensure_session()
+        post_extraction_results = {}
 
         with Progress(
             SpinnerColumn(),
@@ -603,52 +566,60 @@ class SpiderForce4AI:
             TaskProgressColumn(),
             console=console
         ) as progress:
-
+            crawl_task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
             async def crawl_with_progress(url):
                 result = await self.crawl_url_async(url, config)
-                progress.update(
+                progress.update(crawl_task, advance=1, description=f"[cyan]Crawled: {url}")
                 return result
 
+            # Set up concurrency control
             semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
             async def crawl_with_semaphore(url):
                 async with semaphore:
                     result = await crawl_with_progress(url)
                     await asyncio.sleep(config.request_delay)
                     return result
 
+            # Perform initial crawl
             initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
 
-            #
+            # Handle failed URLs
             failed_results = [r for r in initial_results if r.status == "failed"]
-
-            # Calculate initial failure ratio
             initial_failed = len(failed_results)
             total_urls = len(urls)
             failure_ratio = (initial_failed / total_urls) * 100
 
             # Retry failed URLs if ratio is acceptable
-
-
-                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
-                results = initial_results
-            else:
-                retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                # Update results list by replacing failed results with successful retries
-                results = initial_results.copy()
-                for retry_result in retry_results:
-                    for i, result in enumerate(results):
-                        if result.url == retry_result.url:
-                            results[i] = retry_result
-                            break
-                else:
-                    results = initial_results
+            results = initial_results
+            retry_successful = 0
 
-
+            if failed_results and failure_ratio <= 20:
+                retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                retry_successful = len([r for r in retry_results if r.status == "success"])
+
+                # Update results list
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
+
+            # Calculate final statistics
             final_successful = len([r for r in results if r.status == "success"])
             final_failed = len([r for r in results if r.status == "failed"])
 
-            #
+            # Update retry stats
+            self._retry_stats = {
+                "initial_failures": initial_failed,
+                "failure_ratio": failure_ratio,
+                "retry_successful": retry_successful if initial_failed > 0 else 0,
+                "retry_failed": final_failed,
+                "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+            }
+
+            # Print summary
             console.print(f"\n[green]Crawling Summary:[/green]")
             console.print(f"Total URLs processed: {total_urls}")
             console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
@@ -657,18 +628,11 @@ class SpiderForce4AI:
             console.print(f" ✗ Failed: {final_failed}")
 
             if initial_failed > 0:
-                retry_successful = initial_failed - final_failed
                 console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
-            # Save final report
+            # Save final report
             if config.save_reports:
-                self._retry_stats
-                    "initial_failures": initial_failed,
-                    "failure_ratio": failure_ratio,
-                    "retry_successful": retry_successful if initial_failed > 0 else 0,
-                    "retry_failed": final_failed
-                }
-                await self._save_report(config)
+                await _save_report_async(results, config, self._retry_stats)
                 console.print(f"📊 Report saved to: {config.report_file}")
 
             return results
@@ -705,32 +669,21 @@ class SpiderForce4AI:
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """Crawl sitemap URLs in parallel using multiprocessing
-
-
-        # Fetch sitemap
+        """Crawl sitemap URLs in parallel using multiprocessing."""
+        # Fetch and parse sitemap
         try:
             response = requests.get(sitemap_url, timeout=config.timeout)
             response.raise_for_status()
-
-        except Exception as e:
-            print(f"Error fetching sitemap: {str(e)}")
-            raise
-
-        # Parse sitemap
-        try:
-            root = ET.fromstring(sitemap_text)
+            root = ET.fromstring(response.text)
             namespace = {'ns': root.tag.split('}')[0].strip('{')}
             urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
-            print(f"Found {len(urls)} URLs in sitemap")
+            console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
         except Exception as e:
-            print(f"Error
+            console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
             raise
 
-        #
+        # Process URLs in parallel
         process_args = [(url, self.base_url, config) for url in urls]
-
-        # Create process pool and execute crawls
         results = []
 
         with Pool(processes=config.max_concurrent_requests) as pool:
@@ -741,81 +694,186 @@ class SpiderForce4AI:
                 TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                 TextColumn("({task.completed}/{task.total})"),
             ) as progress:
-                task = progress.add_task("Crawling URLs...", total=len(urls))
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
                 for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
-                    progress.description = f"Last: {status} {result.url}"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-        # Calculate
+        # Calculate statistics and handle retries
         failed_results = [r for r in results if r.status == "failed"]
         initial_failed = len(failed_results)
-
-
+        failure_ratio = (initial_failed / len(urls)) * 100
+        retry_successful = 0
 
-
-
-
-
-
-
-
-
-
-
-
-
-            if
-
-
-                    if config.output_dir and new_result.markdown:
-                        filepath = config.output_dir / f"{slugify(new_result.url)}.md"
-                        with open(filepath, 'w', encoding='utf-8') as f:
-                            f.write(new_result.markdown)
-                        # Send webhook for successful retry
-                        _send_webhook_sync(new_result, config)
-                    else:
-                        console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
-                        # Send webhook for failed retry
-                        _send_webhook_sync(new_result, config)
-
-                # Update results list
-                for i, r in enumerate(results):
-                    if r.url == new_result.url:
-                        results[i] = new_result
-                        break
+        if failed_results and failure_ratio <= 20:
+            console.print(f"\n[yellow]Retrying {initial_failed} failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    retry_successful += 1
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url}[/red]")
+
+                # Update results list
+                for i, r in enumerate(results):
+                    if r.url == new_result.url:
+                        results[i] = new_result
+                        break
 
         # Calculate final statistics
         final_successful = len([r for r in results if r.status == "success"])
         final_failed = len([r for r in results if r.status == "failed"])
 
-        # Print
+        # Print summary
         console.print(f"\n[green]Crawling Summary:[/green]")
-        console.print(f"Total URLs processed: {
+        console.print(f"Total URLs processed: {len(urls)}")
         console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
         console.print(f"Final results:")
         console.print(f" ✓ Successful: {final_successful}")
         console.print(f" ✗ Failed: {final_failed}")
-
+
         if initial_failed > 0:
-            retry_successful = initial_failed - final_failed
             console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
-
+        # Save report
         if config.save_reports:
             self._retry_stats = {
                 "initial_failures": initial_failed,
                 "failure_ratio": failure_ratio,
-                "retry_successful": retry_successful
+                "retry_successful": retry_successful,
                 "retry_failed": final_failed
             }
-
+            _save_report_sync(results, config, self._retry_stats)
             console.print(f"📊 Report saved to: {config.report_file}")
 
         return results
 
+    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl multiple URLs using server-side parallel processing.
+        This uses the /convert_parallel endpoint which handles parallelization on the server.
+        """
+        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
+
+        try:
+            endpoint = f"{self.base_url}/convert_parallel"
+
+            # Prepare payload
+            payload = {
+                "urls": urls,
+                **config.to_dict()
+            }
+
+            # Send request
+            response = requests.post(
+                endpoint,
+                json=payload,
+                timeout=config.timeout
+            )
+            response.raise_for_status()
+
+            # Process results
+            results = []
+            server_results = response.json()
+
+            for url_result in server_results:
+                result = CrawlResult(
+                    url=url_result["url"],
+                    status=url_result.get("status", "failed"),
+                    markdown=url_result.get("markdown"),
+                    error=url_result.get("error"),
+                    config=config.to_dict()
+                )
+
+                # Save markdown if successful and output dir is configured
+                if result.status == "success" and config.output_dir and result.markdown:
+                    _save_markdown_sync(result.url, result.markdown, config)
+
+                # Handle post-extraction if configured
+                if config.post_extraction_agent and result.status == "success":
+                    try:
+                        post_config = PostExtractionConfig(
+                            model=config.post_extraction_agent["model"],
+                            messages=config.post_extraction_agent["messages"],
+                            api_key=config.post_extraction_agent["api_key"],
+                            max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                            temperature=config.post_extraction_agent.get("temperature", 0.7),
+                            base_url=config.post_extraction_agent.get("base_url"),
+                            combine_output=bool(config.post_extraction_agent_save_to_file),
+                            output_file=config.post_extraction_agent_save_to_file,
+                            custom_transform_function=config.post_agent_transformer_function
+                        )
+
+                        agent = PostExtractionAgent(post_config)
+                        extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+
+                # Send webhook if configured
+                _send_webhook_sync(result, config)
+                results.append(result)
+
+            # Calculate statistics
+            successful = len([r for r in results if r.status == "success"])
+            failed = len([r for r in results if r.status == "failed"])
+
+            # Print summary
+            console.print("\n[green]Parallel processing completed:[/green]")
+            console.print(f"✓ Successful: {successful}")
+            console.print(f"✗ Failed: {failed}")
+
+            # Save report if enabled
+            if config.save_reports:
+                self._retry_stats = {
+                    "initial_failures": failed,
+                    "failure_ratio": (failed / len(urls)) * 100,
+                    "retry_successful": 0, # No retries in server parallel mode
+                    "retry_failed": failed
+                }
+                _save_report_sync(results, config, self._retry_stats)
+                console.print(f"📊 Report saved to: {config.report_file}")
+
+            return results
+
+        except Exception as e:
+            console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
+            # Create failed results for all URLs
+            return [
+                CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                ) for url in urls
+            ]
+
+    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs using server-side parallel processing.
+        """
+        console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            root = ET.fromstring(response.text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+
+            # Process URLs using server-side parallel endpoint
+            return self.crawl_urls_server_parallel(urls, config)
+
+        except Exception as e:
+            console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
+            raise
+
     async def __aenter__(self):
         """Async context manager entry."""
         await self._ensure_session()
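The two server-side methods re-added above keep their earlier public signatures but now route through the shared _save_markdown_sync/_save_report_sync helpers and the post-extraction hook. A minimal call, assuming the SpiderForce4AI constructor takes the service base URL (as self.base_url suggests) and treating the endpoint shown as a placeholder:

    client = SpiderForce4AI("http://localhost:4004")  # placeholder endpoint
    results = client.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
    failed = [r.url for r in results if r.status == "failed"]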
@@ -833,3 +891,7 @@ class SpiderForce4AI:
         """Sync context manager exit."""
         self._executor.shutdown(wait=True)
 
+# Version info
+#__version__ = "2.3.1"
+#__author__ = "Piotr Tamulewicz"
+#__email__ = "pt@petertam.pro"