spiderforce4ai 2.3.1__py3-none-any.whl → 2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,11 @@
  # spiderforce4ai/__init__.py

+ from .post_extraction_agent import PostExtractionAgent, PostExtractionConfig, ExtractionTemplate
  import asyncio
  import aiohttp
  import json
  import logging
- from typing import List, Dict, Union, Optional, Tuple
+ from typing import List, Dict, Union, Optional, Tuple, Callable, Any
  from dataclasses import dataclass, asdict
  from urllib.parse import urljoin, urlparse
  from pathlib import Path
@@ -65,13 +66,14 @@ def extract_metadata_headers(markdown: str, url: str = '') -> str:
      output = []
      output.append(f"URL: {url}")
      output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
-     output.append(f"Description: {metadata.get('Description', '')}") # Now this will be empty string for missing descriptions
+     output.append(f"Description: {metadata.get('Description', '')}")
      output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
      output.append(f"Language: {metadata.get('Language', 'en')}")
      output.append("") # Empty line
      output.extend(headers)

      return '\n'.join(output)
+
  def slugify(url: str) -> str:
      """Convert URL to a valid filename."""
      parsed = urlparse(url)
@@ -90,6 +92,7 @@ class CrawlResult:
      error: Optional[str] = None
      timestamp: str = None
      config: Dict = None
+     extraction_result: Optional[Dict] = None # Store post-extraction results

      def __post_init__(self):
          if not self.timestamp:
@@ -110,9 +113,14 @@ class CrawlConfig:
      webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
      webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
      save_reports: bool = False # Whether to save crawl reports
-     report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
-     combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers' to combine all pages into one file
-     combined_markdown_file: Optional[Path] = None # Optional path for combined markdown file
+     report_file: Optional[Path] = None # Optional report file location
+     combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers'
+     combined_markdown_file: Optional[Path] = None # Optional path for combined file
+
+     # Post-extraction settings
+     post_extraction_agent: Optional[Dict[str, Any]] = None # LLM configuration
+     post_extraction_agent_save_to_file: Optional[str] = None # Extraction output file
+     post_agent_transformer_function: Optional[Callable] = None # Custom transformer

      def __post_init__(self):
          # Initialize empty lists/dicts for None values
@@ -140,6 +148,15 @@ class CrawlConfig:
            # Create or clear the combined file
            self.combined_markdown_file.write_text('')

+         # Validate post-extraction agent configuration if provided
+         if self.post_extraction_agent:
+             if "messages" not in self.post_extraction_agent:
+                 raise ValueError("Post-extraction agent configuration must include 'messages'")
+             if "model" not in self.post_extraction_agent:
+                 raise ValueError("Post-extraction agent configuration must include 'model'")
+             if "api_key" not in self.post_extraction_agent:
+                 raise ValueError("Post-extraction agent configuration must include 'api_key'")
+
      def to_dict(self) -> Dict:
          """Convert config to dictionary for API requests."""
          payload = {}
@@ -151,52 +168,120 @@ class CrawlConfig:
          if self.remove_selectors_regex:
              payload["remove_selectors_regex"] = self.remove_selectors_regex
          return payload
-
-
+
  def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
      """Synchronous version of webhook sender for parallel processing."""
      if not config.webhook_url:
          return

-     # Use custom payload template if provided, otherwise use default
-     if config.webhook_payload_template:
-         # Replace variables in the template
-         payload_str = config.webhook_payload_template.format(
-             url=result.url,
-             status=result.status,
-             markdown=result.markdown if result.status == "success" else None,
-             error=result.error if result.status == "failed" else None,
-             timestamp=result.timestamp,
-             config=config.to_dict()
+     try:
+         # Use custom payload template if provided, otherwise use default
+         if config.webhook_payload_template:
+             # Replace variables in the template
+             payload_str = config.webhook_payload_template.format(
+                 url=result.url,
+                 status=result.status,
+                 markdown=result.markdown if result.status == "success" else None,
+                 error=result.error if result.status == "failed" else None,
+                 timestamp=result.timestamp,
+                 config=config.to_dict(),
+                 extraction_result=result.extraction_result if result.extraction_result else None
+             )
+             payload = json.loads(payload_str) # Parse the formatted JSON string
+         else:
+             # Use default payload format
+             payload = {
+                 "url": result.url,
+                 "status": result.status,
+                 "markdown": result.markdown if result.status == "success" else None,
+                 "error": result.error if result.status == "failed" else None,
+                 "timestamp": result.timestamp,
+                 "config": config.to_dict(),
+                 "extraction_result": result.extraction_result if result.extraction_result else None
+             }
+
+         response = requests.post(
+             config.webhook_url,
+             json=payload,
+             headers=config.webhook_headers,
+             timeout=config.webhook_timeout
          )
-         payload = json.loads(payload_str) # Parse the formatted JSON string
-     else:
-         # Use default payload format
+         response.raise_for_status()
+     except Exception as e:
+         console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+ async def _send_webhook_async(result: CrawlResult, config: CrawlConfig):
+     """Asynchronous webhook sender."""
+     if not config.webhook_url:
+         return
+
+     try:
+         # Prepare payload similar to sync version
          payload = {
              "url": result.url,
              "status": result.status,
              "markdown": result.markdown if result.status == "success" else None,
              "error": result.error if result.status == "failed" else None,
              "timestamp": result.timestamp,
-             "config": config.to_dict()
+             "config": config.to_dict(),
+             "extraction_result": result.extraction_result if result.extraction_result else None
          }

+         async with httpx.AsyncClient() as client:
+             response = await client.post(
+                 config.webhook_url,
+                 json=payload,
+                 headers=config.webhook_headers,
+                 timeout=config.webhook_timeout
+             )
+             response.raise_for_status()
+     except Exception as e:
+         console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+ async def _save_markdown_async(url: str, markdown: str, config: CrawlConfig):
+     """Save markdown content to file and/or append to combined file asynchronously."""
      try:
-         response = requests.post(
-             config.webhook_url,
-             json=payload,
-             headers=config.webhook_headers,
-             timeout=config.webhook_timeout
-         )
-         response.raise_for_status()
+         # Save individual file if not combining or if combining in full mode
+         if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+             filename = f"{slugify(url)}.md"
+             filepath = config.output_dir / filename
+             async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                 await f.write(markdown)
+
+         # Handle combined markdown file
+         if config.combine_to_one_markdown:
+             content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+             combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+             async with aiofiles.open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                 await f.write(combined_content)
      except Exception as e:
-         print(f"Warning: Failed to send webhook for {result.url}: {str(e)}")
+         console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
+
+ def _save_markdown_sync(url: str, markdown: str, config: CrawlConfig) -> None:
+     """Synchronous version of markdown saver for parallel processing."""
+     try:
+         # Save individual file if not combining or if combining in full mode
+         if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+             filepath = config.output_dir / f"{slugify(url)}.md"
+             with open(filepath, 'w', encoding='utf-8') as f:
+                 f.write(markdown)
+
+         # Handle combined markdown file
+         if config.combine_to_one_markdown:
+             content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+             combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+             with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                 f.write(combined_content)
+     except Exception as e:
+         console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")

- # Module level function for multiprocessing
  def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
      """Process a single URL for parallel processing."""
      url, base_url, config = args
      try:
+         # Make the conversion request
          endpoint = f"{base_url}/convert"
          payload = {
              "url": url,
@@ -211,7 +296,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
                  error=f"HTTP {response.status_code}: {response.text}",
                  config=config.to_dict()
              )
-             # Send webhook for failed result
              _send_webhook_sync(result, config)
              return result

@@ -219,19 +303,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:

          # Save markdown if output directory is configured
          if config.output_dir:
-             # Save individual file if not combining or if combining in full mode
-             if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
-                 filepath = config.output_dir / f"{slugify(url)}.md"
-                 with open(filepath, 'w', encoding='utf-8') as f:
-                     f.write(markdown)
-
-             # Handle combined markdown file
-             if config.combine_to_one_markdown:
-                 content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
-                 combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
-                 with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
-                     f.write(combined_content)
+             _save_markdown_sync(url, markdown, config)

          result = CrawlResult(
              url=url,
@@ -240,6 +312,28 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
              config=config.to_dict()
          )

+         # Handle post-extraction if configured
+         if config.post_extraction_agent:
+             try:
+                 post_config = PostExtractionConfig(
+                     model=config.post_extraction_agent["model"],
+                     messages=config.post_extraction_agent["messages"],
+                     api_key=config.post_extraction_agent["api_key"],
+                     max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                     temperature=config.post_extraction_agent.get("temperature", 0.7),
+                     base_url=config.post_extraction_agent.get("base_url"),
+                     combine_output=bool(config.post_extraction_agent_save_to_file),
+                     output_file=config.post_extraction_agent_save_to_file,
+                     custom_transform_function=config.post_agent_transformer_function
+                 )
+
+                 agent = PostExtractionAgent(post_config)
+                 extraction_result = asyncio.run(agent.process_content(url, markdown))
+                 if extraction_result:
+                     result.extraction_result = extraction_result
+             except Exception as e:
+                 console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
          # Send webhook for successful result
          _send_webhook_sync(result, config)

@@ -260,6 +354,60 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
          _send_webhook_sync(result, config)
          return result

+ async def _save_report_async(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None):
+     """Save crawl report to JSON file asynchronously."""
+     if not config.report_file:
+         return
+
+     # Separate successful and failed results
+     successful_results = [r for r in results if r.status == "success"]
+     failed_results = [r for r in results if r.status == "failed"]
+
+     report = {
+         "timestamp": datetime.now().isoformat(),
+         "config": config.to_dict(),
+         "results": {
+             "successful": [asdict(r) for r in successful_results],
+             "failed": [asdict(r) for r in failed_results]
+         },
+         "summary": {
+             "total": len(results),
+             "successful": len(successful_results),
+             "failed": len(failed_results),
+             "retry_info": retry_stats or {}
+         }
+     }
+
+     async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+         await f.write(json.dumps(report, indent=2))
+
+ def _save_report_sync(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None) -> None:
+     """Synchronous version of report saver."""
+     if not config.report_file:
+         return
+
+     # Create report similar to async version
+     successful_results = [r for r in results if r.status == "success"]
+     failed_results = [r for r in results if r.status == "failed"]
+
+     report = {
+         "timestamp": datetime.now().isoformat(),
+         "config": config.to_dict(),
+         "results": {
+             "successful": [asdict(r) for r in successful_results],
+             "failed": [asdict(r) for r in failed_results]
+         },
+         "summary": {
+             "total": len(results),
+             "successful": len(successful_results),
+             "failed": len(failed_results),
+             "retry_info": retry_stats or {}
+         }
+     }
+
+     with open(config.report_file, 'w', encoding='utf-8') as f:
+         json.dump(report, f, indent=2)
+
  class SpiderForce4AI:
      """Main class for interacting with SpiderForce4AI service."""

@@ -268,6 +416,7 @@ class SpiderForce4AI:
          self.session = None
          self._executor = ThreadPoolExecutor()
          self.crawl_results: List[CrawlResult] = []
+         self._retry_stats = {}

      async def _ensure_session(self):
          """Ensure aiohttp session exists."""
@@ -279,215 +428,6 @@ class SpiderForce4AI:
          if self.session and not self.session.closed:
              await self.session.close()

-     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-         """Save markdown content to file and/or append to combined file."""
-         # Save individual file if not combining or if combining in full mode
-         if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
-             filename = f"{slugify(url)}.md"
-             filepath = output_dir / filename
-             async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-                 await f.write(markdown)
-
-         # Handle combined markdown file
-         if self.config.combine_to_one_markdown:
-             content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
-             combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
-             async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
-                 await f.write(combined_content)
-
-
-
-     def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-         """
-         Crawl sitemap URLs using server-side parallel processing.
-         """
-         print(f"Fetching sitemap from {sitemap_url}...")
-
-         # Fetch sitemap
-         try:
-             response = requests.get(sitemap_url, timeout=config.timeout)
-             response.raise_for_status()
-             sitemap_text = response.text
-         except Exception as e:
-             print(f"Error fetching sitemap: {str(e)}")
-             raise
-
-         # Parse sitemap
-         try:
-             root = ET.fromstring(sitemap_text)
-             namespace = {'ns': root.tag.split('}')[0].strip('{')}
-             urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
-             print(f"Found {len(urls)} URLs in sitemap")
-         except Exception as e:
-             print(f"Error parsing sitemap: {str(e)}")
-             raise
-
-         # Process URLs using server-side parallel endpoint
-         return self.crawl_urls_server_parallel(urls, config)
-
-
-     def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-         """
-         Crawl multiple URLs using server-side parallel processing.
-         This uses the /convert_parallel endpoint which handles parallelization on the server.
-         """
-         print(f"Sending {len(urls)} URLs for parallel processing...")
-
-         try:
-             endpoint = f"{self.base_url}/convert_parallel"
-
-             # Prepare payload
-             payload = {
-                 "urls": urls,
-                 **config.to_dict()
-             }
-
-             # Send request
-             response = requests.post(
-                 endpoint,
-                 json=payload,
-                 timeout=config.timeout
-             )
-             response.raise_for_status()
-
-             # Process results
-             results = []
-             server_results = response.json() # Assuming server returns JSON array of results
-
-             for url_result in server_results:
-                 result = CrawlResult(
-                     url=url_result["url"],
-                     status=url_result.get("status", "failed"),
-                     markdown=url_result.get("markdown"),
-                     error=url_result.get("error"),
-                     config=config.to_dict()
-                 )
-
-                 # Save markdown if successful and output dir is configured
-                 if result.status == "success" and config.output_dir and result.markdown:
-                     filepath = config.output_dir / f"{slugify(result.url)}.md"
-                     with open(filepath, 'w', encoding='utf-8') as f:
-                         f.write(result.markdown)
-
-                 # Send webhook if configured
-                 if config.webhook_url:
-                     _send_webhook_sync(result, config)
-
-                 results.append(result)
-
-             # Calculate statistics
-             successful = len([r for r in results if r.status == "success"])
-             failed = len([r for r in results if r.status == "failed"])
-
-             # Print summary
-             print(f"\nParallel processing completed:")
-             print(f"✓ Successful: {successful}")
-             print(f"✗ Failed: {failed}")
-
-             # Save report if enabled
-             if config.save_reports and config.report_file:
-                 self._retry_stats = {
-                     "initial_failures": failed,
-                     "failure_ratio": (failed / len(urls)) * 100,
-                     "retry_successful": 0, # No retries in server parallel mode
-                     "retry_failed": failed
-                 }
-                 self._save_report_sync(results, config)
-                 console.print(f"📊 Report saved to: {config.report_file}")
-
-             return results
-
-         except Exception as e:
-             print(f"Error during parallel processing: {str(e)}")
-             # Create failed results for all URLs
-             return [
-                 CrawlResult(
-                     url=url,
-                     status="failed",
-                     error=str(e),
-                     config=config.to_dict()
-                 ) for url in urls
-             ]
-
-
-     async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
-         """Send webhook with crawl results."""
-         if not config.webhook_url:
-             return
-
-         payload = {
-             "url": result.url,
-             "status": result.status,
-             "markdown": result.markdown if result.status == "success" else None,
-             "error": result.error if result.status == "failed" else None,
-             "timestamp": result.timestamp,
-             "config": config.to_dict()
-         }
-
-         try:
-             async with httpx.AsyncClient() as client:
-                 response = await client.post(
-                     config.webhook_url,
-                     json=payload,
-                     timeout=config.webhook_timeout
-                 )
-                 response.raise_for_status()
-         except Exception as e:
-             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
-
-     def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
-         """Save crawl report synchronously."""
-         # Separate successful and failed results
-         successful_results = [r for r in results if r.status == "success"]
-         failed_results = [r for r in results if r.status == "failed"]
-
-         # Create report with only final state
-         report = {
-             "timestamp": datetime.now().isoformat(),
-             "config": config.to_dict(),
-             "results": {
-                 "successful": [asdict(r) for r in successful_results],
-                 "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
-             },
-             "summary": {
-                 "total": len(results),
-                 "successful": len(successful_results),
-                 "failed": len(failed_results),
-                 "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
-             }
-         }
-
-         with open(config.report_file, 'w', encoding='utf-8') as f:
-             json.dump(report, f, indent=2)
-
-     async def _save_report(self, config: CrawlConfig):
-         """Save crawl report to JSON file."""
-         if not config.report_file:
-             return
-
-         # Separate successful and failed results
-         successful_results = [r for r in self.crawl_results if r.status == "success"]
-         failed_results = [r for r in self.crawl_results if r.status == "failed"]
-
-         report = {
-             "timestamp": datetime.now().isoformat(),
-             "config": config.to_dict(),
-             "results": {
-                 "successful": [asdict(r) for r in successful_results],
-                 "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
-             },
-             "summary": {
-                 "total": len(self.crawl_results),
-                 "successful": len(successful_results),
-                 "failed": len(failed_results),
-                 "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
-             }
-         }
-
-         async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
-             await f.write(json.dumps(report, indent=2))
-
      async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
          """Crawl a single URL asynchronously."""
          await self._ensure_session()
@@ -518,9 +458,31 @@ class SpiderForce4AI:
          )

          if config.output_dir:
-             await self._save_markdown(url, markdown, config.output_dir)
+             await _save_markdown_async(url, markdown, config)
+
+         # Handle post-extraction if configured
+         if config.post_extraction_agent and result.status == "success":
+             try:
+                 post_config = PostExtractionConfig(
+                     model=config.post_extraction_agent["model"],
+                     messages=config.post_extraction_agent["messages"],
+                     api_key=config.post_extraction_agent["api_key"],
+                     max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                     temperature=config.post_extraction_agent.get("temperature", 0.7),
+                     base_url=config.post_extraction_agent.get("base_url"),
+                     combine_output=bool(config.post_extraction_agent_save_to_file),
+                     output_file=config.post_extraction_agent_save_to_file,
+                     custom_transform_function=config.post_agent_transformer_function
+                 )
+
+                 agent = PostExtractionAgent(post_config)
+                 extraction_result = await agent.process_content(url, markdown)
+                 if extraction_result:
+                     result.extraction_result = extraction_result
+             except Exception as e:
+                 console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")

-         await self._send_webhook(result, config)
+         await _send_webhook_async(result, config)

          self.crawl_results.append(result)
          return result
@@ -540,18 +502,18 @@ class SpiderForce4AI:
          return asyncio.run(self.crawl_url_async(url, config))

      async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
-         """Retry failed URLs once."""
+         """Retry failed URLs with optional progress tracking."""
          if not failed_results:
              return []

          failed_count = len(failed_results)
-         total_count = len([r for r in self.crawl_results])
+         total_count = len(self.crawl_results)
          failure_ratio = (failed_count / total_count) * 100

          console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
          retry_results = []

-         # Create a new progress bar if one wasn't provided
+         # Create or use provided progress bar
          should_close_progress = progress is None
          if progress is None:
              progress = Progress(
@@ -595,6 +557,7 @@ class SpiderForce4AI:
      async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
          """Crawl multiple URLs asynchronously with progress bar."""
          await self._ensure_session()
+         post_extraction_results = {}

          with Progress(
              SpinnerColumn(),
@@ -603,52 +566,60 @@ class SpiderForce4AI:
              TaskProgressColumn(),
              console=console
          ) as progress:
-             task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
+             crawl_task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))

              async def crawl_with_progress(url):
                  result = await self.crawl_url_async(url, config)
-                 progress.update(task, advance=1, description=f"[cyan]Crawled: {url}")
+                 progress.update(crawl_task, advance=1, description=f"[cyan]Crawled: {url}")
                  return result

+             # Set up concurrency control
              semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
              async def crawl_with_semaphore(url):
                  async with semaphore:
                      result = await crawl_with_progress(url)
                      await asyncio.sleep(config.request_delay)
                      return result

+             # Perform initial crawl
              initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])

-             # Identify failed URLs
+             # Handle failed URLs
              failed_results = [r for r in initial_results if r.status == "failed"]
-
-             # Calculate initial failure ratio
              initial_failed = len(failed_results)
              total_urls = len(urls)
              failure_ratio = (initial_failed / total_urls) * 100

              # Retry failed URLs if ratio is acceptable
-             if failed_results:
-                 if failure_ratio > 20:
-                     console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
-                     results = initial_results
-                 else:
-                     retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                     # Update results list by replacing failed results with successful retries
-                     results = initial_results.copy()
-                     for retry_result in retry_results:
-                         for i, result in enumerate(results):
-                             if result.url == retry_result.url:
-                                 results[i] = retry_result
-                                 break
-             else:
-                 results = initial_results
+             results = initial_results
+             retry_successful = 0

-             # Calculate final statistics before saving report
+             if failed_results and failure_ratio <= 20:
+                 retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                 retry_successful = len([r for r in retry_results if r.status == "success"])
+
+                 # Update results list
+                 for retry_result in retry_results:
+                     for i, result in enumerate(results):
+                         if result.url == retry_result.url:
+                             results[i] = retry_result
+                             break
+
+             # Calculate final statistics
              final_successful = len([r for r in results if r.status == "success"])
              final_failed = len([r for r in results if r.status == "failed"])

-             # Print detailed summary
+             # Update retry stats
+             self._retry_stats = {
+                 "initial_failures": initial_failed,
+                 "failure_ratio": failure_ratio,
+                 "retry_successful": retry_successful if initial_failed > 0 else 0,
+                 "retry_failed": final_failed,
+                 "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+             }
+
+             # Print summary
              console.print(f"\n[green]Crawling Summary:[/green]")
              console.print(f"Total URLs processed: {total_urls}")
              console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
@@ -657,18 +628,11 @@ class SpiderForce4AI:
              console.print(f" ✗ Failed: {final_failed}")

              if initial_failed > 0:
-                 retry_successful = initial_failed - final_failed
                  console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")

-             # Save final report after all retries are complete
+             # Save final report
              if config.save_reports:
-                 self._retry_stats = {
-                     "initial_failures": initial_failed,
-                     "failure_ratio": failure_ratio,
-                     "retry_successful": retry_successful if initial_failed > 0 else 0,
-                     "retry_failed": final_failed
-                 }
-                 await self._save_report(config)
+                 await _save_report_async(results, config, self._retry_stats)
                  console.print(f"📊 Report saved to: {config.report_file}")

              return results
@@ -705,32 +669,21 @@ class SpiderForce4AI:
          return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))

      def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-         """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
-         print(f"Fetching sitemap from {sitemap_url}...")
-
-         # Fetch sitemap
+         """Crawl sitemap URLs in parallel using multiprocessing."""
+         # Fetch and parse sitemap
          try:
              response = requests.get(sitemap_url, timeout=config.timeout)
              response.raise_for_status()
-             sitemap_text = response.text
-         except Exception as e:
-             print(f"Error fetching sitemap: {str(e)}")
-             raise
-
-         # Parse sitemap
-         try:
-             root = ET.fromstring(sitemap_text)
+             root = ET.fromstring(response.text)
              namespace = {'ns': root.tag.split('}')[0].strip('{')}
              urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
-             print(f"Found {len(urls)} URLs in sitemap")
+             console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
          except Exception as e:
-             print(f"Error parsing sitemap: {str(e)}")
+             console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
              raise

-         # Prepare arguments for parallel processing
+         # Process URLs in parallel
          process_args = [(url, self.base_url, config) for url in urls]
-
-         # Create process pool and execute crawls
          results = []

          with Pool(processes=config.max_concurrent_requests) as pool:
@@ -741,81 +694,186 @@ class SpiderForce4AI:
                  TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                  TextColumn("({task.completed}/{task.total})"),
              ) as progress:
-                 task = progress.add_task("Crawling URLs...", total=len(urls))
+                 task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))

                  for result in pool.imap_unordered(_process_url_parallel, process_args):
                      results.append(result)
                      progress.update(task, advance=1)
                      status = "✓" if result.status == "success" else "✗"
-                     progress.description = f"Last: {status} {result.url}"
+                     progress.description = f"[cyan]Last: {status} {result.url}"

-         # Calculate initial failure statistics
+         # Calculate statistics and handle retries
          failed_results = [r for r in results if r.status == "failed"]
          initial_failed = len(failed_results)
-         total_urls = len(urls)
-         failure_ratio = (initial_failed / total_urls) * 100
+         failure_ratio = (initial_failed / len(urls)) * 100
+         retry_successful = 0

-         # Retry failed URLs if ratio is acceptable
-         if failed_results:
-             if failure_ratio > 20:
-                 console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
-             else:
-                 failed_count = len(failed_results)
-                 failure_ratio = (failed_count / total_urls) * 100
-                 console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
-                 for result in failed_results:
-                     new_result = _process_url_parallel((result.url, self.base_url, config))
-
-                     # Save markdown and trigger webhook for successful retries
-                     if new_result.status == "success":
-                         console.print(f"[green] Retry successful: {result.url}[/green]")
-                         # Save markdown if output directory is configured
-                         if config.output_dir and new_result.markdown:
-                             filepath = config.output_dir / f"{slugify(new_result.url)}.md"
-                             with open(filepath, 'w', encoding='utf-8') as f:
-                                 f.write(new_result.markdown)
-                         # Send webhook for successful retry
-                         _send_webhook_sync(new_result, config)
-                     else:
-                         console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
-                         # Send webhook for failed retry
-                         _send_webhook_sync(new_result, config)
-
-                     # Update results list
-                     for i, r in enumerate(results):
-                         if r.url == new_result.url:
-                             results[i] = new_result
-                             break
+         if failed_results and failure_ratio <= 20:
+             console.print(f"\n[yellow]Retrying {initial_failed} failed URLs...[/yellow]")
+             for result in failed_results:
+                 new_result = _process_url_parallel((result.url, self.base_url, config))
+                 if new_result.status == "success":
+                     retry_successful += 1
+                     console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                 else:
+                     console.print(f"[red]✗ Retry failed: {result.url}[/red]")
+
+                 # Update results list
+                 for i, r in enumerate(results):
+                     if r.url == new_result.url:
+                         results[i] = new_result
+                         break

          # Calculate final statistics
          final_successful = len([r for r in results if r.status == "success"])
          final_failed = len([r for r in results if r.status == "failed"])

-         # Print detailed summary
+         # Print summary
          console.print(f"\n[green]Crawling Summary:[/green]")
-         console.print(f"Total URLs processed: {total_urls}")
+         console.print(f"Total URLs processed: {len(urls)}")
          console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
          console.print(f"Final results:")
          console.print(f" ✓ Successful: {final_successful}")
          console.print(f" ✗ Failed: {final_failed}")
-
+
          if initial_failed > 0:
-             retry_successful = initial_failed - final_failed
              console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")

-         # Save final report after all retries are complete
+         # Save report
          if config.save_reports:
              self._retry_stats = {
                  "initial_failures": initial_failed,
                  "failure_ratio": failure_ratio,
-                 "retry_successful": retry_successful if initial_failed > 0 else 0,
+                 "retry_successful": retry_successful,
                  "retry_failed": final_failed
              }
-             self._save_report_sync(results, config)
+             _save_report_sync(results, config, self._retry_stats)
              console.print(f"📊 Report saved to: {config.report_file}")

          return results

+     def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+         """
+         Crawl multiple URLs using server-side parallel processing.
+         This uses the /convert_parallel endpoint which handles parallelization on the server.
+         """
+         console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
+
+         try:
+             endpoint = f"{self.base_url}/convert_parallel"
+
+             # Prepare payload
+             payload = {
+                 "urls": urls,
+                 **config.to_dict()
+             }
+
+             # Send request
+             response = requests.post(
+                 endpoint,
+                 json=payload,
+                 timeout=config.timeout
+             )
+             response.raise_for_status()
+
+             # Process results
+             results = []
+             server_results = response.json()
+
+             for url_result in server_results:
+                 result = CrawlResult(
+                     url=url_result["url"],
+                     status=url_result.get("status", "failed"),
+                     markdown=url_result.get("markdown"),
+                     error=url_result.get("error"),
+                     config=config.to_dict()
+                 )
+
+                 # Save markdown if successful and output dir is configured
+                 if result.status == "success" and config.output_dir and result.markdown:
+                     _save_markdown_sync(result.url, result.markdown, config)
+
+                 # Handle post-extraction if configured
+                 if config.post_extraction_agent and result.status == "success":
+                     try:
+                         post_config = PostExtractionConfig(
+                             model=config.post_extraction_agent["model"],
+                             messages=config.post_extraction_agent["messages"],
+                             api_key=config.post_extraction_agent["api_key"],
+                             max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                             temperature=config.post_extraction_agent.get("temperature", 0.7),
+                             base_url=config.post_extraction_agent.get("base_url"),
+                             combine_output=bool(config.post_extraction_agent_save_to_file),
+                             output_file=config.post_extraction_agent_save_to_file,
+                             custom_transform_function=config.post_agent_transformer_function
+                         )
+
+                         agent = PostExtractionAgent(post_config)
+                         extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                         if extraction_result:
+                             result.extraction_result = extraction_result
+                     except Exception as e:
+                         console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+
+                 # Send webhook if configured
+                 _send_webhook_sync(result, config)
+                 results.append(result)
+
+             # Calculate statistics
+             successful = len([r for r in results if r.status == "success"])
+             failed = len([r for r in results if r.status == "failed"])
+
+             # Print summary
+             console.print("\n[green]Parallel processing completed:[/green]")
+             console.print(f"✓ Successful: {successful}")
+             console.print(f"✗ Failed: {failed}")
+
+             # Save report if enabled
+             if config.save_reports:
+                 self._retry_stats = {
+                     "initial_failures": failed,
+                     "failure_ratio": (failed / len(urls)) * 100,
+                     "retry_successful": 0, # No retries in server parallel mode
+                     "retry_failed": failed
+                 }
+                 _save_report_sync(results, config, self._retry_stats)
+                 console.print(f"📊 Report saved to: {config.report_file}")
+
+             return results
+
+         except Exception as e:
+             console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
+             # Create failed results for all URLs
+             return [
+                 CrawlResult(
+                     url=url,
+                     status="failed",
+                     error=str(e),
+                     config=config.to_dict()
+                 ) for url in urls
+             ]
+
+     def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+         """
+         Crawl sitemap URLs using server-side parallel processing.
+         """
+         console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+
+         try:
+             response = requests.get(sitemap_url, timeout=config.timeout)
+             response.raise_for_status()
+             root = ET.fromstring(response.text)
+             namespace = {'ns': root.tag.split('}')[0].strip('{')}
+             urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+             console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+
+             # Process URLs using server-side parallel endpoint
+             return self.crawl_urls_server_parallel(urls, config)
+
+         except Exception as e:
+             console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
+             raise
+
      async def __aenter__(self):
          """Async context manager entry."""
          await self._ensure_session()
@@ -833,3 +891,7 @@ class SpiderForce4AI:
          """Sync context manager exit."""
          self._executor.shutdown(wait=True)

+ # Version info
+ #__version__ = "2.3.1"
+ #__author__ = "Piotr Tamulewicz"
+ #__email__ = "pt@petertam.pro"
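
The functional core of the 2.3.1 → 2.4 change is the optional post-extraction LLM pass wired into CrawlConfig, CrawlResult.extraction_result, and the webhook payload. A minimal usage sketch follows, inferred only from what is visible in this diff: post_extraction_agent must contain "model", "messages", and "api_key" (validated in __post_init__; "max_tokens", "temperature", and "base_url" are optional), extraction output can be written to post_extraction_agent_save_to_file, and post_agent_transformer_function is forwarded to the agent as custom_transform_function. The service base URL, the model name, the message content, the transformer signature, and the exact set of other CrawlConfig arguments are illustrative assumptions, not documented API.

    # Hypothetical sketch of the new 2.4 post-extraction options (not from the package docs).
    from pathlib import Path
    from spiderforce4ai import SpiderForce4AI, CrawlConfig

    def add_review_flag(result):
        # Forwarded as custom_transform_function; the exact signature is an assumption.
        result["reviewed"] = False
        return result

    config = CrawlConfig(
        output_dir=Path("crawled_markdown"),
        save_reports=True,
        post_extraction_agent={                  # keys checked by CrawlConfig.__post_init__
            "model": "gpt-4o-mini",              # placeholder model name
            "api_key": "YOUR_LLM_API_KEY",
            "messages": [
                {"role": "system", "content": "Return the page title and a one-line summary as JSON."}
            ],
            "max_tokens": 1000,                  # optional; defaults visible in the diff
            "temperature": 0.7,
        },
        post_extraction_agent_save_to_file="extraction_results.json",
        post_agent_transformer_function=add_review_flag,
    )

    spider = SpiderForce4AI("http://localhost:3004")   # service base URL is an assumption
    results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
    for r in results:
        print(r.url, r.status, r.extraction_result)    # extraction_result is the new CrawlResult field

The reworked webhook senders also expose the extraction output: a custom webhook_payload_template is filled via str.format() with the fields url, status, markdown, error, timestamp, config, and extraction_result, and the formatted string must parse with json.loads(). A sketch of such a template, again an illustration rather than documented usage; literal JSON braces are doubled so format() leaves them alone, and dict-valued fields such as config or extraction_result are awkward to inline this way because their Python repr is not valid JSON:

    # Hypothetical value for CrawlConfig.webhook_payload_template (assumed usage).
    webhook_payload_template = '''{{
        "page": "{url}",
        "status": "{status}",
        "crawled_at": "{timestamp}"
    }}'''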