spiderforce4ai 2.3.1__py3-none-any.whl → 2.4__py3-none-any.whl

@@ -1,10 +1,11 @@
  # spiderforce4ai/__init__.py
 
+ from .post_extraction_agent import PostExtractionAgent, PostExtractionConfig, ExtractionTemplate
  import asyncio
  import aiohttp
  import json
  import logging
- from typing import List, Dict, Union, Optional, Tuple
+ from typing import List, Dict, Union, Optional, Tuple, Callable, Any
  from dataclasses import dataclass, asdict
  from urllib.parse import urljoin, urlparse
  from pathlib import Path
@@ -65,13 +66,14 @@ def extract_metadata_headers(markdown: str, url: str = '') -> str:
  output = []
  output.append(f"URL: {url}")
  output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
- output.append(f"Description: {metadata.get('Description', '')}") # Now this will be empty string for missing descriptions
+ output.append(f"Description: {metadata.get('Description', '')}")
  output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
  output.append(f"Language: {metadata.get('Language', 'en')}")
  output.append("") # Empty line
  output.extend(headers)
 
  return '\n'.join(output)
+
  def slugify(url: str) -> str:
  """Convert URL to a valid filename."""
  parsed = urlparse(url)
@@ -90,6 +92,7 @@ class CrawlResult:
  error: Optional[str] = None
  timestamp: str = None
  config: Dict = None
+ extraction_result: Optional[Dict] = None # Store post-extraction results
 
  def __post_init__(self):
  if not self.timestamp:
@@ -110,9 +113,14 @@ class CrawlConfig:
  webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
  webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
  save_reports: bool = False # Whether to save crawl reports
- report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
- combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers' to combine all pages into one file
- combined_markdown_file: Optional[Path] = None # Optional path for combined markdown file
+ report_file: Optional[Path] = None # Optional report file location
+ combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers'
+ combined_markdown_file: Optional[Path] = None # Optional path for combined file
+
+ # Post-extraction settings
+ post_extraction_agent: Optional[Dict[str, Any]] = None # LLM configuration
+ post_extraction_agent_save_to_file: Optional[str] = None # Extraction output file
+ post_agent_transformer_function: Optional[Callable] = None # Custom transformer
 
  def __post_init__(self):
  # Initialize empty lists/dicts for None values
@@ -140,6 +148,15 @@ class CrawlConfig:
  # Create or clear the combined file
  self.combined_markdown_file.write_text('')
 
+ # Validate post-extraction agent configuration if provided
+ if self.post_extraction_agent:
+ if "messages" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'messages'")
+ if "model" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'model'")
+ if "api_key" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'api_key'")
+
  def to_dict(self) -> Dict:
  """Convert config to dictionary for API requests."""
  payload = {}
@@ -151,52 +168,120 @@ class CrawlConfig:
  if self.remove_selectors_regex:
  payload["remove_selectors_regex"] = self.remove_selectors_regex
  return payload
-
-
+
  def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
  """Synchronous version of webhook sender for parallel processing."""
  if not config.webhook_url:
  return
 
- # Use custom payload template if provided, otherwise use default
- if config.webhook_payload_template:
- # Replace variables in the template
- payload_str = config.webhook_payload_template.format(
- url=result.url,
- status=result.status,
- markdown=result.markdown if result.status == "success" else None,
- error=result.error if result.status == "failed" else None,
- timestamp=result.timestamp,
- config=config.to_dict()
+ try:
+ # Use custom payload template if provided, otherwise use default
+ if config.webhook_payload_template:
+ # Replace variables in the template
+ payload_str = config.webhook_payload_template.format(
+ url=result.url,
+ status=result.status,
+ markdown=result.markdown if result.status == "success" else None,
+ error=result.error if result.status == "failed" else None,
+ timestamp=result.timestamp,
+ config=config.to_dict(),
+ extraction_result=result.extraction_result if result.extraction_result else None
+ )
+ payload = json.loads(payload_str) # Parse the formatted JSON string
+ else:
+ # Use default payload format
+ payload = {
+ "url": result.url,
+ "status": result.status,
+ "markdown": result.markdown if result.status == "success" else None,
+ "error": result.error if result.status == "failed" else None,
+ "timestamp": result.timestamp,
+ "config": config.to_dict(),
+ "extraction_result": result.extraction_result if result.extraction_result else None
+ }
+
+ response = requests.post(
+ config.webhook_url,
+ json=payload,
+ headers=config.webhook_headers,
+ timeout=config.webhook_timeout
  )
- payload = json.loads(payload_str) # Parse the formatted JSON string
- else:
- # Use default payload format
+ response.raise_for_status()
+ except Exception as e:
+ console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+ async def _send_webhook_async(result: CrawlResult, config: CrawlConfig):
+ """Asynchronous webhook sender."""
+ if not config.webhook_url:
+ return
+
+ try:
+ # Prepare payload similar to sync version
  payload = {
  "url": result.url,
  "status": result.status,
  "markdown": result.markdown if result.status == "success" else None,
  "error": result.error if result.status == "failed" else None,
  "timestamp": result.timestamp,
- "config": config.to_dict()
+ "config": config.to_dict(),
+ "extraction_result": result.extraction_result if result.extraction_result else None
  }
 
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ config.webhook_url,
+ json=payload,
+ headers=config.webhook_headers,
+ timeout=config.webhook_timeout
+ )
+ response.raise_for_status()
+ except Exception as e:
+ console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+ async def _save_markdown_async(url: str, markdown: str, config: CrawlConfig):
+ """Save markdown content to file and/or append to combined file asynchronously."""
  try:
- response = requests.post(
- config.webhook_url,
- json=payload,
- headers=config.webhook_headers,
- timeout=config.webhook_timeout
- )
- response.raise_for_status()
+ # Save individual file if not combining or if combining in full mode
+ if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+ filename = f"{slugify(url)}.md"
+ filepath = config.output_dir / filename
+ async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+ await f.write(markdown)
+
+ # Handle combined markdown file
+ if config.combine_to_one_markdown:
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+ async with aiofiles.open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+ await f.write(combined_content)
  except Exception as e:
- print(f"Warning: Failed to send webhook for {result.url}: {str(e)}")
+ console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
+
+ def _save_markdown_sync(url: str, markdown: str, config: CrawlConfig) -> None:
+ """Synchronous version of markdown saver for parallel processing."""
+ try:
+ # Save individual file if not combining or if combining in full mode
+ if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+ filepath = config.output_dir / f"{slugify(url)}.md"
+ with open(filepath, 'w', encoding='utf-8') as f:
+ f.write(markdown)
+
+ # Handle combined markdown file
+ if config.combine_to_one_markdown:
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+ with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+ f.write(combined_content)
+ except Exception as e:
+ console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
 
- # Module level function for multiprocessing
  def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  """Process a single URL for parallel processing."""
  url, base_url, config = args
  try:
+ # Make the conversion request
  endpoint = f"{base_url}/convert"
  payload = {
  "url": url,
@@ -211,7 +296,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  error=f"HTTP {response.status_code}: {response.text}",
  config=config.to_dict()
  )
- # Send webhook for failed result
  _send_webhook_sync(result, config)
  return result
 
@@ -219,19 +303,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
  # Save markdown if output directory is configured
  if config.output_dir:
- # Save individual file if not combining or if combining in full mode
- if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
- filepath = config.output_dir / f"{slugify(url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(markdown)
-
- # Handle combined markdown file
- if config.combine_to_one_markdown:
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
- combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
- with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
- f.write(combined_content)
+ _save_markdown_sync(url, markdown, config)
 
  result = CrawlResult(
  url=url,
@@ -240,6 +312,28 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  config=config.to_dict()
  )
 
+ # Handle post-extraction if configured
+ if config.post_extraction_agent:
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = asyncio.run(agent.process_content(url, markdown))
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
  # Send webhook for successful result
  _send_webhook_sync(result, config)
 
@@ -260,6 +354,60 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  _send_webhook_sync(result, config)
  return result
 
+ async def _save_report_async(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None):
+ """Save crawl report to JSON file asynchronously."""
+ if not config.report_file:
+ return
+
+ # Separate successful and failed results
+ successful_results = [r for r in results if r.status == "success"]
+ failed_results = [r for r in results if r.status == "failed"]
+
+ report = {
+ "timestamp": datetime.now().isoformat(),
+ "config": config.to_dict(),
+ "results": {
+ "successful": [asdict(r) for r in successful_results],
+ "failed": [asdict(r) for r in failed_results]
+ },
+ "summary": {
+ "total": len(results),
+ "successful": len(successful_results),
+ "failed": len(failed_results),
+ "retry_info": retry_stats or {}
+ }
+ }
+
+ async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+ await f.write(json.dumps(report, indent=2))
+
+ def _save_report_sync(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None) -> None:
+ """Synchronous version of report saver."""
+ if not config.report_file:
+ return
+
+ # Create report similar to async version
+ successful_results = [r for r in results if r.status == "success"]
+ failed_results = [r for r in results if r.status == "failed"]
+
+ report = {
+ "timestamp": datetime.now().isoformat(),
+ "config": config.to_dict(),
+ "results": {
+ "successful": [asdict(r) for r in successful_results],
+ "failed": [asdict(r) for r in failed_results]
+ },
+ "summary": {
+ "total": len(results),
+ "successful": len(successful_results),
+ "failed": len(failed_results),
+ "retry_info": retry_stats or {}
+ }
+ }
+
+ with open(config.report_file, 'w', encoding='utf-8') as f:
+ json.dump(report, f, indent=2)
+
  class SpiderForce4AI:
  """Main class for interacting with SpiderForce4AI service."""
 
@@ -268,6 +416,7 @@ class SpiderForce4AI:
  self.session = None
  self._executor = ThreadPoolExecutor()
  self.crawl_results: List[CrawlResult] = []
+ self._retry_stats = {}
 
  async def _ensure_session(self):
  """Ensure aiohttp session exists."""
@@ -279,215 +428,6 @@ class SpiderForce4AI:
  if self.session and not self.session.closed:
  await self.session.close()
 
- async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
- """Save markdown content to file and/or append to combined file."""
- # Save individual file if not combining or if combining in full mode
- if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
- filename = f"{slugify(url)}.md"
- filepath = output_dir / filename
- async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
- await f.write(markdown)
-
- # Handle combined markdown file
- if self.config.combine_to_one_markdown:
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
- combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
- async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
- await f.write(combined_content)
-
-
-
- def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
- """
- Crawl sitemap URLs using server-side parallel processing.
- """
- print(f"Fetching sitemap from {sitemap_url}...")
-
- # Fetch sitemap
- try:
- response = requests.get(sitemap_url, timeout=config.timeout)
- response.raise_for_status()
- sitemap_text = response.text
- except Exception as e:
- print(f"Error fetching sitemap: {str(e)}")
- raise
-
- # Parse sitemap
- try:
- root = ET.fromstring(sitemap_text)
- namespace = {'ns': root.tag.split('}')[0].strip('{')}
- urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
- print(f"Found {len(urls)} URLs in sitemap")
- except Exception as e:
- print(f"Error parsing sitemap: {str(e)}")
- raise
-
- # Process URLs using server-side parallel endpoint
- return self.crawl_urls_server_parallel(urls, config)
-
-
- def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
- """
- Crawl multiple URLs using server-side parallel processing.
- This uses the /convert_parallel endpoint which handles parallelization on the server.
- """
- print(f"Sending {len(urls)} URLs for parallel processing...")
-
- try:
- endpoint = f"{self.base_url}/convert_parallel"
-
- # Prepare payload
- payload = {
- "urls": urls,
- **config.to_dict()
- }
-
- # Send request
- response = requests.post(
- endpoint,
- json=payload,
- timeout=config.timeout
- )
- response.raise_for_status()
-
- # Process results
- results = []
- server_results = response.json() # Assuming server returns JSON array of results
-
- for url_result in server_results:
- result = CrawlResult(
- url=url_result["url"],
- status=url_result.get("status", "failed"),
- markdown=url_result.get("markdown"),
- error=url_result.get("error"),
- config=config.to_dict()
- )
-
- # Save markdown if successful and output dir is configured
- if result.status == "success" and config.output_dir and result.markdown:
- filepath = config.output_dir / f"{slugify(result.url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(result.markdown)
-
- # Send webhook if configured
- if config.webhook_url:
- _send_webhook_sync(result, config)
-
- results.append(result)
-
- # Calculate statistics
- successful = len([r for r in results if r.status == "success"])
- failed = len([r for r in results if r.status == "failed"])
-
- # Print summary
- print(f"\nParallel processing completed:")
- print(f"✓ Successful: {successful}")
- print(f"✗ Failed: {failed}")
-
- # Save report if enabled
- if config.save_reports and config.report_file:
- self._retry_stats = {
- "initial_failures": failed,
- "failure_ratio": (failed / len(urls)) * 100,
- "retry_successful": 0, # No retries in server parallel mode
- "retry_failed": failed
- }
- self._save_report_sync(results, config)
- console.print(f"📊 Report saved to: {config.report_file}")
-
- return results
-
- except Exception as e:
- print(f"Error during parallel processing: {str(e)}")
- # Create failed results for all URLs
- return [
- CrawlResult(
- url=url,
- status="failed",
- error=str(e),
- config=config.to_dict()
- ) for url in urls
- ]
-
-
- async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
- """Send webhook with crawl results."""
- if not config.webhook_url:
- return
-
- payload = {
- "url": result.url,
- "status": result.status,
- "markdown": result.markdown if result.status == "success" else None,
- "error": result.error if result.status == "failed" else None,
- "timestamp": result.timestamp,
- "config": config.to_dict()
- }
-
- try:
- async with httpx.AsyncClient() as client:
- response = await client.post(
- config.webhook_url,
- json=payload,
- timeout=config.webhook_timeout
- )
- response.raise_for_status()
- except Exception as e:
- console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
-
- def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
- """Save crawl report synchronously."""
- # Separate successful and failed results
- successful_results = [r for r in results if r.status == "success"]
- failed_results = [r for r in results if r.status == "failed"]
-
- # Create report with only final state
- report = {
- "timestamp": datetime.now().isoformat(),
- "config": config.to_dict(),
- "results": {
- "successful": [asdict(r) for r in successful_results],
- "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
- },
- "summary": {
- "total": len(results),
- "successful": len(successful_results),
- "failed": len(failed_results),
- "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
- }
- }
-
- with open(config.report_file, 'w', encoding='utf-8') as f:
- json.dump(report, f, indent=2)
-
- async def _save_report(self, config: CrawlConfig):
- """Save crawl report to JSON file."""
- if not config.report_file:
- return
-
- # Separate successful and failed results
- successful_results = [r for r in self.crawl_results if r.status == "success"]
- failed_results = [r for r in self.crawl_results if r.status == "failed"]
-
- report = {
- "timestamp": datetime.now().isoformat(),
- "config": config.to_dict(),
- "results": {
- "successful": [asdict(r) for r in successful_results],
- "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
- },
- "summary": {
- "total": len(self.crawl_results),
- "successful": len(successful_results),
- "failed": len(failed_results),
- "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
- }
- }
-
- async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
- await f.write(json.dumps(report, indent=2))
-
  async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
  """Crawl a single URL asynchronously."""
  await self._ensure_session()
@@ -518,9 +458,31 @@ class SpiderForce4AI:
  )
 
  if config.output_dir:
- await self._save_markdown(url, markdown, config.output_dir)
+ await _save_markdown_async(url, markdown, config)
+
+ # Handle post-extraction if configured
+ if config.post_extraction_agent and result.status == "success":
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = await agent.process_content(url, markdown)
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
 
- await self._send_webhook(result, config)
+ await _send_webhook_async(result, config)
 
  self.crawl_results.append(result)
  return result
@@ -540,18 +502,18 @@ class SpiderForce4AI:
  return asyncio.run(self.crawl_url_async(url, config))
 
  async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
- """Retry failed URLs once."""
+ """Retry failed URLs with optional progress tracking."""
  if not failed_results:
  return []
 
  failed_count = len(failed_results)
- total_count = len([r for r in self.crawl_results])
+ total_count = len(self.crawl_results)
  failure_ratio = (failed_count / total_count) * 100
 
  console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
  retry_results = []
 
- # Create a new progress bar if one wasn't provided
+ # Create or use provided progress bar
  should_close_progress = progress is None
  if progress is None:
  progress = Progress(
@@ -595,6 +557,7 @@ class SpiderForce4AI:
  async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
  """Crawl multiple URLs asynchronously with progress bar."""
  await self._ensure_session()
+ post_extraction_results = {}
 
  with Progress(
  SpinnerColumn(),
@@ -603,52 +566,60 @@ class SpiderForce4AI:
  TaskProgressColumn(),
  console=console
  ) as progress:
- task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
+ crawl_task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
  async def crawl_with_progress(url):
  result = await self.crawl_url_async(url, config)
- progress.update(task, advance=1, description=f"[cyan]Crawled: {url}")
+ progress.update(crawl_task, advance=1, description=f"[cyan]Crawled: {url}")
  return result
 
+ # Set up concurrency control
  semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
  async def crawl_with_semaphore(url):
  async with semaphore:
  result = await crawl_with_progress(url)
  await asyncio.sleep(config.request_delay)
  return result
 
+ # Perform initial crawl
  initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
 
- # Identify failed URLs
+ # Handle failed URLs
  failed_results = [r for r in initial_results if r.status == "failed"]
-
- # Calculate initial failure ratio
  initial_failed = len(failed_results)
  total_urls = len(urls)
  failure_ratio = (initial_failed / total_urls) * 100
 
  # Retry failed URLs if ratio is acceptable
- if failed_results:
- if failure_ratio > 20:
- console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
- results = initial_results
- else:
- retry_results = await self._retry_failed_urls(failed_results, config, progress)
- # Update results list by replacing failed results with successful retries
- results = initial_results.copy()
- for retry_result in retry_results:
- for i, result in enumerate(results):
- if result.url == retry_result.url:
- results[i] = retry_result
- break
- else:
- results = initial_results
+ results = initial_results
+ retry_successful = 0
 
- # Calculate final statistics before saving report
+ if failed_results and failure_ratio <= 20:
+ retry_results = await self._retry_failed_urls(failed_results, config, progress)
+ retry_successful = len([r for r in retry_results if r.status == "success"])
+
+ # Update results list
+ for retry_result in retry_results:
+ for i, result in enumerate(results):
+ if result.url == retry_result.url:
+ results[i] = retry_result
+ break
+
+ # Calculate final statistics
  final_successful = len([r for r in results if r.status == "success"])
  final_failed = len([r for r in results if r.status == "failed"])
 
- # Print detailed summary
+ # Update retry stats
+ self._retry_stats = {
+ "initial_failures": initial_failed,
+ "failure_ratio": failure_ratio,
+ "retry_successful": retry_successful if initial_failed > 0 else 0,
+ "retry_failed": final_failed,
+ "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+ }
+
+ # Print summary
  console.print(f"\n[green]Crawling Summary:[/green]")
  console.print(f"Total URLs processed: {total_urls}")
  console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
@@ -657,18 +628,11 @@ class SpiderForce4AI:
  console.print(f" ✗ Failed: {final_failed}")
 
  if initial_failed > 0:
- retry_successful = initial_failed - final_failed
  console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
- # Save final report after all retries are complete
+ # Save final report
  if config.save_reports:
- self._retry_stats = {
- "initial_failures": initial_failed,
- "failure_ratio": failure_ratio,
- "retry_successful": retry_successful if initial_failed > 0 else 0,
- "retry_failed": final_failed
- }
- await self._save_report(config)
+ await _save_report_async(results, config, self._retry_stats)
  console.print(f"📊 Report saved to: {config.report_file}")
 
  return results
@@ -705,32 +669,21 @@ class SpiderForce4AI:
  return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
  def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
- """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
- print(f"Fetching sitemap from {sitemap_url}...")
-
- # Fetch sitemap
+ """Crawl sitemap URLs in parallel using multiprocessing."""
+ # Fetch and parse sitemap
  try:
  response = requests.get(sitemap_url, timeout=config.timeout)
  response.raise_for_status()
- sitemap_text = response.text
- except Exception as e:
- print(f"Error fetching sitemap: {str(e)}")
- raise
-
- # Parse sitemap
- try:
- root = ET.fromstring(sitemap_text)
+ root = ET.fromstring(response.text)
  namespace = {'ns': root.tag.split('}')[0].strip('{')}
  urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
- print(f"Found {len(urls)} URLs in sitemap")
+ console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
  except Exception as e:
- print(f"Error parsing sitemap: {str(e)}")
+ console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
  raise
 
- # Prepare arguments for parallel processing
+ # Process URLs in parallel
  process_args = [(url, self.base_url, config) for url in urls]
-
- # Create process pool and execute crawls
  results = []
 
  with Pool(processes=config.max_concurrent_requests) as pool:
@@ -741,81 +694,186 @@ class SpiderForce4AI:
  TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
  TextColumn("({task.completed}/{task.total})"),
  ) as progress:
- task = progress.add_task("Crawling URLs...", total=len(urls))
+ task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
  for result in pool.imap_unordered(_process_url_parallel, process_args):
  results.append(result)
  progress.update(task, advance=1)
  status = "✓" if result.status == "success" else "✗"
- progress.description = f"Last: {status} {result.url}"
+ progress.description = f"[cyan]Last: {status} {result.url}"
 
- # Calculate initial failure statistics
+ # Calculate statistics and handle retries
  failed_results = [r for r in results if r.status == "failed"]
  initial_failed = len(failed_results)
- total_urls = len(urls)
- failure_ratio = (initial_failed / total_urls) * 100
+ failure_ratio = (initial_failed / len(urls)) * 100
+ retry_successful = 0
 
- # Retry failed URLs if ratio is acceptable
- if failed_results:
- if failure_ratio > 20:
- console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
- else:
- failed_count = len(failed_results)
- failure_ratio = (failed_count / total_urls) * 100
- console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
- for result in failed_results:
- new_result = _process_url_parallel((result.url, self.base_url, config))
-
- # Save markdown and trigger webhook for successful retries
- if new_result.status == "success":
- console.print(f"[green] Retry successful: {result.url}[/green]")
- # Save markdown if output directory is configured
- if config.output_dir and new_result.markdown:
- filepath = config.output_dir / f"{slugify(new_result.url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(new_result.markdown)
- # Send webhook for successful retry
- _send_webhook_sync(new_result, config)
- else:
- console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
- # Send webhook for failed retry
- _send_webhook_sync(new_result, config)
-
- # Update results list
- for i, r in enumerate(results):
- if r.url == new_result.url:
- results[i] = new_result
- break
+ if failed_results and failure_ratio <= 20:
+ console.print(f"\n[yellow]Retrying {initial_failed} failed URLs...[/yellow]")
+ for result in failed_results:
+ new_result = _process_url_parallel((result.url, self.base_url, config))
+ if new_result.status == "success":
+ retry_successful += 1
+ console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+ else:
+ console.print(f"[red]✗ Retry failed: {result.url}[/red]")
+
+ # Update results list
+ for i, r in enumerate(results):
+ if r.url == new_result.url:
+ results[i] = new_result
+ break
 
  # Calculate final statistics
  final_successful = len([r for r in results if r.status == "success"])
  final_failed = len([r for r in results if r.status == "failed"])
 
- # Print detailed summary
+ # Print summary
  console.print(f"\n[green]Crawling Summary:[/green]")
- console.print(f"Total URLs processed: {total_urls}")
+ console.print(f"Total URLs processed: {len(urls)}")
  console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
  console.print(f"Final results:")
  console.print(f" ✓ Successful: {final_successful}")
  console.print(f" ✗ Failed: {final_failed}")
-
+
  if initial_failed > 0:
- retry_successful = initial_failed - final_failed
  console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
- # Save final report after all retries are complete
+ # Save report
  if config.save_reports:
  self._retry_stats = {
  "initial_failures": initial_failed,
  "failure_ratio": failure_ratio,
- "retry_successful": retry_successful if initial_failed > 0 else 0,
+ "retry_successful": retry_successful,
  "retry_failed": final_failed
  }
- self._save_report_sync(results, config)
+ _save_report_sync(results, config, self._retry_stats)
  console.print(f"📊 Report saved to: {config.report_file}")
 
  return results
 
+ def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+ """
+ Crawl multiple URLs using server-side parallel processing.
+ This uses the /convert_parallel endpoint which handles parallelization on the server.
+ """
+ console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
+
+ try:
+ endpoint = f"{self.base_url}/convert_parallel"
+
+ # Prepare payload
+ payload = {
+ "urls": urls,
+ **config.to_dict()
+ }
+
+ # Send request
+ response = requests.post(
+ endpoint,
+ json=payload,
+ timeout=config.timeout
+ )
+ response.raise_for_status()
+
+ # Process results
+ results = []
+ server_results = response.json()
+
+ for url_result in server_results:
+ result = CrawlResult(
+ url=url_result["url"],
+ status=url_result.get("status", "failed"),
+ markdown=url_result.get("markdown"),
+ error=url_result.get("error"),
+ config=config.to_dict()
+ )
+
+ # Save markdown if successful and output dir is configured
+ if result.status == "success" and config.output_dir and result.markdown:
+ _save_markdown_sync(result.url, result.markdown, config)
+
+ # Handle post-extraction if configured
+ if config.post_extraction_agent and result.status == "success":
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+
+ # Send webhook if configured
+ _send_webhook_sync(result, config)
+ results.append(result)
+
+ # Calculate statistics
+ successful = len([r for r in results if r.status == "success"])
+ failed = len([r for r in results if r.status == "failed"])
+
+ # Print summary
+ console.print("\n[green]Parallel processing completed:[/green]")
+ console.print(f"✓ Successful: {successful}")
+ console.print(f"✗ Failed: {failed}")
+
+ # Save report if enabled
+ if config.save_reports:
+ self._retry_stats = {
+ "initial_failures": failed,
+ "failure_ratio": (failed / len(urls)) * 100,
+ "retry_successful": 0, # No retries in server parallel mode
+ "retry_failed": failed
+ }
+ _save_report_sync(results, config, self._retry_stats)
+ console.print(f"📊 Report saved to: {config.report_file}")
+
+ return results
+
+ except Exception as e:
+ console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
+ # Create failed results for all URLs
+ return [
+ CrawlResult(
+ url=url,
+ status="failed",
+ error=str(e),
+ config=config.to_dict()
+ ) for url in urls
+ ]
+
+ def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+ """
+ Crawl sitemap URLs using server-side parallel processing.
+ """
+ console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+
+ try:
+ response = requests.get(sitemap_url, timeout=config.timeout)
+ response.raise_for_status()
+ root = ET.fromstring(response.text)
+ namespace = {'ns': root.tag.split('}')[0].strip('{')}
+ urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+ console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+
+ # Process URLs using server-side parallel endpoint
+ return self.crawl_urls_server_parallel(urls, config)
+
+ except Exception as e:
+ console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
+ raise
+
  async def __aenter__(self):
  """Async context manager entry."""
  await self._ensure_session()
@@ -833,3 +891,7 @@ class SpiderForce4AI:
  """Sync context manager exit."""
  self._executor.shutdown(wait=True)
 
+ # Version info
+ #__version__ = "2.3.1"
+ #__author__ = "Piotr Tamulewicz"
+ #__email__ = "pt@petertam.pro"