spiderforce4ai-2.1-py3-none-any.whl → spiderforce4ai-2.4-py3-none-any.whl

@@ -1,10 +1,11 @@
  # spiderforce4ai/__init__.py
 
+ from .post_extraction_agent import PostExtractionAgent, PostExtractionConfig, ExtractionTemplate
  import asyncio
  import aiohttp
  import json
  import logging
- from typing import List, Dict, Union, Optional, Tuple
+ from typing import List, Dict, Union, Optional, Tuple, Callable, Any
  from dataclasses import dataclass, asdict
  from urllib.parse import urljoin, urlparse
  from pathlib import Path
@@ -23,75 +24,55 @@ from multiprocessing import Pool
  console = Console()
 
  def extract_metadata_headers(markdown: str, url: str = '') -> str:
- """Extract metadata and headers from markdown content with enhanced SEO formatting."""
+ """Extract metadata and headers from markdown content."""
  lines = markdown.split('\n')
- extracted = []
- in_metadata = False
- metadata = {
- 'title': '',
- 'description': '',
- 'canonical_url': '',
- 'language': ''
- }
- first_paragraph = ''
+ metadata = {}
+ headers = []
 
- # First pass - collect metadata and first paragraph
- for i, line in enumerate(lines):
- # Check for metadata block boundaries
- if line.strip() == '---':
- if not in_metadata:
- in_metadata = True
- continue
- else:
- in_metadata = False
- break
+ def parse_metadata_line(line):
+ """Parse a single metadata line correctly."""
+ first_colon = line.find(':')
+ if first_colon == -1:
+ return None, None
+
+ key = line[:first_colon].strip()
+ value = line[first_colon + 1:].strip()
 
- # Extract metadata within the block
- if in_metadata:
- if ':' in line:
- key, value = [part.strip() for part in line.split(':', 1)]
- key = key.lower()
-
- # Handle multi-line values
- if value.startswith('>'):
- value = value[1:].strip()
- j = i + 1
- while j < len(lines) and lines[j].strip() and not lines[j].strip() == '---':
- value += ' ' + lines[j].strip()
- j += 1
-
- if key == 'title':
- metadata['title'] = value
- elif key in ['description', 'meta_description', 'og:description', 'meta-description']:
- metadata['description'] = value
- elif key in ['canonical_url', 'canonical']:
- metadata['canonical_url'] = value
- elif key in ['language', 'lang']:
- metadata['language'] = value
- elif not in_metadata and not first_paragraph and line.strip() and not line.startswith('#'):
- first_paragraph = line.strip()
-
- # Use first paragraph as fallback description if none found
- if not metadata['description'] and first_paragraph:
- metadata['description'] = first_paragraph[:160] + ('...' if len(first_paragraph) > 160 else '')
-
- # Add formatted metadata section
- extracted.append(f"URL: {url}")
- extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")
- extracted.append(f"Description: {metadata['description']}")
- extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
- extracted.append(f"Language: {metadata['language'] or 'en'}")
- extracted.append("") # Empty line after metadata
+ # Handle the case where value starts with "URL:" - this means it's a missing description
+ if value.startswith('URL:'):
+ return key, ''
+
+ return key, value
 
- # Second pass - process headers
+ # Process each line
  for line in lines:
- if line.strip().startswith('#'):
+ line = line.strip()
+ if not line:
+ continue
+
+ # Check if it's a metadata line (contains : but isn't a header)
+ if ':' in line and not line.startswith('#'):
+ key, value = parse_metadata_line(line)
+ if key:
+ metadata[key] = value
+ # Check if it's a header
+ elif line.startswith('#'):
  level = len(line) - len(line.lstrip('#'))
  text = line.lstrip('#').strip()
  if 1 <= level <= 6:
- extracted.append(f"H{level}: {text}")
+ headers.append(f"H{level}: {text}")
 
- return '\n'.join(extracted)
+ # Construct output
+ output = []
+ output.append(f"URL: {url}")
+ output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
+ output.append(f"Description: {metadata.get('Description', '')}")
+ output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
+ output.append(f"Language: {metadata.get('Language', 'en')}")
+ output.append("") # Empty line
+ output.extend(headers)
+
+ return '\n'.join(output)
 
  def slugify(url: str) -> str:
  """Convert URL to a valid filename."""
@@ -111,6 +92,7 @@ class CrawlResult:
  error: Optional[str] = None
  timestamp: str = None
  config: Dict = None
+ extraction_result: Optional[Dict] = None # Store post-extraction results
 
  def __post_init__(self):
  if not self.timestamp:
@@ -131,9 +113,14 @@ class CrawlConfig:
  webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
  webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
  save_reports: bool = False # Whether to save crawl reports
- report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
- combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers' to combine all pages into one file
- combined_markdown_file: Optional[Path] = None # Optional path for combined markdown file
+ report_file: Optional[Path] = None # Optional report file location
+ combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers'
+ combined_markdown_file: Optional[Path] = None # Optional path for combined file
+
+ # Post-extraction settings
+ post_extraction_agent: Optional[Dict[str, Any]] = None # LLM configuration
+ post_extraction_agent_save_to_file: Optional[str] = None # Extraction output file
+ post_agent_transformer_function: Optional[Callable] = None # Custom transformer
 
  def __post_init__(self):
  # Initialize empty lists/dicts for None values
@@ -161,6 +148,15 @@ class CrawlConfig:
  # Create or clear the combined file
  self.combined_markdown_file.write_text('')
 
+ # Validate post-extraction agent configuration if provided
+ if self.post_extraction_agent:
+ if "messages" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'messages'")
+ if "model" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'model'")
+ if "api_key" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'api_key'")
+
  def to_dict(self) -> Dict:
  """Convert config to dictionary for API requests."""
  payload = {}
@@ -172,52 +168,120 @@ class CrawlConfig:
  if self.remove_selectors_regex:
  payload["remove_selectors_regex"] = self.remove_selectors_regex
  return payload
-
-
+
  def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
  """Synchronous version of webhook sender for parallel processing."""
  if not config.webhook_url:
  return
 
- # Use custom payload template if provided, otherwise use default
- if config.webhook_payload_template:
- # Replace variables in the template
- payload_str = config.webhook_payload_template.format(
- url=result.url,
- status=result.status,
- markdown=result.markdown if result.status == "success" else None,
- error=result.error if result.status == "failed" else None,
- timestamp=result.timestamp,
- config=config.to_dict()
+ try:
+ # Use custom payload template if provided, otherwise use default
+ if config.webhook_payload_template:
+ # Replace variables in the template
+ payload_str = config.webhook_payload_template.format(
+ url=result.url,
+ status=result.status,
+ markdown=result.markdown if result.status == "success" else None,
+ error=result.error if result.status == "failed" else None,
+ timestamp=result.timestamp,
+ config=config.to_dict(),
+ extraction_result=result.extraction_result if result.extraction_result else None
+ )
+ payload = json.loads(payload_str) # Parse the formatted JSON string
+ else:
+ # Use default payload format
+ payload = {
+ "url": result.url,
+ "status": result.status,
+ "markdown": result.markdown if result.status == "success" else None,
+ "error": result.error if result.status == "failed" else None,
+ "timestamp": result.timestamp,
+ "config": config.to_dict(),
+ "extraction_result": result.extraction_result if result.extraction_result else None
+ }
+
+ response = requests.post(
+ config.webhook_url,
+ json=payload,
+ headers=config.webhook_headers,
+ timeout=config.webhook_timeout
  )
- payload = json.loads(payload_str) # Parse the formatted JSON string
- else:
- # Use default payload format
+ response.raise_for_status()
+ except Exception as e:
+ console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+ async def _send_webhook_async(result: CrawlResult, config: CrawlConfig):
+ """Asynchronous webhook sender."""
+ if not config.webhook_url:
+ return
+
+ try:
+ # Prepare payload similar to sync version
  payload = {
  "url": result.url,
  "status": result.status,
  "markdown": result.markdown if result.status == "success" else None,
  "error": result.error if result.status == "failed" else None,
  "timestamp": result.timestamp,
- "config": config.to_dict()
+ "config": config.to_dict(),
+ "extraction_result": result.extraction_result if result.extraction_result else None
  }
 
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ config.webhook_url,
+ json=payload,
+ headers=config.webhook_headers,
+ timeout=config.webhook_timeout
+ )
+ response.raise_for_status()
+ except Exception as e:
+ console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+ async def _save_markdown_async(url: str, markdown: str, config: CrawlConfig):
+ """Save markdown content to file and/or append to combined file asynchronously."""
  try:
- response = requests.post(
- config.webhook_url,
- json=payload,
- headers=config.webhook_headers,
- timeout=config.webhook_timeout
- )
- response.raise_for_status()
+ # Save individual file if not combining or if combining in full mode
+ if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+ filename = f"{slugify(url)}.md"
+ filepath = config.output_dir / filename
+ async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+ await f.write(markdown)
+
+ # Handle combined markdown file
+ if config.combine_to_one_markdown:
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+ async with aiofiles.open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+ await f.write(combined_content)
+ except Exception as e:
+ console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
+
+ def _save_markdown_sync(url: str, markdown: str, config: CrawlConfig) -> None:
+ """Synchronous version of markdown saver for parallel processing."""
+ try:
+ # Save individual file if not combining or if combining in full mode
+ if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+ filepath = config.output_dir / f"{slugify(url)}.md"
+ with open(filepath, 'w', encoding='utf-8') as f:
+ f.write(markdown)
+
+ # Handle combined markdown file
+ if config.combine_to_one_markdown:
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+ with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+ f.write(combined_content)
  except Exception as e:
- print(f"Warning: Failed to send webhook for {result.url}: {str(e)}")
+ console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
 
- # Module level function for multiprocessing
  def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  """Process a single URL for parallel processing."""
  url, base_url, config = args
  try:
+ # Make the conversion request
  endpoint = f"{base_url}/convert"
  payload = {
  "url": url,
@@ -232,7 +296,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  error=f"HTTP {response.status_code}: {response.text}",
  config=config.to_dict()
  )
- # Send webhook for failed result
  _send_webhook_sync(result, config)
  return result
 
@@ -240,19 +303,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
  # Save markdown if output directory is configured
  if config.output_dir:
- # Save individual file if not combining or if combining in full mode
- if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
- filepath = config.output_dir / f"{slugify(url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(markdown)
-
- # Handle combined markdown file
- if config.combine_to_one_markdown:
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
- combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
- with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
- f.write(combined_content)
+ _save_markdown_sync(url, markdown, config)
 
  result = CrawlResult(
  url=url,
@@ -261,6 +312,28 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  config=config.to_dict()
  )
 
+ # Handle post-extraction if configured
+ if config.post_extraction_agent:
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = asyncio.run(agent.process_content(url, markdown))
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
  # Send webhook for successful result
  _send_webhook_sync(result, config)
 
@@ -281,6 +354,60 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  _send_webhook_sync(result, config)
  return result
 
+ async def _save_report_async(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None):
+ """Save crawl report to JSON file asynchronously."""
+ if not config.report_file:
+ return
+
+ # Separate successful and failed results
+ successful_results = [r for r in results if r.status == "success"]
+ failed_results = [r for r in results if r.status == "failed"]
+
+ report = {
+ "timestamp": datetime.now().isoformat(),
+ "config": config.to_dict(),
+ "results": {
+ "successful": [asdict(r) for r in successful_results],
+ "failed": [asdict(r) for r in failed_results]
+ },
+ "summary": {
+ "total": len(results),
+ "successful": len(successful_results),
+ "failed": len(failed_results),
+ "retry_info": retry_stats or {}
+ }
+ }
+
+ async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+ await f.write(json.dumps(report, indent=2))
+
+ def _save_report_sync(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None) -> None:
+ """Synchronous version of report saver."""
+ if not config.report_file:
+ return
+
+ # Create report similar to async version
+ successful_results = [r for r in results if r.status == "success"]
+ failed_results = [r for r in results if r.status == "failed"]
+
+ report = {
+ "timestamp": datetime.now().isoformat(),
+ "config": config.to_dict(),
+ "results": {
+ "successful": [asdict(r) for r in successful_results],
+ "failed": [asdict(r) for r in failed_results]
+ },
+ "summary": {
+ "total": len(results),
+ "successful": len(successful_results),
+ "failed": len(failed_results),
+ "retry_info": retry_stats or {}
+ }
+ }
+
+ with open(config.report_file, 'w', encoding='utf-8') as f:
+ json.dump(report, f, indent=2)
+
  class SpiderForce4AI:
  """Main class for interacting with SpiderForce4AI service."""
 
@@ -289,6 +416,7 @@ class SpiderForce4AI:
  self.session = None
  self._executor = ThreadPoolExecutor()
  self.crawl_results: List[CrawlResult] = []
+ self._retry_stats = {}
 
  async def _ensure_session(self):
  """Ensure aiohttp session exists."""
@@ -300,215 +428,6 @@ class SpiderForce4AI:
  if self.session and not self.session.closed:
  await self.session.close()
 
- async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
- """Save markdown content to file and/or append to combined file."""
- # Save individual file if not combining or if combining in full mode
- if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
- filename = f"{slugify(url)}.md"
- filepath = output_dir / filename
- async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
- await f.write(markdown)
-
- # Handle combined markdown file
- if self.config.combine_to_one_markdown:
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
- combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
- async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
- await f.write(combined_content)
-
-
-
- def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
- """
- Crawl sitemap URLs using server-side parallel processing.
- """
- print(f"Fetching sitemap from {sitemap_url}...")
-
- # Fetch sitemap
- try:
- response = requests.get(sitemap_url, timeout=config.timeout)
- response.raise_for_status()
- sitemap_text = response.text
- except Exception as e:
- print(f"Error fetching sitemap: {str(e)}")
- raise
-
- # Parse sitemap
- try:
- root = ET.fromstring(sitemap_text)
- namespace = {'ns': root.tag.split('}')[0].strip('{')}
- urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
- print(f"Found {len(urls)} URLs in sitemap")
- except Exception as e:
- print(f"Error parsing sitemap: {str(e)}")
- raise
-
- # Process URLs using server-side parallel endpoint
- return self.crawl_urls_server_parallel(urls, config)
-
-
- def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
- """
- Crawl multiple URLs using server-side parallel processing.
- This uses the /convert_parallel endpoint which handles parallelization on the server.
- """
- print(f"Sending {len(urls)} URLs for parallel processing...")
-
- try:
- endpoint = f"{self.base_url}/convert_parallel"
-
- # Prepare payload
- payload = {
- "urls": urls,
- **config.to_dict()
- }
-
- # Send request
- response = requests.post(
- endpoint,
- json=payload,
- timeout=config.timeout
- )
- response.raise_for_status()
-
- # Process results
- results = []
- server_results = response.json() # Assuming server returns JSON array of results
-
- for url_result in server_results:
- result = CrawlResult(
- url=url_result["url"],
- status=url_result.get("status", "failed"),
- markdown=url_result.get("markdown"),
- error=url_result.get("error"),
- config=config.to_dict()
- )
-
- # Save markdown if successful and output dir is configured
- if result.status == "success" and config.output_dir and result.markdown:
- filepath = config.output_dir / f"{slugify(result.url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(result.markdown)
-
- # Send webhook if configured
- if config.webhook_url:
- _send_webhook_sync(result, config)
-
- results.append(result)
-
- # Calculate statistics
- successful = len([r for r in results if r.status == "success"])
- failed = len([r for r in results if r.status == "failed"])
-
- # Print summary
- print(f"\nParallel processing completed:")
- print(f"✓ Successful: {successful}")
- print(f"✗ Failed: {failed}")
-
- # Save report if enabled
- if config.save_reports and config.report_file:
- self._retry_stats = {
- "initial_failures": failed,
- "failure_ratio": (failed / len(urls)) * 100,
- "retry_successful": 0, # No retries in server parallel mode
- "retry_failed": failed
- }
- self._save_report_sync(results, config)
- console.print(f"📊 Report saved to: {config.report_file}")
-
- return results
-
- except Exception as e:
- print(f"Error during parallel processing: {str(e)}")
- # Create failed results for all URLs
- return [
- CrawlResult(
- url=url,
- status="failed",
- error=str(e),
- config=config.to_dict()
- ) for url in urls
- ]
-
-
- async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
- """Send webhook with crawl results."""
- if not config.webhook_url:
- return
-
- payload = {
- "url": result.url,
- "status": result.status,
- "markdown": result.markdown if result.status == "success" else None,
- "error": result.error if result.status == "failed" else None,
- "timestamp": result.timestamp,
- "config": config.to_dict()
- }
-
- try:
- async with httpx.AsyncClient() as client:
- response = await client.post(
- config.webhook_url,
- json=payload,
- timeout=config.webhook_timeout
- )
- response.raise_for_status()
- except Exception as e:
- console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
-
- def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
- """Save crawl report synchronously."""
- # Separate successful and failed results
- successful_results = [r for r in results if r.status == "success"]
- failed_results = [r for r in results if r.status == "failed"]
-
- # Create report with only final state
- report = {
- "timestamp": datetime.now().isoformat(),
- "config": config.to_dict(),
- "results": {
- "successful": [asdict(r) for r in successful_results],
- "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
- },
- "summary": {
- "total": len(results),
- "successful": len(successful_results),
- "failed": len(failed_results),
- "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
- }
- }
-
- with open(config.report_file, 'w', encoding='utf-8') as f:
- json.dump(report, f, indent=2)
-
- async def _save_report(self, config: CrawlConfig):
- """Save crawl report to JSON file."""
- if not config.report_file:
- return
-
- # Separate successful and failed results
- successful_results = [r for r in self.crawl_results if r.status == "success"]
- failed_results = [r for r in self.crawl_results if r.status == "failed"]
-
- report = {
- "timestamp": datetime.now().isoformat(),
- "config": config.to_dict(),
- "results": {
- "successful": [asdict(r) for r in successful_results],
- "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
- },
- "summary": {
- "total": len(self.crawl_results),
- "successful": len(successful_results),
- "failed": len(failed_results),
- "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
- }
- }
-
- async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
- await f.write(json.dumps(report, indent=2))
-
  async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
  """Crawl a single URL asynchronously."""
  await self._ensure_session()
@@ -539,9 +458,31 @@ class SpiderForce4AI:
  )
 
  if config.output_dir:
- await self._save_markdown(url, markdown, config.output_dir)
+ await _save_markdown_async(url, markdown, config)
 
- await self._send_webhook(result, config)
+ # Handle post-extraction if configured
+ if config.post_extraction_agent and result.status == "success":
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = await agent.process_content(url, markdown)
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
+ await _send_webhook_async(result, config)
 
  self.crawl_results.append(result)
  return result
@@ -561,18 +502,18 @@ class SpiderForce4AI:
  return asyncio.run(self.crawl_url_async(url, config))
 
  async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
- """Retry failed URLs once."""
+ """Retry failed URLs with optional progress tracking."""
  if not failed_results:
  return []
 
  failed_count = len(failed_results)
- total_count = len([r for r in self.crawl_results])
+ total_count = len(self.crawl_results)
  failure_ratio = (failed_count / total_count) * 100
 
  console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
  retry_results = []
 
- # Create a new progress bar if one wasn't provided
+ # Create or use provided progress bar
  should_close_progress = progress is None
  if progress is None:
  progress = Progress(
@@ -616,6 +557,7 @@ class SpiderForce4AI:
  async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
  """Crawl multiple URLs asynchronously with progress bar."""
  await self._ensure_session()
+ post_extraction_results = {}
 
  with Progress(
  SpinnerColumn(),
@@ -624,52 +566,60 @@ class SpiderForce4AI:
  TaskProgressColumn(),
  console=console
  ) as progress:
- task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
+ crawl_task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
  async def crawl_with_progress(url):
  result = await self.crawl_url_async(url, config)
- progress.update(task, advance=1, description=f"[cyan]Crawled: {url}")
+ progress.update(crawl_task, advance=1, description=f"[cyan]Crawled: {url}")
  return result
 
+ # Set up concurrency control
  semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
  async def crawl_with_semaphore(url):
  async with semaphore:
  result = await crawl_with_progress(url)
  await asyncio.sleep(config.request_delay)
  return result
 
+ # Perform initial crawl
  initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
 
- # Identify failed URLs
+ # Handle failed URLs
  failed_results = [r for r in initial_results if r.status == "failed"]
-
- # Calculate initial failure ratio
  initial_failed = len(failed_results)
  total_urls = len(urls)
  failure_ratio = (initial_failed / total_urls) * 100
 
  # Retry failed URLs if ratio is acceptable
- if failed_results:
- if failure_ratio > 20:
- console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
- results = initial_results
- else:
- retry_results = await self._retry_failed_urls(failed_results, config, progress)
- # Update results list by replacing failed results with successful retries
- results = initial_results.copy()
- for retry_result in retry_results:
- for i, result in enumerate(results):
- if result.url == retry_result.url:
- results[i] = retry_result
- break
- else:
- results = initial_results
+ results = initial_results
+ retry_successful = 0
 
- # Calculate final statistics before saving report
+ if failed_results and failure_ratio <= 20:
+ retry_results = await self._retry_failed_urls(failed_results, config, progress)
+ retry_successful = len([r for r in retry_results if r.status == "success"])
+
+ # Update results list
+ for retry_result in retry_results:
+ for i, result in enumerate(results):
+ if result.url == retry_result.url:
+ results[i] = retry_result
+ break
+
+ # Calculate final statistics
  final_successful = len([r for r in results if r.status == "success"])
  final_failed = len([r for r in results if r.status == "failed"])
 
- # Print detailed summary
+ # Update retry stats
+ self._retry_stats = {
+ "initial_failures": initial_failed,
+ "failure_ratio": failure_ratio,
+ "retry_successful": retry_successful if initial_failed > 0 else 0,
+ "retry_failed": final_failed,
+ "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+ }
+
+ # Print summary
  console.print(f"\n[green]Crawling Summary:[/green]")
  console.print(f"Total URLs processed: {total_urls}")
  console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
@@ -678,18 +628,11 @@ class SpiderForce4AI:
  console.print(f" ✗ Failed: {final_failed}")
 
  if initial_failed > 0:
- retry_successful = initial_failed - final_failed
  console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
- # Save final report after all retries are complete
+ # Save final report
  if config.save_reports:
- self._retry_stats = {
- "initial_failures": initial_failed,
- "failure_ratio": failure_ratio,
- "retry_successful": retry_successful if initial_failed > 0 else 0,
- "retry_failed": final_failed
- }
- await self._save_report(config)
+ await _save_report_async(results, config, self._retry_stats)
  console.print(f"📊 Report saved to: {config.report_file}")
 
  return results
@@ -726,32 +669,21 @@ class SpiderForce4AI:
  return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
  def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
- """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
- print(f"Fetching sitemap from {sitemap_url}...")
-
- # Fetch sitemap
+ """Crawl sitemap URLs in parallel using multiprocessing."""
+ # Fetch and parse sitemap
  try:
  response = requests.get(sitemap_url, timeout=config.timeout)
  response.raise_for_status()
- sitemap_text = response.text
- except Exception as e:
- print(f"Error fetching sitemap: {str(e)}")
- raise
-
- # Parse sitemap
- try:
- root = ET.fromstring(sitemap_text)
+ root = ET.fromstring(response.text)
  namespace = {'ns': root.tag.split('}')[0].strip('{')}
  urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
- print(f"Found {len(urls)} URLs in sitemap")
+ console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
  except Exception as e:
- print(f"Error parsing sitemap: {str(e)}")
+ console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
  raise
 
- # Prepare arguments for parallel processing
+ # Process URLs in parallel
  process_args = [(url, self.base_url, config) for url in urls]
-
- # Create process pool and execute crawls
  results = []
 
  with Pool(processes=config.max_concurrent_requests) as pool:
@@ -762,81 +694,186 @@ class SpiderForce4AI:
  TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
  TextColumn("({task.completed}/{task.total})"),
  ) as progress:
- task = progress.add_task("Crawling URLs...", total=len(urls))
+ task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
  for result in pool.imap_unordered(_process_url_parallel, process_args):
  results.append(result)
  progress.update(task, advance=1)
  status = "✓" if result.status == "success" else "✗"
- progress.description = f"Last: {status} {result.url}"
+ progress.description = f"[cyan]Last: {status} {result.url}"
 
- # Calculate initial failure statistics
+ # Calculate statistics and handle retries
  failed_results = [r for r in results if r.status == "failed"]
  initial_failed = len(failed_results)
- total_urls = len(urls)
- failure_ratio = (initial_failed / total_urls) * 100
+ failure_ratio = (initial_failed / len(urls)) * 100
+ retry_successful = 0
 
- # Retry failed URLs if ratio is acceptable
- if failed_results:
- if failure_ratio > 20:
- console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
- else:
- failed_count = len(failed_results)
- failure_ratio = (failed_count / total_urls) * 100
- console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
- for result in failed_results:
- new_result = _process_url_parallel((result.url, self.base_url, config))
-
- # Save markdown and trigger webhook for successful retries
- if new_result.status == "success":
- console.print(f"[green] Retry successful: {result.url}[/green]")
- # Save markdown if output directory is configured
- if config.output_dir and new_result.markdown:
- filepath = config.output_dir / f"{slugify(new_result.url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(new_result.markdown)
- # Send webhook for successful retry
- _send_webhook_sync(new_result, config)
- else:
- console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
- # Send webhook for failed retry
- _send_webhook_sync(new_result, config)
-
- # Update results list
- for i, r in enumerate(results):
- if r.url == new_result.url:
- results[i] = new_result
- break
+ if failed_results and failure_ratio <= 20:
+ console.print(f"\n[yellow]Retrying {initial_failed} failed URLs...[/yellow]")
+ for result in failed_results:
+ new_result = _process_url_parallel((result.url, self.base_url, config))
+ if new_result.status == "success":
+ retry_successful += 1
+ console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+ else:
+ console.print(f"[red]✗ Retry failed: {result.url}[/red]")
+
+ # Update results list
+ for i, r in enumerate(results):
+ if r.url == new_result.url:
+ results[i] = new_result
+ break
 
  # Calculate final statistics
  final_successful = len([r for r in results if r.status == "success"])
  final_failed = len([r for r in results if r.status == "failed"])
 
- # Print detailed summary
+ # Print summary
  console.print(f"\n[green]Crawling Summary:[/green]")
- console.print(f"Total URLs processed: {total_urls}")
+ console.print(f"Total URLs processed: {len(urls)}")
  console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
  console.print(f"Final results:")
  console.print(f" ✓ Successful: {final_successful}")
  console.print(f" ✗ Failed: {final_failed}")
-
+
  if initial_failed > 0:
- retry_successful = initial_failed - final_failed
  console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
- # Save final report after all retries are complete
+ # Save report
  if config.save_reports:
  self._retry_stats = {
  "initial_failures": initial_failed,
  "failure_ratio": failure_ratio,
- "retry_successful": retry_successful if initial_failed > 0 else 0,
+ "retry_successful": retry_successful,
  "retry_failed": final_failed
  }
- self._save_report_sync(results, config)
+ _save_report_sync(results, config, self._retry_stats)
  console.print(f"📊 Report saved to: {config.report_file}")
 
  return results
 
+ def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+ """
+ Crawl multiple URLs using server-side parallel processing.
+ This uses the /convert_parallel endpoint which handles parallelization on the server.
+ """
+ console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
+
+ try:
+ endpoint = f"{self.base_url}/convert_parallel"
+
+ # Prepare payload
+ payload = {
+ "urls": urls,
+ **config.to_dict()
+ }
+
+ # Send request
+ response = requests.post(
+ endpoint,
+ json=payload,
+ timeout=config.timeout
+ )
+ response.raise_for_status()
+
+ # Process results
+ results = []
+ server_results = response.json()
+
+ for url_result in server_results:
+ result = CrawlResult(
+ url=url_result["url"],
+ status=url_result.get("status", "failed"),
+ markdown=url_result.get("markdown"),
+ error=url_result.get("error"),
+ config=config.to_dict()
+ )
+
+ # Save markdown if successful and output dir is configured
+ if result.status == "success" and config.output_dir and result.markdown:
+ _save_markdown_sync(result.url, result.markdown, config)
+
+ # Handle post-extraction if configured
+ if config.post_extraction_agent and result.status == "success":
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+
+ # Send webhook if configured
+ _send_webhook_sync(result, config)
+ results.append(result)
+
+ # Calculate statistics
+ successful = len([r for r in results if r.status == "success"])
+ failed = len([r for r in results if r.status == "failed"])
+
+ # Print summary
+ console.print("\n[green]Parallel processing completed:[/green]")
+ console.print(f"✓ Successful: {successful}")
+ console.print(f"✗ Failed: {failed}")
+
+ # Save report if enabled
+ if config.save_reports:
+ self._retry_stats = {
+ "initial_failures": failed,
+ "failure_ratio": (failed / len(urls)) * 100,
+ "retry_successful": 0, # No retries in server parallel mode
+ "retry_failed": failed
+ }
+ _save_report_sync(results, config, self._retry_stats)
+ console.print(f"📊 Report saved to: {config.report_file}")
+
+ return results
+
+ except Exception as e:
+ console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
+ # Create failed results for all URLs
+ return [
+ CrawlResult(
+ url=url,
+ status="failed",
+ error=str(e),
+ config=config.to_dict()
+ ) for url in urls
+ ]
+
+ def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+ """
+ Crawl sitemap URLs using server-side parallel processing.
+ """
+ console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+
+ try:
+ response = requests.get(sitemap_url, timeout=config.timeout)
+ response.raise_for_status()
+ root = ET.fromstring(response.text)
+ namespace = {'ns': root.tag.split('}')[0].strip('{')}
+ urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+ console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+
+ # Process URLs using server-side parallel endpoint
+ return self.crawl_urls_server_parallel(urls, config)
+
+ except Exception as e:
+ console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
+ raise
+
  async def __aenter__(self):
  """Async context manager entry."""
  await self._ensure_session()
@@ -854,3 +891,7 @@ class SpiderForce4AI:
  """Sync context manager exit."""
  self._executor.shutdown(wait=True)
 
+ # Version info
+ #__version__ = "2.3.1"
+ #__author__ = "Piotr Tamulewicz"
+ #__email__ = "pt@petertam.pro"
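
For reference, the main addition in this release is the post-extraction agent wiring on CrawlConfig. A minimal sketch of how the new fields might be used follows; the service URL, model name, message format, output file path and transformer signature are assumptions for illustration, not values taken from this diff.

    from pathlib import Path
    from spiderforce4ai import SpiderForce4AI, CrawlConfig

    def transform(result):
        # Hypothetical transformer; the exact signature expected by
        # PostExtractionAgent is not shown in this diff.
        return result

    config = CrawlConfig(
        output_dir=Path("./markdown"),
        post_extraction_agent={
            "model": "gpt-4o-mini",        # assumed model name
            "api_key": "sk-...",           # your LLM API key (required by __post_init__)
            "messages": [                  # assumed OpenAI-style message list
                {"role": "system", "content": "Extract the page title and a short summary as JSON."}
            ],
            "max_tokens": 1000,
            "temperature": 0.7,
        },
        post_extraction_agent_save_to_file="extraction_results.json",  # illustrative path
        post_agent_transformer_function=transform,
    )

    spider = SpiderForce4AI("http://localhost:3004")  # assumed service URL and constructor
    results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

The "model", "messages" and "api_key" keys are the ones validated in CrawlConfig.__post_init__ above; extraction output is attached to each CrawlResult as extraction_result and forwarded in webhook payloads.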