spiderforce4ai-2.1-py3-none-any.whl → spiderforce4ai-2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,11 @@
  # spiderforce4ai/__init__.py

+ from .post_extraction_agent import PostExtractionAgent, PostExtractionConfig, ExtractionTemplate
  import asyncio
  import aiohttp
  import json
  import logging
- from typing import List, Dict, Union, Optional, Tuple
+ from typing import List, Dict, Union, Optional, Tuple, Callable, Any
  from dataclasses import dataclass, asdict
  from urllib.parse import urljoin, urlparse
  from pathlib import Path
@@ -23,75 +24,55 @@ from multiprocessing import Pool
  console = Console()

  def extract_metadata_headers(markdown: str, url: str = '') -> str:
- """Extract metadata and headers from markdown content with enhanced SEO formatting."""
+ """Extract metadata and headers from markdown content."""
  lines = markdown.split('\n')
- extracted = []
- in_metadata = False
- metadata = {
- 'title': '',
- 'description': '',
- 'canonical_url': '',
- 'language': ''
- }
- first_paragraph = ''
+ metadata = {}
+ headers = []

- # First pass - collect metadata and first paragraph
- for i, line in enumerate(lines):
- # Check for metadata block boundaries
- if line.strip() == '---':
- if not in_metadata:
- in_metadata = True
- continue
- else:
- in_metadata = False
- break
+ def parse_metadata_line(line):
+ """Parse a single metadata line correctly."""
+ first_colon = line.find(':')
+ if first_colon == -1:
+ return None, None
+
+ key = line[:first_colon].strip()
+ value = line[first_colon + 1:].strip()

- # Extract metadata within the block
- if in_metadata:
- if ':' in line:
- key, value = [part.strip() for part in line.split(':', 1)]
- key = key.lower()
-
- # Handle multi-line values
- if value.startswith('>'):
- value = value[1:].strip()
- j = i + 1
- while j < len(lines) and lines[j].strip() and not lines[j].strip() == '---':
- value += ' ' + lines[j].strip()
- j += 1
-
- if key == 'title':
- metadata['title'] = value
- elif key in ['description', 'meta_description', 'og:description', 'meta-description']:
- metadata['description'] = value
- elif key in ['canonical_url', 'canonical']:
- metadata['canonical_url'] = value
- elif key in ['language', 'lang']:
- metadata['language'] = value
- elif not in_metadata and not first_paragraph and line.strip() and not line.startswith('#'):
- first_paragraph = line.strip()
-
- # Use first paragraph as fallback description if none found
- if not metadata['description'] and first_paragraph:
- metadata['description'] = first_paragraph[:160] + ('...' if len(first_paragraph) > 160 else '')
-
- # Add formatted metadata section
- extracted.append(f"URL: {url}")
- extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")
- extracted.append(f"Description: {metadata['description']}")
- extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
- extracted.append(f"Language: {metadata['language'] or 'en'}")
- extracted.append("") # Empty line after metadata
+ # Handle the case where value starts with "URL:" - this means it's a missing description
+ if value.startswith('URL:'):
+ return key, ''
+
+ return key, value

- # Second pass - process headers
+ # Process each line
  for line in lines:
- if line.strip().startswith('#'):
+ line = line.strip()
+ if not line:
+ continue
+
+ # Check if it's a metadata line (contains : but isn't a header)
+ if ':' in line and not line.startswith('#'):
+ key, value = parse_metadata_line(line)
+ if key:
+ metadata[key] = value
+ # Check if it's a header
+ elif line.startswith('#'):
  level = len(line) - len(line.lstrip('#'))
  text = line.lstrip('#').strip()
  if 1 <= level <= 6:
- extracted.append(f"H{level}: {text}")
+ headers.append(f"H{level}: {text}")

- return '\n'.join(extracted)
+ # Construct output
+ output = []
+ output.append(f"URL: {url}")
+ output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
+ output.append(f"Description: {metadata.get('Description', '')}")
+ output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
+ output.append(f"Language: {metadata.get('Language', 'en')}")
+ output.append("") # Empty line
+ output.extend(headers)
+
+ return '\n'.join(output)

  def slugify(url: str) -> str:
  """Convert URL to a valid filename."""
@@ -111,6 +92,7 @@ class CrawlResult:
  error: Optional[str] = None
  timestamp: str = None
  config: Dict = None
+ extraction_result: Optional[Dict] = None # Store post-extraction results

  def __post_init__(self):
  if not self.timestamp:
@@ -131,9 +113,14 @@ class CrawlConfig:
  webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
  webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
  save_reports: bool = False # Whether to save crawl reports
- report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
- combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers' to combine all pages into one file
- combined_markdown_file: Optional[Path] = None # Optional path for combined markdown file
+ report_file: Optional[Path] = None # Optional report file location
+ combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers'
+ combined_markdown_file: Optional[Path] = None # Optional path for combined file
+
+ # Post-extraction settings
+ post_extraction_agent: Optional[Dict[str, Any]] = None # LLM configuration
+ post_extraction_agent_save_to_file: Optional[str] = None # Extraction output file
+ post_agent_transformer_function: Optional[Callable] = None # Custom transformer

  def __post_init__(self):
  # Initialize empty lists/dicts for None values
@@ -161,6 +148,15 @@ class CrawlConfig:
  # Create or clear the combined file
  self.combined_markdown_file.write_text('')

+ # Validate post-extraction agent configuration if provided
+ if self.post_extraction_agent:
+ if "messages" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'messages'")
+ if "model" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'model'")
+ if "api_key" not in self.post_extraction_agent:
+ raise ValueError("Post-extraction agent configuration must include 'api_key'")
+
  def to_dict(self) -> Dict:
  """Convert config to dictionary for API requests."""
  payload = {}
@@ -172,52 +168,120 @@ class CrawlConfig:
  if self.remove_selectors_regex:
  payload["remove_selectors_regex"] = self.remove_selectors_regex
  return payload
-
-
+
  def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
  """Synchronous version of webhook sender for parallel processing."""
  if not config.webhook_url:
  return

- # Use custom payload template if provided, otherwise use default
- if config.webhook_payload_template:
- # Replace variables in the template
- payload_str = config.webhook_payload_template.format(
- url=result.url,
- status=result.status,
- markdown=result.markdown if result.status == "success" else None,
- error=result.error if result.status == "failed" else None,
- timestamp=result.timestamp,
- config=config.to_dict()
+ try:
+ # Use custom payload template if provided, otherwise use default
+ if config.webhook_payload_template:
+ # Replace variables in the template
+ payload_str = config.webhook_payload_template.format(
+ url=result.url,
+ status=result.status,
+ markdown=result.markdown if result.status == "success" else None,
+ error=result.error if result.status == "failed" else None,
+ timestamp=result.timestamp,
+ config=config.to_dict(),
+ extraction_result=result.extraction_result if result.extraction_result else None
+ )
+ payload = json.loads(payload_str) # Parse the formatted JSON string
+ else:
+ # Use default payload format
+ payload = {
+ "url": result.url,
+ "status": result.status,
+ "markdown": result.markdown if result.status == "success" else None,
+ "error": result.error if result.status == "failed" else None,
+ "timestamp": result.timestamp,
+ "config": config.to_dict(),
+ "extraction_result": result.extraction_result if result.extraction_result else None
+ }
+
+ response = requests.post(
+ config.webhook_url,
+ json=payload,
+ headers=config.webhook_headers,
+ timeout=config.webhook_timeout
  )
- payload = json.loads(payload_str) # Parse the formatted JSON string
- else:
- # Use default payload format
+ response.raise_for_status()
+ except Exception as e:
+ console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+ async def _send_webhook_async(result: CrawlResult, config: CrawlConfig):
+ """Asynchronous webhook sender."""
+ if not config.webhook_url:
+ return
+
+ try:
+ # Prepare payload similar to sync version
  payload = {
  "url": result.url,
  "status": result.status,
  "markdown": result.markdown if result.status == "success" else None,
  "error": result.error if result.status == "failed" else None,
  "timestamp": result.timestamp,
- "config": config.to_dict()
+ "config": config.to_dict(),
+ "extraction_result": result.extraction_result if result.extraction_result else None
  }

+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ config.webhook_url,
+ json=payload,
+ headers=config.webhook_headers,
+ timeout=config.webhook_timeout
+ )
+ response.raise_for_status()
+ except Exception as e:
+ console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+ async def _save_markdown_async(url: str, markdown: str, config: CrawlConfig):
+ """Save markdown content to file and/or append to combined file asynchronously."""
  try:
- response = requests.post(
- config.webhook_url,
- json=payload,
- headers=config.webhook_headers,
- timeout=config.webhook_timeout
- )
- response.raise_for_status()
+ # Save individual file if not combining or if combining in full mode
+ if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+ filename = f"{slugify(url)}.md"
+ filepath = config.output_dir / filename
+ async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+ await f.write(markdown)
+
+ # Handle combined markdown file
+ if config.combine_to_one_markdown:
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+ async with aiofiles.open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+ await f.write(combined_content)
+ except Exception as e:
+ console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
+
+ def _save_markdown_sync(url: str, markdown: str, config: CrawlConfig) -> None:
+ """Synchronous version of markdown saver for parallel processing."""
+ try:
+ # Save individual file if not combining or if combining in full mode
+ if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+ filepath = config.output_dir / f"{slugify(url)}.md"
+ with open(filepath, 'w', encoding='utf-8') as f:
+ f.write(markdown)
+
+ # Handle combined markdown file
+ if config.combine_to_one_markdown:
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+ with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+ f.write(combined_content)
  except Exception as e:
- print(f"Warning: Failed to send webhook for {result.url}: {str(e)}")
+ console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")

- # Module level function for multiprocessing
  def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  """Process a single URL for parallel processing."""
  url, base_url, config = args
  try:
+ # Make the conversion request
  endpoint = f"{base_url}/convert"
  payload = {
  "url": url,
@@ -232,7 +296,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  error=f"HTTP {response.status_code}: {response.text}",
  config=config.to_dict()
  )
- # Send webhook for failed result
  _send_webhook_sync(result, config)
  return result

@@ -240,19 +303,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:

  # Save markdown if output directory is configured
  if config.output_dir:
- # Save individual file if not combining or if combining in full mode
- if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
- filepath = config.output_dir / f"{slugify(url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(markdown)
-
- # Handle combined markdown file
- if config.combine_to_one_markdown:
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
- combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
- with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
- f.write(combined_content)
+ _save_markdown_sync(url, markdown, config)

  result = CrawlResult(
  url=url,
@@ -261,6 +312,28 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  config=config.to_dict()
  )

+ # Handle post-extraction if configured
+ if config.post_extraction_agent:
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = asyncio.run(agent.process_content(url, markdown))
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
  # Send webhook for successful result
  _send_webhook_sync(result, config)

@@ -281,6 +354,60 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
  _send_webhook_sync(result, config)
  return result

+ async def _save_report_async(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None):
+ """Save crawl report to JSON file asynchronously."""
+ if not config.report_file:
+ return
+
+ # Separate successful and failed results
+ successful_results = [r for r in results if r.status == "success"]
+ failed_results = [r for r in results if r.status == "failed"]
+
+ report = {
+ "timestamp": datetime.now().isoformat(),
+ "config": config.to_dict(),
+ "results": {
+ "successful": [asdict(r) for r in successful_results],
+ "failed": [asdict(r) for r in failed_results]
+ },
+ "summary": {
+ "total": len(results),
+ "successful": len(successful_results),
+ "failed": len(failed_results),
+ "retry_info": retry_stats or {}
+ }
+ }
+
+ async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+ await f.write(json.dumps(report, indent=2))
+
+ def _save_report_sync(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None) -> None:
+ """Synchronous version of report saver."""
+ if not config.report_file:
+ return
+
+ # Create report similar to async version
+ successful_results = [r for r in results if r.status == "success"]
+ failed_results = [r for r in results if r.status == "failed"]
+
+ report = {
+ "timestamp": datetime.now().isoformat(),
+ "config": config.to_dict(),
+ "results": {
+ "successful": [asdict(r) for r in successful_results],
+ "failed": [asdict(r) for r in failed_results]
+ },
+ "summary": {
+ "total": len(results),
+ "successful": len(successful_results),
+ "failed": len(failed_results),
+ "retry_info": retry_stats or {}
+ }
+ }
+
+ with open(config.report_file, 'w', encoding='utf-8') as f:
+ json.dump(report, f, indent=2)
+
  class SpiderForce4AI:
  """Main class for interacting with SpiderForce4AI service."""

@@ -289,6 +416,7 @@ class SpiderForce4AI:
  self.session = None
  self._executor = ThreadPoolExecutor()
  self.crawl_results: List[CrawlResult] = []
+ self._retry_stats = {}

  async def _ensure_session(self):
  """Ensure aiohttp session exists."""
@@ -300,215 +428,6 @@ class SpiderForce4AI:
  if self.session and not self.session.closed:
  await self.session.close()

- async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
- """Save markdown content to file and/or append to combined file."""
- # Save individual file if not combining or if combining in full mode
- if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
- filename = f"{slugify(url)}.md"
- filepath = output_dir / filename
- async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
- await f.write(markdown)
-
- # Handle combined markdown file
- if self.config.combine_to_one_markdown:
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
- combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
- async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
- await f.write(combined_content)
-
-
-
- def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
- """
- Crawl sitemap URLs using server-side parallel processing.
- """
- print(f"Fetching sitemap from {sitemap_url}...")
-
- # Fetch sitemap
- try:
- response = requests.get(sitemap_url, timeout=config.timeout)
- response.raise_for_status()
- sitemap_text = response.text
- except Exception as e:
- print(f"Error fetching sitemap: {str(e)}")
- raise
-
- # Parse sitemap
- try:
- root = ET.fromstring(sitemap_text)
- namespace = {'ns': root.tag.split('}')[0].strip('{')}
- urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
- print(f"Found {len(urls)} URLs in sitemap")
- except Exception as e:
- print(f"Error parsing sitemap: {str(e)}")
- raise
-
- # Process URLs using server-side parallel endpoint
- return self.crawl_urls_server_parallel(urls, config)
-
-
- def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
- """
- Crawl multiple URLs using server-side parallel processing.
- This uses the /convert_parallel endpoint which handles parallelization on the server.
- """
- print(f"Sending {len(urls)} URLs for parallel processing...")
-
- try:
- endpoint = f"{self.base_url}/convert_parallel"
-
- # Prepare payload
- payload = {
- "urls": urls,
- **config.to_dict()
- }
-
- # Send request
- response = requests.post(
- endpoint,
- json=payload,
- timeout=config.timeout
- )
- response.raise_for_status()
-
- # Process results
- results = []
- server_results = response.json() # Assuming server returns JSON array of results
-
- for url_result in server_results:
- result = CrawlResult(
- url=url_result["url"],
- status=url_result.get("status", "failed"),
- markdown=url_result.get("markdown"),
- error=url_result.get("error"),
- config=config.to_dict()
- )
-
- # Save markdown if successful and output dir is configured
- if result.status == "success" and config.output_dir and result.markdown:
- filepath = config.output_dir / f"{slugify(result.url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(result.markdown)
-
- # Send webhook if configured
- if config.webhook_url:
- _send_webhook_sync(result, config)
-
- results.append(result)
-
- # Calculate statistics
- successful = len([r for r in results if r.status == "success"])
- failed = len([r for r in results if r.status == "failed"])
-
- # Print summary
- print(f"\nParallel processing completed:")
- print(f"✓ Successful: {successful}")
- print(f"✗ Failed: {failed}")
-
- # Save report if enabled
- if config.save_reports and config.report_file:
- self._retry_stats = {
- "initial_failures": failed,
- "failure_ratio": (failed / len(urls)) * 100,
- "retry_successful": 0, # No retries in server parallel mode
- "retry_failed": failed
- }
- self._save_report_sync(results, config)
- console.print(f"📊 Report saved to: {config.report_file}")
-
- return results
-
- except Exception as e:
- print(f"Error during parallel processing: {str(e)}")
- # Create failed results for all URLs
- return [
- CrawlResult(
- url=url,
- status="failed",
- error=str(e),
- config=config.to_dict()
- ) for url in urls
- ]
-
-
- async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
- """Send webhook with crawl results."""
- if not config.webhook_url:
- return
-
- payload = {
- "url": result.url,
- "status": result.status,
- "markdown": result.markdown if result.status == "success" else None,
- "error": result.error if result.status == "failed" else None,
- "timestamp": result.timestamp,
- "config": config.to_dict()
- }
-
- try:
- async with httpx.AsyncClient() as client:
- response = await client.post(
- config.webhook_url,
- json=payload,
- timeout=config.webhook_timeout
- )
- response.raise_for_status()
- except Exception as e:
- console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
-
- def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
- """Save crawl report synchronously."""
- # Separate successful and failed results
- successful_results = [r for r in results if r.status == "success"]
- failed_results = [r for r in results if r.status == "failed"]
-
- # Create report with only final state
- report = {
- "timestamp": datetime.now().isoformat(),
- "config": config.to_dict(),
- "results": {
- "successful": [asdict(r) for r in successful_results],
- "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
- },
- "summary": {
- "total": len(results),
- "successful": len(successful_results),
- "failed": len(failed_results),
- "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
- }
- }
-
- with open(config.report_file, 'w', encoding='utf-8') as f:
- json.dump(report, f, indent=2)
-
- async def _save_report(self, config: CrawlConfig):
- """Save crawl report to JSON file."""
- if not config.report_file:
- return
-
- # Separate successful and failed results
- successful_results = [r for r in self.crawl_results if r.status == "success"]
- failed_results = [r for r in self.crawl_results if r.status == "failed"]
-
- report = {
- "timestamp": datetime.now().isoformat(),
- "config": config.to_dict(),
- "results": {
- "successful": [asdict(r) for r in successful_results],
- "failed": [asdict(r) for r in failed_results] # Only truly failed URLs after retries
- },
- "summary": {
- "total": len(self.crawl_results),
- "successful": len(successful_results),
- "failed": len(failed_results),
- "retry_info": getattr(self, '_retry_stats', {}) # Include retry statistics if available
- }
- }
-
- async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
- await f.write(json.dumps(report, indent=2))
-
  async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
  """Crawl a single URL asynchronously."""
  await self._ensure_session()
@@ -539,9 +458,31 @@ class SpiderForce4AI:
  )

  if config.output_dir:
- await self._save_markdown(url, markdown, config.output_dir)
+ await _save_markdown_async(url, markdown, config)

- await self._send_webhook(result, config)
+ # Handle post-extraction if configured
+ if config.post_extraction_agent and result.status == "success":
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = await agent.process_content(url, markdown)
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
+ await _send_webhook_async(result, config)

  self.crawl_results.append(result)
  return result
@@ -561,18 +502,18 @@ class SpiderForce4AI:
  return asyncio.run(self.crawl_url_async(url, config))

  async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
- """Retry failed URLs once."""
+ """Retry failed URLs with optional progress tracking."""
  if not failed_results:
  return []

  failed_count = len(failed_results)
- total_count = len([r for r in self.crawl_results])
+ total_count = len(self.crawl_results)
  failure_ratio = (failed_count / total_count) * 100

  console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
  retry_results = []

- # Create a new progress bar if one wasn't provided
+ # Create or use provided progress bar
  should_close_progress = progress is None
  if progress is None:
  progress = Progress(
@@ -616,6 +557,7 @@ class SpiderForce4AI:
  async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
  """Crawl multiple URLs asynchronously with progress bar."""
  await self._ensure_session()
+ post_extraction_results = {}

  with Progress(
  SpinnerColumn(),
@@ -624,52 +566,60 @@ class SpiderForce4AI:
  TaskProgressColumn(),
  console=console
  ) as progress:
- task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
+ crawl_task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))

  async def crawl_with_progress(url):
  result = await self.crawl_url_async(url, config)
- progress.update(task, advance=1, description=f"[cyan]Crawled: {url}")
+ progress.update(crawl_task, advance=1, description=f"[cyan]Crawled: {url}")
  return result

+ # Set up concurrency control
  semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
  async def crawl_with_semaphore(url):
  async with semaphore:
  result = await crawl_with_progress(url)
  await asyncio.sleep(config.request_delay)
  return result

+ # Perform initial crawl
  initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])

- # Identify failed URLs
+ # Handle failed URLs
  failed_results = [r for r in initial_results if r.status == "failed"]
-
- # Calculate initial failure ratio
  initial_failed = len(failed_results)
  total_urls = len(urls)
  failure_ratio = (initial_failed / total_urls) * 100

  # Retry failed URLs if ratio is acceptable
- if failed_results:
- if failure_ratio > 20:
- console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
- results = initial_results
- else:
- retry_results = await self._retry_failed_urls(failed_results, config, progress)
- # Update results list by replacing failed results with successful retries
- results = initial_results.copy()
- for retry_result in retry_results:
- for i, result in enumerate(results):
- if result.url == retry_result.url:
- results[i] = retry_result
- break
- else:
- results = initial_results
+ results = initial_results
+ retry_successful = 0

- # Calculate final statistics before saving report
+ if failed_results and failure_ratio <= 20:
+ retry_results = await self._retry_failed_urls(failed_results, config, progress)
+ retry_successful = len([r for r in retry_results if r.status == "success"])
+
+ # Update results list
+ for retry_result in retry_results:
+ for i, result in enumerate(results):
+ if result.url == retry_result.url:
+ results[i] = retry_result
+ break
+
+ # Calculate final statistics
  final_successful = len([r for r in results if r.status == "success"])
  final_failed = len([r for r in results if r.status == "failed"])

- # Print detailed summary
+ # Update retry stats
+ self._retry_stats = {
+ "initial_failures": initial_failed,
+ "failure_ratio": failure_ratio,
+ "retry_successful": retry_successful if initial_failed > 0 else 0,
+ "retry_failed": final_failed,
+ "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+ }
+
+ # Print summary
  console.print(f"\n[green]Crawling Summary:[/green]")
  console.print(f"Total URLs processed: {total_urls}")
  console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
@@ -678,18 +628,11 @@ class SpiderForce4AI:
  console.print(f" ✗ Failed: {final_failed}")

  if initial_failed > 0:
- retry_successful = initial_failed - final_failed
  console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")

- # Save final report after all retries are complete
+ # Save final report
  if config.save_reports:
- self._retry_stats = {
- "initial_failures": initial_failed,
- "failure_ratio": failure_ratio,
- "retry_successful": retry_successful if initial_failed > 0 else 0,
- "retry_failed": final_failed
- }
- await self._save_report(config)
+ await _save_report_async(results, config, self._retry_stats)
  console.print(f"📊 Report saved to: {config.report_file}")

  return results
@@ -726,32 +669,21 @@ class SpiderForce4AI:
  return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))

  def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
- """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
- print(f"Fetching sitemap from {sitemap_url}...")
-
- # Fetch sitemap
+ """Crawl sitemap URLs in parallel using multiprocessing."""
+ # Fetch and parse sitemap
  try:
  response = requests.get(sitemap_url, timeout=config.timeout)
  response.raise_for_status()
- sitemap_text = response.text
- except Exception as e:
- print(f"Error fetching sitemap: {str(e)}")
- raise
-
- # Parse sitemap
- try:
- root = ET.fromstring(sitemap_text)
+ root = ET.fromstring(response.text)
  namespace = {'ns': root.tag.split('}')[0].strip('{')}
  urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
- print(f"Found {len(urls)} URLs in sitemap")
+ console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
  except Exception as e:
- print(f"Error parsing sitemap: {str(e)}")
+ console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
  raise

- # Prepare arguments for parallel processing
+ # Process URLs in parallel
  process_args = [(url, self.base_url, config) for url in urls]
-
- # Create process pool and execute crawls
  results = []

  with Pool(processes=config.max_concurrent_requests) as pool:
@@ -762,81 +694,186 @@ class SpiderForce4AI:
  TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
  TextColumn("({task.completed}/{task.total})"),
  ) as progress:
- task = progress.add_task("Crawling URLs...", total=len(urls))
+ task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))

  for result in pool.imap_unordered(_process_url_parallel, process_args):
  results.append(result)
  progress.update(task, advance=1)
  status = "✓" if result.status == "success" else "✗"
- progress.description = f"Last: {status} {result.url}"
+ progress.description = f"[cyan]Last: {status} {result.url}"

- # Calculate initial failure statistics
+ # Calculate statistics and handle retries
  failed_results = [r for r in results if r.status == "failed"]
  initial_failed = len(failed_results)
- total_urls = len(urls)
- failure_ratio = (initial_failed / total_urls) * 100
+ failure_ratio = (initial_failed / len(urls)) * 100
+ retry_successful = 0

- # Retry failed URLs if ratio is acceptable
- if failed_results:
- if failure_ratio > 20:
- console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
- else:
- failed_count = len(failed_results)
- failure_ratio = (failed_count / total_urls) * 100
- console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
- for result in failed_results:
- new_result = _process_url_parallel((result.url, self.base_url, config))
-
- # Save markdown and trigger webhook for successful retries
- if new_result.status == "success":
- console.print(f"[green] Retry successful: {result.url}[/green]")
- # Save markdown if output directory is configured
- if config.output_dir and new_result.markdown:
- filepath = config.output_dir / f"{slugify(new_result.url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(new_result.markdown)
- # Send webhook for successful retry
- _send_webhook_sync(new_result, config)
- else:
- console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
- # Send webhook for failed retry
- _send_webhook_sync(new_result, config)
-
- # Update results list
- for i, r in enumerate(results):
- if r.url == new_result.url:
- results[i] = new_result
- break
+ if failed_results and failure_ratio <= 20:
+ console.print(f"\n[yellow]Retrying {initial_failed} failed URLs...[/yellow]")
+ for result in failed_results:
+ new_result = _process_url_parallel((result.url, self.base_url, config))
+ if new_result.status == "success":
+ retry_successful += 1
+ console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+ else:
+ console.print(f"[red]✗ Retry failed: {result.url}[/red]")
+
+ # Update results list
+ for i, r in enumerate(results):
+ if r.url == new_result.url:
+ results[i] = new_result
+ break

  # Calculate final statistics
  final_successful = len([r for r in results if r.status == "success"])
  final_failed = len([r for r in results if r.status == "failed"])

- # Print detailed summary
+ # Print summary
  console.print(f"\n[green]Crawling Summary:[/green]")
- console.print(f"Total URLs processed: {total_urls}")
+ console.print(f"Total URLs processed: {len(urls)}")
  console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
  console.print(f"Final results:")
  console.print(f" ✓ Successful: {final_successful}")
  console.print(f" ✗ Failed: {final_failed}")
-
+
  if initial_failed > 0:
- retry_successful = initial_failed - final_failed
  console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")

- # Save final report after all retries are complete
+ # Save report
  if config.save_reports:
  self._retry_stats = {
  "initial_failures": initial_failed,
  "failure_ratio": failure_ratio,
- "retry_successful": retry_successful if initial_failed > 0 else 0,
+ "retry_successful": retry_successful,
  "retry_failed": final_failed
  }
- self._save_report_sync(results, config)
+ _save_report_sync(results, config, self._retry_stats)
  console.print(f"📊 Report saved to: {config.report_file}")

  return results

+ def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+ """
+ Crawl multiple URLs using server-side parallel processing.
+ This uses the /convert_parallel endpoint which handles parallelization on the server.
+ """
+ console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
+
+ try:
+ endpoint = f"{self.base_url}/convert_parallel"
+
+ # Prepare payload
+ payload = {
+ "urls": urls,
+ **config.to_dict()
+ }
+
+ # Send request
+ response = requests.post(
+ endpoint,
+ json=payload,
+ timeout=config.timeout
+ )
+ response.raise_for_status()
+
+ # Process results
+ results = []
+ server_results = response.json()
+
+ for url_result in server_results:
+ result = CrawlResult(
+ url=url_result["url"],
+ status=url_result.get("status", "failed"),
+ markdown=url_result.get("markdown"),
+ error=url_result.get("error"),
+ config=config.to_dict()
+ )
+
+ # Save markdown if successful and output dir is configured
+ if result.status == "success" and config.output_dir and result.markdown:
+ _save_markdown_sync(result.url, result.markdown, config)
+
+ # Handle post-extraction if configured
+ if config.post_extraction_agent and result.status == "success":
+ try:
+ post_config = PostExtractionConfig(
+ model=config.post_extraction_agent["model"],
+ messages=config.post_extraction_agent["messages"],
+ api_key=config.post_extraction_agent["api_key"],
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
+ base_url=config.post_extraction_agent.get("base_url"),
+ combine_output=bool(config.post_extraction_agent_save_to_file),
+ output_file=config.post_extraction_agent_save_to_file,
+ custom_transform_function=config.post_agent_transformer_function
+ )
+
+ agent = PostExtractionAgent(post_config)
+ extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+ if extraction_result:
+ result.extraction_result = extraction_result
+ except Exception as e:
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+
+ # Send webhook if configured
+ _send_webhook_sync(result, config)
+ results.append(result)
+
+ # Calculate statistics
+ successful = len([r for r in results if r.status == "success"])
+ failed = len([r for r in results if r.status == "failed"])
+
+ # Print summary
+ console.print("\n[green]Parallel processing completed:[/green]")
+ console.print(f"✓ Successful: {successful}")
+ console.print(f"✗ Failed: {failed}")
+
+ # Save report if enabled
+ if config.save_reports:
+ self._retry_stats = {
+ "initial_failures": failed,
+ "failure_ratio": (failed / len(urls)) * 100,
+ "retry_successful": 0, # No retries in server parallel mode
+ "retry_failed": failed
+ }
+ _save_report_sync(results, config, self._retry_stats)
+ console.print(f"📊 Report saved to: {config.report_file}")
+
+ return results
+
+ except Exception as e:
+ console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
+ # Create failed results for all URLs
+ return [
+ CrawlResult(
+ url=url,
+ status="failed",
+ error=str(e),
+ config=config.to_dict()
+ ) for url in urls
+ ]
+
+ def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+ """
+ Crawl sitemap URLs using server-side parallel processing.
+ """
+ console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+
+ try:
+ response = requests.get(sitemap_url, timeout=config.timeout)
+ response.raise_for_status()
+ root = ET.fromstring(response.text)
+ namespace = {'ns': root.tag.split('}')[0].strip('{')}
+ urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+ console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+
+ # Process URLs using server-side parallel endpoint
+ return self.crawl_urls_server_parallel(urls, config)
+
+ except Exception as e:
+ console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
+ raise
+
  async def __aenter__(self):
  """Async context manager entry."""
  await self._ensure_session()
@@ -854,3 +891,7 @@ class SpiderForce4AI:
  """Sync context manager exit."""
  self._executor.shutdown(wait=True)

+ # Version info
+ #__version__ = "2.3.1"
+ #__author__ = "Piotr Tamulewicz"
+ #__email__ = "pt@petertam.pro"
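
For orientation, the sketch below shows how the post-extraction options added in this release might be wired up. It is an illustration, not taken from the package documentation: the service address, model name, sitemap URL, message content, and the transformer signature are assumptions, and the remaining CrawlConfig fields are assumed to keep their defaults.

```python
# Hypothetical usage sketch; field and method names follow the diff above.
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

def transform(extraction):
    # Assumed signature: receives whatever PostExtractionAgent.process_content() returns.
    return extraction

config = CrawlConfig(
    output_dir=Path("./spiderforce_output"),        # assumed writable directory
    post_extraction_agent={
        "model": "gpt-4o-mini",                     # "model" is required by CrawlConfig.__post_init__
        "messages": [{"role": "system",
                      "content": "Summarize the page as JSON."}],  # "messages" is required
        "api_key": "YOUR_API_KEY",                  # "api_key" is required
        "max_tokens": 1000,                         # optional; default shown in the diff
        "temperature": 0.7,                         # optional; default shown in the diff
    },
    post_extraction_agent_save_to_file="extraction_results.json",
    post_agent_transformer_function=transform,
)

crawler = SpiderForce4AI("http://localhost:3004")   # assumed SpiderForce4AI service address
results = crawler.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
for r in results:
    print(r.url, r.status, r.extraction_result)
```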