spiderforce4ai 2.1-py3-none-any.whl → 2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +449 -408
- spiderforce4ai/post_extraction_agent.py +259 -0
- {spiderforce4ai-2.1.dist-info → spiderforce4ai-2.4.dist-info}/METADATA +41 -3
- spiderforce4ai-2.4.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/entry_points.txt +2 -0
- spiderforce4ai-2.1.dist-info/RECORD +0 -5
- {spiderforce4ai-2.1.dist-info → spiderforce4ai-2.4.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.1.dist-info → spiderforce4ai-2.4.dist-info}/top_level.txt +0 -0
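The headline change in 2.4 is the post-extraction agent: `CrawlConfig` gains `post_extraction_agent`, `post_extraction_agent_save_to_file`, and `post_agent_transformer_function`, and validation requires the `model`, `messages`, and `api_key` keys. A minimal usage sketch follows, based only on the fields visible in the diff below; the service URL, model name, API key, sitemap URL, and the transformer signature are placeholders or assumptions, not documented API.

```python
# Hypothetical usage sketch for the 2.4 post-extraction agent.
# Field names come from the diff below; URLs, model name, API key, and the
# transformer signature are placeholders/assumptions.
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

def add_source_tag(url, extraction):
    # custom transformer (assumed signature: url + extracted dict)
    extraction["source_url"] = url
    return extraction

config = CrawlConfig(
    output_dir=Path("./crawl_output"),
    post_extraction_agent={              # validated keys: model, messages, api_key
        "model": "gpt-4o-mini",          # placeholder model name
        "api_key": "sk-...",             # placeholder key
        "messages": [
            {"role": "system", "content": "Extract title and topics as JSON."}
        ],
        "max_tokens": 1000,              # optional; defaults shown in the diff
        "temperature": 0.7,
    },
    post_extraction_agent_save_to_file="extraction_results.json",
    post_agent_transformer_function=add_source_tag,
)

spider = SpiderForce4AI("http://localhost:3004")   # placeholder service URL
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
```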
spiderforce4ai/__init__.py
CHANGED
@@ -1,10 +1,11 @@
 # spiderforce4ai/__init__.py
 
+from .post_extraction_agent import PostExtractionAgent, PostExtractionConfig, ExtractionTemplate
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional, Tuple
+from typing import List, Dict, Union, Optional, Tuple, Callable, Any
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -23,75 +24,55 @@ from multiprocessing import Pool
 console = Console()
 
 def extract_metadata_headers(markdown: str, url: str = '') -> str:
-    """Extract metadata and headers from markdown content
+    """Extract metadata and headers from markdown content."""
     lines = markdown.split('\n')
-
-
-    metadata = {
-        'title': '',
-        'description': '',
-        'canonical_url': '',
-        'language': ''
-    }
-    first_paragraph = ''
+    metadata = {}
+    headers = []
 
-
-
-
-    if
-
-
-
-
-            in_metadata = False
-            break
+    def parse_metadata_line(line):
+        """Parse a single metadata line correctly."""
+        first_colon = line.find(':')
+        if first_colon == -1:
+            return None, None
+
+        key = line[:first_colon].strip()
+        value = line[first_colon + 1:].strip()
 
-        #
-        if
-
-
-
-
-            # Handle multi-line values
-            if value.startswith('>'):
-                value = value[1:].strip()
-                j = i + 1
-                while j < len(lines) and lines[j].strip() and not lines[j].strip() == '---':
-                    value += ' ' + lines[j].strip()
-                    j += 1
-
-            if key == 'title':
-                metadata['title'] = value
-            elif key in ['description', 'meta_description', 'og:description', 'meta-description']:
-                metadata['description'] = value
-            elif key in ['canonical_url', 'canonical']:
-                metadata['canonical_url'] = value
-            elif key in ['language', 'lang']:
-                metadata['language'] = value
-        elif not in_metadata and not first_paragraph and line.strip() and not line.startswith('#'):
-            first_paragraph = line.strip()
-
-    # Use first paragraph as fallback description if none found
-    if not metadata['description'] and first_paragraph:
-        metadata['description'] = first_paragraph[:160] + ('...' if len(first_paragraph) > 160 else '')
-
-    # Add formatted metadata section
-    extracted.append(f"URL: {url}")
-    extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")
-    extracted.append(f"Description: {metadata['description']}")
-    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
-    extracted.append(f"Language: {metadata['language'] or 'en'}")
-    extracted.append("")  # Empty line after metadata
+        # Handle the case where value starts with "URL:" - this means it's a missing description
+        if value.startswith('URL:'):
+            return key, ''
+
+        return key, value
 
-    #
+    # Process each line
     for line in lines:
-
+        line = line.strip()
+        if not line:
+            continue
+
+        # Check if it's a metadata line (contains : but isn't a header)
+        if ':' in line and not line.startswith('#'):
+            key, value = parse_metadata_line(line)
+            if key:
+                metadata[key] = value
+        # Check if it's a header
+        elif line.startswith('#'):
             level = len(line) - len(line.lstrip('#'))
             text = line.lstrip('#').strip()
             if 1 <= level <= 6:
-
+                headers.append(f"H{level}: {text}")
 
-
+    # Construct output
+    output = []
+    output.append(f"URL: {url}")
+    output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
+    output.append(f"Description: {metadata.get('Description', '')}")
+    output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
+    output.append(f"Language: {metadata.get('Language', 'en')}")
+    output.append("")  # Empty line
+    output.extend(headers)
+
+    return '\n'.join(output)
 
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
@@ -111,6 +92,7 @@ class CrawlResult:
     error: Optional[str] = None
     timestamp: str = None
     config: Dict = None
+    extraction_result: Optional[Dict] = None  # Store post-extraction results
 
     def __post_init__(self):
         if not self.timestamp:
@@ -131,9 +113,14 @@ class CrawlConfig:
     webhook_headers: Optional[Dict[str, str]] = None  # Optional webhook headers
     webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
     save_reports: bool = False  # Whether to save crawl reports
-    report_file: Optional[Path] = None  # Optional report file location
-    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers'
-    combined_markdown_file: Optional[Path] = None  # Optional path for combined
+    report_file: Optional[Path] = None  # Optional report file location
+    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers'
+    combined_markdown_file: Optional[Path] = None  # Optional path for combined file
+
+    # Post-extraction settings
+    post_extraction_agent: Optional[Dict[str, Any]] = None  # LLM configuration
+    post_extraction_agent_save_to_file: Optional[str] = None  # Extraction output file
+    post_agent_transformer_function: Optional[Callable] = None  # Custom transformer
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
@@ -161,6 +148,15 @@ class CrawlConfig:
             # Create or clear the combined file
             self.combined_markdown_file.write_text('')
 
+        # Validate post-extraction agent configuration if provided
+        if self.post_extraction_agent:
+            if "messages" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'messages'")
+            if "model" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'model'")
+            if "api_key" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'api_key'")
+
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
         payload = {}
@@ -172,52 +168,120 @@ class CrawlConfig:
         if self.remove_selectors_regex:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
-
-
+
 def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     """Synchronous version of webhook sender for parallel processing."""
     if not config.webhook_url:
         return
 
-
-
-
-
-
-
-
-
-
-
+    try:
+        # Use custom payload template if provided, otherwise use default
+        if config.webhook_payload_template:
+            # Replace variables in the template
+            payload_str = config.webhook_payload_template.format(
+                url=result.url,
+                status=result.status,
+                markdown=result.markdown if result.status == "success" else None,
+                error=result.error if result.status == "failed" else None,
+                timestamp=result.timestamp,
+                config=config.to_dict(),
+                extraction_result=result.extraction_result if result.extraction_result else None
+            )
+            payload = json.loads(payload_str)  # Parse the formatted JSON string
+        else:
+            # Use default payload format
+            payload = {
+                "url": result.url,
+                "status": result.status,
+                "markdown": result.markdown if result.status == "success" else None,
+                "error": result.error if result.status == "failed" else None,
+                "timestamp": result.timestamp,
+                "config": config.to_dict(),
+                "extraction_result": result.extraction_result if result.extraction_result else None
+            }
+
+        response = requests.post(
+            config.webhook_url,
+            json=payload,
+            headers=config.webhook_headers,
+            timeout=config.webhook_timeout
         )
-
-
-
+        response.raise_for_status()
+    except Exception as e:
+        console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+async def _send_webhook_async(result: CrawlResult, config: CrawlConfig):
+    """Asynchronous webhook sender."""
+    if not config.webhook_url:
+        return
+
+    try:
+        # Prepare payload similar to sync version
         payload = {
             "url": result.url,
             "status": result.status,
             "markdown": result.markdown if result.status == "success" else None,
             "error": result.error if result.status == "failed" else None,
             "timestamp": result.timestamp,
-            "config": config.to_dict()
+            "config": config.to_dict(),
+            "extraction_result": result.extraction_result if result.extraction_result else None
         }
 
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                config.webhook_url,
+                json=payload,
+                headers=config.webhook_headers,
+                timeout=config.webhook_timeout
+            )
+            response.raise_for_status()
+    except Exception as e:
+        console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+async def _save_markdown_async(url: str, markdown: str, config: CrawlConfig):
+    """Save markdown content to file and/or append to combined file asynchronously."""
     try:
-
-
-
-
-
-
-
+        # Save individual file if not combining or if combining in full mode
+        if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = config.output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if config.combine_to_one_markdown:
+            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
+    except Exception as e:
+        console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
+
+def _save_markdown_sync(url: str, markdown: str, config: CrawlConfig) -> None:
+    """Synchronous version of markdown saver for parallel processing."""
+    try:
+        # Save individual file if not combining or if combining in full mode
+        if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Handle combined markdown file
+        if config.combine_to_one_markdown:
+            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                f.write(combined_content)
     except Exception as e:
-        print(f"
+        console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
 
-# Module level function for multiprocessing
 def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
     """Process a single URL for parallel processing."""
     url, base_url, config = args
     try:
+        # Make the conversion request
         endpoint = f"{base_url}/convert"
         payload = {
             "url": url,
@@ -232,7 +296,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
                 error=f"HTTP {response.status_code}: {response.text}",
                 config=config.to_dict()
             )
-            # Send webhook for failed result
             _send_webhook_sync(result, config)
             return result
 
@@ -240,19 +303,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         # Save markdown if output directory is configured
         if config.output_dir:
-
-            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
-                filepath = config.output_dir / f"{slugify(url)}.md"
-                with open(filepath, 'w', encoding='utf-8') as f:
-                    f.write(markdown)
-
-            # Handle combined markdown file
-            if config.combine_to_one_markdown:
-                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
-                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
-                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
-                    f.write(combined_content)
+            _save_markdown_sync(url, markdown, config)
 
         result = CrawlResult(
             url=url,
@@ -261,6 +312,28 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             config=config.to_dict()
         )
 
+        # Handle post-extraction if configured
+        if config.post_extraction_agent:
+            try:
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+
+                agent = PostExtractionAgent(post_config)
+                extraction_result = asyncio.run(agent.process_content(url, markdown))
+                if extraction_result:
+                    result.extraction_result = extraction_result
+            except Exception as e:
+                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
         # Send webhook for successful result
         _send_webhook_sync(result, config)
 
@@ -281,6 +354,60 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
         _send_webhook_sync(result, config)
         return result
 
+async def _save_report_async(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None):
+    """Save crawl report to JSON file asynchronously."""
+    if not config.report_file:
+        return
+
+    # Separate successful and failed results
+    successful_results = [r for r in results if r.status == "success"]
+    failed_results = [r for r in results if r.status == "failed"]
+
+    report = {
+        "timestamp": datetime.now().isoformat(),
+        "config": config.to_dict(),
+        "results": {
+            "successful": [asdict(r) for r in successful_results],
+            "failed": [asdict(r) for r in failed_results]
+        },
+        "summary": {
+            "total": len(results),
+            "successful": len(successful_results),
+            "failed": len(failed_results),
+            "retry_info": retry_stats or {}
+        }
+    }
+
+    async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+        await f.write(json.dumps(report, indent=2))
+
+def _save_report_sync(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None) -> None:
+    """Synchronous version of report saver."""
+    if not config.report_file:
+        return
+
+    # Create report similar to async version
+    successful_results = [r for r in results if r.status == "success"]
+    failed_results = [r for r in results if r.status == "failed"]
+
+    report = {
+        "timestamp": datetime.now().isoformat(),
+        "config": config.to_dict(),
+        "results": {
+            "successful": [asdict(r) for r in successful_results],
+            "failed": [asdict(r) for r in failed_results]
+        },
+        "summary": {
+            "total": len(results),
+            "successful": len(successful_results),
+            "failed": len(failed_results),
+            "retry_info": retry_stats or {}
+        }
+    }
+
+    with open(config.report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2)
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
@@ -289,6 +416,7 @@ class SpiderForce4AI:
         self.session = None
         self._executor = ThreadPoolExecutor()
         self.crawl_results: List[CrawlResult] = []
+        self._retry_stats = {}
 
     async def _ensure_session(self):
        """Ensure aiohttp session exists."""
@@ -300,215 +428,6 @@ class SpiderForce4AI:
         if self.session and not self.session.closed:
             await self.session.close()
 
-    async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file and/or append to combined file."""
-        # Save individual file if not combining or if combining in full mode
-        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
-            filename = f"{slugify(url)}.md"
-            filepath = output_dir / filename
-            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-                await f.write(markdown)
-
-        # Handle combined markdown file
-        if self.config.combine_to_one_markdown:
-            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
-            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
-            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
-                await f.write(combined_content)
-
-
-
-    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs using server-side parallel processing.
-        """
-        print(f"Fetching sitemap from {sitemap_url}...")
-
-        # Fetch sitemap
-        try:
-            response = requests.get(sitemap_url, timeout=config.timeout)
-            response.raise_for_status()
-            sitemap_text = response.text
-        except Exception as e:
-            print(f"Error fetching sitemap: {str(e)}")
-            raise
-
-        # Parse sitemap
-        try:
-            root = ET.fromstring(sitemap_text)
-            namespace = {'ns': root.tag.split('}')[0].strip('{')}
-            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
-            print(f"Found {len(urls)} URLs in sitemap")
-        except Exception as e:
-            print(f"Error parsing sitemap: {str(e)}")
-            raise
-
-        # Process URLs using server-side parallel endpoint
-        return self.crawl_urls_server_parallel(urls, config)
-
-
-    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl multiple URLs using server-side parallel processing.
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        print(f"Sending {len(urls)} URLs for parallel processing...")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json()  # Assuming server returns JSON array of results
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    filepath = config.output_dir / f"{slugify(result.url)}.md"
-                    with open(filepath, 'w', encoding='utf-8') as f:
-                        f.write(result.markdown)
-
-                # Send webhook if configured
-                if config.webhook_url:
-                    _send_webhook_sync(result, config)
-
-                results.append(result)
-
-            # Calculate statistics
-            successful = len([r for r in results if r.status == "success"])
-            failed = len([r for r in results if r.status == "failed"])
-
-            # Print summary
-            print(f"\nParallel processing completed:")
-            print(f"✓ Successful: {successful}")
-            print(f"✗ Failed: {failed}")
-
-            # Save report if enabled
-            if config.save_reports and config.report_file:
-                self._retry_stats = {
-                    "initial_failures": failed,
-                    "failure_ratio": (failed / len(urls)) * 100,
-                    "retry_successful": 0,  # No retries in server parallel mode
-                    "retry_failed": failed
-                }
-                self._save_report_sync(results, config)
-                console.print(f"📊 Report saved to: {config.report_file}")
-
-            return results
-
-        except Exception as e:
-            print(f"Error during parallel processing: {str(e)}")
-            # Create failed results for all URLs
-            return [
-                CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=str(e),
-                    config=config.to_dict()
-                ) for url in urls
-            ]
-
-
-    async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
-        """Send webhook with crawl results."""
-        if not config.webhook_url:
-            return
-
-        payload = {
-            "url": result.url,
-            "status": result.status,
-            "markdown": result.markdown if result.status == "success" else None,
-            "error": result.error if result.status == "failed" else None,
-            "timestamp": result.timestamp,
-            "config": config.to_dict()
-        }
-
-        try:
-            async with httpx.AsyncClient() as client:
-                response = await client.post(
-                    config.webhook_url,
-                    json=payload,
-                    timeout=config.webhook_timeout
-                )
-                response.raise_for_status()
-        except Exception as e:
-            console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
-
-    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
-        """Save crawl report synchronously."""
-        # Separate successful and failed results
-        successful_results = [r for r in results if r.status == "success"]
-        failed_results = [r for r in results if r.status == "failed"]
-
-        # Create report with only final state
-        report = {
-            "timestamp": datetime.now().isoformat(),
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in successful_results],
-                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len(successful_results),
-                "failed": len(failed_results),
-                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
-            }
-        }
-
-        with open(config.report_file, 'w', encoding='utf-8') as f:
-            json.dump(report, f, indent=2)
-
-    async def _save_report(self, config: CrawlConfig):
-        """Save crawl report to JSON file."""
-        if not config.report_file:
-            return
-
-        # Separate successful and failed results
-        successful_results = [r for r in self.crawl_results if r.status == "success"]
-        failed_results = [r for r in self.crawl_results if r.status == "failed"]
-
-        report = {
-            "timestamp": datetime.now().isoformat(),
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in successful_results],
-                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
-            },
-            "summary": {
-                "total": len(self.crawl_results),
-                "successful": len(successful_results),
-                "failed": len(failed_results),
-                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
-            }
-        }
-
-        async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
-            await f.write(json.dumps(report, indent=2))
-
     async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
         """Crawl a single URL asynchronously."""
         await self._ensure_session()
@@ -539,9 +458,31 @@ class SpiderForce4AI:
         )
 
         if config.output_dir:
-            await
+            await _save_markdown_async(url, markdown, config)
 
-
+        # Handle post-extraction if configured
+        if config.post_extraction_agent and result.status == "success":
+            try:
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+
+                agent = PostExtractionAgent(post_config)
+                extraction_result = await agent.process_content(url, markdown)
+                if extraction_result:
+                    result.extraction_result = extraction_result
+            except Exception as e:
+                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
+        await _send_webhook_async(result, config)
 
         self.crawl_results.append(result)
         return result
@@ -561,18 +502,18 @@ class SpiderForce4AI:
         return asyncio.run(self.crawl_url_async(url, config))
 
     async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
-        """Retry failed URLs
+        """Retry failed URLs with optional progress tracking."""
         if not failed_results:
             return []
 
        failed_count = len(failed_results)
-        total_count = len(
+        total_count = len(self.crawl_results)
         failure_ratio = (failed_count / total_count) * 100
 
         console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
         retry_results = []
 
-        # Create
+        # Create or use provided progress bar
         should_close_progress = progress is None
         if progress is None:
             progress = Progress(
@@ -616,6 +557,7 @@ class SpiderForce4AI:
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
+        post_extraction_results = {}
 
         with Progress(
             SpinnerColumn(),
@@ -624,52 +566,60 @@ class SpiderForce4AI:
             TaskProgressColumn(),
             console=console
         ) as progress:
-
+            crawl_task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
             async def crawl_with_progress(url):
                 result = await self.crawl_url_async(url, config)
-                progress.update(
+                progress.update(crawl_task, advance=1, description=f"[cyan]Crawled: {url}")
                 return result
 
+            # Set up concurrency control
             semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
             async def crawl_with_semaphore(url):
                 async with semaphore:
                     result = await crawl_with_progress(url)
                     await asyncio.sleep(config.request_delay)
                     return result
 
+            # Perform initial crawl
             initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
 
-            #
+            # Handle failed URLs
             failed_results = [r for r in initial_results if r.status == "failed"]
-
-            # Calculate initial failure ratio
             initial_failed = len(failed_results)
             total_urls = len(urls)
             failure_ratio = (initial_failed / total_urls) * 100
 
             # Retry failed URLs if ratio is acceptable
-
-
-                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
-                results = initial_results
-            else:
-                retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                # Update results list by replacing failed results with successful retries
-                results = initial_results.copy()
-                for retry_result in retry_results:
-                    for i, result in enumerate(results):
-                        if result.url == retry_result.url:
-                            results[i] = retry_result
-                            break
-            else:
-                results = initial_results
+            results = initial_results
+            retry_successful = 0
 
-
+            if failed_results and failure_ratio <= 20:
+                retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                retry_successful = len([r for r in retry_results if r.status == "success"])
+
+                # Update results list
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
+
+            # Calculate final statistics
             final_successful = len([r for r in results if r.status == "success"])
             final_failed = len([r for r in results if r.status == "failed"])
 
-            #
+            # Update retry stats
+            self._retry_stats = {
+                "initial_failures": initial_failed,
+                "failure_ratio": failure_ratio,
+                "retry_successful": retry_successful if initial_failed > 0 else 0,
+                "retry_failed": final_failed,
+                "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+            }
+
+            # Print summary
             console.print(f"\n[green]Crawling Summary:[/green]")
             console.print(f"Total URLs processed: {total_urls}")
             console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
@@ -678,18 +628,11 @@ class SpiderForce4AI:
             console.print(f" ✗ Failed: {final_failed}")
 
             if initial_failed > 0:
-                retry_successful = initial_failed - final_failed
                 console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
-            # Save final report
+            # Save final report
             if config.save_reports:
-                self._retry_stats
-                    "initial_failures": initial_failed,
-                    "failure_ratio": failure_ratio,
-                    "retry_successful": retry_successful if initial_failed > 0 else 0,
-                    "retry_failed": final_failed
-                }
-                await self._save_report(config)
+                await _save_report_async(results, config, self._retry_stats)
                 console.print(f"📊 Report saved to: {config.report_file}")
 
             return results
@@ -726,32 +669,21 @@ class SpiderForce4AI:
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """Crawl sitemap URLs in parallel using multiprocessing
-
-
-        # Fetch sitemap
+        """Crawl sitemap URLs in parallel using multiprocessing."""
+        # Fetch and parse sitemap
         try:
             response = requests.get(sitemap_url, timeout=config.timeout)
             response.raise_for_status()
-
-        except Exception as e:
-            print(f"Error fetching sitemap: {str(e)}")
-            raise
-
-        # Parse sitemap
-        try:
-            root = ET.fromstring(sitemap_text)
+            root = ET.fromstring(response.text)
             namespace = {'ns': root.tag.split('}')[0].strip('{')}
             urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
-            print(f"Found {len(urls)} URLs in sitemap")
+            console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
         except Exception as e:
-            print(f"Error
+            console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
             raise
 
-        #
+        # Process URLs in parallel
         process_args = [(url, self.base_url, config) for url in urls]
-
-        # Create process pool and execute crawls
         results = []
 
         with Pool(processes=config.max_concurrent_requests) as pool:
@@ -762,81 +694,186 @@ class SpiderForce4AI:
                 TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                 TextColumn("({task.completed}/{task.total})"),
             ) as progress:
-                task = progress.add_task("Crawling URLs...", total=len(urls))
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
                 for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
-                    progress.description = f"Last: {status} {result.url}"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-        # Calculate
+        # Calculate statistics and handle retries
        failed_results = [r for r in results if r.status == "failed"]
        initial_failed = len(failed_results)
-
-
+        failure_ratio = (initial_failed / len(urls)) * 100
+        retry_successful = 0
 
-
-
-
-
-
-
-
-
-
-
-
-
-            if
-
-
-                if config.output_dir and new_result.markdown:
-                    filepath = config.output_dir / f"{slugify(new_result.url)}.md"
-                    with open(filepath, 'w', encoding='utf-8') as f:
-                        f.write(new_result.markdown)
-                    # Send webhook for successful retry
-                    _send_webhook_sync(new_result, config)
-                else:
-                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
-                    # Send webhook for failed retry
-                    _send_webhook_sync(new_result, config)
-
-                # Update results list
-                for i, r in enumerate(results):
-                    if r.url == new_result.url:
-                        results[i] = new_result
-                        break
+        if failed_results and failure_ratio <= 20:
+            console.print(f"\n[yellow]Retrying {initial_failed} failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    retry_successful += 1
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url}[/red]")
+
+                # Update results list
+                for i, r in enumerate(results):
+                    if r.url == new_result.url:
+                        results[i] = new_result
+                        break
 
         # Calculate final statistics
         final_successful = len([r for r in results if r.status == "success"])
         final_failed = len([r for r in results if r.status == "failed"])
 
-        # Print
+        # Print summary
         console.print(f"\n[green]Crawling Summary:[/green]")
-        console.print(f"Total URLs processed: {
+        console.print(f"Total URLs processed: {len(urls)}")
         console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
         console.print(f"Final results:")
         console.print(f" ✓ Successful: {final_successful}")
         console.print(f" ✗ Failed: {final_failed}")
-
+
         if initial_failed > 0:
-            retry_successful = initial_failed - final_failed
             console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
-
+        # Save report
         if config.save_reports:
             self._retry_stats = {
                 "initial_failures": initial_failed,
                 "failure_ratio": failure_ratio,
-                "retry_successful": retry_successful
+                "retry_successful": retry_successful,
                 "retry_failed": final_failed
             }
-
+            _save_report_sync(results, config, self._retry_stats)
             console.print(f"📊 Report saved to: {config.report_file}")
 
         return results
 
+    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl multiple URLs using server-side parallel processing.
+        This uses the /convert_parallel endpoint which handles parallelization on the server.
+        """
+        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
+
+        try:
+            endpoint = f"{self.base_url}/convert_parallel"
+
+            # Prepare payload
+            payload = {
+                "urls": urls,
+                **config.to_dict()
+            }
+
+            # Send request
+            response = requests.post(
+                endpoint,
+                json=payload,
+                timeout=config.timeout
+            )
+            response.raise_for_status()
+
+            # Process results
+            results = []
+            server_results = response.json()
+
+            for url_result in server_results:
+                result = CrawlResult(
+                    url=url_result["url"],
+                    status=url_result.get("status", "failed"),
+                    markdown=url_result.get("markdown"),
+                    error=url_result.get("error"),
+                    config=config.to_dict()
+                )
+
+                # Save markdown if successful and output dir is configured
+                if result.status == "success" and config.output_dir and result.markdown:
+                    _save_markdown_sync(result.url, result.markdown, config)
+
+                # Handle post-extraction if configured
+                if config.post_extraction_agent and result.status == "success":
+                    try:
+                        post_config = PostExtractionConfig(
+                            model=config.post_extraction_agent["model"],
+                            messages=config.post_extraction_agent["messages"],
+                            api_key=config.post_extraction_agent["api_key"],
+                            max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                            temperature=config.post_extraction_agent.get("temperature", 0.7),
+                            base_url=config.post_extraction_agent.get("base_url"),
+                            combine_output=bool(config.post_extraction_agent_save_to_file),
+                            output_file=config.post_extraction_agent_save_to_file,
+                            custom_transform_function=config.post_agent_transformer_function
+                        )
+
+                        agent = PostExtractionAgent(post_config)
+                        extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+
+                # Send webhook if configured
+                _send_webhook_sync(result, config)
+                results.append(result)
+
+            # Calculate statistics
+            successful = len([r for r in results if r.status == "success"])
+            failed = len([r for r in results if r.status == "failed"])
+
+            # Print summary
+            console.print("\n[green]Parallel processing completed:[/green]")
+            console.print(f"✓ Successful: {successful}")
+            console.print(f"✗ Failed: {failed}")
+
+            # Save report if enabled
+            if config.save_reports:
+                self._retry_stats = {
+                    "initial_failures": failed,
+                    "failure_ratio": (failed / len(urls)) * 100,
+                    "retry_successful": 0,  # No retries in server parallel mode
+                    "retry_failed": failed
+                }
+                _save_report_sync(results, config, self._retry_stats)
+                console.print(f"📊 Report saved to: {config.report_file}")
+
+            return results
+
+        except Exception as e:
+            console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
+            # Create failed results for all URLs
+            return [
+                CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                ) for url in urls
+            ]
+
+    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs using server-side parallel processing.
+        """
+        console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            root = ET.fromstring(response.text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+
+            # Process URLs using server-side parallel endpoint
+            return self.crawl_urls_server_parallel(urls, config)
+
+        except Exception as e:
+            console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
+            raise
+
     async def __aenter__(self):
         """Async context manager entry."""
         await self._ensure_session()
@@ -854,3 +891,7 @@ class SpiderForce4AI:
         """Sync context manager exit."""
         self._executor.shutdown(wait=True)
 
+# Version info
+#__version__ = "2.3.1"
+#__author__ = "Piotr Tamulewicz"
+#__email__ = "pt@petertam.pro"