spiderforce4ai 2.4.2__py3-none-any.whl → 2.4.5__py3-none-any.whl
- spiderforce4ai/__init__.py +72 -46
- spiderforce4ai/post_extraction_agent.py +24 -11
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.5.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.2.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             config=config.to_dict()
         )

-        # Handle post-extraction if configured
-        if config.post_extraction_agent:
-            try:
-                post_config = PostExtractionConfig(
-                    model=config.post_extraction_agent["model"],
-                    messages=config.post_extraction_agent["messages"],
-                    api_key=config.post_extraction_agent["api_key"],
-                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                    temperature=config.post_extraction_agent.get("temperature", 0.7),
-                    base_url=config.post_extraction_agent.get("base_url"),
-                    combine_output=bool(config.post_extraction_agent_save_to_file),
-                    output_file=config.post_extraction_agent_save_to_file,
-                    custom_transform_function=config.post_agent_transformer_function
-                )
-
-                agent = PostExtractionAgent(post_config)
-                extraction_result = asyncio.run(agent.process_content(url, markdown))
-                if extraction_result:
-                    result.extraction_result = extraction_result
-            except Exception as e:
-                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
-
         # Send webhook for successful result
         _send_webhook_sync(result, config)

@@ -608,7 +586,7 @@ class SpiderForce4AI:
                     for result in results:
                         if result.status == "success":
                             try:
-                                result.extraction_result =
+                                result.extraction_result = agent.process_content(result.url, result.markdown)
                                 progress.update(llm_task, advance=1)
                             except Exception as e:
                                 console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
@@ -733,13 +711,48 @@ class SpiderForce4AI:
             TextColumn("({task.completed}/{task.total})"),
         ) as progress:
             task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
-
+
             for result in pool.imap_unordered(_process_url_parallel, process_args):
                 results.append(result)
                 progress.update(task, advance=1)
                 status = "✓" if result.status == "success" else "✗"
                 progress.description = f"[cyan]Last: {status} {result.url}"

+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
+
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
         # Calculate statistics and handle retries
         failed_results = [r for r in results if r.status == "failed"]
         initial_failed = len(failed_results)
@@ -831,31 +844,44 @@ class SpiderForce4AI:
                 if result.status == "success" and config.output_dir and result.markdown:
                     _save_markdown_sync(result.url, result.markdown, config)

-                # Handle post-extraction if configured
-                if config.post_extraction_agent and result.status == "success":
-                    try:
-                        post_config = PostExtractionConfig(
-                            model=config.post_extraction_agent["model"],
-                            messages=config.post_extraction_agent["messages"],
-                            api_key=config.post_extraction_agent["api_key"],
-                            max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                            temperature=config.post_extraction_agent.get("temperature", 0.7),
-                            base_url=config.post_extraction_agent.get("base_url"),
-                            combine_output=bool(config.post_extraction_agent_save_to_file),
-                            output_file=config.post_extraction_agent_save_to_file,
-                            custom_transform_function=config.post_agent_transformer_function
-                        )
-
-                        agent = PostExtractionAgent(post_config)
-                        extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
-                        if extraction_result:
-                            result.extraction_result = extraction_result
-                    except Exception as e:
-                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-
                 # Send webhook if configured
                 _send_webhook_sync(result, config)
                 results.append(result)
+
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
+
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests

         # Calculate statistics
         successful = len([r for r in results if r.status == "success"])
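
For context: both rewritten code paths above build PostExtractionConfig from a plain dict stored on the crawl config. A minimal sketch of what that dict is expected to look like, inferred from the keys the new code reads (the model name, prompt, and key below are illustrative placeholders, not values from this release):

    # Hypothetical example; only the key names come from the diff above.
    post_extraction_agent = {
        "model": "gpt-4o",     # required
        "messages": [          # required; '{here_markdown_content}' is replaced with the page markdown
            {"role": "user", "content": "Extract a JSON summary of: {here_markdown_content}"},
        ],
        "api_key": "sk-...",   # required
        "max_tokens": 1500,    # optional, defaults to 1000
        "temperature": 0.2,    # optional, defaults to 0.7
        "base_url": None,      # optional, for OpenAI-compatible endpoints
    }

The file output (post_extraction_agent_save_to_file) and the transformer hook (post_agent_transformer_function) are separate fields on the crawl config, as the PostExtractionConfig(...) call shows.
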
spiderforce4ai/post_extraction_agent.py
CHANGED
@@ -164,12 +164,9 @@ class PostExtractionAgent:
                 self.config.output_file.rename(backup_path)
             self.config.output_file.touch()

-    async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
+    def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
         """Process a single piece of content through the LLM."""
         try:
-            # Apply rate limiting
-            await self.rate_limiter.acquire()
-
             # Replace placeholder in messages with actual content
             messages = [
                 {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
@@ -183,7 +180,8 @@ class PostExtractionAgent:

         for attempt in range(max_retries):
             try:
-                response = await acompletion(
+                # Call completion synchronously
+                response = completion(
                     model=self.config.model,
                     messages=messages,
                     max_tokens=self.config.max_tokens,
@@ -200,11 +198,11 @@ class PostExtractionAgent:
             except json.JSONDecodeError as e:
                 last_error = f"Invalid JSON response from LLM: {e}"
                 if attempt < max_retries - 1:
-                    await asyncio.sleep(retry_delay * (attempt + 1))
+                    time.sleep(retry_delay * (attempt + 1))
             except Exception as e:
                 last_error = str(e)
                 if attempt < max_retries - 1:
-                    await asyncio.sleep(retry_delay * (attempt + 1))
+                    time.sleep(retry_delay * (attempt + 1))

         # If we get here, all retries failed
         raise Exception(last_error)
@@ -214,6 +212,20 @@ class PostExtractionAgent:
             self.buffer.add_failed_request(url, content, str(e))
             return None

+    def _save_result_sync(self, url: str, result: Dict) -> None:
+        """Save individual or combined results synchronously."""
+        try:
+            if self.config.combine_output and self.config.output_file:
+                self.results[url] = result
+                with open(self.config.output_file, 'w') as f:
+                    json.dump(self.results, f, indent=2)
+            elif not self.config.combine_output and self.config.output_file:
+                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                with open(individual_file, 'w') as f:
+                    json.dump(result, f, indent=2)
+        except Exception as e:
+            logger.error(f"Error saving results for {url}: {str(e)}")
+
     async def _save_result(self, url: str, result: Dict) -> None:
         """Save individual or combined results."""
         try:
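
The new _save_result_sync helper mirrors the async _save_result kept below it: combined output rewrites a single JSON file keyed by URL, while individual output derives one filename per URL by replacing slashes. A small illustration of that naming scheme, with a hypothetical URL and output path:

    # Illustration of the per-URL filename scheme used by _save_result_sync.
    from pathlib import Path

    output_file = Path("out/results.json")  # hypothetical config.output_file
    url = "https://example.com/pricing"
    individual_file = output_file.parent / f"{url.replace('/', '_')}.json"
    # -> out/https:__example.com_pricing.json
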
@@ -228,10 +240,10 @@ class PostExtractionAgent:
         except Exception as e:
             logger.error(f"Error saving results for {url}: {str(e)}")

-    async def process_content(self, url: str, content: str) -> Optional[Dict]:
+    def process_content(self, url: str, content: str) -> Optional[Dict]:
         """Process content with retry mechanism."""
         for attempt in range(self.config.max_retries):
-            result =
+            result = self._process_single_content(url, content)
             if result:
                 # Apply custom transformation if provided
                 if self.config.custom_transform_function:
@@ -240,12 +252,13 @@ class PostExtractionAgent:
                     except Exception as e:
                         logger.error(f"Error in custom transform for {url}: {str(e)}")

-                await self._save_result(url, result)
+                # Save result synchronously
+                self._save_result_sync(url, result)
                 return result

             # Wait before retry
             if attempt < self.config.max_retries - 1:
-                await asyncio.sleep(self.config.retry_delay)
+                time.sleep(self.config.retry_delay)

         return None

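Taken together, these hunks convert PostExtractionAgent from async to synchronous: process_content can now be called directly, without an event loop. A minimal usage sketch under that assumption (model, key, and markdown are placeholders, and the optional PostExtractionConfig fields are assumed to keep their defaults):

    # Sketch only; not taken verbatim from this release.
    config = PostExtractionConfig(
        model="gpt-4o",
        messages=[{"role": "user", "content": "{here_markdown_content}"}],
        api_key="sk-...",
    )
    agent = PostExtractionAgent(config)
    result = agent.process_content("https://example.com", "# Page markdown")
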
spiderforce4ai-2.4.5.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
+spiderforce4ai/post_extraction_agent.py,sha256=t9KxjuNw16-6kige6ULPLyykNkiGmKhpCi8QjskdaTk,11959
+spiderforce4ai-2.4.5.dist-info/METADATA,sha256=q3VBuGb5wxsi9OPkzEMwFMyg9f_vT2RamWYIgu2JbLc,9012
+spiderforce4ai-2.4.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.4.5.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.4.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.4.5.dist-info/RECORD,,
spiderforce4ai-2.4.2.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
-spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
-spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
-spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.4.2.dist-info/RECORD,,
{spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/WHEEL
File without changes
{spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/entry_points.txt
File without changes
{spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/top_level.txt
File without changes