spiderforce4ai 2.4.2__py3-none-any.whl → 2.4.5__py3-none-any.whl
This diff compares the contents of publicly released package versions as published to their public registries. It is provided for informational purposes only.
- spiderforce4ai/__init__.py +72 -46
- spiderforce4ai/post_extraction_agent.py +24 -11
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.5.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.2.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
```diff
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             config=config.to_dict()
         )
 
-        # Handle post-extraction if configured
-        if config.post_extraction_agent:
-            try:
-                post_config = PostExtractionConfig(
-                    model=config.post_extraction_agent["model"],
-                    messages=config.post_extraction_agent["messages"],
-                    api_key=config.post_extraction_agent["api_key"],
-                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                    temperature=config.post_extraction_agent.get("temperature", 0.7),
-                    base_url=config.post_extraction_agent.get("base_url"),
-                    combine_output=bool(config.post_extraction_agent_save_to_file),
-                    output_file=config.post_extraction_agent_save_to_file,
-                    custom_transform_function=config.post_agent_transformer_function
-                )
-
-                agent = PostExtractionAgent(post_config)
-                extraction_result = asyncio.run(agent.process_content(url, markdown))
-                if extraction_result:
-                    result.extraction_result = extraction_result
-            except Exception as e:
-                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
-
         # Send webhook for successful result
         _send_webhook_sync(result, config)
 
```
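The block removed above spun up a fresh event loop via `asyncio.run()` for every crawled URL inside each parallel worker; the `@@ -733` and `@@ -831` hunks below reintroduce the same work as a single pass after crawling finishes. A distilled sketch of that restructuring, where `crawl` and `extract` are stand-ins for the package's worker and agent calls, not real package API:

```python
def crawl_then_extract(urls, crawl, extract):
    """Sketch of the new control flow: crawl everything first, then run
    the LLM extraction step sequentially over successful results only."""
    results = [crawl(url) for url in urls]  # previously: crawl + extract per URL
    for r in results:
        if r.status == "success":
            r.extraction_result = extract(r.url, r.markdown)
    return results
```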
```diff
@@ -608,7 +586,7 @@ class SpiderForce4AI:
         for result in results:
             if result.status == "success":
                 try:
-                    result.extraction_result =
+                    result.extraction_result = agent.process_content(result.url, result.markdown)
                     progress.update(llm_task, advance=1)
                 except Exception as e:
                     console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
```
```diff
@@ -733,13 +711,48 @@ class SpiderForce4AI:
             TextColumn("({task.completed}/{task.total})"),
         ) as progress:
             task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
-
+
             for result in pool.imap_unordered(_process_url_parallel, process_args):
                 results.append(result)
                 progress.update(task, advance=1)
                 status = "✓" if result.status == "success" else "✗"
                 progress.description = f"[cyan]Last: {status} {result.url}"
 
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
+
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
         # Calculate statistics and handle retries
         failed_results = [r for r in results if r.status == "failed"]
         initial_failed = len(failed_results)
```
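For orientation, here is a sketch of the caller-side dict this hunk consumes. The key names (`model`, `messages`, `api_key`, plus the optional `max_tokens`, `temperature`, `base_url`) are exactly those read above; the values are placeholders, and `{here_markdown_content}` is the token that `post_extraction_agent.py` substitutes with the page markdown (see its hunks below):

```python
# Hypothetical configuration dict; all values are illustrative only.
post_extraction_agent = {
    "model": "gpt-4o-mini",   # passed straight through to the LLM client
    "api_key": "sk-...",      # placeholder
    "messages": [
        {"role": "system", "content": "Return the page title and summary as JSON."},
        {"role": "user", "content": "{here_markdown_content}"},
    ],
    # Optional keys; the hunk falls back to these defaults via .get():
    "max_tokens": 1000,
    "temperature": 0.7,
}
```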
```diff
@@ -831,31 +844,44 @@ class SpiderForce4AI:
             if result.status == "success" and config.output_dir and result.markdown:
                 _save_markdown_sync(result.url, result.markdown, config)
 
-            # Handle post-extraction if configured
-            if config.post_extraction_agent and result.status == "success":
-                try:
-                    post_config = PostExtractionConfig(
-                        model=config.post_extraction_agent["model"],
-                        messages=config.post_extraction_agent["messages"],
-                        api_key=config.post_extraction_agent["api_key"],
-                        max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                        temperature=config.post_extraction_agent.get("temperature", 0.7),
-                        base_url=config.post_extraction_agent.get("base_url"),
-                        combine_output=bool(config.post_extraction_agent_save_to_file),
-                        output_file=config.post_extraction_agent_save_to_file,
-                        custom_transform_function=config.post_agent_transformer_function
-                    )
-
-                    agent = PostExtractionAgent(post_config)
-                    extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
-                    if extraction_result:
-                        result.extraction_result = extraction_result
-                except Exception as e:
-                    console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-
             # Send webhook if configured
             _send_webhook_sync(result, config)
             results.append(result)
+
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
+
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
 
         # Calculate statistics
         successful = len([r for r in results if r.status == "success"])
```
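This is the same sequential block as in the `@@ -733` hunk, duplicated into the serial crawl path. Both versions pace the LLM calls with plain sleeps, replacing the `await self.rate_limiter.acquire()` call dropped from the agent (first `post_extraction_agent.py` hunk below). A distilled sketch of the pacing pattern; `run_paced` and its parameters are illustrative names, not package API:

```python
import time

def run_paced(items, process, pause=0.5, error_pause=1.0):
    """One request at a time: a short sleep between requests and a longer
    sleep after a failure, mirroring the loop added in both hunks."""
    for item in items:
        try:
            process(item)
        except Exception as exc:
            print(f"error processing {item}: {exc}")
            time.sleep(error_pause)  # back off after an error
        time.sleep(pause)  # crude rate limiting between LLM calls
```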
spiderforce4ai/post_extraction_agent.py
CHANGED
```diff
@@ -164,12 +164,9 @@ class PostExtractionAgent:
             self.config.output_file.rename(backup_path)
             self.config.output_file.touch()
 
-
+    def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
         """Process a single piece of content through the LLM."""
         try:
-            # Apply rate limiting
-            await self.rate_limiter.acquire()
-
             # Replace placeholder in messages with actual content
             messages = [
                 {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
```
```diff
@@ -183,7 +180,8 @@ class PostExtractionAgent:
 
             for attempt in range(max_retries):
                 try:
-
+                    # Call completion synchronously
+                    response = completion(
                         model=self.config.model,
                         messages=messages,
                         max_tokens=self.config.max_tokens,
```
```diff
@@ -200,11 +198,11 @@ class PostExtractionAgent:
                 except json.JSONDecodeError as e:
                     last_error = f"Invalid JSON response from LLM: {e}"
                     if attempt < max_retries - 1:
-
+                        time.sleep(retry_delay * (attempt + 1))
                 except Exception as e:
                     last_error = str(e)
                     if attempt < max_retries - 1:
-
+                        time.sleep(retry_delay * (attempt + 1))
 
             # If we get here, all retries failed
             raise Exception(last_error)
```
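The replacement sleeps implement a linear backoff: `retry_delay` after the first failure, then twice that, and so on per attempt. A self-contained sketch of that retry shape; `call_with_retries` and its arguments are illustrative, not the package's API:

```python
import time

def call_with_retries(fn, max_retries=3, retry_delay=1.0):
    """Retry a synchronous call with linearly increasing sleeps,
    matching the shape of _process_single_content above."""
    last_error = None
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as exc:
            last_error = str(exc)
            if attempt < max_retries - 1:
                time.sleep(retry_delay * (attempt + 1))  # 1x, 2x, 3x ... delay
    raise Exception(last_error)
```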
```diff
@@ -214,6 +212,20 @@ class PostExtractionAgent:
                 self.buffer.add_failed_request(url, content, str(e))
             return None
 
+    def _save_result_sync(self, url: str, result: Dict) -> None:
+        """Save individual or combined results synchronously."""
+        try:
+            if self.config.combine_output and self.config.output_file:
+                self.results[url] = result
+                with open(self.config.output_file, 'w') as f:
+                    json.dump(self.results, f, indent=2)
+            elif not self.config.combine_output and self.config.output_file:
+                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                with open(individual_file, 'w') as f:
+                    json.dump(result, f, indent=2)
+        except Exception as e:
+            logger.error(f"Error saving results for {url}: {str(e)}")
+
     async def _save_result(self, url: str, result: Dict) -> None:
         """Save individual or combined results."""
         try:
```
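A small worked example of the two output layouts `_save_result_sync` produces, assuming a hypothetical `output_file` of `out/results.json`: with `combine_output=True` every result accumulates in that one JSON file keyed by URL (rewritten on each save); with `combine_output=False` each URL gets its own file, named by flattening the URL with `url.replace('/', '_')`:

```python
from pathlib import Path

output_file = Path("out/results.json")  # hypothetical value
url = "https://example.com/docs/page"   # made-up URL

# combine_output=False writes one file per URL next to output_file:
print(output_file.parent / f"{url.replace('/', '_')}.json")
# out/https:__example.com_docs_page.json
```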
```diff
@@ -228,10 +240,10 @@ class PostExtractionAgent:
         except Exception as e:
             logger.error(f"Error saving results for {url}: {str(e)}")
 
-
+    def process_content(self, url: str, content: str) -> Optional[Dict]:
         """Process content with retry mechanism."""
         for attempt in range(self.config.max_retries):
-            result =
+            result = self._process_single_content(url, content)
             if result:
                 # Apply custom transformation if provided
                 if self.config.custom_transform_function:
```
```diff
@@ -240,12 +252,13 @@ class PostExtractionAgent:
                 except Exception as e:
                     logger.error(f"Error in custom transform for {url}: {str(e)}")
 
-
+                # Save result synchronously
+                self._save_result_sync(url, result)
                 return result
 
             # Wait before retry
             if attempt < self.config.max_retries - 1:
-
+                time.sleep(self.config.retry_delay)
 
         return None
 
```
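With these changes `process_content` is a plain synchronous call, so direct callers no longer need an event loop. A minimal usage sketch, assuming `PostExtractionConfig` accepts the keyword arguments shown in the hunks above and that fields not passed here have defaults; all values are placeholders:

```python
from pathlib import Path
from spiderforce4ai.post_extraction_agent import PostExtractionAgent, PostExtractionConfig

config = PostExtractionConfig(
    model="gpt-4o-mini",  # placeholder model name
    messages=[{"role": "user", "content": "{here_markdown_content}"}],
    api_key="sk-...",     # placeholder key
    max_tokens=1000,
    temperature=0.7,
    combine_output=True,
    output_file=Path("out/results.json"),
)
agent = PostExtractionAgent(config)
# Synchronous call: no await or asyncio.run required.
result = agent.process_content("https://example.com", "# Page markdown")
```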
spiderforce4ai-2.4.5.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
+spiderforce4ai/post_extraction_agent.py,sha256=t9KxjuNw16-6kige6ULPLyykNkiGmKhpCi8QjskdaTk,11959
+spiderforce4ai-2.4.5.dist-info/METADATA,sha256=q3VBuGb5wxsi9OPkzEMwFMyg9f_vT2RamWYIgu2JbLc,9012
+spiderforce4ai-2.4.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.4.5.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.4.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.4.5.dist-info/RECORD,,
```
spiderforce4ai-2.4.2.dist-info/RECORD
DELETED
```diff
@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
-spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
-spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
-spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.4.2.dist-info/RECORD,,
```
{spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/WHEEL
File without changes

{spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/entry_points.txt
File without changes

{spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.5.dist-info}/top_level.txt
File without changes