spiderforce4ai 2.4.3__py3-none-any.whl → 2.4.6__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- spiderforce4ai/__init__.py +1 -1
- spiderforce4ai/post_extraction_agent.py +63 -20
- {spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.6.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.3.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -586,7 +586,7 @@ class SpiderForce4AI:
|
|
586
586
|
for result in results:
|
587
587
|
if result.status == "success":
|
588
588
|
try:
|
589
|
-
result.extraction_result =
|
589
|
+
result.extraction_result = agent.process_content(result.url, result.markdown)
|
590
590
|
progress.update(llm_task, advance=1)
|
591
591
|
except Exception as e:
|
592
592
|
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
@@ -164,12 +164,9 @@ class PostExtractionAgent:
|
|
164
164
|
self.config.output_file.rename(backup_path)
|
165
165
|
self.config.output_file.touch()
|
166
166
|
|
167
|
-
|
167
|
+
def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
|
168
168
|
"""Process a single piece of content through the LLM."""
|
169
169
|
try:
|
170
|
-
# Apply rate limiting
|
171
|
-
await self.rate_limiter.acquire()
|
172
|
-
|
173
170
|
# Replace placeholder in messages with actual content
|
174
171
|
messages = [
|
175
172
|
{**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
|
@@ -183,7 +180,8 @@ class PostExtractionAgent:
|
|
183
180
|
|
184
181
|
for attempt in range(max_retries):
|
185
182
|
try:
|
186
|
-
|
183
|
+
# Call completion synchronously
|
184
|
+
response = completion(
|
187
185
|
model=self.config.model,
|
188
186
|
messages=messages,
|
189
187
|
max_tokens=self.config.max_tokens,
|
@@ -192,19 +190,31 @@ class PostExtractionAgent:
|
|
192
190
|
api_base=self.config.base_url
|
193
191
|
)
|
194
192
|
|
195
|
-
#
|
196
|
-
|
197
|
-
|
198
|
-
return extracted_data
|
193
|
+
# Log raw response for debugging
|
194
|
+
raw_content = response.choices[0].message.content
|
195
|
+
logger.debug(f"Raw LLM response for {url}: {raw_content}")
|
199
196
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
197
|
+
try:
|
198
|
+
extracted_data = json.loads(raw_content)
|
199
|
+
self.buffer.remove_request(url) # Remove from buffer if successful
|
200
|
+
return extracted_data
|
201
|
+
except json.JSONDecodeError as e:
|
202
|
+
error_msg = (
|
203
|
+
f"Invalid JSON response from LLM for {url}:\n"
|
204
|
+
f"Error: {str(e)}\n"
|
205
|
+
f"Raw content: {raw_content[:500]}..." # First 500 chars of response
|
206
|
+
)
|
207
|
+
logger.error(error_msg)
|
208
|
+
last_error = error_msg
|
209
|
+
if attempt < max_retries - 1:
|
210
|
+
time.sleep(retry_delay * (attempt + 1))
|
211
|
+
|
204
212
|
except Exception as e:
|
205
|
-
|
213
|
+
error_msg = f"LLM processing error for {url}: {str(e)}"
|
214
|
+
logger.error(error_msg)
|
215
|
+
last_error = error_msg
|
206
216
|
if attempt < max_retries - 1:
|
207
|
-
|
217
|
+
time.sleep(retry_delay * (attempt + 1))
|
208
218
|
|
209
219
|
# If we get here, all retries failed
|
210
220
|
raise Exception(last_error)
|
@@ -214,6 +224,20 @@ class PostExtractionAgent:
|
|
214
224
|
self.buffer.add_failed_request(url, content, str(e))
|
215
225
|
return None
|
216
226
|
|
227
|
+
def _save_result_sync(self, url: str, result: Dict) -> None:
|
228
|
+
"""Save individual or combined results synchronously."""
|
229
|
+
try:
|
230
|
+
if self.config.combine_output and self.config.output_file:
|
231
|
+
self.results[url] = result
|
232
|
+
with open(self.config.output_file, 'w') as f:
|
233
|
+
json.dump(self.results, f, indent=2)
|
234
|
+
elif not self.config.combine_output and self.config.output_file:
|
235
|
+
individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
|
236
|
+
with open(individual_file, 'w') as f:
|
237
|
+
json.dump(result, f, indent=2)
|
238
|
+
except Exception as e:
|
239
|
+
logger.error(f"Error saving results for {url}: {str(e)}")
|
240
|
+
|
217
241
|
async def _save_result(self, url: str, result: Dict) -> None:
|
218
242
|
"""Save individual or combined results."""
|
219
243
|
try:
|
@@ -228,25 +252,44 @@ class PostExtractionAgent:
|
|
228
252
|
except Exception as e:
|
229
253
|
logger.error(f"Error saving results for {url}: {str(e)}")
|
230
254
|
|
231
|
-
|
255
|
+
def process_content(self, url: str, content: str) -> Optional[Dict]:
|
232
256
|
"""Process content with retry mechanism."""
|
257
|
+
logger.info(f"Starting content processing for {url}")
|
258
|
+
|
233
259
|
for attempt in range(self.config.max_retries):
|
234
|
-
|
260
|
+
logger.info(f"Processing attempt {attempt + 1}/{self.config.max_retries} for {url}")
|
261
|
+
|
262
|
+
result = self._process_single_content(url, content)
|
235
263
|
if result:
|
264
|
+
logger.info(f"Successfully processed content for {url}")
|
265
|
+
|
236
266
|
# Apply custom transformation if provided
|
237
267
|
if self.config.custom_transform_function:
|
238
268
|
try:
|
239
269
|
result = self.config.custom_transform_function(result)
|
270
|
+
logger.info(f"Applied custom transformation for {url}")
|
240
271
|
except Exception as e:
|
241
|
-
|
272
|
+
error_msg = f"Error in custom transform for {url}: {str(e)}"
|
273
|
+
logger.error(error_msg)
|
274
|
+
console.print(f"[red]{error_msg}[/red]")
|
275
|
+
|
276
|
+
# Save result synchronously
|
277
|
+
try:
|
278
|
+
self._save_result_sync(url, result)
|
279
|
+
logger.info(f"Saved results for {url}")
|
280
|
+
except Exception as e:
|
281
|
+
error_msg = f"Error saving results for {url}: {str(e)}"
|
282
|
+
logger.error(error_msg)
|
283
|
+
console.print(f"[red]{error_msg}[/red]")
|
242
284
|
|
243
|
-
await self._save_result(url, result)
|
244
285
|
return result
|
245
286
|
|
246
287
|
# Wait before retry
|
247
288
|
if attempt < self.config.max_retries - 1:
|
248
|
-
|
289
|
+
logger.info(f"Attempt {attempt + 1} failed for {url}, waiting {self.config.retry_delay}s before retry")
|
290
|
+
time.sleep(self.config.retry_delay)
|
249
291
|
|
292
|
+
logger.error(f"All processing attempts failed for {url}")
|
250
293
|
return None
|
251
294
|
|
252
295
|
async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=Ty9-Ai3_RVT86RrPUxKEzf4oUh-Wr7bk2aM87Je_WvE,13580
|
3
|
+
spiderforce4ai-2.4.6.dist-info/METADATA,sha256=7FaEgAHdD-8a0XmuDMkpAUjAQ7ZmFTD89IqQM17nllI,9012
|
4
|
+
spiderforce4ai-2.4.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.6.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
-
spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
|
4
|
-
spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|