spiderforce4ai 2.4.3__py3-none-any.whl → 2.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +1 -1
- spiderforce4ai/post_extraction_agent.py +63 -20
- {spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.6.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.3.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -586,7 +586,7 @@ class SpiderForce4AI:
|
|
586
586
|
for result in results:
|
587
587
|
if result.status == "success":
|
588
588
|
try:
|
589
|
-
result.extraction_result =
|
589
|
+
result.extraction_result = agent.process_content(result.url, result.markdown)
|
590
590
|
progress.update(llm_task, advance=1)
|
591
591
|
except Exception as e:
|
592
592
|
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
@@ -164,12 +164,9 @@ class PostExtractionAgent:
|
|
164
164
|
self.config.output_file.rename(backup_path)
|
165
165
|
self.config.output_file.touch()
|
166
166
|
|
167
|
-
|
167
|
+
def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
|
168
168
|
"""Process a single piece of content through the LLM."""
|
169
169
|
try:
|
170
|
-
# Apply rate limiting
|
171
|
-
await self.rate_limiter.acquire()
|
172
|
-
|
173
170
|
# Replace placeholder in messages with actual content
|
174
171
|
messages = [
|
175
172
|
{**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
|
@@ -183,7 +180,8 @@ class PostExtractionAgent:
|
|
183
180
|
|
184
181
|
for attempt in range(max_retries):
|
185
182
|
try:
|
186
|
-
|
183
|
+
# Call completion synchronously
|
184
|
+
response = completion(
|
187
185
|
model=self.config.model,
|
188
186
|
messages=messages,
|
189
187
|
max_tokens=self.config.max_tokens,
|
@@ -192,19 +190,31 @@ class PostExtractionAgent:
|
|
192
190
|
api_base=self.config.base_url
|
193
191
|
)
|
194
192
|
|
195
|
-
#
|
196
|
-
|
197
|
-
|
198
|
-
return extracted_data
|
193
|
+
# Log raw response for debugging
|
194
|
+
raw_content = response.choices[0].message.content
|
195
|
+
logger.debug(f"Raw LLM response for {url}: {raw_content}")
|
199
196
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
197
|
+
try:
|
198
|
+
extracted_data = json.loads(raw_content)
|
199
|
+
self.buffer.remove_request(url) # Remove from buffer if successful
|
200
|
+
return extracted_data
|
201
|
+
except json.JSONDecodeError as e:
|
202
|
+
error_msg = (
|
203
|
+
f"Invalid JSON response from LLM for {url}:\n"
|
204
|
+
f"Error: {str(e)}\n"
|
205
|
+
f"Raw content: {raw_content[:500]}..." # First 500 chars of response
|
206
|
+
)
|
207
|
+
logger.error(error_msg)
|
208
|
+
last_error = error_msg
|
209
|
+
if attempt < max_retries - 1:
|
210
|
+
time.sleep(retry_delay * (attempt + 1))
|
211
|
+
|
204
212
|
except Exception as e:
|
205
|
-
|
213
|
+
error_msg = f"LLM processing error for {url}: {str(e)}"
|
214
|
+
logger.error(error_msg)
|
215
|
+
last_error = error_msg
|
206
216
|
if attempt < max_retries - 1:
|
207
|
-
|
217
|
+
time.sleep(retry_delay * (attempt + 1))
|
208
218
|
|
209
219
|
# If we get here, all retries failed
|
210
220
|
raise Exception(last_error)
|
@@ -214,6 +224,20 @@ class PostExtractionAgent:
|
|
214
224
|
self.buffer.add_failed_request(url, content, str(e))
|
215
225
|
return None
|
216
226
|
|
227
|
+
def _save_result_sync(self, url: str, result: Dict) -> None:
|
228
|
+
"""Save individual or combined results synchronously."""
|
229
|
+
try:
|
230
|
+
if self.config.combine_output and self.config.output_file:
|
231
|
+
self.results[url] = result
|
232
|
+
with open(self.config.output_file, 'w') as f:
|
233
|
+
json.dump(self.results, f, indent=2)
|
234
|
+
elif not self.config.combine_output and self.config.output_file:
|
235
|
+
individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
|
236
|
+
with open(individual_file, 'w') as f:
|
237
|
+
json.dump(result, f, indent=2)
|
238
|
+
except Exception as e:
|
239
|
+
logger.error(f"Error saving results for {url}: {str(e)}")
|
240
|
+
|
217
241
|
async def _save_result(self, url: str, result: Dict) -> None:
|
218
242
|
"""Save individual or combined results."""
|
219
243
|
try:
|
@@ -228,25 +252,44 @@ class PostExtractionAgent:
|
|
228
252
|
except Exception as e:
|
229
253
|
logger.error(f"Error saving results for {url}: {str(e)}")
|
230
254
|
|
231
|
-
|
255
|
+
def process_content(self, url: str, content: str) -> Optional[Dict]:
|
232
256
|
"""Process content with retry mechanism."""
|
257
|
+
logger.info(f"Starting content processing for {url}")
|
258
|
+
|
233
259
|
for attempt in range(self.config.max_retries):
|
234
|
-
|
260
|
+
logger.info(f"Processing attempt {attempt + 1}/{self.config.max_retries} for {url}")
|
261
|
+
|
262
|
+
result = self._process_single_content(url, content)
|
235
263
|
if result:
|
264
|
+
logger.info(f"Successfully processed content for {url}")
|
265
|
+
|
236
266
|
# Apply custom transformation if provided
|
237
267
|
if self.config.custom_transform_function:
|
238
268
|
try:
|
239
269
|
result = self.config.custom_transform_function(result)
|
270
|
+
logger.info(f"Applied custom transformation for {url}")
|
240
271
|
except Exception as e:
|
241
|
-
|
272
|
+
error_msg = f"Error in custom transform for {url}: {str(e)}"
|
273
|
+
logger.error(error_msg)
|
274
|
+
console.print(f"[red]{error_msg}[/red]")
|
275
|
+
|
276
|
+
# Save result synchronously
|
277
|
+
try:
|
278
|
+
self._save_result_sync(url, result)
|
279
|
+
logger.info(f"Saved results for {url}")
|
280
|
+
except Exception as e:
|
281
|
+
error_msg = f"Error saving results for {url}: {str(e)}"
|
282
|
+
logger.error(error_msg)
|
283
|
+
console.print(f"[red]{error_msg}[/red]")
|
242
284
|
|
243
|
-
await self._save_result(url, result)
|
244
285
|
return result
|
245
286
|
|
246
287
|
# Wait before retry
|
247
288
|
if attempt < self.config.max_retries - 1:
|
248
|
-
|
289
|
+
logger.info(f"Attempt {attempt + 1} failed for {url}, waiting {self.config.retry_delay}s before retry")
|
290
|
+
time.sleep(self.config.retry_delay)
|
249
291
|
|
292
|
+
logger.error(f"All processing attempts failed for {url}")
|
250
293
|
return None
|
251
294
|
|
252
295
|
async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=Ty9-Ai3_RVT86RrPUxKEzf4oUh-Wr7bk2aM87Je_WvE,13580
|
3
|
+
spiderforce4ai-2.4.6.dist-info/METADATA,sha256=7FaEgAHdD-8a0XmuDMkpAUjAQ7ZmFTD89IqQM17nllI,9012
|
4
|
+
spiderforce4ai-2.4.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.6.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
-
spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
|
4
|
-
spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|