spiderforce4ai 2.4.3__tar.gz → 2.4.5__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/PKG-INFO +1 -1
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/pyproject.toml +1 -1
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/setup.py +1 -1
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai/__init__.py +1 -1
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai/post_extraction_agent.py +24 -11
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/README.md +0 -0
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/setup.cfg +0 -0
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.4.3 → spiderforce4ai-2.4.5}/spiderforce4ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "spiderforce4ai"
|
7
|
-
version = "2.4.3"
|
7
|
+
version = "2.4.5"
|
8
8
|
description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
|
9
9
|
readme = "README.md"
|
10
10
|
authors = [
|
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
|
|
7
7
|
|
8
8
|
setup(
|
9
9
|
name="spiderforce4ai",
|
10
|
-
version="2.4.3",
|
10
|
+
version="2.4.5",
|
11
11
|
author="Piotr Tamulewicz",
|
12
12
|
author_email="pt@petertam.pro",
|
13
13
|
description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
|
@@ -586,7 +586,7 @@ class SpiderForce4AI:
|
|
586
586
|
for result in results:
|
587
587
|
if result.status == "success":
|
588
588
|
try:
|
589
|
-
result.extraction_result =
|
589
|
+
result.extraction_result = agent.process_content(result.url, result.markdown)
|
590
590
|
progress.update(llm_task, advance=1)
|
591
591
|
except Exception as e:
|
592
592
|
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
@@ -164,12 +164,9 @@ class PostExtractionAgent:
|
|
164
164
|
self.config.output_file.rename(backup_path)
|
165
165
|
self.config.output_file.touch()
|
166
166
|
|
167
|
-
|
167
|
+
def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
|
168
168
|
"""Process a single piece of content through the LLM."""
|
169
169
|
try:
|
170
|
-
# Apply rate limiting
|
171
|
-
await self.rate_limiter.acquire()
|
172
|
-
|
173
170
|
# Replace placeholder in messages with actual content
|
174
171
|
messages = [
|
175
172
|
{**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
|
@@ -183,7 +180,8 @@ class PostExtractionAgent:
|
|
183
180
|
|
184
181
|
for attempt in range(max_retries):
|
185
182
|
try:
|
186
|
-
|
183
|
+
# Call completion synchronously
|
184
|
+
response = completion(
|
187
185
|
model=self.config.model,
|
188
186
|
messages=messages,
|
189
187
|
max_tokens=self.config.max_tokens,
|
@@ -200,11 +198,11 @@ class PostExtractionAgent:
|
|
200
198
|
except json.JSONDecodeError as e:
|
201
199
|
last_error = f"Invalid JSON response from LLM: {e}"
|
202
200
|
if attempt < max_retries - 1:
|
203
|
-
|
201
|
+
time.sleep(retry_delay * (attempt + 1))
|
204
202
|
except Exception as e:
|
205
203
|
last_error = str(e)
|
206
204
|
if attempt < max_retries - 1:
|
207
|
-
|
205
|
+
time.sleep(retry_delay * (attempt + 1))
|
208
206
|
|
209
207
|
# If we get here, all retries failed
|
210
208
|
raise Exception(last_error)
|
@@ -214,6 +212,20 @@ class PostExtractionAgent:
|
|
214
212
|
self.buffer.add_failed_request(url, content, str(e))
|
215
213
|
return None
|
216
214
|
|
215
|
+
def _save_result_sync(self, url: str, result: Dict) -> None:
|
216
|
+
"""Save individual or combined results synchronously."""
|
217
|
+
try:
|
218
|
+
if self.config.combine_output and self.config.output_file:
|
219
|
+
self.results[url] = result
|
220
|
+
with open(self.config.output_file, 'w') as f:
|
221
|
+
json.dump(self.results, f, indent=2)
|
222
|
+
elif not self.config.combine_output and self.config.output_file:
|
223
|
+
individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
|
224
|
+
with open(individual_file, 'w') as f:
|
225
|
+
json.dump(result, f, indent=2)
|
226
|
+
except Exception as e:
|
227
|
+
logger.error(f"Error saving results for {url}: {str(e)}")
|
228
|
+
|
217
229
|
async def _save_result(self, url: str, result: Dict) -> None:
|
218
230
|
"""Save individual or combined results."""
|
219
231
|
try:
|
@@ -228,10 +240,10 @@ class PostExtractionAgent:
|
|
228
240
|
except Exception as e:
|
229
241
|
logger.error(f"Error saving results for {url}: {str(e)}")
|
230
242
|
|
231
|
-
|
243
|
+
def process_content(self, url: str, content: str) -> Optional[Dict]:
|
232
244
|
"""Process content with retry mechanism."""
|
233
245
|
for attempt in range(self.config.max_retries):
|
234
|
-
result =
|
246
|
+
result = self._process_single_content(url, content)
|
235
247
|
if result:
|
236
248
|
# Apply custom transformation if provided
|
237
249
|
if self.config.custom_transform_function:
|
@@ -240,12 +252,13 @@ class PostExtractionAgent:
|
|
240
252
|
except Exception as e:
|
241
253
|
logger.error(f"Error in custom transform for {url}: {str(e)}")
|
242
254
|
|
243
|
-
|
255
|
+
# Save result synchronously
|
256
|
+
self._save_result_sync(url, result)
|
244
257
|
return result
|
245
258
|
|
246
259
|
# Wait before retry
|
247
260
|
if attempt < self.config.max_retries - 1:
|
248
|
-
|
261
|
+
time.sleep(self.config.retry_delay)
|
249
262
|
|
250
263
|
return None
|
251
264
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|