spiderforce4ai 2.3.1__py3-none-any.whl → 2.4__py3-none-any.whl
- spiderforce4ai/__init__.py +411 -349
- spiderforce4ai/post_extraction_agent.py +259 -0
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/METADATA +41 -3
- spiderforce4ai-2.4.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/entry_points.txt +2 -0
- spiderforce4ai-2.3.1.dist-info/RECORD +0 -5
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/top_level.txt +0 -0
spiderforce4ai/post_extraction_agent.py

@@ -0,0 +1,259 @@
+# post_extraction_agent.py
+
+from dataclasses import dataclass, asdict
+from typing import Any, Callable, Dict, List, Optional, Union
+import json
+import asyncio
+import time
+from pathlib import Path
+import aiofiles
+from litellm import completion
+from pydantic import BaseModel, Field
+import logging
+from datetime import datetime
+import re
+
+logger = logging.getLogger(__name__)
+
+class PostExtractionBuffer:
+    """Buffer system for tracking and retrying failed LLM requests."""
+
+    def __init__(self, buffer_file: Optional[Path] = None):
+        # Generate a unique session ID using timestamp and random string
+        session_id = f"{int(time.time())}_{hex(hash(str(time.time())))[-6:]}"
+
+        # Create unique buffer file path
+        if buffer_file:
+            # If buffer_file is provided, insert session_id before the extension
+            stem = buffer_file.stem
+            suffix = buffer_file.suffix
+            self.buffer_file = buffer_file.with_name(f"{stem}_{session_id}{suffix}")
+        else:
+            # Default buffer file with session_id
+            self.buffer_file = Path(f"post_extraction_buffer_{session_id}.json")
+
+        self.failed_requests: Dict[str, Dict] = {}
+        self._load_buffer()
+
+    def _load_buffer(self) -> None:
+        """Load failed requests from buffer file if it exists."""
+        if self.buffer_file.exists():
+            try:
+                with open(self.buffer_file, 'r') as f:
+                    self.failed_requests = json.load(f)
+            except Exception as e:
+                logger.error(f"Error loading buffer file: {e}")
+                self.failed_requests = {}
+
+    def _save_buffer(self) -> None:
+        """Save failed requests to buffer file."""
+        try:
+            with open(self.buffer_file, 'w') as f:
+                json.dump(self.failed_requests, f, indent=2)
+        except Exception as e:
+            logger.error(f"Error saving buffer file: {e}")
+
+    def add_failed_request(self, url: str, content: str, error: str) -> None:
+        """Add a failed request to the buffer."""
+        self.failed_requests[url] = {
+            "content": content,
+            "error": error,
+            "timestamp": datetime.now().isoformat(),
+            "attempts": self.failed_requests.get(url, {}).get("attempts", 0) + 1
+        }
+        self._save_buffer()
+
+    def remove_request(self, url: str) -> None:
+        """Remove a request from the buffer after successful processing."""
+        if url in self.failed_requests:
+            del self.failed_requests[url]
+            self._save_buffer()
+
+    def get_failed_requests(self) -> Dict[str, Dict]:
+        """Get all failed requests."""
+        return self.failed_requests
+
+    def get_retryable_requests(self, max_attempts: int = 3) -> Dict[str, Dict]:
+        """Get failed requests that haven't exceeded max retry attempts."""
+        return {
+            url: data for url, data in self.failed_requests.items()
+            if data.get("attempts", 0) < max_attempts
+        }
+
+class ExtractionTemplate(BaseModel):
+    """Base model for extraction template validation."""
+    template: Dict[str, Any] = Field(..., description="Template structure for extraction")
+
+    class Config:
+        extra = "allow"
+        arbitrary_types_allowed = True
+
+    @classmethod
+    def validate_template_string(cls, template_str: str) -> bool:
+        """Validate a template string against the schema."""
+        try:
+            template_json = json.loads(template_str)
+            cls(template=template_json)
+            return True
+        except Exception as e:
+            logger.error(f"Template validation failed: {e}")
+            return False
+
+@dataclass
+class PostExtractionConfig:
+    """Configuration for post-extraction processing."""
+    model: str
+    messages: List[Dict[str, str]]
+    api_key: str
+    max_tokens: int = 1000
+    temperature: float = 0.7
+    base_url: Optional[str] = None
+    request_delay: float = 0.01  # 10 milliseconds default
+    max_retries: int = 3
+    retry_delay: float = 1.0
+    combine_output: bool = False
+    output_file: Optional[Path] = None
+    custom_transform_function: Optional[Callable] = None
+    buffer_file: Optional[Path] = None
+
+    def __post_init__(self):
+        if self.output_file:
+            self.output_file = Path(self.output_file)
+            self.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+        if self.buffer_file:
+            self.buffer_file = Path(self.buffer_file)
+            self.buffer_file.parent.mkdir(parents=True, exist_ok=True)
+
+class RateLimiter:
+    """Rate limiter for API calls."""
+
+    def __init__(self, requests_per_minute: int = 60):
+        self.requests_per_minute = requests_per_minute
+        self.interval = 60 / requests_per_minute
+        self.last_request = 0
+        self._lock = asyncio.Lock()
+
+    async def acquire(self):
+        """Acquire rate limit slot."""
+        async with self._lock:
+            now = time.time()
+            if self.last_request:
+                elapsed = now - self.last_request
+                if elapsed < self.interval:
+                    await asyncio.sleep(self.interval - elapsed)
+            self.last_request = time.time()
+
+class PostExtractionAgent:
+    """Agent for processing extracted content using LLM models."""
+
+    def __init__(self, config: PostExtractionConfig):
+        self.config = config
+        self.buffer = PostExtractionBuffer(config.buffer_file)
+        self.results: Dict[str, Any] = {}
+        self.rate_limiter = RateLimiter()
+        self._setup_output()
+
+    def _setup_output(self) -> None:
+        """Setup output file if combining results."""
+        if self.config.combine_output and self.config.output_file:
+            self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+            if self.config.output_file.exists():
+                # Backup existing file
+                backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
+                self.config.output_file.rename(backup_path)
+            self.config.output_file.touch()
+
+    async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
+        """Process a single piece of content through the LLM."""
+        try:
+            # Apply rate limiting
+            await self.rate_limiter.acquire()
+
+            # Replace placeholder in messages with actual content
+            messages = [
+                {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
+                for msg in self.config.messages
+            ]
+
+            # Make LLM request
+            response = await completion(
+                model=self.config.model,
+                messages=messages,
+                max_tokens=self.config.max_tokens,
+                temperature=self.config.temperature,
+                api_key=self.config.api_key,
+                api_base=self.config.base_url
+            )
+
+            # Parse response
+            try:
+                extracted_data = json.loads(response.choices[0].message.content)
+                self.buffer.remove_request(url)  # Remove from buffer if successful
+                return extracted_data
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid JSON response from LLM: {e}")
+
+        except Exception as e:
+            logger.error(f"Error processing {url}: {str(e)}")
+            self.buffer.add_failed_request(url, content, str(e))
+            return None
+
+    async def _save_result(self, url: str, result: Dict) -> None:
+        """Save individual or combined results."""
+        try:
+            if self.config.combine_output and self.config.output_file:
+                self.results[url] = result
+                async with aiofiles.open(self.config.output_file, 'w') as f:
+                    await f.write(json.dumps(self.results, indent=2))
+            elif not self.config.combine_output and self.config.output_file:
+                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                async with aiofiles.open(individual_file, 'w') as f:
+                    await f.write(json.dumps(result, indent=2))
+        except Exception as e:
+            logger.error(f"Error saving results for {url}: {str(e)}")
+
+    async def process_content(self, url: str, content: str) -> Optional[Dict]:
+        """Process content with retry mechanism."""
+        for attempt in range(self.config.max_retries):
+            result = await self._process_single_content(url, content)
+            if result:
+                # Apply custom transformation if provided
+                if self.config.custom_transform_function:
+                    try:
+                        result = self.config.custom_transform_function(result)
+                    except Exception as e:
+                        logger.error(f"Error in custom transform for {url}: {str(e)}")
+
+                await self._save_result(url, result)
+                return result
+
+            # Wait before retry
+            if attempt < self.config.max_retries - 1:
+                await asyncio.sleep(self.config.retry_delay)
+
+        return None
+
+    async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
+        """Process multiple pieces of content with rate limiting."""
+        results = {}
+        for url, content in content_map.items():
+            results[url] = await self.process_content(url, content)
+            await asyncio.sleep(self.config.request_delay)
+        return results
+
+    def retry_failed_requests(self) -> Dict[str, Optional[Dict]]:
+        """Retry all failed requests from the buffer."""
+        failed_requests = self.buffer.get_retryable_requests(self.config.max_retries)
+        return asyncio.run(self.process_bulk_content(
+            {url: data['content'] for url, data in failed_requests.items()}
+        ))
+
+    async def get_processing_stats(self) -> Dict[str, Any]:
+        """Get detailed processing statistics."""
+        return {
+            "total_processed": len(self.results),
+            "failed_requests": len(self.buffer.get_failed_requests()),
+            "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
+            "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
+        }
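For orientation, here is a minimal usage sketch of the classes added in the file above. The model identifier, API key, file paths, and example URL are placeholders chosen for illustration, not values shipped with the package; only the class names, fields, and methods come from the diff.

# Hypothetical usage sketch (model id, API key, paths, and URL are placeholders).
import asyncio
from pathlib import Path

from spiderforce4ai.post_extraction_agent import PostExtractionAgent, PostExtractionConfig

config = PostExtractionConfig(
    model="openai/gpt-4o-mini",          # any litellm-compatible model identifier
    messages=[
        {"role": "system", "content": "Return the page title and a one-line summary as JSON."},
        {"role": "user", "content": "{here_markdown_content}"},  # replaced with each page's markdown
    ],
    api_key="sk-...",                     # your LLM provider key
    combine_output=True,
    output_file=Path("output/extracted.json"),
)

agent = PostExtractionAgent(config)

# Process a mapping of URL -> markdown content; results are keyed by URL.
results = asyncio.run(agent.process_bulk_content({
    "https://example.com/page": "# Example page\n\nSome markdown content...",
}))
print(results)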
{spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/METADATA

@@ -1,16 +1,24 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.3.1
-Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+Version: 2.4
+Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
 Author-email: Piotr Tamulewicz <pt@petertam.pro>
-
+Project-URL: Homepage, https://petertam.pro
+Project-URL: Documentation, https://petertam.pro/docs/spiderforce4ai
+Project-URL: Repository, https://github.com/yourusername/spiderforce4ai
+Project-URL: Bug Tracker, https://github.com/yourusername/spiderforce4ai/issues
+Keywords: web-scraping,markdown,html-to-markdown,llm,ai,content-extraction,async,parallel-processing
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 Requires-Dist: aiohttp>=3.8.0
@@ -18,6 +26,36 @@ Requires-Dist: asyncio>=3.4.3
 Requires-Dist: rich>=10.0.0
 Requires-Dist: aiofiles>=0.8.0
 Requires-Dist: httpx>=0.24.0
+Requires-Dist: litellm>=1.26.0
+Requires-Dist: pydantic>=2.6.0
+Requires-Dist: requests>=2.31.0
+Requires-Dist: aiofiles>=23.2.1
+Requires-Dist: et-xmlfile>=1.1.0
+Requires-Dist: multidict>=6.0.4
+Requires-Dist: openai>=1.12.0
+Requires-Dist: pandas>=2.2.0
+Requires-Dist: numpy>=1.26.0
+Requires-Dist: yarl>=1.9.4
+Requires-Dist: typing_extensions>=4.9.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.4.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.1; extra == "dev"
+Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
+Requires-Dist: black>=23.7.0; extra == "dev"
+Requires-Dist: isort>=5.12.0; extra == "dev"
+Requires-Dist: mypy>=1.4.1; extra == "dev"
+Requires-Dist: ruff>=0.1.8; extra == "dev"
+Requires-Dist: pre-commit>=3.5.0; extra == "dev"
+Provides-Extra: test
+Requires-Dist: pytest>=7.4.0; extra == "test"
+Requires-Dist: pytest-asyncio>=0.21.1; extra == "test"
+Requires-Dist: pytest-cov>=4.1.0; extra == "test"
+Requires-Dist: pytest-mock>=3.12.0; extra == "test"
+Requires-Dist: coverage>=7.4.0; extra == "test"
+Provides-Extra: docs
+Requires-Dist: sphinx>=7.1.0; extra == "docs"
+Requires-Dist: sphinx-rtd-theme>=1.3.0; extra == "docs"
+Requires-Dist: myst-parser>=2.0.0; extra == "docs"
 Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python
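The new optional dependency groups (dev, test, docs) are declared as standard extras, so they should be installable with the usual pip syntax, for example pip install "spiderforce4ai[dev]", assuming the published package name matches the Name field above.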
spiderforce4ai-2.4.dist-info/RECORD

@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
+spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
+spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
+spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.4.dist-info/RECORD,,
@@ -1,5 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=xugT2psrwNiT-qCgSjEPUERvJszcsojYHiU_19sV__A,34754
|
2
|
-
spiderforce4ai-2.3.1.dist-info/METADATA,sha256=Mremwj9Ysxs8zAQ_hLeBM_MSes45NrfFb0nABOVvhrs,7185
|
3
|
-
spiderforce4ai-2.3.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
4
|
-
spiderforce4ai-2.3.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
5
|
-
spiderforce4ai-2.3.1.dist-info/RECORD,,
|
{spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/WHEEL: file without changes

{spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/top_level.txt: file without changes