spiderforce4ai 2.3.1__py3-none-any.whl → 2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +411 -349
- spiderforce4ai/post_extraction_agent.py +259 -0
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/METADATA +41 -3
- spiderforce4ai-2.4.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/entry_points.txt +2 -0
- spiderforce4ai-2.3.1.dist-info/RECORD +0 -5
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/top_level.txt +0 -0
spiderforce4ai/post_extraction_agent.py (new file)
@@ -0,0 +1,259 @@
+# post_extraction_agent.py
+
+from dataclasses import dataclass, asdict
+from typing import Any, Callable, Dict, List, Optional, Union
+import json
+import asyncio
+import time
+from pathlib import Path
+import aiofiles
+from litellm import completion
+from pydantic import BaseModel, Field
+import logging
+from datetime import datetime
+import re
+
+logger = logging.getLogger(__name__)
+
+class PostExtractionBuffer:
+    """Buffer system for tracking and retrying failed LLM requests."""
+
+    def __init__(self, buffer_file: Optional[Path] = None):
+        # Generate a unique session ID using timestamp and random string
+        session_id = f"{int(time.time())}_{hex(hash(str(time.time())))[-6:]}"
+
+        # Create unique buffer file path
+        if buffer_file:
+            # If buffer_file is provided, insert session_id before the extension
+            stem = buffer_file.stem
+            suffix = buffer_file.suffix
+            self.buffer_file = buffer_file.with_name(f"{stem}_{session_id}{suffix}")
+        else:
+            # Default buffer file with session_id
+            self.buffer_file = Path(f"post_extraction_buffer_{session_id}.json")
+
+        self.failed_requests: Dict[str, Dict] = {}
+        self._load_buffer()
+
+    def _load_buffer(self) -> None:
+        """Load failed requests from buffer file if it exists."""
+        if self.buffer_file.exists():
+            try:
+                with open(self.buffer_file, 'r') as f:
+                    self.failed_requests = json.load(f)
+            except Exception as e:
+                logger.error(f"Error loading buffer file: {e}")
+                self.failed_requests = {}
+
+    def _save_buffer(self) -> None:
+        """Save failed requests to buffer file."""
+        try:
+            with open(self.buffer_file, 'w') as f:
+                json.dump(self.failed_requests, f, indent=2)
+        except Exception as e:
+            logger.error(f"Error saving buffer file: {e}")
+
+    def add_failed_request(self, url: str, content: str, error: str) -> None:
+        """Add a failed request to the buffer."""
+        self.failed_requests[url] = {
+            "content": content,
+            "error": error,
+            "timestamp": datetime.now().isoformat(),
+            "attempts": self.failed_requests.get(url, {}).get("attempts", 0) + 1
+        }
+        self._save_buffer()
+
+    def remove_request(self, url: str) -> None:
+        """Remove a request from the buffer after successful processing."""
+        if url in self.failed_requests:
+            del self.failed_requests[url]
+            self._save_buffer()
+
+    def get_failed_requests(self) -> Dict[str, Dict]:
+        """Get all failed requests."""
+        return self.failed_requests
+
+    def get_retryable_requests(self, max_attempts: int = 3) -> Dict[str, Dict]:
+        """Get failed requests that haven't exceeded max retry attempts."""
+        return {
+            url: data for url, data in self.failed_requests.items()
+            if data.get("attempts", 0) < max_attempts
+        }
+
+class ExtractionTemplate(BaseModel):
+    """Base model for extraction template validation."""
+    template: Dict[str, Any] = Field(..., description="Template structure for extraction")
+
+    class Config:
+        extra = "allow"
+        arbitrary_types_allowed = True
+
+    @classmethod
+    def validate_template_string(cls, template_str: str) -> bool:
+        """Validate a template string against the schema."""
+        try:
+            template_json = json.loads(template_str)
+            cls(template=template_json)
+            return True
+        except Exception as e:
+            logger.error(f"Template validation failed: {e}")
+            return False
+
+@dataclass
+class PostExtractionConfig:
+    """Configuration for post-extraction processing."""
+    model: str
+    messages: List[Dict[str, str]]
+    api_key: str
+    max_tokens: int = 1000
+    temperature: float = 0.7
+    base_url: Optional[str] = None
+    request_delay: float = 0.01  # 10 milliseconds default
+    max_retries: int = 3
+    retry_delay: float = 1.0
+    combine_output: bool = False
+    output_file: Optional[Path] = None
+    custom_transform_function: Optional[Callable] = None
+    buffer_file: Optional[Path] = None
+
+    def __post_init__(self):
+        if self.output_file:
+            self.output_file = Path(self.output_file)
+            self.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+        if self.buffer_file:
+            self.buffer_file = Path(self.buffer_file)
+            self.buffer_file.parent.mkdir(parents=True, exist_ok=True)
+
+class RateLimiter:
+    """Rate limiter for API calls."""
+
+    def __init__(self, requests_per_minute: int = 60):
+        self.requests_per_minute = requests_per_minute
+        self.interval = 60 / requests_per_minute
+        self.last_request = 0
+        self._lock = asyncio.Lock()
+
+    async def acquire(self):
+        """Acquire rate limit slot."""
+        async with self._lock:
+            now = time.time()
+            if self.last_request:
+                elapsed = now - self.last_request
+                if elapsed < self.interval:
+                    await asyncio.sleep(self.interval - elapsed)
+            self.last_request = time.time()
+
+class PostExtractionAgent:
+    """Agent for processing extracted content using LLM models."""
+
+    def __init__(self, config: PostExtractionConfig):
+        self.config = config
+        self.buffer = PostExtractionBuffer(config.buffer_file)
+        self.results: Dict[str, Any] = {}
+        self.rate_limiter = RateLimiter()
+        self._setup_output()
+
+    def _setup_output(self) -> None:
+        """Setup output file if combining results."""
+        if self.config.combine_output and self.config.output_file:
+            self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+            if self.config.output_file.exists():
+                # Backup existing file
+                backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
+                self.config.output_file.rename(backup_path)
+            self.config.output_file.touch()
+
+    async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
+        """Process a single piece of content through the LLM."""
+        try:
+            # Apply rate limiting
+            await self.rate_limiter.acquire()
+
+            # Replace placeholder in messages with actual content
+            messages = [
+                {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
+                for msg in self.config.messages
+            ]
+
+            # Make LLM request
+            response = await completion(
+                model=self.config.model,
+                messages=messages,
+                max_tokens=self.config.max_tokens,
+                temperature=self.config.temperature,
+                api_key=self.config.api_key,
+                api_base=self.config.base_url
+            )
+
+            # Parse response
+            try:
+                extracted_data = json.loads(response.choices[0].message.content)
+                self.buffer.remove_request(url)  # Remove from buffer if successful
+                return extracted_data
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid JSON response from LLM: {e}")
+
+        except Exception as e:
+            logger.error(f"Error processing {url}: {str(e)}")
+            self.buffer.add_failed_request(url, content, str(e))
+            return None
+
+    async def _save_result(self, url: str, result: Dict) -> None:
+        """Save individual or combined results."""
+        try:
+            if self.config.combine_output and self.config.output_file:
+                self.results[url] = result
+                async with aiofiles.open(self.config.output_file, 'w') as f:
+                    await f.write(json.dumps(self.results, indent=2))
+            elif not self.config.combine_output and self.config.output_file:
+                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                async with aiofiles.open(individual_file, 'w') as f:
+                    await f.write(json.dumps(result, indent=2))
+        except Exception as e:
+            logger.error(f"Error saving results for {url}: {str(e)}")
+
+    async def process_content(self, url: str, content: str) -> Optional[Dict]:
+        """Process content with retry mechanism."""
+        for attempt in range(self.config.max_retries):
+            result = await self._process_single_content(url, content)
+            if result:
+                # Apply custom transformation if provided
+                if self.config.custom_transform_function:
+                    try:
+                        result = self.config.custom_transform_function(result)
+                    except Exception as e:
+                        logger.error(f"Error in custom transform for {url}: {str(e)}")
+
+                await self._save_result(url, result)
+                return result
+
+            # Wait before retry
+            if attempt < self.config.max_retries - 1:
+                await asyncio.sleep(self.config.retry_delay)
+
+        return None
+
+    async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
+        """Process multiple pieces of content with rate limiting."""
+        results = {}
+        for url, content in content_map.items():
+            results[url] = await self.process_content(url, content)
+            await asyncio.sleep(self.config.request_delay)
+        return results
+
+    def retry_failed_requests(self) -> Dict[str, Optional[Dict]]:
+        """Retry all failed requests from the buffer."""
+        failed_requests = self.buffer.get_retryable_requests(self.config.max_retries)
+        return asyncio.run(self.process_bulk_content(
+            {url: data['content'] for url, data in failed_requests.items()}
+        ))

+    async def get_processing_stats(self) -> Dict[str, Any]:
+        """Get detailed processing statistics."""
+        return {
+            "total_processed": len(self.results),
+            "failed_requests": len(self.buffer.get_failed_requests()),
+            "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
+            "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
+        }
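For orientation, here is a minimal usage sketch of the new module. It only assumes the public classes shown in the diff above; the model id, API key, prompt, URLs, and output path are illustrative placeholders, not values shipped with the package.

```python
# Hypothetical usage sketch of the new post-extraction agent (not part of the package).
import asyncio
from pathlib import Path

from spiderforce4ai.post_extraction_agent import PostExtractionAgent, PostExtractionConfig

config = PostExtractionConfig(
    model="openai/gpt-4o-mini",  # placeholder: any litellm-style model id
    messages=[
        {"role": "system", "content": "Return the page title and a one-line summary as JSON."},
        # The literal {here_markdown_content} token is replaced per page by the agent.
        {"role": "user", "content": "{here_markdown_content}"},
    ],
    api_key="sk-...",            # placeholder API key
    combine_output=True,
    output_file=Path("output/extracted.json"),
)

agent = PostExtractionAgent(config)

# URL -> markdown map, e.g. produced by a SpiderForce4AI crawl (contents are made up).
content_map = {
    "https://example.com/a": "# Page A\n\nSome markdown...",
    "https://example.com/b": "# Page B\n\nMore markdown...",
}

results = asyncio.run(agent.process_bulk_content(content_map))
print(results)
```

Failed LLM calls land in the session's buffer file, so a later `agent.retry_failed_requests()` replays only the pages that have not yet exceeded `max_retries`.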
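Similarly, a small sketch of the `ExtractionTemplate` helper added in the same file; the template fields below are made up for illustration.

```python
# Hypothetical template-validation sketch (fields are illustrative).
import json

from spiderforce4ai.post_extraction_agent import ExtractionTemplate

template_str = json.dumps({
    "title": "string",
    "summary": "string",
    "keywords": ["string"],
})

# True if the string parses as JSON and fits the template model; False otherwise
# (validation errors are logged by the module rather than raised).
print(ExtractionTemplate.validate_template_string(template_str))  # -> True
```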
{spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/METADATA
@@ -1,16 +1,24 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.3.1
-Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+Version: 2.4
+Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
 Author-email: Piotr Tamulewicz <pt@petertam.pro>
-
+Project-URL: Homepage, https://petertam.pro
+Project-URL: Documentation, https://petertam.pro/docs/spiderforce4ai
+Project-URL: Repository, https://github.com/yourusername/spiderforce4ai
+Project-URL: Bug Tracker, https://github.com/yourusername/spiderforce4ai/issues
+Keywords: web-scraping,markdown,html-to-markdown,llm,ai,content-extraction,async,parallel-processing
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 Requires-Dist: aiohttp>=3.8.0

@@ -18,6 +26,36 @@ Requires-Dist: asyncio>=3.4.3
 Requires-Dist: rich>=10.0.0
 Requires-Dist: aiofiles>=0.8.0
 Requires-Dist: httpx>=0.24.0
+Requires-Dist: litellm>=1.26.0
+Requires-Dist: pydantic>=2.6.0
+Requires-Dist: requests>=2.31.0
+Requires-Dist: aiofiles>=23.2.1
+Requires-Dist: et-xmlfile>=1.1.0
+Requires-Dist: multidict>=6.0.4
+Requires-Dist: openai>=1.12.0
+Requires-Dist: pandas>=2.2.0
+Requires-Dist: numpy>=1.26.0
+Requires-Dist: yarl>=1.9.4
+Requires-Dist: typing_extensions>=4.9.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.4.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.1; extra == "dev"
+Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
+Requires-Dist: black>=23.7.0; extra == "dev"
+Requires-Dist: isort>=5.12.0; extra == "dev"
+Requires-Dist: mypy>=1.4.1; extra == "dev"
+Requires-Dist: ruff>=0.1.8; extra == "dev"
+Requires-Dist: pre-commit>=3.5.0; extra == "dev"
+Provides-Extra: test
+Requires-Dist: pytest>=7.4.0; extra == "test"
+Requires-Dist: pytest-asyncio>=0.21.1; extra == "test"
+Requires-Dist: pytest-cov>=4.1.0; extra == "test"
+Requires-Dist: pytest-mock>=3.12.0; extra == "test"
+Requires-Dist: coverage>=7.4.0; extra == "test"
+Provides-Extra: docs
+Requires-Dist: sphinx>=7.1.0; extra == "docs"
+Requires-Dist: sphinx-rtd-theme>=1.3.0; extra == "docs"
+Requires-Dist: myst-parser>=2.0.0; extra == "docs"
 Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python
spiderforce4ai-2.4.dist-info/RECORD (new file)
@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
+spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
+spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
+spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.4.dist-info/RECORD,,
spiderforce4ai-2.3.1.dist-info/RECORD (deleted file)
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=xugT2psrwNiT-qCgSjEPUERvJszcsojYHiU_19sV__A,34754
-spiderforce4ai-2.3.1.dist-info/METADATA,sha256=Mremwj9Ysxs8zAQ_hLeBM_MSes45NrfFb0nABOVvhrs,7185
-spiderforce4ai-2.3.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.3.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.3.1.dist-info/RECORD,,
{spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/WHEEL: file without changes
{spiderforce4ai-2.3.1.dist-info → spiderforce4ai-2.4.dist-info}/top_level.txt: file without changes