spiderforce4ai 2.3.1__py3-none-any.whl → 2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
+ # post_extraction_agent.py
+
+ from dataclasses import dataclass, asdict
+ from typing import Any, Callable, Dict, List, Optional, Union
+ import json
+ import asyncio
+ import time
+ from pathlib import Path
+ import aiofiles
+ from litellm import acompletion
+ from pydantic import BaseModel, Field
+ import logging
+ from datetime import datetime
+ import re
+
+ logger = logging.getLogger(__name__)
+
+ class PostExtractionBuffer:
+     """Buffer system for tracking and retrying failed LLM requests."""
+
+     def __init__(self, buffer_file: Optional[Path] = None):
+         # Generate a unique session ID using timestamp and random string
+         session_id = f"{int(time.time())}_{hex(hash(str(time.time())))[-6:]}"
+
+         # Create unique buffer file path
+         if buffer_file:
+             # If buffer_file is provided, insert session_id before the extension
+             stem = buffer_file.stem
+             suffix = buffer_file.suffix
+             self.buffer_file = buffer_file.with_name(f"{stem}_{session_id}{suffix}")
+         else:
+             # Default buffer file with session_id
+             self.buffer_file = Path(f"post_extraction_buffer_{session_id}.json")
+
+         self.failed_requests: Dict[str, Dict] = {}
+         self._load_buffer()
+
+     def _load_buffer(self) -> None:
+         """Load failed requests from buffer file if it exists."""
+         if self.buffer_file.exists():
+             try:
+                 with open(self.buffer_file, 'r') as f:
+                     self.failed_requests = json.load(f)
+             except Exception as e:
+                 logger.error(f"Error loading buffer file: {e}")
+                 self.failed_requests = {}
+
+     def _save_buffer(self) -> None:
+         """Save failed requests to buffer file."""
+         try:
+             with open(self.buffer_file, 'w') as f:
+                 json.dump(self.failed_requests, f, indent=2)
+         except Exception as e:
+             logger.error(f"Error saving buffer file: {e}")
+
+     def add_failed_request(self, url: str, content: str, error: str) -> None:
+         """Add a failed request to the buffer."""
+         self.failed_requests[url] = {
+             "content": content,
+             "error": error,
+             "timestamp": datetime.now().isoformat(),
+             "attempts": self.failed_requests.get(url, {}).get("attempts", 0) + 1
+         }
+         self._save_buffer()
+
+     def remove_request(self, url: str) -> None:
+         """Remove a request from the buffer after successful processing."""
+         if url in self.failed_requests:
+             del self.failed_requests[url]
+             self._save_buffer()
+
+     def get_failed_requests(self) -> Dict[str, Dict]:
+         """Get all failed requests."""
+         return self.failed_requests
+
+     def get_retryable_requests(self, max_attempts: int = 3) -> Dict[str, Dict]:
+         """Get failed requests that haven't exceeded max retry attempts."""
+         return {
+             url: data for url, data in self.failed_requests.items()
+             if data.get("attempts", 0) < max_attempts
+         }
+
+ class ExtractionTemplate(BaseModel):
+     """Base model for extraction template validation."""
+     template: Dict[str, Any] = Field(..., description="Template structure for extraction")
+
+     class Config:
+         extra = "allow"
+         arbitrary_types_allowed = True
+
+     @classmethod
+     def validate_template_string(cls, template_str: str) -> bool:
+         """Validate a template string against the schema."""
+         try:
+             template_json = json.loads(template_str)
+             cls(template=template_json)
+             return True
+         except Exception as e:
+             logger.error(f"Template validation failed: {e}")
+             return False
+
+ @dataclass
+ class PostExtractionConfig:
+     """Configuration for post-extraction processing."""
+     model: str
+     messages: List[Dict[str, str]]
+     api_key: str
+     max_tokens: int = 1000
+     temperature: float = 0.7
+     base_url: Optional[str] = None
+     request_delay: float = 0.01  # 10 milliseconds default
+     max_retries: int = 3
+     retry_delay: float = 1.0
+     combine_output: bool = False
+     output_file: Optional[Path] = None
+     custom_transform_function: Optional[Callable] = None
+     buffer_file: Optional[Path] = None
+
+     def __post_init__(self):
+         if self.output_file:
+             self.output_file = Path(self.output_file)
+             self.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+         if self.buffer_file:
+             self.buffer_file = Path(self.buffer_file)
+             self.buffer_file.parent.mkdir(parents=True, exist_ok=True)
+
+ class RateLimiter:
+     """Rate limiter for API calls."""
+
+     def __init__(self, requests_per_minute: int = 60):
+         self.requests_per_minute = requests_per_minute
+         self.interval = 60 / requests_per_minute
+         self.last_request = 0
+         self._lock = asyncio.Lock()
+
+     async def acquire(self):
+         """Acquire rate limit slot."""
+         async with self._lock:
+             now = time.time()
+             if self.last_request:
+                 elapsed = now - self.last_request
+                 if elapsed < self.interval:
+                     await asyncio.sleep(self.interval - elapsed)
+             self.last_request = time.time()
+
+ class PostExtractionAgent:
+     """Agent for processing extracted content using LLM models."""
+
+     def __init__(self, config: PostExtractionConfig):
+         self.config = config
+         self.buffer = PostExtractionBuffer(config.buffer_file)
+         self.results: Dict[str, Any] = {}
+         self.rate_limiter = RateLimiter()
+         self._setup_output()
+
+     def _setup_output(self) -> None:
+         """Setup output file if combining results."""
+         if self.config.combine_output and self.config.output_file:
+             self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+             if self.config.output_file.exists():
+                 # Backup existing file
+                 backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
+                 self.config.output_file.rename(backup_path)
+             self.config.output_file.touch()
+
+     async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
+         """Process a single piece of content through the LLM."""
+         try:
+             # Apply rate limiting
+             await self.rate_limiter.acquire()
+
+             # Replace placeholder in messages with actual content
+             messages = [
+                 {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
+                 for msg in self.config.messages
+             ]
+
+             # Make LLM request (acompletion is litellm's awaitable API)
+             response = await acompletion(
+                 model=self.config.model,
+                 messages=messages,
+                 max_tokens=self.config.max_tokens,
+                 temperature=self.config.temperature,
+                 api_key=self.config.api_key,
+                 api_base=self.config.base_url
+             )
+
+             # Parse response
+             try:
+                 extracted_data = json.loads(response.choices[0].message.content)
+                 self.buffer.remove_request(url)  # Remove from buffer if successful
+                 return extracted_data
+             except json.JSONDecodeError as e:
+                 raise ValueError(f"Invalid JSON response from LLM: {e}")
+
+         except Exception as e:
+             logger.error(f"Error processing {url}: {str(e)}")
+             self.buffer.add_failed_request(url, content, str(e))
+             return None
+
+     async def _save_result(self, url: str, result: Dict) -> None:
+         """Save individual or combined results."""
+         try:
+             if self.config.combine_output and self.config.output_file:
+                 self.results[url] = result
+                 async with aiofiles.open(self.config.output_file, 'w') as f:
+                     await f.write(json.dumps(self.results, indent=2))
+             elif not self.config.combine_output and self.config.output_file:
+                 individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                 async with aiofiles.open(individual_file, 'w') as f:
+                     await f.write(json.dumps(result, indent=2))
+         except Exception as e:
+             logger.error(f"Error saving results for {url}: {str(e)}")
+
+     async def process_content(self, url: str, content: str) -> Optional[Dict]:
+         """Process content with retry mechanism."""
+         for attempt in range(self.config.max_retries):
+             result = await self._process_single_content(url, content)
+             if result is not None:  # explicit None check so an empty dict still counts as success
+                 # Apply custom transformation if provided
+                 if self.config.custom_transform_function:
+                     try:
+                         result = self.config.custom_transform_function(result)
+                     except Exception as e:
+                         logger.error(f"Error in custom transform for {url}: {str(e)}")
+
+                 await self._save_result(url, result)
+                 return result
+
+             # Wait before retry
+             if attempt < self.config.max_retries - 1:
+                 await asyncio.sleep(self.config.retry_delay)
+
+         return None
+
+     async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
+         """Process multiple pieces of content with rate limiting."""
+         results = {}
+         for url, content in content_map.items():
+             results[url] = await self.process_content(url, content)
+             await asyncio.sleep(self.config.request_delay)
+         return results
+
+     def retry_failed_requests(self) -> Dict[str, Optional[Dict]]:
+         """Retry all failed requests from the buffer."""
+         failed_requests = self.buffer.get_retryable_requests(self.config.max_retries)
+         return asyncio.run(self.process_bulk_content(
+             {url: data['content'] for url, data in failed_requests.items()}
+         ))
+
+     async def get_processing_stats(self) -> Dict[str, Any]:
+         """Get detailed processing statistics."""
+         return {
+             "total_processed": len(self.results),
+             "failed_requests": len(self.buffer.get_failed_requests()),
+             "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
+             "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
+         }
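
The new module is driven through PostExtractionConfig and PostExtractionAgent. A minimal usage sketch follows, assuming an OpenAI-style model identifier and API key (both illustrative placeholders); note the prompt must ask the model for JSON, since the agent runs json.loads on the reply, and the literal {here_markdown_content} token marks where each page's markdown is substituted:

import asyncio
from pathlib import Path
from spiderforce4ai.post_extraction_agent import PostExtractionAgent, PostExtractionConfig

config = PostExtractionConfig(
    model="openai/gpt-4o-mini",  # illustrative model name
    messages=[
        {"role": "system", "content": "Return a JSON object with title and summary."},
        {"role": "user", "content": "{here_markdown_content}"},
    ],
    api_key="sk-illustrative-key",  # illustrative, not a real key
    combine_output=True,
    output_file=Path("output/results.json"),
)

agent = PostExtractionAgent(config)
results = asyncio.run(agent.process_bulk_content({
    "https://example.com/page": "# Example\n\nSome markdown content.",
}))

Pages that fail are persisted to the per-session buffer file and can be replayed later, from a synchronous context, with agent.retry_failed_requests().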
@@ -1,16 +1,24 @@
  Metadata-Version: 2.2
  Name: spiderforce4ai
- Version: 2.3.1
- Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+ Version: 2.4
+ Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
  Home-page: https://petertam.pro
  Author: Piotr Tamulewicz
  Author-email: Piotr Tamulewicz <pt@petertam.pro>
- License: MIT
+ Project-URL: Homepage, https://petertam.pro
+ Project-URL: Documentation, https://petertam.pro/docs/spiderforce4ai
+ Project-URL: Repository, https://github.com/yourusername/spiderforce4ai
+ Project-URL: Bug Tracker, https://github.com/yourusername/spiderforce4ai/issues
+ Keywords: web-scraping,markdown,html-to-markdown,llm,ai,content-extraction,async,parallel-processing
  Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.11
  Description-Content-Type: text/markdown
  Requires-Dist: aiohttp>=3.8.0
@@ -18,6 +26,36 @@ Requires-Dist: asyncio>=3.4.3
  Requires-Dist: rich>=10.0.0
  Requires-Dist: aiofiles>=0.8.0
  Requires-Dist: httpx>=0.24.0
+ Requires-Dist: litellm>=1.26.0
+ Requires-Dist: pydantic>=2.6.0
+ Requires-Dist: requests>=2.31.0
+ Requires-Dist: aiofiles>=23.2.1
+ Requires-Dist: et-xmlfile>=1.1.0
+ Requires-Dist: multidict>=6.0.4
+ Requires-Dist: openai>=1.12.0
+ Requires-Dist: pandas>=2.2.0
+ Requires-Dist: numpy>=1.26.0
+ Requires-Dist: yarl>=1.9.4
+ Requires-Dist: typing_extensions>=4.9.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
+ Requires-Dist: pytest-asyncio>=0.21.1; extra == "dev"
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
+ Requires-Dist: black>=23.7.0; extra == "dev"
+ Requires-Dist: isort>=5.12.0; extra == "dev"
+ Requires-Dist: mypy>=1.4.1; extra == "dev"
+ Requires-Dist: ruff>=0.1.8; extra == "dev"
+ Requires-Dist: pre-commit>=3.5.0; extra == "dev"
+ Provides-Extra: test
+ Requires-Dist: pytest>=7.4.0; extra == "test"
+ Requires-Dist: pytest-asyncio>=0.21.1; extra == "test"
+ Requires-Dist: pytest-cov>=4.1.0; extra == "test"
+ Requires-Dist: pytest-mock>=3.12.0; extra == "test"
+ Requires-Dist: coverage>=7.4.0; extra == "test"
+ Provides-Extra: docs
+ Requires-Dist: sphinx>=7.1.0; extra == "docs"
+ Requires-Dist: sphinx-rtd-theme>=1.3.0; extra == "docs"
+ Requires-Dist: myst-parser>=2.0.0; extra == "docs"
  Dynamic: author
  Dynamic: home-page
  Dynamic: requires-python
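
The three new optional dependency groups (dev, test, docs) are consumed with standard pip extras syntax, for example:

pip install spiderforce4ai==2.4
pip install "spiderforce4ai[dev]"   # pulls in pytest, black, mypy, ruff, pre-commit, etc.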
@@ -0,0 +1,7 @@
+ spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
+ spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
+ spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
+ spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+ spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+ spiderforce4ai-2.4.dist-info/RECORD,,
@@ -0,0 +1,2 @@
+ [console_scripts]
+ spiderforce4ai = spiderforce4ai.cli:main
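
The new [console_scripts] block registers a spiderforce4ai executable on PATH that dispatches to main() in spiderforce4ai.cli, so after installation the tool would be run from a shell, e.g.:

spiderforce4ai --help   # assuming the cli module exposes conventional help output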
@@ -1,5 +0,0 @@
- spiderforce4ai/__init__.py,sha256=xugT2psrwNiT-qCgSjEPUERvJszcsojYHiU_19sV__A,34754
- spiderforce4ai-2.3.1.dist-info/METADATA,sha256=Mremwj9Ysxs8zAQ_hLeBM_MSes45NrfFb0nABOVvhrs,7185
- spiderforce4ai-2.3.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- spiderforce4ai-2.3.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
- spiderforce4ai-2.3.1.dist-info/RECORD,,