spiderforce4ai-2.1-py3-none-any.whl → spiderforce4ai-2.4-py3-none-any.whl

spiderforce4ai/post_extraction_agent.py (new file)
@@ -0,0 +1,259 @@
+ # post_extraction_agent.py
+
+ from dataclasses import dataclass, asdict
+ from typing import Any, Callable, Dict, List, Optional, Union
+ import json
+ import asyncio
+ import time
+ from pathlib import Path
+ import aiofiles
+ from litellm import acompletion  # async variant; litellm's completion() is synchronous
+ from pydantic import BaseModel, Field
+ import logging
+ from datetime import datetime
+ import re
+
+ logger = logging.getLogger(__name__)
+
+ class PostExtractionBuffer:
+     """Buffer system for tracking and retrying failed LLM requests."""
+
+     def __init__(self, buffer_file: Optional[Path] = None):
+         # Generate a unique session ID using timestamp and random string
+         session_id = f"{int(time.time())}_{hex(hash(str(time.time())))[-6:]}"
+
+         # Create unique buffer file path
+         if buffer_file:
+             # If buffer_file is provided, insert session_id before the extension
+             stem = buffer_file.stem
+             suffix = buffer_file.suffix
+             self.buffer_file = buffer_file.with_name(f"{stem}_{session_id}{suffix}")
+         else:
+             # Default buffer file with session_id
+             self.buffer_file = Path(f"post_extraction_buffer_{session_id}.json")
+
+         self.failed_requests: Dict[str, Dict] = {}
+         self._load_buffer()
+
+     def _load_buffer(self) -> None:
+         """Load failed requests from buffer file if it exists."""
+         if self.buffer_file.exists():
+             try:
+                 with open(self.buffer_file, 'r') as f:
+                     self.failed_requests = json.load(f)
+             except Exception as e:
+                 logger.error(f"Error loading buffer file: {e}")
+                 self.failed_requests = {}
+
+     def _save_buffer(self) -> None:
+         """Save failed requests to buffer file."""
+         try:
+             with open(self.buffer_file, 'w') as f:
+                 json.dump(self.failed_requests, f, indent=2)
+         except Exception as e:
+             logger.error(f"Error saving buffer file: {e}")
+
+     def add_failed_request(self, url: str, content: str, error: str) -> None:
+         """Add a failed request to the buffer."""
+         self.failed_requests[url] = {
+             "content": content,
+             "error": error,
+             "timestamp": datetime.now().isoformat(),
+             "attempts": self.failed_requests.get(url, {}).get("attempts", 0) + 1
+         }
+         self._save_buffer()
+
+     def remove_request(self, url: str) -> None:
+         """Remove a request from the buffer after successful processing."""
+         if url in self.failed_requests:
+             del self.failed_requests[url]
+             self._save_buffer()
+
+     def get_failed_requests(self) -> Dict[str, Dict]:
+         """Get all failed requests."""
+         return self.failed_requests
+
+     def get_retryable_requests(self, max_attempts: int = 3) -> Dict[str, Dict]:
+         """Get failed requests that haven't exceeded max retry attempts."""
+         return {
+             url: data for url, data in self.failed_requests.items()
+             if data.get("attempts", 0) < max_attempts
+         }
+
+ class ExtractionTemplate(BaseModel):
+     """Base model for extraction template validation."""
+     template: Dict[str, Any] = Field(..., description="Template structure for extraction")
+
+     class Config:
+         extra = "allow"
+         arbitrary_types_allowed = True
+
+     @classmethod
+     def validate_template_string(cls, template_str: str) -> bool:
+         """Validate a template string against the schema."""
+         try:
+             template_json = json.loads(template_str)
+             cls(template=template_json)
+             return True
+         except Exception as e:
+             logger.error(f"Template validation failed: {e}")
+             return False
+
+ @dataclass
+ class PostExtractionConfig:
+     """Configuration for post-extraction processing."""
+     model: str
+     messages: List[Dict[str, str]]
+     api_key: str
+     max_tokens: int = 1000
+     temperature: float = 0.7
+     base_url: Optional[str] = None
+     request_delay: float = 0.01  # 10 milliseconds default
+     max_retries: int = 3
+     retry_delay: float = 1.0
+     combine_output: bool = False
+     output_file: Optional[Path] = None
+     custom_transform_function: Optional[Callable] = None
+     buffer_file: Optional[Path] = None
+
+     def __post_init__(self):
+         if self.output_file:
+             self.output_file = Path(self.output_file)
+             self.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+         if self.buffer_file:
+             self.buffer_file = Path(self.buffer_file)
+             self.buffer_file.parent.mkdir(parents=True, exist_ok=True)
+
+ class RateLimiter:
+     """Rate limiter for API calls."""
+
+     def __init__(self, requests_per_minute: int = 60):
+         self.requests_per_minute = requests_per_minute
+         self.interval = 60 / requests_per_minute
+         self.last_request = 0
+         self._lock = asyncio.Lock()
+
+     async def acquire(self):
+         """Acquire rate limit slot."""
+         async with self._lock:
+             now = time.time()
+             if self.last_request:
+                 elapsed = now - self.last_request
+                 if elapsed < self.interval:
+                     await asyncio.sleep(self.interval - elapsed)
+             self.last_request = time.time()
+
+ class PostExtractionAgent:
+     """Agent for processing extracted content using LLM models."""
+
+     def __init__(self, config: PostExtractionConfig):
+         self.config = config
+         self.buffer = PostExtractionBuffer(config.buffer_file)
+         self.results: Dict[str, Any] = {}
+         self.rate_limiter = RateLimiter()
+         self._setup_output()
+
+     def _setup_output(self) -> None:
+         """Setup output file if combining results."""
+         if self.config.combine_output and self.config.output_file:
+             self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+             if self.config.output_file.exists():
+                 # Backup existing file
+                 backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
+                 self.config.output_file.rename(backup_path)
+             self.config.output_file.touch()
+
+     async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
+         """Process a single piece of content through the LLM."""
+         try:
+             # Apply rate limiting
+             await self.rate_limiter.acquire()
+
+             # Replace placeholder in messages with actual content
+             messages = [
+                 {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
+                 for msg in self.config.messages
+             ]
+
+             # Make LLM request
+             response = await acompletion(
+                 model=self.config.model,
+                 messages=messages,
+                 max_tokens=self.config.max_tokens,
+                 temperature=self.config.temperature,
+                 api_key=self.config.api_key,
+                 api_base=self.config.base_url
+             )
+
+             # Parse response
+             try:
+                 extracted_data = json.loads(response.choices[0].message.content)
+                 self.buffer.remove_request(url)  # Remove from buffer if successful
+                 return extracted_data
+             except json.JSONDecodeError as e:
+                 raise ValueError(f"Invalid JSON response from LLM: {e}")
+
+         except Exception as e:
+             logger.error(f"Error processing {url}: {str(e)}")
+             self.buffer.add_failed_request(url, content, str(e))
+             return None
+
+     async def _save_result(self, url: str, result: Dict) -> None:
+         """Save individual or combined results."""
+         try:
+             if self.config.combine_output and self.config.output_file:
+                 self.results[url] = result
+                 async with aiofiles.open(self.config.output_file, 'w') as f:
+                     await f.write(json.dumps(self.results, indent=2))
+             elif not self.config.combine_output and self.config.output_file:
+                 individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                 async with aiofiles.open(individual_file, 'w') as f:
+                     await f.write(json.dumps(result, indent=2))
+         except Exception as e:
+             logger.error(f"Error saving results for {url}: {str(e)}")
+
+     async def process_content(self, url: str, content: str) -> Optional[Dict]:
+         """Process content with retry mechanism."""
+         for attempt in range(self.config.max_retries):
+             result = await self._process_single_content(url, content)
+             if result:
+                 # Apply custom transformation if provided
+                 if self.config.custom_transform_function:
+                     try:
+                         result = self.config.custom_transform_function(result)
+                     except Exception as e:
+                         logger.error(f"Error in custom transform for {url}: {str(e)}")
+
+                 await self._save_result(url, result)
+                 return result
+
+             # Wait before retry
+             if attempt < self.config.max_retries - 1:
+                 await asyncio.sleep(self.config.retry_delay)
+
+         return None
+
+     async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
+         """Process multiple pieces of content with rate limiting."""
+         results = {}
+         for url, content in content_map.items():
+             results[url] = await self.process_content(url, content)
+             await asyncio.sleep(self.config.request_delay)
+         return results
+
+     def retry_failed_requests(self) -> Dict[str, Optional[Dict]]:
+         """Retry all failed requests from the buffer."""
+         failed_requests = self.buffer.get_retryable_requests(self.config.max_retries)
+         return asyncio.run(self.process_bulk_content(
+             {url: data['content'] for url, data in failed_requests.items()}
+         ))
+
+     async def get_processing_stats(self) -> Dict[str, Any]:
+         """Get detailed processing statistics."""
+         return {
+             "total_processed": len(self.results),
+             "failed_requests": len(self.buffer.get_failed_requests()),
+             "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
+             "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
+         }
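
For orientation, here is a minimal usage sketch of the post-extraction API added above. It is not taken from the package documentation: the model id, prompt, URL, API key, and file paths are placeholders, and the import path simply follows the module location recorded below. Note that the agent parses the LLM reply with json.loads, so the prompt must ask for a JSON object.

# usage_sketch.py -- illustrative only; model, prompt, key, and paths are placeholders
import asyncio
from pathlib import Path
from spiderforce4ai.post_extraction_agent import PostExtractionAgent, PostExtractionConfig

config = PostExtractionConfig(
    model="openai/gpt-4o-mini",   # any litellm-compatible model id (assumed)
    messages=[
        {"role": "system", "content": "Return a JSON object with 'title' and 'summary'."},
        {"role": "user", "content": "{here_markdown_content}"},  # replaced with the page markdown
    ],
    api_key="sk-...",             # provider API key (placeholder)
    combine_output=True,
    output_file=Path("output/extracted.json"),
)

agent = PostExtractionAgent(config)
result = asyncio.run(agent.process_content("https://example.com/page", "# Example Page\n\nSome markdown content."))
print(result)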
{spiderforce4ai-2.1.dist-info → spiderforce4ai-2.4.dist-info}/METADATA
@@ -1,16 +1,24 @@
  Metadata-Version: 2.2
  Name: spiderforce4ai
- Version: 2.1
- Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+ Version: 2.4
+ Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
  Home-page: https://petertam.pro
  Author: Piotr Tamulewicz
  Author-email: Piotr Tamulewicz <pt@petertam.pro>
- License: MIT
+ Project-URL: Homepage, https://petertam.pro
+ Project-URL: Documentation, https://petertam.pro/docs/spiderforce4ai
+ Project-URL: Repository, https://github.com/yourusername/spiderforce4ai
+ Project-URL: Bug Tracker, https://github.com/yourusername/spiderforce4ai/issues
+ Keywords: web-scraping,markdown,html-to-markdown,llm,ai,content-extraction,async,parallel-processing
  Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.11
  Description-Content-Type: text/markdown
  Requires-Dist: aiohttp>=3.8.0
@@ -18,6 +26,36 @@ Requires-Dist: asyncio>=3.4.3
  Requires-Dist: rich>=10.0.0
  Requires-Dist: aiofiles>=0.8.0
  Requires-Dist: httpx>=0.24.0
+ Requires-Dist: litellm>=1.26.0
+ Requires-Dist: pydantic>=2.6.0
+ Requires-Dist: requests>=2.31.0
+ Requires-Dist: aiofiles>=23.2.1
+ Requires-Dist: et-xmlfile>=1.1.0
+ Requires-Dist: multidict>=6.0.4
+ Requires-Dist: openai>=1.12.0
+ Requires-Dist: pandas>=2.2.0
+ Requires-Dist: numpy>=1.26.0
+ Requires-Dist: yarl>=1.9.4
+ Requires-Dist: typing_extensions>=4.9.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
+ Requires-Dist: pytest-asyncio>=0.21.1; extra == "dev"
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
+ Requires-Dist: black>=23.7.0; extra == "dev"
+ Requires-Dist: isort>=5.12.0; extra == "dev"
+ Requires-Dist: mypy>=1.4.1; extra == "dev"
+ Requires-Dist: ruff>=0.1.8; extra == "dev"
+ Requires-Dist: pre-commit>=3.5.0; extra == "dev"
+ Provides-Extra: test
+ Requires-Dist: pytest>=7.4.0; extra == "test"
+ Requires-Dist: pytest-asyncio>=0.21.1; extra == "test"
+ Requires-Dist: pytest-cov>=4.1.0; extra == "test"
+ Requires-Dist: pytest-mock>=3.12.0; extra == "test"
+ Requires-Dist: coverage>=7.4.0; extra == "test"
+ Provides-Extra: docs
+ Requires-Dist: sphinx>=7.1.0; extra == "docs"
+ Requires-Dist: sphinx-rtd-theme>=1.3.0; extra == "docs"
+ Requires-Dist: myst-parser>=2.0.0; extra == "docs"
  Dynamic: author
  Dynamic: home-page
  Dynamic: requires-python
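
The dev, test, and docs groups declared above can be pulled in with pip's standard extras syntax, e.g. `pip install spiderforce4ai[dev]`; the exact toolset installed is whatever the Requires-Dist lines above resolve to.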
spiderforce4ai-2.4.dist-info/RECORD (new file)
@@ -0,0 +1,7 @@
+ spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
+ spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
+ spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
+ spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+ spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+ spiderforce4ai-2.4.dist-info/RECORD,,
spiderforce4ai-2.4.dist-info/entry_points.txt (new file)
@@ -0,0 +1,2 @@
+ [console_scripts]
+ spiderforce4ai = spiderforce4ai.cli:main
spiderforce4ai-2.1.dist-info/RECORD (removed)
@@ -1,5 +0,0 @@
- spiderforce4ai/__init__.py,sha256=6WskofS5vOJuPhdwoCRvcOmWVimCKJxtkkP_pshrrlo,35805
- spiderforce4ai-2.1.dist-info/METADATA,sha256=bK_85RBFEAmDTZgo2oCPKgDNd-dqfYvRJoBl92Zk-i8,7183
- spiderforce4ai-2.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- spiderforce4ai-2.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
- spiderforce4ai-2.1.dist-info/RECORD,,