jwebs 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jwebs/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright 2026 J Code
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from .core.http import FastHTTP, HTTPResponse, RequestRecord
4
+ from .core.exceptions import (
5
+ JWebsError, HTTPError, JWebsConnectionError,
6
+ JWebsTimeoutError, RobotsBlockedError, CacheError
7
+ )
8
+ from .check import Checker, SecurityReport, SEOScore, PerformanceMetrics
9
+ from .extract import Builder
10
+ from .crawl import Crawler, DistributedCrawler
11
+ from .ai import AIScrapingEngine, GraphQLClient, GraphQLResponse
12
+ from .captcha import CaptchaSolver, CAPTCHAResult
13
+ from .proxy import ProxyRotator, ProxyConfig
14
+ from .monitor import Monitor
15
+ from .smart import SmartScraper
16
+ from .async_ import AsyncClient, AsyncResponse
17
+ from .diff import ContentDiffer
18
+ from .generate import SitemapGenerator, RSSGenerator
19
+ from .jwebs import JWebs
20
+
21
+ __version__ = "1.0.0"
22
+ __author__ = "J Code"
23
+ __license__ = "Apache-2.0"
jwebs/ai.py ADDED
@@ -0,0 +1,328 @@
1
+ # Copyright 2026 J Code
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import os
4
+ import json
5
+ import hashlib
6
+ import threading
7
+ import time
8
+ import re
9
+ from typing import Dict, List, Optional, Any
10
+
11
+ from urllib3 import PoolManager, Timeout as Urllib3Timeout, Retry
12
+
13
+ from .core.http import FastHTTP
14
+ from .core.datatypes import AIScrapingResult, GraphQLResponse
15
+ from .core.utils import _safe_parse_html
16
+ from .core.deps import _check_dep
17
+ from .core.logging import logger
18
+
19
class AIScrapingEngine:
    """Extract structured data from page text through a chat-completion API.

    Requests are POSTed to the DeepSeek or OpenAI chat-completions endpoint
    through a urllib3 ``PoolManager``.  Successful extraction results are kept
    in an in-memory cache that is bounded by ``max_cache_entries`` and, when
    ``cache_ttl_enabled`` is set, by ``cache_ttl_seconds``.  Cache access is
    guarded by ``_cache_lock``.
    """

    # provider name -> (endpoint, default model, environment variable for the key)
    _PROVIDERS = {
        'deepseek': ("https://api.deepseek.com/v1/chat/completions",
                     'deepseek-chat', 'DEEPSEEK_API_KEY'),
        'openai': ("https://api.openai.com/v1/chat/completions",
                   'gpt-4o', 'OPENAI_API_KEY'),
    }

    _SYSTEM_PROMPT = (
        "You are a precise data extraction assistant.\n"
        "Extract information exactly as requested. Always return valid JSON.\n"
        "If information is not found, use null values. Never make up data."
    )

    # Patterns used by _parse_json_safely, compiled once for the class.
    _FENCE_JSON_RE = re.compile(r'```json\s*')
    _FENCE_RE = re.compile(r'```\s*')
    _OBJECT_RE = re.compile(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', re.DOTALL)
    _ARRAY_RE = re.compile(r'\[[^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*\]', re.DOTALL)

    def __init__(self, provider: str = 'deepseek', model: Optional[str] = None,
                 api_key: Optional[str] = None, use_local: bool = False,
                 connect_timeout: float = 10.0, read_timeout: float = 60.0,
                 total_timeout: float = 120.0,
                 max_cache_entries: int = 100, cache_ttl_enabled: bool = False,
                 cache_ttl_seconds: int = 3600):
        """Configure the provider, timeouts, cache limits and HTTP pool.

        Args:
            provider: ``'deepseek'`` or ``'openai'`` (case-insensitive).
            model: Model name; defaults to the provider's default model.
            api_key: API key; falls back to ``DEEPSEEK_API_KEY`` or
                ``OPENAI_API_KEY`` from the environment.
            use_local: Stored on the instance; not read anywhere in this class.
            connect_timeout: Connect timeout in seconds.
            read_timeout: Read timeout in seconds.
            total_timeout: Overall per-request timeout in seconds.
            max_cache_entries: Maximum number of cached extraction results.
            cache_ttl_enabled: Whether cached results expire.
            cache_ttl_seconds: Lifetime of a cached result when TTL is enabled.

        Raises:
            ValueError: If ``provider`` is not a supported provider.
        """
        self.provider = provider.lower()
        if self.provider not in self._PROVIDERS:
            raise ValueError("provider must be 'deepseek' or 'openai'")

        self.base_url, default_model, key_env = self._PROVIDERS[self.provider]
        self.model = model or default_model
        self.api_key = api_key or os.environ.get(key_env, '')

        self.use_local = use_local
        self.connect_timeout = connect_timeout
        self.read_timeout = read_timeout
        self.total_timeout = total_timeout

        self.max_cache_entries = max_cache_entries
        self.cache_ttl_enabled = cache_ttl_enabled
        self.cache_ttl_seconds = cache_ttl_seconds

        self._ai_cache: Dict[str, "AIScrapingResult"] = {}
        # Insertion time (time.monotonic()) per cache key.  The result's
        # processing_time is a duration, so it cannot be used to age entries.
        self._cache_times: Dict[str, float] = {}
        self._cache_lock = threading.Lock()

        self._api_pool = self._build_pool()

        if not self.api_key:
            logger.warning('AIScrapingEngine',
                           f'No API key found for {self.provider}. Set environment variable accordingly.')

    def _request_timeout(self):
        """Return the urllib3 timeout built from the current timeout settings."""
        return Urllib3Timeout(total=self.total_timeout,
                              connect=self.connect_timeout,
                              read=self.read_timeout)

    def _build_pool(self) -> "PoolManager":
        """Create a connection pool that uses the current timeout settings."""
        return PoolManager(
            num_pools=2,
            maxsize=5,
            timeout=self._request_timeout(),
            retries=Retry(total=2, backoff_factor=0.5),
            cert_reqs='CERT_REQUIRED'
        )

    def set_timeouts(self, connect: Optional[float] = None, read: Optional[float] = None,
                     total: Optional[float] = None):
        """Update any of the timeouts and rebuild the connection pool.

        Arguments left as ``None`` keep their current value.
        """
        if connect is not None:
            self.connect_timeout = connect
        if read is not None:
            self.read_timeout = read
        if total is not None:
            self.total_timeout = total
        previous_pool = self._api_pool
        self._api_pool = self._build_pool()
        # Release the connections held by the pool that was just replaced.
        previous_pool.clear()

    def _call_llm(self, messages: List[Dict], temperature: float = 0.1,
                  max_tokens: int = 2000) -> Optional[Dict]:
        """POST a chat-completion request and return the decoded JSON body.

        Returns:
            The parsed response, or ``None`` when the request fails, the
            status is not 200, or the body cannot be decoded.  Failures are
            logged rather than raised.
        """
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }
        payload = {
            'model': self.model,
            'messages': messages,
            'temperature': temperature,
            'max_tokens': max_tokens,
            'stream': False
        }

        response = None
        try:
            response = self._api_pool.request(
                'POST', self.base_url,
                body=json.dumps(payload).encode('utf-8'),
                headers=headers,
                timeout=self._request_timeout()
            )
            if response.status != 200:
                logger.error('AIScrapingEngine', f"{self.provider} API error: {response.status}")
                return None
            return json.loads(response.data.decode('utf-8'))
        except Exception as e:
            logger.error('AIScrapingEngine', f"{self.provider} API call failed: {e}", exc_info=True)
            return None
        finally:
            if response is not None:
                response.release_conn()

    def _cache_key(self, instruction: str, text: str) -> str:
        """Return a digest identifying a (provider, model, instruction, text) request.

        The whole text is hashed, and each part is length-prefixed so that
        different splits of the same characters cannot produce the same key.
        """
        digest = hashlib.sha256()
        for part in (self.provider, self.model, instruction, text):
            encoded = str(part).encode('utf-8', 'surrogatepass')
            digest.update(f"{len(encoded)}:".encode('ascii'))
            digest.update(encoded)
        return digest.hexdigest()

    def _drop_entry(self, key: str) -> None:
        """Remove one cache entry and its timestamp; caller holds the lock."""
        self._ai_cache.pop(key, None)
        self._cache_times.pop(key, None)

    def _entry_time(self, key: str) -> float:
        """Return the insertion time of ``key``; unknown entries count as oldest."""
        return self._cache_times.get(key, float('-inf'))

    def _prune_cache_locked(self) -> None:
        """Expire and evict cache entries; caller must hold ``_cache_lock``."""
        now = time.monotonic()
        if self.cache_ttl_enabled:
            expired = [key for key in self._ai_cache
                       if now - self._entry_time(key) > self.cache_ttl_seconds]
            for key in expired:
                self._drop_entry(key)

        overflow = len(self._ai_cache) - self.max_cache_entries
        if overflow > 0:
            oldest_first = sorted(self._ai_cache, key=self._entry_time)
            for key in oldest_first[:overflow]:
                self._drop_entry(key)

    def _prune_cache(self):
        """Drop expired entries, then the oldest ones beyond ``max_cache_entries``."""
        with self._cache_lock:
            self._prune_cache_locked()

    def _cached_result(self, cache_key: str) -> Optional["AIScrapingResult"]:
        """Return a still-valid cached result, removing it if it has expired."""
        with self._cache_lock:
            cached = self._ai_cache.get(cache_key)
            if cached is None:
                return None
            if not self.cache_ttl_enabled:
                return cached
            age = time.monotonic() - self._entry_time(cache_key)
            if age <= self.cache_ttl_seconds:
                return cached
            self._drop_entry(cache_key)
            return None

    def _extract_single(self, text: str, instruction: str, start_time: float) -> "AIScrapingResult":
        """Run one extraction request for ``text`` and cache a successful result.

        Args:
            text: Plain text to extract from.
            instruction: What to extract.
            start_time: ``time.time()`` value the caller's operation started
                at; ``processing_time`` is measured from it.

        Returns:
            The extraction result.  When the API call or the response handling
            fails, a result whose ``elements`` hold a single ``{'error': ...}``
            dict is returned and nothing is cached.
        """
        cache_key = self._cache_key(instruction, text)
        cached = self._cached_result(cache_key)
        if cached is not None:
            return cached

        user_prompt = (
            "Extract the following information from the text:\n\n"
            f"TEXT:\n{text}\n\n"
            f"INSTRUCTION:\n{instruction}\n\n"
            "Return ONLY a valid JSON object. Do not include explanations or markdown."
        )
        messages = [
            {'role': 'system', 'content': self._SYSTEM_PROMPT},
            {'role': 'user', 'content': user_prompt}
        ]

        response_data = self._call_llm(messages)
        if not response_data:
            return AIScrapingResult(
                elements=[{'error': 'API call failed'}],
                model_used=self.model,
                processing_time=time.time() - start_time
            )

        try:
            result_text = response_data['choices'][0]['message']['content']
            # 'usage' may be missing or null in the response.
            tokens_used = (response_data.get('usage') or {}).get('total_tokens', 0)
            data = self._parse_json_safely(result_text)

            result = AIScrapingResult(
                elements=data if isinstance(data, list) else [data],
                confidence=0.9,
                model_used=self.model,
                processing_time=time.time() - start_time,
                tokens_used=tokens_used,
                raw_response=result_text
            )
        except Exception as e:
            logger.error('AIScrapingEngine', f"Parse error: {e}", exc_info=True)
            return AIScrapingResult(
                elements=[{'error': f'Parse error: {str(e)}'}],
                model_used=self.model,
                processing_time=time.time() - start_time
            )

        # Store and prune under a single lock acquisition: the lock is not
        # reentrant, so _prune_cache() must not be called while holding it.
        with self._cache_lock:
            self._ai_cache[cache_key] = result
            self._cache_times[cache_key] = time.monotonic()
            self._prune_cache_locked()
        return result

    def EXTRACT(self, html: str, instruction: str) -> "AIScrapingResult":
        """Strip boilerplate tags from ``html`` and extract data from its text.

        Text of up to 8000 characters is sent in one request; longer text is
        processed in overlapping chunks.
        """
        start_time = time.time()

        # NOTE(review): _safe_parse_html is a project helper; it is used here
        # as if it returns a BeautifulSoup-style tree - confirm.
        soup = _safe_parse_html(html, 'lxml')
        for tag in soup(['script', 'style', 'noscript', 'iframe', 'nav', 'footer']):
            tag.decompose()

        text = soup.get_text(separator='\n', strip=True)

        if len(text) <= 8000:
            return self._extract_single(text, instruction, start_time)
        return self._extract_chunks(text, instruction, start_time)

    def _extract_chunks(self, text: str, instruction: str, start_time: float) -> "AIScrapingResult":
        """Extract from long text in 6000-character chunks overlapping by 500.

        Every chunk after the first also receives the most recent element
        found so far as context.  Elements and token counts of all chunks are
        merged into one result.
        """
        chunk_size = 6000
        overlap = 500
        chunks = []
        for offset in range(0, len(text), chunk_size - overlap):
            piece = text[offset:offset + chunk_size]
            if len(piece) > 100:
                chunks.append(piece)
            if offset + chunk_size >= len(text):
                # This chunk already reaches the end of the text; a further
                # chunk would only repeat content covered here.
                break

        if not chunks:
            return AIScrapingResult(elements=[{'error': 'No valid text found'}])

        all_elements = []
        total_tokens = 0
        for idx, chunk in enumerate(chunks):
            chunk_instruction = instruction
            if idx > 0:
                prev_context = json.dumps(all_elements[-1] if all_elements else {})
                chunk_instruction = f"{instruction}\n\nPrevious findings: {prev_context}"
            chunk_result = self._extract_single(chunk, chunk_instruction, start_time)

            if chunk_result.elements:
                if isinstance(chunk_result.elements, list):
                    all_elements.extend(chunk_result.elements)
                else:
                    all_elements.append(chunk_result.elements)
            total_tokens += chunk_result.tokens_used or 0

        return AIScrapingResult(
            elements=all_elements,
            confidence=0.85,
            model_used=self.model,
            processing_time=time.time() - start_time,
            tokens_used=total_tokens
        )

    def _parse_json_safely(self, text: str) -> Any:
        """Parse JSON out of model output, tolerating fences and surrounding prose.

        Tries, in order: the whole text with Markdown code fences removed, the
        first ``{...}`` span, then the first ``[...]`` span (the patterns
        handle one level of nesting).  Falls back to
        ``{'raw_output': <cleaned text>}`` when nothing parses.
        """
        text = self._FENCE_JSON_RE.sub('', text)
        text = self._FENCE_RE.sub('', text)
        text = text.strip()

        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        for pattern in (self._OBJECT_RE, self._ARRAY_RE):
            candidate = pattern.search(text)
            if candidate is None:
                continue
            try:
                return json.loads(candidate.group())
            except json.JSONDecodeError:
                continue

        return {'raw_output': text}

    def SUMMARIZE(self, text: str, max_length: int = 150) -> str:
        """Summarize ``text`` with the LLM, falling back to its leading sentences.

        Text shorter than 100 characters is returned unchanged.  Only the
        first 8000 characters are sent.  When the API call fails or returns
        no usable content, the first five non-empty sentences are returned.
        """
        if len(text) < 100:
            return text

        text = text[:8000]
        messages = [
            {'role': 'system', 'content': "You are a text summarization expert."},
            {'role': 'user', 'content': f"Summarize in {max_length} chars:\n\n{text}"}
        ]

        response = self._call_llm(messages, temperature=0.3, max_tokens=max_length)
        if response:
            try:
                summary = response['choices'][0]['message']['content']
            except (KeyError, IndexError, TypeError):
                summary = None
            if isinstance(summary, str) and summary.strip():
                return summary.strip()

        # Strip each sentence so the joined fallback has no doubled spaces or
        # empty trailing sentence.
        sentences = [part.strip() for part in text.split('.')]
        leading = [part for part in sentences if part][:5]
        if not leading:
            return text
        return '. '.join(leading) + '.'

    def SCRAPE_PAGE(self, url: str, instruction: str,
                    http: Optional["FastHTTP"] = None) -> "AIScrapingResult":
        """Fetch ``url`` and run :meth:`EXTRACT` on the response text.

        A new ``FastHTTP`` client is created when ``http`` is not given.  A
        falsy response or one with ``status == 0`` yields an error result.
        """
        client = http or FastHTTP()
        resp = client.GET(url)
        if not resp or resp.status == 0:
            return AIScrapingResult(elements=[{'error': f'Failed to fetch URL: {url}'}])
        return self.EXTRACT(resp.text, instruction)

    def SET_API_KEY(self, api_key: str):
        """Replace the API key used for subsequent requests."""
        self.api_key = api_key

    def CLEAR_CACHE(self):
        """Remove every cached extraction result."""
        with self._cache_lock:
            self._ai_cache.clear()
            self._cache_times.clear()
300
+
301
+
302
class GraphQLClient:
    """Minimal GraphQL client that POSTs queries through a ``FastHTTP`` client."""

    def __init__(self, endpoint: str, headers: Optional[Dict] = None,
                 timeout: float = 30.0, http: Optional["FastHTTP"] = None):
        """Store the endpoint and request settings.

        Args:
            endpoint: URL of the GraphQL endpoint.
            headers: Headers sent with every query; defaults to a JSON
                ``Content-Type`` header.
            timeout: Timeout passed to each POST.
            http: HTTP client to use; a new ``FastHTTP`` is created if omitted.
        """
        self.endpoint = endpoint
        self.headers = headers or {'Content-Type': 'application/json'}
        self.timeout = timeout
        self.http = http or FastHTTP()

    def QUERY(self, query: str, variables: Optional[Dict] = None) -> "GraphQLResponse":
        """Send ``query`` and wrap the decoded response.

        Args:
            query: GraphQL query text.
            variables: Optional variables; omitted from the payload when empty.

        Returns:
            A response built from the ``data``, ``errors`` and ``extensions``
            keys of the reply.  When the request is not OK, or the body is not
            a non-empty JSON object, the response carries a single
            ``HTTP <status>`` error instead.
        """
        payload = {'query': query}
        if variables:
            payload['variables'] = variables

        resp = self.http.POST(
            self.endpoint, json_data=payload,
            headers=self.headers, timeout=self.timeout
        )
        if resp and resp.ok:
            data = resp.JSON()
            # Only a JSON object has the keys read below; a list or scalar
            # body is reported as an error rather than raising on .get().
            if data and isinstance(data, dict):
                return GraphQLResponse(
                    data=data.get('data'),
                    errors=data.get('errors'),
                    extensions=data.get('extensions')
                )
        return GraphQLResponse(
            errors=[{'message': f'HTTP {resp.status if resp else "error"}'}]
        )
jwebs/async_.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright 2026 J Code
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import json
4
+ import time
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from typing import Dict, List, Optional, Any
7
+
8
+ from urllib3 import PoolManager, Timeout as Urllib3Timeout
9
+
10
+ from .core.datatypes import AsyncResponse
11
+
12
+
13
class AsyncClient:
    """Thread-pool based HTTP client built on a urllib3 ``PoolManager``.

    ``GET`` and ``POST`` are blocking calls that never raise: any exception is
    converted into an ``AsyncResponse`` with ``status == 0`` whose body is the
    error text.  ``BATCH_GET`` runs several ``GET`` calls on worker threads.
    """

    def __init__(self, max_connections: int = 100, timeout: float = 30.0,
                 connect_timeout: float = 10.0, default_headers: Optional[Dict] = None):
        """Create the connection pool.

        Args:
            max_connections: Requested pool size and worker count; clamped to
                the platform limit (see :meth:`_clamp_connections`).
            timeout: Default read timeout in seconds.
            connect_timeout: Default connect timeout in seconds.
            default_headers: Headers sent with every request; a built-in set
                is used when omitted.
        """
        self.max_connections = self._clamp_connections(max_connections)
        self.timeout = timeout
        self.connect_timeout = connect_timeout
        self.default_headers = default_headers or {
            'User-Agent': 'JWebs-Async/2.0',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate'
        }
        self._pool = PoolManager(
            num_pools=self.max_connections,
            maxsize=self.max_connections,
            headers=self.default_headers,
            timeout=Urllib3Timeout(connect=connect_timeout, read=timeout)
        )

    @staticmethod
    def _clamp_connections(requested: int) -> int:
        """Clamp ``requested`` to 1..20 on Android and 1..100 elsewhere.

        NOTE(review): the previous code compared ``max_connections`` with a
        copy of itself, so the cap was always applied.  That effective
        behavior is kept; confirm whether explicit values were meant to
        bypass the cap.
        """
        import os
        import sys
        is_android = hasattr(sys, 'getandroidapilevel') or 'ANDROID_STORAGE' in os.environ
        cap = 20 if is_android else 100
        # At least one connection: the pool and the executor need a positive size.
        return max(1, min(requested, cap))

    @staticmethod
    def _encode_json(payload: Any) -> bytes:
        """Serialize ``payload`` to UTF-8 encoded JSON.

        Kept separate from :meth:`POST`, whose ``json`` parameter shadows the
        ``json`` module inside that method.
        """
        return json.dumps(payload).encode('utf-8')

    def _merge_headers(self, extra: Optional[Dict]) -> Dict:
        """Return the default headers overlaid with per-request ``extra`` headers."""
        return {**self.default_headers, **(extra or {})}

    def _wrap(self, resp, url: str, start: float) -> "AsyncResponse":
        """Convert a urllib3 response into an ``AsyncResponse`` and release it."""
        try:
            return AsyncResponse(
                status=resp.status, headers=dict(resp.headers),
                body=resp.data, url=url, elapsed=time.time() - start,
                content_type=resp.headers.get('Content-Type', '')
            )
        finally:
            resp.release_conn()

    @staticmethod
    def _failure(error: Exception, url: str, start: float) -> "AsyncResponse":
        """Build the status-0 response that reports ``error``."""
        return AsyncResponse(
            status=0, headers={}, body=str(error).encode(),
            url=url, elapsed=time.time() - start, content_type='text/plain'
        )

    def GET(self, url: str, timeout: Optional[float] = None,
            connect_timeout: Optional[float] = None, **kwargs) -> "AsyncResponse":
        """Perform a GET request.

        Args:
            url: Target URL.
            timeout: Read timeout; falls back to the client default when falsy.
            connect_timeout: Connect timeout; falls back to the client default
                when falsy.
            **kwargs: ``headers`` is merged over the default headers; the rest
                is forwarded to ``PoolManager.request``.

        Returns:
            The response, or a status-0 response describing the failure.
        """
        start = time.time()
        try:
            headers = self._merge_headers(kwargs.pop('headers', None))
            resp = self._pool.request(
                'GET', url, headers=headers,
                timeout=Urllib3Timeout(connect=connect_timeout or self.connect_timeout,
                                       read=timeout or self.timeout),
                **kwargs
            )
            return self._wrap(resp, url, start)
        except Exception as e:
            return self._failure(e, url, start)

    def POST(self, url: str, json: Optional[Dict] = None,
             data: Optional[Any] = None, timeout: Optional[float] = None,
             **kwargs) -> "AsyncResponse":
        """Perform a POST request with a JSON or raw body.

        Args:
            url: Target URL.
            json: Object to send as a JSON body; when given it takes
                precedence over ``data`` and sets the JSON ``Content-Type``.
            data: Raw request body used when ``json`` is ``None``.
            timeout: Read timeout; falls back to the client default when falsy.
            **kwargs: ``headers`` is merged over the default headers; the rest
                is forwarded to ``PoolManager.request`` as in :meth:`GET`.

        Returns:
            The response, or a status-0 response describing the failure.
        """
        start = time.time()
        try:
            headers = self._merge_headers(kwargs.pop('headers', None))
            body = data
            if json is not None:
                # `json` here is the payload argument, not the module.
                body = self._encode_json(json)
                headers['Content-Type'] = 'application/json'
            resp = self._pool.request(
                'POST', url, headers=headers, body=body,
                timeout=Urllib3Timeout(connect=self.connect_timeout,
                                       read=timeout or self.timeout),
                **kwargs
            )
            return self._wrap(resp, url, start)
        except Exception as e:
            return self._failure(e, url, start)

    def BATCH_GET(self, urls: List[str], **kwargs) -> Dict[str, "AsyncResponse"]:
        """GET several URLs concurrently and map each URL to its response.

        Duplicate URLs are requested once.  ``kwargs`` are passed to every
        :meth:`GET` call.
        """
        unique_urls = list(dict.fromkeys(urls))
        results: Dict[str, "AsyncResponse"] = {}
        if not unique_urls:
            return results

        with ThreadPoolExecutor(max_workers=self.max_connections) as executor:
            pending = {executor.submit(self.GET, target, **kwargs): target
                       for target in unique_urls}
            for future in as_completed(pending):
                results[pending[future]] = future.result()
        return results

    def CLOSE(self):
        """Clear the connection pool."""
        self._pool.clear()
jwebs/captcha.py ADDED
@@ -0,0 +1,99 @@
1
+ # Copyright 2026 J Code
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import os
4
+ import json
5
+ import time
6
+ import threading
7
+ from typing import Optional, List
8
+
9
+ from urllib3 import PoolManager, Timeout as Urllib3Timeout, Retry
10
+
11
+ from .core.datatypes import CAPTCHAResult
12
+ from .core.logging import logger
13
+
14
class CaptchaSolver:
    """Detect CAPTCHA markers in HTML and solve reCAPTCHA through 2captcha.

    NOTE(review): ``service`` is only used to label results; the endpoints
    are always the 2captcha ones.  ``solve_history`` and ``_lock`` are created
    here but not used by any method of this class - confirm whether callers
    populate the history.
    """

    _SUBMIT_URL = 'https://2captcha.com/in.php'
    _RESULT_URL = 'https://2captcha.com/res.php'
    # Seconds to wait between result polls.
    _POLL_INTERVAL = 5.0
    # Value of "request" that 2captcha returns while the solution is pending.
    _PENDING = 'CAPCHA_NOT_READY'

    def __init__(self, api_key: Optional[str] = None, service: str = '2captcha',
                 connect_timeout: float = 10.0, read_timeout: float = 30.0,
                 solve_timeout: float = 180.0):
        """Configure credentials, timeouts and the HTTP pool.

        Args:
            api_key: Solver API key; falls back to ``CAPTCHA_API_KEY`` from
                the environment.
            service: Provider label recorded on results.
            connect_timeout: Connect timeout in seconds for each HTTP call.
            read_timeout: Read timeout in seconds for each HTTP call.
            solve_timeout: Maximum time in seconds to wait for a solution.
        """
        self.api_key = api_key or os.environ.get('CAPTCHA_API_KEY', '')
        self.service = service
        self.connect_timeout = connect_timeout
        self.read_timeout = read_timeout
        self.solve_timeout = solve_timeout
        self.solve_history: List["CAPTCHAResult"] = []
        self._lock = threading.Lock()

        self._pool = self._build_pool()

    def _build_pool(self) -> "PoolManager":
        """Create a connection pool that uses the current timeout settings."""
        return PoolManager(
            num_pools=2, maxsize=5,
            timeout=Urllib3Timeout(connect=self.connect_timeout, read=self.read_timeout),
            retries=Retry(total=3, backoff_factor=1.0),
            cert_reqs='CERT_REQUIRED'
        )

    def set_timeouts(self, connect: Optional[float] = None, read: Optional[float] = None,
                     solve: Optional[float] = None):
        """Update any of the timeouts and rebuild the connection pool.

        Arguments left as ``None`` keep their current value.
        """
        if connect is not None:
            self.connect_timeout = connect
        if read is not None:
            self.read_timeout = read
        if solve is not None:
            self.solve_timeout = solve
        previous_pool = self._pool
        self._pool = self._build_pool()
        # Release the connections held by the pool that was just replaced.
        previous_pool.clear()

    def DETECT(self, html: str) -> Optional[str]:
        """Return the CAPTCHA kind suggested by markers in ``html``.

        Returns:
            ``'recaptcha_v2'``, ``'hcaptcha'`` or ``'image_captcha'``, checked
            in that order, or ``None`` when no marker is found or ``html`` is
            empty.
        """
        if not html:
            return None
        html_lower = html.lower()
        # 'recaptcha' also matches 'g-recaptcha'.
        if 'recaptcha' in html_lower:
            return 'recaptcha_v2'
        if 'h-captcha' in html_lower or 'hcaptcha' in html_lower:
            return 'hcaptcha'
        if 'captcha' in html_lower:
            return 'image_captcha'
        return None

    def _api_call(self, method: str, url: str, fields: dict) -> Any:
        """Send one request to the solver API and return the decoded JSON body."""
        resp = self._pool.request(
            method, url, fields=fields,
            timeout=Urllib3Timeout(connect=self.connect_timeout, read=self.read_timeout)
        )
        try:
            return json.loads(resp.data.decode('utf-8'))
        finally:
            resp.release_conn()

    def SOLVE(self, site_key: str, page_url: str) -> "CAPTCHAResult":
        """Submit a reCAPTCHA task and poll until it is solved or times out.

        Args:
            site_key: The site's reCAPTCHA key (sent as ``googlekey``).
            page_url: URL of the page showing the CAPTCHA.

        Returns:
            A result with ``solved=True`` and the solution token on success.
            Otherwise ``solved=False``: when no API key is configured
            (``provider='none'``), the task is rejected, the service reports
            an error, ``solve_timeout`` elapses, or a request fails.  Errors
            are logged, not raised.
        """
        start_time = time.monotonic()
        if not self.api_key:
            return CAPTCHAResult(solved=False, provider='none',
                                 time_taken=time.monotonic() - start_time)

        attempts = 0
        try:
            submitted = self._api_call('POST', self._SUBMIT_URL, {
                'key': self.api_key, 'method': 'userrecaptcha',
                'googlekey': site_key, 'pageurl': page_url, 'json': 1
            })
            if submitted.get('status') != 1:
                return CAPTCHAResult(solved=False, provider=self.service,
                                     time_taken=time.monotonic() - start_time)

            captcha_id = submitted['request']
            deadline = time.monotonic() + self.solve_timeout

            # Poll until the deadline; solve_timeout alone bounds the wait.
            while True:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                # Never sleep past the deadline.
                time.sleep(min(self._POLL_INTERVAL, remaining))
                attempts += 1
                polled = self._api_call('GET', self._RESULT_URL, {
                    'key': self.api_key, 'action': 'get', 'id': captcha_id, 'json': 1
                })
                if polled.get('status') == 1:
                    return CAPTCHAResult(
                        solved=True, solution=polled['request'],
                        provider=self.service,
                        time_taken=time.monotonic() - start_time,
                        attempts=attempts
                    )
                if polled.get('request') != self._PENDING:
                    # Anything other than "not ready" is a final error code;
                    # further polling cannot succeed.
                    logger.error('CaptchaSolver', f"Error: {polled.get('request')}")
                    break
        except Exception as e:
            logger.error('CaptchaSolver', f"Error: {e}", exc_info=True)

        return CAPTCHAResult(solved=False, provider=self.service,
                             time_taken=time.monotonic() - start_time,
                             attempts=attempts)