webscout 2025.10.15__py3-none-any.whl → 2025.10.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/Extra/YTToolkit/README.md +1 -1
- webscout/Extra/tempmail/README.md +3 -3
- webscout/Provider/ClaudeOnline.py +350 -0
- webscout/Provider/OPENAI/README.md +1 -1
- webscout/Provider/TTI/bing.py +4 -4
- webscout/Provider/TTI/claudeonline.py +315 -0
- webscout/__init__.py +1 -1
- webscout/client.py +4 -5
- webscout/litprinter/__init__.py +0 -42
- webscout/scout/README.md +59 -8
- webscout/scout/core/scout.py +62 -0
- webscout/scout/element.py +251 -45
- webscout/search/__init__.py +3 -4
- webscout/search/engines/bing/images.py +5 -2
- webscout/search/engines/bing/news.py +6 -4
- webscout/search/engines/bing/text.py +5 -2
- webscout/search/engines/yahoo/__init__.py +41 -0
- webscout/search/engines/yahoo/answers.py +16 -0
- webscout/search/engines/yahoo/base.py +34 -0
- webscout/search/engines/yahoo/images.py +324 -0
- webscout/search/engines/yahoo/maps.py +16 -0
- webscout/search/engines/yahoo/news.py +258 -0
- webscout/search/engines/yahoo/suggestions.py +140 -0
- webscout/search/engines/yahoo/text.py +273 -0
- webscout/search/engines/yahoo/translate.py +16 -0
- webscout/search/engines/yahoo/videos.py +302 -0
- webscout/search/engines/yahoo/weather.py +220 -0
- webscout/search/http_client.py +1 -1
- webscout/search/yahoo_main.py +54 -0
- webscout/{auth → server}/__init__.py +2 -23
- webscout/server/config.py +84 -0
- webscout/{auth → server}/request_processing.py +3 -28
- webscout/{auth → server}/routes.py +6 -148
- webscout/server/schemas.py +23 -0
- webscout/{auth → server}/server.py +11 -43
- webscout/server/simple_logger.py +84 -0
- webscout/version.py +1 -1
- webscout/version.py.bak +1 -1
- webscout/zeroart/README.md +17 -9
- webscout/zeroart/__init__.py +78 -6
- webscout/zeroart/effects.py +51 -1
- webscout/zeroart/fonts.py +559 -1
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/METADATA +11 -54
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/RECORD +51 -46
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/entry_points.txt +1 -1
- webscout/Extra/weather.md +0 -281
- webscout/auth/api_key_manager.py +0 -189
- webscout/auth/auth_system.py +0 -85
- webscout/auth/config.py +0 -175
- webscout/auth/database.py +0 -755
- webscout/auth/middleware.py +0 -248
- webscout/auth/models.py +0 -185
- webscout/auth/rate_limiter.py +0 -254
- webscout/auth/schemas.py +0 -103
- webscout/auth/simple_logger.py +0 -236
- webscout/search/engines/yahoo.py +0 -65
- webscout/search/engines/yahoo_news.py +0 -64
- /webscout/{auth → server}/exceptions.py +0 -0
- /webscout/{auth → server}/providers.py +0 -0
- /webscout/{auth → server}/request_models.py +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/WHEEL +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import tempfile
|
|
5
|
+
import time
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
from requests.exceptions import RequestException
|
|
11
|
+
|
|
12
|
+
from webscout.litagent import LitAgent
|
|
13
|
+
from webscout.Provider.TTI.base import BaseImages, TTICompatibleProvider
|
|
14
|
+
from webscout.Provider.TTI.utils import ImageData, ImageResponse
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
from PIL import Image
|
|
18
|
+
except ImportError:
|
|
19
|
+
Image = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Images(BaseImages):
    """Image-generation endpoint that fetches pictures from the Pollinations.ai prompt API.

    The owning client is stored as ``self._client``. When that client exposes
    ``api_endpoint`` and ``session`` attributes they are used for the download
    request; otherwise a built-in endpoint and the plain ``requests`` module are used.
    """

    # Fallbacks used when the owning client does not provide its own values.
    _DEFAULT_ENDPOINT = "https://image.pollinations.ai/prompt"
    _DEFAULT_SIZE = (1024, 1024)

    def __init__(self, client):
        self._client = client

    def create(
        self,
        *,
        model: str,
        prompt: str,
        n: int = 1,
        size: str = "1024x1024",
        response_format: str = "url",
        user: Optional[str] = None,
        style: str = "none",
        aspect_ratio: str = "1:1",
        timeout: int = 60,
        image_format: str = "png",
        seed: Optional[int] = None,
        **kwargs,
    ) -> ImageResponse:
        """
        Generate one image for ``prompt`` and return it as a URL or base64 payload.

        Args:
            model: Accepted for interface compatibility; not used by this implementation.
            prompt: The image generation prompt. Leading command words such as
                ``/imagine`` are stripped before the request is made.
            n: Number of images to generate (values above 1 are rejected).
            size: Image size as ``"WIDTHxHEIGHT"``; unparsable values fall back to 1024x1024.
            response_format: ``"url"`` (image is uploaded to a hosting service) or ``"b64_json"``.
            user: Accepted for interface compatibility; not used by this implementation.
            style: Accepted for interface compatibility; not used by this implementation.
            aspect_ratio: Accepted for interface compatibility; not used by this implementation.
            timeout: Download request timeout in seconds.
            image_format: Output format, ``"png"`` or ``"jpeg"`` (``"jpg"`` is treated as JPEG).
            seed: Seed sent to the API; defaults to the current time in milliseconds.
            **kwargs: Additional parameters (ignored).

        Returns:
            ImageResponse with a single generated image entry.

        Raises:
            ImportError: If Pillow is not installed.
            ValueError: If ``n`` is greater than 1.
            RuntimeError: If the download, conversion, or upload fails, or if
                ``response_format`` / ``prompt`` is invalid.
        """
        if Image is None:
            raise ImportError("Pillow (PIL) is required for image format conversion.")

        # Claude Online only supports 1 image per request
        if n > 1:
            raise ValueError("Claude Online only supports generating 1 image per request")

        width, height = self._parse_size(size)

        try:
            # Validate before any network traffic. Raised inside the ``try`` so that
            # callers keep receiving RuntimeError, exactly as before.
            if response_format not in ("url", "b64_json"):
                raise ValueError("response_format must be 'url' or 'b64_json'")

            clean_prompt = self._clean_prompt(prompt)
            if not clean_prompt:
                raise ValueError("prompt is empty after removing command prefixes")

            # "jpg" is the common spelling of JPEG; everything else is saved as PNG.
            output_format = "jpeg" if image_format.lower() in ("jpeg", "jpg") else "png"

            # Use the timestamp as seed for uniqueness when none is supplied.
            seed_value = seed if seed is not None else int(time.time() * 1000)

            # The prompt is a URL path segment, so every reserved character
            # (including "/", "?", "#" and "&") must be percent-encoded.
            from urllib.parse import quote

            endpoint = getattr(self._client, "api_endpoint", None) or self._DEFAULT_ENDPOINT
            image_url = f"{endpoint.rstrip('/')}/{quote(clean_prompt, safe='')}"
            params = {
                "width": width,
                "height": height,
                "nologo": "true",
                "seed": seed_value,
            }

            # Prefer the client's configured session (headers, user agent) when present.
            http = getattr(self._client, "session", None)
            if http is None:
                http = requests
            response = http.get(image_url, params=params, timeout=timeout)
            response.raise_for_status()

            # Re-encode the downloaded image into the requested output format.
            with Image.open(BytesIO(response.content)) as im:
                out_io = BytesIO()
                if output_format == "jpeg":
                    # JPEG has no alpha channel, so convert to RGB first.
                    im.convert("RGB").save(out_io, format="JPEG")
                else:
                    im.save(out_io, format="PNG")
                processed_img_bytes = out_io.getvalue()

            if response_format == "url":
                uploaded_url = self._upload_image(processed_img_bytes, output_format)
                if not uploaded_url:
                    raise RuntimeError("Failed to upload generated image")
                result_data = [ImageData(url=uploaded_url)]
            else:
                b64 = base64.b64encode(processed_img_bytes).decode("utf-8")
                result_data = [ImageData(b64_json=b64)]

            return ImageResponse(created=int(time.time()), data=result_data)

        except RequestException as e:
            raise RuntimeError(f"Failed to generate image with Claude Online: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Unexpected error during image generation: {e}") from e

    def _parse_size(self, size: str) -> tuple[int, int]:
        """Parse a ``"WIDTHxHEIGHT"`` string into ``(width, height)``.

        Returns 1024x1024 when ``size`` is not a string, is malformed, or
        contains a non-positive dimension.
        """
        try:
            # Normalisation happens inside the ``try`` so that non-string input
            # (AttributeError) also falls back to the default.
            width_text, height_text = size.lower().strip().split("x")
            width, height = int(width_text), int(height_text)
        except (ValueError, AttributeError):
            return self._DEFAULT_SIZE

        if width <= 0 or height <= 0:
            return self._DEFAULT_SIZE
        return width, height

    def _clean_prompt(self, prompt: str) -> str:
        """Remove leading image-generation command words from ``prompt``.

        Patterns are applied in order, case-insensitively, and only match a
        complete command word (``/image`` does not match ``/images``).
        """
        # ``\b`` keeps a command from matching the start of a longer word.
        prefixes_to_remove = [
            r'^/imagine\b\s*',
            r'^/image\b\s*',
            r'^/picture\b\s*',
            r'^/draw\b\s*',
            r'^/create\b\s*',
            r'^/generate\b\s*',
            r'^создай изображение\b\s*',
            r'^нарисуй\b\s*',
            r'^сгенерируй картинку\b\s*',
        ]

        import re

        # Strip first so leading whitespace cannot defeat the ``^`` anchor.
        clean_prompt = prompt.strip()
        for prefix in prefixes_to_remove:
            clean_prompt = re.sub(prefix, '', clean_prompt, flags=re.IGNORECASE)

        return clean_prompt.strip()

    @staticmethod
    def _upload_to_catbox(img_bytes: bytes, ext: str, mime: str) -> Optional[str]:
        """Upload to catbox.moe; return the hosted URL, or None on any failure."""
        try:
            resp = requests.post(
                "https://catbox.moe/user/api.php",
                files={"fileToUpload": (f"image.{ext}", img_bytes, mime)},
                data={"reqtype": "fileupload", "json": "true"},
                headers={"User-Agent": LitAgent().random()},
                timeout=30,
            )
            text = resp.text.strip()
            if resp.status_code != 200 or not text:
                return None
            if text.startswith("http"):
                return text
            # Otherwise the body may be a JSON document carrying the URL.
            result = resp.json()
            if "url" in result:
                return result["url"]
        except Exception:
            # Deliberate best effort: any failure just means "try again / try the fallback".
            return None
        return None

    @staticmethod
    def _upload_to_0x0(img_bytes: bytes, ext: str, mime: str) -> Optional[str]:
        """Upload to 0x0.st as fallback; return the hosted URL, or None on any failure."""
        try:
            response = requests.post(
                "https://0x0.st",
                files={"file": (f"image.{ext}", img_bytes, mime)},
                timeout=30,
            )
            response.raise_for_status()
            image_url = response.text.strip()
            if image_url.startswith("http"):
                return image_url
        except Exception:
            # Deliberate best effort, see _upload_to_catbox.
            return None
        return None

    def _upload_image(self, img_bytes: bytes, image_format: str, max_retries: int = 3) -> Optional[str]:
        """Upload image to a hosting service and return its URL.

        catbox.moe is tried first, then 0x0.st, each up to ``max_retries``
        times with a growing pause between attempts on the same service.

        Args:
            img_bytes: Encoded image data to upload.
            image_format: ``"jpeg"``/``"jpg"`` for JPEG; anything else is sent as PNG.
            max_retries: Attempts per hosting service.

        Returns:
            The hosted URL, or None if every attempt failed.
        """
        is_jpeg = image_format.lower() in ("jpeg", "jpg")
        ext = "jpg" if is_jpeg else "png"
        # The registered MIME type for JPEG is image/jpeg, not image/jpg.
        mime = "image/jpeg" if is_jpeg else "image/png"

        for uploader in (self._upload_to_catbox, self._upload_to_0x0):
            for attempt in range(max_retries):
                uploaded_url = uploader(img_bytes, ext, mime)
                if uploaded_url:
                    return uploaded_url
                # Back off only when another attempt on the same service follows.
                if attempt < max_retries - 1:
                    time.sleep(1 * (attempt + 1))

        return None
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class ClaudeOnlineTTI(TTICompatibleProvider):
    """
    Text-to-image provider published under the Claude Online name.

    Each instance owns a ``requests`` session whose default headers carry a
    LitAgent-generated user agent, records the Pollinations.ai prompt endpoint
    in ``api_endpoint``, and exposes image generation through ``self.images``.
    """

    AVAILABLE_MODELS = ["claude-imagine"]

    def __init__(self):
        http_session = requests.Session()
        agent_string = LitAgent().random()
        default_headers = {
            "accept": "image/*",
            "accept-language": "en-US,en;q=0.9",
            "user-agent": agent_string,
        }
        # Install the headers on the session before publishing it on the instance.
        http_session.headers.update(default_headers)

        self.api_endpoint = "https://image.pollinations.ai/prompt"
        self.session = http_session
        self.user_agent = agent_string
        self.headers = default_headers
        self.images = Images(self)

    @property
    def models(self):
        """Return a small object whose ``list()`` yields the class's AVAILABLE_MODELS."""
        provider = self

        class _ModelList:
            def list(_models):
                # Looked up at call time so subclass overrides are honoured.
                return type(provider).AVAILABLE_MODELS

        return _ModelList()
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
if __name__ == "__main__":
    import traceback

    from rich import print

    # Manual smoke test for the Claude Online TTI provider: request one image
    # as a hosted URL and print either the response or the failure with traceback.
    provider = ClaudeOnlineTTI()
    request_options = {
        "model": "claude-imagine",
        "prompt": "a beautiful sunset over mountains with vibrant colors",
        "response_format": "url",
        "timeout": 60,
    }

    try:
        result = provider.images.create(**request_options)
        print("✅ Image generation successful!")
        print(result)
    except Exception as exc:
        print(f"❌ Image generation failed: {exc}")
        traceback.print_exc()
|
webscout/__init__.py
CHANGED
webscout/client.py
CHANGED
|
@@ -34,20 +34,19 @@ from webscout.Provider.OPENAI import *
|
|
|
34
34
|
def run_api(*args, **kwargs):
    """Run the Webscout API server, forwarding all arguments to the backend.

    The backend is imported lazily, when the function is called.

    Raises:
        ImportError: If ``webscout.server.server`` cannot be imported.
    """
    # The import lives inside the function, so the ImportError has to be
    # handled here; a try/except around the ``def`` statement can never fire.
    try:
        from webscout.server.server import run_api as _run_api
    except ImportError as exc:
        raise ImportError(
            "webscout.server.server.run_api is not available in this environment."
        ) from exc
    return _run_api(*args, **kwargs)


def start_server(**kwargs):
    """Start the Webscout OpenAI-compatible API server (FastAPI backend).

    Raises:
        ImportError: If ``webscout.server.server`` cannot be imported.
    """
    try:
        from webscout.server.server import run_api as _run_api
    except ImportError as exc:
        raise ImportError(
            "webscout.server.server.start_server is not available in this environment."
        ) from exc
    return _run_api(**kwargs)
|
|
51
50
|
|
|
52
51
|
# ---
|
|
53
52
|
# API Documentation
|
webscout/litprinter/__init__.py
CHANGED
|
@@ -1,45 +1,3 @@
|
|
|
1
|
-
"""
|
|
2
|
-
>>> from litprinter import litprint
|
|
3
|
-
>>> from litprinter import lit
|
|
4
|
-
>>> from litprinter import install, uninstall
|
|
5
|
-
>>>
|
|
6
|
-
>>> litprint("Hello, world!")
|
|
7
|
-
LIT -> [__main__.py:1] in () >>> Hello, world!
|
|
8
|
-
>>>
|
|
9
|
-
>>> def my_function():
|
|
10
|
-
... lit(1, 2, 3)
|
|
11
|
-
>>> my_function()
|
|
12
|
-
LIT -> [__main__.py:4] in my_function() >>> 1, 2, 3
|
|
13
|
-
>>> install()
|
|
14
|
-
>>> ic("This is now the builtins.ic()")
|
|
15
|
-
LIT -> [__main__.py:7] in () >>> This is now the builtins.ic()
|
|
16
|
-
>>> uninstall()
|
|
17
|
-
|
|
18
|
-
This module provides enhanced print and logging functionalities for Python,
|
|
19
|
-
allowing developers to debug their code with style and precision. It
|
|
20
|
-
includes the litprint and lit functions for debugging, log for logging, and
|
|
21
|
-
install/uninstall functions for integration into the builtins module.
|
|
22
|
-
It also handles colorizing output and provides different styles and customizable
|
|
23
|
-
options.
|
|
24
|
-
|
|
25
|
-
LITPRINTER is inspired by the icecream package and provides similar functionality
|
|
26
|
-
with additional features:
|
|
27
|
-
- Variable inspection with expression display
|
|
28
|
-
- Return value handling for inline usage
|
|
29
|
-
- Support for custom formatters for specific data types
|
|
30
|
-
- Execution context tracking
|
|
31
|
-
- Rich-like colorized output with multiple themes (JARVIS, RICH, MODERN, NEON, CYBERPUNK)
|
|
32
|
-
- Better JSON formatting with indent=2 by default
|
|
33
|
-
- Advanced pretty printing for complex data structures with smart truncation
|
|
34
|
-
- Clickable file paths in supported terminals and editors (VSCode compatible)
|
|
35
|
-
- Enhanced visual formatting with better spacing and separators
|
|
36
|
-
- Special formatters for common types (Exception, bytes, set, frozenset, etc.)
|
|
37
|
-
- Smart object introspection for custom classes
|
|
38
|
-
- Logging capabilities with timestamp and log levels
|
|
39
|
-
"""
|
|
40
|
-
|
|
41
|
-
# Try to import from the standalone litprinter package first
|
|
42
|
-
# If it's not installed
|
|
43
1
|
try:
|
|
44
2
|
import litprinter
|
|
45
3
|
# If standalone package is found, re-export all its components
|
webscout/scout/README.md
CHANGED
|
@@ -43,7 +43,7 @@ pip install webscout
|
|
|
43
43
|
Or install the latest version from GitHub:
|
|
44
44
|
|
|
45
45
|
```bash
|
|
46
|
-
pip install git+https://github.com/
|
|
46
|
+
pip install git+https://github.com/pyscout/Webscout.git
|
|
47
47
|
```
|
|
48
48
|
|
|
49
49
|
## 🚀 Quick Start
|
|
@@ -147,10 +147,57 @@ Scout provides powerful tools for navigating and manipulating HTML/XML documents
|
|
|
147
147
|
- **Document Manipulation**: Modify, replace, or remove elements
|
|
148
148
|
- **Dynamic Building**: Easily append or insert new nodes
|
|
149
149
|
|
|
150
|
+
#### CSS Selector Support
|
|
151
|
+
|
|
152
|
+
Scout includes a comprehensive CSS selector engine that supports all common selector types:
|
|
153
|
+
|
|
150
154
|
```python
|
|
151
|
-
#
|
|
152
|
-
|
|
155
|
+
# Tag selectors
|
|
156
|
+
paragraphs = scout.select('p')
|
|
157
|
+
divs = scout.select('div')
|
|
158
|
+
|
|
159
|
+
# Class selectors
|
|
160
|
+
items = scout.select('.item') # Single class
|
|
161
|
+
cards = scout.select('div.card') # Tag + class
|
|
162
|
+
special = scout.select('.card.special') # Multiple classes
|
|
163
|
+
|
|
164
|
+
# ID selectors
|
|
165
|
+
header = scout.select_one('#header') # Single element by ID
|
|
166
|
+
menu = scout.select('nav#main-menu') # Tag + ID
|
|
167
|
+
|
|
168
|
+
# Attribute selectors
|
|
169
|
+
links = scout.select('a[href]') # Has attribute
|
|
170
|
+
external = scout.select('a[rel="nofollow"]') # Attribute value
|
|
171
|
+
images = scout.select('img[alt]') # Has alt attribute
|
|
172
|
+
|
|
173
|
+
# Descendant selectors (space)
|
|
174
|
+
nested = scout.select('div p') # Any p inside div
|
|
175
|
+
deep = scout.select('article section p') # Deeply nested
|
|
176
|
+
|
|
177
|
+
# Child selectors (>)
|
|
178
|
+
direct = scout.select('ul > li') # Direct children only
|
|
179
|
+
menu_items = scout.select('nav#menu > ul > li') # Multiple levels
|
|
180
|
+
|
|
181
|
+
# Combined selectors
|
|
182
|
+
combined = scout.select('div.container > p.text[lang="en"]')
|
|
183
|
+
links = scout.select('ol#results > li.item a[href]')
|
|
184
|
+
|
|
185
|
+
# Get first match only
|
|
186
|
+
first = scout.select_one('p.intro')
|
|
187
|
+
```
|
|
153
188
|
|
|
189
|
+
**Supported Selector Types:**
|
|
190
|
+
- **Tag**: `p`, `div`, `a`
|
|
191
|
+
- **Class**: `.class`, `div.class`, `.class1.class2`
|
|
192
|
+
- **ID**: `#id`, `div#id`
|
|
193
|
+
- **Attribute**: `[attr]`, `[attr="value"]`
|
|
194
|
+
- **Descendant**: `div p`, `article section p`
|
|
195
|
+
- **Child**: `div > p`, `ul > li`
|
|
196
|
+
- **Combined**: `p.class#id[attr="value"]`
|
|
197
|
+
|
|
198
|
+
#### Element Navigation
|
|
199
|
+
|
|
200
|
+
```python
|
|
154
201
|
# Advanced find with attribute matching
|
|
155
202
|
results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
|
|
156
203
|
|
|
@@ -340,6 +387,10 @@ cached_data = scout.cache('parsed_data')
|
|
|
340
387
|
- `__init__(markup, features='html.parser', from_encoding=None)`: Initialize with HTML content
|
|
341
388
|
- `find(name, attrs={}, recursive=True, text=None)`: Find first matching element
|
|
342
389
|
- `find_all(name, attrs={}, recursive=True, text=None, limit=None)`: Find all matching elements
|
|
390
|
+
- `find_next(name, attrs={}, text=None)`: Find next element in document order
|
|
391
|
+
- `find_all_next(name, attrs={}, text=None, limit=None)`: Find all next elements in document order
|
|
392
|
+
- `find_previous(name, attrs={}, text=None)`: Find previous element in document order
|
|
393
|
+
- `find_all_previous(name, attrs={}, text=None, limit=None)`: Find all previous elements in document order
|
|
343
394
|
- `select(selector)`: Find elements using CSS selector
|
|
344
395
|
- `get_text(separator=' ', strip=False)`: Extract text from document
|
|
345
396
|
- `analyze_text()`: Perform text analysis
|
|
@@ -358,7 +409,7 @@ cached_data = scout.cache('parsed_data')
|
|
|
358
409
|
- `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
|
|
359
410
|
- `_is_valid_url(url)`: Check if a URL is valid (internal method)
|
|
360
411
|
|
|
361
|
-
For detailed API documentation, please refer to the [documentation](https://github.com/
|
|
412
|
+
For detailed API documentation, please refer to the [documentation](https://github.com/pyscout/Webscout/wiki).
|
|
362
413
|
|
|
363
414
|
## 🔧 Dependencies
|
|
364
415
|
|
|
@@ -393,9 +444,9 @@ This project is licensed under the MIT License - see the LICENSE file for detail
|
|
|
393
444
|
<div align="center">
|
|
394
445
|
<p>Made with ❤️ by the Webscout team</p>
|
|
395
446
|
<p>
|
|
396
|
-
<a href="https://github.com/
|
|
397
|
-
<a href="https://github.com/
|
|
398
|
-
<a href="https://github.com/
|
|
399
|
-
<a href="https://github.com/
|
|
447
|
+
<a href="https://github.com/pyscout/Webscout">GitHub</a> •
|
|
448
|
+
<a href="https://github.com/pyscout/Webscout/wiki">Documentation</a> •
|
|
449
|
+
<a href="https://github.com/pyscout/Webscout/issues">Report Bug</a> •
|
|
450
|
+
<a href="https://github.com/pyscout/Webscout/issues">Request Feature</a>
|
|
400
451
|
</p>
|
|
401
452
|
</div>
|
webscout/scout/core/scout.py
CHANGED
|
@@ -454,6 +454,68 @@ class Scout:
|
|
|
454
454
|
pass
|
|
455
455
|
return siblings
|
|
456
456
|
|
|
457
|
+
def find_next(self, name=None, attrs=None, text=None, **kwargs) -> Optional[Tag]:
    """
    Find the next element in document order.

    Delegates to ``self._soup.find_next`` with the same arguments.

    Args:
        name: Tag name to search for
        attrs: Attributes to match (defaults to an empty dict)
        text: Text content to match
        **kwargs: Additional attributes

    Returns:
        Optional[Tag]: Next matching element or None
    """
    # A fresh dict per call; a ``{}`` default would be one object shared by every call.
    if attrs is None:
        attrs = {}
    return self._soup.find_next(name, attrs, text, **kwargs)
|
|
471
|
+
|
|
472
|
+
def find_all_next(self, name=None, attrs=None, text=None, limit=None, **kwargs) -> List[Tag]:
    """
    Find all next elements in document order.

    Delegates to ``self._soup.find_all_next`` with the same arguments.

    Args:
        name: Tag name to search for
        attrs: Attributes to match (defaults to an empty dict)
        text: Text content to match
        limit: Maximum number of results
        **kwargs: Additional attributes

    Returns:
        List[Tag]: List of matching elements
    """
    # A fresh dict per call; a ``{}`` default would be one object shared by every call.
    if attrs is None:
        attrs = {}
    return self._soup.find_all_next(name, attrs, text, limit, **kwargs)
|
|
487
|
+
|
|
488
|
+
def find_previous(self, name=None, attrs=None, text=None, **kwargs) -> Optional[Tag]:
    """
    Find the previous element in document order.

    Delegates to ``self._soup.find_previous`` with the same arguments.

    Args:
        name: Tag name to search for
        attrs: Attributes to match (defaults to an empty dict)
        text: Text content to match
        **kwargs: Additional attributes

    Returns:
        Optional[Tag]: Previous matching element or None
    """
    # A fresh dict per call; a ``{}`` default would be one object shared by every call.
    if attrs is None:
        attrs = {}
    return self._soup.find_previous(name, attrs, text, **kwargs)
|
|
502
|
+
|
|
503
|
+
def find_all_previous(self, name=None, attrs=None, text=None, limit=None, **kwargs) -> List[Tag]:
    """
    Find all previous elements in document order.

    Delegates to ``self._soup.find_all_previous`` with the same arguments.

    Args:
        name: Tag name to search for
        attrs: Attributes to match (defaults to an empty dict)
        text: Text content to match
        limit: Maximum number of results
        **kwargs: Additional attributes

    Returns:
        List[Tag]: List of matching elements
    """
    # A fresh dict per call; a ``{}`` default would be one object shared by every call.
    if attrs is None:
        attrs = {}
    return self._soup.find_all_previous(name, attrs, text, limit, **kwargs)
|
|
518
|
+
|
|
457
519
|
def select(self, selector: str) -> List[Tag]:
|
|
458
520
|
"""
|
|
459
521
|
Select elements using CSS selector.
|