webscout 2025.10.15__py3-none-any.whl → 2025.10.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of webscout might be problematic.
- webscout/Extra/YTToolkit/README.md +1 -1
- webscout/Extra/tempmail/README.md +3 -3
- webscout/Provider/OPENAI/README.md +1 -1
- webscout/Provider/TTI/bing.py +4 -4
- webscout/__init__.py +1 -1
- webscout/client.py +4 -5
- webscout/litprinter/__init__.py +0 -42
- webscout/scout/README.md +59 -8
- webscout/scout/core/scout.py +62 -0
- webscout/scout/element.py +251 -45
- webscout/search/__init__.py +3 -4
- webscout/search/engines/bing/images.py +5 -2
- webscout/search/engines/bing/news.py +6 -4
- webscout/search/engines/bing/text.py +5 -2
- webscout/search/engines/yahoo/__init__.py +41 -0
- webscout/search/engines/yahoo/answers.py +16 -0
- webscout/search/engines/yahoo/base.py +34 -0
- webscout/search/engines/yahoo/images.py +324 -0
- webscout/search/engines/yahoo/maps.py +16 -0
- webscout/search/engines/yahoo/news.py +258 -0
- webscout/search/engines/yahoo/suggestions.py +140 -0
- webscout/search/engines/yahoo/text.py +273 -0
- webscout/search/engines/yahoo/translate.py +16 -0
- webscout/search/engines/yahoo/videos.py +302 -0
- webscout/search/engines/yahoo/weather.py +220 -0
- webscout/search/http_client.py +1 -1
- webscout/search/yahoo_main.py +54 -0
- webscout/{auth → server}/__init__.py +2 -23
- webscout/server/config.py +84 -0
- webscout/{auth → server}/request_processing.py +3 -28
- webscout/{auth → server}/routes.py +6 -148
- webscout/server/schemas.py +23 -0
- webscout/{auth → server}/server.py +11 -43
- webscout/server/simple_logger.py +84 -0
- webscout/version.py +1 -1
- webscout/version.py.bak +1 -1
- webscout/zeroart/README.md +17 -9
- webscout/zeroart/__init__.py +78 -6
- webscout/zeroart/effects.py +51 -1
- webscout/zeroart/fonts.py +559 -1
- {webscout-2025.10.15.dist-info → webscout-2025.10.16.dist-info}/METADATA +10 -52
- {webscout-2025.10.15.dist-info → webscout-2025.10.16.dist-info}/RECORD +49 -45
- {webscout-2025.10.15.dist-info → webscout-2025.10.16.dist-info}/entry_points.txt +1 -1
- webscout/auth/api_key_manager.py +0 -189
- webscout/auth/auth_system.py +0 -85
- webscout/auth/config.py +0 -175
- webscout/auth/database.py +0 -755
- webscout/auth/middleware.py +0 -248
- webscout/auth/models.py +0 -185
- webscout/auth/rate_limiter.py +0 -254
- webscout/auth/schemas.py +0 -103
- webscout/auth/simple_logger.py +0 -236
- webscout/search/engines/yahoo.py +0 -65
- webscout/search/engines/yahoo_news.py +0 -64
- /webscout/{auth → server}/exceptions.py +0 -0
- /webscout/{auth → server}/providers.py +0 -0
- /webscout/{auth → server}/request_models.py +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.16.dist-info}/WHEEL +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.16.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.16.dist-info}/top_level.txt +0 -0
webscout/Extra/YTToolkit/README.md
CHANGED

@@ -1,5 +1,5 @@
 <div align="center">
-  <a href="https://github.com/
+  <a href="https://github.com/pyscout/Webscout">
     <img src="https://img.shields.io/badge/YTToolkit-YouTube%20Toolkit-red?style=for-the-badge&logo=youtube&logoColor=white" alt="YTToolkit Logo">
   </a>
   <h1>YTToolkit</h1>
webscout/Extra/tempmail/README.md
CHANGED

@@ -1,6 +1,6 @@
 <div align="center">
-  <a href="https://github.com/
-  <img src="https://img.shields.io/badge/WebScout-TempMail%
+  <a href="https://github.com/pyscout/Webscout">
+  <img src="https://img.shields.io/badge/WebScout-TempMail%20Toolkit-blue?style=for-the-badge&logo=maildotru&logoColor=white" alt="WebScout TempMail Toolkit">
   </a>

   <h1>📧 TempMail</h1>

@@ -484,5 +484,5 @@ Please refer to the main Webscout project's contributing guidelines if you plan
   <a href="https://buymeacoffee.com/oevortex"><img alt="Buy Me A Coffee" src="https://img.shields.io/badge/Buy%20Me%20A%20Coffee-FFDD00?style=for-the-badge&logo=buymeacoffee&logoColor=black"></a>
 </div>
 <p>📧 TempMail - Part of the Webscout Toolkit</p>
-<a href="https://github.com/
+<a href="https://github.com/pyscout/Webscout">Back to Main Webscout Project</a>
 </div>
webscout/Provider/OPENAI/README.md
CHANGED

@@ -1,5 +1,5 @@
 <div align="center">
-  <a href="https://github.com/
+  <a href="https://github.com/pyscout/Webscout">
     <img src="https://img.shields.io/badge/WebScout-OpenAI%20Compatible%20Providers-4285F4?style=for-the-badge&logo=openai&logoColor=white" alt="WebScout OpenAI Compatible Providers">
   </a>
   <br/>
webscout/Provider/TTI/bing.py
CHANGED

@@ -7,6 +7,7 @@ from webscout.Provider.TTI.utils import ImageData, ImageResponse
 from webscout.Provider.TTI.base import TTICompatibleProvider, BaseImages
 from io import BytesIO
 from webscout.litagent import LitAgent
+from webscout.scout import Scout

 try:
     from PIL import Image

@@ -81,9 +82,8 @@ class Images(BaseImages):
             time.sleep(3)
             try:
                 poll_resp = session.get(polling_url, headers=headers, timeout=timeout)
-
-
-                imgs = [img["src"].split("?")[0] for img in soup.select(".img_cont .mimg") if img.get("src")]
+                scout = Scout(poll_resp.text, features='html.parser')
+                imgs = [img["src"].split("?")[0] for img in scout.select(".img_cont .mimg") if img.attrs.get("src")]
                 if imgs:
                     img_url = imgs[0]
                     break

@@ -232,7 +232,7 @@ class BingImageAI(TTICompatibleProvider):

 if __name__ == "__main__":
     from rich import print
-    client = BingImageAI(cookie="
+    client = BingImageAI(cookie="1Fw9daLSZzVBJXgevTDuc0jHZ60l4m5IiQEwjRCFOwEkpEBDmw3b8CEAALFSwZ1QBu-rATNkfD0i0gfJmVHeFlogqIriGwxNwT9T6fVREgAQD4_qn0VnQYP681NN4K80t6o-eJXnK1MBhdjxTIaok8173LGmLkEWLqHC0k3dYnF7m2kHRhf1dxjEH3WDI56hxiSPZtnggdzrfnuFAmOgCQQ")
     response = client.images.create(
         model="gpt4o",
         prompt="A cat riding a bicycle",
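The poll-parsing change above replaces the previous BeautifulSoup-style `soup.select(...)` call with webscout's own `Scout` parser. A minimal, self-contained sketch of that extraction step, using made-up HTML and only the calls visible in the diff (`Scout(html, features=...)`, `.select(css)`, `img["src"]`, `img.attrs.get(...)`):

```python
from webscout.scout import Scout

# Made-up markup standing in for Bing's polling response.
html = """
<div class="img_cont"><img class="mimg" src="https://example.invalid/a.jpg?w=270"></div>
<div class="img_cont"><img class="mimg" src="https://example.invalid/b.jpg?w=270"></div>
"""

scout = Scout(html, features='html.parser')
# Same expression as the diff: drop query strings, skip images without a src.
imgs = [img["src"].split("?")[0]
        for img in scout.select(".img_cont .mimg")
        if img.attrs.get("src")]
print(imgs)  # expected: ['https://example.invalid/a.jpg', 'https://example.invalid/b.jpg']
```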
webscout/__init__.py
CHANGED
webscout/client.py
CHANGED

@@ -34,20 +34,19 @@ from webscout.Provider.OPENAI import *
 try:
     # Use lazy import to avoid module execution issues
     def run_api(*args, **kwargs):
-
-        from webscout.auth.server import run_api as _run_api
+        from webscout.server.server import run_api as _run_api
         return _run_api(*args, **kwargs)

     def start_server(**kwargs):
         """Start the Webscout OpenAI-compatible API server (FastAPI backend)."""
-        from webscout.
+        from webscout.server.server import run_api as _run_api
         return _run_api(**kwargs)
 except ImportError:
     # Fallback for environments where the backend is not available
     def run_api(*args, **kwargs):
-        raise ImportError("webscout.
+        raise ImportError("webscout.server.server.run_api is not available in this environment.")
     def start_server(*args, **kwargs):
-        raise ImportError("webscout.
+        raise ImportError("webscout.server.server.start_server is not available in this environment.")

 # ---
 # API Documentation
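Functionally the wrappers behave as before; only the backing module moved from `webscout.auth` to `webscout.server`. A hedged usage sketch (no arguments are passed, since the accepted keyword arguments are not shown in this hunk):

```python
from webscout.client import start_server

try:
    # As of 2025.10.16 this resolves webscout.server.server.run_api internally.
    start_server()
except ImportError as exc:
    # Raised by the fallback stub above when the FastAPI backend is unavailable.
    print(f"Webscout server backend not available: {exc}")
```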
webscout/litprinter/__init__.py
CHANGED

@@ -1,45 +1,3 @@
-"""
->>> from litprinter import litprint
->>> from litprinter import lit
->>> from litprinter import install, uninstall
->>>
->>> litprint("Hello, world!")
-LIT -> [__main__.py:1] in () >>> Hello, world!
->>>
->>> def my_function():
-... lit(1, 2, 3)
->>> my_function()
-LIT -> [__main__.py:4] in my_function() >>> 1, 2, 3
->>> install()
->>> ic("This is now the builtins.ic()")
-LIT -> [__main__.py:7] in () >>> This is now the builtins.ic()
->>> uninstall()
-
-This module provides enhanced print and logging functionalities for Python,
-allowing developers to debug their code with style and precision. It
-includes the litprint and lit functions for debugging, log for logging, and
-install/uninstall functions for integration into the builtins module.
-It also handles colorizing output and provides different styles and customizable
-options.
-
-LITPRINTER is inspired by the icecream package and provides similar functionality
-with additional features:
-- Variable inspection with expression display
-- Return value handling for inline usage
-- Support for custom formatters for specific data types
-- Execution context tracking
-- Rich-like colorized output with multiple themes (JARVIS, RICH, MODERN, NEON, CYBERPUNK)
-- Better JSON formatting with indent=2 by default
-- Advanced pretty printing for complex data structures with smart truncation
-- Clickable file paths in supported terminals and editors (VSCode compatible)
-- Enhanced visual formatting with better spacing and separators
-- Special formatters for common types (Exception, bytes, set, frozenset, etc.)
-- Smart object introspection for custom classes
-- Logging capabilities with timestamp and log levels
-"""
-
-# Try to import from the standalone litprinter package first
-# If it's not installed
 try:
     import litprinter
     # If standalone package is found, re-export all its components
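The remaining lines keep only the import logic: prefer the standalone `litprinter` package and fall back otherwise. A minimal sketch of that try/except re-export pattern, assuming a wildcard re-export (the actual re-export and fallback bodies are outside this hunk):

```python
try:
    import litprinter
    # Standalone package found: re-export its public API from this module.
    from litprinter import *  # noqa: F401,F403
except ImportError:
    # Fallback branch (bundled implementation) is not shown in this diff.
    pass
```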
webscout/scout/README.md
CHANGED

@@ -43,7 +43,7 @@ pip install webscout
 Or install the latest version from GitHub:

 ```bash
-pip install git+https://github.com/
+pip install git+https://github.com/pyscout/Webscout.git
 ```

 ## 🚀 Quick Start

@@ -147,10 +147,57 @@ Scout provides powerful tools for navigating and manipulating HTML/XML documents
 - **Document Manipulation**: Modify, replace, or remove elements
 - **Dynamic Building**: Easily append or insert new nodes

+#### CSS Selector Support
+
+Scout includes a comprehensive CSS selector engine that supports all common selector types:
+
 ```python
-#
-
+# Tag selectors
+paragraphs = scout.select('p')
+divs = scout.select('div')
+
+# Class selectors
+items = scout.select('.item')  # Single class
+cards = scout.select('div.card')  # Tag + class
+special = scout.select('.card.special')  # Multiple classes
+
+# ID selectors
+header = scout.select_one('#header')  # Single element by ID
+menu = scout.select('nav#main-menu')  # Tag + ID
+
+# Attribute selectors
+links = scout.select('a[href]')  # Has attribute
+external = scout.select('a[rel="nofollow"]')  # Attribute value
+images = scout.select('img[alt]')  # Has alt attribute
+
+# Descendant selectors (space)
+nested = scout.select('div p')  # Any p inside div
+deep = scout.select('article section p')  # Deeply nested
+
+# Child selectors (>)
+direct = scout.select('ul > li')  # Direct children only
+menu_items = scout.select('nav#menu > ul > li')  # Multiple levels
+
+# Combined selectors
+complex = scout.select('div.container > p.text[lang="en"]')
+links = scout.select('ol#results > li.item a[href]')
+
+# Get first match only
+first = scout.select_one('p.intro')
+```

+**Supported Selector Types:**
+- **Tag**: `p`, `div`, `a`
+- **Class**: `.class`, `div.class`, `.class1.class2`
+- **ID**: `#id`, `div#id`
+- **Attribute**: `[attr]`, `[attr="value"]`
+- **Descendant**: `div p`, `article section p`
+- **Child**: `div > p`, `ul > li`
+- **Combined**: `p.class#id[attr="value"]`
+
+#### Element Navigation
+
+```python
 # Advanced find with attribute matching
 results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})

@@ -340,6 +387,10 @@ cached_data = scout.cache('parsed_data')
 - `__init__(markup, features='html.parser', from_encoding=None)`: Initialize with HTML content
 - `find(name, attrs={}, recursive=True, text=None)`: Find first matching element
 - `find_all(name, attrs={}, recursive=True, text=None, limit=None)`: Find all matching elements
+- `find_next(name, attrs={}, text=None)`: Find next element in document order
+- `find_all_next(name, attrs={}, text=None, limit=None)`: Find all next elements in document order
+- `find_previous(name, attrs={}, text=None)`: Find previous element in document order
+- `find_all_previous(name, attrs={}, text=None, limit=None)`: Find all previous elements in document order
 - `select(selector)`: Find elements using CSS selector
 - `get_text(separator=' ', strip=False)`: Extract text from document
 - `analyze_text()`: Perform text analysis

@@ -358,7 +409,7 @@ cached_data = scout.cache('parsed_data')
 - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
 - `_is_valid_url(url)`: Check if a URL is valid (internal method)

-For detailed API documentation, please refer to the [documentation](https://github.com/
+For detailed API documentation, please refer to the [documentation](https://github.com/pyscout/Webscout/wiki).

 ## 🔧 Dependencies

@@ -393,9 +444,9 @@ This project is licensed under the MIT License - see the LICENSE file for detail
 <div align="center">
   <p>Made with ❤️ by the Webscout team</p>
   <p>
-    <a href="https://github.com/
-    <a href="https://github.com/
-    <a href="https://github.com/
-    <a href="https://github.com/
+    <a href="https://github.com/pyscout/Webscout">GitHub</a> •
+    <a href="https://github.com/pyscout/Webscout/wiki">Documentation</a> •
+    <a href="https://github.com/pyscout/Webscout/issues">Report Bug</a> •
+    <a href="https://github.com/pyscout/Webscout/issues">Request Feature</a>
   </p>
 </div>
webscout/scout/core/scout.py
CHANGED

@@ -454,6 +454,68 @@
                 pass
         return siblings

+    def find_next(self, name=None, attrs={}, text=None, **kwargs) -> Optional[Tag]:
+        """
+        Find the next element in document order.
+
+        Args:
+            name: Tag name to search for
+            attrs: Attributes to match
+            text: Text content to match
+            **kwargs: Additional attributes
+
+        Returns:
+            Optional[Tag]: Next matching element or None
+        """
+        return self._soup.find_next(name, attrs, text, **kwargs)
+
+    def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs) -> List[Tag]:
+        """
+        Find all next elements in document order.
+
+        Args:
+            name: Tag name to search for
+            attrs: Attributes to match
+            text: Text content to match
+            limit: Maximum number of results
+            **kwargs: Additional attributes
+
+        Returns:
+            List[Tag]: List of matching elements
+        """
+        return self._soup.find_all_next(name, attrs, text, limit, **kwargs)
+
+    def find_previous(self, name=None, attrs={}, text=None, **kwargs) -> Optional[Tag]:
+        """
+        Find the previous element in document order.
+
+        Args:
+            name: Tag name to search for
+            attrs: Attributes to match
+            text: Text content to match
+            **kwargs: Additional attributes
+
+        Returns:
+            Optional[Tag]: Previous matching element or None
+        """
+        return self._soup.find_previous(name, attrs, text, **kwargs)
+
+    def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs) -> List[Tag]:
+        """
+        Find all previous elements in document order.
+
+        Args:
+            name: Tag name to search for
+            attrs: Attributes to match
+            text: Text content to match
+            limit: Maximum number of results
+            **kwargs: Additional attributes
+
+        Returns:
+            List[Tag]: List of matching elements
+        """
+        return self._soup.find_all_previous(name, attrs, text, limit, **kwargs)
+
     def select(self, selector: str) -> List[Tag]:
         """
         Select elements using CSS selector.
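A short usage sketch for the new document-order navigation helpers; the HTML is made up for illustration, and the exact return values depend on the underlying parser object that `Scout` wraps:

```python
from webscout.scout import Scout

html = "<h1>Title</h1><p>intro</p><p>body</p><div>footer</div>"
scout = Scout(html, features='html.parser')

# New in 2025.10.16: walk the parsed document in order from the root.
first_p = scout.find_next('p')                # first <p> encountered
next_two = scout.find_all_next('p', limit=2)  # up to two <p> elements
heading = scout.find_previous('h1')           # nearest earlier <h1>, if any
```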
webscout/scout/element.py
CHANGED

@@ -267,7 +267,14 @@ class Tag:
     def select(self, selector: str) -> List['Tag']:
         """
         Select elements using CSS selector.
-        Enhanced to support more complex selectors
+        Enhanced to support more complex selectors including:
+        - Tag selectors: 'p', 'div'
+        - Class selectors: '.class', 'p.class'
+        - ID selectors: '#id', 'div#id'
+        - Attribute selectors: '[attr]', '[attr=value]'
+        - Descendant selectors: 'div p'
+        - Child selectors: 'div > p'
+        - Multiple classes: '.class1.class2'

         Args:
             selector (str): CSS selector string

@@ -275,54 +282,248 @@
         Returns:
             List[Tag]: List of matching elements
         """
-        # More advanced CSS selector parsing
-        # This is a simplified implementation and might need more robust parsing
-        parts = re.split(r'\s+', selector.strip())
         results = []
-
-        def
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        def _parse_simple_selector(simple_sel: str) -> dict:
+            """Parse a simple selector like 'p.class#id[attr=value]' into components."""
+            components = {
+                'tag': None,
+                'id': None,
+                'classes': [],
+                'attrs': {}
+            }
+
+            # Extract tag name (at the start)
+            tag_match = re.match(r'^([a-zA-Z][\w-]*)', simple_sel)
+            if tag_match:
+                components['tag'] = tag_match.group(1)
+                simple_sel = simple_sel[len(tag_match.group(1)):]
+
+            # Extract ID
+            id_matches = re.findall(r'#([\w-]+)', simple_sel)
+            if id_matches:
+                components['id'] = id_matches[0]
+
+            # Extract classes
+            class_matches = re.findall(r'\.([\w-]+)', simple_sel)
+            components['classes'] = class_matches
+
+            # Extract attributes
+            attr_matches = re.findall(r'\[([^\]]+)\]', simple_sel)
+            for attr_expr in attr_matches:
+                if '=' in attr_expr:
+                    attr_name, attr_value = attr_expr.split('=', 1)
+                    components['attrs'][attr_name.strip()] = attr_value.strip('\'"')
+                else:
+                    components['attrs'][attr_expr.strip()] = None
+
+            return components
+
+        def _match_simple_selector(tag: 'Tag', components: dict) -> bool:
+            """Check if a tag matches the parsed selector components."""
+            # Check tag name
+            if components['tag'] and tag.name != components['tag']:
+                return False
+
+            # Check ID
+            if components['id'] and tag.get('id') != components['id']:
+                return False
+
+            # Check classes
+            tag_classes = tag.get('class', '')
+            if isinstance(tag_classes, str):
+                tag_classes = tag_classes.split()
+            elif not isinstance(tag_classes, list):
+                tag_classes = [str(tag_classes)] if tag_classes else []
+
+            for cls in components['classes']:
+                if cls not in tag_classes:
+                    return False
+
+            # Check attributes
+            for attr_name, attr_value in components['attrs'].items():
+                if attr_value is None:
+                    # Just check attribute exists
+                    if attr_name not in tag.attrs:
                         return False
-            if value:
-                return tag.get(attr) == value.strip("'\"")
-            return attr in tag.attrs
-        else:
-            # Tag selector
-            return tag.name == selector_part
-
-        def _recursive_select(element, selector_parts):
-            if not selector_parts:
-                results.append(element)
-                return
-
-            current_selector = selector_parts[0]
-            remaining_selectors = selector_parts[1:]
-
-            if _match_selector(element, current_selector):
-                if not remaining_selectors:
-                    results.append(element)
                 else:
-
-
-
-
-
-
-
-
+                    # Check attribute value
+                    if tag.get(attr_name) != attr_value:
+                        return False
+
+            return True
+
+        def _find_all_matching(element: 'Tag', components: dict) -> List['Tag']:
+            """Recursively find all elements matching the selector components."""
+            matches = []
+
+            # Check current element
+            if _match_simple_selector(element, components):
+                matches.append(element)
+
+            # Check children recursively
+            for child in element.contents:
+                if isinstance(child, Tag):
+                    matches.extend(_find_all_matching(child, components))
+
+            return matches
+
+        # Handle combinators (descendant ' ' and child '>')
+        if ' > ' in selector:
+            # Child combinator
+            parts = [p.strip() for p in selector.split(' > ')]
+            return self._select_with_child_combinator(parts)
+        elif ' ' in selector.strip():
+            # Descendant combinator
+            parts = [p.strip() for p in selector.split()]
+            return self._select_with_descendant_combinator(parts)
+        else:
+            # Simple selector
+            components = _parse_simple_selector(selector)
+            return _find_all_matching(self, components)
+
+    def _select_with_descendant_combinator(self, parts: List[str]) -> List['Tag']:
+        """Handle descendant combinator (space)."""
+        if not parts:
+            return []
+
+        if len(parts) == 1:
+            components = self._parse_selector_components(parts[0])
+            return self._find_all_matching_in_tree(self, components)
+
+        # Find elements matching the first part
+        first_components = self._parse_selector_components(parts[0])
+        first_matches = self._find_all_matching_in_tree(self, first_components)
+
+        # For each match, find descendants matching remaining parts
+        results = []
+        remaining_selector = ' '.join(parts[1:])
+        for match in first_matches:
+            descendants = match.select(remaining_selector)
+            results.extend(descendants)
+
         return results
+
+    def _select_with_child_combinator(self, parts: List[str]) -> List['Tag']:
+        """Handle child combinator (>)."""
+        if not parts:
+            return []
+
+        if len(parts) == 1:
+            components = self._parse_selector_components(parts[0])
+            return self._find_all_matching_in_tree(self, components)
+
+        # Find elements matching the first part
+        first_components = self._parse_selector_components(parts[0])
+        first_matches = self._find_all_matching_in_tree(self, first_components)
+
+        # For each match, find direct children matching the next part
+        if len(parts) == 2:
+            # Last part, just check direct children
+            next_components = self._parse_selector_components(parts[1])
+            results = []
+            for match in first_matches:
+                for child in match.contents:
+                    if isinstance(child, Tag) and self._match_selector_components(child, next_components):
+                        results.append(child)
+            return results
+        else:
+            # More parts, need to continue recursively
+            results = []
+            next_components = self._parse_selector_components(parts[1])
+            remaining_parts = parts[2:]
+            for match in first_matches:
+                for child in match.contents:
+                    if isinstance(child, Tag) and self._match_selector_components(child, next_components):
+                        # Continue with remaining parts
+                        remaining_selector = ' > '.join(remaining_parts)
+                        descendants = child.select(remaining_selector)
+                        results.extend(descendants)
+            return results
+
+    def _parse_selector_components(self, simple_sel: str) -> dict:
+        """Parse a simple selector like 'p.class#id[attr=value]' into components."""
+        components = {
+            'tag': None,
+            'id': None,
+            'classes': [],
+            'attrs': {}
+        }
+
+        # Extract tag name (at the start)
+        tag_match = re.match(r'^([a-zA-Z][\w-]*)', simple_sel)
+        if tag_match:
+            components['tag'] = tag_match.group(1)
+            simple_sel = simple_sel[len(tag_match.group(1)):]
+
+        # Extract ID
+        id_matches = re.findall(r'#([\w-]+)', simple_sel)
+        if id_matches:
+            components['id'] = id_matches[0]
+
+        # Extract classes
+        class_matches = re.findall(r'\.([\w-]+)', simple_sel)
+        components['classes'] = class_matches
+
+        # Extract attributes
+        attr_matches = re.findall(r'\[([^\]]+)\]', simple_sel)
+        for attr_expr in attr_matches:
+            if '=' in attr_expr:
+                attr_name, attr_value = attr_expr.split('=', 1)
+                components['attrs'][attr_name.strip()] = attr_value.strip('\'"')
+            else:
+                components['attrs'][attr_expr.strip()] = None
+
+        return components
+
+    def _match_selector_components(self, tag: 'Tag', components: dict) -> bool:
+        """Check if a tag matches the parsed selector components."""
+        # Check tag name
+        if components['tag'] and tag.name != components['tag']:
+            return False
+
+        # Check ID
+        if components['id'] and tag.get('id') != components['id']:
+            return False
+
+        # Check classes
+        tag_classes = tag.get('class', '')
+        if isinstance(tag_classes, str):
+            tag_classes = tag_classes.split()
+        elif not isinstance(tag_classes, list):
+            tag_classes = [str(tag_classes)] if tag_classes else []
+
+        for cls in components['classes']:
+            if cls not in tag_classes:
+                return False
+
+        # Check attributes
+        for attr_name, attr_value in components['attrs'].items():
+            if attr_value is None:
+                # Just check attribute exists
+                if attr_name not in tag.attrs:
+                    return False
+            else:
+                # Check attribute value
+                if tag.get(attr_name) != attr_value:
+                    return False
+
+        return True
+
+    def _find_all_matching_in_tree(self, element: 'Tag', components: dict) -> List['Tag']:
+        """Recursively find all elements matching the selector components."""
+        matches = []
+
+        # Check current element
+        if self._match_selector_components(element, components):
+            matches.append(element)
+
+        # Check children recursively
+        for child in element.contents:
+            if isinstance(child, Tag):
+                matches.extend(self._find_all_matching_in_tree(child, components))
+
+        return matches

     def select_one(self, selector: str) -> Optional['Tag']:
         """

@@ -462,6 +663,11 @@ class Tag:
         new_child.parent = self
         self.contents.append(new_child)

+    def extend(self, new_children: List[Union['Tag', NavigableString, str]]) -> None:
+        """Extend the contents of this tag with a list of new children."""
+        for child in new_children:
+            self.append(child)
+
     def insert(self, index: int, new_child: Union['Tag', NavigableString, str]) -> None:
         """Insert a new child at the given index with error handling."""
         if isinstance(new_child, str):
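Taken together, the new helpers give `Tag.select()` support for tag, class, ID, attribute, descendant, and child selectors, while `extend()` is a thin loop over `append()`. A hedged usage sketch against made-up HTML, assuming `Scout` exposes the parsed tree as `Tag` objects carrying this `select()` implementation:

```python
from webscout.scout import Scout

# Illustrative markup only; element names and classes are invented.
html = """
<div class="container">
  <ul id="menu">
    <li class="item special"><a href="/a" rel="nofollow">A</a></li>
    <li class="item"><a href="/b">B</a></li>
  </ul>
</div>
"""
scout = Scout(html, features='html.parser')

scout.select('li.item')             # class selector
scout.select('ul#menu > li')        # child combinator
scout.select('div li a[href]')      # descendant chain + attribute presence
scout.select('a[rel="nofollow"]')   # attribute value selector
first = scout.select_one('li.item.special')  # multiple classes, first match only
```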
webscout/search/__init__.py
CHANGED

@@ -4,14 +4,14 @@ from .base import BaseSearch, BaseSearchEngine
 from .duckduckgo_main import DuckDuckGoSearch
 from .yep_main import YepSearch
 from .bing_main import BingSearch
+from .yahoo_main import YahooSearch

 # Import new search engines
 from .engines.brave import Brave
 from .engines.mojeek import Mojeek
-
+
 from .engines.yandex import Yandex
 from .engines.wikipedia import Wikipedia
-from .engines.yahoo_news import YahooNews

 # Import result models
 from .results import (

@@ -31,14 +31,13 @@ __all__ = [
     "DuckDuckGoSearch",
     "YepSearch",
     "BingSearch",
+    "YahooSearch",

     # Individual engines
     "Brave",
     "Mojeek",
-    "Yahoo",
     "Yandex",
     "Wikipedia",
-    "YahooNews",

     # Result models
     "TextResult",
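The net effect of this change is a single Yahoo entry point. Only the import paths below are confirmed by the diff; how `YahooSearch` is constructed and queried is not shown in this hunk:

```python
# New in 2025.10.16
from webscout.search import YahooSearch

# Removed in 2025.10.16 (these imports no longer work):
# from webscout.search import Yahoo
# from webscout.search import YahooNews
```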