webscout 2025.10.14.1__py3-none-any.whl → 2025.10.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic; see the package registry page for more details.
- webscout/Extra/YTToolkit/README.md +1 -1
- webscout/Extra/tempmail/README.md +3 -3
- webscout/Provider/OPENAI/README.md +1 -1
- webscout/Provider/TTI/bing.py +4 -4
- webscout/__init__.py +1 -1
- webscout/cli.py +0 -147
- webscout/client.py +4 -5
- webscout/litprinter/__init__.py +0 -42
- webscout/scout/README.md +59 -8
- webscout/scout/core/scout.py +62 -0
- webscout/scout/element.py +251 -45
- webscout/search/__init__.py +5 -8
- webscout/search/bing_main.py +42 -0
- webscout/search/engines/bing/__init__.py +1 -0
- webscout/search/engines/bing/base.py +33 -0
- webscout/search/engines/bing/images.py +108 -0
- webscout/search/engines/bing/news.py +91 -0
- webscout/search/engines/bing/suggestions.py +34 -0
- webscout/search/engines/bing/text.py +106 -0
- webscout/search/engines/duckduckgo/maps.py +13 -0
- webscout/search/engines/yahoo/__init__.py +41 -0
- webscout/search/engines/yahoo/answers.py +16 -0
- webscout/search/engines/yahoo/base.py +34 -0
- webscout/search/engines/yahoo/images.py +324 -0
- webscout/search/engines/yahoo/maps.py +16 -0
- webscout/search/engines/yahoo/news.py +258 -0
- webscout/search/engines/yahoo/suggestions.py +140 -0
- webscout/search/engines/yahoo/text.py +273 -0
- webscout/search/engines/yahoo/translate.py +16 -0
- webscout/search/engines/yahoo/videos.py +302 -0
- webscout/search/engines/yahoo/weather.py +220 -0
- webscout/search/http_client.py +1 -1
- webscout/search/yahoo_main.py +54 -0
- webscout/{auth → server}/__init__.py +2 -23
- webscout/server/config.py +84 -0
- webscout/{auth → server}/request_processing.py +3 -28
- webscout/{auth → server}/routes.py +14 -170
- webscout/server/schemas.py +23 -0
- webscout/{auth → server}/server.py +11 -43
- webscout/server/simple_logger.py +84 -0
- webscout/version.py +1 -1
- webscout/version.py.bak +1 -1
- webscout/zeroart/README.md +17 -9
- webscout/zeroart/__init__.py +78 -6
- webscout/zeroart/effects.py +51 -1
- webscout/zeroart/fonts.py +559 -1
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/METADATA +15 -332
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/RECORD +55 -48
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/entry_points.txt +1 -1
- webscout/Bing_search.py +0 -417
- webscout/DWEBS.py +0 -529
- webscout/auth/api_key_manager.py +0 -189
- webscout/auth/auth_system.py +0 -85
- webscout/auth/config.py +0 -175
- webscout/auth/database.py +0 -755
- webscout/auth/middleware.py +0 -248
- webscout/auth/models.py +0 -185
- webscout/auth/rate_limiter.py +0 -254
- webscout/auth/schemas.py +0 -103
- webscout/auth/simple_logger.py +0 -236
- webscout/search/engines/bing.py +0 -84
- webscout/search/engines/bing_news.py +0 -52
- webscout/search/engines/yahoo.py +0 -65
- webscout/search/engines/yahoo_news.py +0 -64
- /webscout/{auth → server}/exceptions.py +0 -0
- /webscout/{auth → server}/providers.py +0 -0
- /webscout/{auth → server}/request_models.py +0 -0
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/WHEEL +0 -0
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Bing text search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Dict, List
|
|
6
|
+
from urllib.parse import urlencode
|
|
7
|
+
from time import sleep
|
|
8
|
+
|
|
9
|
+
from .base import BingBase
|
|
10
|
+
from webscout.scout import Scout
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BingTextSearch(BingBase):
    """Bing web (text) search.

    Scrapes the public Bing results pages and follows "next page" links
    until ``max_results`` results have been collected.
    """

    @staticmethod
    def _decode_bing_redirect(href: str) -> str:
        """Decode a Bing ``/ck/a?`` redirect link to the real target URL.

        Bing wraps result URLs as ``/ck/a?...&u=a1<base64>``: the ``u``
        parameter carries url-safe base64 (often missing padding) behind an
        ``a1`` prefix.  Returns the decoded URL, or ``href`` unchanged when
        it cannot be decoded.
        """
        import base64
        from urllib.parse import parse_qs, urlparse

        try:
            query_params = parse_qs(urlparse(href).query)
            if 'u' not in query_params:
                return href
            encoded_url = query_params['u'][0]
            if encoded_url.startswith('a1'):
                encoded_url = encoded_url[2:]
            # Restore the base64 padding Bing strips.
            padding = len(encoded_url) % 4
            if padding:
                encoded_url += '=' * (4 - padding)
            return base64.urlsafe_b64decode(encoded_url).decode()
        except Exception:
            # Best effort: a malformed redirect keeps its wrapped form.
            return href

    def run(self, *args, **kwargs) -> List[Dict[str, str]]:
        """Run a Bing text search.

        Args (positional or keyword):
            keywords: Search query (mandatory).
            region: Region code (read but currently not sent with requests).
            safesearch: "on" / "moderate" / "off" (see NOTE below).
            max_results: Maximum number of results (default 10).
            unique: Drop duplicate links (keyword-only, default True).

        Returns:
            List of ``{"title", "href", "body"}`` dictionaries.

        Raises:
            ValueError: If ``keywords`` is empty.
            Exception: If a results page cannot be fetched.
        """
        keywords = args[0] if args else kwargs.get("keywords")
        region = args[1] if len(args) > 1 else kwargs.get("region", "us")
        safesearch = args[2] if len(args) > 2 else kwargs.get("safesearch", "moderate")
        max_results = args[3] if len(args) > 3 else kwargs.get("max_results", 10)
        unique = kwargs.get("unique", True)

        if max_results is None:
            max_results = 10

        if not keywords:
            raise ValueError("Keywords are mandatory")

        safe_map = {
            "on": "Strict",
            "moderate": "Moderate",
            "off": "Off"
        }
        # NOTE(review): `safe` is computed but never sent with the request,
        # so the safesearch setting currently has no effect on results --
        # wiring it up needs the Bing-side parameter/cookie confirmed first.
        safe = safe_map.get(safesearch.lower(), "Moderate")

        fetched_results: List[Dict[str, str]] = []
        fetched_links = set()

        def fetch_page(url):
            """Fetch one results page, raising on any HTTP/network error."""
            try:
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return response.text
            except Exception as e:
                raise Exception(f"Failed to fetch page: {str(e)}") from e

        # First page URL.  The query is percent-encoded (bug fix: the raw
        # f-string interpolation broke queries containing '&', '#', '%', ...).
        params = urlencode({"q": keywords, "search": "", "form": "QBLH"})
        urls_to_fetch = [f"{self.base_url}/search?{params}"]

        while len(fetched_results) < max_results and urls_to_fetch:
            current_url = urls_to_fetch.pop(0)
            html = fetch_page(current_url)
            soup = Scout(html)

            for link in soup.select('ol#b_results > li.b_algo'):
                if len(fetched_results) >= max_results:
                    break
                title_tag = link.select_one('h2')
                url_tag = link.select_one('h2 a')
                text_tag = link.select_one('p')
                if not (title_tag and url_tag and text_tag):
                    continue

                href = url_tag.get('href', '')
                if href.startswith('/ck/a?'):
                    href = self._decode_bing_redirect(href)

                if unique and href in fetched_links:
                    continue
                fetched_links.add(href)

                fetched_results.append({
                    'title': title_tag.get_text(strip=True),
                    'href': href,
                    'body': text_tag.get_text(strip=True),
                })

            # Queue the next results page, if any.
            next_page_tag = soup.select_one('div#b_content nav[role="navigation"] a.sb_pagN')
            if next_page_tag and next_page_tag.get('href'):
                urls_to_fetch.append(self.base_url + next_page_tag['href'])

            if self.sleep_interval:
                sleep(self.sleep_interval)

        return fetched_results[:max_results]
|
|
@@ -1,12 +1,25 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from decimal import Decimal
|
|
4
|
+
from math import sqrt
|
|
4
5
|
|
|
5
6
|
from ....exceptions import WebscoutE
|
|
6
7
|
from .base import DuckDuckGoBase
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class DuckDuckGoMaps(DuckDuckGoBase):
|
|
11
|
+
def _calculate_distance(self, lat_t: Decimal, lon_l: Decimal, lat_b: Decimal, lon_r: Decimal) -> float:
|
|
12
|
+
"""Calculate the Euclidean distance between top-left and bottom-right corners of bounding box."""
|
|
13
|
+
# Convert to float for math operations
|
|
14
|
+
lat_t_f = float(lat_t)
|
|
15
|
+
lon_l_f = float(lon_l)
|
|
16
|
+
lat_b_f = float(lat_b)
|
|
17
|
+
lon_r_f = float(lon_r)
|
|
18
|
+
|
|
19
|
+
# Calculate Euclidean distance
|
|
20
|
+
distance = sqrt((lat_t_f - lat_b_f) ** 2 + (lon_r_f - lon_l_f) ** 2)
|
|
21
|
+
return distance
|
|
22
|
+
|
|
10
23
|
def run(self, *args, **kwargs) -> list[dict[str, str]]:
|
|
11
24
|
keywords = args[0] if args else kwargs.get("keywords")
|
|
12
25
|
place = args[1] if len(args) > 1 else kwargs.get("place")
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Yahoo search engines package.
|
|
2
|
+
|
|
3
|
+
This package provides comprehensive Yahoo search functionality including:
|
|
4
|
+
- Text search with multi-page pagination
|
|
5
|
+
- Image search with advanced filters
|
|
6
|
+
- Video search with quality and length filters
|
|
7
|
+
- News search with time filtering
|
|
8
|
+
- Search suggestions/autocomplete
|
|
9
|
+
|
|
10
|
+
All engines support:
|
|
11
|
+
- Human-like browsing through multiple pages
|
|
12
|
+
- Rich metadata extraction
|
|
13
|
+
- Filter support
|
|
14
|
+
- Clean result formatting
|
|
15
|
+
|
|
16
|
+
Example:
|
|
17
|
+
>>> from webscout.search.engines.yahoo import YahooText
|
|
18
|
+
>>>
|
|
19
|
+
>>> # Search with automatic pagination
|
|
20
|
+
>>> searcher = YahooText()
|
|
21
|
+
>>> results = searcher.search("python programming", max_results=50)
|
|
22
|
+
>>>
|
|
23
|
+
>>> for result in results:
|
|
24
|
+
... print(f"{result.title}: {result.url}")
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from .base import YahooSearchEngine
|
|
28
|
+
from .images import YahooImages
|
|
29
|
+
from .news import YahooNews
|
|
30
|
+
from .suggestions import YahooSuggestions
|
|
31
|
+
from .text import YahooText
|
|
32
|
+
from .videos import YahooVideos
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"YahooSearchEngine",
|
|
36
|
+
"YahooText",
|
|
37
|
+
"YahooImages",
|
|
38
|
+
"YahooVideos",
|
|
39
|
+
"YahooNews",
|
|
40
|
+
"YahooSuggestions",
|
|
41
|
+
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Yahoo answers search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import YahooSearchEngine
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class YahooAnswers(YahooSearchEngine):
    """Placeholder engine for Yahoo instant answers.

    Yahoo exposes no instant-answer endpoint, so this engine exists only
    to keep the provider's engine set uniform.
    """

    def run(self, *args, **kwargs) -> list[dict[str, str]]:
        """Always raise: instant answers are not available on Yahoo.

        Raises:
            NotImplementedError: Unconditionally.
        """
        raise NotImplementedError("Yahoo does not support instant answers")
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Base class for Yahoo search engines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from secrets import token_urlsafe
|
|
6
|
+
from typing import Any, Generic, TypeVar
|
|
7
|
+
|
|
8
|
+
from ...base import BaseSearchEngine
|
|
9
|
+
|
|
10
|
+
T = TypeVar("T")
|
|
11
|
+
|
|
12
|
+
class YahooSearchEngine(BaseSearchEngine[T], Generic[T]):
    """Shared behaviour for all Yahoo search engines.

    Yahoo search is powered by Bing but has its own interface; every
    search URL embeds dynamically generated ``_ylt``/``_ylu`` tracking
    tokens.
    """

    provider = "yahoo"
    _base_url = "https://search.yahoo.com"

    def generate_ylt_token(self) -> str:
        """Generate Yahoo _ylt tracking token.

        18 random bytes encode to exactly 24 url-safe characters.
        """
        return token_urlsafe(18)  # == 24 * 3 // 4

    def generate_ylu_token(self) -> str:
        """Generate Yahoo _ylu tracking token.

        35 random bytes encode to exactly 47 url-safe characters.
        """
        return token_urlsafe(35)  # == 47 * 3 // 4

    def build_search_url(self, base_path: str) -> str:
        """Build search URL for ``base_path`` with fresh tracking tokens."""
        tracking = f";_ylt={self.generate_ylt_token()};_ylu={self.generate_ylu_token()}"
        return f"{self._base_url}/{base_path}" + tracking
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
"""Yahoo image search engine with advanced filters."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import urljoin
|
|
8
|
+
|
|
9
|
+
from .base import YahooSearchEngine
|
|
10
|
+
from ...results import ImagesResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class YahooImages(YahooSearchEngine[ImagesResult]):
    """Yahoo image search engine with filter support.

    Features:
    - Size filters (small, medium, large, wallpaper)
    - Color filters (color, bw, red, orange, yellow, etc.)
    - Type filters (photo, clipart, lineart, transparent)
    - Layout filters (square, wide, tall)
    - Time filters
    - Pagination support

    Note: Yahoo does not support reverse image search (searching by image upload/URL).
    For reverse image search functionality, use Google Images or Bing Images instead.
    """

    name = "yahoo"
    category = "images"

    search_url = "https://images.search.yahoo.com/search/images"
    search_method = "GET"
    # Mobile Safari UA -- the mobile page carries the JSON metadata blob
    # that post_extract_results() parses.
    search_headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1"
    }

    # XPath selectors.  Every field maps to the same `data` attribute: it
    # holds a JSON blob that is decoded into real fields afterwards.
    items_xpath = "//li[contains(@class, 'ld')]"
    elements_xpath: Mapping[str, str] = {
        "title": "@data",
        "image": "@data",
        "thumbnail": "@data",
        "url": "@data",
        "source": "@data",
        "width": "@data",
        "height": "@data",
    }

    # Filter mappings.  An empty value means "no query parameter is sent".
    SIZE_FILTERS = {
        "small": "small",
        "medium": "medium",
        "large": "large",
        "wallpaper": "wallpaper",
        "all": "",
    }

    COLOR_FILTERS = {
        "color": "color",
        "bw": "bw",
        "black": "black",
        "white": "white",
        "red": "red",
        "orange": "orange",
        "yellow": "yellow",
        "green": "green",
        "teal": "teal",
        "blue": "blue",
        "purple": "purple",
        "pink": "pink",
        "brown": "brown",
        "gray": "gray",
        "all": "",
    }

    TYPE_FILTERS = {
        "photo": "photo",
        "clipart": "clipart",
        "lineart": "linedrawing",
        "transparent": "transparent",
        "gif": "animatedgif",
        "all": "",
    }

    LAYOUT_FILTERS = {
        "square": "square",
        "wide": "wide",
        "tall": "tall",
        "all": "",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Build image search payload with filters.

        Args:
            query: Search query
            region: Region code
            safesearch: Safe search level (on/moderate/off)
            timelimit: Time filter (d, w, m)
            page: Page number
            **kwargs: Additional filters including:
                - size: Image size filter
                - color: Color filter
                - type (or type_image): Image type filter
                - layout: Layout/aspect ratio filter

        Returns:
            Query parameters dictionary
        """
        payload: dict[str, Any] = {
            "p": query,
        }

        # Pagination - Yahoo images use the 'b' (begin-offset) parameter;
        # each page shows approximately 40 images.
        if page > 1:
            payload["b"] = f"{(page - 1) * 40 + 1}"

        # Safe search ("moderate" is Yahoo's default, so nothing is sent).
        if safesearch == "on":
            payload["safe"] = "active"
        elif safesearch == "off":
            payload["safe"] = "off"

        # Time filter
        if timelimit:
            time_map = {
                "d": "1d",  # Past 24 hours
                "w": "1w",  # Past week
                "m": "1m",  # Past month
            }
            if timelimit in time_map:
                payload["age"] = time_map[timelimit]

        # Size filter
        if "size" in kwargs and kwargs["size"] in self.SIZE_FILTERS:
            size_val = self.SIZE_FILTERS[kwargs["size"]]
            if size_val:
                payload["imgsz"] = size_val

        # Color filter
        if "color" in kwargs and kwargs["color"] in self.COLOR_FILTERS:
            color_val = self.COLOR_FILTERS[kwargs["color"]]
            if color_val:
                payload["imgc"] = color_val

        # Type filter.  Accept both "type" (documented) and "type_image"
        # (the keyword run() forwards).  Bug fix: previously only "type"
        # was read, so a type filter given to run() was silently dropped.
        type_filter = kwargs.get("type") or kwargs.get("type_image")
        if type_filter in self.TYPE_FILTERS:
            type_val = self.TYPE_FILTERS[type_filter]
            if type_val:
                payload["imgt"] = type_val

        # Layout filter
        if "layout" in kwargs and kwargs["layout"] in self.LAYOUT_FILTERS:
            layout_val = self.LAYOUT_FILTERS[kwargs["layout"]]
            if layout_val:
                payload["imgsp"] = layout_val

        return payload

    def post_extract_results(self, results: list[ImagesResult]) -> list[ImagesResult]:
        """Post-process image results to parse JSON data.

        Each raw result's fields still contain the JSON blob taken from the
        element's `data` attribute; decode it into title/url/thumbnail/image
        and pixel dimensions, then un-percent-encode the URLs.

        Args:
            results: Raw extracted results

        Returns:
            Cleaned results with proper URLs and metadata
        """
        import json
        from urllib.parse import unquote

        cleaned_results = []

        for result in results:
            # Parse JSON data from the data attribute
            if result.title and result.title.startswith('{'):
                try:
                    data = json.loads(result.title)

                    # Extract title ('desc' preferred, 'tit' as fallback).
                    result.title = data.get('desc', '') or data.get('tit', '')

                    # Extract URLs: 'rurl' is the hosting page, 'turl' the
                    # thumbnail, 'turlL' the larger thumbnail variant.
                    result.url = data.get('rurl', '')
                    result.thumbnail = data.get('turl', '')
                    result.image = data.get('turlL', '') or data.get('turl', '')

                    # Extract dimensions
                    result.width = int(data.get('imgW', 0))
                    result.height = int(data.get('imgH', 0))

                except (json.JSONDecodeError, KeyError, ValueError):
                    # If JSON parsing fails, keep original data
                    pass

            # Clean URLs if they exist
            if result.url:
                result.url = unquote(result.url)
            if result.image:
                result.image = unquote(result.image)
            if result.thumbnail:
                result.thumbnail = unquote(result.thumbnail)

            cleaned_results.append(result)

        return cleaned_results

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        max_results: int | None = None,
        **kwargs: Any,
    ) -> list[ImagesResult] | None:
        """Search Yahoo Images with pagination.

        Args:
            query: Image search query
            region: Region code
            safesearch: Safe search level
            timelimit: Time filter
            page: Starting page
            max_results: Maximum results to return
            **kwargs: Additional filters (size, color, type/type_image,
                layout) plus max_pages (page-fetch cap, default 5)

        Returns:
            List of ImagesResult objects, or None when nothing was found.
        """
        results = []
        current_page = page
        # Hard cap on page fetches so an endless result stream cannot loop.
        max_pages = kwargs.get("max_pages", 5)

        while current_page <= max_pages:
            payload = self.build_payload(
                query=query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit,
                page=current_page,
                **kwargs
            )

            html_text = self.request(self.search_method, self.search_url, params=payload)
            if not html_text:
                break

            html_text = self.pre_process_html(html_text)
            page_results = self.extract_results(html_text)

            # An empty page means pagination is exhausted.
            if not page_results:
                break

            results.extend(page_results)

            if max_results and len(results) >= max_results:
                break

            current_page += 1

        results = self.post_extract_results(results)

        if max_results:
            results = results[:max_results]

        return results if results else None

    def run(
        self,
        keywords: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        size: str | None = None,
        color: str | None = None,
        type_image: str | None = None,
        layout: str | None = None,
        license_image: str | None = None,
        max_results: int | None = None,
    ) -> list[dict[str, str]]:
        """Run image search and return results as dictionaries.

        Args:
            keywords: Search query.
            region: Region code.
            safesearch: Safe search level.
            timelimit: Time filter.
            size: Image size filter.
            color: Color filter.
            type_image: Image type filter.
            layout: Layout filter.
            license_image: Accepted for API compatibility but currently
                ignored -- no license parameter is sent to Yahoo.
            max_results: Maximum number of results.

        Returns:
            List of image result dictionaries.
        """
        results = self.search(
            query=keywords,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            size=size,
            color=color,
            type_image=type_image,
            layout=layout,
            license_image=license_image,
            max_results=max_results,
        )
        if results is None:
            return []
        return [result.to_dict() for result in results]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Yahoo maps search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import YahooSearchEngine
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class YahooMaps(YahooSearchEngine):
    """Placeholder engine for Yahoo maps search.

    Yahoo exposes no maps search endpoint, so this engine exists only to
    keep the provider's engine set uniform.
    """

    def run(self, *args, **kwargs) -> list[dict[str, str]]:
        """Always raise: maps search is not available on Yahoo.

        Raises:
            NotImplementedError: Unconditionally.
        """
        raise NotImplementedError("Yahoo does not support maps search")
|