webscout 2025.10.15__py3-none-any.whl → 2025.10.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/Extra/YTToolkit/README.md +1 -1
- webscout/Extra/tempmail/README.md +3 -3
- webscout/Provider/ClaudeOnline.py +350 -0
- webscout/Provider/OPENAI/README.md +1 -1
- webscout/Provider/TTI/bing.py +4 -4
- webscout/Provider/TTI/claudeonline.py +315 -0
- webscout/__init__.py +1 -1
- webscout/client.py +4 -5
- webscout/litprinter/__init__.py +0 -42
- webscout/scout/README.md +59 -8
- webscout/scout/core/scout.py +62 -0
- webscout/scout/element.py +251 -45
- webscout/search/__init__.py +3 -4
- webscout/search/engines/bing/images.py +5 -2
- webscout/search/engines/bing/news.py +6 -4
- webscout/search/engines/bing/text.py +5 -2
- webscout/search/engines/yahoo/__init__.py +41 -0
- webscout/search/engines/yahoo/answers.py +16 -0
- webscout/search/engines/yahoo/base.py +34 -0
- webscout/search/engines/yahoo/images.py +324 -0
- webscout/search/engines/yahoo/maps.py +16 -0
- webscout/search/engines/yahoo/news.py +258 -0
- webscout/search/engines/yahoo/suggestions.py +140 -0
- webscout/search/engines/yahoo/text.py +273 -0
- webscout/search/engines/yahoo/translate.py +16 -0
- webscout/search/engines/yahoo/videos.py +302 -0
- webscout/search/engines/yahoo/weather.py +220 -0
- webscout/search/http_client.py +1 -1
- webscout/search/yahoo_main.py +54 -0
- webscout/{auth → server}/__init__.py +2 -23
- webscout/server/config.py +84 -0
- webscout/{auth → server}/request_processing.py +3 -28
- webscout/{auth → server}/routes.py +6 -148
- webscout/server/schemas.py +23 -0
- webscout/{auth → server}/server.py +11 -43
- webscout/server/simple_logger.py +84 -0
- webscout/version.py +1 -1
- webscout/version.py.bak +1 -1
- webscout/zeroart/README.md +17 -9
- webscout/zeroart/__init__.py +78 -6
- webscout/zeroart/effects.py +51 -1
- webscout/zeroart/fonts.py +559 -1
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/METADATA +11 -54
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/RECORD +51 -46
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/entry_points.txt +1 -1
- webscout/Extra/weather.md +0 -281
- webscout/auth/api_key_manager.py +0 -189
- webscout/auth/auth_system.py +0 -85
- webscout/auth/config.py +0 -175
- webscout/auth/database.py +0 -755
- webscout/auth/middleware.py +0 -248
- webscout/auth/models.py +0 -185
- webscout/auth/rate_limiter.py +0 -254
- webscout/auth/schemas.py +0 -103
- webscout/auth/simple_logger.py +0 -236
- webscout/search/engines/yahoo.py +0 -65
- webscout/search/engines/yahoo_news.py +0 -64
- /webscout/{auth → server}/exceptions.py +0 -0
- /webscout/{auth → server}/providers.py +0 -0
- /webscout/{auth → server}/request_models.py +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/WHEEL +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/top_level.txt +0 -0
webscout/scout/element.py
CHANGED
|
@@ -267,7 +267,14 @@ class Tag:
|
|
|
267
267
|
def select(self, selector: str) -> List['Tag']:
|
|
268
268
|
"""
|
|
269
269
|
Select elements using CSS selector.
|
|
270
|
-
Enhanced to support more complex selectors
|
|
270
|
+
Enhanced to support more complex selectors including:
|
|
271
|
+
- Tag selectors: 'p', 'div'
|
|
272
|
+
- Class selectors: '.class', 'p.class'
|
|
273
|
+
- ID selectors: '#id', 'div#id'
|
|
274
|
+
- Attribute selectors: '[attr]', '[attr=value]'
|
|
275
|
+
- Descendant selectors: 'div p'
|
|
276
|
+
- Child selectors: 'div > p'
|
|
277
|
+
- Multiple classes: '.class1.class2'
|
|
271
278
|
|
|
272
279
|
Args:
|
|
273
280
|
selector (str): CSS selector string
|
|
@@ -275,54 +282,248 @@ class Tag:
|
|
|
275
282
|
Returns:
|
|
276
283
|
List[Tag]: List of matching elements
|
|
277
284
|
"""
|
|
278
|
-
# More advanced CSS selector parsing
|
|
279
|
-
# This is a simplified implementation and might need more robust parsing
|
|
280
|
-
parts = re.split(r'\s+', selector.strip())
|
|
281
285
|
results = []
|
|
282
|
-
|
|
283
|
-
def
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
286
|
+
|
|
287
|
+
def _parse_simple_selector(simple_sel: str) -> dict:
|
|
288
|
+
"""Parse a simple selector like 'p.class#id[attr=value]' into components."""
|
|
289
|
+
components = {
|
|
290
|
+
'tag': None,
|
|
291
|
+
'id': None,
|
|
292
|
+
'classes': [],
|
|
293
|
+
'attrs': {}
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
# Extract tag name (at the start)
|
|
297
|
+
tag_match = re.match(r'^([a-zA-Z][\w-]*)', simple_sel)
|
|
298
|
+
if tag_match:
|
|
299
|
+
components['tag'] = tag_match.group(1)
|
|
300
|
+
simple_sel = simple_sel[len(tag_match.group(1)):]
|
|
301
|
+
|
|
302
|
+
# Extract ID
|
|
303
|
+
id_matches = re.findall(r'#([\w-]+)', simple_sel)
|
|
304
|
+
if id_matches:
|
|
305
|
+
components['id'] = id_matches[0]
|
|
306
|
+
|
|
307
|
+
# Extract classes
|
|
308
|
+
class_matches = re.findall(r'\.([\w-]+)', simple_sel)
|
|
309
|
+
components['classes'] = class_matches
|
|
310
|
+
|
|
311
|
+
# Extract attributes
|
|
312
|
+
attr_matches = re.findall(r'\[([^\]]+)\]', simple_sel)
|
|
313
|
+
for attr_expr in attr_matches:
|
|
314
|
+
if '=' in attr_expr:
|
|
315
|
+
attr_name, attr_value = attr_expr.split('=', 1)
|
|
316
|
+
components['attrs'][attr_name.strip()] = attr_value.strip('\'"')
|
|
317
|
+
else:
|
|
318
|
+
components['attrs'][attr_expr.strip()] = None
|
|
319
|
+
|
|
320
|
+
return components
|
|
321
|
+
|
|
322
|
+
def _match_simple_selector(tag: 'Tag', components: dict) -> bool:
|
|
323
|
+
"""Check if a tag matches the parsed selector components."""
|
|
324
|
+
# Check tag name
|
|
325
|
+
if components['tag'] and tag.name != components['tag']:
|
|
326
|
+
return False
|
|
327
|
+
|
|
328
|
+
# Check ID
|
|
329
|
+
if components['id'] and tag.get('id') != components['id']:
|
|
330
|
+
return False
|
|
331
|
+
|
|
332
|
+
# Check classes
|
|
333
|
+
tag_classes = tag.get('class', '')
|
|
334
|
+
if isinstance(tag_classes, str):
|
|
335
|
+
tag_classes = tag_classes.split()
|
|
336
|
+
elif not isinstance(tag_classes, list):
|
|
337
|
+
tag_classes = [str(tag_classes)] if tag_classes else []
|
|
338
|
+
|
|
339
|
+
for cls in components['classes']:
|
|
340
|
+
if cls not in tag_classes:
|
|
341
|
+
return False
|
|
342
|
+
|
|
343
|
+
# Check attributes
|
|
344
|
+
for attr_name, attr_value in components['attrs'].items():
|
|
345
|
+
if attr_value is None:
|
|
346
|
+
# Just check attribute exists
|
|
347
|
+
if attr_name not in tag.attrs:
|
|
297
348
|
return False
|
|
298
|
-
if value:
|
|
299
|
-
return tag.get(attr) == value.strip("'\"")
|
|
300
|
-
return attr in tag.attrs
|
|
301
|
-
else:
|
|
302
|
-
# Tag selector
|
|
303
|
-
return tag.name == selector_part
|
|
304
|
-
|
|
305
|
-
def _recursive_select(element, selector_parts):
|
|
306
|
-
if not selector_parts:
|
|
307
|
-
results.append(element)
|
|
308
|
-
return
|
|
309
|
-
|
|
310
|
-
current_selector = selector_parts[0]
|
|
311
|
-
remaining_selectors = selector_parts[1:]
|
|
312
|
-
|
|
313
|
-
if _match_selector(element, current_selector):
|
|
314
|
-
if not remaining_selectors:
|
|
315
|
-
results.append(element)
|
|
316
349
|
else:
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
350
|
+
# Check attribute value
|
|
351
|
+
if tag.get(attr_name) != attr_value:
|
|
352
|
+
return False
|
|
353
|
+
|
|
354
|
+
return True
|
|
355
|
+
|
|
356
|
+
def _find_all_matching(element: 'Tag', components: dict) -> List['Tag']:
|
|
357
|
+
"""Recursively find all elements matching the selector components."""
|
|
358
|
+
matches = []
|
|
359
|
+
|
|
360
|
+
# Check current element
|
|
361
|
+
if _match_simple_selector(element, components):
|
|
362
|
+
matches.append(element)
|
|
363
|
+
|
|
364
|
+
# Check children recursively
|
|
365
|
+
for child in element.contents:
|
|
366
|
+
if isinstance(child, Tag):
|
|
367
|
+
matches.extend(_find_all_matching(child, components))
|
|
368
|
+
|
|
369
|
+
return matches
|
|
370
|
+
|
|
371
|
+
# Handle combinators (descendant ' ' and child '>')
|
|
372
|
+
if ' > ' in selector:
|
|
373
|
+
# Child combinator
|
|
374
|
+
parts = [p.strip() for p in selector.split(' > ')]
|
|
375
|
+
return self._select_with_child_combinator(parts)
|
|
376
|
+
elif ' ' in selector.strip():
|
|
377
|
+
# Descendant combinator
|
|
378
|
+
parts = [p.strip() for p in selector.split()]
|
|
379
|
+
return self._select_with_descendant_combinator(parts)
|
|
380
|
+
else:
|
|
381
|
+
# Simple selector
|
|
382
|
+
components = _parse_simple_selector(selector)
|
|
383
|
+
return _find_all_matching(self, components)
|
|
384
|
+
|
|
385
|
+
def _select_with_descendant_combinator(self, parts: List[str]) -> List['Tag']:
|
|
386
|
+
"""Handle descendant combinator (space)."""
|
|
387
|
+
if not parts:
|
|
388
|
+
return []
|
|
389
|
+
|
|
390
|
+
if len(parts) == 1:
|
|
391
|
+
components = self._parse_selector_components(parts[0])
|
|
392
|
+
return self._find_all_matching_in_tree(self, components)
|
|
393
|
+
|
|
394
|
+
# Find elements matching the first part
|
|
395
|
+
first_components = self._parse_selector_components(parts[0])
|
|
396
|
+
first_matches = self._find_all_matching_in_tree(self, first_components)
|
|
397
|
+
|
|
398
|
+
# For each match, find descendants matching remaining parts
|
|
399
|
+
results = []
|
|
400
|
+
remaining_selector = ' '.join(parts[1:])
|
|
401
|
+
for match in first_matches:
|
|
402
|
+
descendants = match.select(remaining_selector)
|
|
403
|
+
results.extend(descendants)
|
|
404
|
+
|
|
325
405
|
return results
|
|
406
|
+
|
|
407
|
+
def _select_with_child_combinator(self, parts: List[str]) -> List['Tag']:
|
|
408
|
+
"""Handle child combinator (>)."""
|
|
409
|
+
if not parts:
|
|
410
|
+
return []
|
|
411
|
+
|
|
412
|
+
if len(parts) == 1:
|
|
413
|
+
components = self._parse_selector_components(parts[0])
|
|
414
|
+
return self._find_all_matching_in_tree(self, components)
|
|
415
|
+
|
|
416
|
+
# Find elements matching the first part
|
|
417
|
+
first_components = self._parse_selector_components(parts[0])
|
|
418
|
+
first_matches = self._find_all_matching_in_tree(self, first_components)
|
|
419
|
+
|
|
420
|
+
# For each match, find direct children matching the next part
|
|
421
|
+
if len(parts) == 2:
|
|
422
|
+
# Last part, just check direct children
|
|
423
|
+
next_components = self._parse_selector_components(parts[1])
|
|
424
|
+
results = []
|
|
425
|
+
for match in first_matches:
|
|
426
|
+
for child in match.contents:
|
|
427
|
+
if isinstance(child, Tag) and self._match_selector_components(child, next_components):
|
|
428
|
+
results.append(child)
|
|
429
|
+
return results
|
|
430
|
+
else:
|
|
431
|
+
# More parts, need to continue recursively
|
|
432
|
+
results = []
|
|
433
|
+
next_components = self._parse_selector_components(parts[1])
|
|
434
|
+
remaining_parts = parts[2:]
|
|
435
|
+
for match in first_matches:
|
|
436
|
+
for child in match.contents:
|
|
437
|
+
if isinstance(child, Tag) and self._match_selector_components(child, next_components):
|
|
438
|
+
# Continue with remaining parts
|
|
439
|
+
remaining_selector = ' > '.join(remaining_parts)
|
|
440
|
+
descendants = child.select(remaining_selector)
|
|
441
|
+
results.extend(descendants)
|
|
442
|
+
return results
|
|
443
|
+
|
|
444
|
+
def _parse_selector_components(self, simple_sel: str) -> dict:
|
|
445
|
+
"""Parse a simple selector like 'p.class#id[attr=value]' into components."""
|
|
446
|
+
components = {
|
|
447
|
+
'tag': None,
|
|
448
|
+
'id': None,
|
|
449
|
+
'classes': [],
|
|
450
|
+
'attrs': {}
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
# Extract tag name (at the start)
|
|
454
|
+
tag_match = re.match(r'^([a-zA-Z][\w-]*)', simple_sel)
|
|
455
|
+
if tag_match:
|
|
456
|
+
components['tag'] = tag_match.group(1)
|
|
457
|
+
simple_sel = simple_sel[len(tag_match.group(1)):]
|
|
458
|
+
|
|
459
|
+
# Extract ID
|
|
460
|
+
id_matches = re.findall(r'#([\w-]+)', simple_sel)
|
|
461
|
+
if id_matches:
|
|
462
|
+
components['id'] = id_matches[0]
|
|
463
|
+
|
|
464
|
+
# Extract classes
|
|
465
|
+
class_matches = re.findall(r'\.([\w-]+)', simple_sel)
|
|
466
|
+
components['classes'] = class_matches
|
|
467
|
+
|
|
468
|
+
# Extract attributes
|
|
469
|
+
attr_matches = re.findall(r'\[([^\]]+)\]', simple_sel)
|
|
470
|
+
for attr_expr in attr_matches:
|
|
471
|
+
if '=' in attr_expr:
|
|
472
|
+
attr_name, attr_value = attr_expr.split('=', 1)
|
|
473
|
+
components['attrs'][attr_name.strip()] = attr_value.strip('\'"')
|
|
474
|
+
else:
|
|
475
|
+
components['attrs'][attr_expr.strip()] = None
|
|
476
|
+
|
|
477
|
+
return components
|
|
478
|
+
|
|
479
|
+
def _match_selector_components(self, tag: 'Tag', components: dict) -> bool:
|
|
480
|
+
"""Check if a tag matches the parsed selector components."""
|
|
481
|
+
# Check tag name
|
|
482
|
+
if components['tag'] and tag.name != components['tag']:
|
|
483
|
+
return False
|
|
484
|
+
|
|
485
|
+
# Check ID
|
|
486
|
+
if components['id'] and tag.get('id') != components['id']:
|
|
487
|
+
return False
|
|
488
|
+
|
|
489
|
+
# Check classes
|
|
490
|
+
tag_classes = tag.get('class', '')
|
|
491
|
+
if isinstance(tag_classes, str):
|
|
492
|
+
tag_classes = tag_classes.split()
|
|
493
|
+
elif not isinstance(tag_classes, list):
|
|
494
|
+
tag_classes = [str(tag_classes)] if tag_classes else []
|
|
495
|
+
|
|
496
|
+
for cls in components['classes']:
|
|
497
|
+
if cls not in tag_classes:
|
|
498
|
+
return False
|
|
499
|
+
|
|
500
|
+
# Check attributes
|
|
501
|
+
for attr_name, attr_value in components['attrs'].items():
|
|
502
|
+
if attr_value is None:
|
|
503
|
+
# Just check attribute exists
|
|
504
|
+
if attr_name not in tag.attrs:
|
|
505
|
+
return False
|
|
506
|
+
else:
|
|
507
|
+
# Check attribute value
|
|
508
|
+
if tag.get(attr_name) != attr_value:
|
|
509
|
+
return False
|
|
510
|
+
|
|
511
|
+
return True
|
|
512
|
+
|
|
513
|
+
def _find_all_matching_in_tree(self, element: 'Tag', components: dict) -> List['Tag']:
|
|
514
|
+
"""Recursively find all elements matching the selector components."""
|
|
515
|
+
matches = []
|
|
516
|
+
|
|
517
|
+
# Check current element
|
|
518
|
+
if self._match_selector_components(element, components):
|
|
519
|
+
matches.append(element)
|
|
520
|
+
|
|
521
|
+
# Check children recursively
|
|
522
|
+
for child in element.contents:
|
|
523
|
+
if isinstance(child, Tag):
|
|
524
|
+
matches.extend(self._find_all_matching_in_tree(child, components))
|
|
525
|
+
|
|
526
|
+
return matches
|
|
326
527
|
|
|
327
528
|
def select_one(self, selector: str) -> Optional['Tag']:
|
|
328
529
|
"""
|
|
@@ -462,6 +663,11 @@ class Tag:
|
|
|
462
663
|
new_child.parent = self
|
|
463
664
|
self.contents.append(new_child)
|
|
464
665
|
|
|
666
|
+
def extend(self, new_children: List[Union['Tag', NavigableString, str]]) -> None:
|
|
667
|
+
"""Extend the contents of this tag with a list of new children."""
|
|
668
|
+
for child in new_children:
|
|
669
|
+
self.append(child)
|
|
670
|
+
|
|
465
671
|
def insert(self, index: int, new_child: Union['Tag', NavigableString, str]) -> None:
|
|
466
672
|
"""Insert a new child at the given index with error handling."""
|
|
467
673
|
if isinstance(new_child, str):
|
webscout/search/__init__.py
CHANGED
|
@@ -4,14 +4,14 @@ from .base import BaseSearch, BaseSearchEngine
|
|
|
4
4
|
from .duckduckgo_main import DuckDuckGoSearch
|
|
5
5
|
from .yep_main import YepSearch
|
|
6
6
|
from .bing_main import BingSearch
|
|
7
|
+
from .yahoo_main import YahooSearch
|
|
7
8
|
|
|
8
9
|
# Import new search engines
|
|
9
10
|
from .engines.brave import Brave
|
|
10
11
|
from .engines.mojeek import Mojeek
|
|
11
|
-
|
|
12
|
+
|
|
12
13
|
from .engines.yandex import Yandex
|
|
13
14
|
from .engines.wikipedia import Wikipedia
|
|
14
|
-
from .engines.yahoo_news import YahooNews
|
|
15
15
|
|
|
16
16
|
# Import result models
|
|
17
17
|
from .results import (
|
|
@@ -31,14 +31,13 @@ __all__ = [
|
|
|
31
31
|
"DuckDuckGoSearch",
|
|
32
32
|
"YepSearch",
|
|
33
33
|
"BingSearch",
|
|
34
|
+
"YahooSearch",
|
|
34
35
|
|
|
35
36
|
# Individual engines
|
|
36
37
|
"Brave",
|
|
37
38
|
"Mojeek",
|
|
38
|
-
"Yahoo",
|
|
39
39
|
"Yandex",
|
|
40
40
|
"Wikipedia",
|
|
41
|
-
"YahooNews",
|
|
42
41
|
|
|
43
42
|
# Result models
|
|
44
43
|
"TextResult",
|
|
@@ -4,10 +4,10 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Dict, List
|
|
6
6
|
from urllib.parse import urlencode
|
|
7
|
-
from bs4 import BeautifulSoup
|
|
8
7
|
from time import sleep
|
|
9
8
|
|
|
10
9
|
from .base import BingBase
|
|
10
|
+
from webscout.scout import Scout
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class BingImagesSearch(BingBase):
|
|
@@ -17,6 +17,9 @@ class BingImagesSearch(BingBase):
|
|
|
17
17
|
safesearch = args[2] if len(args) > 2 else kwargs.get("safesearch", "moderate")
|
|
18
18
|
max_results = args[3] if len(args) > 3 else kwargs.get("max_results", 10)
|
|
19
19
|
|
|
20
|
+
if max_results is None:
|
|
21
|
+
max_results = 10
|
|
22
|
+
|
|
20
23
|
if not keywords:
|
|
21
24
|
raise ValueError("Keywords are mandatory")
|
|
22
25
|
|
|
@@ -59,7 +62,7 @@ class BingImagesSearch(BingBase):
|
|
|
59
62
|
except Exception as e:
|
|
60
63
|
raise Exception(f"Failed to fetch images: {str(e)}")
|
|
61
64
|
|
|
62
|
-
soup =
|
|
65
|
+
soup = Scout(html)
|
|
63
66
|
img_tags = soup.select('a.iusc img')
|
|
64
67
|
|
|
65
68
|
for img in img_tags:
|
|
@@ -4,10 +4,10 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Dict, List
|
|
6
6
|
from urllib.parse import urlencode
|
|
7
|
-
from bs4 import BeautifulSoup
|
|
8
7
|
from time import sleep
|
|
9
8
|
|
|
10
9
|
from .base import BingBase
|
|
10
|
+
from webscout.scout import Scout
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class BingNewsSearch(BingBase):
|
|
@@ -17,6 +17,9 @@ class BingNewsSearch(BingBase):
|
|
|
17
17
|
safesearch = args[2] if len(args) > 2 else kwargs.get("safesearch", "moderate")
|
|
18
18
|
max_results = args[3] if len(args) > 3 else kwargs.get("max_results", 10)
|
|
19
19
|
|
|
20
|
+
if max_results is None:
|
|
21
|
+
max_results = 10
|
|
22
|
+
|
|
20
23
|
if not keywords:
|
|
21
24
|
raise ValueError("Keywords are mandatory")
|
|
22
25
|
|
|
@@ -50,15 +53,14 @@ class BingNewsSearch(BingBase):
|
|
|
50
53
|
try:
|
|
51
54
|
response = self.session.get(full_url, timeout=self.timeout)
|
|
52
55
|
response.raise_for_status()
|
|
53
|
-
|
|
56
|
+
html = response.text
|
|
54
57
|
except Exception as e:
|
|
55
58
|
raise Exception(f"Failed to fetch news: {str(e)}")
|
|
56
59
|
|
|
57
|
-
html = data.get('html', '')
|
|
58
60
|
if not html:
|
|
59
61
|
break
|
|
60
62
|
|
|
61
|
-
soup =
|
|
63
|
+
soup = Scout(html)
|
|
62
64
|
news_items = soup.select('div.newsitem')
|
|
63
65
|
|
|
64
66
|
for item in news_items:
|
|
@@ -4,10 +4,10 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Dict, List
|
|
6
6
|
from urllib.parse import urlencode
|
|
7
|
-
from bs4 import BeautifulSoup
|
|
8
7
|
from time import sleep
|
|
9
8
|
|
|
10
9
|
from .base import BingBase
|
|
10
|
+
from webscout.scout import Scout
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class BingTextSearch(BingBase):
|
|
@@ -18,6 +18,9 @@ class BingTextSearch(BingBase):
|
|
|
18
18
|
max_results = args[3] if len(args) > 3 else kwargs.get("max_results", 10)
|
|
19
19
|
unique = kwargs.get("unique", True)
|
|
20
20
|
|
|
21
|
+
if max_results is None:
|
|
22
|
+
max_results = 10
|
|
23
|
+
|
|
21
24
|
if not keywords:
|
|
22
25
|
raise ValueError("Keywords are mandatory")
|
|
23
26
|
|
|
@@ -46,7 +49,7 @@ class BingTextSearch(BingBase):
|
|
|
46
49
|
while len(fetched_results) < max_results and urls_to_fetch:
|
|
47
50
|
current_url = urls_to_fetch.pop(0)
|
|
48
51
|
html = fetch_page(current_url)
|
|
49
|
-
soup =
|
|
52
|
+
soup = Scout(html)
|
|
50
53
|
|
|
51
54
|
links = soup.select('ol#b_results > li.b_algo')
|
|
52
55
|
for link in links:
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Yahoo search engines package.
|
|
2
|
+
|
|
3
|
+
This package provides comprehensive Yahoo search functionality including:
|
|
4
|
+
- Text search with multi-page pagination
|
|
5
|
+
- Image search with advanced filters
|
|
6
|
+
- Video search with quality and length filters
|
|
7
|
+
- News search with time filtering
|
|
8
|
+
- Search suggestions/autocomplete
|
|
9
|
+
|
|
10
|
+
All engines support:
|
|
11
|
+
- Human-like browsing through multiple pages
|
|
12
|
+
- Rich metadata extraction
|
|
13
|
+
- Filter support
|
|
14
|
+
- Clean result formatting
|
|
15
|
+
|
|
16
|
+
Example:
|
|
17
|
+
>>> from webscout.search.engines.yahoo import YahooText
|
|
18
|
+
>>>
|
|
19
|
+
>>> # Search with automatic pagination
|
|
20
|
+
>>> searcher = YahooText()
|
|
21
|
+
>>> results = searcher.search("python programming", max_results=50)
|
|
22
|
+
>>>
|
|
23
|
+
>>> for result in results:
|
|
24
|
+
... print(f"{result.title}: {result.url}")
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from .base import YahooSearchEngine
|
|
28
|
+
from .images import YahooImages
|
|
29
|
+
from .news import YahooNews
|
|
30
|
+
from .suggestions import YahooSuggestions
|
|
31
|
+
from .text import YahooText
|
|
32
|
+
from .videos import YahooVideos
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"YahooSearchEngine",
|
|
36
|
+
"YahooText",
|
|
37
|
+
"YahooImages",
|
|
38
|
+
"YahooVideos",
|
|
39
|
+
"YahooNews",
|
|
40
|
+
"YahooSuggestions",
|
|
41
|
+
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Yahoo answers search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import YahooSearchEngine
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class YahooAnswers(YahooSearchEngine):
|
|
9
|
+
"""Yahoo instant answers."""
|
|
10
|
+
|
|
11
|
+
def run(self, *args, **kwargs) -> list[dict[str, str]]:
|
|
12
|
+
"""Get instant answers from Yahoo.
|
|
13
|
+
|
|
14
|
+
Not supported.
|
|
15
|
+
"""
|
|
16
|
+
raise NotImplementedError("Yahoo does not support instant answers")
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Base class for Yahoo search engines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from secrets import token_urlsafe
|
|
6
|
+
from typing import Any, Generic, TypeVar
|
|
7
|
+
|
|
8
|
+
from ...base import BaseSearchEngine
|
|
9
|
+
|
|
10
|
+
T = TypeVar("T")
|
|
11
|
+
|
|
12
|
+
class YahooSearchEngine(BaseSearchEngine[T], Generic[T]):
|
|
13
|
+
"""Base class for Yahoo search engines.
|
|
14
|
+
|
|
15
|
+
Yahoo search is powered by Bing but has its own interface.
|
|
16
|
+
All Yahoo searches use dynamic URLs with tokens for tracking.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
provider = "yahoo"
|
|
20
|
+
_base_url = "https://search.yahoo.com"
|
|
21
|
+
|
|
22
|
+
def generate_ylt_token(self) -> str:
|
|
23
|
+
"""Generate Yahoo _ylt tracking token."""
|
|
24
|
+
return token_urlsafe(24 * 3 // 4)
|
|
25
|
+
|
|
26
|
+
def generate_ylu_token(self) -> str:
|
|
27
|
+
"""Generate Yahoo _ylu tracking token."""
|
|
28
|
+
return token_urlsafe(47 * 3 // 4)
|
|
29
|
+
|
|
30
|
+
def build_search_url(self, base_path: str) -> str:
|
|
31
|
+
"""Build search URL with tracking tokens."""
|
|
32
|
+
ylt = self.generate_ylt_token()
|
|
33
|
+
ylu = self.generate_ylu_token()
|
|
34
|
+
return f"{self._base_url}/{base_path};_ylt={ylt};_ylu={ylu}"
|