webquest 0.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webquest/__init__.py +0 -0
- webquest/browsers/__init__.py +4 -0
- webquest/browsers/browser.py +23 -0
- webquest/browsers/hyperbrowser.py +52 -0
- webquest/py.typed +0 -0
- webquest/scrapers/__init__.py +47 -0
- webquest/scrapers/any_article/__init__.py +11 -0
- webquest/scrapers/any_article/schemas.py +13 -0
- webquest/scrapers/any_article/scraper.py +43 -0
- webquest/scrapers/duckduckgo_search/__init__.py +11 -0
- webquest/scrapers/duckduckgo_search/schemas.py +16 -0
- webquest/scrapers/duckduckgo_search/scraper.py +84 -0
- webquest/scrapers/google_news_search/__init__.py +11 -0
- webquest/scrapers/google_news_search/schemas.py +16 -0
- webquest/scrapers/google_news_search/scraper.py +78 -0
- webquest/scrapers/openai_parser.py +58 -0
- webquest/scrapers/scraper.py +116 -0
- webquest/scrapers/youtube_search/__init__.py +11 -0
- webquest/scrapers/youtube_search/schemas.py +51 -0
- webquest/scrapers/youtube_search/scraper.py +303 -0
- webquest/scrapers/youtube_transcript/__init__.py +11 -0
- webquest/scrapers/youtube_transcript/schemas.py +9 -0
- webquest/scrapers/youtube_transcript/scraper.py +84 -0
- webquest-0.7.3.dist-info/METADATA +97 -0
- webquest-0.7.3.dist-info/RECORD +26 -0
- webquest-0.7.3.dist-info/WHEEL +4 -0
webquest/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import AsyncContextManager
|
|
3
|
+
|
|
4
|
+
from playwright.async_api import BrowserContext
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Browser(ABC):
    """
    Interface implemented by every browser backend.

    A backend exposes a single operation: handing out a Playwright
    ``BrowserContext`` wrapped in an asynchronous context manager, which is
    then used for performing web scraping operations.
    """

    @abstractmethod
    def get_context(self) -> AsyncContextManager[BrowserContext]:
        """
        Provide a Playwright BrowserContext through an async context manager.

        Returns:
            AsyncContextManager[BrowserContext]: Entering the manager yields
                the BrowserContext to work with.
        """
        ...
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from contextlib import asynccontextmanager
|
|
2
|
+
from typing import AsyncIterator, override
|
|
3
|
+
|
|
4
|
+
from hyperbrowser import AsyncHyperbrowser
|
|
5
|
+
from playwright.async_api import BrowserContext, async_playwright
|
|
6
|
+
|
|
7
|
+
from webquest.browsers.browser import Browser
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Hyperbrowser(Browser):
    """
    A Browser implementation that uses Hyperbrowser for remote browser sessions.

    This class manages the creation and cleanup of Hyperbrowser sessions and provides
    a Playwright BrowserContext connected to the remote session.
    """

    def __init__(
        self,
        client: AsyncHyperbrowser | None = None,
    ):
        """
        Initialize the Hyperbrowser instance.

        Args:
            client (AsyncHyperbrowser | None): An optional AsyncHyperbrowser client.
                If not provided, a new client will be created.
        """
        if client is None:
            client = AsyncHyperbrowser()
        self._client = client

    @override
    @asynccontextmanager
    async def get_context(self) -> AsyncIterator[BrowserContext]:
        """
        Get a browser context from a new Hyperbrowser session.

        This method creates a new session, connects to it using Playwright, yields
        the first context of the connected browser, and ensures the session is
        stopped afterwards.

        Yields:
            BrowserContext: The Playwright browser context connected to the Hyperbrowser session.

        Raises:
            IndexError: If the connected browser exposes no context.
        """
        session = await self._client.sessions.create()
        # The try block starts right after the session exists so that a failure
        # while starting Playwright, connecting over CDP, or picking the context
        # still stops the remote session instead of leaking it.
        try:
            async with async_playwright() as p:
                browser = await p.chromium.connect_over_cdp(session.ws_endpoint)
                yield browser.contexts[0]
        finally:
            await self._client.sessions.stop(session.id)
|
webquest/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from webquest.scrapers.any_article import (
|
|
2
|
+
AnyArticle,
|
|
3
|
+
AnyArticleRequest,
|
|
4
|
+
AnyArticleResponse,
|
|
5
|
+
)
|
|
6
|
+
from webquest.scrapers.duckduckgo_search import (
|
|
7
|
+
DuckDuckGoSearch,
|
|
8
|
+
DuckDuckGoSearchRequest,
|
|
9
|
+
DuckDuckGoSearchResponse,
|
|
10
|
+
)
|
|
11
|
+
from webquest.scrapers.google_news_search import (
|
|
12
|
+
GoogleNewsSearch,
|
|
13
|
+
GoogleNewsSearchRequest,
|
|
14
|
+
GoogleNewsSearchResponse,
|
|
15
|
+
)
|
|
16
|
+
from webquest.scrapers.openai_parser import OpenAIParser
|
|
17
|
+
from webquest.scrapers.scraper import Scraper
|
|
18
|
+
from webquest.scrapers.youtube_search import (
|
|
19
|
+
YouTubeSearch,
|
|
20
|
+
YouTubeSearchRequest,
|
|
21
|
+
YouTubeSearchResponse,
|
|
22
|
+
)
|
|
23
|
+
from webquest.scrapers.youtube_transcript import (
|
|
24
|
+
YouTubeTranscript,
|
|
25
|
+
YouTubeTranscriptRequest,
|
|
26
|
+
YouTubeTranscriptResponse,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# Public API of the scrapers package: each scraper together with its
# request/response models, plus the shared base classes.
__all__ = [
    "AnyArticle",
    "AnyArticleRequest",
    "AnyArticleResponse",
    "DuckDuckGoSearch",
    "DuckDuckGoSearchRequest",
    "DuckDuckGoSearchResponse",
    "GoogleNewsSearch",
    "GoogleNewsSearchRequest",
    "GoogleNewsSearchResponse",
    "OpenAIParser",
    "Scraper",
    "YouTubeSearch",
    "YouTubeSearchRequest",
    "YouTubeSearchResponse",
    "YouTubeTranscript",
    "YouTubeTranscriptRequest",
    "YouTubeTranscriptResponse",
]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import override
|
|
2
|
+
|
|
3
|
+
from openai import AsyncOpenAI
|
|
4
|
+
from playwright.async_api import BrowserContext
|
|
5
|
+
|
|
6
|
+
from webquest.browsers.browser import Browser
|
|
7
|
+
from webquest.scrapers.any_article.schemas import AnyArticleRequest, AnyArticleResponse
|
|
8
|
+
from webquest.scrapers.openai_parser import OpenAIParser
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AnyArticle(OpenAIParser[AnyArticleRequest, AnyArticleResponse]):
    """Scraper to extract the main article from any web page using OpenAI."""

    request = AnyArticleRequest
    response = AnyArticleResponse

    def __init__(
        self,
        browser: Browser,
        client: AsyncOpenAI | None = None,
        model: str = "gpt-5-mini",
        character_limit: int = 4000,
    ) -> None:
        """
        Initialize the article scraper.

        Args:
            browser (Browser): The browser instance to use for scraping.
            client (AsyncOpenAI | None): Optional OpenAI client, forwarded to
                the parent parser.
            model (str): Name of the OpenAI model, forwarded to the parent parser.
            character_limit (int): Character limit forwarded to the parent parser.
        """
        super().__init__(
            browser=browser,
            response_type=AnyArticleResponse,
            client=client,
            model=model,
            input="Parse the following web page and extract the main article:\n\n",
            character_limit=character_limit,
        )

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: AnyArticleRequest,
    ) -> str:
        """
        Load ``request.url`` in a new page and return its HTML.

        Args:
            context (BrowserContext): The browser context to use.
            request (AnyArticleRequest): Request carrying the URL to open.

        Returns:
            str: The page HTML as reported by Playwright.
        """
        page = await context.new_page()
        try:
            await page.goto(request.url, wait_until="domcontentloaded")
            # Fixed 3 second pause after DOMContentLoaded before the snapshot.
            await page.wait_for_timeout(3000)
            return await page.content()
        finally:
            # Close the page even on failure so pages do not pile up in the
            # shared context while other requests are still running.
            await page.close()
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DuckDuckGoSearchRequest(BaseModel):
    # Input model for a DuckDuckGo search.
    # query: the search text; it is URL-encoded by the scraper before use.
    query: str
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Page(BaseModel):
    # One organic result extracted from a DuckDuckGo results page.
    # site: text of the result's site label.
    site: str
    # url: href of the result's title link.
    url: str
    # title: text of the result's title.
    title: str
    # description: text of the result's snippet.
    description: str
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DuckDuckGoSearchResponse(BaseModel):
    # Output model for a DuckDuckGo search.
    # pages: the parsed results, in the order they appear in the HTML.
    pages: list[Page]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import override
|
|
3
|
+
from urllib.parse import quote_plus
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from playwright.async_api import BrowserContext
|
|
7
|
+
|
|
8
|
+
from webquest.scrapers.duckduckgo_search.schemas import (
|
|
9
|
+
DuckDuckGoSearchRequest,
|
|
10
|
+
DuckDuckGoSearchResponse,
|
|
11
|
+
Page,
|
|
12
|
+
)
|
|
13
|
+
from webquest.scrapers.scraper import Scraper
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DuckDuckGoSearch(Scraper[DuckDuckGoSearchRequest, str, DuckDuckGoSearchResponse]):
    """Scraper to perform a DuckDuckGo web search and parse the results."""

    request = DuckDuckGoSearchRequest
    response = DuckDuckGoSearchResponse

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: DuckDuckGoSearchRequest,
    ) -> str:
        """
        Open the DuckDuckGo results page for the query and return its HTML.

        The "more results" button is clicked once and the page is scrolled to
        the bottom before the HTML snapshot is taken.

        Args:
            context (BrowserContext): The browser context to use.
            request (DuckDuckGoSearchRequest): Request carrying the search query.

        Returns:
            str: The HTML of the results page.
        """
        url = f"https://duckduckgo.com/?origin=funnel_home_website&t=h_&q={quote_plus(request.query)}&ia=web"
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(1)

            await page.wait_for_selector("button#more-results", timeout=15000)
            await page.click("button#more-results")

            await page.wait_for_selector("li[data-layout='organic']", timeout=15000)

            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # Fixed pause after scrolling before the HTML is captured.
            await asyncio.sleep(4)

            return await page.content()
        finally:
            # Close the page even on failure so pages do not pile up in the
            # shared context while other requests are still running.
            await page.close()

    @override
    async def parse(self, raw: str) -> DuckDuckGoSearchResponse:
        """
        Extract the search results from the results page HTML.

        A result is skipped when any of its site, link, title or description
        elements cannot be found.

        Args:
            raw (str): HTML returned by ``fetch``.

        Returns:
            DuckDuckGoSearchResponse: The results that could be fully parsed.
        """
        soup = BeautifulSoup(raw, "html.parser")
        pages: list[Page] = []

        # NOTE(review): the class names below look like generated identifiers
        # and may change whenever the site is redeployed -- confirm they are
        # still current if this parser starts returning no results.
        for article_tag in soup.find_all("article", {"data-testid": "result"}):
            site_tag = article_tag.find("p", class_="fOCEb2mA3YZTJXXjpgdS")
            if not site_tag:
                continue

            url_tag = article_tag.find("a", {"data-testid": "result-title-a"})
            if not url_tag:
                continue
            url = url_tag.get("href")
            if not isinstance(url, str):
                continue

            title_tag = article_tag.find("span", class_="EKtkFWMYpwzMKOYr0GYm")
            if not title_tag:
                continue

            description_tag = article_tag.find("span", class_="kY2IgmnCmOGjharHErah")
            if not description_tag:
                continue

            pages.append(
                Page(
                    site=site_tag.get_text(strip=True),
                    url=url,
                    title=title_tag.get_text(strip=True),
                    description=description_tag.get_text(strip=True),
                )
            )

        return DuckDuckGoSearchResponse(pages=pages)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class GoogleNewsSearchRequest(BaseModel):
    # Input model for a Google News search.
    # query: the search text; it is URL-encoded by the scraper before use.
    query: str
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Article(BaseModel):
    # One article entry extracted from a Google News results page.
    # site: text of the publisher label.
    site: str
    # url: link to the article on news.google.com.
    url: str
    # title: text of the headline link.
    title: str
    # published_at: text content of the entry's <time> element, kept as a string.
    published_at: str
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class GoogleNewsSearchResponse(BaseModel):
    # Output model for a Google News search.
    # articles: the parsed articles, in the order they appear in the HTML.
    articles: list[Article]
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import override
|
|
3
|
+
from urllib.parse import quote_plus
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from playwright.async_api import BrowserContext
|
|
7
|
+
|
|
8
|
+
from webquest.scrapers.google_news_search.schemas import (
|
|
9
|
+
Article,
|
|
10
|
+
GoogleNewsSearchRequest,
|
|
11
|
+
GoogleNewsSearchResponse,
|
|
12
|
+
)
|
|
13
|
+
from webquest.scrapers.scraper import Scraper
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class GoogleNewsSearch(Scraper[GoogleNewsSearchRequest, str, GoogleNewsSearchResponse]):
    """Scraper to perform a Google News search and parse the results."""

    request = GoogleNewsSearchRequest
    response = GoogleNewsSearchResponse

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: GoogleNewsSearchRequest,
    ) -> str:
        """
        Open the Google News search page for the query and return its HTML.

        Args:
            context (BrowserContext): The browser context to use.
            request (GoogleNewsSearchRequest): Request carrying the search query.

        Returns:
            str: The HTML of the results page.
        """
        url = f"https://news.google.com/search?q={quote_plus(request.query)}"
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(1)
            return await page.content()
        finally:
            # Close the page even on failure so pages do not pile up in the
            # shared context while other requests are still running.
            await page.close()

    @override
    async def parse(self, raw: str) -> GoogleNewsSearchResponse:
        """
        Extract the articles from the results page HTML.

        An entry is skipped when its headline link, publisher label or
        ``<time>`` element cannot be found.

        Args:
            raw (str): HTML returned by ``fetch``.

        Returns:
            GoogleNewsSearchResponse: The articles that could be fully parsed.
        """
        # Local import: only this method needs urljoin.
        from urllib.parse import urljoin

        soup = BeautifulSoup(raw, "html.parser")
        articles: list[Article] = []

        for article_tag in soup.find_all("c-wiz"):
            # The headline anchor carries both the title text and the link.
            link_tag = article_tag.find("a", class_="JtKRv")
            if not link_tag:
                continue
            title = link_tag.get_text().strip()

            href = link_tag.get("href")
            if not isinstance(href, str):
                continue
            # Resolve the href against the site root. "./read/x" still becomes
            # "https://news.google.com/read/x", and absolute or "/"-rooted
            # hrefs are no longer mangled by dropping their first character.
            url = urljoin("https://news.google.com/", href)

            site_tag = article_tag.find("div", class_="vr1PYe")
            if not site_tag:
                continue
            site = site_tag.get_text().strip()

            published_at_tag = article_tag.find("time")
            if not published_at_tag:
                continue
            published_at = published_at_tag.get_text().strip()

            articles.append(
                Article(
                    site=site,
                    url=url,
                    title=title,
                    published_at=published_at,
                )
            )

        return GoogleNewsSearchResponse(articles=articles)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from typing import Generic, Type, TypeVar, override
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from openai import AsyncOpenAI
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from webquest.browsers.browser import Browser
|
|
9
|
+
from webquest.scrapers.scraper import Scraper
|
|
10
|
+
|
|
11
|
+
TRequest = TypeVar("TRequest", bound=BaseModel)
|
|
12
|
+
TResponse = TypeVar("TResponse", bound=BaseModel)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class OpenAIParser(
    Generic[TRequest, TResponse],
    Scraper[TRequest, str, TResponse],
    ABC,
):
    """Base class for scrapers whose parsing step is delegated to an OpenAI model."""

    def __init__(
        self,
        browser: Browser,
        response_type: Type[TResponse],
        client: AsyncOpenAI | None = None,
        model: str = "gpt-5-mini",
        input: str = "Parse the following web content:\n",
        character_limit: int = 20000,
    ) -> None:
        """
        Store the parsing configuration and initialize the base scraper.

        Args:
            browser (Browser): The browser instance to use for scraping.
            response_type (Type[TResponse]): Model the OpenAI output is parsed into.
            client (AsyncOpenAI | None): Optional client; a new ``AsyncOpenAI()``
                is created when omitted.
            model (str): Name of the OpenAI model to call.
            input (str): Prompt text placed in front of the page text.
            character_limit (int): Maximum number of characters of page text
                sent to the model.
        """
        self._response_type = response_type
        self._client = client if client is not None else AsyncOpenAI()
        self._model = model
        self._character_limit = character_limit
        self._input = input
        super().__init__(browser=browser)

    @override
    async def parse(self, raw: str) -> TResponse:
        """
        Turn raw HTML into a structured response via the OpenAI client.

        Args:
            raw (str): HTML returned by ``fetch``.

        Returns:
            TResponse: The parsed output of the model.

        Raises:
            ValueError: If the model response contains no parsed output.
        """
        page_text = BeautifulSoup(raw, "html.parser").get_text(
            separator="\n", strip=True
        )

        # When the text is too long, keep a window of `character_limit`
        # characters centered in the middle and drop both ends.
        limit = self._character_limit
        overflow = len(page_text) - limit
        if overflow > 0:
            window_start = overflow // 2
            page_text = page_text[window_start : window_start + limit]

        result = await self._client.responses.parse(
            input=f"{self._input}{page_text}",
            text_format=self._response_type,
            model=self._model,
            reasoning={"effort": "minimal"},
        )
        parsed = result.output_parsed
        if parsed is None:
            raise ValueError("Failed to parse the response into the desired format.")
        return parsed
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import ClassVar, Generic, TypeVar, overload
|
|
4
|
+
|
|
5
|
+
from playwright.async_api import BrowserContext
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from webquest.browsers.browser import Browser
|
|
9
|
+
|
|
10
|
+
TRequest = TypeVar("TRequest", bound=BaseModel)
|
|
11
|
+
TRaw = TypeVar("TRaw")
|
|
12
|
+
TResponse = TypeVar("TResponse", bound=BaseModel)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Scraper(ABC, Generic[TRequest, TRaw, TResponse]):
    """
    Abstract base class for web scrapers.

    This class defines the structure for a scraper, including fetching raw data
    and parsing it into a structured response. It handles the execution flow
    using a provided Browser instance.

    Type Parameters:
        TRequest: The type of the request object.
        TRaw: The type of the raw data fetched from the browser.
        TResponse: The type of the parsed response object.
    """

    request: ClassVar[type[TRequest]]
    response: ClassVar[type[TResponse]]

    def __init__(self, browser: Browser) -> None:
        """
        Initialize the Scraper.

        Args:
            browser (Browser): The browser instance to use for scraping.
        """
        self._browser = browser

    @abstractmethod
    async def fetch(self, context: BrowserContext, request: TRequest) -> TRaw:
        """
        Fetch raw data from the target website.

        Args:
            context (BrowserContext): The browser context to use.
            request (TRequest): The request object containing parameters for the fetch operation.

        Returns:
            TRaw: The raw data fetched from the website.
        """
        ...

    @abstractmethod
    async def parse(self, raw: TRaw) -> TResponse:
        """
        Parse the raw data into a structured response.

        Args:
            raw (TRaw): The raw data returned by the fetch method.

        Returns:
            TResponse: The structured response object.
        """
        ...

    @overload
    async def run(self, request: TRequest, /) -> TResponse: ...

    @overload
    async def run(self, requests: list[TRequest], /) -> list[TResponse]: ...

    @overload
    async def run(self, *requests: TRequest) -> list[TResponse]: ...

    async def run(
        self,
        *requests: TRequest | list[TRequest],
    ) -> list[TResponse] | TResponse:
        """
        Run the scraper for one or more requests.

        This method handles the browser context creation, concurrent fetching,
        and parsing of results. All fetches share one browser context; parsing
        starts after every fetch has finished.

        Args:
            *requests: One or more request objects, or a single list of request objects.

        Returns:
            list[TResponse] | TResponse: A single response when exactly one request
                object was passed positionally; otherwise a list of responses in
                the order of the input requests. A list argument always produces
                a list, even when it holds a single request.

        Raises:
            TypeError: If a list is passed together with other arguments.
        """
        normalized_requests: list[TRequest]

        if len(requests) == 1 and isinstance(requests[0], list):
            # List form: the result is always a list, as the overload promises.
            normalized_requests = list(requests[0])
            return_single = False
        else:
            normalized_requests = []
            for req in requests:
                if isinstance(req, list):
                    raise TypeError("Expected request object, got list")
                normalized_requests.append(req)
            return_single = len(normalized_requests) == 1

        if not normalized_requests:
            # Nothing to fetch: do not open a browser context for no work.
            return []

        async with self._browser.get_context() as context:
            raw_items = await asyncio.gather(
                *[self.fetch(context, request) for request in normalized_requests]
            )
        responses = await asyncio.gather(*[self.parse(raw) for raw in raw_items])
        if return_single:
            return responses[0]
        return responses
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Video(BaseModel):
    # One video entry extracted from a YouTube results page.
    # id: video id taken from the "v=" part of the video link.
    id: str
    # url: watch URL built from the id.
    url: str
    # title: text of the video title.
    title: str
    # description: text of the description snippet.
    description: str
    # published_at: publication text as displayed, kept as a string.
    published_at: str
    # views: view count text as displayed, kept as a string.
    views: str
    # channel_id: channel link path without its leading character.
    channel_id: str
    # channel_url: channel URL built from channel_id.
    channel_url: str
    # channel_name: text of the channel link.
    channel_name: str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Channel(BaseModel):
    # One channel entry extracted from a YouTube results page.
    # id: channel identifier text as scraped.
    id: str
    # url: channel URL built from the id.
    url: str
    # name: text of the channel name.
    name: str
    # description: channel description, or None when the scraped text is empty.
    description: str | None
    # subscribers: subscriber text as scraped, kept as a string.
    subscribers: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Post(BaseModel):
    # One community post entry extracted from a YouTube results page.
    # id: post id taken from the part of the link after "/post/".
    id: str
    # url: post URL built from the id.
    url: str
    # content: text content of the post.
    content: str
    # published_at: publication text as displayed, kept as a string.
    published_at: str
    # channel_id: author link path without its leading character.
    channel_id: str
    # channel_url: channel URL built from channel_id.
    channel_url: str
    # channel_name: text of the author element.
    channel_name: str
    # comments: comment count text as scraped, kept as a string.
    comments: str
    # likes: like count text as scraped, kept as a string.
    likes: str
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Short(BaseModel):
    # One Shorts entry extracted from a YouTube results page.
    # id: id taken from the part of the link after "shorts/".
    id: str
    # url: Shorts URL built from the id.
    url: str
    # title: text of the Short's title.
    title: str
    # views: view count text as displayed, kept as a string.
    views: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class YouTubeSearchRequest(BaseModel):
    # Input model for a YouTube search.
    # query: the search text; it is URL-encoded by the scraper before use.
    query: str
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class YouTubeSearchResponse(BaseModel):
    # Output model for a YouTube search, grouped by result type.
    # videos: parsed video entries.
    videos: list[Video]
    # channels: parsed channel entries.
    channels: list[Channel]
    # posts: parsed community post entries.
    posts: list[Post]
    # shorts: parsed Shorts entries.
    shorts: list[Short]
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
from typing import override
|
|
2
|
+
from urllib.parse import quote_plus
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from playwright.async_api import BrowserContext
|
|
6
|
+
|
|
7
|
+
from webquest.scrapers.scraper import Scraper
|
|
8
|
+
from webquest.scrapers.youtube_search.schemas import (
|
|
9
|
+
Channel,
|
|
10
|
+
Post,
|
|
11
|
+
Short,
|
|
12
|
+
Video,
|
|
13
|
+
YouTubeSearchRequest,
|
|
14
|
+
YouTubeSearchResponse,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class YouTubeSearch(Scraper[YouTubeSearchRequest, str, YouTubeSearchResponse]):
|
|
19
|
+
"""Scraper to perform a YouTube search and parse the results."""
|
|
20
|
+
|
|
21
|
+
request = YouTubeSearchRequest
|
|
22
|
+
response = YouTubeSearchResponse
|
|
23
|
+
|
|
24
|
+
def _parse_videos(self, soup: BeautifulSoup) -> list[Video]:
|
|
25
|
+
videos: list[Video] = []
|
|
26
|
+
video_tags = soup.find_all("ytd-video-renderer")
|
|
27
|
+
|
|
28
|
+
for video_tag in video_tags:
|
|
29
|
+
title_tag = video_tag.find(
|
|
30
|
+
"h3",
|
|
31
|
+
class_="title-and-badge style-scope ytd-video-renderer",
|
|
32
|
+
)
|
|
33
|
+
if not title_tag:
|
|
34
|
+
continue
|
|
35
|
+
title = title_tag.get_text(strip=True)
|
|
36
|
+
|
|
37
|
+
views_tag, published_at_tag = video_tag.find_all(
|
|
38
|
+
"span",
|
|
39
|
+
class_="inline-metadata-item style-scope ytd-video-meta-block",
|
|
40
|
+
)
|
|
41
|
+
views = views_tag.get_text(strip=True)
|
|
42
|
+
published_at = published_at_tag.get_text(strip=True)
|
|
43
|
+
|
|
44
|
+
description_tag = video_tag.find(
|
|
45
|
+
"yt-formatted-string",
|
|
46
|
+
class_="metadata-snippet-text style-scope ytd-video-renderer",
|
|
47
|
+
)
|
|
48
|
+
if not description_tag:
|
|
49
|
+
continue
|
|
50
|
+
description = description_tag.get_text(strip=True)
|
|
51
|
+
|
|
52
|
+
channel_name_tag = video_tag.find(
|
|
53
|
+
"a",
|
|
54
|
+
class_="yt-simple-endpoint style-scope yt-formatted-string",
|
|
55
|
+
)
|
|
56
|
+
if not channel_name_tag:
|
|
57
|
+
continue
|
|
58
|
+
channel_name = channel_name_tag.get_text(strip=True)
|
|
59
|
+
|
|
60
|
+
channel_id_tag = video_tag.find(
|
|
61
|
+
"a",
|
|
62
|
+
class_="yt-simple-endpoint style-scope yt-formatted-string",
|
|
63
|
+
)
|
|
64
|
+
if not channel_id_tag:
|
|
65
|
+
continue
|
|
66
|
+
channel_id = channel_id_tag.get("href")
|
|
67
|
+
if not isinstance(channel_id, str):
|
|
68
|
+
continue
|
|
69
|
+
channel_id = channel_id[1:]
|
|
70
|
+
|
|
71
|
+
channel_url = f"https://www.youtube.com/{channel_id}"
|
|
72
|
+
|
|
73
|
+
video_id_tag = video_tag.find(
|
|
74
|
+
"a",
|
|
75
|
+
class_="yt-simple-endpoint style-scope ytd-video-renderer",
|
|
76
|
+
)
|
|
77
|
+
if not video_id_tag:
|
|
78
|
+
continue
|
|
79
|
+
video_id = video_id_tag.get("href")
|
|
80
|
+
if not isinstance(video_id, str):
|
|
81
|
+
continue
|
|
82
|
+
video_id = video_id.split("v=")[-1].split("&")[0]
|
|
83
|
+
|
|
84
|
+
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
|
85
|
+
|
|
86
|
+
video = Video(
|
|
87
|
+
id=video_id,
|
|
88
|
+
url=video_url,
|
|
89
|
+
title=title,
|
|
90
|
+
description=description,
|
|
91
|
+
published_at=published_at,
|
|
92
|
+
views=views,
|
|
93
|
+
channel_id=channel_id,
|
|
94
|
+
channel_url=channel_url,
|
|
95
|
+
channel_name=channel_name,
|
|
96
|
+
)
|
|
97
|
+
videos.append(video)
|
|
98
|
+
|
|
99
|
+
videos = [video for video in videos if len(video.id) == 11]
|
|
100
|
+
|
|
101
|
+
unique_videos = {video.id: video for video in videos}
|
|
102
|
+
videos = list(unique_videos.values())
|
|
103
|
+
|
|
104
|
+
return videos
|
|
105
|
+
|
|
106
|
+
def _parse_channels(self, soup: BeautifulSoup) -> list[Channel]:
|
|
107
|
+
channels: list[Channel] = []
|
|
108
|
+
channel_tags = soup.find_all("ytd-channel-renderer")
|
|
109
|
+
for channel_tag in channel_tags:
|
|
110
|
+
channel_name_tag = channel_tag.find(
|
|
111
|
+
"yt-formatted-string",
|
|
112
|
+
class_="style-scope ytd-channel-name",
|
|
113
|
+
)
|
|
114
|
+
if not channel_name_tag:
|
|
115
|
+
continue
|
|
116
|
+
channel_name = channel_name_tag.get_text(strip=True)
|
|
117
|
+
|
|
118
|
+
description_tag = channel_tag.find("yt-formatted-string", id="description")
|
|
119
|
+
if not description_tag:
|
|
120
|
+
continue
|
|
121
|
+
description: str | None = description_tag.get_text(strip=True)
|
|
122
|
+
if description == "":
|
|
123
|
+
description = None
|
|
124
|
+
|
|
125
|
+
channel_id_tag = channel_tag.find("yt-formatted-string", id="subscribers")
|
|
126
|
+
if not channel_id_tag:
|
|
127
|
+
continue
|
|
128
|
+
channel_id = channel_id_tag.get_text(strip=True)
|
|
129
|
+
|
|
130
|
+
channel_url = f"https://www.youtube.com/{channel_id}"
|
|
131
|
+
|
|
132
|
+
subscribers_tag = channel_tag.find("span", id="video-count")
|
|
133
|
+
if not subscribers_tag:
|
|
134
|
+
continue
|
|
135
|
+
subscribers = subscribers_tag.get_text(strip=True)
|
|
136
|
+
|
|
137
|
+
channel = Channel(
|
|
138
|
+
id=channel_id,
|
|
139
|
+
url=channel_url,
|
|
140
|
+
name=channel_name,
|
|
141
|
+
description=description,
|
|
142
|
+
subscribers=subscribers,
|
|
143
|
+
)
|
|
144
|
+
channels.append(channel)
|
|
145
|
+
return channels
|
|
146
|
+
|
|
147
|
+
def _parse_posts(self, soup: BeautifulSoup) -> list[Post]:
|
|
148
|
+
posts: list[Post] = []
|
|
149
|
+
post_tags = soup.find_all("ytd-post-renderer")
|
|
150
|
+
for post_tag in post_tags:
|
|
151
|
+
content_tag = post_tag.find(
|
|
152
|
+
"div",
|
|
153
|
+
id="content",
|
|
154
|
+
)
|
|
155
|
+
if not content_tag:
|
|
156
|
+
continue
|
|
157
|
+
content = content_tag.get_text(strip=True)
|
|
158
|
+
|
|
159
|
+
channel_name_tag = post_tag.find(
|
|
160
|
+
"div",
|
|
161
|
+
id="author",
|
|
162
|
+
)
|
|
163
|
+
if not channel_name_tag:
|
|
164
|
+
continue
|
|
165
|
+
channel_name = channel_name_tag.get_text(strip=True)
|
|
166
|
+
|
|
167
|
+
published_at_tag = post_tag.find(
|
|
168
|
+
"yt-formatted-string",
|
|
169
|
+
id="published-time-text",
|
|
170
|
+
)
|
|
171
|
+
if not published_at_tag:
|
|
172
|
+
continue
|
|
173
|
+
published_at = published_at_tag.get_text(strip=True)
|
|
174
|
+
|
|
175
|
+
channel_id_tag = post_tag.find(
|
|
176
|
+
"a",
|
|
177
|
+
id="author-text",
|
|
178
|
+
)
|
|
179
|
+
if not channel_id_tag:
|
|
180
|
+
continue
|
|
181
|
+
channel_id = channel_id_tag.get("href")
|
|
182
|
+
if not isinstance(channel_id, str):
|
|
183
|
+
continue
|
|
184
|
+
channel_id = channel_id[1:]
|
|
185
|
+
|
|
186
|
+
channel_url = f"https://www.youtube.com/{channel_id}"
|
|
187
|
+
|
|
188
|
+
post_id_tag = post_tag.find(
|
|
189
|
+
"a",
|
|
190
|
+
class_="yt-simple-endpoint style-scope yt-formatted-string",
|
|
191
|
+
)
|
|
192
|
+
if not post_id_tag:
|
|
193
|
+
continue
|
|
194
|
+
post_id = post_id_tag.get("href")
|
|
195
|
+
if not isinstance(post_id, str):
|
|
196
|
+
continue
|
|
197
|
+
post_id = post_id.split("/post/")[-1]
|
|
198
|
+
|
|
199
|
+
post_url = f"https://www.youtube.com/post/{post_id}"
|
|
200
|
+
|
|
201
|
+
likes_tag = post_tag.find(
|
|
202
|
+
"span",
|
|
203
|
+
id="vote-count-middle",
|
|
204
|
+
)
|
|
205
|
+
if not likes_tag:
|
|
206
|
+
continue
|
|
207
|
+
likes = likes_tag.get_text(strip=True)
|
|
208
|
+
|
|
209
|
+
comments_tag = post_tag.find(
|
|
210
|
+
"div",
|
|
211
|
+
class_="yt-spec-button-shape-next__button-text-content",
|
|
212
|
+
)
|
|
213
|
+
if not comments_tag:
|
|
214
|
+
continue
|
|
215
|
+
comments = comments_tag.get_text(strip=True)
|
|
216
|
+
|
|
217
|
+
post = Post(
|
|
218
|
+
id=post_id,
|
|
219
|
+
url=post_url,
|
|
220
|
+
content=content,
|
|
221
|
+
published_at=published_at,
|
|
222
|
+
channel_id=channel_id,
|
|
223
|
+
channel_url=channel_url,
|
|
224
|
+
channel_name=channel_name,
|
|
225
|
+
comments=comments,
|
|
226
|
+
likes=likes,
|
|
227
|
+
)
|
|
228
|
+
posts.append(post)
|
|
229
|
+
|
|
230
|
+
return posts
|
|
231
|
+
|
|
232
|
+
def _parse_shorts(self, soup: BeautifulSoup) -> list[Short]:
|
|
233
|
+
shorts: list[Short] = []
|
|
234
|
+
short_tags = soup.find_all("ytm-shorts-lockup-view-model-v2")
|
|
235
|
+
for short_tag in short_tags:
|
|
236
|
+
title_tag = short_tag.find(
|
|
237
|
+
"h3",
|
|
238
|
+
role="presentation",
|
|
239
|
+
)
|
|
240
|
+
if not title_tag:
|
|
241
|
+
continue
|
|
242
|
+
title = title_tag.get_text(strip=True)
|
|
243
|
+
|
|
244
|
+
views_tag = short_tag.find(
|
|
245
|
+
"div",
|
|
246
|
+
class_="shortsLockupViewModelHostOutsideMetadataSubhead shortsLockupViewModelHostMetadataSubhead",
|
|
247
|
+
)
|
|
248
|
+
if not views_tag:
|
|
249
|
+
continue
|
|
250
|
+
views = views_tag.get_text(strip=True)
|
|
251
|
+
|
|
252
|
+
short_id_tag = short_tag.find(
|
|
253
|
+
"a",
|
|
254
|
+
class_="shortsLockupViewModelHostEndpoint shortsLockupViewModelHostOutsideMetadataEndpoint",
|
|
255
|
+
)
|
|
256
|
+
if not short_id_tag:
|
|
257
|
+
continue
|
|
258
|
+
short_id = short_id_tag.get("href")
|
|
259
|
+
if not isinstance(short_id, str):
|
|
260
|
+
continue
|
|
261
|
+
short_id = short_id.split("shorts/")[-1]
|
|
262
|
+
|
|
263
|
+
short_url = f"https://www.youtube.com/shorts/{short_id}"
|
|
264
|
+
|
|
265
|
+
short = Short(
|
|
266
|
+
id=short_id,
|
|
267
|
+
url=short_url,
|
|
268
|
+
title=title,
|
|
269
|
+
views=views,
|
|
270
|
+
)
|
|
271
|
+
shorts.append(short)
|
|
272
|
+
return shorts
|
|
273
|
+
|
|
274
|
+
def _parse_search_results(self, soup: BeautifulSoup) -> YouTubeSearchResponse:
|
|
275
|
+
videos = self._parse_videos(soup)
|
|
276
|
+
channels = self._parse_channels(soup)
|
|
277
|
+
posts = self._parse_posts(soup)
|
|
278
|
+
shorts = self._parse_shorts(soup)
|
|
279
|
+
return YouTubeSearchResponse(
|
|
280
|
+
videos=videos,
|
|
281
|
+
channels=channels,
|
|
282
|
+
posts=posts,
|
|
283
|
+
shorts=shorts,
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
@override
async def parse(self, raw: str) -> YouTubeSearchResponse:
    """Turn the raw HTML of a results page into a structured response.

    Args:
        raw: HTML text of the search results page.

    Returns:
        The search results extracted from ``raw``.
    """
    document = BeautifulSoup(raw, "html.parser")
    return self._parse_search_results(document)
|
|
291
|
+
|
|
292
|
+
@override
async def fetch(
    self, context: BrowserContext, request: YouTubeSearchRequest
) -> str:
    """Load the YouTube results page for a query and return its HTML.

    Args:
        context: Browser context in which a new page is opened.
        request: Search request; its ``query`` is URL-encoded into the
            ``search_query`` parameter.

    Returns:
        The page HTML once at least one ``ytd-video-renderer`` element is
        present.

    Raises:
        playwright.async_api.TimeoutError: If no ``ytd-video-renderer``
            element appears within 10 seconds (navigation may also time
            out using Playwright's default).
    """
    url = (
        f"https://www.youtube.com/results?search_query={quote_plus(request.query)}"
    )
    page = await context.new_page()
    try:
        await page.goto(url)
        await page.wait_for_selector("ytd-video-renderer", timeout=10000)
        return await page.content()
    finally:
        # Release the page on success and on failure alike; otherwise it
        # stays open for as long as the surrounding context lives.
        await page.close()
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import override
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from playwright.async_api import BrowserContext
|
|
6
|
+
|
|
7
|
+
from webquest.scrapers.scraper import Scraper
|
|
8
|
+
from webquest.scrapers.youtube_transcript.schemas import (
|
|
9
|
+
YouTubeTranscriptRequest,
|
|
10
|
+
YouTubeTranscriptResponse,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class YouTubeTranscript(
    Scraper[YouTubeTranscriptRequest, str, YouTubeTranscriptResponse]
):
    """Scraper to extract the transcript of a YouTube video.

    ``fetch`` drives the watch page until the transcript panel is rendered
    and returns the page HTML; ``parse`` pulls the segment texts out of that
    HTML and joins them into a single string.
    """

    request = YouTubeTranscriptRequest
    response = YouTubeTranscriptResponse

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: YouTubeTranscriptRequest,
    ) -> str:
        """Open the video page, reveal its transcript and return the HTML.

        Args:
            context: Browser context in which a new page is opened.
            request: Transcript request; its ``video_id`` selects the video.

        Returns:
            The page HTML after the transcript segment list has appeared.

        Raises:
            Exception: If waiting for the transcript button yields no
                element.
            playwright.async_api.TimeoutError: If navigation or any of the
                awaited selectors does not complete within its timeout.
        """
        video_url = f"https://www.youtube.com/watch?v={request.video_id}"

        page = await context.new_page()
        try:
            await page.goto(video_url, wait_until="networkidle", timeout=30000)
            # Fixed pause kept from the original flow; presumably lets late
            # page scripts settle before interacting -- TODO confirm.
            await asyncio.sleep(1)

            # The "Show transcript" button is looked up only after the
            # description has been clicked.
            await page.wait_for_selector("div#description", timeout=10000)
            await page.click("div#description")

            await asyncio.sleep(0.5)

            transcript_button = await page.wait_for_selector(
                'button[aria-label="Show transcript"]', timeout=10000
            )
            if not transcript_button:
                raise Exception("Transcript button not found")

            await transcript_button.click()

            await page.wait_for_selector(
                "ytd-transcript-segment-list-renderer", timeout=10000
            )

            return await page.content()
        finally:
            # Several steps above can time out; close the page on every exit
            # path so it does not stay open for the rest of the context's
            # lifetime.
            await page.close()

    @override
    async def parse(self, raw: str) -> YouTubeTranscriptResponse:
        """Extract the transcript text from the fetched page HTML.

        Args:
            raw: HTML text returned by ``fetch``.

        Returns:
            A response whose ``transcript`` is the segment texts joined by
            single spaces, with leading and trailing whitespace removed.

        Raises:
            Exception: If the segment list, its container, or the segments
                themselves are missing from the HTML.
        """
        soup = BeautifulSoup(raw, "html.parser")

        # Find the transcript segment list renderer
        segment_renderer = soup.select_one("ytd-transcript-segment-list-renderer")
        if not segment_renderer:
            raise Exception("No transcript segments found")

        # Find the segments container
        segments_container = segment_renderer.select_one("div#segments-container")
        if not segments_container:
            raise Exception("No transcript segments found")

        # Find all transcript segment renderers
        segments = segments_container.select("ytd-transcript-segment-renderer")
        if not segments:
            raise Exception("No transcript segments found")

        # Extract text from each segment; segments without a text element
        # are skipped.
        transcript_segments = [
            text_element.get_text()
            for segment in segments
            if (text_element := segment.select_one("yt-formatted-string"))
        ]

        formatted_transcript = " ".join(transcript_segments).strip()
        return YouTubeTranscriptResponse(transcript=formatted_transcript)
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: webquest
|
|
3
|
+
Version: 0.7.3
|
|
4
|
+
Summary: WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
|
|
5
|
+
Requires-Dist: beautifulsoup4>=4.14.2
|
|
6
|
+
Requires-Dist: hyperbrowser>=0.68.0
|
|
7
|
+
Requires-Dist: openai>=2.6.0
|
|
8
|
+
Requires-Dist: playwright>=1.55.0
|
|
9
|
+
Requires-Dist: pydantic>=2.12.3
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# WebQuest
|
|
14
|
+
|
|
15
|
+
WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
|
|
16
|
+
|
|
17
|
+
**Scrapers**
|
|
18
|
+
|
|
19
|
+
- **Any Article:** Extracts readable content from arbitrary web articles.
|
|
20
|
+
- **DuckDuckGo Search:** General web search using DuckDuckGo.
|
|
21
|
+
- **Google News Search:** News-focused search via Google News.
|
|
22
|
+
- **YouTube Search:** Search YouTube videos, channels, posts, and shorts.
|
|
23
|
+
- **YouTube Transcript:** Fetch transcripts for YouTube videos.
|
|
24
|
+
|
|
25
|
+
**Browsers**
|
|
26
|
+
|
|
27
|
+
- **Hyperbrowser:** A cloud-based browser service for running Playwright scrapers without managing infrastructure.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
Installing using pip:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install webquest
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Installing using uv:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uv add webquest
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
Example usage of the DuckDuckGo Search scraper:
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import asyncio
|
|
49
|
+
|
|
50
|
+
from webquest.browsers import Hyperbrowser
|
|
51
|
+
from webquest.scrapers import DuckDuckGoSearch
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
async def main() -> None:
|
|
55
|
+
scraper = DuckDuckGoSearch(browser=Hyperbrowser())
|
|
56
|
+
|
|
57
|
+
response = await scraper.run(
|
|
58
|
+
scraper.request(query="Pizza Toppings"),
|
|
59
|
+
)
|
|
60
|
+
print(response.model_dump_json(indent=4))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
if __name__ == "__main__":
|
|
64
|
+
asyncio.run(main())
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
You can also run multiple requests at the same time:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import asyncio
|
|
71
|
+
|
|
72
|
+
from webquest.browsers import Hyperbrowser
|
|
73
|
+
from webquest.scrapers import DuckDuckGoSearch
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def main() -> None:
|
|
77
|
+
scraper = DuckDuckGoSearch(browser=Hyperbrowser())
|
|
78
|
+
|
|
79
|
+
responses = await scraper.run(
|
|
80
|
+
scraper.request(query="Pizza Toppings"),
|
|
81
|
+
scraper.request(query="AI News"),
|
|
82
|
+
)
|
|
83
|
+
for response in responses:
|
|
84
|
+
print(response.model_dump_json(indent=4))
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
if __name__ == "__main__":
|
|
88
|
+
asyncio.run(main())
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
> To use the Hyperbrowser browser, you need to set the `HYPERBROWSER_API_KEY` environment variable.
|
|
92
|
+
|
|
93
|
+
> To use the Any Article scraper, you need to set the `OPENAI_API_KEY` environment variable.
|
|
94
|
+
|
|
95
|
+
## Disclaimer
|
|
96
|
+
|
|
97
|
+
This tool is for educational and research purposes only. The developers of WebQuest are not responsible for any misuse of this tool. Scraping websites may violate their Terms of Service. Users are solely responsible for ensuring their activities comply with all applicable laws and website policies.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
webquest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
webquest/browsers/__init__.py,sha256=zx78XH7Ys0xKrMaeV5Pkwx5q37GDagAOBIxted_K2Gk,141
|
|
3
|
+
webquest/browsers/browser.py,sha256=XSojMYWfsUuxZypOi0g9ylhHlN7Fct9RTwrNyBCJIgs,674
|
|
4
|
+
webquest/browsers/hyperbrowser.py,sha256=FRV3tAd6OcMlfnsGNt3vW-j4CEJPgqLi-6U_tUQVMCA,1750
|
|
5
|
+
webquest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
webquest/scrapers/__init__.py,sha256=t7f7oy1R66mIfdw9fWDcpHJ8njHoCgOFUn_N2I145PI,1198
|
|
7
|
+
webquest/scrapers/any_article/__init__.py,sha256=vrAqkwWHrRnnm5jcvSbOZDOzkw_HAJNmcbKbFF93tyg,201
|
|
8
|
+
webquest/scrapers/any_article/schemas.py,sha256=qogFp13pIQy0u3aNHhDSqRJVZpApqCvbCi6tGls5tHw,217
|
|
9
|
+
webquest/scrapers/any_article/scraper.py,sha256=hQBN5SWd1iucZ7-O_Nh1WnzTNUAPC62odwckaF3bKWA,1338
|
|
10
|
+
webquest/scrapers/duckduckgo_search/__init__.py,sha256=9egcEqtLtxQSiyy8lg6e8_2uU4psuxp0nZf7lqhQ1IU,243
|
|
11
|
+
webquest/scrapers/duckduckgo_search/schemas.py,sha256=YT1jJXs2j0_Lc1HHdgfX4N_88-XtrasqTeiqhfl9UpA,245
|
|
12
|
+
webquest/scrapers/duckduckgo_search/scraper.py,sha256=DfNgvgRoCU0-ZbhRSxtBeEbUvoxC1DHADFmeMptqPic,2684
|
|
13
|
+
webquest/scrapers/google_news_search/__init__.py,sha256=-NhQBRay_flLTBtHGwe-KQtY6AdWjw5QnMBOZlzJvBs,244
|
|
14
|
+
webquest/scrapers/google_news_search/schemas.py,sha256=qY0U217bHZwtYvQ_f6FDKzNI_0wT9M-s6z0MtJm3qF4,255
|
|
15
|
+
webquest/scrapers/google_news_search/scraper.py,sha256=2aRvoYszJMWN8CZ3TP-g1GWPsaQJuCaCjX4_oHNOoQc,2307
|
|
16
|
+
webquest/scrapers/openai_parser.py,sha256=meK1nBFkDvF04QUEQC5NCTukH2_Mhs8swyL5EATAzK0,1849
|
|
17
|
+
webquest/scrapers/scraper.py,sha256=jLzHQnvx9EsR-MohBEc1rtBmv9PqxliSHbqtu4vrmsE,3649
|
|
18
|
+
webquest/scrapers/youtube_search/__init__.py,sha256=BvuHhLSZQ9N9pRW6ciKZjIcScecaFqS6BGzgaytwH3o,222
|
|
19
|
+
webquest/scrapers/youtube_search/schemas.py,sha256=Lf4QOObvrYC3sHETS5KKlunci4yTS1GKD177-N9evKY,796
|
|
20
|
+
webquest/scrapers/youtube_search/scraper.py,sha256=bGWlD2BWV9UKFRYptisKW-IP_4AZ5OSoJfbEN8Vq-Dk,10108
|
|
21
|
+
webquest/scrapers/youtube_transcript/__init__.py,sha256=m1LyZcr-PGQqNRNDtWOVsuhIMIV-lG5YIivCyRENTiY,250
|
|
22
|
+
webquest/scrapers/youtube_transcript/schemas.py,sha256=pIrq3XPbXERTTmaJwnUrgyu1LBfD8T-Vn7be088-Ss4,160
|
|
23
|
+
webquest/scrapers/youtube_transcript/scraper.py,sha256=A9JjQhdOQZXAPw5NCzQaRc9yFpgUpRpfR2OdfEvdYYI,2723
|
|
24
|
+
webquest-0.7.3.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
|
|
25
|
+
webquest-0.7.3.dist-info/METADATA,sha256=eYRZeyvC0hOKEaCwsS2-U6BXTki2tfoUiZ5ZPPinY_Q,2778
|
|
26
|
+
webquest-0.7.3.dist-info/RECORD,,
|