webquest 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
webquest/__init__.py ADDED
File without changes
@@ -0,0 +1,4 @@
1
+ from webquest.browsers.browser import Browser
2
+ from webquest.browsers.hyperbrowser import Hyperbrowser
3
+
4
+ __all__ = ["Browser", "Hyperbrowser"]
@@ -0,0 +1,23 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import AsyncContextManager
3
+
4
+ from playwright.async_api import BrowserContext
5
+
6
+
7
class Browser(ABC):
    """
    Interface that every browser backend must implement.

    A backend has a single responsibility: hand out a Playwright
    ``BrowserContext`` that scraping code can open pages in.
    """

    @abstractmethod
    def get_context(self) -> AsyncContextManager[BrowserContext]:
        """
        Provide a browser context for the duration of an ``async with`` block.

        Returns:
            AsyncContextManager[BrowserContext]: An async context manager whose
                entered value is a Playwright BrowserContext.
        """
@@ -0,0 +1,52 @@
1
+ from contextlib import asynccontextmanager
2
+ from typing import AsyncIterator, override
3
+
4
+ from hyperbrowser import AsyncHyperbrowser
5
+ from playwright.async_api import BrowserContext, async_playwright
6
+
7
+ from webquest.browsers.browser import Browser
8
+
9
+
10
class Hyperbrowser(Browser):
    """
    A Browser implementation that uses Hyperbrowser for remote browser sessions.

    Each call to ``get_context`` creates a new Hyperbrowser session, attaches
    Playwright to it over CDP, and stops the session once the caller is done.
    """

    def __init__(
        self,
        client: AsyncHyperbrowser | None = None,
    ):
        """
        Initialize the Hyperbrowser instance.

        Args:
            client (AsyncHyperbrowser | None): Client used to create and stop
                sessions. When omitted, ``AsyncHyperbrowser()`` is constructed
                with no arguments.
        """
        self._client = AsyncHyperbrowser() if client is None else client

    @override
    @asynccontextmanager
    async def get_context(self) -> AsyncIterator[BrowserContext]:
        """
        Get a browser context from a new Hyperbrowser session.

        A session is created, Playwright connects to its WebSocket endpoint over
        CDP, and the first context of the connected browser is yielded. The
        session is stopped when the ``async with`` block exits, and also when
        connecting to the session fails.

        Yields:
            BrowserContext: The Playwright browser context connected to the
                Hyperbrowser session.

        Raises:
            IndexError: If the connected browser exposes no context.
        """
        session = await self._client.sessions.create()
        # The remote session exists from here on, so everything that can fail
        # (starting Playwright, the CDP connection, the context lookup, the
        # caller's own code) runs under the same try/finally. Guarding only the
        # ``yield`` would leave the session running after a failed connection.
        try:
            async with async_playwright() as p:
                browser = await p.chromium.connect_over_cdp(session.ws_endpoint)
                yield browser.contexts[0]
        finally:
            await self._client.sessions.stop(session.id)
webquest/py.typed ADDED
File without changes
@@ -0,0 +1,47 @@
1
+ from webquest.scrapers.any_article import (
2
+ AnyArticle,
3
+ AnyArticleRequest,
4
+ AnyArticleResponse,
5
+ )
6
+ from webquest.scrapers.duckduckgo_search import (
7
+ DuckDuckGoSearch,
8
+ DuckDuckGoSearchRequest,
9
+ DuckDuckGoSearchResponse,
10
+ )
11
+ from webquest.scrapers.google_news_search import (
12
+ GoogleNewsSearch,
13
+ GoogleNewsSearchRequest,
14
+ GoogleNewsSearchResponse,
15
+ )
16
+ from webquest.scrapers.openai_parser import OpenAIParser
17
+ from webquest.scrapers.scraper import Scraper
18
+ from webquest.scrapers.youtube_search import (
19
+ YouTubeSearch,
20
+ YouTubeSearchRequest,
21
+ YouTubeSearchResponse,
22
+ )
23
+ from webquest.scrapers.youtube_transcript import (
24
+ YouTubeTranscript,
25
+ YouTubeTranscriptRequest,
26
+ YouTubeTranscriptResponse,
27
+ )
28
+
29
+ __all__ = [
30
+ "AnyArticle",
31
+ "AnyArticleRequest",
32
+ "AnyArticleResponse",
33
+ "DuckDuckGoSearch",
34
+ "DuckDuckGoSearchRequest",
35
+ "DuckDuckGoSearchResponse",
36
+ "GoogleNewsSearch",
37
+ "GoogleNewsSearchRequest",
38
+ "GoogleNewsSearchResponse",
39
+ "OpenAIParser",
40
+ "Scraper",
41
+ "YouTubeSearch",
42
+ "YouTubeSearchRequest",
43
+ "YouTubeSearchResponse",
44
+ "YouTubeTranscript",
45
+ "YouTubeTranscriptRequest",
46
+ "YouTubeTranscriptResponse",
47
+ ]
@@ -0,0 +1,11 @@
1
+ from webquest.scrapers.any_article.scraper import (
2
+ AnyArticle,
3
+ AnyArticleRequest,
4
+ AnyArticleResponse,
5
+ )
6
+
7
+ __all__ = [
8
+ "AnyArticle",
9
+ "AnyArticleRequest",
10
+ "AnyArticleResponse",
11
+ ]
@@ -0,0 +1,13 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class AnyArticleRequest(BaseModel):
    # Input for the AnyArticle scraper.
    # (Documented with comments rather than a docstring so the generated
    # pydantic JSON schema stays exactly as it was.)

    # Address of the page the scraper navigates to.
    url: str
6
+
7
+
8
class AnyArticleResponse(BaseModel):
    # Structured article produced by the AnyArticle scraper. The class is
    # handed to the OpenAI client as the structured output format, so it is
    # documented with comments only: a docstring would be added to the JSON
    # schema that is sent along with the request.

    publisher: str
    title: str
    # Plain string; no date parsing or normalization is applied here.
    published_at: str
    authors: list[str]
    content: str
@@ -0,0 +1,43 @@
1
+ from typing import override
2
+
3
+ from openai import AsyncOpenAI
4
+ from playwright.async_api import BrowserContext
5
+
6
+ from webquest.browsers.browser import Browser
7
+ from webquest.scrapers.any_article.schemas import AnyArticleRequest, AnyArticleResponse
8
+ from webquest.scrapers.openai_parser import OpenAIParser
9
+
10
+
11
class AnyArticle(OpenAIParser[AnyArticleRequest, AnyArticleResponse]):
    """Scraper to extract the main article from any web page using OpenAI."""

    request = AnyArticleRequest
    response = AnyArticleResponse

    def __init__(
        self,
        browser: Browser,
        client: AsyncOpenAI | None = None,
        model: str = "gpt-5-mini",
        character_limit: int = 4000,
    ) -> None:
        """
        Initialize the article scraper.

        Args:
            browser (Browser): The browser instance to use for scraping.
            client (AsyncOpenAI | None): Optional OpenAI client, forwarded to
                the base parser unchanged.
            model (str): Name of the OpenAI model used for parsing.
            character_limit (int): Maximum number of page-text characters
                forwarded to the model.
        """
        super().__init__(
            browser=browser,
            response_type=AnyArticleResponse,
            client=client,
            model=model,
            input="Parse the following web page and extract the main article:\n\n",
            character_limit=character_limit,
        )

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: AnyArticleRequest,
    ) -> str:
        """
        Load the requested URL and return the page's HTML.

        Args:
            context (BrowserContext): The browser context to open the page in.
            request (AnyArticleRequest): Carries the URL to load.

        Returns:
            str: The HTML of the page, read three seconds after the
                ``domcontentloaded`` event.
        """
        page = await context.new_page()
        # Close the page once its HTML has been read (or loading failed) so
        # pages do not pile up in a context shared by several requests.
        try:
            await page.goto(request.url, wait_until="domcontentloaded")
            # Fixed pause so content rendered after DOMContentLoaded is included.
            await page.wait_for_timeout(3000)
            return await page.content()
        finally:
            await page.close()
@@ -0,0 +1,11 @@
1
+ from webquest.scrapers.duckduckgo_search.scraper import (
2
+ DuckDuckGoSearch,
3
+ DuckDuckGoSearchRequest,
4
+ DuckDuckGoSearchResponse,
5
+ )
6
+
7
+ __all__ = [
8
+ "DuckDuckGoSearch",
9
+ "DuckDuckGoSearchRequest",
10
+ "DuckDuckGoSearchResponse",
11
+ ]
@@ -0,0 +1,16 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class DuckDuckGoSearchRequest(BaseModel):
    # Input for the DuckDuckGo search scraper.

    # Search terms; URL-encoded by the scraper before being placed in the
    # search URL.
    query: str
6
+
7
+
8
class Page(BaseModel):
    # One organic result taken from a DuckDuckGo results page.

    # Text of the result's site label.
    site: str
    # ``href`` of the result's title link.
    url: str
    title: str
    # Text of the result's snippet.
    description: str
13
+
14
+
15
class DuckDuckGoSearchResponse(BaseModel):
    # Output of the DuckDuckGo search scraper.

    # Parsed results, in the order they appear in the page HTML.
    pages: list[Page]
@@ -0,0 +1,84 @@
1
+ import asyncio
2
+ from typing import override
3
+ from urllib.parse import quote_plus
4
+
5
+ from bs4 import BeautifulSoup
6
+ from playwright.async_api import BrowserContext
7
+
8
+ from webquest.scrapers.duckduckgo_search.schemas import (
9
+ DuckDuckGoSearchRequest,
10
+ DuckDuckGoSearchResponse,
11
+ Page,
12
+ )
13
+ from webquest.scrapers.scraper import Scraper
14
+
15
+
16
class DuckDuckGoSearch(Scraper[DuckDuckGoSearchRequest, str, DuckDuckGoSearchResponse]):
    """Scraper to perform a DuckDuckGo web search and parse the results."""

    request = DuckDuckGoSearchRequest
    response = DuckDuckGoSearchResponse

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: DuckDuckGoSearchRequest,
    ) -> str:
        """
        Run the search in a new page and return the results page HTML.

        The "more results" button is clicked when it shows up, the page is
        scrolled to the bottom, and the HTML is read after a short pause.

        Args:
            context (BrowserContext): The browser context to open the page in.
            request (DuckDuckGoSearchRequest): Carries the search query.

        Returns:
            str: The HTML of the results page.

        Raises:
            playwright.async_api.TimeoutError: If the page does not load or no
                organic result appears in time.
        """
        # Imported here so this class does not depend on an edit to the
        # module's import block.
        from playwright.async_api import TimeoutError as PlaywrightTimeoutError

        url = f"https://duckduckgo.com/?origin=funnel_home_website&t=h_&q={quote_plus(request.query)}&ia=web"
        page = await context.new_page()
        # The page is closed on every exit path so pages do not accumulate in a
        # context that serves several requests.
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(1)

            try:
                await page.wait_for_selector("button#more-results", timeout=15000)
                await page.click("button#more-results")
            except PlaywrightTimeoutError:
                # No usable "more results" button. That alone is not a failure:
                # the wait below still requires at least one organic result.
                pass

            await page.wait_for_selector("li[data-layout='organic']", timeout=15000)

            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # Pause after scrolling before the HTML snapshot is taken.
            await asyncio.sleep(4)

            return await page.content()
        finally:
            await page.close()

    @override
    async def parse(self, raw: str) -> DuckDuckGoSearchResponse:
        """
        Turn results page HTML into a list of result pages.

        A result is skipped when its site label, title link, title text or
        snippet cannot be found.

        Args:
            raw (str): HTML returned by ``fetch``.

        Returns:
            DuckDuckGoSearchResponse: The results, in document order.
        """
        soup = BeautifulSoup(raw, "html.parser")
        pages: list[Page] = []

        for result_tag in soup.find_all("article", {"data-testid": "result"}):
            # NOTE(review): the class names below look auto-generated and will
            # presumably change when DuckDuckGo rebuilds its frontend.
            site_tag = result_tag.find("p", class_="fOCEb2mA3YZTJXXjpgdS")
            link_tag = result_tag.find("a", {"data-testid": "result-title-a"})
            title_tag = result_tag.find("span", class_="EKtkFWMYpwzMKOYr0GYm")
            description_tag = result_tag.find("span", class_="kY2IgmnCmOGjharHErah")
            if not (site_tag and link_tag and title_tag and description_tag):
                continue

            href = link_tag.get("href")
            if not isinstance(href, str):
                continue

            pages.append(
                Page(
                    site=site_tag.get_text(strip=True),
                    url=href,
                    title=title_tag.get_text(strip=True),
                    description=description_tag.get_text(strip=True),
                )
            )

        return DuckDuckGoSearchResponse(pages=pages)
@@ -0,0 +1,11 @@
1
+ from webquest.scrapers.google_news_search.scraper import (
2
+ GoogleNewsSearch,
3
+ GoogleNewsSearchRequest,
4
+ GoogleNewsSearchResponse,
5
+ )
6
+
7
+ __all__ = [
8
+ "GoogleNewsSearch",
9
+ "GoogleNewsSearchRequest",
10
+ "GoogleNewsSearchResponse",
11
+ ]
@@ -0,0 +1,16 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class GoogleNewsSearchRequest(BaseModel):
    # Input for the Google News search scraper.

    # Search terms; URL-encoded by the scraper before being placed in the
    # search URL.
    query: str
6
+
7
+
8
class Article(BaseModel):
    # One news result taken from a Google News search page.

    # Text of the result's source label.
    site: str
    # Absolute news.google.com link built from the result's relative ``href``.
    url: str
    title: str
    # Text of the result's ``<time>`` element, kept as displayed.
    published_at: str
13
+
14
+
15
class GoogleNewsSearchResponse(BaseModel):
    # Output of the Google News search scraper.

    # Parsed results, in the order they appear in the page HTML.
    articles: list[Article]
@@ -0,0 +1,78 @@
1
+ import asyncio
2
+ from typing import override
3
+ from urllib.parse import quote_plus
4
+
5
+ from bs4 import BeautifulSoup
6
+ from playwright.async_api import BrowserContext
7
+
8
+ from webquest.scrapers.google_news_search.schemas import (
9
+ Article,
10
+ GoogleNewsSearchRequest,
11
+ GoogleNewsSearchResponse,
12
+ )
13
+ from webquest.scrapers.scraper import Scraper
14
+
15
+
16
class GoogleNewsSearch(Scraper[GoogleNewsSearchRequest, str, GoogleNewsSearchResponse]):
    """Scraper to perform a Google News search and parse the results."""

    request = GoogleNewsSearchRequest
    response = GoogleNewsSearchResponse

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: GoogleNewsSearchRequest,
    ) -> str:
        """
        Load the Google News search page for the query and return its HTML.

        Args:
            context (BrowserContext): The browser context to open the page in.
            request (GoogleNewsSearchRequest): Carries the search query.

        Returns:
            str: The HTML of the search page.
        """
        url = f"https://news.google.com/search?q={quote_plus(request.query)}"
        page = await context.new_page()
        # The page is closed on every exit path so pages do not accumulate in a
        # context that serves several requests.
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(1)
            return await page.content()
        finally:
            await page.close()

    @override
    async def parse(self, raw: str) -> GoogleNewsSearchResponse:
        """
        Turn search page HTML into a list of articles.

        Every ``c-wiz`` element is inspected. One that lacks a title link, a
        source label or a ``<time>`` element is skipped, and each article URL
        is reported once.

        Args:
            raw (str): HTML returned by ``fetch``.

        Returns:
            GoogleNewsSearchResponse: The articles, in document order.
        """
        # Imported here so this class does not depend on an edit to the
        # module's import block.
        from urllib.parse import urljoin

        soup = BeautifulSoup(raw, "html.parser")
        articles: list[Article] = []
        seen_urls: set[str] = set()

        for wiz_tag in soup.find_all("c-wiz"):
            # A single anchor supplies both the title (text) and the link (href).
            link_tag = wiz_tag.find("a", class_="JtKRv")
            site_tag = wiz_tag.find("div", class_="vr1PYe")
            published_at_tag = wiz_tag.find("time")
            if not (link_tag and site_tag and published_at_tag):
                continue

            href = link_tag.get("href")
            if not isinstance(href, str):
                continue

            # urljoin gives the same result as before for "./..." hrefs and,
            # unlike cutting off the first character, also handles "/..." and
            # absolute hrefs correctly.
            url = urljoin("https://news.google.com/", href)

            # ``find`` searches all descendants, so an enclosing c-wiz reports
            # the first article it contains a second time; keep one per URL.
            if url in seen_urls:
                continue
            seen_urls.add(url)

            articles.append(
                Article(
                    site=site_tag.get_text().strip(),
                    url=url,
                    title=link_tag.get_text().strip(),
                    published_at=published_at_tag.get_text().strip(),
                )
            )

        return GoogleNewsSearchResponse(articles=articles)
@@ -0,0 +1,58 @@
1
+ from abc import ABC
2
+ from typing import Generic, Type, TypeVar, override
3
+
4
+ from bs4 import BeautifulSoup
5
+ from openai import AsyncOpenAI
6
+ from pydantic import BaseModel
7
+
8
+ from webquest.browsers.browser import Browser
9
+ from webquest.scrapers.scraper import Scraper
10
+
11
+ TRequest = TypeVar("TRequest", bound=BaseModel)
12
+ TResponse = TypeVar("TResponse", bound=BaseModel)
13
+
14
+
15
class OpenAIParser(
    Generic[TRequest, TResponse],
    Scraper[TRequest, str, TResponse],
    ABC,
):
    """
    Base class for scrapers whose parse step is delegated to an OpenAI model.

    Subclasses supply ``fetch``; this class reduces the fetched HTML to text and
    asks the model to return it in the shape of ``response_type``.
    """

    def __init__(
        self,
        browser: Browser,
        response_type: Type[TResponse],
        client: AsyncOpenAI | None = None,
        model: str = "gpt-5-mini",
        input: str = "Parse the following web content:\n",
        character_limit: int = 20000,
    ) -> None:
        """
        Store the parsing configuration and initialize the underlying scraper.

        Args:
            browser (Browser): The browser instance to use for scraping.
            response_type (Type[TResponse]): Model class the OpenAI output is
                parsed into.
            client (AsyncOpenAI | None): Optional OpenAI client. When omitted,
                ``AsyncOpenAI()`` is constructed with no arguments.
            model (str): Name of the OpenAI model to call.
            input (str): Text placed in front of the page text in the prompt.
            character_limit (int): Maximum number of page-text characters sent
                to the model.
        """
        super().__init__(browser=browser)
        self._response_type = response_type
        self._client = AsyncOpenAI() if client is None else client
        self._model = model
        self._input = input
        self._character_limit = character_limit

    @override
    async def parse(self, raw: str) -> TResponse:
        """
        Convert fetched HTML into a structured response via the OpenAI client.

        Args:
            raw (str): HTML returned by ``fetch``.

        Returns:
            TResponse: The model output parsed into ``response_type``.

        Raises:
            ValueError: If the OpenAI response carries no parsed output.
        """
        text = BeautifulSoup(raw, "html.parser").get_text(separator="\n", strip=True)

        # Over-long text is cut down to a window of ``character_limit``
        # characters taken from the middle, trimming both ends equally.
        overflow = len(text) - self._character_limit
        if overflow > 0:
            start = overflow // 2
            text = text[start : start + self._character_limit]

        response = await self._client.responses.parse(
            input=f"{self._input}{text}",
            text_format=self._response_type,
            model=self._model,
            reasoning={"effort": "minimal"},
        )
        parsed = response.output_parsed
        if parsed is None:
            raise ValueError("Failed to parse the response into the desired format.")
        return parsed
@@ -0,0 +1,116 @@
1
+ import asyncio
2
+ from abc import ABC, abstractmethod
3
+ from typing import ClassVar, Generic, TypeVar, overload
4
+
5
+ from playwright.async_api import BrowserContext
6
+ from pydantic import BaseModel
7
+
8
+ from webquest.browsers.browser import Browser
9
+
10
+ TRequest = TypeVar("TRequest", bound=BaseModel)
11
+ TRaw = TypeVar("TRaw")
12
+ TResponse = TypeVar("TResponse", bound=BaseModel)
13
+
14
+
15
class Scraper(ABC, Generic[TRequest, TRaw, TResponse]):
    """
    Abstract base class for web scrapers.

    A scraper is split into two steps: ``fetch`` collects raw data using a
    browser context and ``parse`` turns that raw data into a structured
    response. ``run`` drives both steps using the Browser given at construction.

    Type Parameters:
        TRequest: The type of the request object.
        TRaw: The type of the raw data fetched from the browser.
        TResponse: The type of the parsed response object.
    """

    # Request and response model classes of the concrete scraper; subclasses
    # assign them so callers can build requests via ``scraper.request(...)``.
    request: ClassVar[type[TRequest]]
    response: ClassVar[type[TResponse]]

    def __init__(self, browser: Browser) -> None:
        """
        Initialize the Scraper.

        Args:
            browser (Browser): The browser instance to use for scraping.
        """
        self._browser = browser

    @abstractmethod
    async def fetch(self, context: BrowserContext, request: TRequest) -> TRaw:
        """
        Fetch raw data from the target website.

        Args:
            context (BrowserContext): The browser context to use.
            request (TRequest): The request object containing parameters for the fetch operation.

        Returns:
            TRaw: The raw data fetched from the website.
        """
        ...

    @abstractmethod
    async def parse(self, raw: TRaw) -> TResponse:
        """
        Parse the raw data into a structured response.

        Args:
            raw (TRaw): The raw data returned by the fetch method.

        Returns:
            TResponse: The structured response object.
        """
        ...

    @overload
    async def run(self, request: TRequest, /) -> TResponse: ...

    @overload
    async def run(self, requests: list[TRequest], /) -> list[TResponse]: ...

    @overload
    async def run(self, *requests: TRequest) -> list[TResponse]: ...

    async def run(
        self,
        *requests: TRequest | list[TRequest],
    ) -> list[TResponse] | TResponse:
        """
        Run the scraper for one or more requests.

        All requests share one browser context and are fetched concurrently;
        the fetched items are then parsed concurrently.

        Args:
            *requests: Either request objects passed positionally, or a single
                list of request objects.

        Returns:
            list[TResponse] | TResponse: A single response when exactly one
                request object was passed positionally; otherwise a list of
                responses in the same order as the requests. A list argument
                always produces a list, and no requests produce an empty list.

        Raises:
            TypeError: If a list is passed together with other arguments.
        """
        normalized_requests: list[TRequest]

        if len(requests) == 1 and isinstance(requests[0], list):
            # List form: the result is a list even for a one-element list, as
            # the ``list[TRequest] -> list[TResponse]`` overload promises.
            normalized_requests = requests[0]
            return_single = False
        else:
            normalized_requests = []
            for req in requests:
                if isinstance(req, list):
                    raise TypeError("Expected request object, got list")
                normalized_requests.append(req)
            return_single = len(normalized_requests) == 1

        # Nothing to do: avoid acquiring a browser context for zero requests.
        if not normalized_requests:
            return []

        async with self._browser.get_context() as context:
            raw_items = await asyncio.gather(
                *(self.fetch(context, request) for request in normalized_requests)
            )

        # Parsing only needs the fetched data, so it runs after the browser
        # context has been released.
        responses = await asyncio.gather(*(self.parse(raw) for raw in raw_items))
        return responses[0] if return_single else responses
@@ -0,0 +1,11 @@
1
+ from webquest.scrapers.youtube_search.scraper import (
2
+ YouTubeSearch,
3
+ YouTubeSearchRequest,
4
+ YouTubeSearchResponse,
5
+ )
6
+
7
+ __all__ = [
8
+ "YouTubeSearch",
9
+ "YouTubeSearchRequest",
10
+ "YouTubeSearchResponse",
11
+ ]
@@ -0,0 +1,51 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class Video(BaseModel):
    # One regular video entry from a YouTube search results page.

    # Video ID taken from the ``v=`` parameter of the result link.
    id: str
    # Watch URL rebuilt from ``id``.
    url: str
    title: str
    description: str
    # Display text as shown on the page; not parsed into a date.
    published_at: str
    # Display text as shown on the page; not parsed into a number.
    views: str
    # ``href`` of the channel link without its first character.
    channel_id: str
    channel_url: str
    channel_name: str
14
+
15
+
16
class Channel(BaseModel):
    # One channel entry from a YouTube search results page.

    id: str
    # ``https://www.youtube.com/`` followed by ``id``.
    url: str
    name: str
    # None when the channel shows an empty description.
    description: str | None
    # Display text as shown on the page; not parsed into a number.
    subscribers: str
22
+
23
+
24
class Post(BaseModel):
    # One community post entry from a YouTube search results page.

    # Part of the post link that follows ``/post/``.
    id: str
    url: str
    content: str
    # Display text as shown on the page; not parsed into a date.
    published_at: str
    # ``href`` of the author link without its first character.
    channel_id: str
    channel_url: str
    channel_name: str
    # Display text as shown on the page; not parsed into numbers.
    comments: str
    likes: str
34
+
35
+
36
class Short(BaseModel):
    # One Shorts entry from a YouTube search results page.

    # Part of the short's link that follows ``shorts/``.
    id: str
    url: str
    title: str
    # Display text as shown on the page; not parsed into a number.
    views: str
41
+
42
+
43
class YouTubeSearchRequest(BaseModel):
    # Input for the YouTube search scraper.

    # Search terms; URL-encoded by the scraper before being placed in the
    # search URL.
    query: str
45
+
46
+
47
class YouTubeSearchResponse(BaseModel):
    # Output of the YouTube search scraper: every result kind found on the
    # results page, one list per kind.

    videos: list[Video]
    channels: list[Channel]
    posts: list[Post]
    shorts: list[Short]
@@ -0,0 +1,303 @@
1
+ from typing import override
2
+ from urllib.parse import quote_plus
3
+
4
+ from bs4 import BeautifulSoup
5
+ from playwright.async_api import BrowserContext
6
+
7
+ from webquest.scrapers.scraper import Scraper
8
+ from webquest.scrapers.youtube_search.schemas import (
9
+ Channel,
10
+ Post,
11
+ Short,
12
+ Video,
13
+ YouTubeSearchRequest,
14
+ YouTubeSearchResponse,
15
+ )
16
+
17
+
18
class YouTubeSearch(Scraper[YouTubeSearchRequest, str, YouTubeSearchResponse]):
    """Scraper to perform a YouTube search and parse the results."""

    request = YouTubeSearchRequest
    response = YouTubeSearchResponse

    def _parse_videos(self, soup: BeautifulSoup) -> list[Video]:
        """
        Extract the regular video results.

        A renderer is skipped when any expected element is missing. Entries
        whose ID is not 11 characters long are dropped and repeated IDs are
        collapsed into one entry.

        Args:
            soup (BeautifulSoup): Parsed search results page.

        Returns:
            list[Video]: The videos found on the page.
        """
        videos: list[Video] = []

        for video_tag in soup.find_all("ytd-video-renderer"):
            title_tag = video_tag.find(
                "h3",
                class_="title-and-badge style-scope ytd-video-renderer",
            )
            if not title_tag:
                continue

            metadata_tags = video_tag.find_all(
                "span",
                class_="inline-metadata-item style-scope ytd-video-meta-block",
            )
            # The spans are read as (views, published date). Unpacking the
            # search result directly raised ValueError, aborting the whole
            # parse, whenever a renderer carried a different number of spans;
            # such a renderer is now skipped like any other incomplete result.
            if len(metadata_tags) != 2:
                continue
            views_tag, published_at_tag = metadata_tags

            description_tag = video_tag.find(
                "yt-formatted-string",
                class_="metadata-snippet-text style-scope ytd-video-renderer",
            )
            if not description_tag:
                continue

            # One anchor supplies both the channel name (text) and ID (href).
            channel_tag = video_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope yt-formatted-string",
            )
            if not channel_tag:
                continue
            channel_href = channel_tag.get("href")
            if not isinstance(channel_href, str):
                continue
            # Drop the first character of the href (presumably a leading "/").
            channel_id = channel_href[1:]

            video_link_tag = video_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope ytd-video-renderer",
            )
            if not video_link_tag:
                continue
            video_href = video_link_tag.get("href")
            if not isinstance(video_href, str):
                continue
            # Keep what follows "v=" up to the next query parameter.
            video_id = video_href.split("v=")[-1].split("&")[0]

            videos.append(
                Video(
                    id=video_id,
                    url=f"https://www.youtube.com/watch?v={video_id}",
                    title=title_tag.get_text(strip=True),
                    description=description_tag.get_text(strip=True),
                    published_at=published_at_tag.get_text(strip=True),
                    views=views_tag.get_text(strip=True),
                    channel_id=channel_id,
                    channel_url=f"https://www.youtube.com/{channel_id}",
                    channel_name=channel_tag.get_text(strip=True),
                )
            )

        # Keying by ID removes duplicates while keeping first-seen order; only
        # IDs of exactly 11 characters are accepted.
        unique_videos = {video.id: video for video in videos if len(video.id) == 11}
        return list(unique_videos.values())

    def _parse_channels(self, soup: BeautifulSoup) -> list[Channel]:
        """
        Extract the channel results.

        A renderer is skipped when any expected element is missing.

        Args:
            soup (BeautifulSoup): Parsed search results page.

        Returns:
            list[Channel]: The channels found on the page.
        """
        channels: list[Channel] = []

        for channel_tag in soup.find_all("ytd-channel-renderer"):
            name_tag = channel_tag.find(
                "yt-formatted-string",
                class_="style-scope ytd-channel-name",
            )
            if not name_tag:
                continue

            description_tag = channel_tag.find("yt-formatted-string", id="description")
            if not description_tag:
                continue
            # An empty description is reported as None.
            description: str | None = description_tag.get_text(strip=True) or None

            # NOTE(review): the channel ID is read from the element with
            # id="subscribers" and the subscriber text from id="video-count".
            # Presumably this mirrors how YouTube labels these elements;
            # confirm against live markup before "fixing" the apparent swap.
            channel_id_tag = channel_tag.find("yt-formatted-string", id="subscribers")
            if not channel_id_tag:
                continue
            channel_id = channel_id_tag.get_text(strip=True)

            subscribers_tag = channel_tag.find("span", id="video-count")
            if not subscribers_tag:
                continue

            channels.append(
                Channel(
                    id=channel_id,
                    url=f"https://www.youtube.com/{channel_id}",
                    name=name_tag.get_text(strip=True),
                    description=description,
                    subscribers=subscribers_tag.get_text(strip=True),
                )
            )

        return channels

    def _parse_posts(self, soup: BeautifulSoup) -> list[Post]:
        """
        Extract the community post results.

        A renderer is skipped when any expected element is missing.

        Args:
            soup (BeautifulSoup): Parsed search results page.

        Returns:
            list[Post]: The posts found on the page.
        """
        posts: list[Post] = []

        for post_tag in soup.find_all("ytd-post-renderer"):
            content_tag = post_tag.find("div", id="content")
            if not content_tag:
                continue

            channel_name_tag = post_tag.find("div", id="author")
            if not channel_name_tag:
                continue

            published_at_tag = post_tag.find(
                "yt-formatted-string",
                id="published-time-text",
            )
            if not published_at_tag:
                continue

            author_link_tag = post_tag.find("a", id="author-text")
            if not author_link_tag:
                continue
            author_href = author_link_tag.get("href")
            if not isinstance(author_href, str):
                continue
            # Drop the first character of the href (presumably a leading "/").
            channel_id = author_href[1:]

            post_link_tag = post_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope yt-formatted-string",
            )
            if not post_link_tag:
                continue
            post_href = post_link_tag.get("href")
            if not isinstance(post_href, str):
                continue
            post_id = post_href.split("/post/")[-1]

            likes_tag = post_tag.find("span", id="vote-count-middle")
            if not likes_tag:
                continue

            comments_tag = post_tag.find(
                "div",
                class_="yt-spec-button-shape-next__button-text-content",
            )
            if not comments_tag:
                continue

            posts.append(
                Post(
                    id=post_id,
                    url=f"https://www.youtube.com/post/{post_id}",
                    content=content_tag.get_text(strip=True),
                    published_at=published_at_tag.get_text(strip=True),
                    channel_id=channel_id,
                    channel_url=f"https://www.youtube.com/{channel_id}",
                    channel_name=channel_name_tag.get_text(strip=True),
                    comments=comments_tag.get_text(strip=True),
                    likes=likes_tag.get_text(strip=True),
                )
            )

        return posts

    def _parse_shorts(self, soup: BeautifulSoup) -> list[Short]:
        """
        Extract the Shorts results.

        An entry is skipped when any expected element is missing.

        Args:
            soup (BeautifulSoup): Parsed search results page.

        Returns:
            list[Short]: The shorts found on the page.
        """
        shorts: list[Short] = []

        for short_tag in soup.find_all("ytm-shorts-lockup-view-model-v2"):
            title_tag = short_tag.find("h3", role="presentation")
            if not title_tag:
                continue

            views_tag = short_tag.find(
                "div",
                class_="shortsLockupViewModelHostOutsideMetadataSubhead shortsLockupViewModelHostMetadataSubhead",
            )
            if not views_tag:
                continue

            link_tag = short_tag.find(
                "a",
                class_="shortsLockupViewModelHostEndpoint shortsLockupViewModelHostOutsideMetadataEndpoint",
            )
            if not link_tag:
                continue
            short_href = link_tag.get("href")
            if not isinstance(short_href, str):
                continue
            short_id = short_href.split("shorts/")[-1]

            shorts.append(
                Short(
                    id=short_id,
                    url=f"https://www.youtube.com/shorts/{short_id}",
                    title=title_tag.get_text(strip=True),
                    views=views_tag.get_text(strip=True),
                )
            )

        return shorts

    def _parse_search_results(self, soup: BeautifulSoup) -> YouTubeSearchResponse:
        """
        Collect every result kind from a parsed search results page.

        Args:
            soup (BeautifulSoup): Parsed search results page.

        Returns:
            YouTubeSearchResponse: Videos, channels, posts and shorts found.
        """
        return YouTubeSearchResponse(
            videos=self._parse_videos(soup),
            channels=self._parse_channels(soup),
            posts=self._parse_posts(soup),
            shorts=self._parse_shorts(soup),
        )

    @override
    async def parse(self, raw: str) -> YouTubeSearchResponse:
        """
        Turn search results HTML into a structured response.

        Args:
            raw (str): HTML returned by ``fetch``.

        Returns:
            YouTubeSearchResponse: The parsed search results.
        """
        return self._parse_search_results(BeautifulSoup(raw, "html.parser"))

    @override
    async def fetch(
        self, context: BrowserContext, request: YouTubeSearchRequest
    ) -> str:
        """
        Load the YouTube results page for the query and return its HTML.

        Args:
            context (BrowserContext): The browser context to open the page in.
            request (YouTubeSearchRequest): Carries the search query.

        Returns:
            str: The HTML of the results page, read once the first video
                renderer is present.
        """
        url = (
            f"https://www.youtube.com/results?search_query={quote_plus(request.query)}"
        )
        page = await context.new_page()
        # The page is closed on every exit path so pages do not accumulate in a
        # context that serves several requests.
        try:
            await page.goto(url)
            await page.wait_for_selector("ytd-video-renderer", timeout=10000)
            return await page.content()
        finally:
            await page.close()
@@ -0,0 +1,11 @@
1
+ from webquest.scrapers.youtube_transcript.scraper import (
2
+ YouTubeTranscript,
3
+ YouTubeTranscriptRequest,
4
+ YouTubeTranscriptResponse,
5
+ )
6
+
7
+ __all__ = [
8
+ "YouTubeTranscript",
9
+ "YouTubeTranscriptRequest",
10
+ "YouTubeTranscriptResponse",
11
+ ]
@@ -0,0 +1,9 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class YouTubeTranscriptRequest(BaseModel):
    # Input for the YouTube transcript scraper.

    # Video ID; the scraper places it after ``watch?v=`` in the video URL.
    video_id: str
6
+
7
+
8
class YouTubeTranscriptResponse(BaseModel):
    # Output of the YouTube transcript scraper.

    # Text of all transcript segments joined with single spaces.
    transcript: str
@@ -0,0 +1,84 @@
1
+ import asyncio
2
+ from typing import override
3
+
4
+ from bs4 import BeautifulSoup
5
+ from playwright.async_api import BrowserContext
6
+
7
+ from webquest.scrapers.scraper import Scraper
8
+ from webquest.scrapers.youtube_transcript.schemas import (
9
+ YouTubeTranscriptRequest,
10
+ YouTubeTranscriptResponse,
11
+ )
12
+
13
+
14
class YouTubeTranscript(
    Scraper[YouTubeTranscriptRequest, str, YouTubeTranscriptResponse]
):
    """Scraper to extract the transcript of a YouTube video."""

    request = YouTubeTranscriptRequest
    response = YouTubeTranscriptResponse

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: YouTubeTranscriptRequest,
    ) -> str:
        """
        Open the video page, reveal its transcript and return the page HTML.

        The description box is clicked first, then the "Show transcript"
        button, and the HTML is read once the transcript segment list exists.

        Args:
            context (BrowserContext): The browser context to open the page in.
            request (YouTubeTranscriptRequest): Carries the video ID.

        Returns:
            str: The HTML of the video page with the transcript panel present.

        Raises:
            Exception: If waiting for the transcript button yields no element.
        """
        video_url = f"https://www.youtube.com/watch?v={request.video_id}"

        page = await context.new_page()
        # The page is closed on every exit path so pages do not accumulate in a
        # context that serves several requests.
        try:
            await page.goto(video_url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(1)

            await page.wait_for_selector("div#description", timeout=10000)
            await page.click("div#description")

            # Short pause after the click before looking for the button.
            await asyncio.sleep(0.5)

            transcript_button = await page.wait_for_selector(
                'button[aria-label="Show transcript"]', timeout=10000
            )
            if not transcript_button:
                raise Exception("Transcript button not found")

            await transcript_button.click()

            await page.wait_for_selector(
                "ytd-transcript-segment-list-renderer", timeout=10000
            )

            return await page.content()
        finally:
            await page.close()

    @override
    async def parse(self, raw: str) -> YouTubeTranscriptResponse:
        """
        Extract the transcript text from the video page HTML.

        Args:
            raw (str): HTML returned by ``fetch``.

        Returns:
            YouTubeTranscriptResponse: The segment texts joined with single
                spaces, with surrounding whitespace removed.

        Raises:
            Exception: If the segment list, its container, or the segments
                themselves cannot be found.
        """
        soup = BeautifulSoup(raw, "html.parser")

        # Walk down: segment list renderer -> segments container -> segments.
        segment_renderer = soup.select_one("ytd-transcript-segment-list-renderer")
        if not segment_renderer:
            raise Exception("No transcript segments found")

        segments_container = segment_renderer.select_one("div#segments-container")
        if not segments_container:
            raise Exception("No transcript segments found")

        segments = segments_container.select("ytd-transcript-segment-renderer")
        if not segments:
            raise Exception("No transcript segments found")

        # Segments without a text element contribute nothing.
        text_elements = (
            segment.select_one("yt-formatted-string") for segment in segments
        )
        transcript_segments = [
            element.get_text() for element in text_elements if element
        ]

        return YouTubeTranscriptResponse(
            transcript=" ".join(transcript_segments).strip()
        )
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.3
2
+ Name: webquest
3
+ Version: 0.7.3
4
+ Summary: WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
5
+ Requires-Dist: beautifulsoup4>=4.14.2
6
+ Requires-Dist: hyperbrowser>=0.68.0
7
+ Requires-Dist: openai>=2.6.0
8
+ Requires-Dist: playwright>=1.55.0
9
+ Requires-Dist: pydantic>=2.12.3
10
+ Requires-Python: >=3.12
11
+ Description-Content-Type: text/markdown
12
+
13
+ # WebQuest
14
+
15
+ WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
16
+
17
+ **Scrapers**
18
+
19
+ - **Any Article:** Extracts readable content from arbitrary web articles.
20
+ - **DuckDuckGo Search:** General web search using DuckDuckGo.
21
+ - **Google News Search:** News-focused search via Google News.
22
+ - **YouTube Search:** Search YouTube videos, channels, posts, and shorts.
23
+ - **YouTube Transcript:** Fetch transcripts for YouTube videos.
24
+
25
+ **Browsers**
26
+
27
+ - **Hyperbrowser:** A cloud-based browser service for running Playwright scrapers without managing infrastructure.
28
+
29
+ ## Installation
30
+
31
+ Installing using pip:
32
+
33
+ ```bash
34
+ pip install webquest
35
+ ```
36
+
37
+ Installing using uv:
38
+
39
+ ```bash
40
+ uv add webquest
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ Example usage of the DuckDuckGo Search scraper:
46
+
47
+ ```python
48
+ import asyncio
49
+
50
+ from webquest.browsers import Hyperbrowser
51
+ from webquest.scrapers import DuckDuckGoSearch
52
+
53
+
54
+ async def main() -> None:
55
+ scraper = DuckDuckGoSearch(browser=Hyperbrowser())
56
+
57
+ response = await scraper.run(
58
+ scraper.request(query="Pizza Toppings"),
59
+ )
60
+ print(response.model_dump_json(indent=4))
61
+
62
+
63
+ if __name__ == "__main__":
64
+ asyncio.run(main())
65
+ ```
66
+
67
+ You can also run multiple requests at the same time:
68
+
69
+ ```python
70
+ import asyncio
71
+
72
+ from webquest.browsers import Hyperbrowser
73
+ from webquest.scrapers import DuckDuckGoSearch
74
+
75
+
76
+ async def main() -> None:
77
+ scraper = DuckDuckGoSearch(browser=Hyperbrowser())
78
+
79
+ responses = await scraper.run(
80
+ scraper.request(query="Pizza Toppings"),
81
+ scraper.request(query="AI News"),
82
+ )
83
+ for response in responses:
84
+ print(response.model_dump_json(indent=4))
85
+
86
+
87
+ if __name__ == "__main__":
88
+ asyncio.run(main())
89
+ ```
90
+
91
+ > To use the Hyperbrowser browser, you need to set the `HYPERBROWSER_API_KEY` environment variable.
92
+
93
+ > To use the Any Article scraper, you need to set the `OPENAI_API_KEY` environment variable.
94
+
95
+ ## Disclaimer
96
+
97
+ This tool is for educational and research purposes only. The developers of WebQuest are not responsible for any misuse of this tool. Scraping websites may violate their Terms of Service. Users are solely responsible for ensuring their activities comply with all applicable laws and website policies.
@@ -0,0 +1,26 @@
1
+ webquest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ webquest/browsers/__init__.py,sha256=zx78XH7Ys0xKrMaeV5Pkwx5q37GDagAOBIxted_K2Gk,141
3
+ webquest/browsers/browser.py,sha256=XSojMYWfsUuxZypOi0g9ylhHlN7Fct9RTwrNyBCJIgs,674
4
+ webquest/browsers/hyperbrowser.py,sha256=FRV3tAd6OcMlfnsGNt3vW-j4CEJPgqLi-6U_tUQVMCA,1750
5
+ webquest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ webquest/scrapers/__init__.py,sha256=t7f7oy1R66mIfdw9fWDcpHJ8njHoCgOFUn_N2I145PI,1198
7
+ webquest/scrapers/any_article/__init__.py,sha256=vrAqkwWHrRnnm5jcvSbOZDOzkw_HAJNmcbKbFF93tyg,201
8
+ webquest/scrapers/any_article/schemas.py,sha256=qogFp13pIQy0u3aNHhDSqRJVZpApqCvbCi6tGls5tHw,217
9
+ webquest/scrapers/any_article/scraper.py,sha256=hQBN5SWd1iucZ7-O_Nh1WnzTNUAPC62odwckaF3bKWA,1338
10
+ webquest/scrapers/duckduckgo_search/__init__.py,sha256=9egcEqtLtxQSiyy8lg6e8_2uU4psuxp0nZf7lqhQ1IU,243
11
+ webquest/scrapers/duckduckgo_search/schemas.py,sha256=YT1jJXs2j0_Lc1HHdgfX4N_88-XtrasqTeiqhfl9UpA,245
12
+ webquest/scrapers/duckduckgo_search/scraper.py,sha256=DfNgvgRoCU0-ZbhRSxtBeEbUvoxC1DHADFmeMptqPic,2684
13
+ webquest/scrapers/google_news_search/__init__.py,sha256=-NhQBRay_flLTBtHGwe-KQtY6AdWjw5QnMBOZlzJvBs,244
14
+ webquest/scrapers/google_news_search/schemas.py,sha256=qY0U217bHZwtYvQ_f6FDKzNI_0wT9M-s6z0MtJm3qF4,255
15
+ webquest/scrapers/google_news_search/scraper.py,sha256=2aRvoYszJMWN8CZ3TP-g1GWPsaQJuCaCjX4_oHNOoQc,2307
16
+ webquest/scrapers/openai_parser.py,sha256=meK1nBFkDvF04QUEQC5NCTukH2_Mhs8swyL5EATAzK0,1849
17
+ webquest/scrapers/scraper.py,sha256=jLzHQnvx9EsR-MohBEc1rtBmv9PqxliSHbqtu4vrmsE,3649
18
+ webquest/scrapers/youtube_search/__init__.py,sha256=BvuHhLSZQ9N9pRW6ciKZjIcScecaFqS6BGzgaytwH3o,222
19
+ webquest/scrapers/youtube_search/schemas.py,sha256=Lf4QOObvrYC3sHETS5KKlunci4yTS1GKD177-N9evKY,796
20
+ webquest/scrapers/youtube_search/scraper.py,sha256=bGWlD2BWV9UKFRYptisKW-IP_4AZ5OSoJfbEN8Vq-Dk,10108
21
+ webquest/scrapers/youtube_transcript/__init__.py,sha256=m1LyZcr-PGQqNRNDtWOVsuhIMIV-lG5YIivCyRENTiY,250
22
+ webquest/scrapers/youtube_transcript/schemas.py,sha256=pIrq3XPbXERTTmaJwnUrgyu1LBfD8T-Vn7be088-Ss4,160
23
+ webquest/scrapers/youtube_transcript/scraper.py,sha256=A9JjQhdOQZXAPw5NCzQaRc9yFpgUpRpfR2OdfEvdYYI,2723
24
+ webquest-0.7.3.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
25
+ webquest-0.7.3.dist-info/METADATA,sha256=eYRZeyvC0hOKEaCwsS2-U6BXTki2tfoUiZ5ZPPinY_Q,2778
26
+ webquest-0.7.3.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.8.24
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any