webquest 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. webquest-0.2.0/PKG-INFO +96 -0
  2. webquest-0.2.0/README.md +83 -0
  3. webquest-0.2.0/pyproject.toml +35 -0
  4. webquest-0.2.0/src/webquest/__init__.py +0 -0
  5. webquest-0.2.0/src/webquest/base/__init__.py +8 -0
  6. webquest-0.2.0/src/webquest/base/base_scraper.py +22 -0
  7. webquest-0.2.0/src/webquest/base/openai_parser.py +66 -0
  8. webquest-0.2.0/src/webquest/py.typed +0 -0
  9. webquest-0.2.0/src/webquest/runners/__init__.py +3 -0
  10. webquest-0.2.0/src/webquest/runners/hyperbrowser.py +60 -0
  11. webquest-0.2.0/src/webquest/scrapers/__init__.py +43 -0
  12. webquest-0.2.0/src/webquest/scrapers/any_article/__init__.py +4 -0
  13. webquest-0.2.0/src/webquest/scrapers/any_article/schemas.py +13 -0
  14. webquest-0.2.0/src/webquest/scrapers/any_article/scraper.py +35 -0
  15. webquest-0.2.0/src/webquest/scrapers/duckduckgo_search/__init__.py +7 -0
  16. webquest-0.2.0/src/webquest/scrapers/duckduckgo_search/schemas.py +16 -0
  17. webquest-0.2.0/src/webquest/scrapers/duckduckgo_search/scraper.py +81 -0
  18. webquest-0.2.0/src/webquest/scrapers/google_news_search/__init__.py +7 -0
  19. webquest-0.2.0/src/webquest/scrapers/google_news_search/schemas.py +17 -0
  20. webquest-0.2.0/src/webquest/scrapers/google_news_search/scraper.py +75 -0
  21. webquest-0.2.0/src/webquest/scrapers/youtube_search/__init__.py +7 -0
  22. webquest-0.2.0/src/webquest/scrapers/youtube_search/schemas.py +51 -0
  23. webquest-0.2.0/src/webquest/scrapers/youtube_search/scraper.py +298 -0
  24. webquest-0.2.0/src/webquest/scrapers/youtube_transcript/__init__.py +7 -0
  25. webquest-0.2.0/src/webquest/scrapers/youtube_transcript/schemas.py +9 -0
  26. webquest-0.2.0/src/webquest/scrapers/youtube_transcript/scraper.py +79 -0
@@ -0,0 +1,96 @@
1
+ Metadata-Version: 2.3
2
+ Name: webquest
3
+ Version: 0.2.0
4
+ Summary: WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
5
+ Requires-Dist: beautifulsoup4>=4.14.2
6
+ Requires-Dist: hyperbrowser>=0.68.0
7
+ Requires-Dist: openai>=2.6.0
8
+ Requires-Dist: playwright>=1.55.0
9
+ Requires-Dist: pydantic>=2.12.3
10
+ Requires-Dist: pydantic-settings>=2.11.0
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+
14
+ # WebQuest
15
+
16
+ WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
17
+
18
+ Scrapers:
19
+
20
+ - Any Article
21
+ - DuckDuckGo Search
22
+ - Google News Search
23
+ - YouTube Search
24
+ - YouTube Transcript
25
+
26
+ Runners:
27
+
28
+ - Hyperbrowser
29
+
30
+ ## Installation
31
+
32
+ Installing using pip:
33
+
34
+ ```bash
35
+ pip install webquest
36
+ ```
37
+
38
+ Installing using uv:
39
+
40
+ ```bash
41
+ uv add webquest
42
+ ```
43
+
44
+ ## Usage
45
+
46
+ Example usage of the DuckDuckGo Search scraper:
47
+
48
+ ```python
49
+ import asyncio
50
+
51
+ from webquest.runners import Hyperbrowser
52
+ from webquest.scrapers import DuckDuckGoSearch, DuckDuckGoSearchRequest
53
+
54
+
55
+ async def main() -> None:
56
+ runner = Hyperbrowser()
57
+ scraper = DuckDuckGoSearch()
58
+ response = await runner.run(
59
+ scraper,
60
+ DuckDuckGoSearchRequest(query="Pizza Toppings"),
61
+ )
62
+ print(response.model_dump_json(indent=4))
63
+
64
+
65
+ if __name__ == "__main__":
66
+ asyncio.run(main())
67
+ ```
68
+
69
+ > To use the Hyperbrowser runner, you need to set the `HYPERBROWSER_API_KEY` environment variable.
70
+
71
+ You can also run multiple requests at the same time:
72
+
73
+ ```python
74
+ import asyncio
75
+
76
+ from webquest.runners import Hyperbrowser
77
+ from webquest.scrapers import DuckDuckGoSearch, DuckDuckGoSearchRequest
78
+
79
+
80
+ async def main() -> None:
81
+ runner = Hyperbrowser()
82
+ scraper = DuckDuckGoSearch()
83
+ responses = await runner.run_multiple(
84
+ scraper,
85
+ [
86
+ DuckDuckGoSearchRequest(query="Pizza Toppings"),
87
+ DuckDuckGoSearchRequest(query="AI News"),
88
+ ],
89
+ )
90
+ for response in responses:
91
+ print(response.model_dump_json(indent=4))
92
+
93
+
94
+ if __name__ == "__main__":
95
+ asyncio.run(main())
96
+ ```
@@ -0,0 +1,83 @@
1
+ # WebQuest
2
+
3
+ WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
4
+
5
+ Scrapers:
6
+
7
+ - Any Article
8
+ - DuckDuckGo Search
9
+ - Google News Search
10
+ - YouTube Search
11
+ - YouTube Transcript
12
+
13
+ Runners:
14
+
15
+ - Hyperbrowser
16
+
17
+ ## Installation
18
+
19
+ Installing using pip:
20
+
21
+ ```bash
22
+ pip install webquest
23
+ ```
24
+
25
+ Installing using uv:
26
+
27
+ ```bash
28
+ uv add webquest
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ Example usage of the DuckDuckGo Search scraper:
34
+
35
+ ```python
36
+ import asyncio
37
+
38
+ from webquest.runners import Hyperbrowser
39
+ from webquest.scrapers import DuckDuckGoSearch, DuckDuckGoSearchRequest
40
+
41
+
42
+ async def main() -> None:
43
+ runner = Hyperbrowser()
44
+ scraper = DuckDuckGoSearch()
45
+ response = await runner.run(
46
+ scraper,
47
+ DuckDuckGoSearchRequest(query="Pizza Toppings"),
48
+ )
49
+ print(response.model_dump_json(indent=4))
50
+
51
+
52
+ if __name__ == "__main__":
53
+ asyncio.run(main())
54
+ ```
55
+
56
+ > To use the Hyperbrowser runner, you need to set the `HYPERBROWSER_API_KEY` environment variable.
57
+
58
+ You can also run multiple requests at the same time:
59
+
60
+ ```python
61
+ import asyncio
62
+
63
+ from webquest.runners import Hyperbrowser
64
+ from webquest.scrapers import DuckDuckGoSearch, DuckDuckGoSearchRequest
65
+
66
+
67
+ async def main() -> None:
68
+ runner = Hyperbrowser()
69
+ scraper = DuckDuckGoSearch()
70
+ responses = await runner.run_multiple(
71
+ scraper,
72
+ [
73
+ DuckDuckGoSearchRequest(query="Pizza Toppings"),
74
+ DuckDuckGoSearchRequest(query="AI News"),
75
+ ],
76
+ )
77
+ for response in responses:
78
+ print(response.model_dump_json(indent=4))
79
+
80
+
81
+ if __name__ == "__main__":
82
+ asyncio.run(main())
83
+ ```
@@ -0,0 +1,35 @@
1
+ [project]
2
+ name = "webquest"
3
+ version = "0.2.0"
4
+ description = "WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers."
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "beautifulsoup4>=4.14.2",
9
+ "hyperbrowser>=0.68.0",
10
+ "openai>=2.6.0",
11
+ "playwright>=1.55.0",
12
+ "pydantic>=2.12.3",
13
+ "pydantic-settings>=2.11.0",
14
+ ]
15
+
16
+ [build-system]
17
+ requires = ["uv_build>=0.8.18,<0.9.0"]
18
+ build-backend = "uv_build"
19
+
20
+ [dependency-groups]
21
+ dev = [
22
+ "pytest>=8.4.2",
23
+ "pytest-asyncio>=1.2.0",
24
+ "pytest-mypy>=1.0.1",
25
+ "ruff>=0.14.1",
26
+ ]
27
+
28
+ [tool.pytest.ini_options]
29
+ addopts = "--mypy"
30
+ asyncio_mode = "auto"
31
+ asyncio_default_fixture_loop_scope = "function"
32
+
33
+ [[tool.mypy.overrides]]
34
+ module = ["hyperbrowser.*"]
35
+ ignore_missing_imports = true
File without changes
@@ -0,0 +1,8 @@
1
+ from webquest.base.base_scraper import BaseScraper
2
+ from webquest.base.openai_parser import OpenAIParser, OpenAIParserSettings
3
+
4
+ __all__ = [
5
+ "BaseScraper",
6
+ "OpenAIParser",
7
+ "OpenAIParserSettings",
8
+ ]
@@ -0,0 +1,22 @@
1
from abc import ABC, abstractmethod
from typing import Generic, TypeVar

from playwright.async_api import BrowserContext
from pydantic import BaseModel

TRequest = TypeVar("TRequest", bound=BaseModel)
TRaw = TypeVar("TRaw")
TResponse = TypeVar("TResponse", bound=BaseModel)


class BaseScraper(ABC, Generic[TRequest, TRaw, TResponse]):
    """Abstract two-phase scraper.

    A concrete scraper implements ``fetch`` (drive the browser, return a raw
    payload of type ``TRaw``) and ``parse`` (turn that payload into a pydantic
    response model). ``scrape`` chains the two phases.
    """

    @abstractmethod
    async def fetch(self, context: BrowserContext, request: TRequest) -> TRaw:
        """Retrieve the raw payload for *request* using *context*."""
        ...

    @abstractmethod
    async def parse(self, raw: TRaw) -> TResponse:
        """Convert the raw payload into a structured response model."""
        ...

    async def scrape(self, context: BrowserContext, request: TRequest) -> TResponse:
        """Fetch and then parse in one call."""
        return await self.parse(await self.fetch(context, request))
@@ -0,0 +1,66 @@
1
+ from abc import ABC
2
+ from typing import Generic, Type, TypeVar, override
3
+
4
+ from bs4 import BeautifulSoup
5
+ from openai import AsyncOpenAI
6
+ from pydantic import BaseModel
7
+ from pydantic_settings import BaseSettings, SettingsConfigDict
8
+
9
+ from webquest.base.base_scraper import BaseScraper
10
+
11
+ TRequest = TypeVar("TRequest", bound=BaseModel)
12
+ TResponse = TypeVar("TResponse", bound=BaseModel)
13
+
14
+
15
class OpenAIParserSettings(BaseSettings):
    """Environment-backed configuration for :class:`OpenAIParser`.

    Values are loaded from the process environment and, if present, a local
    ``.env`` file; unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        extra="ignore",
    )

    # API key handed to AsyncOpenAI; None lets the SDK use its own default lookup.
    openai_api_key: str | None = None
21
+
22
+
23
class OpenAIParser(
    Generic[TRequest, TResponse],
    BaseScraper[TRequest, str, TResponse],
    ABC,
):
    """Scraper base class whose ``parse`` step delegates to an OpenAI model.

    Subclasses supply ``fetch`` (returning raw HTML); ``parse`` strips the
    HTML to plain text, trims it to a character budget, and asks the model
    to produce an instance of ``response_type``.
    """

    def __init__(
        self,
        response_type: Type[TResponse],
        openai: AsyncOpenAI | None = None,
        settings: OpenAIParserSettings | None = None,
        model: str = "gpt-5-mini",
        input: str | None = None,
        character_limit: int = 20000,
    ) -> None:
        """Configure the parser.

        Args:
            response_type: Pydantic model the LLM output is parsed into.
            openai: Pre-built client; created from settings when omitted.
            settings: API-key settings; loaded from the environment when omitted.
            model: OpenAI model name used for parsing.
            input: Prompt prefix prepended to the extracted page text.
            character_limit: Maximum number of text characters sent to the model.
        """
        self._response_type = response_type
        self._settings = OpenAIParserSettings() if settings is None else settings
        self._openai = (
            AsyncOpenAI(api_key=self._settings.openai_api_key)
            if openai is None
            else openai
        )
        self._model = model
        self._character_limit = character_limit
        self._input = input or ""

    @override
    async def parse(self, raw: str) -> TResponse:
        """Extract text from *raw* HTML and parse it into the response model.

        Raises:
            ValueError: If the model output could not be parsed.
        """
        text = BeautifulSoup(raw, "html.parser").get_text(separator="\n", strip=True)

        # Over budget: keep a centered window so we drop header/footer noise
        # from both ends rather than truncating only the tail.
        overflow = len(text) - self._character_limit
        if overflow > 0:
            lo = overflow // 2
            text = text[lo : lo + self._character_limit]

        result = await self._openai.responses.parse(
            input=f"{self._input}{text}",
            text_format=self._response_type,
            model=self._model,
            reasoning={"effort": "minimal"},
        )
        if result.output_parsed is None:
            raise ValueError("Failed to parse the response into the desired format.")
        return result.output_parsed
File without changes
@@ -0,0 +1,3 @@
1
+ from webquest.runners.hyperbrowser import Hyperbrowser, HyperbrowserSettings
2
+
3
+ __all__ = ["Hyperbrowser", "HyperbrowserSettings"]
@@ -0,0 +1,60 @@
1
+ import asyncio
2
+ from typing import TypeVar
3
+
4
+ from hyperbrowser import AsyncHyperbrowser
5
+ from playwright.async_api import async_playwright
6
+ from pydantic import BaseModel
7
+ from pydantic_settings import BaseSettings, SettingsConfigDict
8
+
9
+ from webquest.base.base_scraper import BaseScraper
10
+
11
+ TRequest = TypeVar("TRequest", bound=BaseModel)
12
+ TRaw = TypeVar("TRaw")
13
+ TResponse = TypeVar("TResponse", bound=BaseModel)
14
+
15
+
16
class HyperbrowserSettings(BaseSettings):
    """Environment-backed configuration for the :class:`Hyperbrowser` runner.

    Values are loaded from the process environment and, if present, a local
    ``.env`` file; unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        extra="ignore",
    )

    # API key for the Hyperbrowser cloud-browser service.
    hyperbrowser_api_key: str | None = None
22
+
23
+
24
class Hyperbrowser:
    """Runner that executes scrapers inside a remote Hyperbrowser session.

    A single cloud browser session is created per ``run_multiple`` call;
    all requests share its default browser context and are fetched
    concurrently, after which the session is stopped and parsing happens
    locally.
    """

    def __init__(
        self,
        settings: HyperbrowserSettings | None = None,
        hyperbrowser_client: AsyncHyperbrowser | None = None,
    ):
        """Create the runner.

        Args:
            settings: API-key settings; loaded from the environment when omitted.
            hyperbrowser_client: Pre-built client; created from settings when omitted.
        """
        self._settings = settings or HyperbrowserSettings()
        self._hyperbrowser_client = hyperbrowser_client or AsyncHyperbrowser(
            api_key=self._settings.hyperbrowser_api_key,
        )

    async def run_multiple(
        self,
        scraper: BaseScraper[TRequest, TRaw, TResponse],
        requests: list[TRequest],
    ) -> list[TResponse]:
        """Fetch all *requests* concurrently with *scraper*, then parse them.

        Returns the responses in the same order as *requests*.
        """
        session = await self._hyperbrowser_client.sessions.create()
        # Stop the remote session even when a fetch fails; otherwise the
        # cloud browser keeps running (and billing) until it times out.
        try:
            async with async_playwright() as p:
                browser = await p.chromium.connect_over_cdp(session.ws_endpoint)
                context = browser.contexts[0]
                raw_items = await asyncio.gather(
                    *[scraper.fetch(context, request) for request in requests]
                )
        finally:
            await self._hyperbrowser_client.sessions.stop(session.id)

        # Parsing is local (no browser needed), so it runs after the session
        # has been released.
        responses = await asyncio.gather(
            *[scraper.parse(raw_item) for raw_item in raw_items]
        )
        return responses

    async def run(
        self,
        scraper: BaseScraper[TRequest, TRaw, TResponse],
        request: TRequest,
    ) -> TResponse:
        """Convenience wrapper: run a single request and return its response."""
        responses = await self.run_multiple(scraper, [request])
        return responses[0]
@@ -0,0 +1,43 @@
1
+ from webquest.scrapers.any_article import (
2
+ AnyArticle,
3
+ AnyArticleRequest,
4
+ AnyArticleResponse,
5
+ )
6
+ from webquest.scrapers.duckduckgo_search import (
7
+ DuckDuckGoSearch,
8
+ DuckDuckGoSearchRequest,
9
+ DuckDuckGoSearchResponse,
10
+ )
11
+ from webquest.scrapers.google_news_search import (
12
+ GoogleNewsSearch,
13
+ GoogleNewsSearchRequest,
14
+ GoogleNewsSearchResponse,
15
+ )
16
+ from webquest.scrapers.youtube_search import (
17
+ YouTubeSearch,
18
+ YouTubeSearchRequest,
19
+ YouTubeSearchResponse,
20
+ )
21
+ from webquest.scrapers.youtube_transcript import (
22
+ YouTubeTranscript,
23
+ YouTubeTranscriptRequest,
24
+ YouTubeTranscriptResponse,
25
+ )
26
+
27
+ __all__ = [
28
+ "AnyArticle",
29
+ "AnyArticleRequest",
30
+ "AnyArticleResponse",
31
+ "DuckDuckGoSearch",
32
+ "DuckDuckGoSearchRequest",
33
+ "DuckDuckGoSearchResponse",
34
+ "GoogleNewsSearch",
35
+ "GoogleNewsSearchRequest",
36
+ "GoogleNewsSearchResponse",
37
+ "YouTubeSearch",
38
+ "YouTubeSearchRequest",
39
+ "YouTubeSearchResponse",
40
+ "YouTubeTranscript",
41
+ "YouTubeTranscriptRequest",
42
+ "YouTubeTranscriptResponse",
43
+ ]
@@ -0,0 +1,4 @@
1
+ from webquest.scrapers.any_article.schemas import AnyArticleRequest, AnyArticleResponse
2
+ from webquest.scrapers.any_article.scraper import AnyArticle
3
+
4
+ __all__ = ["AnyArticleRequest", "AnyArticleResponse", "AnyArticle"]
@@ -0,0 +1,13 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class AnyArticleRequest(BaseModel):
    """Input for the :class:`AnyArticle` scraper."""

    # URL of the article page to fetch.
    url: str


class AnyArticleResponse(BaseModel):
    """Article fields extracted by the LLM from the fetched page."""

    publisher: str
    title: str
    published_at: str
    authors: list[str]
    content: str
@@ -0,0 +1,35 @@
1
+ from typing import override
2
+
3
+ from openai import AsyncOpenAI
4
+ from playwright.async_api import BrowserContext
5
+
6
+ from webquest.base.openai_parser import OpenAIParser, OpenAIParserSettings
7
+ from webquest.scrapers.any_article.schemas import AnyArticleRequest, AnyArticleResponse
8
+
9
+
10
class AnyArticle(OpenAIParser[AnyArticleRequest, AnyArticleResponse]):
    """Fetch an arbitrary article page and extract it with an OpenAI model."""

    def __init__(
        self,
        openai: AsyncOpenAI | None = None,
        settings: OpenAIParserSettings | None = None,
        model: str = "gpt-5-mini",
    ) -> None:
        """Configure the underlying :class:`OpenAIParser` for article extraction."""
        super().__init__(
            response_type=AnyArticleResponse,
            openai=openai,
            settings=settings,
            model=model,
            input="Parse the following web page and extract the main article:\n\n",
        )

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: AnyArticleRequest,
    ) -> str:
        """Load the article URL and return the rendered page HTML."""
        page = await context.new_page()
        await page.goto(request.url, wait_until="domcontentloaded")
        # Give client-side rendering a moment to fill in the article body.
        await page.wait_for_timeout(3000)
        return await page.content()
@@ -0,0 +1,7 @@
1
+ from webquest.scrapers.duckduckgo_search.schemas import (
2
+ DuckDuckGoSearchRequest,
3
+ DuckDuckGoSearchResponse,
4
+ )
5
+ from webquest.scrapers.duckduckgo_search.scraper import DuckDuckGoSearch
6
+
7
+ __all__ = ["DuckDuckGoSearchRequest", "DuckDuckGoSearchResponse", "DuckDuckGoSearch"]
@@ -0,0 +1,16 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class DuckDuckGoSearchRequest(BaseModel):
    """Input for the :class:`DuckDuckGoSearch` scraper."""

    # Search phrase, URL-encoded by the scraper.
    query: str


class Page(BaseModel):
    """One organic web result."""

    site: str
    url: str
    title: str
    description: str


class DuckDuckGoSearchResponse(BaseModel):
    """All organic results parsed from a DuckDuckGo results page."""

    pages: list[Page]
@@ -0,0 +1,81 @@
1
+ import asyncio
2
+ from typing import override
3
+ from urllib.parse import quote_plus
4
+
5
+ from bs4 import BeautifulSoup
6
+ from playwright.async_api import BrowserContext
7
+
8
+ from webquest.base.base_scraper import BaseScraper
9
+ from webquest.scrapers.duckduckgo_search.schemas import (
10
+ DuckDuckGoSearchRequest,
11
+ DuckDuckGoSearchResponse,
12
+ Page,
13
+ )
14
+
15
+
16
class DuckDuckGoSearch(
    BaseScraper[DuckDuckGoSearchRequest, str, DuckDuckGoSearchResponse]
):
    """Scrape organic results from a DuckDuckGo web search."""

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: DuckDuckGoSearchRequest,
    ) -> str:
        """Load the results page, expand more results, and return the HTML."""
        url = f"https://duckduckgo.com/?origin=funnel_home_website&t=h_&q={quote_plus(request.query)}&ia=web"
        page = await context.new_page()

        await page.goto(url, wait_until="networkidle", timeout=30000)
        await asyncio.sleep(1)

        # Click "More results" once to load a second batch of organic hits.
        await page.wait_for_selector("button#more-results", timeout=15000)
        await page.click("button#more-results")

        await page.wait_for_selector("li[data-layout='organic']", timeout=15000)

        # Scroll to the bottom so lazily-rendered results are in the DOM.
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await asyncio.sleep(4)

        return await page.content()

    @override
    async def parse(self, raw: str) -> DuckDuckGoSearchResponse:
        """Extract every complete organic result from the page HTML.

        Results missing any field are silently skipped.  NOTE(review): the
        class names below are obfuscated/generated and may change when
        DuckDuckGo redeploys its frontend.
        """

        def extract(article) -> Page | None:
            # Each field is mandatory; bail out as soon as one is missing.
            site_node = article.find("p", class_="fOCEb2mA3YZTJXXjpgdS")
            if not site_node:
                return None

            link_node = article.find("a", {"data-testid": "result-title-a"})
            if not link_node:
                return None
            href = link_node.get("href")
            if not isinstance(href, str):
                return None

            title_node = article.find("span", class_="EKtkFWMYpwzMKOYr0GYm")
            if not title_node:
                return None

            desc_node = article.find("span", class_="kY2IgmnCmOGjharHErah")
            if not desc_node:
                return None

            return Page(
                site=site_node.get_text(strip=True),
                url=href,
                title=title_node.get_text(strip=True),
                description=desc_node.get_text(strip=True),
            )

        soup = BeautifulSoup(raw, "html.parser")
        results = soup.find_all("article", {"data-testid": "result"})
        pages = [page for tag in results if (page := extract(tag)) is not None]
        return DuckDuckGoSearchResponse(pages=pages)
@@ -0,0 +1,7 @@
1
+ from webquest.scrapers.google_news_search.schemas import (
2
+ GoogleNewsSearchRequest,
3
+ GoogleNewsSearchResponse,
4
+ )
5
+ from webquest.scrapers.google_news_search.scraper import GoogleNewsSearch
6
+
7
+ __all__ = ["GoogleNewsSearch", "GoogleNewsSearchRequest", "GoogleNewsSearchResponse"]
@@ -0,0 +1,17 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class GoogleNewsSearchRequest(BaseModel):
    """Input for the :class:`GoogleNewsSearch` scraper."""

    # Search phrase, URL-encoded by the scraper.
    query: str
    # Optional locale hint (e.g. "en-US"); None uses Google's default.
    locale: str | None = None


class Article(BaseModel):
    """One news result."""

    site: str
    url: str
    title: str
    published_at: str


class GoogleNewsSearchResponse(BaseModel):
    """All articles parsed from a Google News search results page."""

    articles: list[Article]
@@ -0,0 +1,75 @@
1
+ import asyncio
2
+ from typing import override
3
+ from urllib.parse import quote_plus
4
+
5
+ from bs4 import BeautifulSoup
6
+ from playwright.async_api import BrowserContext
7
+
8
+ from webquest.base.base_scraper import BaseScraper
9
+ from webquest.scrapers.google_news_search.schemas import (
10
+ Article,
11
+ GoogleNewsSearchRequest,
12
+ GoogleNewsSearchResponse,
13
+ )
14
+
15
+
16
class GoogleNewsSearch(
    BaseScraper[GoogleNewsSearchRequest, str, GoogleNewsSearchResponse]
):
    """Scrape article results from a Google News search."""

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: GoogleNewsSearchRequest,
    ) -> str:
        """Load the Google News search page for the query and return its HTML."""
        url = f"https://news.google.com/search?q={quote_plus(request.query)}"
        # Honor the request's locale (previously declared but never used).
        # Google News selects UI language/region via the `hl` parameter.
        if request.locale is not None:
            url += f"&hl={quote_plus(request.locale)}"
        page = await context.new_page()

        await page.goto(url, wait_until="networkidle", timeout=30000)
        await asyncio.sleep(1)

        html = await page.content()

        return html

    @override
    async def parse(self, raw: str) -> GoogleNewsSearchResponse:
        """Extract every complete article entry from the page HTML.

        Entries missing any field are silently skipped.
        """
        soup = BeautifulSoup(raw, "html.parser")
        articles: list[Article] = []

        for article_tag in soup.find_all("article"):
            # One anchor carries both the title text and the relative link.
            link_tag = article_tag.find("a", class_="JtKRv")
            if not link_tag:
                continue
            title = link_tag.get_text().strip()

            url = link_tag.get("href")
            if not isinstance(url, str):
                continue
            # hrefs are relative like "./read/..."; rebase onto the site root.
            url = f"https://news.google.com{url[1:]}"

            site_tag = article_tag.find("div", class_="vr1PYe")
            if not site_tag:
                continue
            site = site_tag.get_text().strip()

            published_at_tag = article_tag.find("time")
            if not published_at_tag:
                continue
            published_at = published_at_tag.get_text().strip()

            articles.append(
                Article(
                    site=site,
                    url=url,
                    title=title,
                    published_at=published_at,
                )
            )

        return GoogleNewsSearchResponse(articles=articles)
@@ -0,0 +1,7 @@
1
+ from webquest.scrapers.youtube_search.schemas import (
2
+ YouTubeSearchRequest,
3
+ YouTubeSearchResponse,
4
+ )
5
+ from webquest.scrapers.youtube_search.scraper import YouTubeSearch
6
+
7
+ __all__ = ["YouTubeSearchRequest", "YouTubeSearchResponse", "YouTubeSearch"]
@@ -0,0 +1,51 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class Video(BaseModel):
    """A regular video result from a YouTube search."""

    id: str
    url: str
    title: str
    description: str
    # Human-readable strings as rendered by YouTube (e.g. "2 days ago", "1.2M views").
    published_at: str
    views: str
    channel_id: str
    channel_url: str
    channel_name: str


class Channel(BaseModel):
    """A channel result from a YouTube search."""

    id: str
    url: str
    name: str
    # None when the channel has an empty description.
    description: str | None
    subscribers: str


class Post(BaseModel):
    """A community post result from a YouTube search."""

    id: str
    url: str
    content: str
    published_at: str
    channel_id: str
    channel_url: str
    channel_name: str
    # Rendered counts as strings, matching what the page shows.
    comments: str
    likes: str


class Short(BaseModel):
    """A Shorts result from a YouTube search."""

    id: str
    url: str
    title: str
    views: str


class YouTubeSearchRequest(BaseModel):
    """Input for the :class:`YouTubeSearch` scraper."""

    query: str


class YouTubeSearchResponse(BaseModel):
    """All result types parsed from a YouTube search results page."""

    videos: list[Video]
    channels: list[Channel]
    posts: list[Post]
    shorts: list[Short]
@@ -0,0 +1,298 @@
1
+ from typing import override
2
+ from urllib.parse import quote_plus
3
+
4
+ from bs4 import BeautifulSoup
5
+ from playwright.async_api import BrowserContext
6
+
7
+ from webquest.base.base_scraper import BaseScraper
8
+ from webquest.scrapers.youtube_search.schemas import (
9
+ Channel,
10
+ Post,
11
+ Short,
12
+ Video,
13
+ YouTubeSearchRequest,
14
+ YouTubeSearchResponse,
15
+ )
16
+
17
+
18
class YouTubeSearch(BaseScraper[YouTubeSearchRequest, str, YouTubeSearchResponse]):
    """Scrape videos, channels, community posts, and shorts from a YouTube search.

    Parsing relies on YouTube's current custom-element names and generated
    class names; incomplete result tiles are silently skipped.
    """

    def _parse_videos(self, soup: BeautifulSoup) -> list[Video]:
        """Extract regular video results; dedupes and keeps only valid 11-char IDs."""
        videos: list[Video] = []
        video_tags = soup.find_all("ytd-video-renderer")

        for video_tag in video_tags:
            title_tag = video_tag.find(
                "h3",
                class_="title-and-badge style-scope ytd-video-renderer",
            )
            if not title_tag:
                continue
            title = title_tag.get_text(strip=True)

            # BUGFIX: the original unconditional 2-tuple unpack raised
            # ValueError (aborting the whole parse) whenever a tile did not
            # have exactly two metadata spans (e.g. live streams / premieres).
            meta_tags = video_tag.find_all(
                "span",
                class_="inline-metadata-item style-scope ytd-video-meta-block",
            )
            if len(meta_tags) != 2:
                continue
            views_tag, published_at_tag = meta_tags
            views = views_tag.get_text(strip=True)
            published_at = published_at_tag.get_text(strip=True)

            description_tag = video_tag.find(
                "yt-formatted-string",
                class_="metadata-snippet-text style-scope ytd-video-renderer",
            )
            if not description_tag:
                continue
            description = description_tag.get_text(strip=True)

            # A single anchor carries both the channel name and its handle href
            # (the original code performed the identical find twice).
            channel_tag = video_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope yt-formatted-string",
            )
            if not channel_tag:
                continue
            channel_name = channel_tag.get_text(strip=True)
            channel_id = channel_tag.get("href")
            if not isinstance(channel_id, str):
                continue
            channel_id = channel_id[1:]

            channel_url = f"https://www.youtube.com/{channel_id}"

            video_id_tag = video_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope ytd-video-renderer",
            )
            if not video_id_tag:
                continue
            video_id = video_id_tag.get("href")
            if not isinstance(video_id, str):
                continue
            video_id = video_id.split("v=")[-1].split("&")[0]

            video_url = f"https://www.youtube.com/watch?v={video_id}"

            video = Video(
                id=video_id,
                url=video_url,
                title=title,
                description=description,
                published_at=published_at,
                views=views,
                channel_id=channel_id,
                channel_url=channel_url,
                channel_name=channel_name,
            )
            videos.append(video)

        # Valid YouTube video IDs are exactly 11 characters.
        videos = [video for video in videos if len(video.id) == 11]

        # Deduplicate by ID, keeping the last occurrence (dict overwrite order).
        unique_videos = {video.id: video for video in videos}
        videos = list(unique_videos.values())

        return videos

    def _parse_channels(self, soup: BeautifulSoup) -> list[Channel]:
        """Extract channel results from the search page."""
        channels: list[Channel] = []
        channel_tags = soup.find_all("ytd-channel-renderer")
        for channel_tag in channel_tags:
            channel_name_tag = channel_tag.find(
                "yt-formatted-string",
                class_="style-scope ytd-channel-name",
            )
            if not channel_name_tag:
                continue
            channel_name = channel_name_tag.get_text(strip=True)

            description_tag = channel_tag.find("yt-formatted-string", id="description")
            if not description_tag:
                continue
            description: str | None = description_tag.get_text(strip=True)
            if description == "":
                description = None

            # NOTE(review): the id comes from the element with id="subscribers"
            # and the subscriber count from id="video-count". On the live page
            # the "subscribers" element holds the @handle, so this appears
            # intentional, but the naming is worth confirming against real HTML.
            channel_id_tag = channel_tag.find("yt-formatted-string", id="subscribers")
            if not channel_id_tag:
                continue
            channel_id = channel_id_tag.get_text(strip=True)

            channel_url = f"https://www.youtube.com/{channel_id}"

            subscribers_tag = channel_tag.find("span", id="video-count")
            if not subscribers_tag:
                continue
            subscribers = subscribers_tag.get_text(strip=True)

            channel = Channel(
                id=channel_id,
                url=channel_url,
                name=channel_name,
                description=description,
                subscribers=subscribers,
            )
            channels.append(channel)
        return channels

    def _parse_posts(self, soup: BeautifulSoup) -> list[Post]:
        """Extract community post results from the search page."""
        posts: list[Post] = []
        post_tags = soup.find_all("ytd-post-renderer")
        for post_tag in post_tags:
            content_tag = post_tag.find("div", id="content")
            if not content_tag:
                continue
            content = content_tag.get_text(strip=True)

            channel_name_tag = post_tag.find("div", id="author")
            if not channel_name_tag:
                continue
            channel_name = channel_name_tag.get_text(strip=True)

            published_at_tag = post_tag.find(
                "yt-formatted-string",
                id="published-time-text",
            )
            if not published_at_tag:
                continue
            published_at = published_at_tag.get_text(strip=True)

            channel_id_tag = post_tag.find("a", id="author-text")
            if not channel_id_tag:
                continue
            channel_id = channel_id_tag.get("href")
            if not isinstance(channel_id, str):
                continue
            channel_id = channel_id[1:]

            channel_url = f"https://www.youtube.com/{channel_id}"

            post_id_tag = post_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope yt-formatted-string",
            )
            if not post_id_tag:
                continue
            post_id = post_id_tag.get("href")
            if not isinstance(post_id, str):
                continue
            post_id = post_id.split("/post/")[-1]

            post_url = f"https://www.youtube.com/post/{post_id}"

            likes_tag = post_tag.find("span", id="vote-count-middle")
            if not likes_tag:
                continue
            likes = likes_tag.get_text(strip=True)

            comments_tag = post_tag.find(
                "div",
                class_="yt-spec-button-shape-next__button-text-content",
            )
            if not comments_tag:
                continue
            comments = comments_tag.get_text(strip=True)

            post = Post(
                id=post_id,
                url=post_url,
                content=content,
                published_at=published_at,
                channel_id=channel_id,
                channel_url=channel_url,
                channel_name=channel_name,
                comments=comments,
                likes=likes,
            )
            posts.append(post)

        return posts

    def _parse_shorts(self, soup: BeautifulSoup) -> list[Short]:
        """Extract Shorts results from the search page."""
        shorts: list[Short] = []
        short_tags = soup.find_all("ytm-shorts-lockup-view-model-v2")
        for short_tag in short_tags:
            title_tag = short_tag.find("h3", role="presentation")
            if not title_tag:
                continue
            title = title_tag.get_text(strip=True)

            views_tag = short_tag.find(
                "div",
                class_="shortsLockupViewModelHostOutsideMetadataSubhead shortsLockupViewModelHostMetadataSubhead",
            )
            if not views_tag:
                continue
            views = views_tag.get_text(strip=True)

            short_id_tag = short_tag.find(
                "a",
                class_="shortsLockupViewModelHostEndpoint shortsLockupViewModelHostOutsideMetadataEndpoint",
            )
            if not short_id_tag:
                continue
            short_id = short_id_tag.get("href")
            if not isinstance(short_id, str):
                continue
            short_id = short_id.split("shorts/")[-1]

            short_url = f"https://www.youtube.com/shorts/{short_id}"

            short = Short(
                id=short_id,
                url=short_url,
                title=title,
                views=views,
            )
            shorts.append(short)
        return shorts

    def _parse_search_results(self, soup: BeautifulSoup) -> YouTubeSearchResponse:
        """Combine all per-type parsers into one response."""
        return YouTubeSearchResponse(
            videos=self._parse_videos(soup),
            channels=self._parse_channels(soup),
            posts=self._parse_posts(soup),
            shorts=self._parse_shorts(soup),
        )

    @override
    async def parse(self, raw: str) -> YouTubeSearchResponse:
        """Parse the raw search-results HTML into structured results."""
        soup = BeautifulSoup(raw, "html.parser")
        result = self._parse_search_results(soup)
        return result

    @override
    async def fetch(
        self, context: BrowserContext, request: YouTubeSearchRequest
    ) -> str:
        """Load the YouTube search results page and return its HTML."""
        url = (
            f"https://www.youtube.com/results?search_query={quote_plus(request.query)}"
        )
        page = await context.new_page()
        await page.goto(url)
        # Wait until at least one video tile has rendered before grabbing HTML.
        await page.wait_for_selector("ytd-video-renderer", timeout=10000)
        html = await page.content()
        return html
@@ -0,0 +1,7 @@
1
+ from webquest.scrapers.youtube_transcript.schemas import (
2
+ YouTubeTranscriptRequest,
3
+ YouTubeTranscriptResponse,
4
+ )
5
+ from webquest.scrapers.youtube_transcript.scraper import YouTubeTranscript
6
+
7
+ __all__ = ["YouTubeTranscriptRequest", "YouTubeTranscriptResponse", "YouTubeTranscript"]
@@ -0,0 +1,9 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class YouTubeTranscriptRequest(BaseModel):
    """Input for the :class:`YouTubeTranscript` scraper."""

    # 11-character YouTube video ID (the `v=` query parameter).
    video_id: str


class YouTubeTranscriptResponse(BaseModel):
    """Full transcript text, segments joined with single spaces."""

    transcript: str
@@ -0,0 +1,79 @@
1
+ import asyncio
2
+ from typing import override
3
+
4
+ from bs4 import BeautifulSoup
5
+ from playwright.async_api import BrowserContext
6
+
7
+ from webquest.base.base_scraper import BaseScraper
8
+ from webquest.scrapers.youtube_transcript.schemas import (
9
+ YouTubeTranscriptRequest,
10
+ YouTubeTranscriptResponse,
11
+ )
12
+
13
+
14
class YouTubeTranscript(
    BaseScraper[YouTubeTranscriptRequest, str, YouTubeTranscriptResponse]
):
    """Scrape a video's transcript by driving YouTube's transcript panel."""

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: YouTubeTranscriptRequest,
    ) -> str:
        """Open the watch page, reveal the transcript panel, return the HTML.

        Raises:
            Exception: If the "Show transcript" button cannot be found.
        """
        video_url = f"https://www.youtube.com/watch?v={request.video_id}"

        page = await context.new_page()

        await page.goto(video_url, wait_until="networkidle", timeout=30000)
        await asyncio.sleep(1)

        # Expand the description box — the transcript button lives inside it.
        await page.wait_for_selector("div#description", timeout=10000)
        await page.click("div#description")

        await asyncio.sleep(0.5)

        transcript_button = await page.wait_for_selector(
            'button[aria-label="Show transcript"]', timeout=10000
        )
        if not transcript_button:
            raise Exception("Transcript button not found")

        await transcript_button.click()

        # Wait for the transcript segment list to render before capturing.
        await page.wait_for_selector(
            "ytd-transcript-segment-list-renderer", timeout=10000
        )

        return await page.content()

    @override
    async def parse(self, raw: str) -> YouTubeTranscriptResponse:
        """Join every transcript segment's text into one space-separated string.

        Raises:
            Exception: If no transcript segments are present in the HTML.
        """
        soup = BeautifulSoup(raw, "html.parser")

        # Descendant selector covers the renderer → container → segment chain;
        # any break in that chain yields no matches.
        segments = soup.select(
            "ytd-transcript-segment-list-renderer "
            "div#segments-container "
            "ytd-transcript-segment-renderer"
        )
        if not segments:
            raise Exception("No transcript segments found")

        pieces = [
            text_node.get_text()
            for segment in segments
            if (text_node := segment.select_one("yt-formatted-string")) is not None
        ]

        return YouTubeTranscriptResponse(transcript=" ".join(pieces).strip())
+ return result