webquest 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webquest-0.2.0/PKG-INFO +96 -0
- webquest-0.2.0/README.md +83 -0
- webquest-0.2.0/pyproject.toml +35 -0
- webquest-0.2.0/src/webquest/__init__.py +0 -0
- webquest-0.2.0/src/webquest/base/__init__.py +8 -0
- webquest-0.2.0/src/webquest/base/base_scraper.py +22 -0
- webquest-0.2.0/src/webquest/base/openai_parser.py +66 -0
- webquest-0.2.0/src/webquest/py.typed +0 -0
- webquest-0.2.0/src/webquest/runners/__init__.py +3 -0
- webquest-0.2.0/src/webquest/runners/hyperbrowser.py +60 -0
- webquest-0.2.0/src/webquest/scrapers/__init__.py +43 -0
- webquest-0.2.0/src/webquest/scrapers/any_article/__init__.py +4 -0
- webquest-0.2.0/src/webquest/scrapers/any_article/schemas.py +13 -0
- webquest-0.2.0/src/webquest/scrapers/any_article/scraper.py +35 -0
- webquest-0.2.0/src/webquest/scrapers/duckduckgo_search/__init__.py +7 -0
- webquest-0.2.0/src/webquest/scrapers/duckduckgo_search/schemas.py +16 -0
- webquest-0.2.0/src/webquest/scrapers/duckduckgo_search/scraper.py +81 -0
- webquest-0.2.0/src/webquest/scrapers/google_news_search/__init__.py +7 -0
- webquest-0.2.0/src/webquest/scrapers/google_news_search/schemas.py +17 -0
- webquest-0.2.0/src/webquest/scrapers/google_news_search/scraper.py +75 -0
- webquest-0.2.0/src/webquest/scrapers/youtube_search/__init__.py +7 -0
- webquest-0.2.0/src/webquest/scrapers/youtube_search/schemas.py +51 -0
- webquest-0.2.0/src/webquest/scrapers/youtube_search/scraper.py +298 -0
- webquest-0.2.0/src/webquest/scrapers/youtube_transcript/__init__.py +7 -0
- webquest-0.2.0/src/webquest/scrapers/youtube_transcript/schemas.py +9 -0
- webquest-0.2.0/src/webquest/scrapers/youtube_transcript/scraper.py +79 -0
webquest-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: webquest
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
|
|
5
|
+
Requires-Dist: beautifulsoup4>=4.14.2
|
|
6
|
+
Requires-Dist: hyperbrowser>=0.68.0
|
|
7
|
+
Requires-Dist: openai>=2.6.0
|
|
8
|
+
Requires-Dist: playwright>=1.55.0
|
|
9
|
+
Requires-Dist: pydantic>=2.12.3
|
|
10
|
+
Requires-Dist: pydantic-settings>=2.11.0
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# WebQuest
|
|
15
|
+
|
|
16
|
+
WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
|
|
17
|
+
|
|
18
|
+
Scrapers:
|
|
19
|
+
|
|
20
|
+
- Any Article
|
|
21
|
+
- DuckDuckGo Search
|
|
22
|
+
- Google News Search
|
|
23
|
+
- YouTube Search
|
|
24
|
+
- YouTube Transcript
|
|
25
|
+
|
|
26
|
+
Runners:
|
|
27
|
+
|
|
28
|
+
- Hyperbrowser
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
Installing using pip:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install webquest
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Installing using uv:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
uv add webquest
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Usage
|
|
45
|
+
|
|
46
|
+
Example usage of the DuckDuckGo Search scraper:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
import asyncio
|
|
50
|
+
|
|
51
|
+
from webquest.runners import Hyperbrowser
|
|
52
|
+
from webquest.scrapers import DuckDuckGoSearch, DuckDuckGoSearchRequest
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def main() -> None:
|
|
56
|
+
runner = Hyperbrowser()
|
|
57
|
+
scraper = DuckDuckGoSearch()
|
|
58
|
+
response = await runner.run(
|
|
59
|
+
scraper,
|
|
60
|
+
DuckDuckGoSearchRequest(query="Pizza Toppings"),
|
|
61
|
+
)
|
|
62
|
+
print(response.model_dump_json(indent=4))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
|
|
66
|
+
asyncio.run(main())
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
> To use the Hyperbrowser runner, you need to set the `HYPERBROWSER_API_KEY` environment variable.
|
|
70
|
+
|
|
71
|
+
You can also run multiple requests at the same time:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
import asyncio
|
|
75
|
+
|
|
76
|
+
from webquest.runners import Hyperbrowser
|
|
77
|
+
from webquest.scrapers import DuckDuckGoSearch, DuckDuckGoSearchRequest
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def main() -> None:
|
|
81
|
+
runner = Hyperbrowser()
|
|
82
|
+
scraper = DuckDuckGoSearch()
|
|
83
|
+
responses = await runner.run_multiple(
|
|
84
|
+
scraper,
|
|
85
|
+
[
|
|
86
|
+
DuckDuckGoSearchRequest(query="Pizza Toppings"),
|
|
87
|
+
DuckDuckGoSearchRequest(query="AI News"),
|
|
88
|
+
],
|
|
89
|
+
)
|
|
90
|
+
for response in responses:
|
|
91
|
+
print(response.model_dump_json(indent=4))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
if __name__ == "__main__":
|
|
95
|
+
asyncio.run(main())
|
|
96
|
+
```
|
webquest-0.2.0/README.md
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# WebQuest
|
|
2
|
+
|
|
3
|
+
WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers.
|
|
4
|
+
|
|
5
|
+
Scrapers:
|
|
6
|
+
|
|
7
|
+
- Any Article
|
|
8
|
+
- DuckDuckGo Search
|
|
9
|
+
- Google News Search
|
|
10
|
+
- YouTube Search
|
|
11
|
+
- YouTube Transcript
|
|
12
|
+
|
|
13
|
+
Runners:
|
|
14
|
+
|
|
15
|
+
- Hyperbrowser
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
Installing using pip:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install webquest
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Installing using uv:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv add webquest
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
Example usage of the DuckDuckGo Search scraper:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
import asyncio
|
|
37
|
+
|
|
38
|
+
from webquest.runners import Hyperbrowser
|
|
39
|
+
from webquest.scrapers import DuckDuckGoSearch, DuckDuckGoSearchRequest
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
async def main() -> None:
|
|
43
|
+
runner = Hyperbrowser()
|
|
44
|
+
scraper = DuckDuckGoSearch()
|
|
45
|
+
response = await runner.run(
|
|
46
|
+
scraper,
|
|
47
|
+
DuckDuckGoSearchRequest(query="Pizza Toppings"),
|
|
48
|
+
)
|
|
49
|
+
print(response.model_dump_json(indent=4))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == "__main__":
|
|
53
|
+
asyncio.run(main())
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
> To use the Hyperbrowser runner, you need to set the `HYPERBROWSER_API_KEY` environment variable.
|
|
57
|
+
|
|
58
|
+
You can also run multiple requests at the same time:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
import asyncio
|
|
62
|
+
|
|
63
|
+
from webquest.runners import Hyperbrowser
|
|
64
|
+
from webquest.scrapers import DuckDuckGoSearch, DuckDuckGoSearchRequest
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def main() -> None:
|
|
68
|
+
runner = Hyperbrowser()
|
|
69
|
+
scraper = DuckDuckGoSearch()
|
|
70
|
+
responses = await runner.run_multiple(
|
|
71
|
+
scraper,
|
|
72
|
+
[
|
|
73
|
+
DuckDuckGoSearchRequest(query="Pizza Toppings"),
|
|
74
|
+
DuckDuckGoSearchRequest(query="AI News"),
|
|
75
|
+
],
|
|
76
|
+
)
|
|
77
|
+
for response in responses:
|
|
78
|
+
print(response.model_dump_json(indent=4))
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
asyncio.run(main())
|
|
83
|
+
```
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "webquest"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "WebQuest is an extensible Python toolkit for high-level web scraping, built around a generic Playwright-based scraper interface for quickly building, running, and reusing custom scrapers."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"beautifulsoup4>=4.14.2",
|
|
9
|
+
"hyperbrowser>=0.68.0",
|
|
10
|
+
"openai>=2.6.0",
|
|
11
|
+
"playwright>=1.55.0",
|
|
12
|
+
"pydantic>=2.12.3",
|
|
13
|
+
"pydantic-settings>=2.11.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[build-system]
|
|
17
|
+
requires = ["uv_build>=0.8.18,<0.9.0"]
|
|
18
|
+
build-backend = "uv_build"
|
|
19
|
+
|
|
20
|
+
[dependency-groups]
|
|
21
|
+
dev = [
|
|
22
|
+
"pytest>=8.4.2",
|
|
23
|
+
"pytest-asyncio>=1.2.0",
|
|
24
|
+
"pytest-mypy>=1.0.1",
|
|
25
|
+
"ruff>=0.14.1",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[tool.pytest.ini_options]
|
|
29
|
+
addopts = "--mypy"
|
|
30
|
+
asyncio_mode = "auto"
|
|
31
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
32
|
+
|
|
33
|
+
[[tool.mypy.overrides]]
|
|
34
|
+
module = ["hyperbrowser.*"]
|
|
35
|
+
ignore_missing_imports = true
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Generic, TypeVar
|
|
3
|
+
|
|
4
|
+
from playwright.async_api import BrowserContext
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
# Type parameters shared by every scraper:
#   TRequest  - pydantic model describing the scrape request.
#   TRaw      - intermediate payload produced by fetch() (typically raw HTML).
#   TResponse - pydantic model produced by parse().
TRequest = TypeVar("TRequest", bound=BaseModel)
TRaw = TypeVar("TRaw")
TResponse = TypeVar("TResponse", bound=BaseModel)


class BaseScraper(ABC, Generic[TRequest, TRaw, TResponse]):
    """Abstract two-phase scraper: fetch a raw payload, then parse it.

    Subclasses implement ``fetch`` (browser/network work against a Playwright
    ``BrowserContext``) and ``parse`` (pure transformation of the raw payload).
    ``scrape`` chains the two; runners may also call ``fetch`` and ``parse``
    separately, e.g. to batch all browser work before parsing.
    """

    @abstractmethod
    async def fetch(self, context: BrowserContext, request: TRequest) -> TRaw:
        """Retrieve the raw payload for *request* using *context*."""
        ...

    @abstractmethod
    async def parse(self, raw: TRaw) -> TResponse:
        """Transform the raw payload into the structured response model."""
        ...

    async def scrape(self, context: BrowserContext, request: TRequest) -> TResponse:
        """Convenience wrapper: run ``fetch`` then ``parse`` in one call."""
        raw = await self.fetch(context, request)
        response = await self.parse(raw)
        return response
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from typing import Generic, Type, TypeVar, override
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from openai import AsyncOpenAI
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
8
|
+
|
|
9
|
+
from webquest.base.base_scraper import BaseScraper
|
|
10
|
+
|
|
11
|
+
TRequest = TypeVar("TRequest", bound=BaseModel)
|
|
12
|
+
TResponse = TypeVar("TResponse", bound=BaseModel)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class OpenAIParserSettings(BaseSettings):
    """Environment-backed settings for :class:`OpenAIParser`.

    Values are loaded from the process environment and/or a local ``.env``
    file; unknown keys in that file are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        extra="ignore",
    )
    # OpenAI API key; when None, the AsyncOpenAI client receives api_key=None
    # (the SDK's own environment handling then applies).
    openai_api_key: str | None = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class OpenAIParser(
    Generic[TRequest, TResponse],
    BaseScraper[TRequest, str, TResponse],
    ABC,
):
    """Scraper base class that delegates parsing of fetched HTML to OpenAI.

    Subclasses implement ``fetch`` to return raw HTML; ``parse`` strips the
    HTML down to visible text, truncates it to ``character_limit`` characters,
    and asks an OpenAI model (via the Responses API) to extract a structured
    ``TResponse``.
    """

    def __init__(
        self,
        response_type: Type[TResponse],
        openai: AsyncOpenAI | None = None,
        settings: OpenAIParserSettings | None = None,
        model: str = "gpt-5-mini",
        input: str | None = None,
        character_limit: int = 20000,
    ) -> None:
        """Configure the LLM-backed parser.

        Args:
            response_type: Concrete pydantic model the LLM output is parsed into.
            openai: Preconfigured client; built from *settings* when omitted.
            settings: API-key settings; loaded from the environment when omitted.
            model: OpenAI model name used for parsing.
            input: Instruction prefix prepended to the extracted page text.
            character_limit: Maximum number of page-text characters sent to
                the model.
        """
        self._response_type = response_type
        if settings is None:
            settings = OpenAIParserSettings()
        self._settings = settings
        if openai is None:
            openai = AsyncOpenAI(api_key=self._settings.openai_api_key)
        self._openai = openai
        self._model = model
        self._character_limit = character_limit
        # Empty string when no instruction prefix was given, so f-string
        # concatenation in parse() needs no None check.
        self._input = input or ""

    @override
    async def parse(self, raw: str) -> TResponse:
        """Extract a ``TResponse`` from raw HTML via the OpenAI Responses API.

        Raises:
            ValueError: If the model output cannot be parsed into
                ``response_type``.
        """
        soup = BeautifulSoup(raw, "html.parser")
        text = soup.get_text(separator="\n", strip=True)

        # Over-long documents keep only their middle slice: head and tail are
        # dropped symmetrically (NOTE(review): presumably to skip header /
        # footer boilerplate — confirm this matches intent).
        if len(text) > self._character_limit:
            start = (len(text) - self._character_limit) // 2
            end = start + self._character_limit
            text = text[start:end]

        response = await self._openai.responses.parse(
            input=f"{self._input}{text}",
            text_format=self._response_type,
            model=self._model,
            reasoning={"effort": "minimal"},
        )
        if response.output_parsed is None:
            raise ValueError("Failed to parse the response into the desired format.")
        return response.output_parsed
|
|
File without changes
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import TypeVar
|
|
3
|
+
|
|
4
|
+
from hyperbrowser import AsyncHyperbrowser
|
|
5
|
+
from playwright.async_api import async_playwright
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
8
|
+
|
|
9
|
+
from webquest.base.base_scraper import BaseScraper
|
|
10
|
+
|
|
11
|
+
TRequest = TypeVar("TRequest", bound=BaseModel)
|
|
12
|
+
TRaw = TypeVar("TRaw")
|
|
13
|
+
TResponse = TypeVar("TResponse", bound=BaseModel)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class HyperbrowserSettings(BaseSettings):
    """Environment-backed settings for the :class:`Hyperbrowser` runner.

    Values are loaded from the process environment and/or a local ``.env``
    file; unknown keys in that file are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        extra="ignore",
    )
    # Hyperbrowser API key (HYPERBROWSER_API_KEY); passed straight through to
    # AsyncHyperbrowser when no preconfigured client is supplied.
    hyperbrowser_api_key: str | None = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Hyperbrowser:
    """Runner that executes scrapers inside a remote Hyperbrowser session.

    A single cloud browser session is created per ``run_multiple`` call; all
    requests share the session's default browser context and are fetched
    concurrently, after which the session is stopped and parsing happens
    locally.
    """

    def __init__(
        self,
        settings: HyperbrowserSettings | None = None,
        hyperbrowser_client: AsyncHyperbrowser | None = None,
    ):
        """Build the runner.

        Args:
            settings: API-key settings; loaded from the environment when omitted.
            hyperbrowser_client: Preconfigured client; built from *settings*
                when omitted.
        """
        self._settings = settings or HyperbrowserSettings()
        self._hyperbrowser_client = hyperbrowser_client or AsyncHyperbrowser(
            api_key=self._settings.hyperbrowser_api_key,
        )

    async def run_multiple(
        self,
        scraper: BaseScraper[TRequest, TRaw, TResponse],
        requests: list[TRequest],
    ) -> list[TResponse]:
        """Fetch all *requests* concurrently in one session, then parse them.

        Returns responses in the same order as *requests*.

        The session is stopped in a ``finally`` block so a fetch failure (or a
        Playwright connection error) cannot leak a running cloud session —
        previously a raised exception skipped ``sessions.stop`` entirely.
        """
        session = await self._hyperbrowser_client.sessions.create()
        try:
            async with async_playwright() as p:
                browser = await p.chromium.connect_over_cdp(session.ws_endpoint)
                context = browser.contexts[0]
                raw_items = await asyncio.gather(
                    *[scraper.fetch(context, request) for request in requests]
                )
        finally:
            # Always release the remote session, even when fetching fails.
            await self._hyperbrowser_client.sessions.stop(session.id)

        # Parsing is local and needs no browser, so it runs after the session
        # has been stopped.
        responses = await asyncio.gather(
            *[scraper.parse(raw_item) for raw_item in raw_items]
        )
        return responses

    async def run(
        self,
        scraper: BaseScraper[TRequest, TRaw, TResponse],
        request: TRequest,
    ) -> TResponse:
        """Convenience wrapper around :meth:`run_multiple` for one request."""
        responses = await self.run_multiple(scraper, [request])
        return responses[0]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from webquest.scrapers.any_article import (
|
|
2
|
+
AnyArticle,
|
|
3
|
+
AnyArticleRequest,
|
|
4
|
+
AnyArticleResponse,
|
|
5
|
+
)
|
|
6
|
+
from webquest.scrapers.duckduckgo_search import (
|
|
7
|
+
DuckDuckGoSearch,
|
|
8
|
+
DuckDuckGoSearchRequest,
|
|
9
|
+
DuckDuckGoSearchResponse,
|
|
10
|
+
)
|
|
11
|
+
from webquest.scrapers.google_news_search import (
|
|
12
|
+
GoogleNewsSearch,
|
|
13
|
+
GoogleNewsSearchRequest,
|
|
14
|
+
GoogleNewsSearchResponse,
|
|
15
|
+
)
|
|
16
|
+
from webquest.scrapers.youtube_search import (
|
|
17
|
+
YouTubeSearch,
|
|
18
|
+
YouTubeSearchRequest,
|
|
19
|
+
YouTubeSearchResponse,
|
|
20
|
+
)
|
|
21
|
+
from webquest.scrapers.youtube_transcript import (
|
|
22
|
+
YouTubeTranscript,
|
|
23
|
+
YouTubeTranscriptRequest,
|
|
24
|
+
YouTubeTranscriptResponse,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"AnyArticle",
|
|
29
|
+
"AnyArticleRequest",
|
|
30
|
+
"AnyArticleResponse",
|
|
31
|
+
"DuckDuckGoSearch",
|
|
32
|
+
"DuckDuckGoSearchRequest",
|
|
33
|
+
"DuckDuckGoSearchResponse",
|
|
34
|
+
"GoogleNewsSearch",
|
|
35
|
+
"GoogleNewsSearchRequest",
|
|
36
|
+
"GoogleNewsSearchResponse",
|
|
37
|
+
"YouTubeSearch",
|
|
38
|
+
"YouTubeSearchRequest",
|
|
39
|
+
"YouTubeSearchResponse",
|
|
40
|
+
"YouTubeTranscript",
|
|
41
|
+
"YouTubeTranscriptRequest",
|
|
42
|
+
"YouTubeTranscriptResponse",
|
|
43
|
+
]
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from typing import override
|
|
2
|
+
|
|
3
|
+
from openai import AsyncOpenAI
|
|
4
|
+
from playwright.async_api import BrowserContext
|
|
5
|
+
|
|
6
|
+
from webquest.base.openai_parser import OpenAIParser, OpenAIParserSettings
|
|
7
|
+
from webquest.scrapers.any_article.schemas import AnyArticleRequest, AnyArticleResponse
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AnyArticle(OpenAIParser[AnyArticleRequest, AnyArticleResponse]):
    """Scraper that loads an arbitrary web page and extracts its main article
    using an OpenAI model (see :class:`OpenAIParser` for the parse step)."""

    def __init__(
        self,
        openai: AsyncOpenAI | None = None,
        settings: OpenAIParserSettings | None = None,
        model: str = "gpt-5-mini",
    ) -> None:
        """Configure the article scraper; arguments mirror OpenAIParser."""
        super().__init__(
            response_type=AnyArticleResponse,
            openai=openai,
            settings=settings,
            model=model,
            input="Parse the following web page and extract the main article:\n\n",
        )

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: AnyArticleRequest,
    ) -> str:
        """Load ``request.url`` and return the rendered page HTML.

        The page is closed in a ``finally`` block so repeated fetches against
        a shared browser context do not accumulate open tabs — previously the
        page was never closed.
        """
        page = await context.new_page()
        try:
            await page.goto(request.url, wait_until="domcontentloaded")
            # Give client-side rendering a moment to settle before snapshotting.
            await page.wait_for_timeout(3000)
            return await page.content()
        finally:
            await page.close()
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
from webquest.scrapers.duckduckgo_search.schemas import (
|
|
2
|
+
DuckDuckGoSearchRequest,
|
|
3
|
+
DuckDuckGoSearchResponse,
|
|
4
|
+
)
|
|
5
|
+
from webquest.scrapers.duckduckgo_search.scraper import DuckDuckGoSearch
|
|
6
|
+
|
|
7
|
+
__all__ = ["DuckDuckGoSearchRequest", "DuckDuckGoSearchResponse", "DuckDuckGoSearch"]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DuckDuckGoSearchRequest(BaseModel):
    """Input for :class:`DuckDuckGoSearch`."""

    # Free-text query submitted to DuckDuckGo.
    query: str


class Page(BaseModel):
    """One organic web result from a DuckDuckGo results page."""

    # Display name of the result's site, as shown above the title.
    site: str
    # Absolute URL of the result.
    url: str
    # Result title text.
    title: str
    # Snippet text shown under the title.
    description: str


class DuckDuckGoSearchResponse(BaseModel):
    """Output of :class:`DuckDuckGoSearch`: the scraped organic results."""

    pages: list[Page]
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import override
|
|
3
|
+
from urllib.parse import quote_plus
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from playwright.async_api import BrowserContext
|
|
7
|
+
|
|
8
|
+
from webquest.base.base_scraper import BaseScraper
|
|
9
|
+
from webquest.scrapers.duckduckgo_search.schemas import (
|
|
10
|
+
DuckDuckGoSearchRequest,
|
|
11
|
+
DuckDuckGoSearchResponse,
|
|
12
|
+
Page,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DuckDuckGoSearch(
    BaseScraper[DuckDuckGoSearchRequest, str, DuckDuckGoSearchResponse]
):
    """Scraper for DuckDuckGo web search: fetches a results page (including a
    second batch via the "more results" button) and parses the organic hits."""

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: DuckDuckGoSearchRequest,
    ) -> str:
        """Load the DuckDuckGo results page for the query and return its HTML.

        Clicks ``#more-results`` once to load an extra batch of results and
        scrolls to the bottom so lazily rendered entries appear in the DOM.
        The page is closed in a ``finally`` block so a timeout (e.g. the
        selector waits) cannot leak an open tab — previously the page was
        never closed.
        """
        url = f"https://duckduckgo.com/?origin=funnel_home_website&t=h_&q={quote_plus(request.query)}&ia=web"
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(1)

            await page.wait_for_selector("button#more-results", timeout=15000)
            await page.click("button#more-results")

            await page.wait_for_selector("li[data-layout='organic']", timeout=15000)

            # Scroll down and wait so lazily loaded results render.
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(4)

            return await page.content()
        finally:
            await page.close()

    @override
    async def parse(self, raw: str) -> DuckDuckGoSearchResponse:
        """Extract organic results from the fetched HTML.

        Results missing any expected sub-element are skipped silently.
        NOTE(review): the hashed CSS class names (``fOCEb2mA3YZTJXXjpgdS``,
        ``EKtkFWMYpwzMKOYr0GYm``, ``kY2IgmnCmOGjharHErah``) are build
        artifacts of DuckDuckGo's frontend and will break when it redeploys
        — expect to re-derive them periodically.
        """
        soup = BeautifulSoup(raw, "html.parser")
        pages: list[Page] = []

        article_tags = soup.find_all("article", {"data-testid": "result"})

        for article_tag in article_tags:
            site_tag = article_tag.find("p", class_="fOCEb2mA3YZTJXXjpgdS")
            if not site_tag:
                continue
            site = site_tag.get_text(strip=True)

            url_tag = article_tag.find("a", {"data-testid": "result-title-a"})
            if not url_tag:
                continue
            url = url_tag.get("href")
            if not isinstance(url, str):
                continue

            title_tag = article_tag.find("span", class_="EKtkFWMYpwzMKOYr0GYm")
            if not title_tag:
                continue
            title = title_tag.get_text(strip=True)

            description_tag = article_tag.find("span", class_="kY2IgmnCmOGjharHErah")
            if not description_tag:
                continue
            description = description_tag.get_text(strip=True)

            page = Page(
                site=site,
                url=url,
                title=title,
                description=description,
            )
            pages.append(page)

        return DuckDuckGoSearchResponse(pages=pages)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
from webquest.scrapers.google_news_search.schemas import (
|
|
2
|
+
GoogleNewsSearchRequest,
|
|
3
|
+
GoogleNewsSearchResponse,
|
|
4
|
+
)
|
|
5
|
+
from webquest.scrapers.google_news_search.scraper import GoogleNewsSearch
|
|
6
|
+
|
|
7
|
+
__all__ = ["GoogleNewsSearch", "GoogleNewsSearchRequest", "GoogleNewsSearchResponse"]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class GoogleNewsSearchRequest(BaseModel):
    """Input for :class:`GoogleNewsSearch`."""

    # Free-text query submitted to Google News.
    query: str
    # Optional locale hint. NOTE(review): the scraper's URL builder does not
    # currently use this field — confirm whether it should map to hl/gl
    # parameters or be removed.
    locale: str | None = None


class Article(BaseModel):
    """One article entry from a Google News search results page."""

    # Publisher/site name.
    site: str
    # Absolute article URL (rebased onto news.google.com by the scraper).
    url: str
    # Headline text.
    title: str
    # Human-readable publish time as rendered by Google News (e.g. relative
    # strings) — not a machine-parseable timestamp.
    published_at: str


class GoogleNewsSearchResponse(BaseModel):
    """Output of :class:`GoogleNewsSearch`: the scraped article list."""

    articles: list[Article]
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import override
|
|
3
|
+
from urllib.parse import quote_plus
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from playwright.async_api import BrowserContext
|
|
7
|
+
|
|
8
|
+
from webquest.base.base_scraper import BaseScraper
|
|
9
|
+
from webquest.scrapers.google_news_search.schemas import (
|
|
10
|
+
Article,
|
|
11
|
+
GoogleNewsSearchRequest,
|
|
12
|
+
GoogleNewsSearchResponse,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class GoogleNewsSearch(
    BaseScraper[GoogleNewsSearchRequest, str, GoogleNewsSearchResponse]
):
    """Scraper for Google News search: fetches the results page and parses
    each ``<article>`` entry into an :class:`Article`."""

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: GoogleNewsSearchRequest,
    ) -> str:
        """Load the Google News results page for the query and return its HTML.

        The page is closed in a ``finally`` block so navigation timeouts
        cannot leak an open tab — previously the page was never closed.
        NOTE(review): ``request.locale`` is ignored here; wire it into the
        URL (hl/gl) or drop the field.
        """
        url = f"https://news.google.com/search?q={quote_plus(request.query)}"
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(1)
            return await page.content()
        finally:
            await page.close()

    @override
    async def parse(self, raw: str) -> GoogleNewsSearchResponse:
        """Extract articles from the fetched HTML.

        Entries missing any expected sub-element are skipped silently. The
        ``JtKRv``/``vr1PYe`` class names are Google-generated and brittle.
        """
        soup = BeautifulSoup(raw, "html.parser")
        articles: list[Article] = []

        for article_tag in soup.find_all("article"):
            # Title and URL live on the same anchor, so look it up once
            # (the original performed the identical find() twice).
            link_tag = article_tag.find("a", class_="JtKRv")
            if not link_tag:
                continue
            title = link_tag.get_text().strip()

            url = link_tag.get("href")
            if not isinstance(url, str):
                continue

            # hrefs are relative; the leading character is stripped before
            # rebasing onto the news.google.com origin (assumes a "./..."
            # style link — TODO confirm against live markup).
            url = f"https://news.google.com{url[1:]}"

            site_tag = article_tag.find("div", class_="vr1PYe")
            if not site_tag:
                continue
            site = site_tag.get_text().strip()

            published_at_tag = article_tag.find("time")
            if not published_at_tag:
                continue
            published_at = published_at_tag.get_text().strip()

            article = Article(
                site=site,
                url=url,
                title=title,
                published_at=published_at,
            )

            articles.append(article)

        return GoogleNewsSearchResponse(articles=articles)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Video(BaseModel):
    """A standard video result from a YouTube search page."""

    # 11-character YouTube video ID.
    id: str
    # Canonical watch URL built from the ID.
    url: str
    title: str
    # Snippet text shown under the title.
    description: str
    # Human-readable publish time as rendered by YouTube (e.g. relative
    # strings) — not a machine-parseable timestamp.
    published_at: str
    # Human-readable view count string (e.g. "1.2M views").
    views: str
    # Channel handle/path segment taken from the channel link's href.
    channel_id: str
    channel_url: str
    channel_name: str


class Channel(BaseModel):
    """A channel result from a YouTube search page."""

    id: str
    url: str
    name: str
    # None when the channel renders an empty description.
    description: str | None
    # Human-readable subscriber count string.
    subscribers: str


class Post(BaseModel):
    """A community post result from a YouTube search page."""

    id: str
    url: str
    # Post body text.
    content: str
    published_at: str
    channel_id: str
    channel_url: str
    channel_name: str
    # Human-readable comment count string.
    comments: str
    # Human-readable like count string.
    likes: str


class Short(BaseModel):
    """A Shorts result from a YouTube search page."""

    id: str
    url: str
    title: str
    views: str


class YouTubeSearchRequest(BaseModel):
    """Input for the YouTube search scraper."""

    # Free-text query submitted to YouTube search.
    query: str


class YouTubeSearchResponse(BaseModel):
    """Output of the YouTube search scraper, grouped by result type."""

    videos: list[Video]
    channels: list[Channel]
    posts: list[Post]
    shorts: list[Short]
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
from typing import override
|
|
2
|
+
from urllib.parse import quote_plus
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from playwright.async_api import BrowserContext
|
|
6
|
+
|
|
7
|
+
from webquest.base.base_scraper import BaseScraper
|
|
8
|
+
from webquest.scrapers.youtube_search.schemas import (
|
|
9
|
+
Channel,
|
|
10
|
+
Post,
|
|
11
|
+
Short,
|
|
12
|
+
Video,
|
|
13
|
+
YouTubeSearchRequest,
|
|
14
|
+
YouTubeSearchResponse,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class YouTubeSearch(BaseScraper[YouTubeSearchRequest, str, YouTubeSearchResponse]):
|
|
19
|
+
    def _parse_videos(self, soup: BeautifulSoup) -> list[Video]:
        """Extract standard video results (``ytd-video-renderer`` elements).

        Entries missing any expected sub-element are skipped silently.
        Results are then filtered to valid 11-character video IDs and
        de-duplicated by ID. The ``style-scope``-based class selectors mirror
        YouTube's generated markup and are brittle.
        """
        videos: list[Video] = []
        video_tags = soup.find_all("ytd-video-renderer")

        for video_tag in video_tags:
            title_tag = video_tag.find(
                "h3",
                class_="title-and-badge style-scope ytd-video-renderer",
            )
            if not title_tag:
                continue
            title = title_tag.get_text(strip=True)

            # NOTE(review): assumes exactly two inline metadata spans
            # (views, publish date); this unpack raises ValueError if
            # YouTube renders a different count — confirm against live
            # markup and consider guarding.
            views_tag, published_at_tag = video_tag.find_all(
                "span",
                class_="inline-metadata-item style-scope ytd-video-meta-block",
            )
            views = views_tag.get_text(strip=True)
            published_at = published_at_tag.get_text(strip=True)

            description_tag = video_tag.find(
                "yt-formatted-string",
                class_="metadata-snippet-text style-scope ytd-video-renderer",
            )
            if not description_tag:
                continue
            description = description_tag.get_text(strip=True)

            channel_name_tag = video_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope yt-formatted-string",
            )
            if not channel_name_tag:
                continue
            channel_name = channel_name_tag.get_text(strip=True)

            # NOTE(review): identical selector to channel_name_tag above, so
            # this resolves to the same anchor; its href supplies the id.
            channel_id_tag = video_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope yt-formatted-string",
            )
            if not channel_id_tag:
                continue
            channel_id = channel_id_tag.get("href")
            if not isinstance(channel_id, str):
                continue
            # Drop the leading "/" of the href to get the handle/path segment.
            channel_id = channel_id[1:]

            channel_url = f"https://www.youtube.com/{channel_id}"

            video_id_tag = video_tag.find(
                "a",
                class_="yt-simple-endpoint style-scope ytd-video-renderer",
            )
            if not video_id_tag:
                continue
            video_id = video_id_tag.get("href")
            if not isinstance(video_id, str):
                continue
            # Extract the v= query parameter value from the watch URL.
            video_id = video_id.split("v=")[-1].split("&")[0]

            video_url = f"https://www.youtube.com/watch?v={video_id}"

            video = Video(
                id=video_id,
                url=video_url,
                title=title,
                description=description,
                published_at=published_at,
                views=views,
                channel_id=channel_id,
                channel_url=channel_url,
                channel_name=channel_name,
            )
            videos.append(video)

        # Keep only well-formed 11-character video IDs (anything else came
        # from a non-watch href).
        videos = [video for video in videos if len(video.id) == 11]

        # De-duplicate by ID; dict insertion order keeps the results ordered,
        # with the last occurrence of a duplicate winning.
        unique_videos = {video.id: video for video in videos}
        videos = list(unique_videos.values())

        return videos
|
|
100
|
+
|
|
101
|
+
    def _parse_channels(self, soup: BeautifulSoup) -> list[Channel]:
        """Extract channel results (``ytd-channel-renderer`` elements).

        Entries missing any expected sub-element are skipped silently.
        """
        channels: list[Channel] = []
        channel_tags = soup.find_all("ytd-channel-renderer")
        for channel_tag in channel_tags:
            channel_name_tag = channel_tag.find(
                "yt-formatted-string",
                class_="style-scope ytd-channel-name",
            )
            if not channel_name_tag:
                continue
            channel_name = channel_name_tag.get_text(strip=True)

            description_tag = channel_tag.find("yt-formatted-string", id="description")
            if not description_tag:
                continue
            description: str | None = description_tag.get_text(strip=True)
            # Normalize an empty description to None to match the schema.
            if description == "":
                description = None

            # NOTE(review): the element with id="subscribers" is read as the
            # channel id/handle, and id="video-count" as the subscriber
            # string — the ids look inverted relative to their use; confirm
            # against YouTube's live search markup.
            channel_id_tag = channel_tag.find("yt-formatted-string", id="subscribers")
            if not channel_id_tag:
                continue
            channel_id = channel_id_tag.get_text(strip=True)

            channel_url = f"https://www.youtube.com/{channel_id}"

            subscribers_tag = channel_tag.find("span", id="video-count")
            if not subscribers_tag:
                continue
            subscribers = subscribers_tag.get_text(strip=True)

            channel = Channel(
                id=channel_id,
                url=channel_url,
                name=channel_name,
                description=description,
                subscribers=subscribers,
            )
            channels.append(channel)
        return channels
|
|
141
|
+
|
|
142
|
+
def _parse_posts(self, soup: BeautifulSoup) -> list[Post]:
    """Extract community posts from a YouTube search results page.

    Any ``ytd-post-renderer`` missing one of the expected sub-elements
    is skipped rather than raising.
    """
    results: list[Post] = []
    for renderer in soup.find_all("ytd-post-renderer"):
        content_el = renderer.find("div", id="content")
        author_el = renderer.find("div", id="author")
        time_el = renderer.find(
            "yt-formatted-string",
            id="published-time-text",
        )
        if not (content_el and author_el and time_el):
            continue

        author_link = renderer.find("a", id="author-text")
        if not author_link:
            continue
        author_href = author_link.get("href")
        if not isinstance(author_href, str):
            continue
        # Drop the leading "/" from hrefs like "/@handle".
        channel_id = author_href[1:]

        post_link = renderer.find(
            "a",
            class_="yt-simple-endpoint style-scope yt-formatted-string",
        )
        if not post_link:
            continue
        post_href = post_link.get("href")
        if not isinstance(post_href, str):
            continue
        post_id = post_href.split("/post/")[-1]

        likes_el = renderer.find("span", id="vote-count-middle")
        comments_el = renderer.find(
            "div",
            class_="yt-spec-button-shape-next__button-text-content",
        )
        if not (likes_el and comments_el):
            continue

        results.append(
            Post(
                id=post_id,
                url=f"https://www.youtube.com/post/{post_id}",
                content=content_el.get_text(strip=True),
                published_at=time_el.get_text(strip=True),
                channel_id=channel_id,
                channel_url=f"https://www.youtube.com/{channel_id}",
                channel_name=author_el.get_text(strip=True),
                comments=comments_el.get_text(strip=True),
                likes=likes_el.get_text(strip=True),
            )
        )

    return results
|
|
226
|
+
|
|
227
|
+
def _parse_shorts(self, soup: BeautifulSoup) -> list[Short]:
    """Extract Shorts results from a YouTube search results page.

    Any ``ytm-shorts-lockup-view-model-v2`` missing one of the expected
    sub-elements is skipped rather than raising.
    """
    results: list[Short] = []
    for lockup in soup.find_all("ytm-shorts-lockup-view-model-v2"):
        title_el = lockup.find(
            "h3",
            role="presentation",
        )
        views_el = lockup.find(
            "div",
            class_="shortsLockupViewModelHostOutsideMetadataSubhead shortsLockupViewModelHostMetadataSubhead",
        )
        link = lockup.find(
            "a",
            class_="shortsLockupViewModelHostEndpoint shortsLockupViewModelHostOutsideMetadataEndpoint",
        )
        if not (title_el and views_el and link):
            continue

        href = link.get("href")
        if not isinstance(href, str):
            continue
        # hrefs look like "/shorts/<id>"; keep only the id.
        short_id = href.split("shorts/")[-1]

        results.append(
            Short(
                id=short_id,
                url=f"https://www.youtube.com/shorts/{short_id}",
                title=title_el.get_text(strip=True),
                views=views_el.get_text(strip=True),
            )
        )
    return results
|
|
268
|
+
|
|
269
|
+
def _parse_search_results(self, soup: BeautifulSoup) -> YouTubeSearchResponse:
    """Assemble the structured search response from a parsed results page."""
    return YouTubeSearchResponse(
        videos=self._parse_videos(soup),
        channels=self._parse_channels(soup),
        posts=self._parse_posts(soup),
        shorts=self._parse_shorts(soup),
    )
|
|
280
|
+
|
|
281
|
+
@override
async def parse(self, raw: str) -> YouTubeSearchResponse:
    """Parse raw search-page HTML into a structured search response."""
    soup = BeautifulSoup(raw, "html.parser")
    return self._parse_search_results(soup)
|
|
286
|
+
|
|
287
|
+
@override
async def fetch(
    self, context: BrowserContext, request: YouTubeSearchRequest
) -> str:
    """Load the YouTube search results page for *request.query*.

    Args:
        context: Playwright browser context to open the page in.
        request: Search request carrying the query string.

    Returns:
        The full HTML of the results page once at least one
        ``ytd-video-renderer`` has rendered.
    """
    url = (
        f"https://www.youtube.com/results?search_query={quote_plus(request.query)}"
    )
    page = await context.new_page()
    try:
        await page.goto(url)
        # Results render client-side; wait for the first video entry.
        await page.wait_for_selector("ytd-video-renderer", timeout=10000)
        html = await page.content()
        return html
    finally:
        # Close the tab even on failure so repeated calls don't leak pages.
        await page.close()
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
from webquest.scrapers.youtube_transcript.schemas import (
|
|
2
|
+
YouTubeTranscriptRequest,
|
|
3
|
+
YouTubeTranscriptResponse,
|
|
4
|
+
)
|
|
5
|
+
from webquest.scrapers.youtube_transcript.scraper import YouTubeTranscript
|
|
6
|
+
|
|
7
|
+
__all__ = ["YouTubeTranscriptRequest", "YouTubeTranscriptResponse", "YouTubeTranscript"]
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import override
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from playwright.async_api import BrowserContext
|
|
6
|
+
|
|
7
|
+
from webquest.base.base_scraper import BaseScraper
|
|
8
|
+
from webquest.scrapers.youtube_transcript.schemas import (
|
|
9
|
+
YouTubeTranscriptRequest,
|
|
10
|
+
YouTubeTranscriptResponse,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class YouTubeTranscript(
    BaseScraper[YouTubeTranscriptRequest, str, YouTubeTranscriptResponse]
):
    """Scrapes a YouTube video's transcript by driving the watch-page UI."""

    @override
    async def fetch(
        self,
        context: BrowserContext,
        request: YouTubeTranscriptRequest,
    ) -> str:
        """Open the watch page, reveal the transcript panel, and return the HTML.

        Args:
            context: Playwright browser context to open the page in.
            request: Request carrying the target video id.

        Raises:
            Exception: If the "Show transcript" button cannot be found.
        """
        video_url = f"https://www.youtube.com/watch?v={request.video_id}"

        page = await context.new_page()
        try:
            await page.goto(video_url, wait_until="networkidle", timeout=30000)
            # Brief pauses give YouTube's client-side rendering time to settle.
            await asyncio.sleep(1)

            # Expanding the description reveals the "Show transcript" button.
            await page.wait_for_selector("div#description", timeout=10000)
            await page.click("div#description")

            await asyncio.sleep(0.5)

            transcript_button = await page.wait_for_selector(
                'button[aria-label="Show transcript"]', timeout=10000
            )
            if not transcript_button:
                raise Exception("Transcript button not found")

            await transcript_button.click()

            await page.wait_for_selector(
                "ytd-transcript-segment-list-renderer", timeout=10000
            )

            html = await page.content()
            return html
        finally:
            # Close the tab even on failure so repeated calls don't leak pages.
            await page.close()

    @override
    async def parse(self, raw: str) -> YouTubeTranscriptResponse:
        """Extract the transcript text from watch-page HTML.

        Joins the text of every transcript segment with single spaces.

        Raises:
            Exception: If no transcript segments are present in the HTML.
        """
        soup = BeautifulSoup(raw, "html.parser")

        # Find the transcript segment list renderer
        segment_renderer = soup.select_one("ytd-transcript-segment-list-renderer")
        if not segment_renderer:
            raise Exception("No transcript segments found")

        # Find the segments container
        segments_container = segment_renderer.select_one("div#segments-container")
        if not segments_container:
            raise Exception("No transcript segments found")

        # Find all transcript segment renderers
        segments = segments_container.select("ytd-transcript-segment-renderer")
        if not segments:
            raise Exception("No transcript segments found")

        # Extract text from each segment; segments without a text element
        # are silently skipped.
        transcript_segments: list[str] = []
        for segment in segments:
            text_element = segment.select_one("yt-formatted-string")
            if text_element:
                transcript_segments.append(text_element.get_text())

        formatted_transcript = " ".join(transcript_segments).strip()
        result = YouTubeTranscriptResponse(transcript=formatted_transcript)

        return result
|