viewlyt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- viewlyt/__init__.py +119 -0
- viewlyt/api.py +450 -0
- viewlyt/cli.py +873 -0
- viewlyt/driver.py +255 -0
- viewlyt/htmltext.py +444 -0
- viewlyt/live/__init__.py +45 -0
- viewlyt/live/capture.py +134 -0
- viewlyt/live/cli.py +207 -0
- viewlyt/live/llm.py +709 -0
- viewlyt/live/messages.py +137 -0
- viewlyt/live/persistence.py +103 -0
- viewlyt/live/probes.py +382 -0
- viewlyt/live/server.py +1119 -0
- viewlyt/live/static/assets/index-CUxPUbjJ.css +1 -0
- viewlyt/live/static/assets/index-DdFGMUIl.js +59 -0
- viewlyt/live/static/index.html +302 -0
- viewlyt/live/window.py +118 -0
- viewlyt/py.typed +0 -0
- viewlyt/rag.py +827 -0
- viewlyt/scraper.py +1206 -0
- viewlyt/vl.py +52 -0
- viewlyt-0.1.0.dist-info/METADATA +616 -0
- viewlyt-0.1.0.dist-info/RECORD +26 -0
- viewlyt-0.1.0.dist-info/WHEEL +4 -0
- viewlyt-0.1.0.dist-info/entry_points.txt +2 -0
- viewlyt-0.1.0.dist-info/licenses/LICENSE +21 -0
viewlyt/__init__.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""viewlyt — scrape a YouTube video's comments (and transcript) with Selenium.
|
|
2
|
+
|
|
3
|
+
Library quickstart::
|
|
4
|
+
|
|
5
|
+
from viewlyt import scrape_video
|
|
6
|
+
r = scrape_video("https://youtu.be/dQw4w9WgXcQ", transcript=True, related=10)
|
|
7
|
+
print("\\n".join(r.comment_lines())) # same text as the CLI's .md
|
|
8
|
+
r.write("out/") # .md / .transcript.md / .related.md
|
|
9
|
+
|
|
10
|
+
Many videos on ONE reused browser (amortises Chrome startup)::
|
|
11
|
+
|
|
12
|
+
from viewlyt import scrape_videos, Session
|
|
13
|
+
results = scrape_videos(urls, jobs=4) # list aligned to input order
|
|
14
|
+
with Session() as s: # or drive it manually
|
|
15
|
+
a, b = s.scrape(url1), s.scrape(url2)
|
|
16
|
+
|
|
17
|
+
The pure, dependency-free helpers (``html_to_text``, ``format_comment_lines``,
``format_transcript``, ``format_related``, ``format_unified``, ``join_unified``,
``group_consecutive_comments``, ``parse_relative_date``, ``flatten_inline``,
``slugify``) and ``__version__`` are
|
|
20
|
+
importable WITHOUT pulling in Selenium — ``import viewlyt`` stays lightweight. The
|
|
21
|
+
Selenium-backed names (``scrape_video``, ``scrape_videos``, ``Session``,
|
|
22
|
+
``build_driver``, …) are loaded lazily on first access (PEP 562), so they cost
|
|
23
|
+
nothing until you use them.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
from importlib import import_module
|
|
29
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
30
|
+
from typing import TYPE_CHECKING
|
|
31
|
+
|
|
32
|
+
# Pure helpers: stdlib only, safe to import eagerly (no Selenium).
|
|
33
|
+
from .htmltext import (
|
|
34
|
+
flatten_inline,
|
|
35
|
+
format_comment_lines,
|
|
36
|
+
format_related,
|
|
37
|
+
format_transcript,
|
|
38
|
+
format_unified,
|
|
39
|
+
group_consecutive_comments,
|
|
40
|
+
html_to_text,
|
|
41
|
+
join_unified,
|
|
42
|
+
parse_relative_date,
|
|
43
|
+
slugify,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# Version of the installed distribution; a source tree that was never installed
# has no metadata, so fall back to a placeholder.
try:
    __version__ = version("viewlyt")
except PackageNotFoundError:  # pragma: no cover - running from a source tree w/o install
    __version__ = "0.0.0"

# Selenium-backed public names, grouped by the submodule that defines them and
# flattened to ``name -> submodule`` for the lazy ``__getattr__`` lookup.
_LAZY = {
    name: module
    for module, names in (
        (
            "api",
            (
                "scrape_video",
                "scrape_videos",
                "Session",
                "ScrapeResult",
                "Comment",
                "RelatedVideo",
            ),
        ),
        ("driver", ("build_driver",)),
        (
            "scraper",
            (
                "collect_comments",
                "collect_related",
                "fetch_transcript",
                "extract_video_id",
                "BlockedError",
            ),
        ),
    )
    for name in names
}
|
|
66
|
+
|
|
67
|
+
if TYPE_CHECKING: # let type checkers and IDEs see the real symbols
|
|
68
|
+
from .api import Comment, RelatedVideo, ScrapeResult, Session, scrape_video, scrape_videos
|
|
69
|
+
from .driver import build_driver
|
|
70
|
+
from .scraper import (
|
|
71
|
+
BlockedError,
|
|
72
|
+
collect_comments,
|
|
73
|
+
collect_related,
|
|
74
|
+
extract_video_id,
|
|
75
|
+
fetch_transcript,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def __getattr__(name: str) -> object:  # PEP 562 module-level lazy attribute access
    """Resolve a lazily exported name on first access.

    Names listed in ``_LAZY`` are imported from their submodule, stored in the
    module globals (so this hook is not consulted again for that name) and
    returned. Any other name raises :class:`AttributeError`.
    """
    if name not in _LAZY:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    submodule = import_module(f".{_LAZY[name]}", __name__)
    resolved = getattr(submodule, name)
    # Cache on the module so normal attribute lookup finds it next time.
    globals()[name] = resolved
    return resolved
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def __dir__() -> list[str]:
    """Return the names in ``__all__``, sorted, as the module's ``dir()`` listing."""
    public = list(__all__)
    public.sort()
    return public
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Public API. Kept as a plain literal list so static tools can read it; the
# module-level ``__dir__`` also reports exactly these names.
__all__ = [
    # --- resolved lazily through __getattr__: high-level API ---
    "scrape_video",
    "scrape_videos",
    "Session",
    "ScrapeResult",
    "Comment",
    "RelatedVideo",
    # --- resolved lazily through __getattr__: building blocks ---
    "extract_video_id",
    "build_driver",
    "collect_comments",
    "collect_related",
    "fetch_transcript",
    "BlockedError",
    # --- imported eagerly from .htmltext: pure text helpers ---
    "html_to_text",
    "format_comment_lines",
    "format_transcript",
    "format_related",
    "format_unified",
    "join_unified",
    "group_consecutive_comments",
    "parse_relative_date",
    "flatten_inline",
    "slugify",
    "__version__",
]
|
viewlyt/api.py
ADDED
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
"""Programmatic API — use viewlyt as a library.
|
|
2
|
+
|
|
3
|
+
from viewlyt import scrape_video
|
|
4
|
+
|
|
5
|
+
r = scrape_video("https://youtu.be/dQw4w9WgXcQ", transcript=True)
|
|
6
|
+
print(r.title)
|
|
7
|
+
for c in r.comments:
|
|
8
|
+
print(c.author, c.likes, c.date, c.text)
|
|
9
|
+
print("\\n".join(r.transcript_lines()))
|
|
10
|
+
|
|
11
|
+
``scrape_video`` builds and tears down its own headless Chrome and returns
|
|
12
|
+
structured data (nothing is written to disk). For batch use with a reused
|
|
13
|
+
browser-instance pool and file output, see :mod:`viewlyt.cli`. The pure text
|
|
14
|
+
helpers (``html_to_text``, ``format_transcript``, …) live — dependency-free — in
|
|
15
|
+
:mod:`viewlyt.htmltext`.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
import threading
|
|
22
|
+
from collections.abc import Iterable
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from datetime import date
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from queue import Empty, Queue
|
|
27
|
+
|
|
28
|
+
from .driver import build_driver
|
|
29
|
+
from .htmltext import (
|
|
30
|
+
format_comment_lines,
|
|
31
|
+
format_related,
|
|
32
|
+
format_transcript,
|
|
33
|
+
format_unified,
|
|
34
|
+
html_to_text,
|
|
35
|
+
pair_lines,
|
|
36
|
+
slugify,
|
|
37
|
+
strip_timestamps,
|
|
38
|
+
)
|
|
39
|
+
from .scraper import (
|
|
40
|
+
BlockedError,
|
|
41
|
+
collect_comments,
|
|
42
|
+
collect_related,
|
|
43
|
+
detect_block,
|
|
44
|
+
dismiss_consent_dialog,
|
|
45
|
+
extract_video_id,
|
|
46
|
+
fetch_transcript,
|
|
47
|
+
get_video_title,
|
|
48
|
+
prime_consent_cookies,
|
|
49
|
+
safe_get,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
log = logging.getLogger("viewlyt")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass(slots=True)
|
|
56
|
+
class Comment:
|
|
57
|
+
"""A single comment or reply, as ready-to-use plain text."""
|
|
58
|
+
|
|
59
|
+
kind: str # "comment" | "reply"
|
|
60
|
+
author: str # e.g. "@handle" ("" if it couldn't be resolved)
|
|
61
|
+
text: str # plain text (emoji alt + link text kept; <br> -> newline)
|
|
62
|
+
likes: str # YouTube's own count, e.g. "842"/"1.2K"; "0" when hidden
|
|
63
|
+
date: str # relative timestamp as YouTube shows it, e.g. "2 days ago"
|
|
64
|
+
parent_author: str | None = None # set on replies
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass(slots=True)
|
|
68
|
+
class RelatedVideo:
|
|
69
|
+
"""One related video from the watch-page sidebar. ``views`` is YouTube's own
|
|
70
|
+
sidebar text (e.g. "1.2B views"); the sidebar exposes NO likes."""
|
|
71
|
+
|
|
72
|
+
video_id: str
|
|
73
|
+
title: str
|
|
74
|
+
views: str
|
|
75
|
+
url: str
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass(slots=True)
|
|
79
|
+
class ScrapeResult:
|
|
80
|
+
"""Everything scraped for one video. ``transcript`` is ``[(timestamp, text)]``."""
|
|
81
|
+
|
|
82
|
+
video_id: str
|
|
83
|
+
title: str
|
|
84
|
+
comments: list[Comment] = field(default_factory=list)
|
|
85
|
+
transcript: list[tuple[str, str]] = field(default_factory=list)
|
|
86
|
+
related: list[RelatedVideo] = field(default_factory=list)
|
|
87
|
+
# Raw scraper records (with HTML), kept so comment_lines()/write() can reuse the
|
|
88
|
+
# exact CLI merge+format pipeline. Private; hidden from repr.
|
|
89
|
+
_records: list[dict] = field(default_factory=list, repr=False)
|
|
90
|
+
|
|
91
|
+
@property
|
|
92
|
+
def top_level(self) -> list[Comment]:
|
|
93
|
+
return [c for c in self.comments if c.kind == "comment"]
|
|
94
|
+
|
|
95
|
+
@property
|
|
96
|
+
def replies(self) -> list[Comment]:
|
|
97
|
+
return [c for c in self.comments if c.kind == "reply"]
|
|
98
|
+
|
|
99
|
+
def comment_lines(self, *, merge: bool = True, today: date | None = None) -> list[str]:
|
|
100
|
+
"""Comments as the CLI-formatted text block (merged by default) — identical
|
|
101
|
+
to viewlyt's ``out/<slug>-<id>.md`` body (see
|
|
102
|
+
:func:`viewlyt.format_comment_lines`)."""
|
|
103
|
+
return format_comment_lines(self._records, today=today, merge=merge)
|
|
104
|
+
|
|
105
|
+
def transcript_lines(self, *, timestamps: bool = False, pair: bool = True) -> list[str]:
|
|
106
|
+
"""Transcript lines, token-lean by default (CLI parity): ``[m:ss]``
|
|
107
|
+
timestamps stripped (``timestamps=True`` keeps them; ``h:mm:ss`` on long
|
|
108
|
+
videos always stays) and every 2 segments joined into one line
|
|
109
|
+
(``pair=False`` for one segment per line). ``timestamps=True, pair=False``
|
|
110
|
+
gives the verbatim :func:`viewlyt.format_transcript` output."""
|
|
111
|
+
lines = format_transcript(self.transcript)
|
|
112
|
+
if not timestamps:
|
|
113
|
+
lines = strip_timestamps(lines)
|
|
114
|
+
return pair_lines(lines) if pair else lines
|
|
115
|
+
|
|
116
|
+
def related_lines(self) -> list[str]:
|
|
117
|
+
"""Related videos as a numbered Markdown list (see :func:`viewlyt.format_related`)."""
|
|
118
|
+
return format_related(
|
|
119
|
+
[{"title": r.title, "views": r.views, "url": r.url} for r in self.related]
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
def _sections(self, *, merge: bool = True) -> list[tuple[str, str, str, list[str]]]:
|
|
123
|
+
"""Single source of the product sections, in canonical order:
|
|
124
|
+
``(kind, header, filename-suffix, lines)``. Drives ``write()`` (separate
|
|
125
|
+
files), ``unified_lines()``/``write(unify=True)``, and is the one place a
|
|
126
|
+
new product type is added so it flows into every output for free."""
|
|
127
|
+
return [
|
|
128
|
+
("comments", "Comments", "", self.comment_lines(merge=merge)),
|
|
129
|
+
("transcript", "Transcript", ".transcript", self.transcript_lines()),
|
|
130
|
+
("related", "Related videos", ".related", self.related_lines()),
|
|
131
|
+
]
|
|
132
|
+
|
|
133
|
+
def unified_lines(self, *, merge: bool = True) -> list[str]:
|
|
134
|
+
"""All collected products in ONE document — ``# title`` + ``## section``
|
|
135
|
+
blocks, empty sections skipped (see :func:`viewlyt.format_unified`)."""
|
|
136
|
+
return format_unified(
|
|
137
|
+
self.title,
|
|
138
|
+
[(header, lines) for _kind, header, _suffix, lines in self._sections(merge=merge)],
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
def write(self, out_dir: str, *, merge: bool = True, unify: bool = False) -> dict[str, Path]:
|
|
142
|
+
"""Write the scraped data to ``out_dir``.
|
|
143
|
+
|
|
144
|
+
Default — one file per product, exactly like the CLI: ``<slug>-<id>.md``
|
|
145
|
+
(comments), ``.transcript.md``, ``.related.md`` (only non-empty ones).
|
|
146
|
+
With ``unify=True`` — a single ``<slug>-<id>.unified.md`` with every
|
|
147
|
+
product instead. Returns a mapping of section name (or ``"unified"``) to
|
|
148
|
+
the written :class:`pathlib.Path`.
|
|
149
|
+
"""
|
|
150
|
+
base = Path(out_dir)
|
|
151
|
+
base_name = f"{slugify(self.title) or 'video'}-{self.video_id}"
|
|
152
|
+
written: dict[str, Path] = {}
|
|
153
|
+
|
|
154
|
+
if unify:
|
|
155
|
+
lines = self.unified_lines(merge=merge)
|
|
156
|
+
if lines:
|
|
157
|
+
base.mkdir(parents=True, exist_ok=True)
|
|
158
|
+
path = base / f"{base_name}.unified.md"
|
|
159
|
+
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
160
|
+
written["unified"] = path
|
|
161
|
+
return written
|
|
162
|
+
|
|
163
|
+
for kind, _header, suffix, lines in self._sections(merge=merge):
|
|
164
|
+
if not lines:
|
|
165
|
+
continue
|
|
166
|
+
base.mkdir(parents=True, exist_ok=True)
|
|
167
|
+
path = base / f"{base_name}{suffix}.md"
|
|
168
|
+
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
169
|
+
written[kind] = path
|
|
170
|
+
return written
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _to_comments(records: list[dict]) -> list[Comment]:
    """Convert raw scraper records into :class:`Comment` objects.

    A missing ``kind`` defaults to ``"comment"``. A missing or empty ``author``
    or ``date_raw`` becomes ``""`` and a missing or empty ``likes`` becomes
    ``"0"``. The record's ``html`` is passed through ``html_to_text``.
    """
    converted: list[Comment] = []
    for record in records:
        converted.append(
            Comment(
                kind=record.get("kind", "comment"),
                author=record.get("author") or "",
                text=html_to_text(record.get("html", "")),
                likes=record.get("likes") or "0",
                date=record.get("date_raw") or "",
                parent_author=record.get("parent_author"),
            )
        )
    return converted
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _to_related(items: list[dict]) -> list[RelatedVideo]:
    """Convert raw related-video dicts into :class:`RelatedVideo` objects.

    Every field falls back to ``""`` when its key is missing or falsy.
    """
    videos: list[RelatedVideo] = []
    for entry in items:
        videos.append(
            RelatedVideo(
                video_id=entry.get("video_id") or "",
                title=entry.get("title") or "",
                views=entry.get("views") or "",
                url=entry.get("url") or "",
            )
        )
    return videos
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _scrape_url(
    driver,
    url: str,
    *,
    comments: bool,
    transcript: bool,
    related: int,
    limit: int,
    max_viewports: int,
    replies: bool,
    max_replies: int,
) -> ScrapeResult:
    """Scrape one video on an already-built, consent-primed ``driver``.

    Shared by :func:`scrape_video`, :class:`Session` and :func:`scrape_videos`.
    The driver is neither built nor quit here; the caller owns its lifecycle.

    Raises:
        BlockedError: when ``detect_block`` reports a consent/bot wall.
    """
    video_id = extract_video_id(url)
    safe_get(driver, f"https://www.youtube.com/watch?v={video_id}")
    dismiss_consent_dialog(driver, timeout=2.0)

    if wall := detect_block(driver):
        raise BlockedError(wall)

    title = get_video_title(driver)

    records = []
    if comments:
        records = collect_comments(
            driver,
            limit=limit,
            max_viewports=max_viewports,
            expand_replies=replies,
            max_replies=max_replies,
            progress=False,
        )

    # Related before transcript: the transcript panel takes over the #secondary
    # column that hosts the related lockups (collect_related never raises).
    related_items = []
    if related > 0:
        related_items = collect_related(driver, limit=related, progress=False)

    segments = []
    if transcript:
        segments = fetch_transcript(driver, progress=False)

    return ScrapeResult(
        video_id=video_id,
        title=title,
        comments=_to_comments(records),
        transcript=segments,
        related=_to_related(related_items),
        _records=records,
    )
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def scrape_video(
    url: str,
    *,
    comments: bool = True,
    transcript: bool = False,
    related: int = 0,
    limit: int = 150,
    max_viewports: int = 25,
    replies: bool = True,
    max_replies: int = 5,
    headless: bool = True,
    user_data_dir: str | None = None,
) -> ScrapeResult:
    """Scrape one video and return a :class:`ScrapeResult` (writes no files).

    A Chrome driver is built for this call and always quit afterwards, whether
    the scrape succeeds or raises. ``related`` is the number of sidebar related
    videos to collect (0 = none). To scrape several videos on ONE browser, use
    :class:`Session` or :func:`scrape_videos`.

    Raises:
        viewlyt.BlockedError: if YouTube serves a consent/bot wall (retry with
            ``headless=False`` or a logged-in ``user_data_dir``).
    """
    options = {
        "comments": comments,
        "transcript": transcript,
        "related": related,
        "limit": limit,
        "max_viewports": max_viewports,
        "replies": replies,
        "max_replies": max_replies,
    }
    browser = build_driver(headless=headless, user_data_dir=user_data_dir)
    try:
        prime_consent_cookies(browser)
        outcome = _scrape_url(browser, url, **options)
    finally:
        # Best-effort teardown: a failing quit must not mask the scrape outcome.
        try:
            browser.quit()
        except Exception:  # pragma: no cover
            pass
    return outcome
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
class Session:
    """A reusable scraping session over ONE Chrome instance.

    Building Chrome is the expensive part; a ``Session`` builds (and
    consent-primes) it once and scrapes many videos on it, amortising the
    cold-start. Use it as a context manager so the browser is always closed::

        with viewlyt.Session(headless=True) as s:
            a = s.scrape(url1)
            b = s.scrape(url2)  # same browser, no cold-start

    On a consent/bot wall a headless session transparently rebuilds itself headed
    and retries the video once (the session then stays headed); pass
    ``fallback=False`` to instead re-raise :class:`BlockedError`. The driver is
    built lazily on the first ``scrape``.
    """

    def __init__(
        self,
        *,
        headless: bool = True,
        user_data_dir: str | None = None,
        fallback: bool = True,
    ) -> None:
        """Store the browser options; no browser is started here.

        Args:
            headless: passed to ``build_driver`` when the browser is built.
            user_data_dir: passed to ``build_driver`` when the browser is built.
            fallback: when true, a blocked headless session is rebuilt headed
                and the video is retried once.
        """
        self._headless = headless
        self._user_data_dir = user_data_dir
        self._fallback = fallback
        self._driver = None

    def __enter__(self) -> Session:
        return self

    def __exit__(self, *exc: object) -> bool:
        """Close the browser; never suppresses an in-flight exception."""
        self.close()
        return False

    def _ensure_driver(self):
        """Return the session's driver, building and consent-priming it if needed.

        If priming fails, the freshly built browser is quit and forgotten before
        the error propagates. Otherwise a half-initialised, un-primed driver
        would stay cached and be reused by every later ``scrape``.
        """
        if self._driver is None:
            self._driver = build_driver(headless=self._headless, user_data_dir=self._user_data_dir)
            try:
                prime_consent_cookies(self._driver)
            except BaseException:
                # Cleanup-and-reraise: close() quits the browser and resets
                # ``_driver`` so the next call rebuilds from scratch.
                self.close()
                raise
        return self._driver

    def scrape(
        self,
        url: str,
        *,
        comments: bool = True,
        transcript: bool = False,
        related: int = 0,
        limit: int = 150,
        max_viewports: int = 25,
        replies: bool = True,
        max_replies: int = 5,
    ) -> ScrapeResult:
        """Scrape one video on this session's (lazily built) browser.

        Raises:
            BlockedError: only when a block survives the headed retry, or when
                the session is already headed or ``fallback=False``.
        """
        kw = dict(
            comments=comments,
            transcript=transcript,
            related=related,
            limit=limit,
            max_viewports=max_viewports,
            replies=replies,
            max_replies=max_replies,
        )
        try:
            return _scrape_url(self._ensure_driver(), url, **kw)
        except BlockedError:
            if self._headless and self._fallback:
                log.warning("blocked on %s — rebuilding this session headed", url)
                # Drop the blocked headless browser; _ensure_driver() then
                # builds a headed one because ``_headless`` is now False.
                self.close()
                self._headless = False
                return _scrape_url(self._ensure_driver(), url, **kw)
            raise

    def close(self) -> None:
        """Quit the browser (idempotent; also called on context-manager exit)."""
        if self._driver is not None:
            try:
                self._driver.quit()
            except Exception:  # pragma: no cover
                # Best-effort teardown: a failing quit is deliberately ignored.
                pass
            self._driver = None
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def scrape_videos(
    urls: Iterable[str],
    *,
    jobs: int = 4,
    comments: bool = True,
    transcript: bool = False,
    related: int = 0,
    limit: int = 150,
    max_viewports: int = 25,
    replies: bool = True,
    max_replies: int = 5,
    headless: bool = True,
    user_data_dir: str | None = None,
    fallback: bool = True,
) -> list[ScrapeResult | None]:
    """Scrape many videos over a bounded pool of reused browsers.

    Runs ``jobs`` worker threads (clamped to between 1 and the number of URLs),
    each owning ONE reused :class:`Session`, so Chrome starts once per worker
    rather than once per video. WebDriver is single-thread per instance — each
    worker keeps its own driver and they are never shared.

    Args:
        urls: the video URLs. A single ``str`` is treated as one URL rather
            than being iterated character by character.
        jobs: maximum number of worker threads.
        headless, user_data_dir, fallback: passed to each worker's
            :class:`Session`.
        comments, transcript, related, limit, max_viewports, replies,
            max_replies: passed to :meth:`Session.scrape` for every video.

    Returns:
        A list ALIGNED to the input order: a :class:`ScrapeResult` per success,
        or ``None`` for a video that failed (the error is logged). After a
        failure the worker closes its session, so one bad video can't sink the
        batch.
    """
    if isinstance(urls, str):
        # A str is itself an iterable of 1-character strings; list("https://…")
        # would queue every character as a separate "URL".
        url_list = [urls]
    else:
        url_list = list(urls)
    if not url_list:
        return []
    jobs = max(1, min(jobs, len(url_list)))
    kw = dict(
        comments=comments,
        transcript=transcript,
        related=related,
        limit=limit,
        max_viewports=max_viewports,
        replies=replies,
        max_replies=max_replies,
    )

    # Work queue of (input index, url); the index keeps results aligned.
    q: Queue[tuple[int, str]] = Queue()
    for item in enumerate(url_list):
        q.put(item)
    results: list[ScrapeResult | None] = [None] * len(url_list)
    lock = threading.Lock()

    def worker() -> None:
        session = Session(headless=headless, user_data_dir=user_data_dir, fallback=fallback)
        try:
            while True:
                try:
                    idx, url = q.get_nowait()
                except Empty:
                    break
                try:
                    res = session.scrape(url, **kw)
                    with lock:
                        results[idx] = res
                except Exception as exc:  # isolate per-video; recycle the session
                    log.warning("scrape_videos: %r failed: %s", url, exc)
                    # Closing drops the driver; the next scrape rebuilds it.
                    session.close()
                finally:
                    q.task_done()
        finally:
            session.close()

    threads = [threading.Thread(target=worker, daemon=True) for _ in range(jobs)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results
|