viewlyt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
viewlyt/__init__.py ADDED
@@ -0,0 +1,119 @@
1
+ """viewlyt — scrape a YouTube video's comments (and transcript) with Selenium.
2
+
3
+ Library quickstart::
4
+
5
+ from viewlyt import scrape_video
6
+ r = scrape_video("https://youtu.be/dQw4w9WgXcQ", transcript=True, related=10)
7
+ print("\\n".join(r.comment_lines())) # same text as the CLI's .md
8
+ r.write("out/") # .md / .transcript.md / .related.md
9
+
10
+ Many videos on reused browsers — a pool of ``jobs`` workers, or ONE ``Session`` (amortises Chrome startup)::
11
+
12
+ from viewlyt import scrape_videos, Session
13
+ results = scrape_videos(urls, jobs=4) # list aligned to input order
14
+ with Session() as s: # or drive it manually
15
+ a, b = s.scrape(url1), s.scrape(url2)
16
+
17
+ The pure, dependency-free helpers (``html_to_text``, ``format_comment_lines``,
18
+ ``format_transcript``, ``format_related``, ``group_consecutive_comments``,
19
+ ``parse_relative_date``, ``flatten_inline``, ``slugify``) and ``__version__`` are
20
+ importable WITHOUT pulling in Selenium — ``import viewlyt`` stays lightweight. The
21
+ Selenium-backed names (``scrape_video``, ``scrape_videos``, ``Session``,
22
+ ``build_driver``, …) are loaded lazily on first access (PEP 562), so they cost
23
+ nothing until you use them.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from importlib import import_module
29
+ from importlib.metadata import PackageNotFoundError, version
30
+ from typing import TYPE_CHECKING
31
+
32
+ # Pure helpers: stdlib only, safe to import eagerly (no Selenium).
33
+ from .htmltext import (
34
+ flatten_inline,
35
+ format_comment_lines,
36
+ format_related,
37
+ format_transcript,
38
+ format_unified,
39
+ group_consecutive_comments,
40
+ html_to_text,
41
+ join_unified,
42
+ parse_relative_date,
43
+ slugify,
44
+ )
45
+
46
# Report the installed distribution's version; a bare source checkout has no
# distribution metadata, so fall back to a placeholder.
try:
    __version__ = version("viewlyt")
except PackageNotFoundError:  # pragma: no cover - running from a source tree w/o install
    __version__ = "0.0.0"

# Lazily exported name -> submodule that defines it (consumed by __getattr__).
# Grouped by owning submodule; insertion order matches the public listing.
_LAZY = {
    **dict.fromkeys(
        (
            "scrape_video",
            "scrape_videos",
            "Session",
            "ScrapeResult",
            "Comment",
            "RelatedVideo",
        ),
        "api",
    ),
    "build_driver": "driver",
    **dict.fromkeys(
        (
            "collect_comments",
            "collect_related",
            "fetch_transcript",
            "extract_video_id",
            "BlockedError",
        ),
        "scraper",
    ),
}
66
+
67
+ if TYPE_CHECKING: # let type checkers and IDEs see the real symbols
68
+ from .api import Comment, RelatedVideo, ScrapeResult, Session, scrape_video, scrape_videos
69
+ from .driver import build_driver
70
+ from .scraper import (
71
+ BlockedError,
72
+ collect_comments,
73
+ collect_related,
74
+ extract_video_id,
75
+ fetch_transcript,
76
+ )
77
+
78
+
79
def __getattr__(name: str) -> object:
    """Resolve a lazily exported attribute on first access (PEP 562).

    Looks ``name`` up in ``_LAZY``, imports the owning submodule relative to
    this package, and stores the resolved object in the module globals so later
    lookups bypass this hook.

    Raises:
        AttributeError: if ``name`` is not one of the lazy exports.
    """
    if name not in _LAZY:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    owner = import_module(f".{_LAZY[name]}", __name__)
    resolved = getattr(owner, name)
    # Cache on the module so this hook fires at most once per name.
    globals()[name] = resolved
    return resolved
86
+
87
+
88
def __dir__() -> list[str]:
    """Return the public names from ``__all__`` in sorted order for ``dir()``."""
    names = list(__all__)
    names.sort()
    return names
90
+
91
+
92
# Public API. The first two groups are resolved lazily through __getattr__;
# the last group is imported eagerly at module import time.
__all__ = [
    # -- high-level scraping API (lazy) --
    "scrape_video",
    "scrape_videos",
    "Session",
    "ScrapeResult",
    "Comment",
    "RelatedVideo",
    # -- lower-level building blocks (lazy) --
    "extract_video_id",
    "build_driver",
    "collect_comments",
    "collect_related",
    "fetch_transcript",
    "BlockedError",
    # -- pure text helpers (eager) --
    "html_to_text",
    "format_comment_lines",
    "format_transcript",
    "format_related",
    "format_unified",
    "join_unified",
    "group_consecutive_comments",
    "parse_relative_date",
    "flatten_inline",
    "slugify",
    "__version__",
]
viewlyt/api.py ADDED
@@ -0,0 +1,450 @@
1
+ """Programmatic API — use viewlyt as a library.
2
+
3
+ from viewlyt import scrape_video
4
+
5
+ r = scrape_video("https://youtu.be/dQw4w9WgXcQ", transcript=True)
6
+ print(r.title)
7
+ for c in r.comments:
8
+ print(c.author, c.likes, c.date, c.text)
9
+ print("\\n".join(r.transcript_lines()))
10
+
11
+ ``scrape_video`` builds and tears down its own headless Chrome and returns
12
+ structured data (nothing is written to disk). For batch use with a reused
13
+ browser-instance pool and file output, see :mod:`viewlyt.cli`. The pure text
14
+ helpers (``html_to_text``, ``format_transcript``, …) live — dependency-free — in
15
+ :mod:`viewlyt.htmltext`.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import logging
21
+ import threading
22
+ from collections.abc import Iterable
23
+ from dataclasses import dataclass, field
24
+ from datetime import date
25
+ from pathlib import Path
26
+ from queue import Empty, Queue
27
+
28
+ from .driver import build_driver
29
+ from .htmltext import (
30
+ format_comment_lines,
31
+ format_related,
32
+ format_transcript,
33
+ format_unified,
34
+ html_to_text,
35
+ pair_lines,
36
+ slugify,
37
+ strip_timestamps,
38
+ )
39
+ from .scraper import (
40
+ BlockedError,
41
+ collect_comments,
42
+ collect_related,
43
+ detect_block,
44
+ dismiss_consent_dialog,
45
+ extract_video_id,
46
+ fetch_transcript,
47
+ get_video_title,
48
+ prime_consent_cookies,
49
+ safe_get,
50
+ )
51
+
52
+ log = logging.getLogger("viewlyt")
53
+
54
+
55
+ @dataclass(slots=True)
56
+ class Comment:
57
+ """A single comment or reply, as ready-to-use plain text."""
58
+
59
+ kind: str # "comment" | "reply"
60
+ author: str # e.g. "@handle" ("" if it couldn't be resolved)
61
+ text: str # plain text (emoji alt + link text kept; <br> -> newline)
62
+ likes: str # YouTube's own count, e.g. "842"/"1.2K"; "0" when hidden
63
+ date: str # relative timestamp as YouTube shows it, e.g. "2 days ago"
64
+ parent_author: str | None = None # set on replies
65
+
66
+
67
+ @dataclass(slots=True)
68
+ class RelatedVideo:
69
+ """One related video from the watch-page sidebar. ``views`` is YouTube's own
70
+ sidebar text (e.g. "1.2B views"); the sidebar exposes NO likes."""
71
+
72
+ video_id: str
73
+ title: str
74
+ views: str
75
+ url: str
76
+
77
+
78
+ @dataclass(slots=True)
79
+ class ScrapeResult:
80
+ """Everything scraped for one video. ``transcript`` is ``[(timestamp, text)]``."""
81
+
82
+ video_id: str
83
+ title: str
84
+ comments: list[Comment] = field(default_factory=list)
85
+ transcript: list[tuple[str, str]] = field(default_factory=list)
86
+ related: list[RelatedVideo] = field(default_factory=list)
87
+ # Raw scraper records (with HTML), kept so comment_lines()/write() can reuse the
88
+ # exact CLI merge+format pipeline. Private; hidden from repr.
89
+ _records: list[dict] = field(default_factory=list, repr=False)
90
+
91
+ @property
92
+ def top_level(self) -> list[Comment]:
93
+ return [c for c in self.comments if c.kind == "comment"]
94
+
95
+ @property
96
+ def replies(self) -> list[Comment]:
97
+ return [c for c in self.comments if c.kind == "reply"]
98
+
99
+ def comment_lines(self, *, merge: bool = True, today: date | None = None) -> list[str]:
100
+ """Comments as the CLI-formatted text block (merged by default) — identical
101
+ to viewlyt's ``out/<slug>-<id>.md`` body (see
102
+ :func:`viewlyt.format_comment_lines`)."""
103
+ return format_comment_lines(self._records, today=today, merge=merge)
104
+
105
+ def transcript_lines(self, *, timestamps: bool = False, pair: bool = True) -> list[str]:
106
+ """Transcript lines, token-lean by default (CLI parity): ``[m:ss]``
107
+ timestamps stripped (``timestamps=True`` keeps them; ``h:mm:ss`` on long
108
+ videos always stays) and every 2 segments joined into one line
109
+ (``pair=False`` for one segment per line). ``timestamps=True, pair=False``
110
+ gives the verbatim :func:`viewlyt.format_transcript` output."""
111
+ lines = format_transcript(self.transcript)
112
+ if not timestamps:
113
+ lines = strip_timestamps(lines)
114
+ return pair_lines(lines) if pair else lines
115
+
116
+ def related_lines(self) -> list[str]:
117
+ """Related videos as a numbered Markdown list (see :func:`viewlyt.format_related`)."""
118
+ return format_related(
119
+ [{"title": r.title, "views": r.views, "url": r.url} for r in self.related]
120
+ )
121
+
122
+ def _sections(self, *, merge: bool = True) -> list[tuple[str, str, str, list[str]]]:
123
+ """Single source of the product sections, in canonical order:
124
+ ``(kind, header, filename-suffix, lines)``. Drives ``write()`` (separate
125
+ files), ``unified_lines()``/``write(unify=True)``, and is the one place a
126
+ new product type is added so it flows into every output for free."""
127
+ return [
128
+ ("comments", "Comments", "", self.comment_lines(merge=merge)),
129
+ ("transcript", "Transcript", ".transcript", self.transcript_lines()),
130
+ ("related", "Related videos", ".related", self.related_lines()),
131
+ ]
132
+
133
+ def unified_lines(self, *, merge: bool = True) -> list[str]:
134
+ """All collected products in ONE document — ``# title`` + ``## section``
135
+ blocks, empty sections skipped (see :func:`viewlyt.format_unified`)."""
136
+ return format_unified(
137
+ self.title,
138
+ [(header, lines) for _kind, header, _suffix, lines in self._sections(merge=merge)],
139
+ )
140
+
141
+ def write(self, out_dir: str, *, merge: bool = True, unify: bool = False) -> dict[str, Path]:
142
+ """Write the scraped data to ``out_dir``.
143
+
144
+ Default — one file per product, exactly like the CLI: ``<slug>-<id>.md``
145
+ (comments), ``.transcript.md``, ``.related.md`` (only non-empty ones).
146
+ With ``unify=True`` — a single ``<slug>-<id>.unified.md`` with every
147
+ product instead. Returns a mapping of section name (or ``"unified"``) to
148
+ the written :class:`pathlib.Path`.
149
+ """
150
+ base = Path(out_dir)
151
+ base_name = f"{slugify(self.title) or 'video'}-{self.video_id}"
152
+ written: dict[str, Path] = {}
153
+
154
+ if unify:
155
+ lines = self.unified_lines(merge=merge)
156
+ if lines:
157
+ base.mkdir(parents=True, exist_ok=True)
158
+ path = base / f"{base_name}.unified.md"
159
+ path.write_text("\n".join(lines) + "\n", encoding="utf-8")
160
+ written["unified"] = path
161
+ return written
162
+
163
+ for kind, _header, suffix, lines in self._sections(merge=merge):
164
+ if not lines:
165
+ continue
166
+ base.mkdir(parents=True, exist_ok=True)
167
+ path = base / f"{base_name}{suffix}.md"
168
+ path.write_text("\n".join(lines) + "\n", encoding="utf-8")
169
+ written[kind] = path
170
+ return written
171
+
172
+
173
+ def _to_comments(records: list[dict]) -> list[Comment]:
174
+ return [
175
+ Comment(
176
+ kind=r.get("kind", "comment"),
177
+ author=r.get("author") or "",
178
+ text=html_to_text(r.get("html", "")),
179
+ likes=r.get("likes") or "0",
180
+ date=r.get("date_raw") or "",
181
+ parent_author=r.get("parent_author"),
182
+ )
183
+ for r in records
184
+ ]
185
+
186
+
187
+ def _to_related(items: list[dict]) -> list[RelatedVideo]:
188
+ return [
189
+ RelatedVideo(
190
+ video_id=it.get("video_id") or "",
191
+ title=it.get("title") or "",
192
+ views=it.get("views") or "",
193
+ url=it.get("url") or "",
194
+ )
195
+ for it in items
196
+ ]
197
+
198
+
199
def _scrape_url(
    driver,
    url: str,
    *,
    comments: bool,
    transcript: bool,
    related: int,
    limit: int,
    max_viewports: int,
    replies: bool,
    max_replies: int,
) -> ScrapeResult:
    """Scrape a single video using a ``driver`` the caller built and consent-primed.

    Common implementation behind :func:`scrape_video`, :class:`Session` and
    :func:`scrape_videos`. The driver's lifecycle belongs to the caller: this
    function neither builds nor quits it.

    Raises:
        BlockedError: when ``detect_block`` reports a consent/bot wall after
            the watch page has loaded.
    """
    video_id = extract_video_id(url)
    # Always load the canonical watch URL, whatever form ``url`` was given in.
    safe_get(driver, f"https://www.youtube.com/watch?v={video_id}")
    dismiss_consent_dialog(driver, timeout=2.0)

    wall = detect_block(driver)
    if wall:
        raise BlockedError(wall)

    title = get_video_title(driver)

    raw_comments = []
    if comments:
        raw_comments = collect_comments(
            driver,
            limit=limit,
            max_viewports=max_viewports,
            expand_replies=replies,
            max_replies=max_replies,
            progress=False,
        )

    # Related must be collected before the transcript: the transcript panel
    # takes over the #secondary column that hosts the related lockups
    # (collect_related never raises).
    sidebar = []
    if related > 0:
        sidebar = collect_related(driver, limit=related, progress=False)

    segments = []
    if transcript:
        segments = fetch_transcript(driver, progress=False)

    return ScrapeResult(
        video_id=video_id,
        title=title,
        comments=_to_comments(raw_comments),
        transcript=segments,
        related=_to_related(sidebar),
        _records=raw_comments,
    )
248
+
249
+
250
def scrape_video(
    url: str,
    *,
    comments: bool = True,
    transcript: bool = False,
    related: int = 0,
    limit: int = 150,
    max_viewports: int = 25,
    replies: bool = True,
    max_replies: int = 5,
    headless: bool = True,
    user_data_dir: str | None = None,
) -> ScrapeResult:
    """Scrape one video and return a :class:`ScrapeResult` (writes no files).

    Builds and quits its own Chrome. To scrape several videos on ONE browser,
    use :class:`Session` or :func:`scrape_videos`.

    Args:
        url: the video URL; its id is extracted by ``_scrape_url``.
        comments: collect comments when true.
        transcript: fetch the transcript when true.
        related: number of sidebar related videos to collect (0 = none).
        limit, max_viewports, replies, max_replies: forwarded to the comment
            collector via ``_scrape_url``.
        headless, user_data_dir: forwarded to ``build_driver``.

    Raises:
        BlockedError: if YouTube serves a consent/bot wall (retry with
            ``headless=False`` or a logged-in ``user_data_dir``).
    """
    driver = build_driver(headless=headless, user_data_dir=user_data_dir)
    try:
        prime_consent_cookies(driver)
        return _scrape_url(
            driver,
            url,
            comments=comments,
            transcript=transcript,
            related=related,
            limit=limit,
            max_viewports=max_viewports,
            replies=replies,
            max_replies=max_replies,
        )
    finally:
        # Teardown is best-effort: a failing quit must not mask the scrape's
        # own result or exception, but it is recorded rather than dropped.
        try:
            driver.quit()
        except Exception:  # pragma: no cover
            log.debug("driver.quit() failed during scrape_video teardown", exc_info=True)
290
+
291
+
292
class Session:
    """A reusable scraping session over ONE Chrome instance.

    Building Chrome is the expensive part; a ``Session`` builds (and
    consent-primes) it once and scrapes many videos on it, amortising the
    cold-start. Use it as a context manager so the browser is always closed::

        with viewlyt.Session(headless=True) as s:
            a = s.scrape(url1)
            b = s.scrape(url2)   # same browser, no cold-start

    On a consent/bot wall a headless session transparently rebuilds itself headed
    and retries the video once; pass ``fallback=False`` to instead re-raise
    :class:`BlockedError`. The driver is built lazily on the first ``scrape``.
    """

    def __init__(
        self,
        *,
        headless: bool = True,
        user_data_dir: str | None = None,
        fallback: bool = True,
    ) -> None:
        """Store the configuration; no browser is started here."""
        self._headless = headless
        self._user_data_dir = user_data_dir
        self._fallback = fallback
        self._driver = None  # created on demand by _ensure_driver()

    def __enter__(self) -> Session:
        return self

    def __exit__(self, *exc: object) -> bool:
        self.close()
        return False  # never suppress an exception raised inside the with-block

    def _ensure_driver(self):
        """Return the session's driver, building and consent-priming it if needed.

        The driver is cached only after priming succeeds. If priming raises,
        the freshly built browser is quit and the error propagates, so a later
        call starts from scratch instead of reusing an un-primed browser.
        """
        if self._driver is None:
            driver = build_driver(headless=self._headless, user_data_dir=self._user_data_dir)
            try:
                prime_consent_cookies(driver)
            except BaseException:
                # Don't keep a half-initialised browser around.
                try:
                    driver.quit()
                except Exception:  # pragma: no cover
                    log.debug("driver.quit() failed after a priming error", exc_info=True)
                raise
            self._driver = driver
        return self._driver

    def scrape(
        self,
        url: str,
        *,
        comments: bool = True,
        transcript: bool = False,
        related: int = 0,
        limit: int = 150,
        max_viewports: int = 25,
        replies: bool = True,
        max_replies: int = 5,
    ) -> ScrapeResult:
        """Scrape one video on this session's (lazily built) browser.

        All keyword options are forwarded unchanged to ``_scrape_url``.

        Raises:
            BlockedError: only when a block survives the headed retry (or when
                ``fallback=False``, or the session was already headed).
        """
        kw = dict(
            comments=comments,
            transcript=transcript,
            related=related,
            limit=limit,
            max_viewports=max_viewports,
            replies=replies,
            max_replies=max_replies,
        )
        try:
            return _scrape_url(self._ensure_driver(), url, **kw)
        except BlockedError:
            if self._headless and self._fallback:
                log.warning("blocked on %s — rebuilding this session headed", url)
                self.close()
                # The switch is sticky: every later scrape on this session
                # runs headed as well.
                self._headless = False
                return _scrape_url(self._ensure_driver(), url, **kw)
            raise

    def close(self) -> None:
        """Quit the browser (idempotent; also called on context-manager exit)."""
        if self._driver is not None:
            try:
                self._driver.quit()
            except Exception:  # pragma: no cover
                # Best-effort teardown: record the failure, never raise it.
                log.debug("driver.quit() failed while closing the session", exc_info=True)
            self._driver = None
377
+
378
+
379
def scrape_videos(
    urls: Iterable[str],
    *,
    jobs: int = 4,
    comments: bool = True,
    transcript: bool = False,
    related: int = 0,
    limit: int = 150,
    max_viewports: int = 25,
    replies: bool = True,
    max_replies: int = 5,
    headless: bool = True,
    user_data_dir: str | None = None,
    fallback: bool = True,
) -> list[ScrapeResult | None]:
    """Scrape many videos over a bounded pool of reused browsers.

    Starts ``jobs`` worker threads (clamped to between 1 and the number of
    URLs). Each worker owns ONE reused :class:`Session`, so Chrome starts once
    per worker rather than once per video, and drivers are never shared between
    threads (WebDriver is single-thread per instance).

    Returns:
        A list ALIGNED to the ``urls`` input order: a :class:`ScrapeResult` per
        success, or ``None`` for a video that failed (the error is logged).
        An empty input returns ``[]`` without starting any thread.

    A session that raised is closed and lazily rebuilt for the worker's next
    URL, so one bad video can't sink the batch.

    NOTE(review): every worker is given the same ``user_data_dir``; confirm
    that ``build_driver`` tolerates several browsers on one profile directory.
    """
    targets = list(urls)
    if not targets:
        return []
    jobs = max(1, min(jobs, len(targets)))
    scrape_options = {
        "comments": comments,
        "transcript": transcript,
        "related": related,
        "limit": limit,
        "max_viewports": max_viewports,
        "replies": replies,
        "max_replies": max_replies,
    }

    # Work queue of (input position, url); the position keeps results aligned.
    pending: Queue[tuple[int, str]] = Queue()
    for numbered in enumerate(targets):
        pending.put(numbered)
    results: list[ScrapeResult | None] = [None] * len(targets)
    results_lock = threading.Lock()

    def worker() -> None:
        session = Session(headless=headless, user_data_dir=user_data_dir, fallback=fallback)
        try:
            while True:
                try:
                    slot, target = pending.get_nowait()
                except Empty:
                    return  # queue drained; the outer finally closes the session
                try:
                    outcome = session.scrape(target, **scrape_options)
                    with results_lock:
                        results[slot] = outcome
                except Exception as exc:  # isolate per-video; recycle the session
                    log.warning("scrape_videos: %r failed: %s", target, exc)
                    session.close()
                finally:
                    pending.task_done()
        finally:
            session.close()

    pool = []
    for _ in range(jobs):
        pool.append(threading.Thread(target=worker, daemon=True))
    for thread in pool:
        thread.start()
    for thread in pool:
        thread.join()
    return results