getraw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/.gitattributes +4 -0
  2. package/CLAUDE.md +57 -0
  3. package/README.md +166 -0
  4. package/RESEARCH.md +109 -0
  5. package/STATUS.md +23 -0
  6. package/bun.lock +50 -0
  7. package/bunfig.toml +3 -0
  8. package/docs/plugin-guide.md +166 -0
  9. package/docs/supported-sites.md +41 -0
  10. package/package.json +30 -0
  11. package/src/cli/index.ts +52 -0
  12. package/src/cli/options.ts +97 -0
  13. package/src/core/format-sorter.ts +208 -0
  14. package/src/core/logger.ts +101 -0
  15. package/src/core/orchestrator.ts +140 -0
  16. package/src/core/output-template.ts +58 -0
  17. package/src/core/types.ts +237 -0
  18. package/src/downloaders/base.ts +25 -0
  19. package/src/downloaders/dash.ts +287 -0
  20. package/src/downloaders/fragment.ts +226 -0
  21. package/src/downloaders/hls.ts +170 -0
  22. package/src/downloaders/http.ts +260 -0
  23. package/src/extractors/archive-org.ts +126 -0
  24. package/src/extractors/bandcamp.ts +130 -0
  25. package/src/extractors/base.ts +29 -0
  26. package/src/extractors/bilibili/bangumi.ts +205 -0
  27. package/src/extractors/bilibili/index.ts +233 -0
  28. package/src/extractors/bilibili/wbi.ts +60 -0
  29. package/src/extractors/coub.ts +137 -0
  30. package/src/extractors/dailymotion.ts +99 -0
  31. package/src/extractors/dropbox.ts +52 -0
  32. package/src/extractors/generic.ts +118 -0
  33. package/src/extractors/google-drive.ts +106 -0
  34. package/src/extractors/imgur.ts +156 -0
  35. package/src/extractors/instagram/index.ts +263 -0
  36. package/src/extractors/instagram/reels.ts +166 -0
  37. package/src/extractors/kick/clips.ts +91 -0
  38. package/src/extractors/kick/index.ts +118 -0
  39. package/src/extractors/kick/live.ts +89 -0
  40. package/src/extractors/niconico/index.ts +209 -0
  41. package/src/extractors/odysee.ts +126 -0
  42. package/src/extractors/peertube.ts +143 -0
  43. package/src/extractors/reddit/gallery.ts +124 -0
  44. package/src/extractors/reddit/index.ts +203 -0
  45. package/src/extractors/rumble.ts +127 -0
  46. package/src/extractors/soundcloud/index.ts +161 -0
  47. package/src/extractors/soundcloud/playlist.ts +129 -0
  48. package/src/extractors/spotify.ts +97 -0
  49. package/src/extractors/streamable.ts +121 -0
  50. package/src/extractors/ted.ts +151 -0
  51. package/src/extractors/tiktok/index.ts +207 -0
  52. package/src/extractors/tiktok/user.ts +176 -0
  53. package/src/extractors/twitch/clips.ts +125 -0
  54. package/src/extractors/twitch/index.ts +136 -0
  55. package/src/extractors/twitch/live.ts +132 -0
  56. package/src/extractors/twitter/index.ts +140 -0
  57. package/src/extractors/twitter/spaces.ts +200 -0
  58. package/src/extractors/vimeo/index.ts +187 -0
  59. package/src/extractors/youtube/captions.ts +111 -0
  60. package/src/extractors/youtube/index.ts +252 -0
  61. package/src/extractors/youtube/innertube.ts +364 -0
  62. package/src/extractors/youtube/nsig.ts +105 -0
  63. package/src/extractors/youtube/playlist.ts +227 -0
  64. package/src/extractors/youtube/signature.ts +163 -0
  65. package/src/networking/client.ts +311 -0
  66. package/src/networking/cookies.ts +138 -0
  67. package/src/networking/proxy.ts +132 -0
  68. package/src/networking/tls.ts +67 -0
  69. package/src/networking/user-agents.ts +88 -0
  70. package/src/postprocessors/base.ts +44 -0
  71. package/src/postprocessors/extract-audio.ts +98 -0
  72. package/src/postprocessors/ffmpeg.ts +146 -0
  73. package/src/postprocessors/merge.ts +102 -0
  74. package/src/postprocessors/metadata.ts +73 -0
  75. package/src/postprocessors/sponsorblock.ts +162 -0
  76. package/src/postprocessors/subtitles.ts +285 -0
  77. package/src/postprocessors/thumbnails.ts +194 -0
  78. package/src/utils/sanitize.ts +36 -0
  79. package/src/utils/traverse.ts +68 -0
  80. package/tests/core/format-sorter.test.ts +96 -0
  81. package/tests/core/output-template.test.ts +56 -0
  82. package/tests/core/types.test.ts +79 -0
  83. package/tests/unit/downloaders/dash.test.ts +57 -0
  84. package/tests/unit/downloaders/hls.test.ts +120 -0
  85. package/tests/unit/downloaders/http.test.ts +114 -0
  86. package/tests/unit/extractors/bilibili.test.ts +83 -0
  87. package/tests/unit/extractors/instagram.test.ts +273 -0
  88. package/tests/unit/extractors/kick.test.ts +85 -0
  89. package/tests/unit/extractors/misc.test.ts +942 -0
  90. package/tests/unit/extractors/niconico.test.ts +61 -0
  91. package/tests/unit/extractors/reddit.test.ts +222 -0
  92. package/tests/unit/extractors/soundcloud.test.ts +299 -0
  93. package/tests/unit/extractors/tiktok.test.ts +260 -0
  94. package/tests/unit/extractors/twitch.test.ts +250 -0
  95. package/tests/unit/extractors/twitter.test.ts +181 -0
  96. package/tests/unit/extractors/vimeo.test.ts +253 -0
  97. package/tests/unit/extractors/youtube.test.ts +259 -0
  98. package/tests/unit/networking/client.test.ts +272 -0
  99. package/tests/unit/networking/cookies.test.ts +256 -0
  100. package/tests/unit/networking/proxy.test.ts +137 -0
  101. package/tests/unit/postprocessors/extract-audio.test.ts +63 -0
  102. package/tests/unit/postprocessors/merge.test.ts +61 -0
  103. package/tests/unit/postprocessors/subtitles.test.ts +89 -0
  104. package/tools/dashboard.ts +112 -0
  105. package/tsconfig.json +17 -0
@@ -0,0 +1,111 @@
1
+ import type { Subtitle } from "../../core/types";
2
+ import type { CaptionTrack } from "./innertube";
3
+
4
/**
 * A single entry of the `events` array in a timed-text caption payload.
 */
export interface TimedTextEvent {
  /** Offset, in milliseconds, at which the cue starts. */
  tStartMs: number;
  /** Length of the cue in milliseconds; the cue ends at `tStartMs + dDurationMs`. */
  dDurationMs: number;
  /** Ordered text fragments; their `utf8` values are concatenated to form the cue text. */
  segs?: { utf8: string }[];
  /** NOTE(review): presumably a caption-window id; nothing in this file reads it — confirm. */
  wWinId?: number;
}
10
+
11
/**
 * Top-level shape of a timed-text caption payload: a wrapper around the
 * list of caption events.
 */
export interface TimedTextResponse {
  /** Caption events in the order they appear in the payload. */
  events: TimedTextEvent[];
}
14
+
15
/**
 * Groups caption tracks by language code and expands each track into one
 * download entry per supported caption format (`json3`, `vtt`, `srv1`).
 *
 * Tracks whose `kind` is `"asr"` are placed under `automatic_captions`;
 * all other tracks go under `subtitles`. When several tracks share the same
 * language code within one group, the last one wins.
 *
 * Every generated URL carries an explicit `fmt` parameter matching its `ext`,
 * so the `json3` entry really requests JSON instead of whatever format the
 * bare `baseUrl` happens to return.
 *
 * @param tracks Caption tracks taken from the player response.
 * @returns The manually-authored and automatically-generated caption maps,
 *          keyed by language code.
 */
export function parseCaptionTracks(
  tracks: CaptionTrack[],
): { subtitles: Record<string, Subtitle[]>; automatic_captions: Record<string, Subtitle[]> } {
  const subtitles: Record<string, Subtitle[]> = {};
  const automatic_captions: Record<string, Subtitle[]> = {};

  const captionFormats = ["json3", "vtt", "srv1"];

  // Builds `baseUrl` with exactly one `fmt=<fmt>` parameter. Any existing
  // `fmt` is dropped first, and the separator is chosen from whether the URL
  // already has a query string. The rest of the URL is left byte-for-byte
  // untouched so signed parameters are not re-encoded.
  const withFormat = (baseUrl: string, fmt: string): string => {
    const stripped = baseUrl
      .replace(/([?&])fmt=[^&#]*&?/g, "$1")
      .replace(/[?&]$/, "");
    const separator = stripped.includes("?") ? "&" : "?";
    return `${stripped}${separator}fmt=${fmt}`;
  };

  for (const track of tracks) {
    const lang = track.languageCode;
    // A track without a URL or language cannot be turned into a usable entry.
    if (!lang || !track.baseUrl) continue;

    const name = track.name?.simpleText ?? track.name?.runs?.[0]?.text ?? lang;
    const target = track.kind === "asr" ? automatic_captions : subtitles;

    target[lang] = captionFormats.map((ext) => ({
      url: withFormat(track.baseUrl, ext),
      ext,
      name,
    }));
  }

  return { subtitles, automatic_captions };
}
36
+
37
/**
 * Downloads a caption payload and returns its list of events.
 *
 * The response body is parsed as JSON and read as a `TimedTextResponse`;
 * a payload without an `events` field yields an empty array.
 *
 * @param url Caption URL to request.
 * @returns The events contained in the payload, or `[]` when there are none.
 * @throws Error when the HTTP response status is not OK.
 */
export async function fetchCaptionData(url: string): Promise<TimedTextEvent[]> {
  const res = await fetch(url);

  if (res.ok) {
    const payload = (await res.json()) as TimedTextResponse;
    return payload.events ?? [];
  }

  throw new Error(`Failed to fetch captions: ${res.status}`);
}
46
+
47
/** A caption event reduced to what the text serializers need. */
interface CaptionCue {
  startMs: number;
  endMs: number;
  text: string;
}

/**
 * Coerces a millisecond value to a non-negative integer.
 *
 * Missing, NaN and infinite values become 0, negative values are clamped to 0
 * and fractional values are rounded, so the timestamp formatters never emit
 * `NaN`, a minus sign or a decimal point inside a field.
 */
function toWholeMs(ms: number | undefined): number {
  return typeof ms === "number" && Number.isFinite(ms) ? Math.max(0, Math.round(ms)) : 0;
}

/**
 * Turns raw timed-text events into printable cues.
 *
 * Events without segments, or whose concatenated text is blank after
 * trimming, are skipped. An event without a usable duration produces a
 * zero-length cue (end equals start).
 */
function collectCues(events: TimedTextEvent[]): CaptionCue[] {
  const cues: CaptionCue[] = [];

  for (const event of events) {
    if (!event.segs || event.segs.length === 0) continue;

    const text = event.segs.map((s) => s.utf8).join("").trim();
    if (!text) continue;

    const startMs = toWholeMs(event.tStartMs);
    cues.push({ startMs, endMs: startMs + toWholeMs(event.dDurationMs), text });
  }

  return cues;
}

/**
 * Serializes caption events as SubRip (SRT) text.
 *
 * Each cue is written as a 1-based index line, a
 * `HH:MM:SS,mmm --> HH:MM:SS,mmm` line, the cue text and an empty line.
 *
 * @param events Timed-text events to serialize.
 * @returns The SRT document; an empty string when no event carries text.
 */
export function convertToSrt(events: TimedTextEvent[]): string {
  const lines: string[] = [];

  collectCues(events).forEach((cue, position) => {
    lines.push(
      String(position + 1),
      `${formatSrtTime(cue.startMs)} --> ${formatSrtTime(cue.endMs)}`,
      cue.text,
      "",
    );
  });

  return lines.join("\n");
}

/**
 * Serializes caption events as WebVTT text.
 *
 * The output starts with the `WEBVTT` header and an empty line, followed by
 * one `HH:MM:SS.mmm --> HH:MM:SS.mmm` line, the cue text and an empty line
 * per cue.
 *
 * @param events Timed-text events to serialize.
 * @returns The WebVTT document (header only when no event carries text).
 */
export function convertToVtt(events: TimedTextEvent[]): string {
  const lines: string[] = ["WEBVTT", ""];

  for (const cue of collectCues(events)) {
    lines.push(`${formatVttTime(cue.startMs)} --> ${formatVttTime(cue.endMs)}`, cue.text, "");
  }

  return lines.join("\n");
}

/**
 * Formats a millisecond offset as `HH:MM:SS<separator>mmm`.
 *
 * Hours are padded to at least two digits and are not wrapped, so offsets of
 * 100 hours or more simply produce a wider hour field.
 */
function formatTimestamp(ms: number, separator: string): string {
  const total = toWholeMs(ms);
  const hours = Math.floor(total / 3600000);
  const minutes = Math.floor((total % 3600000) / 60000);
  const seconds = Math.floor((total % 60000) / 1000);
  const millis = total % 1000;

  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(seconds, 2)}${separator}${pad(millis, 3)}`;
}

/** Formats a millisecond offset as an SRT timestamp (`HH:MM:SS,mmm`). */
function formatSrtTime(ms: number): string {
  return formatTimestamp(ms, ",");
}

/** Formats a millisecond offset as a WebVTT timestamp (`HH:MM:SS.mmm`). */
function formatVttTime(ms: number): string {
  return formatTimestamp(ms, ".");
}

/** Left-pads the decimal representation of `num` with zeros to `size` characters. */
function pad(num: number, size: number): string {
  return String(num).padStart(size, "0");
}
@@ -0,0 +1,252 @@
1
+ import { BaseExtractor, ExtractorError } from "../../core/types";
2
+ import type { InfoDict, Format, Thumbnail } from "../../core/types";
3
+ import { InnerTubeClient } from "./innertube";
4
+ import type { PlayerResponse, VideoDetails, StreamingData } from "./innertube";
5
+ import { fetchPlayerJs, decipherSignatureUrl, clearCache as clearSigCache } from "./signature";
6
+ import { transformNsig, clearNsigCache } from "./nsig";
7
+ import { parseCaptionTracks } from "./captions";
8
+ import { PlaylistExtractor } from "./playlist";
9
+
10
/**
 * Matches single-video URLs (watch, shorts, live, embed, /v/ and youtu.be)
 * and captures the 11-character video id in group 1.
 *
 * For watch URLs the `v` parameter must be a whole query parameter — either
 * the first one or preceded by `&` — so a parameter whose name merely ends in
 * `v` is not mistaken for the video id.
 */
const VALID_URL = /^https?:\/\/(?:(?:www|m|music)\.)?(?:youtube\.com\/(?:watch\?(?:[^#]*?&)?v=|shorts\/|live\/|embed\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})/;

/**
 * Matches `/playlist?...list=<id>` URLs and captures the playlist id in
 * group 1. As with `VALID_URL`, `list` must be a whole query parameter.
 */
const PLAYLIST_URL = /^https?:\/\/(?:(?:www|m|music)\.)?youtube\.com\/playlist\?(?:[^#]*?&)?list=([a-zA-Z0-9_-]+)/;

/**
 * Matches `/channel/<id>` and `/@<handle>` URLs and captures the id or handle
 * (without the `@`) in group 1. Periods are accepted so that handles such as
 * `@some.handle` are captured whole instead of being cut at the first dot.
 */
const CHANNEL_URL = /^https?:\/\/(?:(?:www|m|music)\.)?youtube\.com\/(?:channel\/|@)([a-zA-Z0-9_.-]+)/;

/** Extracts the site-relative player script path (`/s/player/.../base.js`) from watch-page HTML. */
const PLAYER_URL_RE = /"jsUrl"\s*:\s*"(\/s\/player\/[^"]+\/base\.js)"/;
15
+
16
/**
 * Extractor for YouTube video, playlist and channel URLs.
 *
 * Playlist and channel URLs are delegated to `PlaylistExtractor`; video URLs
 * are resolved through `InnerTubeClient` player responses, with optional
 * signature deciphering and `n`-parameter transformation of format URLs.
 */
export class YouTubeExtractor extends BaseExtractor {
  /** Union of the video, playlist and channel URL patterns. */
  readonly _VALID_URL = new RegExp(
    `(?:${VALID_URL.source})|(?:${PLAYLIST_URL.source})|(?:${CHANNEL_URL.source})`
  );
  readonly _NAME = "youtube";

  private playlistExtractor = new PlaylistExtractor();

  /**
   * Dispatches a URL to playlist, channel or single-video extraction.
   *
   * Playlist URLs take precedence, then channel URLs that are not also video
   * URLs, then video URLs.
   *
   * @throws ExtractorError when no video id can be found in `url`.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const playlistMatch = url.match(PLAYLIST_URL);
    if (playlistMatch) {
      return this.playlistExtractor.extractPlaylist(playlistMatch[1]);
    }

    const videoMatch = url.match(VALID_URL);
    const channelMatch = url.match(CHANNEL_URL);
    if (channelMatch && !videoMatch) {
      return this.playlistExtractor.extractChannelVideos(channelMatch[1]);
    }

    if (!videoMatch) {
      throw new ExtractorError(`Could not extract video ID from URL: ${url}`);
    }

    return this.extractVideo(videoMatch[1]);
  }

  /**
   * Resolves a single video id into an `InfoDict`.
   *
   * Uses the WEB client first, retries through the embedded TV client when
   * the status is `LOGIN_REQUIRED` or `CONTENT_CHECK_REQUIRED`, and falls
   * back to the ANDROID client's formats when no formats were obtained.
   *
   * @throws ExtractorError when the playability status is `ERROR` or the
   *         response carries no video details.
   */
  private async extractVideo(videoId: string): Promise<InfoDict> {
    const webClient = InnerTubeClient.withClient("WEB");
    let playerResponse = await webClient.getPlayerResponse(videoId);

    const status = playerResponse.playabilityStatus?.status;
    if (status === "LOGIN_REQUIRED" || status === "CONTENT_CHECK_REQUIRED") {
      playerResponse = await this.tryAgeGateBypass(videoId, playerResponse);
    }

    if (playerResponse.playabilityStatus?.status === "ERROR") {
      throw new ExtractorError(
        playerResponse.playabilityStatus.reason ?? "Video unavailable"
      );
    }

    const videoDetails = playerResponse.videoDetails;
    if (!videoDetails) {
      throw new ExtractorError("No video details in player response");
    }

    let formats = await this.extractFormats(playerResponse, webClient, videoId);

    if (formats.length === 0) {
      // Nothing usable from the first response: try the ANDROID client.
      const androidClient = InnerTubeClient.withClient("ANDROID");
      const androidResponse = await androidClient.getPlayerResponse(videoId);
      if (androidResponse.streamingData) {
        formats = androidClient.parseFormats(androidResponse.streamingData);
      }
    }

    return this.buildInfoDict(videoId, videoDetails, playerResponse, formats);
  }

  /**
   * Re-requests the player response through the `TVHTML5_EMBED` client,
   * passing the embed URL of the video.
   *
   * When that response is playable and has streaming data it is returned,
   * with `videoDetails`, `captions` and `microformat` preferring the values
   * of the original response. Otherwise the original response is returned
   * unchanged.
   */
  private async tryAgeGateBypass(
    videoId: string,
    originalResponse: PlayerResponse,
  ): Promise<PlayerResponse> {
    const tvClient = InnerTubeClient.withClient("TVHTML5_EMBED");
    const embedUrl = `https://www.youtube.com/embed/${videoId}`;
    const tvResponse = await tvClient.getPlayerResponse(videoId, embedUrl);

    if (tvResponse.playabilityStatus?.status === "OK" && tvResponse.streamingData) {
      return {
        ...tvResponse,
        videoDetails: originalResponse.videoDetails ?? tvResponse.videoDetails,
        captions: originalResponse.captions ?? tvResponse.captions,
        microformat: originalResponse.microformat ?? tvResponse.microformat,
      };
    }

    return originalResponse;
  }

  /**
   * Parses the formats of a player response and, when at least one raw
   * format has a `signatureCipher` but no `url`, runs them through
   * `decipherFormats`.
   *
   * @returns The parsed formats, or `[]` when there is no streaming data.
   */
  private async extractFormats(
    playerResponse: PlayerResponse,
    client: InnerTubeClient,
    videoId: string,
  ): Promise<Format[]> {
    const streamingData = playerResponse.streamingData;
    if (!streamingData) return [];

    const formats = client.parseFormats(streamingData);

    // NOTE(review): the `n`-parameter transform only runs inside
    // decipherFormats, i.e. only when some format is ciphered — confirm that
    // plain-URL responses do not need it as well.
    if (!this.formatsNeedDecipher(streamingData)) {
      return formats;
    }

    return this.decipherFormats(formats, streamingData, videoId);
  }

  /** True when any raw format carries a `signatureCipher` and no direct `url`. */
  private formatsNeedDecipher(streamingData: StreamingData): boolean {
    const allFormats = [
      ...(streamingData.formats ?? []),
      ...(streamingData.adaptiveFormats ?? []),
    ];
    return allFormats.some((f) => f.signatureCipher && !f.url);
  }

  /**
   * Fills in ciphered format URLs and applies the `n`-parameter transform,
   * mutating and returning `formats`.
   *
   * Failures for an individual format are skipped so the remaining formats
   * are still processed. When the player script URL cannot be found the
   * formats are returned untouched.
   */
  private async decipherFormats(
    formats: Format[],
    streamingData: StreamingData,
    videoId: string,
  ): Promise<Format[]> {
    const playerJsUrl = await this.getPlayerJsUrl(videoId);
    if (!playerJsUrl) return formats;

    const playerJs = await fetchPlayerJs(playerJsUrl);

    const allRaw = [
      ...(streamingData.formats ?? []),
      ...(streamingData.adaptiveFormats ?? []),
    ];

    // NOTE(review): parsed and raw formats are paired by index; this assumes
    // parseFormats keeps the order `formats` then `adaptiveFormats` and drops
    // nothing — confirm against InnerTubeClient.parseFormats.
    for (let i = 0; i < formats.length; i++) {
      const raw = allRaw[i];
      const format = formats[i];
      if (!raw || !format) continue;

      if (raw.signatureCipher && !raw.url) {
        try {
          format.url = decipherSignatureUrl(raw.signatureCipher, playerJs);
        } catch {
          // Best effort: leave this format as parsed and move on.
          continue;
        }
      }

      if (format.url) {
        try {
          format.url = transformNsig(format.url, playerJs);
        } catch {
          // Best effort: keep the untransformed URL.
          continue;
        }
      }
    }

    return formats;
  }

  /**
   * Fetches the watch page and extracts the absolute URL of the player
   * script from it.
   *
   * @returns The player script URL, or `null` when the request fails, the
   *          response is not OK, or the page does not contain a `jsUrl`.
   */
  private async getPlayerJsUrl(videoId: string): Promise<string | null> {
    const watchUrl = `https://www.youtube.com/watch?v=${videoId}`;
    try {
      const response = await fetch(watchUrl, {
        headers: {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        },
      });
      // An error page cannot be trusted to describe the player; do not parse it.
      if (!response.ok) return null;

      const html = await response.text();
      const match = html.match(PLAYER_URL_RE);
      return match ? `https://www.youtube.com${match[1]}` : null;
    } catch {
      return null;
    }
  }

  /**
   * Assembles the `InfoDict` for a video from its details, the player
   * response and the already-extracted formats.
   */
  private buildInfoDict(
    videoId: string,
    details: VideoDetails,
    response: PlayerResponse,
    formats: Format[],
  ): InfoDict {
    const microformat = response.microformat?.playerMicroformatRenderer;

    const thumbnails: Thumbnail[] = (details.thumbnail?.thumbnails ?? []).map((t) => ({
      url: t.url,
      width: t.width,
      height: t.height,
    }));

    const info: InfoDict = {
      id: videoId,
      title: details.title,
      formats,
      thumbnails,
      description: details.shortDescription ?? microformat?.description?.simpleText,
      channel: details.author,
      channel_id: details.channelId,
      channel_url: `https://www.youtube.com/channel/${details.channelId}`,
      uploader: details.author,
      uploader_id: details.channelId,
      uploader_url: microformat?.ownerProfileUrl,
      // A length of 0 (or an unparsable value) is reported as "unknown".
      duration: parseInt(details.lengthSeconds, 10) || undefined,
      // Unlike duration, a view count of 0 is a real value and is kept.
      view_count: YouTubeExtractor.parseCount(details.viewCount),
      upload_date: YouTubeExtractor.normalizeUploadDate(microformat?.uploadDate),
      live_status: this.getLiveStatus(details, response),
      webpage_url: `https://www.youtube.com/watch?v=${videoId}`,
      // NOTE(review): always 0, even when the age-gate fallback was used — confirm intent.
      age_limit: 0,
      categories: microformat?.category ? [microformat.category] : undefined,
    };

    const startTimestamp = microformat?.liveBroadcastDetails?.startTimestamp;
    if (startTimestamp) {
      const seconds = Math.floor(new Date(startTimestamp).getTime() / 1000);
      // An unparsable timestamp yields NaN; leave the field unset instead.
      if (Number.isFinite(seconds)) {
        info.release_timestamp = seconds;
      }
    }

    const captionTracks = response.captions?.playerCaptionsTracklistRenderer?.captionTracks;
    if (captionTracks?.length) {
      const { subtitles, automatic_captions } = parseCaptionTracks(captionTracks);
      info.subtitles = subtitles;
      info.automatic_captions = automatic_captions;
    }

    return info;
  }

  /**
   * Parses a base-10 integer string, returning `undefined` only when the
   * value is missing or not numeric (so `"0"` yields `0`).
   */
  private static parseCount(raw: string | undefined): number | undefined {
    const value = parseInt(raw ?? "", 10);
    return Number.isNaN(value) ? undefined : value;
  }

  /**
   * Converts an upload date to `YYYYMMDD`.
   *
   * Accepts both a plain `YYYY-MM-DD` date and a longer ISO date-time string
   * starting with one; only the leading calendar date, as written, is used.
   * Values in any other shape fall back to simply removing dashes.
   */
  private static normalizeUploadDate(raw: string | undefined): string | undefined {
    if (!raw) return undefined;
    const parts = /^(\d{4})-(\d{2})-(\d{2})/.exec(raw);
    return parts ? `${parts[1]}${parts[2]}${parts[3]}` : raw.replace(/-/g, "");
  }

  /**
   * Derives the live status, checking in order: currently live, upcoming,
   * previously live content, then the presence of `liveStreamability` in the
   * playability status.
   */
  private getLiveStatus(
    details: VideoDetails,
    response: PlayerResponse,
  ): InfoDict["live_status"] {
    if (details.isLive) return "is_live";
    if (details.isUpcoming) return "is_upcoming";
    if (details.isLiveContent) return "was_live";
    if (response.playabilityStatus?.liveStreamability) return "is_live";
    return "not_live";
  }

  /** Clears the signature and nsig caches. */
  static clearCaches(): void {
    clearSigCache();
    clearNsigCache();
  }
}
247
+
248
+ export { InnerTubeClient } from "./innertube";
249
+ export { PlaylistExtractor } from "./playlist";
250
+ export { parseCaptionTracks, convertToSrt, convertToVtt } from "./captions";
251
+ export { decipherSignatureUrl, fetchPlayerJs } from "./signature";
252
+ export { transformNsig } from "./nsig";