getraw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105):
  1. package/.gitattributes +4 -0
  2. package/CLAUDE.md +57 -0
  3. package/README.md +166 -0
  4. package/RESEARCH.md +109 -0
  5. package/STATUS.md +23 -0
  6. package/bun.lock +50 -0
  7. package/bunfig.toml +3 -0
  8. package/docs/plugin-guide.md +166 -0
  9. package/docs/supported-sites.md +41 -0
  10. package/package.json +30 -0
  11. package/src/cli/index.ts +52 -0
  12. package/src/cli/options.ts +97 -0
  13. package/src/core/format-sorter.ts +208 -0
  14. package/src/core/logger.ts +101 -0
  15. package/src/core/orchestrator.ts +140 -0
  16. package/src/core/output-template.ts +58 -0
  17. package/src/core/types.ts +237 -0
  18. package/src/downloaders/base.ts +25 -0
  19. package/src/downloaders/dash.ts +287 -0
  20. package/src/downloaders/fragment.ts +226 -0
  21. package/src/downloaders/hls.ts +170 -0
  22. package/src/downloaders/http.ts +260 -0
  23. package/src/extractors/archive-org.ts +126 -0
  24. package/src/extractors/bandcamp.ts +130 -0
  25. package/src/extractors/base.ts +29 -0
  26. package/src/extractors/bilibili/bangumi.ts +205 -0
  27. package/src/extractors/bilibili/index.ts +233 -0
  28. package/src/extractors/bilibili/wbi.ts +60 -0
  29. package/src/extractors/coub.ts +137 -0
  30. package/src/extractors/dailymotion.ts +99 -0
  31. package/src/extractors/dropbox.ts +52 -0
  32. package/src/extractors/generic.ts +118 -0
  33. package/src/extractors/google-drive.ts +106 -0
  34. package/src/extractors/imgur.ts +156 -0
  35. package/src/extractors/instagram/index.ts +263 -0
  36. package/src/extractors/instagram/reels.ts +166 -0
  37. package/src/extractors/kick/clips.ts +91 -0
  38. package/src/extractors/kick/index.ts +118 -0
  39. package/src/extractors/kick/live.ts +89 -0
  40. package/src/extractors/niconico/index.ts +209 -0
  41. package/src/extractors/odysee.ts +126 -0
  42. package/src/extractors/peertube.ts +143 -0
  43. package/src/extractors/reddit/gallery.ts +124 -0
  44. package/src/extractors/reddit/index.ts +203 -0
  45. package/src/extractors/rumble.ts +127 -0
  46. package/src/extractors/soundcloud/index.ts +161 -0
  47. package/src/extractors/soundcloud/playlist.ts +129 -0
  48. package/src/extractors/spotify.ts +97 -0
  49. package/src/extractors/streamable.ts +121 -0
  50. package/src/extractors/ted.ts +151 -0
  51. package/src/extractors/tiktok/index.ts +207 -0
  52. package/src/extractors/tiktok/user.ts +176 -0
  53. package/src/extractors/twitch/clips.ts +125 -0
  54. package/src/extractors/twitch/index.ts +136 -0
  55. package/src/extractors/twitch/live.ts +132 -0
  56. package/src/extractors/twitter/index.ts +140 -0
  57. package/src/extractors/twitter/spaces.ts +200 -0
  58. package/src/extractors/vimeo/index.ts +187 -0
  59. package/src/extractors/youtube/captions.ts +111 -0
  60. package/src/extractors/youtube/index.ts +252 -0
  61. package/src/extractors/youtube/innertube.ts +364 -0
  62. package/src/extractors/youtube/nsig.ts +105 -0
  63. package/src/extractors/youtube/playlist.ts +227 -0
  64. package/src/extractors/youtube/signature.ts +163 -0
  65. package/src/networking/client.ts +311 -0
  66. package/src/networking/cookies.ts +138 -0
  67. package/src/networking/proxy.ts +132 -0
  68. package/src/networking/tls.ts +67 -0
  69. package/src/networking/user-agents.ts +88 -0
  70. package/src/postprocessors/base.ts +44 -0
  71. package/src/postprocessors/extract-audio.ts +98 -0
  72. package/src/postprocessors/ffmpeg.ts +146 -0
  73. package/src/postprocessors/merge.ts +102 -0
  74. package/src/postprocessors/metadata.ts +73 -0
  75. package/src/postprocessors/sponsorblock.ts +162 -0
  76. package/src/postprocessors/subtitles.ts +285 -0
  77. package/src/postprocessors/thumbnails.ts +194 -0
  78. package/src/utils/sanitize.ts +36 -0
  79. package/src/utils/traverse.ts +68 -0
  80. package/tests/core/format-sorter.test.ts +96 -0
  81. package/tests/core/output-template.test.ts +56 -0
  82. package/tests/core/types.test.ts +79 -0
  83. package/tests/unit/downloaders/dash.test.ts +57 -0
  84. package/tests/unit/downloaders/hls.test.ts +120 -0
  85. package/tests/unit/downloaders/http.test.ts +114 -0
  86. package/tests/unit/extractors/bilibili.test.ts +83 -0
  87. package/tests/unit/extractors/instagram.test.ts +273 -0
  88. package/tests/unit/extractors/kick.test.ts +85 -0
  89. package/tests/unit/extractors/misc.test.ts +942 -0
  90. package/tests/unit/extractors/niconico.test.ts +61 -0
  91. package/tests/unit/extractors/reddit.test.ts +222 -0
  92. package/tests/unit/extractors/soundcloud.test.ts +299 -0
  93. package/tests/unit/extractors/tiktok.test.ts +260 -0
  94. package/tests/unit/extractors/twitch.test.ts +250 -0
  95. package/tests/unit/extractors/twitter.test.ts +181 -0
  96. package/tests/unit/extractors/vimeo.test.ts +253 -0
  97. package/tests/unit/extractors/youtube.test.ts +259 -0
  98. package/tests/unit/networking/client.test.ts +272 -0
  99. package/tests/unit/networking/cookies.test.ts +256 -0
  100. package/tests/unit/networking/proxy.test.ts +137 -0
  101. package/tests/unit/postprocessors/extract-audio.test.ts +63 -0
  102. package/tests/unit/postprocessors/merge.test.ts +61 -0
  103. package/tests/unit/postprocessors/subtitles.test.ts +89 -0
  104. package/tools/dashboard.ts +112 -0
  105. package/tsconfig.json +17 -0
@@ -0,0 +1,161 @@
1
+ import { BaseExtractor, ExtractorError } from "../../core/types";
2
+ import type { InfoDict, Format, Thumbnail } from "../../core/types";
3
+
4
/** One rendition of a track, as listed under `media.transcodings`. */
interface SoundCloudTranscoding {
  /** Endpoint that is queried (with `client_id`) to obtain a {@link StreamResponse}. */
  url: string;
  /** Preset label; inspected for the substrings "opus" and "aac" and reused as the format note. */
  preset: string;
  duration?: number;
  /** Delivery details for the rendition. */
  format: {
    /** Compared against "hls" to tell playlist streams from direct downloads. */
    protocol: string;
    /** Also inspected for the substring "aac". */
    mime_type: string;
  };
  quality?: string;
}

/** Account that owns a track. */
interface SoundCloudUser {
  id: number;
  username: string;
  permalink_url?: string;
}

/** Track object as returned by the `resolve` endpoint. */
interface SoundCloudTrack {
  id: number;
  title: string;
  description?: string;
  /** Divided by 1000 before being reported — presumably milliseconds; confirm against the API. */
  duration?: number;
  playback_count?: number;
  likes_count?: number;
  comment_count?: number;
  /** Date string; dashes are stripped and the first 8 characters are used as the upload date. */
  created_at?: string;
  genre?: string;
  /** Space-separated tags; multi-word tags are wrapped in double quotes. */
  tag_list?: string;
  permalink_url?: string;
  user?: SoundCloudUser;
  /** Artwork URL; the "-large" token is swapped for "-t500x500" to build a second thumbnail. */
  artwork_url?: string;
  waveform_url?: string;
  media?: {
    transcodings?: Array<SoundCloudTranscoding>;
  };
}

/** JSON body returned by a transcoding endpoint. */
interface StreamResponse {
  /** Media URL used as the format URL. */
  url: string;
}
44
+
45
/**
 * Matches a `,client_id:"…"` assignment inside a JavaScript bundle.
 * Capture group 1 is the id (32 or more letters, digits, `_` or `-`).
 */
const CLIENT_ID_PATTERN = /,client_id:"([a-zA-Z0-9_-]{32,})"/;

/**
 * Scrapes a SoundCloud `client_id` from the JavaScript bundles referenced by a page.
 *
 * The page is downloaded, every `<script src>` pointing at
 * `https://a-v2.sndcdn.com/assets/*.js` is collected, and the last three of
 * those bundles are fetched in document order until one contains the id.
 *
 * @param pageUrl - URL of the SoundCloud page to download.
 * @returns The first `client_id` found.
 * @throws ExtractorError when the page request fails or no bundle contains an id.
 */
async function extractClientId(pageUrl: string): Promise<string> {
  const page = await fetch(pageUrl, { headers: { "User-Agent": "Mozilla/5.0" } });
  if (!page.ok) {
    throw new ExtractorError(`SoundCloud page fetch failed: ${page.status}`);
  }
  const markup = await page.text();

  const bundlePattern = /<script[^>]+src="(https:\/\/a-v2\.sndcdn\.com\/assets\/[^"]+\.js)"/g;
  const bundleUrls = Array.from(markup.matchAll(bundlePattern), (hit) => hit[1]);

  // Only the trailing three bundles are inspected.
  for (const bundleUrl of bundleUrls.slice(-3)) {
    const bundle = await fetch(bundleUrl);
    if (bundle.ok) {
      const found = CLIENT_ID_PATTERN.exec(await bundle.text());
      if (found) return found[1];
    }
  }

  throw new ExtractorError("Could not extract SoundCloud client_id from JS bundle");
}
73
+
74
/**
 * Extractor for single SoundCloud tracks (URLs of the form
 * `soundcloud.com/<user>/<track>`; `/sets/` URLs are excluded by the pattern).
 */
export class SoundCloudExtractor extends BaseExtractor {
  readonly _VALID_URL =
    /^https?:\/\/(?:(?:www|m)\.)?soundcloud\.com\/([^/]+)\/(?!sets\/)([^/?#]+)/;
  readonly _NAME = "soundcloud";

  /**
   * Resolves a track URL through the api-v2 `resolve` endpoint and builds one
   * format per transcoding whose stream lookup succeeds.
   *
   * @param url - Track page URL.
   * @returns Track metadata with formats sorted by descending `quality`.
   * @throws ExtractorError when the resolve request fails, the response has no
   *   track id, or no transcoding yields a stream URL.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const clientId = await extractClientId(url);

    const resolveUrl = `https://api-v2.soundcloud.com/resolve?url=${encodeURIComponent(url)}&client_id=${clientId}`;
    const resolveResponse = await fetch(resolveUrl, {
      headers: { "User-Agent": "Mozilla/5.0" },
    });
    if (!resolveResponse.ok) {
      throw new ExtractorError(`SoundCloud resolve failed: ${resolveResponse.status}`);
    }

    const track = (await resolveResponse.json()) as SoundCloudTrack;

    if (!track.id) {
      throw new ExtractorError("Could not resolve SoundCloud track");
    }

    const transcodings = track.media?.transcodings ?? [];
    const formats: Format[] = [];

    for (const transcoding of transcodings) {
      // Respect a query string that may already be present on the endpoint.
      const separator = transcoding.url.includes("?") ? "&" : "?";
      const streamResponse = await fetch(
        `${transcoding.url}${separator}client_id=${clientId}`,
        { headers: { "User-Agent": "Mozilla/5.0" } },
      );
      // A failing transcoding is skipped so the remaining ones can still be used.
      if (!streamResponse.ok) continue;

      const stream = (await streamResponse.json()) as StreamResponse;
      if (!stream.url) continue;

      const isHLS = transcoding.format.protocol === "hls";
      const isOpus = transcoding.preset.includes("opus");
      const isAac = transcoding.preset.includes("aac") || transcoding.format.mime_type.includes("aac");

      // The extension follows the codec, not the delivery protocol: an HLS
      // rendition of an MP3 preset is still MP3 audio.
      const acodec = isOpus ? "opus" : isAac ? "aac" : "mp3";
      const ext = isOpus ? "opus" : isAac ? "m4a" : "mp3";

      formats.push({
        format_id: `${transcoding.format.protocol}-${transcoding.preset}`,
        url: stream.url,
        ext,
        protocol: isHLS ? "m3u8" : "https",
        acodec,
        vcodec: "none",
        abr: isOpus ? 64 : 128,
        format_note: transcoding.preset,
        quality: isOpus ? 0 : 1,
      });
    }

    if (formats.length === 0) {
      throw new ExtractorError("No playable formats found for this SoundCloud track");
    }

    formats.sort((a, b) => (b.quality ?? 0) - (a.quality ?? 0));

    const thumbnails: Thumbnail[] = [];
    if (track.artwork_url) {
      const enlarged = track.artwork_url.replace("-large", "-t500x500");
      // Only list the enlarged variant when the substitution actually changed the URL.
      if (enlarged !== track.artwork_url) {
        thumbnails.push({ url: enlarged, preference: 1 });
      }
      thumbnails.push({ url: track.artwork_url });
    }

    // tag_list is space separated; multi-word tags are wrapped in double quotes.
    const tags = track.tag_list
      ? track.tag_list.match(/"[^"]+"|[^ ]+/g)?.map((t) => t.replace(/"/g, "")) ?? []
      : [];

    return {
      id: String(track.id),
      title: track.title,
      description: track.description,
      uploader: track.user?.username,
      uploader_id: track.user ? String(track.user.id) : undefined,
      uploader_url: track.user?.permalink_url,
      duration: track.duration ? Math.round(track.duration / 1000) : undefined,
      view_count: track.playback_count,
      like_count: track.likes_count,
      comment_count: track.comment_count,
      upload_date: track.created_at?.replace(/-/g, "").slice(0, 8),
      webpage_url: track.permalink_url ?? url,
      categories: track.genre ? [track.genre] : undefined,
      tags,
      thumbnails,
      formats,
    };
  }
}
@@ -0,0 +1,129 @@
1
+ import { BaseExtractor, ExtractorError } from "../../core/types";
2
+ import type { InfoDict } from "../../core/types";
3
+
4
/** Account that owns a playlist. */
interface SoundCloudUser {
  id: number;
  username: string;
  permalink_url?: string;
}

/** Minimal track description used to build playlist entries. */
interface SoundCloudTrackRef {
  id: number;
  /** When absent, the entry title falls back to "Track <n>". */
  title?: string;
  /** When absent, the playlist URL is used for the entry instead. */
  permalink_url?: string;
}

/** Playlist object as returned by the `resolve` endpoint. */
interface SoundCloudPlaylist {
  id: number;
  title: string;
  description?: string;
  duration?: number;
  track_count?: number;
  likes_count?: number;
  /** Date string; dashes are stripped and the first 8 characters are used as the upload date. */
  created_at?: string;
  permalink_url?: string;
  user?: SoundCloudUser;
  artwork_url?: string;
  tracks?: Array<SoundCloudTrackRef>;
  is_album?: boolean;
}

/** One page of the paginated playlist-tracks listing. */
interface SoundCloudTracksResponse {
  collection: Array<SoundCloudTrackRef>;
  /** URL of the following page; pagination stops when it is missing. */
  next_href?: string;
}
35
+
36
/**
 * Matches a `,client_id:"…"` assignment inside a JavaScript bundle.
 * Capture group 1 is the id (32 or more letters, digits, `_` or `-`).
 */
const CLIENT_ID_PATTERN = /,client_id:"([a-zA-Z0-9_-]{32,})"/;

/**
 * Scrapes a SoundCloud `client_id` from the JavaScript bundles referenced by a page.
 *
 * @param pageUrl - URL of the SoundCloud page to download.
 * @returns The first `client_id` found in the last three asset bundles.
 * @throws ExtractorError when the page request fails or no bundle contains an id.
 */
async function extractClientId(pageUrl: string): Promise<string> {
  const requestInit = { headers: { "User-Agent": "Mozilla/5.0" } };
  const page = await fetch(pageUrl, requestInit);
  if (!page.ok) {
    throw new ExtractorError(`SoundCloud page fetch failed: ${page.status}`);
  }

  const markup = await page.text();
  const assetScript = /<script[^>]+src="(https:\/\/a-v2\.sndcdn\.com\/assets\/[^"]+\.js)"/g;
  const candidates = [...markup.matchAll(assetScript)].map((hit) => hit[1]).slice(-3);

  // Bundles are fetched one at a time, in document order, until one yields an id.
  for (const candidate of candidates) {
    const bundle = await fetch(candidate);
    if (!bundle.ok) continue;
    const idMatch = CLIENT_ID_PATTERN.exec(await bundle.text());
    if (idMatch) return idMatch[1];
  }

  throw new ExtractorError("Could not extract SoundCloud client_id from JS bundle");
}
60
+
61
/**
 * Returns `href` with its `client_id` query parameter set to `clientId`.
 * Works whether or not `href` already has a query string or a `client_id`.
 */
function withClientId(href: string, clientId: string): string {
  try {
    const parsed = new URL(href);
    parsed.searchParams.set("client_id", clientId);
    return parsed.toString();
  } catch {
    // Not an absolute URL: fall back to plain string concatenation.
    const separator = href.includes("?") ? "&" : "?";
    return `${href}${separator}client_id=${clientId}`;
  }
}

/**
 * Collects every track of a playlist by following the paginated
 * `playlists/<id>/tracks` listing until no `next_href` is returned.
 *
 * Best effort: a non-OK response ends pagination and the tracks gathered so
 * far are returned rather than throwing.
 *
 * @param playlistId - Numeric playlist id.
 * @param clientId - SoundCloud API client id added to every request.
 * @returns Tracks from all pages, in listing order.
 */
async function fetchAllTracks(
  playlistId: number,
  clientId: string,
): Promise<SoundCloudTrackRef[]> {
  const allTracks: SoundCloudTrackRef[] = [];
  // Pages already requested; prevents an endless loop if the API repeats a next_href.
  const requested = new Set<string>();
  let nextHref: string | null =
    `https://api-v2.soundcloud.com/playlists/${playlistId}/tracks?client_id=${clientId}&limit=50`;

  while (nextHref && !requested.has(nextHref)) {
    requested.add(nextHref);
    const response = await fetch(nextHref, {
      headers: { "User-Agent": "Mozilla/5.0" },
    });
    if (!response.ok) break;

    const data = (await response.json()) as SoundCloudTracksResponse;
    allTracks.push(...(data.collection ?? []));
    nextHref = data.next_href ? withClientId(data.next_href, clientId) : null;
  }

  return allTracks;
}
83
+
84
/**
 * Extractor for SoundCloud sets (URLs of the form `soundcloud.com/<user>/sets/<name>`).
 */
export class SoundCloudPlaylistExtractor extends BaseExtractor {
  readonly _VALID_URL = /^https?:\/\/(?:(?:www|m)\.)?soundcloud\.com\/([^/]+)\/sets\/([^/?#]+)/;
  readonly _NAME = "soundcloud:playlist";

  /**
   * Resolves the set through the api-v2 `resolve` endpoint, pages through its
   * tracks and returns a playlist whose entries are `url`-type references.
   *
   * @param url - Set page URL.
   * @returns Playlist metadata with one entry per listed track.
   * @throws ExtractorError when the resolve request fails or returns no playlist id.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const clientId = await extractClientId(url);

    const query = `url=${encodeURIComponent(url)}&client_id=${clientId}`;
    const resolved = await fetch(`https://api-v2.soundcloud.com/resolve?${query}`, {
      headers: { "User-Agent": "Mozilla/5.0" },
    });
    if (!resolved.ok) {
      throw new ExtractorError(`SoundCloud resolve failed: ${resolved.status}`);
    }

    const playlist = (await resolved.json()) as SoundCloudPlaylist;
    if (!playlist.id) {
      throw new ExtractorError("Could not resolve SoundCloud playlist");
    }

    const trackRefs = await fetchAllTracks(playlist.id, clientId);

    const entries: InfoDict[] = [];
    trackRefs.forEach((ref, position) => {
      // Entries without their own permalink point back at the set URL.
      const entryUrl = ref.permalink_url ?? url;
      entries.push({
        id: String(ref.id),
        title: ref.title ?? `Track ${position + 1}`,
        webpage_url: entryUrl,
        url: entryUrl,
        _type: "url" as const,
        playlist_index: position + 1,
      });
    });

    const owner = playlist.user;
    return {
      id: String(playlist.id),
      title: playlist.title,
      description: playlist.description,
      uploader: owner?.username,
      uploader_id: owner ? String(owner.id) : undefined,
      uploader_url: owner?.permalink_url,
      like_count: playlist.likes_count,
      upload_date: playlist.created_at?.replace(/-/g, "").slice(0, 8),
      webpage_url: playlist.permalink_url ?? url,
      _type: "playlist",
      entries,
      playlist_count: trackRefs.length,
    };
  }
}
@@ -0,0 +1,97 @@
1
+ import { BaseExtractor, ExtractorError } from "../core/types";
2
+ import type { InfoDict, Thumbnail } from "../core/types";
3
+
4
/**
 * Extractor for Spotify podcast episode pages (`open.spotify.com/episode/<id>`).
 * Only the audio preview URL embedded in the page is returned.
 */
export class SpotifyExtractor extends BaseExtractor {
  readonly _VALID_URL = /https?:\/\/open\.spotify\.com\/episode\/([a-zA-Z0-9]+)/;
  readonly _NAME = "spotify";

  /**
   * Downloads the episode page and reads metadata from its `__NEXT_DATA__`
   * JSON, falling back to a raw `"audio_preview_url"` search in the HTML.
   *
   * @param url - Episode page URL.
   * @returns Episode metadata with a single MP3 preview format.
   * @throws ExtractorError when the URL does not match, the page request
   *   fails, or no preview URL can be found.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const match = url.match(this._VALID_URL);
    if (!match) throw new ExtractorError(`Invalid Spotify episode URL: ${url}`);
    const episodeId = match[1];

    const response = await fetch(url, {
      headers: {
        "User-Agent": "Mozilla/5.0",
        "Accept-Language": "en-US,en;q=0.9",
      },
    });

    if (!response.ok) {
      throw new ExtractorError(`Spotify fetch error: ${response.status}`);
    }

    const html = await response.text();

    const nextDataMatch = html.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/);
    let audioPreviewUrl: string | undefined;
    let title = "Spotify Podcast Episode";
    let description: string | undefined;
    let duration: number | undefined;
    let uploader: string | undefined;
    let thumbnails: Thumbnail[] = [];

    if (nextDataMatch) {
      try {
        const nextData = JSON.parse(nextDataMatch[1]) as Record<string, unknown>;
        const props = (nextData.props as Record<string, unknown>)?.pageProps as Record<string, unknown> | undefined;
        const episode = props?.episode as Record<string, unknown> | undefined;

        if (episode) {
          // Keep the default title unless the page really supplies a string.
          if (typeof episode.name === "string") title = episode.name;
          description = episode.description as string | undefined;
          duration = typeof episode.duration_ms === "number"
            ? Math.round(episode.duration_ms / 1000)
            : undefined;
          audioPreviewUrl = episode.audio_preview_url as string | undefined;

          const show = episode.show as Record<string, unknown> | undefined;
          uploader = show?.name as string | undefined;

          const images = episode.images as Array<{ url: string; width?: number; height?: number }> | undefined;
          if (images) {
            thumbnails = images.map((img) => ({
              url: img.url,
              width: img.width,
              height: img.height,
            }));
          }
        }
      } catch {
        // Deliberately ignored: malformed __NEXT_DATA__ leaves the defaults in
        // place and the raw-HTML fallback below still runs.
      }
    }

    if (!audioPreviewUrl) {
      // The capture tolerates backslash escapes so an escaped quote does not end it early.
      const previewMatch = html.match(/"audio_preview_url"\s*:\s*"((?:\\.|[^"\\])+)"/);
      if (previewMatch) {
        const rawValue = previewMatch[1];
        try {
          // The value is a JSON string literal; decode escapes such as `\/` and `\u0026`.
          const decoded: unknown = JSON.parse(`"${rawValue}"`);
          audioPreviewUrl = typeof decoded === "string" ? decoded : rawValue;
        } catch {
          // Not valid JSON string content: use the text exactly as found.
          audioPreviewUrl = rawValue;
        }
      }
    }

    if (!audioPreviewUrl) {
      throw new ExtractorError(
        "Spotify: no audio preview URL found. Note: full podcast audio requires Spotify auth. Only 30-second previews are available without DRM.",
      );
    }

    return {
      id: episodeId,
      title,
      description,
      duration,
      uploader,
      thumbnails,
      url: audioPreviewUrl,
      ext: "mp3",
      formats: [
        {
          format_id: "preview-mp3",
          url: audioPreviewUrl,
          ext: "mp3",
          acodec: "mp3",
          format_note: "30-second preview only (full episode requires Spotify auth)",
        },
      ],
      webpage_url: url,
      extractor: this._NAME,
    };
  }
}
@@ -0,0 +1,121 @@
1
+ import { BaseExtractor, ExtractorError } from "../core/types";
2
+ import type { InfoDict, Format, Thumbnail } from "../core/types";
3
+
4
/** One downloadable rendition listed in a page's `files` map. */
interface StreamableVideoSource {
  /** Media URL; may be protocol-relative ("//host/path"), in which case "https:" is prepended. */
  url?: string;
  width?: number;
  height?: number;
  /** Reported as the format's total bitrate. */
  bitrate?: number;
  /** Reported as the format's file size. */
  size?: number;
}

/** Video description embedded in a Streamable page. */
interface StreamablePageData {
  title?: string;
  status?: number;
  percent?: number;
  /** May be protocol-relative, like the media URLs. */
  thumbnail_url?: string;
  /** Renditions keyed by name; "mp4", "mp4-mobile" and "original" have explicit quality ranks. */
  files?: Record<string, StreamableVideoSource>;
  url?: string;
  duration?: number;
}
21
+
22
/**
 * Extractor for Streamable video pages (`streamable.com/<id>`).
 */
export class StreamableExtractor extends BaseExtractor {
  readonly _VALID_URL = /https?:\/\/(?:www\.)?streamable\.com\/([a-zA-Z0-9]+)/;
  readonly _NAME = "streamable";

  /**
   * Returns the brace-balanced `{…}` substring of `text` that opens at
   * `openIndex`, or `undefined` when the braces never balance. Braces inside
   * double-quoted strings are ignored.
   */
  private static sliceBalancedObject(text: string, openIndex: number): string | undefined {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = openIndex; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === "\\") escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === "{") {
        depth++;
      } else if (ch === "}") {
        depth--;
        if (depth === 0) return text.slice(openIndex, i + 1);
      }
    }
    return undefined;
  }

  /**
   * Downloads the page and looks for the video description in three places,
   * in order: `window.__reactData__`, `__NEXT_DATA__`, and a bare `"files"`
   * object anywhere in the HTML. Every entry of `files` that has a URL
   * becomes one MP4 format.
   *
   * @param url - Video page URL.
   * @returns Video metadata; `formats` is empty when no `files` map was found.
   * @throws ExtractorError when the URL does not match or the page request fails.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const match = url.match(this._VALID_URL);
    if (!match) throw new ExtractorError(`Invalid Streamable URL: ${url}`);
    const videoId = match[1];

    const response = await fetch(url, {
      headers: { "User-Agent": "Mozilla/5.0" },
    });

    if (!response.ok) {
      throw new ExtractorError(`Streamable fetch error: ${response.status}`);
    }

    const html = await response.text();

    let pageData: StreamablePageData = {};

    const reactDataMatch = html.match(/window\.__reactData__\s*=\s*(\{[\s\S]*?});\s*<\/script>/);
    if (reactDataMatch) {
      try {
        const reactData = JSON.parse(reactDataMatch[1]) as Record<string, unknown>;
        // The video lives under whichever top-level key carries a `files` member.
        const videoKey = Object.keys(reactData).find((k) => (reactData[k] as Record<string, unknown>)?.files);
        if (videoKey) pageData = reactData[videoKey] as StreamablePageData;
      } catch {
        // Deliberately ignored: the next source is tried instead.
      }
    }

    if (!pageData.files) {
      const nextDataMatch = html.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/);
      if (nextDataMatch) {
        try {
          const nextData = JSON.parse(nextDataMatch[1]) as Record<string, unknown>;
          const props = (nextData.props as Record<string, unknown>)?.pageProps as Record<string, unknown> | undefined;
          pageData = (props?.video ?? props?.clip ?? {}) as StreamablePageData;
        } catch {
          // Deliberately ignored: the next source is tried instead.
        }
      }
    }

    if (!pageData.files) {
      // `files` maps names to nested objects, so the value has to be cut out
      // by brace matching; a `[^}]+` pattern would stop at the first inner `}`.
      const filesKey = /"files"\s*:\s*\{/.exec(html);
      if (filesKey) {
        const openIndex = filesKey.index + filesKey[0].length - 1;
        const rawFiles = StreamableExtractor.sliceBalancedObject(html, openIndex);
        if (rawFiles) {
          try {
            pageData.files = JSON.parse(rawFiles) as Record<string, StreamableVideoSource>;
          } catch {
            // Deliberately ignored: extraction continues with no formats.
          }
        }
      }
    }

    const files = pageData.files ?? {};
    const formats: Format[] = [];

    // Higher is better: the mobile rendition ranks below the regular MP4.
    const qualityMap: Record<string, number> = {
      "mp4-mobile": 1,
      mp4: 2,
      original: 3,
    };

    for (const [key, source] of Object.entries(files)) {
      if (!source.url) continue;

      const fullUrl = source.url.startsWith("//")
        ? `https:${source.url}`
        : source.url;

      formats.push({
        format_id: key,
        url: fullUrl,
        ext: "mp4",
        width: source.width,
        height: source.height,
        tbr: source.bitrate,
        filesize: source.size,
        quality: qualityMap[key] ?? 0,
        resolution: source.height ? `${source.height}p` : undefined,
      });
    }

    const thumbnails: Thumbnail[] = pageData.thumbnail_url
      ? [{ url: pageData.thumbnail_url.startsWith("//") ? `https:${pageData.thumbnail_url}` : pageData.thumbnail_url }]
      : [];

    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    const title = pageData.title ?? titleMatch?.[1]?.replace(" - Streamable", "").trim() ?? videoId;

    return {
      id: videoId,
      title,
      thumbnails,
      formats,
      duration: pageData.duration,
      webpage_url: url,
      extractor: this._NAME,
    };
  }
}
@@ -0,0 +1,151 @@
1
+ import { BaseExtractor, ExtractorError } from "../core/types";
2
+ import type { InfoDict, Format, Thumbnail, Subtitle } from "../core/types";
3
+
4
/** One progressive H.264 download listed in the player data. */
interface TEDVideoResource {
  bitrate?: number;
  /** Media URL; entries without it are skipped. */
  file?: string;
  quality?: string;
  height?: number;
  width?: number;
}

/** Player configuration attached to a talk. */
interface TEDPlayerData {
  resources?: {
    h264?: Array<TEDVideoResource>;
    hls?: { stream?: string };
  };
  duration?: number;
  /** Preferred thumbnail URL when present. */
  thumb?: string;
}

/** Talk description found in the page's `__NEXT_DATA__` JSON. */
interface TEDTalkData {
  id?: number;
  slug?: string;
  title?: string;
  description?: string;
  duration?: number;
  viewedCount?: number;
  /** Date string; its first 10 characters, minus dashes, become the upload date. */
  publishedAt?: string;
  speakers?: Array<{ firstname?: string; lastname?: string }>;
  /** Either an already-parsed object or a JSON string that still needs parsing. */
  playerData?: TEDPlayerData | string;
  /** Keyed by language; `high` is preferred over `low`. */
  subtitledDownloads?: Record<string, { high?: string; low?: string; name?: string }>;
  image?: { url?: string };
  canonicalUrl?: string;
}

/** Subset of the `__NEXT_DATA__` payload that is read. */
interface TEDNextData {
  props?: {
    pageProps?: {
      talkData?: TEDTalkData;
      videoData?: TEDTalkData;
    };
  };
}
44
+
45
/**
 * Extractor for TED talk pages (`ted.com/talks/<slug>`).
 */
export class TEDExtractor extends BaseExtractor {
  readonly _VALID_URL = /https?:\/\/(?:www\.)?ted\.com\/talks\/([a-zA-Z0-9_]+)/;
  readonly _NAME = "ted";

  /**
   * Downloads the talk page, reads its `__NEXT_DATA__` JSON and converts the
   * player resources into formats (progressive H.264 files plus an optional
   * HLS stream) and the subtitled downloads into subtitle tracks.
   *
   * @param url - Talk page URL.
   * @returns Talk metadata, formats and subtitles.
   * @throws ExtractorError when the URL does not match, the page request
   *   fails, `__NEXT_DATA__` is missing or unparsable, or it holds no talk data.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const urlParts = url.match(this._VALID_URL);
    if (!urlParts) {
      throw new ExtractorError(`Invalid TED URL: ${url}`);
    }
    const talkSlug = urlParts[1];

    const page = await fetch(url, { headers: { "User-Agent": "Mozilla/5.0" } });
    if (!page.ok) {
      throw new ExtractorError(`TED fetch error: ${page.status}`);
    }
    const markup = await page.text();

    const embedded = markup.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/);
    if (!embedded) {
      throw new ExtractorError("TED: could not find __NEXT_DATA__");
    }

    let payload: TEDNextData;
    try {
      payload = JSON.parse(embedded[1]) as TEDNextData;
    } catch {
      throw new ExtractorError("TED: failed to parse __NEXT_DATA__");
    }

    const pageProps = payload.props?.pageProps;
    const talk = pageProps?.talkData ?? pageProps?.videoData;
    if (!talk) {
      throw new ExtractorError("TED: no talk data found");
    }

    // playerData arrives either pre-parsed or as a JSON string.
    let player: TEDPlayerData | undefined;
    const rawPlayer = talk.playerData;
    if (typeof rawPlayer === "string") {
      try {
        player = JSON.parse(rawPlayer) as TEDPlayerData;
      } catch {
      }
    } else if (rawPlayer && typeof rawPlayer === "object") {
      player = rawPlayer;
    }

    // Progressive downloads first, in listing order; entries without a file are dropped.
    const formats: Format[] = (player?.resources?.h264 ?? []).flatMap((entry) =>
      entry.file
        ? [
            {
              format_id: `mp4-${entry.bitrate ?? entry.quality ?? "unknown"}`,
              url: entry.file,
              ext: "mp4",
              vcodec: "h264",
              tbr: entry.bitrate,
              height: entry.height,
              width: entry.width,
              quality: entry.height ?? entry.bitrate ?? 0,
              resolution: entry.height ? `${entry.height}p` : undefined,
            },
          ]
        : [],
    );

    const manifestUrl = player?.resources?.hls?.stream;
    if (manifestUrl) {
      formats.push({
        format_id: "hls",
        url: manifestUrl,
        ext: "mp4",
        protocol: "m3u8",
        quality: -1,
      });
    }

    const subtitles: Record<string, Subtitle[]> = {};
    for (const [language, download] of Object.entries(talk.subtitledDownloads ?? {})) {
      const trackUrl = download.high ?? download.low;
      if (!trackUrl) continue;
      subtitles[language] = [{ url: trackUrl, ext: "srt", name: download.name }];
    }

    const posterUrl = player?.thumb ?? talk.image?.url;
    const thumbnails: Thumbnail[] = posterUrl ? [{ url: posterUrl }] : [];

    const speakerNames: string[] = [];
    for (const speaker of talk.speakers ?? []) {
      const fullName = [speaker.firstname, speaker.lastname].filter(Boolean).join(" ");
      if (fullName) speakerNames.push(fullName);
    }

    return {
      id: String(talk.id ?? talkSlug),
      title: talk.title ?? talkSlug,
      description: talk.description,
      duration: talk.duration ?? player?.duration,
      view_count: talk.viewedCount,
      uploader: speakerNames[0],
      upload_date: talk.publishedAt?.slice(0, 10).replace(/-/g, ""),
      thumbnails,
      formats,
      subtitles,
      tags: [],
      webpage_url: url,
      extractor: this._NAME,
    };
  }
}