getraw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitattributes +4 -0
- package/CLAUDE.md +57 -0
- package/README.md +166 -0
- package/RESEARCH.md +109 -0
- package/STATUS.md +23 -0
- package/bun.lock +50 -0
- package/bunfig.toml +3 -0
- package/docs/plugin-guide.md +166 -0
- package/docs/supported-sites.md +41 -0
- package/package.json +30 -0
- package/src/cli/index.ts +52 -0
- package/src/cli/options.ts +97 -0
- package/src/core/format-sorter.ts +208 -0
- package/src/core/logger.ts +101 -0
- package/src/core/orchestrator.ts +140 -0
- package/src/core/output-template.ts +58 -0
- package/src/core/types.ts +237 -0
- package/src/downloaders/base.ts +25 -0
- package/src/downloaders/dash.ts +287 -0
- package/src/downloaders/fragment.ts +226 -0
- package/src/downloaders/hls.ts +170 -0
- package/src/downloaders/http.ts +260 -0
- package/src/extractors/archive-org.ts +126 -0
- package/src/extractors/bandcamp.ts +130 -0
- package/src/extractors/base.ts +29 -0
- package/src/extractors/bilibili/bangumi.ts +205 -0
- package/src/extractors/bilibili/index.ts +233 -0
- package/src/extractors/bilibili/wbi.ts +60 -0
- package/src/extractors/coub.ts +137 -0
- package/src/extractors/dailymotion.ts +99 -0
- package/src/extractors/dropbox.ts +52 -0
- package/src/extractors/generic.ts +118 -0
- package/src/extractors/google-drive.ts +106 -0
- package/src/extractors/imgur.ts +156 -0
- package/src/extractors/instagram/index.ts +263 -0
- package/src/extractors/instagram/reels.ts +166 -0
- package/src/extractors/kick/clips.ts +91 -0
- package/src/extractors/kick/index.ts +118 -0
- package/src/extractors/kick/live.ts +89 -0
- package/src/extractors/niconico/index.ts +209 -0
- package/src/extractors/odysee.ts +126 -0
- package/src/extractors/peertube.ts +143 -0
- package/src/extractors/reddit/gallery.ts +124 -0
- package/src/extractors/reddit/index.ts +203 -0
- package/src/extractors/rumble.ts +127 -0
- package/src/extractors/soundcloud/index.ts +161 -0
- package/src/extractors/soundcloud/playlist.ts +129 -0
- package/src/extractors/spotify.ts +97 -0
- package/src/extractors/streamable.ts +121 -0
- package/src/extractors/ted.ts +151 -0
- package/src/extractors/tiktok/index.ts +207 -0
- package/src/extractors/tiktok/user.ts +176 -0
- package/src/extractors/twitch/clips.ts +125 -0
- package/src/extractors/twitch/index.ts +136 -0
- package/src/extractors/twitch/live.ts +132 -0
- package/src/extractors/twitter/index.ts +140 -0
- package/src/extractors/twitter/spaces.ts +200 -0
- package/src/extractors/vimeo/index.ts +187 -0
- package/src/extractors/youtube/captions.ts +111 -0
- package/src/extractors/youtube/index.ts +252 -0
- package/src/extractors/youtube/innertube.ts +364 -0
- package/src/extractors/youtube/nsig.ts +105 -0
- package/src/extractors/youtube/playlist.ts +227 -0
- package/src/extractors/youtube/signature.ts +163 -0
- package/src/networking/client.ts +311 -0
- package/src/networking/cookies.ts +138 -0
- package/src/networking/proxy.ts +132 -0
- package/src/networking/tls.ts +67 -0
- package/src/networking/user-agents.ts +88 -0
- package/src/postprocessors/base.ts +44 -0
- package/src/postprocessors/extract-audio.ts +98 -0
- package/src/postprocessors/ffmpeg.ts +146 -0
- package/src/postprocessors/merge.ts +102 -0
- package/src/postprocessors/metadata.ts +73 -0
- package/src/postprocessors/sponsorblock.ts +162 -0
- package/src/postprocessors/subtitles.ts +285 -0
- package/src/postprocessors/thumbnails.ts +194 -0
- package/src/utils/sanitize.ts +36 -0
- package/src/utils/traverse.ts +68 -0
- package/tests/core/format-sorter.test.ts +96 -0
- package/tests/core/output-template.test.ts +56 -0
- package/tests/core/types.test.ts +79 -0
- package/tests/unit/downloaders/dash.test.ts +57 -0
- package/tests/unit/downloaders/hls.test.ts +120 -0
- package/tests/unit/downloaders/http.test.ts +114 -0
- package/tests/unit/extractors/bilibili.test.ts +83 -0
- package/tests/unit/extractors/instagram.test.ts +273 -0
- package/tests/unit/extractors/kick.test.ts +85 -0
- package/tests/unit/extractors/misc.test.ts +942 -0
- package/tests/unit/extractors/niconico.test.ts +61 -0
- package/tests/unit/extractors/reddit.test.ts +222 -0
- package/tests/unit/extractors/soundcloud.test.ts +299 -0
- package/tests/unit/extractors/tiktok.test.ts +260 -0
- package/tests/unit/extractors/twitch.test.ts +250 -0
- package/tests/unit/extractors/twitter.test.ts +181 -0
- package/tests/unit/extractors/vimeo.test.ts +253 -0
- package/tests/unit/extractors/youtube.test.ts +259 -0
- package/tests/unit/networking/client.test.ts +272 -0
- package/tests/unit/networking/cookies.test.ts +256 -0
- package/tests/unit/networking/proxy.test.ts +137 -0
- package/tests/unit/postprocessors/extract-audio.test.ts +63 -0
- package/tests/unit/postprocessors/merge.test.ts +61 -0
- package/tests/unit/postprocessors/subtitles.test.ts +89 -0
- package/tools/dashboard.ts +112 -0
- package/tsconfig.json +17 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { BaseExtractor, ExtractorError } from "../core/types";
|
|
2
|
+
import type { InfoDict, Format } from "../core/types";
|
|
3
|
+
|
|
4
|
+
/**
 * Matches Google Drive share links and captures the file id in group 1.
 * Accepted shapes: `docs.google.com` or `drive.google.com` followed by
 * `file/d/<id>` or `open?id=<id>`, and `drive.google.com/uc?...id=<id>`.
 */
const GDRIVE_VALID_URL = /https?:\/\/(?:docs\.google\.com\/(?:file\/d\/|open\?id=)|drive\.google\.com\/(?:file\/d\/|open\?id=)|drive\.google\.com\/uc\?(?:.*&)?id=)([a-zA-Z0-9_-]+)/;
|
|
5
|
+
|
|
6
|
+
/**
 * Extractor for files shared through Google Drive links.
 *
 * Requests the `uc?export=download` endpoint for the file id captured by
 * `_VALID_URL`. A non-HTML response is treated as the file itself and its
 * headers supply the filename and size. An HTML response is treated as an
 * interstitial page and handed to `handleVirusScanPage`.
 */
export class GoogleDriveExtractor extends BaseExtractor {
  readonly _VALID_URL = GDRIVE_VALID_URL;
  readonly _NAME = "google-drive";

  /**
   * Resolves a Google Drive URL to a single direct-download format.
   *
   * @param url - The share URL supplied by the user.
   * @returns Info for one format with id `direct`.
   * @throws ExtractorError when the URL does not match or the request fails.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const match = url.match(this._VALID_URL);
    if (!match) throw new ExtractorError(`Invalid Google Drive URL: ${url}`);
    const fileId = match[1];

    const downloadUrl = `https://drive.google.com/uc?export=download&id=${fileId}`;

    const response = await fetch(downloadUrl, {
      headers: { "User-Agent": "Mozilla/5.0" },
      redirect: "follow",
    });

    if (!response.ok) {
      await this.discardBody(response);
      throw new ExtractorError(`Google Drive: fetch error ${response.status}`);
    }

    const contentType = response.headers.get("content-type") ?? "";

    if (contentType.includes("text/html")) {
      const html = await response.text();
      return this.handleVirusScanPage(html, fileId, url);
    }

    const { title, ext, filesize } = this.readFileMetadata(response, fileId);
    const resolvedUrl = response.url || downloadUrl;

    // Only the headers are needed here; stop the transfer so the file is not
    // streamed in the background while the real download starts elsewhere.
    await this.discardBody(response);

    return this.buildInfo(fileId, title, ext, resolvedUrl, filesize, url);
  }

  /**
   * Handles an HTML response from the download endpoint.
   *
   * Looks for a `confirm=<token>` query parameter in the page. Without one,
   * returns a best-effort direct URL with a placeholder title. With one,
   * re-requests the download with the token and reads metadata from that
   * response's headers.
   *
   * @throws ExtractorError when the confirmed request is not OK.
   */
  private async handleVirusScanPage(html: string, fileId: string, originalUrl: string): Promise<InfoDict> {
    const confirmMatch = html.match(/[?&]confirm=([0-9A-Za-z_-]+)/);
    if (!confirmMatch) {
      const idMatch = html.match(/id=([a-zA-Z0-9_-]+)/);
      const foundId = idMatch?.[1] ?? fileId;
      const directUrl = `https://drive.google.com/uc?export=download&id=${foundId}`;
      return this.buildInfo(fileId, `gdrive_${fileId}`, "mp4", directUrl, undefined, originalUrl);
    }

    const confirmToken = confirmMatch[1];
    const confirmedUrl = `https://drive.google.com/uc?export=download&id=${fileId}&confirm=${confirmToken}`;

    const confirmedResponse = await fetch(confirmedUrl, {
      headers: { "User-Agent": "Mozilla/5.0" },
      redirect: "follow",
    });

    if (!confirmedResponse.ok) {
      await this.discardBody(confirmedResponse);
      throw new ExtractorError(`Google Drive: confirmed download failed with status ${confirmedResponse.status}`);
    }

    const { title, ext, filesize } = this.readFileMetadata(confirmedResponse, fileId);
    await this.discardBody(confirmedResponse);

    return this.buildInfo(fileId, title, ext, confirmedUrl, filesize, originalUrl);
  }

  /**
   * Derives title, extension and size from a download response's headers.
   *
   * The filename comes from `Content-Disposition`, falling back to
   * `gdrive_<fileId>`. The extension is the text after the last dot, or
   * `mp4` when the filename has no dot.
   */
  private readFileMetadata(
    response: Response,
    fileId: string,
  ): { title: string; ext: string; filesize: number | undefined } {
    const contentDisposition = response.headers.get("content-disposition") ?? "";
    const filenameMatch = contentDisposition.match(/filename\*?=(?:UTF-8'')?["']?([^"';\n]+)["']?/i);
    const filename = filenameMatch?.[1]
      ? this.decodeFilename(filenameMatch[1].trim())
      : `gdrive_${fileId}`;
    const ext = filename.includes(".") ? filename.split(".").pop()!.toLowerCase() : "mp4";
    const title = filename.replace(/\.[^.]+$/, "");

    const contentLength = response.headers.get("content-length");
    const parsedLength = contentLength ? Number.parseInt(contentLength, 10) : Number.NaN;
    // A malformed header must not surface as a NaN filesize.
    const filesize = Number.isFinite(parsedLength) ? parsedLength : undefined;

    return { title, ext, filesize };
  }

  /**
   * Percent-decodes a header filename. `decodeURIComponent` throws on a
   * stray `%` (e.g. `100%.mp4`), so the raw value is kept in that case.
   */
  private decodeFilename(raw: string): string {
    try {
      return decodeURIComponent(raw);
    } catch {
      return raw;
    }
  }

  /** Cancels an unread response body; failures to cancel are ignored. */
  private async discardBody(response: Response): Promise<void> {
    try {
      await response.body?.cancel();
    } catch {
      // Best effort: the stream may already be closed or locked.
    }
  }

  /** Assembles the InfoDict with a single `direct` format. */
  private buildInfo(id: string, title: string, ext: string, directUrl: string, filesize: number | undefined, webpageUrl: string): InfoDict {
    const formats: Format[] = [
      {
        format_id: "direct",
        url: directUrl,
        ext,
        filesize,
        http_headers: {
          "User-Agent": "Mozilla/5.0",
        },
      },
    ];

    return {
      id,
      title,
      url: directUrl,
      ext,
      formats,
      webpage_url: webpageUrl,
      extractor: this._NAME,
    };
  }
}
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import { BaseExtractor, ExtractorError } from "../core/types";
|
|
2
|
+
import type { InfoDict, Format, Thumbnail } from "../core/types";
|
|
3
|
+
|
|
4
|
+
/**
 * Fields read from an image object in an Imgur API response.
 * All fields are optional because the payload is not validated.
 */
interface ImgurImage {
  id?: string;
  title?: string;
  description?: string;
  /** Checked for the substrings "gif" and "video" — presumably a MIME type; confirm against the API. */
  type?: string;
  /** Used as the mp4 format URL when present. */
  mp4?: string;
  /** Used as the thumbnail URL. */
  link?: string;
  width?: number;
  height?: number;
  /** Reported as the format's `filesize`. */
  size?: number;
  animated?: boolean;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Envelope of an Imgur API response as consumed by the extractor.
 * `data` is a single image, a list of images, or an album object,
 * depending on the endpoint that was called.
 */
interface ImgurApiResponse {
  data?: ImgurImage | ImgurImage[] | { images?: ImgurImage[]; title?: string; description?: string; id?: string };
  success?: boolean;
  status?: number;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Matches `imgur.com` and `i.imgur.com` URLs, with an optional `a/` or
 * `gallery/` prefix and an optional file extension. Group 1 captures the
 * alphanumeric item id.
 */
const IMGUR_VALID_URL = /https?:\/\/(?:i\.)?imgur\.com\/(?:a\/|gallery\/)?([a-zA-Z0-9]+)(?:\.[a-zA-Z]+)?/;
|
|
24
|
+
|
|
25
|
+
/**
 * Extractor for Imgur media.
 *
 * Direct `i.imgur.com` links ending in gif/gifv/mp4/webm are resolved
 * without a network call. Other URLs go through the Imgur v3 API: the image
 * endpoint for single items, the album-images endpoint for `/a/` and
 * `/gallery/` URLs.
 */
export class ImgurExtractor extends BaseExtractor {
  readonly _VALID_URL = IMGUR_VALID_URL;
  readonly _NAME = "imgur";

  /** Headers sent with every Imgur API request. */
  private static readonly API_HEADERS: Record<string, string> = {
    Authorization: "Client-ID 546c25a59c58ad7",
    "User-Agent": "Mozilla/5.0",
  };

  /**
   * Dispatches to the direct-media, album or single-image path.
   *
   * @throws ExtractorError when the URL does not match `_VALID_URL`.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const match = url.match(this._VALID_URL);
    if (!match) throw new ExtractorError(`Invalid Imgur URL: ${url}`);
    const itemId = match[1];

    if (url.includes("i.imgur.com") && /\.(gif|gifv|mp4|webm)$/i.test(url)) {
      return this.extractDirectMedia(url, itemId);
    }

    const isAlbum = url.includes("/a/") || url.includes("/gallery/");
    return isAlbum
      ? this.extractAlbum(url, itemId)
      : this.extractSingle(url, itemId);
  }

  /**
   * Builds info straight from a media URL. `.gif` and `.gifv` are rewritten
   * to `.mp4`. The reported extension follows the final URL, so a `.webm`
   * link is labelled `webm` rather than `mp4`.
   */
  private extractDirectMedia(url: string, id: string): InfoDict {
    const mediaUrl = url.replace(/\.gifv?$/i, ".mp4");
    const ext = /\.webm$/i.test(mediaUrl) ? "webm" : "mp4";
    return {
      id,
      title: id,
      url: mediaUrl,
      ext,
      formats: [{ format_id: ext, url: mediaUrl, ext }],
      webpage_url: url,
      extractor: this._NAME,
    };
  }

  /**
   * Looks up one image through the API. If the API call is not OK, falls
   * back to assuming `https://i.imgur.com/<id>.mp4` exists.
   *
   * @throws ExtractorError when the response carries no `data`.
   */
  private async extractSingle(url: string, id: string): Promise<InfoDict> {
    const apiUrl = `https://api.imgur.com/3/image/${id}`;
    const response = await fetch(apiUrl, { headers: ImgurExtractor.API_HEADERS });

    if (!response.ok) {
      return this.extractDirectMedia(`https://i.imgur.com/${id}.mp4`, id);
    }

    const data = (await response.json()) as ImgurApiResponse;
    const image = data.data as ImgurImage | undefined;

    if (!image) throw new ExtractorError("Imgur: no image data in response");

    return this.buildImageInfo(image, url);
  }

  /**
   * Fetches an album's images and keeps only video or animated ones.
   * One match is returned as a plain video; several become a playlist.
   *
   * @throws ExtractorError when the API call fails, the payload is not a
   *   list, or no video/animated item is present.
   */
  private async extractAlbum(url: string, id: string): Promise<InfoDict> {
    const apiUrl = `https://api.imgur.com/3/album/${id}/images`;
    const response = await fetch(apiUrl, { headers: ImgurExtractor.API_HEADERS });

    if (!response.ok) {
      throw new ExtractorError(`Imgur album API error: ${response.status}`);
    }

    const data = (await response.json()) as ImgurApiResponse;
    const images = data.data as ImgurImage[] | undefined;

    if (!images || !Array.isArray(images)) {
      throw new ExtractorError("Imgur: no album images in response");
    }

    const videoImages = images.filter((img) => img.animated || img.type?.includes("video") || img.mp4);

    if (videoImages.length === 0) {
      throw new ExtractorError("Imgur: album contains no video/animated content");
    }

    if (videoImages.length === 1) {
      return this.buildImageInfo(videoImages[0], url);
    }

    const entries = videoImages.map((img, i) => ({
      ...this.buildImageInfo(img, `https://imgur.com/${img.id ?? ""}`),
      playlist_index: i + 1,
    }));

    return {
      id,
      title: `Imgur Album ${id}`,
      entries,
      _type: "playlist" as const,
      playlist_count: entries.length,
      webpage_url: url,
      extractor: this._NAME,
    };
  }

  /**
   * Converts an API image object into an InfoDict.
   *
   * An mp4 format is added when the item is animated, has an explicit
   * `mp4` URL, or has a `type` containing "gif" or "video". The "video"
   * case mirrors the album filter so every item that filter accepts gets
   * a format.
   */
  private buildImageInfo(image: ImgurImage, url: string): InfoDict {
    const id = image.id ?? "unknown";
    const formats: Format[] = [];
    const thumbnails: Thumbnail[] = [];

    const mp4Url = image.mp4 ?? `https://i.imgur.com/${id}.mp4`;

    const isPlayable =
      image.animated ||
      image.type?.includes("gif") ||
      image.type?.includes("video") ||
      image.mp4;

    if (isPlayable) {
      formats.push({
        format_id: "mp4",
        url: mp4Url,
        ext: "mp4",
        width: image.width,
        height: image.height,
        filesize: image.size,
      });
    }

    if (image.link) {
      thumbnails.push({ url: image.link, width: image.width, height: image.height });
    }

    return {
      id,
      title: image.title ?? id,
      description: image.description,
      thumbnails,
      formats,
      webpage_url: url,
      extractor: this._NAME,
    };
  }
}
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
import { BaseExtractor, ExtractorError } from "../../core/types";
|
|
2
|
+
import type { InfoDict, Format, Thumbnail } from "../../core/types";
|
|
3
|
+
|
|
4
|
+
/**
 * Matches Instagram post and reel URLs (`/p/`, `/reel/`, `/reels/`) with an
 * optional `www.` host prefix. Group 1 captures the shortcode.
 */
const VALID_URL =
  /https?:\/\/(?:www\.)?instagram\.com\/(?:p|reel|reels)\/([A-Za-z0-9_-]+)/;
|
|
6
|
+
|
|
7
|
+
/**
 * Fields read from an Instagram `shortcode_media` node, or from a child
 * node of a sidecar (multi-item) post. All fields are optional because
 * the payload is not validated.
 */
interface IGMediaNode {
  __typename?: string;
  id?: string;
  shortcode?: string;
  video_url?: string;
  display_url?: string;
  is_video?: boolean;
  dimensions?: { width?: number; height?: number };
  accessibility_caption?: string;
  /** The first edge's text is used as the description. */
  edge_media_to_caption?: { edges?: Array<{ node?: { text?: string } }> };
  owner?: {
    id?: string;
    username?: string;
    full_name?: string;
    profile_pic_url?: string;
  };
  /** Multiplied by 1000 before being passed to `Date`, i.e. treated as seconds. */
  taken_at_timestamp?: number;
  video_view_count?: number;
  edge_liked_by?: { count?: number };
  edge_media_to_comment?: { count?: number };
  video_duration?: number;
  /** Child media of a sidecar post. */
  edge_sidecar_to_children?: {
    edges?: Array<{ node?: IGMediaNode }>;
  };
  thumbnail_src?: string;
  thumbnail_resources?: Array<{
    src?: string;
    config_width?: number;
    config_height?: number;
  }>;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Shape of the `window._sharedData` JSON embedded in a post page, limited
 * to the path the extractor reads:
 * `entry_data.PostPage[0].graphql.shortcode_media`.
 */
interface IGSharedData {
  entry_data?: {
    PostPage?: Array<{
      graphql?: { shortcode_media?: IGMediaNode };
    }>;
  };
  config?: { viewer?: { id?: string } };
}
|
|
47
|
+
|
|
48
|
+
/**
 * Shape shared by the `?__a=1` JSON response and the payload passed to
 * `window.__additionalDataLoaded(...)`: media lives at
 * `graphql.shortcode_media`.
 */
interface IGAdditionalData {
  graphql?: { shortcode_media?: IGMediaNode };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Extractor for individual Instagram posts and reels.
 *
 * Tries the `?__a=1&__d=dis` JSON endpoint first. If that yields no media,
 * it downloads the post page and searches the embedded JSON.
 */
export class InstagramExtractor extends BaseExtractor {
  readonly _VALID_URL = VALID_URL;
  readonly _NAME = "instagram";

  /** Browser-like headers sent with page requests. */
  private readonly _headers: Record<string, string> = {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    Accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
  };

  /**
   * Searches a post page's HTML for media JSON, trying in order:
   * 1. `window._sharedData = {...};`
   * 2. `window.__additionalDataLoaded('<key>', {...});`
   * 3. any `<script type="application/json">` whose top-level object has a
   *    `shortcode_media` key.
   *
   * @returns The first media node found, or `null` when none is found.
   */
  private extractSharedData(html: string): IGMediaNode | null {
    const sharedDataMatch = html.match(
      /window\._sharedData\s*=\s*(\{[\s\S]*?\});(?:\s*<\/script>|\s*window\.)/,
    );
    if (sharedDataMatch) {
      try {
        const sharedData = JSON.parse(sharedDataMatch[1]) as IGSharedData;
        const media =
          sharedData?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;
        if (media) return media;
      } catch {
        // Unparseable blob: fall through to the next strategy.
      }
    }

    const additionalDataMatch = html.match(
      /window\.__additionalDataLoaded\s*\(\s*['"][^'"]+['"]\s*,\s*(\{[\s\S]*?\})\s*\)\s*;/,
    );
    if (additionalDataMatch) {
      try {
        const additionalData = JSON.parse(
          additionalDataMatch[1],
        ) as IGAdditionalData;
        const media = additionalData?.graphql?.shortcode_media;
        if (media) return media;
      } catch {
        // Unparseable blob: fall through to the next strategy.
      }
    }

    const scriptMatches = html.matchAll(
      /<script[^>]*type="application\/json"[^>]*>([\s\S]*?)<\/script>/g,
    );
    for (const scriptMatch of scriptMatches) {
      try {
        const data = JSON.parse(scriptMatch[1]) as Record<string, unknown>;
        if (data && typeof data === "object" && "shortcode_media" in data) {
          return (data as { shortcode_media: IGMediaNode }).shortcode_media;
        }
      } catch {
        // Not valid JSON: try the next script tag.
      }
    }

    return null;
  }

  /**
   * Converts a media node into an InfoDict.
   *
   * A sidecar with more than one child becomes a playlist. Anything else is
   * returned as a single video; a sidecar with exactly one child reuses
   * that child's formats so the result is not left without formats.
   *
   * @param media - The `shortcode_media` node.
   * @param shortcode - Shortcode from the URL, used as an id/title fallback.
   * @param url - The original URL, stored as `webpage_url`.
   */
  private buildInfoFromMedia(
    media: IGMediaNode,
    shortcode: string,
    url: string,
  ): InfoDict {
    const formats: Format[] = [];
    const thumbnails: Thumbnail[] = [];
    const entries: InfoDict[] = [];

    const description =
      media.edge_media_to_caption?.edges?.[0]?.node?.text ?? undefined;
    // `||` on purpose below: an empty caption must use the fallback title.
    const captionTitle = description?.slice(0, 100);

    const uploadDate = media.taken_at_timestamp
      ? new Date(media.taken_at_timestamp * 1000)
          .toISOString()
          .slice(0, 10)
          .replace(/-/g, "")
      : undefined;

    if (media.thumbnail_resources) {
      for (const thumb of media.thumbnail_resources) {
        if (thumb.src) {
          thumbnails.push({
            url: thumb.src,
            width: thumb.config_width,
            height: thumb.config_height,
          });
        }
      }
    }
    if (media.thumbnail_src) {
      thumbnails.push({ url: media.thumbnail_src });
    }
    if (media.display_url) {
      thumbnails.push({ url: media.display_url, preference: 1 });
    }

    if (media.edge_sidecar_to_children?.edges) {
      for (const edge of media.edge_sidecar_to_children.edges) {
        const node = edge.node;
        if (!node) continue;
        const entryFormats: Format[] = [];
        const entryThumbs: Thumbnail[] = [];

        if (node.is_video && node.video_url) {
          entryFormats.push({
            format_id: "mp4",
            url: node.video_url,
            ext: "mp4",
            width: node.dimensions?.width,
            height: node.dimensions?.height,
          });
        }
        if (node.display_url) {
          entryThumbs.push({ url: node.display_url });
        }

        entries.push({
          id: node.id ?? node.shortcode ?? "unknown",
          title: captionTitle || `Instagram media`,
          description,
          formats: entryFormats,
          thumbnails: entryThumbs,
          _type: node.is_video ? "video" : "url",
          url: node.is_video ? node.video_url : node.display_url,
        });
      }
    }

    if (entries.length === 0 && media.is_video && media.video_url) {
      formats.push({
        format_id: "mp4",
        url: media.video_url,
        ext: "mp4",
        width: media.dimensions?.width,
        height: media.dimensions?.height,
      });
    }

    // A one-child sidecar is returned as a single video below, so carry the
    // child's formats over.
    const soleEntry = entries.length === 1 ? entries[0] : undefined;
    if (soleEntry?.formats) {
      formats.push(...soleEntry.formats);
    }

    const base: InfoDict = {
      id: media.id ?? shortcode,
      title: captionTitle || `Instagram post ${shortcode}`,
      description,
      uploader: media.owner?.full_name ?? media.owner?.username,
      uploader_id: media.owner?.username,
      uploader_url: media.owner?.username
        ? `https://www.instagram.com/${media.owner.username}/`
        : undefined,
      channel_id: media.owner?.id,
      upload_date: uploadDate,
      timestamp: media.taken_at_timestamp,
      view_count: media.video_view_count,
      like_count: media.edge_liked_by?.count,
      comment_count: media.edge_media_to_comment?.count,
      duration: media.video_duration,
      thumbnails,
      webpage_url: url,
    };

    if (entries.length > 1) {
      return { ...base, _type: "playlist", entries, playlist_count: entries.length };
    }

    return { ...base, formats, _type: "video" };
  }

  /**
   * Extracts a post by shortcode.
   *
   * @throws ExtractorError when the URL does not match, the page request
   *   fails, or no media data can be located.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const match = VALID_URL.exec(url);
    if (!match) throw new ExtractorError(`instagram: invalid URL: ${url}`);
    const shortcode = match[1];

    const apiUrl = `https://www.instagram.com/p/${shortcode}/?__a=1&__d=dis`;
    const apiResp = await fetch(apiUrl, {
      headers: {
        ...this._headers,
        Accept: "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
      },
    });

    if (apiResp.ok) {
      try {
        const data = (await apiResp.json()) as IGAdditionalData;
        const media = data?.graphql?.shortcode_media;
        if (media) {
          return this.buildInfoFromMedia(media, shortcode, url);
        }
      } catch {
        // Response was not JSON: fall back to scraping the page below.
      }
    }

    const pageResp = await fetch(`https://www.instagram.com/p/${shortcode}/`, {
      headers: this._headers,
    });

    if (!pageResp.ok) {
      throw new ExtractorError(
        `instagram: page fetch failed: ${pageResp.status} ${pageResp.statusText}`,
      );
    }

    const html = await pageResp.text();
    const media = this.extractSharedData(html);

    if (!media) {
      throw new ExtractorError(
        `instagram: could not extract media data for post ${shortcode}`,
      );
    }

    return this.buildInfoFromMedia(media, shortcode, url);
  }
}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { BaseExtractor, ExtractorError } from "../../core/types";
|
|
2
|
+
import type { InfoDict, Format, Thumbnail } from "../../core/types";
|
|
3
|
+
|
|
4
|
+
/**
 * Matches the Instagram reels feed root (`/reel` or `/reels`), with an
 * optional trailing slash and optional query string, and nothing after it.
 */
const VALID_URL = /https?:\/\/(?:www\.)?instagram\.com\/reels?\/?(?:\?.*)?$/;
|
|
5
|
+
|
|
6
|
+
/** Value sent as the `doc_id` form field of the reels GraphQL request. */
const REELS_DOC_ID = "8845758582119845";
|
|
7
|
+
|
|
8
|
+
/**
 * Fields read from a reel's `media` object in the reels GraphQL response.
 * All fields are optional because the payload is not validated.
 */
interface ReelsEdgeNode {
  id?: string;
  shortcode?: string;
  is_video?: boolean;
  video_url?: string;
  display_url?: string;
  dimensions?: { width?: number; height?: number };
  video_view_count?: number;
  video_duration?: number;
  /** Multiplied by 1000 before being passed to `Date`, i.e. treated as seconds. */
  taken_at_timestamp?: number;
  edge_liked_by?: { count?: number };
  edge_media_to_comment?: { count?: number };
  /** The first edge's text is used as the description. */
  edge_media_to_caption?: { edges?: Array<{ node?: { text?: string } }> };
  owner?: {
    id?: string;
    username?: string;
    full_name?: string;
  };
}
|
|
27
|
+
|
|
28
|
+
/**
 * Shape of the reels GraphQL response as consumed by the extractor: a
 * connection with media edges and pagination info.
 */
interface ReelsResponse {
  data?: {
    xdt_api__v1__clips__home__connection_v2?: {
      edges?: Array<{ node?: { media?: ReelsEdgeNode } }>;
      page_info?: { end_cursor?: string; has_next_page?: boolean };
    };
  };
}
|
|
36
|
+
|
|
37
|
+
/**
 * Extractor for the Instagram reels feed.
 *
 * Pages through the reels GraphQL query (at most five pages) and returns
 * the video reels as a playlist.
 */
export class InstagramReelsExtractor extends BaseExtractor {
  readonly _VALID_URL = VALID_URL;
  readonly _NAME = "instagram:reels";

  /** Headers sent with every GraphQL request. */
  private readonly _headers: Record<string, string> = {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    Accept: "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Content-Type": "application/x-www-form-urlencoded",
    "X-FB-Friendly-Name": "PolarisClipsHomePageQuery",
    "X-IG-App-ID": "936619743392459",
    Origin: "https://www.instagram.com",
    Referer: "https://www.instagram.com/reels/",
  };

  /**
   * Converts one reel media node into a playlist entry.
   *
   * @returns `null` when the node is not a video or has no `video_url`.
   */
  private buildEntryFromNode(node: ReelsEdgeNode): InfoDict | null {
    if (!node.is_video || !node.video_url) return null;

    const shortcode = node.shortcode ?? node.id ?? "unknown";
    const description = node.edge_media_to_caption?.edges?.[0]?.node?.text;
    const formats: Format[] = [
      {
        format_id: "mp4",
        url: node.video_url,
        ext: "mp4",
        width: node.dimensions?.width,
        height: node.dimensions?.height,
      },
    ];
    const thumbnails: Thumbnail[] = node.display_url
      ? [{ url: node.display_url }]
      : [];
    const uploadDate = node.taken_at_timestamp
      ? new Date(node.taken_at_timestamp * 1000)
          .toISOString()
          .slice(0, 10)
          .replace(/-/g, "")
      : undefined;

    return {
      id: node.id ?? shortcode,
      // `||` on purpose: an empty caption must use the fallback title.
      title: description?.slice(0, 100) || `Instagram Reel ${shortcode}`,
      description,
      url: `https://www.instagram.com/reel/${shortcode}/`,
      webpage_url: `https://www.instagram.com/reel/${shortcode}/`,
      uploader: node.owner?.full_name ?? node.owner?.username,
      uploader_id: node.owner?.username,
      uploader_url: node.owner?.username
        ? `https://www.instagram.com/${node.owner.username}/`
        : undefined,
      channel_id: node.owner?.id,
      timestamp: node.taken_at_timestamp,
      upload_date: uploadDate,
      view_count: node.video_view_count,
      like_count: node.edge_liked_by?.count,
      comment_count: node.edge_media_to_comment?.count,
      duration: node.video_duration,
      formats,
      thumbnails,
      _type: "video",
    };
  }

  /**
   * Requests one page of the reels feed.
   *
   * @param cursor - Pagination cursor, sent as `after` when provided.
   * @throws ExtractorError when the request is not OK or the body is not
   *   valid JSON.
   */
  private async fetchReelsPage(cursor?: string): Promise<ReelsResponse> {
    const variables: Record<string, unknown> = {
      surface: "REELS_TAB",
      has_threaded_comments: true,
    };
    if (cursor) variables["after"] = cursor;

    const body = new URLSearchParams({
      doc_id: REELS_DOC_ID,
      variables: JSON.stringify(variables),
      server_timestamps: "true",
    });

    const resp = await fetch("https://www.instagram.com/graphql/query", {
      method: "POST",
      headers: this._headers,
      body: body.toString(),
    });

    if (!resp.ok) {
      throw new ExtractorError(
        `instagram:reels: GraphQL request failed: ${resp.status}`,
      );
    }

    try {
      return (await resp.json()) as ReelsResponse;
    } catch {
      // A 200 response with a non-JSON body would otherwise surface as a
      // raw SyntaxError.
      throw new ExtractorError(
        "instagram:reels: GraphQL response was not valid JSON",
      );
    }
  }

  /**
   * Collects reels into a playlist.
   *
   * Stops after five pages, on an empty page, when the API reports no
   * further page, or when no cursor is supplied for the next page.
   */
  protected async _real_extract(url: string): Promise<InfoDict> {
    const entries: InfoDict[] = [];
    let cursor: string | undefined = undefined;
    const maxPages = 5;

    for (let pageCount = 0; pageCount < maxPages; pageCount++) {
      const data: ReelsResponse = await this.fetchReelsPage(cursor);
      const connection =
        data?.data?.xdt_api__v1__clips__home__connection_v2;
      const edges = connection?.edges ?? [];

      for (const edge of edges) {
        const media = edge.node?.media;
        if (!media) continue;
        const entry = this.buildEntryFromNode(media);
        if (entry) entries.push(entry);
      }

      if (edges.length === 0) break;

      const pageInfo = connection?.page_info;
      cursor = pageInfo?.end_cursor;
      // Without a cursor the next request would return the first page again
      // and duplicate every entry.
      if (!pageInfo?.has_next_page || !cursor) break;
    }

    return {
      id: "instagram-reels",
      title: "Instagram Reels",
      webpage_url: url,
      _type: "playlist",
      entries,
      playlist_count: entries.length,
    };
  }
}
|