@zetagoaurum-dev/straw 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +4 -0
- package/dist/index.d.mts +8 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +43 -7
- package/dist/index.mjs +43 -7
- package/download test/Blue Archive - Maki (Camp) Live2D_HD.webm +0 -0
- package/download_test.js +46 -0
- package/downloaded_video.mp4 +0 -0
- package/find_keys.js +25 -0
- package/next_api_dump.json +34987 -0
- package/package.json +1 -1
- package/player_api_dump.json +2131 -0
- package/release.bat +1 -1
- package/src/scrapers/youtube.ts +54 -11
- package/straw/youtube.py +72 -10
- package/test_aqz.js +54 -0
- package/test_extract.js +41 -0
- package/test_metadata.js +33 -0
- package/{test_visitor.js → test_next.js} +10 -13
- package/tests/test.py +7 -3
- package/tests/test.ts +6 -3
- package/ytInitialData_dump.json +17156 -0
- package/test_api.js +0 -42
- package/test_api_clients.js +0 -39
- package/test_client.js +0 -37
- package/test_embed.js +0 -26
- package/test_html.js +0 -26
- package/test_vr.js +0 -27
- package/test_yt.js +0 -17
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [1.2.0] "Deep Metadata & Formats Engine" - 2026-02-27
|
|
6
|
+
- **Feat:** Added extraction of `subscribers`, `likes`, and `comments` directly from YouTube's `ytInitialData` payload, with no external parsing overhead.
|
|
7
|
+
- **Feat:** Split `formats` into an object with three categories: `video` (combined video and audio), `videoOnly`, and `audio` (audio-only), removing ambiguity when downloading specific streams.
|
|
8
|
+
|
|
5
9
|
## [1.1.1] "Performance Patch" - 2026-02-27
|
|
6
10
|
- **Perf:** Re-engineered the YouTube scraper in Node.js and Python to use the `IOS` InnerTube API directly, injecting localized `visitorData` tokens to seamlessly bypass bot checks and cipher encryption. Video format lists are returned instantaneously for optimal downloading infrastructure.
|
|
7
11
|
- **Fix:** Fixed HTML parser blocking on high-volume deployed servers by upgrading to the direct `POST /youtubei/v1/player` endpoints.
|
package/dist/index.d.mts
CHANGED
|
@@ -51,11 +51,18 @@ interface YouTubeFormats {
|
|
|
51
51
|
interface YouTubeResult {
|
|
52
52
|
title: string;
|
|
53
53
|
author: string;
|
|
54
|
+
subscribers: string;
|
|
54
55
|
description: string;
|
|
55
56
|
views: string;
|
|
57
|
+
likes: string;
|
|
58
|
+
comments: string;
|
|
56
59
|
durationSeconds: string;
|
|
57
60
|
thumbnail: string;
|
|
58
|
-
formats:
|
|
61
|
+
formats: {
|
|
62
|
+
video: YouTubeFormats[];
|
|
63
|
+
videoOnly: YouTubeFormats[];
|
|
64
|
+
audio: YouTubeFormats[];
|
|
65
|
+
};
|
|
59
66
|
}
|
|
60
67
|
declare class YouTubeScraper {
|
|
61
68
|
private client;
|
package/dist/index.d.ts
CHANGED
|
@@ -51,11 +51,18 @@ interface YouTubeFormats {
|
|
|
51
51
|
interface YouTubeResult {
|
|
52
52
|
title: string;
|
|
53
53
|
author: string;
|
|
54
|
+
subscribers: string;
|
|
54
55
|
description: string;
|
|
55
56
|
views: string;
|
|
57
|
+
likes: string;
|
|
58
|
+
comments: string;
|
|
56
59
|
durationSeconds: string;
|
|
57
60
|
thumbnail: string;
|
|
58
|
-
formats:
|
|
61
|
+
formats: {
|
|
62
|
+
video: YouTubeFormats[];
|
|
63
|
+
videoOnly: YouTubeFormats[];
|
|
64
|
+
audio: YouTubeFormats[];
|
|
65
|
+
};
|
|
59
66
|
}
|
|
60
67
|
declare class YouTubeScraper {
|
|
61
68
|
private client;
|
package/dist/index.js
CHANGED
|
@@ -191,12 +191,20 @@ var YouTubeScraper = class {
|
|
|
191
191
|
}
|
|
192
192
|
const videoId = videoIdMatch[1];
|
|
193
193
|
const html = await this.client.getText(url, {
|
|
194
|
-
headers: { "Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430" }
|
|
194
|
+
headers: { "Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430", "Accept-Language": "en-US,en;q=0.9" }
|
|
195
195
|
});
|
|
196
196
|
const regex = /ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/;
|
|
197
197
|
const match = html.match(regex);
|
|
198
198
|
let visitorData = "";
|
|
199
199
|
let details = {};
|
|
200
|
+
let initialData = {};
|
|
201
|
+
const dataMatch = html.match(/var ytInitialData\s*=\s*({.*?});(?:<\/script>)/);
|
|
202
|
+
if (dataMatch && dataMatch[1]) {
|
|
203
|
+
try {
|
|
204
|
+
initialData = JSON.parse(dataMatch[1]);
|
|
205
|
+
} catch (e) {
|
|
206
|
+
}
|
|
207
|
+
}
|
|
200
208
|
if (match && match[1]) {
|
|
201
209
|
const data = JSON.parse(match[1]);
|
|
202
210
|
details = data?.videoDetails || {};
|
|
@@ -232,6 +240,7 @@ var YouTubeScraper = class {
|
|
|
232
240
|
body: JSON.stringify(payload)
|
|
233
241
|
});
|
|
234
242
|
const apiData = await res.json();
|
|
243
|
+
console.log("Playability Status:", apiData?.playabilityStatus?.status, "StreamingData keys:", Object.keys(apiData?.streamingData || {}));
|
|
235
244
|
if (!details.title) {
|
|
236
245
|
details = apiData?.videoDetails || {};
|
|
237
246
|
}
|
|
@@ -239,12 +248,31 @@ var YouTubeScraper = class {
|
|
|
239
248
|
if (!details) {
|
|
240
249
|
throw new Error("Video details not found inside player response.");
|
|
241
250
|
}
|
|
242
|
-
|
|
251
|
+
let subscribers = "";
|
|
252
|
+
let likes = "";
|
|
253
|
+
let comments = "";
|
|
254
|
+
try {
|
|
255
|
+
const secInfo = initialData?.contents?.twoColumnWatchNextResults?.results?.results?.contents?.find((c) => c.videoSecondaryInfoRenderer)?.videoSecondaryInfoRenderer;
|
|
256
|
+
if (secInfo?.owner?.videoOwnerRenderer?.subscriberCountText?.simpleText) {
|
|
257
|
+
subscribers = secInfo.owner.videoOwnerRenderer.subscriberCountText.simpleText;
|
|
258
|
+
}
|
|
259
|
+
const factoids = initialData?.engagementPanels?.find((p) => p.engagementPanelSectionListRenderer?.targetId === "engagement-panel-structured-description")?.engagementPanelSectionListRenderer?.content?.structuredDescriptionContentRenderer?.items?.find((i) => i.videoDescriptionHeaderRenderer)?.videoDescriptionHeaderRenderer?.factoid || [];
|
|
260
|
+
const likesFactoid = factoids.find((f) => f.factoidRenderer?.accessibilityText?.toLowerCase().includes("like"));
|
|
261
|
+
if (likesFactoid) likes = likesFactoid.factoidRenderer.accessibilityText;
|
|
262
|
+
const commentsPanel = initialData?.engagementPanels?.find((p) => p.engagementPanelSectionListRenderer?.panelIdentifier === "engagement-panel-comments-section");
|
|
263
|
+
if (commentsPanel) {
|
|
264
|
+
comments = commentsPanel.engagementPanelSectionListRenderer.header.engagementPanelTitleHeaderRenderer.contextualInfo?.runs?.[0]?.text || "";
|
|
265
|
+
}
|
|
266
|
+
} catch (e) {
|
|
267
|
+
}
|
|
268
|
+
const video = [];
|
|
269
|
+
const videoOnly = [];
|
|
270
|
+
const audio = [];
|
|
243
271
|
const rawFormats = [...streamingData?.formats || [], ...streamingData?.adaptiveFormats || []];
|
|
244
272
|
for (const format of rawFormats) {
|
|
245
273
|
if (format.url) {
|
|
246
274
|
const mimeType = format.mimeType || "";
|
|
247
|
-
|
|
275
|
+
const formatObj = {
|
|
248
276
|
url: format.url,
|
|
249
277
|
mimeType,
|
|
250
278
|
width: format.width,
|
|
@@ -253,19 +281,27 @@ var YouTubeScraper = class {
|
|
|
253
281
|
bitrate: format.bitrate,
|
|
254
282
|
hasAudio: mimeType.includes("audio/"),
|
|
255
283
|
hasVideo: mimeType.includes("video/")
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
|
|
284
|
+
};
|
|
285
|
+
if (formatObj.hasVideo && formatObj.hasAudio) video.push(formatObj);
|
|
286
|
+
else if (formatObj.hasVideo) videoOnly.push(formatObj);
|
|
287
|
+
else if (formatObj.hasAudio) audio.push(formatObj);
|
|
259
288
|
}
|
|
260
289
|
}
|
|
261
290
|
return {
|
|
262
291
|
title: details.title || "",
|
|
263
292
|
author: details.author || "",
|
|
293
|
+
subscribers,
|
|
264
294
|
description: details.shortDescription || "",
|
|
265
295
|
views: details.viewCount || "0",
|
|
296
|
+
likes,
|
|
297
|
+
comments,
|
|
266
298
|
durationSeconds: details.lengthSeconds || "0",
|
|
267
299
|
thumbnail: details.thumbnail?.thumbnails?.[details.thumbnail.thumbnails.length - 1]?.url || "",
|
|
268
|
-
formats
|
|
300
|
+
formats: {
|
|
301
|
+
video,
|
|
302
|
+
videoOnly,
|
|
303
|
+
audio
|
|
304
|
+
}
|
|
269
305
|
};
|
|
270
306
|
}
|
|
271
307
|
};
|
package/dist/index.mjs
CHANGED
|
@@ -151,12 +151,20 @@ var YouTubeScraper = class {
|
|
|
151
151
|
}
|
|
152
152
|
const videoId = videoIdMatch[1];
|
|
153
153
|
const html = await this.client.getText(url, {
|
|
154
|
-
headers: { "Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430" }
|
|
154
|
+
headers: { "Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430", "Accept-Language": "en-US,en;q=0.9" }
|
|
155
155
|
});
|
|
156
156
|
const regex = /ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/;
|
|
157
157
|
const match = html.match(regex);
|
|
158
158
|
let visitorData = "";
|
|
159
159
|
let details = {};
|
|
160
|
+
let initialData = {};
|
|
161
|
+
const dataMatch = html.match(/var ytInitialData\s*=\s*({.*?});(?:<\/script>)/);
|
|
162
|
+
if (dataMatch && dataMatch[1]) {
|
|
163
|
+
try {
|
|
164
|
+
initialData = JSON.parse(dataMatch[1]);
|
|
165
|
+
} catch (e) {
|
|
166
|
+
}
|
|
167
|
+
}
|
|
160
168
|
if (match && match[1]) {
|
|
161
169
|
const data = JSON.parse(match[1]);
|
|
162
170
|
details = data?.videoDetails || {};
|
|
@@ -192,6 +200,7 @@ var YouTubeScraper = class {
|
|
|
192
200
|
body: JSON.stringify(payload)
|
|
193
201
|
});
|
|
194
202
|
const apiData = await res.json();
|
|
203
|
+
console.log("Playability Status:", apiData?.playabilityStatus?.status, "StreamingData keys:", Object.keys(apiData?.streamingData || {}));
|
|
195
204
|
if (!details.title) {
|
|
196
205
|
details = apiData?.videoDetails || {};
|
|
197
206
|
}
|
|
@@ -199,12 +208,31 @@ var YouTubeScraper = class {
|
|
|
199
208
|
if (!details) {
|
|
200
209
|
throw new Error("Video details not found inside player response.");
|
|
201
210
|
}
|
|
202
|
-
|
|
211
|
+
let subscribers = "";
|
|
212
|
+
let likes = "";
|
|
213
|
+
let comments = "";
|
|
214
|
+
try {
|
|
215
|
+
const secInfo = initialData?.contents?.twoColumnWatchNextResults?.results?.results?.contents?.find((c) => c.videoSecondaryInfoRenderer)?.videoSecondaryInfoRenderer;
|
|
216
|
+
if (secInfo?.owner?.videoOwnerRenderer?.subscriberCountText?.simpleText) {
|
|
217
|
+
subscribers = secInfo.owner.videoOwnerRenderer.subscriberCountText.simpleText;
|
|
218
|
+
}
|
|
219
|
+
const factoids = initialData?.engagementPanels?.find((p) => p.engagementPanelSectionListRenderer?.targetId === "engagement-panel-structured-description")?.engagementPanelSectionListRenderer?.content?.structuredDescriptionContentRenderer?.items?.find((i) => i.videoDescriptionHeaderRenderer)?.videoDescriptionHeaderRenderer?.factoid || [];
|
|
220
|
+
const likesFactoid = factoids.find((f) => f.factoidRenderer?.accessibilityText?.toLowerCase().includes("like"));
|
|
221
|
+
if (likesFactoid) likes = likesFactoid.factoidRenderer.accessibilityText;
|
|
222
|
+
const commentsPanel = initialData?.engagementPanels?.find((p) => p.engagementPanelSectionListRenderer?.panelIdentifier === "engagement-panel-comments-section");
|
|
223
|
+
if (commentsPanel) {
|
|
224
|
+
comments = commentsPanel.engagementPanelSectionListRenderer.header.engagementPanelTitleHeaderRenderer.contextualInfo?.runs?.[0]?.text || "";
|
|
225
|
+
}
|
|
226
|
+
} catch (e) {
|
|
227
|
+
}
|
|
228
|
+
const video = [];
|
|
229
|
+
const videoOnly = [];
|
|
230
|
+
const audio = [];
|
|
203
231
|
const rawFormats = [...streamingData?.formats || [], ...streamingData?.adaptiveFormats || []];
|
|
204
232
|
for (const format of rawFormats) {
|
|
205
233
|
if (format.url) {
|
|
206
234
|
const mimeType = format.mimeType || "";
|
|
207
|
-
|
|
235
|
+
const formatObj = {
|
|
208
236
|
url: format.url,
|
|
209
237
|
mimeType,
|
|
210
238
|
width: format.width,
|
|
@@ -213,19 +241,27 @@ var YouTubeScraper = class {
|
|
|
213
241
|
bitrate: format.bitrate,
|
|
214
242
|
hasAudio: mimeType.includes("audio/"),
|
|
215
243
|
hasVideo: mimeType.includes("video/")
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
|
|
244
|
+
};
|
|
245
|
+
if (formatObj.hasVideo && formatObj.hasAudio) video.push(formatObj);
|
|
246
|
+
else if (formatObj.hasVideo) videoOnly.push(formatObj);
|
|
247
|
+
else if (formatObj.hasAudio) audio.push(formatObj);
|
|
219
248
|
}
|
|
220
249
|
}
|
|
221
250
|
return {
|
|
222
251
|
title: details.title || "",
|
|
223
252
|
author: details.author || "",
|
|
253
|
+
subscribers,
|
|
224
254
|
description: details.shortDescription || "",
|
|
225
255
|
views: details.viewCount || "0",
|
|
256
|
+
likes,
|
|
257
|
+
comments,
|
|
226
258
|
durationSeconds: details.lengthSeconds || "0",
|
|
227
259
|
thumbnail: details.thumbnail?.thumbnails?.[details.thumbnail.thumbnails.length - 1]?.url || "",
|
|
228
|
-
formats
|
|
260
|
+
formats: {
|
|
261
|
+
video,
|
|
262
|
+
videoOnly,
|
|
263
|
+
audio
|
|
264
|
+
}
|
|
229
265
|
};
|
|
230
266
|
}
|
|
231
267
|
};
|
|
File without changes
|
package/download_test.js
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
const { fetch } = require('undici');
|
|
3
|
+
const straw = require('./dist/index.js');
|
|
4
|
+
|
|
5
|
+
/**
 * Scrapes one hard-coded YouTube URL with `straw.YouTubeScraper`, picks a
 * format, and streams it to `downloaded_video.mp4` (relative to the current
 * working directory) while printing progress to stdout.
 *
 * Format choice: the first format that has both video and audio, otherwise
 * the video format with the largest `width`.
 *
 * @returns {Promise<void>} Resolves once the file has been fully flushed, or
 *   immediately after logging when no usable format exists.
 * @throws {Error} When the HTTP response is not OK, has no body, or the
 *   write stream fails.
 */
async function download() {
    const yt = new straw.YouTubeScraper();
    console.log('Scraping metadata and direct links...');
    const res = await yt.scrapeVideo('https://youtu.be/_4j1Abt_AiM?si=_dA2lroz096f1cYp');

    // As of 1.2.0 `formats` is an object of bins ({ video, videoOnly, audio })
    // rather than a flat array, so calling `.find` on it directly throws.
    // Flatten the bins (and still accept a flat array) before selecting.
    const formats = res.formats ?? {};
    const allFormats = Array.isArray(formats)
        ? formats
        : [...(formats.video ?? []), ...(formats.videoOnly ?? []), ...(formats.audio ?? [])];

    // Find a combined video+audio format, or fall back to the widest video format
    const combined = allFormats.find((f) => f.hasVideo && f.hasAudio);
    const bestVideo = allFormats
        .filter((f) => f.hasVideo)
        .sort((a, b) => (b.width || 0) - (a.width || 0))[0];

    const target = combined || bestVideo;

    if (!target) {
        console.log('No suitable downloadable format found.');
        return;
    }

    console.log(`Downloading: ${res.title}`);
    console.log(`Format: ${target.mimeType} (${target.width || 'unknown'}x${target.height || 'unknown'})`);

    console.log('Initiating download stream...');
    const response = await fetch(target.url);
    if (!response.ok) throw new Error(`Download failed with HTTP ${response.status}`);
    if (!response.body) throw new Error('No response body');

    // Open the output file only after the response is validated, so a failed
    // request does not leave an empty file behind.
    const outPath = 'downloaded_video.mp4';
    const outStream = fs.createWriteStream(outPath);

    // Resolves on `event`, rejects if the stream errors first.
    const waitFor = (event) => new Promise((resolve, reject) => {
        const onError = (err) => {
            outStream.off(event, onEvent);
            reject(err);
        };
        const onEvent = () => {
            outStream.off('error', onError);
            resolve();
        };
        outStream.once(event, onEvent);
        outStream.once('error', onError);
    });

    const reader = response.body.getReader();
    let downloaded = 0;

    try {
        while (true) {
            const { done, value } = await reader.read();
            if (done) break;
            // write() returning false means the internal buffer is full; wait
            // for 'drain' so the video is not accumulated in memory.
            if (!outStream.write(value)) {
                await waitFor('drain');
            }
            downloaded += value.length;
            process.stdout.write(`\rDownloaded: ${(downloaded / 1024 / 1024).toFixed(2)} MB`);
        }
    } catch (err) {
        outStream.destroy();
        throw err;
    }

    // Only report success after the data has actually been flushed to disk.
    const finished = waitFor('finish');
    outStream.end();
    await finished;
    console.log(`\nDownload complete! Saved to ${outPath}`);
}
|
|
45
|
+
|
|
46
|
+
// Log the failure and set a non-zero exit code so whoever runs this script
// can detect that the download did not succeed.
download().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
|
|
File without changes
|
package/find_keys.js
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
|
|
3
|
+
// Read and parse next_api_dump.json (path is relative to the current working
// directory); the resulting object is what the findKey calls below search.
const data = JSON.parse(fs.readFileSync('next_api_dump.json', 'utf-8'));
|
|
4
|
+
|
|
5
|
+
/**
 * Recursively walks `obj` and logs every own property whose name equals
 * `key`, together with its path and a preview of its value (the first 100
 * characters of its JSON serialization).
 *
 * Matching properties are still descended into, so nested occurrences of the
 * same key are reported as well. Non-object inputs are ignored.
 *
 * @param {*} obj - Value to search; only objects and arrays are traversed.
 * @param {string} key - Property name to look for.
 * @param {string} [path=''] - Path prefix accumulated during recursion.
 * @returns {void}
 */
function findKey(obj, key, path = '') {
    if (obj === null || typeof obj !== 'object') return;

    if (Array.isArray(obj)) {
        obj.forEach((item, i) => findKey(item, key, `${path}[${i}]`));
        return;
    }

    // Object.entries limits the walk to own enumerable properties.
    for (const [k, value] of Object.entries(obj)) {
        if (k === key) {
            // JSON.stringify returns undefined (not a string) for undefined,
            // function, and symbol values; guard before truncating.
            const serialized = JSON.stringify(value);
            const preview = serialized === undefined ? 'undefined' : serialized.substring(0, 100);
            console.log(`Found ${key} at ${path}.${k} =`, preview);
        }
        findKey(value, key, `${path}.${k}`);
    }
}
|
|
20
|
+
|
|
21
|
+
// Search the parsed dump for each metadata field of interest, in this order.
const keysToLocate = [
    'subscriberCountText',
    'likeCount',
    'likeCountWithLikeText',
    'description',
    'commentCount',
];
for (const wantedKey of keysToLocate) {
    findKey(data, wantedKey);
}
|