@zetagoaurum-dev/straw 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/dist/index.d.mts +8 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +88 -14
- package/dist/index.mjs +88 -14
- package/download test/Blue Archive - Maki (Camp) Live2D_HD.webm +0 -0
- package/download_test.js +46 -0
- package/downloaded_video.mp4 +0 -0
- package/find_keys.js +25 -0
- package/next_api_dump.json +34987 -0
- package/package.json +1 -1
- package/player_api_dump.json +2131 -0
- package/release.bat +4 -0
- package/src/scrapers/youtube.ts +103 -19
- package/straw/youtube.py +117 -15
- package/test_aqz.js +54 -0
- package/test_extract.js +41 -0
- package/test_metadata.js +33 -0
- package/test_next.js +53 -0
- package/tests/test.py +7 -3
- package/tests/test.ts +6 -3
- package/ytInitialData_dump.json +17156 -0
- package/straw/__pycache__/__init__.cpython-311.pyc +0 -0
- package/straw/__pycache__/client.cpython-311.pyc +0 -0
- package/straw/__pycache__/helpers.cpython-311.pyc +0 -0
- package/straw/__pycache__/media.cpython-311.pyc +0 -0
- package/straw/__pycache__/web.cpython-311.pyc +0 -0
- package/straw/__pycache__/youtube.cpython-311.pyc +0 -0
package/release.bat
ADDED
package/src/scrapers/youtube.ts
CHANGED
|
@@ -14,11 +14,18 @@ export interface YouTubeFormats {
|
|
|
14
14
|
export interface YouTubeResult {
|
|
15
15
|
title: string;
|
|
16
16
|
author: string;
|
|
17
|
+
subscribers: string;
|
|
17
18
|
description: string;
|
|
18
19
|
views: string;
|
|
20
|
+
likes: string;
|
|
21
|
+
comments: string;
|
|
19
22
|
durationSeconds: string;
|
|
20
23
|
thumbnail: string;
|
|
21
|
-
formats:
|
|
24
|
+
formats: {
|
|
25
|
+
video: YouTubeFormats[];
|
|
26
|
+
videoOnly: YouTubeFormats[];
|
|
27
|
+
audio: YouTubeFormats[];
|
|
28
|
+
};
|
|
22
29
|
}
|
|
23
30
|
|
|
24
31
|
export class YouTubeScraper {
|
|
@@ -33,35 +40,106 @@ export class YouTubeScraper {
|
|
|
33
40
|
* Parses the ytInitialPlayerResponse object embedded in the watch HTML.
|
|
34
41
|
*/
|
|
35
42
|
public async scrapeVideo(url: string): Promise<YouTubeResult> {
|
|
43
|
+
const videoIdMatch = url.match(/(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))([^"&?\/\s]{11})/);
|
|
44
|
+
if (!videoIdMatch || !videoIdMatch[1]) {
|
|
45
|
+
throw new Error('Invalid YouTube URL');
|
|
46
|
+
}
|
|
47
|
+
const videoId = videoIdMatch[1];
|
|
48
|
+
|
|
36
49
|
const html = await this.client.getText(url, {
|
|
37
|
-
headers: {
|
|
38
|
-
'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
|
|
39
|
-
}
|
|
50
|
+
headers: { 'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430', 'Accept-Language': 'en-US,en;q=0.9' }
|
|
40
51
|
});
|
|
41
52
|
|
|
42
|
-
// Find ytInitialPlayerResponse JSON fragment in the HTML
|
|
43
53
|
const regex = /ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/;
|
|
44
54
|
const match = html.match(regex);
|
|
55
|
+
let visitorData = '';
|
|
56
|
+
let details: any = {};
|
|
57
|
+
|
|
58
|
+
let initialData: any = {};
|
|
59
|
+
const dataMatch = html.match(/var ytInitialData\s*=\s*({.*?});(?:<\/script>)/);
|
|
60
|
+
if (dataMatch && dataMatch[1]) {
|
|
61
|
+
try { initialData = JSON.parse(dataMatch[1]); } catch(e) {}
|
|
62
|
+
}
|
|
45
63
|
|
|
46
|
-
if (
|
|
47
|
-
|
|
64
|
+
if (match && match[1]) {
|
|
65
|
+
const data = JSON.parse(match[1]);
|
|
66
|
+
details = data?.videoDetails || {};
|
|
67
|
+
visitorData = data?.responseContext?.visitorData || '';
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
if (!visitorData) {
|
|
71
|
+
const vdMatch = html.match(/"visitorData"\s*:\s*"([^"]+)"/);
|
|
72
|
+
if (vdMatch) visitorData = vdMatch[1];
|
|
48
73
|
}
|
|
49
74
|
|
|
50
|
-
const
|
|
51
|
-
|
|
52
|
-
|
|
75
|
+
const payload = {
|
|
76
|
+
context: {
|
|
77
|
+
client: {
|
|
78
|
+
hl: 'en',
|
|
79
|
+
gl: 'US',
|
|
80
|
+
clientName: 'IOS',
|
|
81
|
+
clientVersion: '19.28.1',
|
|
82
|
+
osName: 'iOS',
|
|
83
|
+
osVersion: '17.5.1',
|
|
84
|
+
deviceMake: 'Apple',
|
|
85
|
+
deviceModel: 'iPhone16,2',
|
|
86
|
+
visitorData: visitorData
|
|
87
|
+
}
|
|
88
|
+
},
|
|
89
|
+
videoId: videoId
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
const res = await this.client.request('https://www.youtube.com/youtubei/v1/player', {
|
|
93
|
+
method: 'POST',
|
|
94
|
+
headers: {
|
|
95
|
+
'Accept': 'application/json',
|
|
96
|
+
'Content-Type': 'application/json',
|
|
97
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
98
|
+
},
|
|
99
|
+
body: JSON.stringify(payload)
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
const apiData = await res.json() as any;
|
|
103
|
+
if (!details.title) {
|
|
104
|
+
details = apiData?.videoDetails || {};
|
|
105
|
+
}
|
|
106
|
+
const streamingData = apiData?.streamingData;
|
|
53
107
|
|
|
54
108
|
if (!details) {
|
|
55
109
|
throw new Error('Video details not found inside player response.');
|
|
56
110
|
}
|
|
57
111
|
|
|
58
|
-
|
|
112
|
+
let subscribers = '';
|
|
113
|
+
let likes = '';
|
|
114
|
+
let comments = '';
|
|
115
|
+
|
|
116
|
+
try {
|
|
117
|
+
const secInfo = initialData?.contents?.twoColumnWatchNextResults?.results?.results?.contents?.find((c: any) => c.videoSecondaryInfoRenderer)?.videoSecondaryInfoRenderer;
|
|
118
|
+
if (secInfo?.owner?.videoOwnerRenderer?.subscriberCountText?.simpleText) {
|
|
119
|
+
subscribers = secInfo.owner.videoOwnerRenderer.subscriberCountText.simpleText;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
const factoids = initialData?.engagementPanels?.find((p: any) => p.engagementPanelSectionListRenderer?.targetId === 'engagement-panel-structured-description')
|
|
123
|
+
?.engagementPanelSectionListRenderer?.content?.structuredDescriptionContentRenderer?.items?.find((i: any) => i.videoDescriptionHeaderRenderer)?.videoDescriptionHeaderRenderer?.factoid || [];
|
|
124
|
+
const likesFactoid = factoids.find((f: any) => f.factoidRenderer?.accessibilityText?.toLowerCase().includes('like'));
|
|
125
|
+
if (likesFactoid) likes = likesFactoid.factoidRenderer.accessibilityText;
|
|
126
|
+
|
|
127
|
+
const commentsPanel = initialData?.engagementPanels?.find((p: any) => p.engagementPanelSectionListRenderer?.panelIdentifier === 'engagement-panel-comments-section');
|
|
128
|
+
if (commentsPanel) {
|
|
129
|
+
comments = commentsPanel.engagementPanelSectionListRenderer.header.engagementPanelTitleHeaderRenderer.contextualInfo?.runs?.[0]?.text || '';
|
|
130
|
+
}
|
|
131
|
+
} catch (e) {}
|
|
132
|
+
|
|
133
|
+
const video: YouTubeFormats[] = [];
|
|
134
|
+
const videoOnly: YouTubeFormats[] = [];
|
|
135
|
+
const audio: YouTubeFormats[] = [];
|
|
136
|
+
|
|
59
137
|
const rawFormats = [...(streamingData?.formats || []), ...(streamingData?.adaptiveFormats || [])];
|
|
60
138
|
|
|
61
139
|
for (const format of rawFormats) {
|
|
62
140
|
if (format.url) {
|
|
63
141
|
const mimeType = format.mimeType || '';
|
|
64
|
-
|
|
142
|
+
const formatObj = {
|
|
65
143
|
url: format.url,
|
|
66
144
|
mimeType: mimeType,
|
|
67
145
|
width: format.width,
|
|
@@ -70,23 +148,29 @@ export class YouTubeScraper {
|
|
|
70
148
|
bitrate: format.bitrate,
|
|
71
149
|
hasAudio: mimeType.includes('audio/'),
|
|
72
150
|
hasVideo: mimeType.includes('video/')
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
continue;
|
|
151
|
+
};
|
|
152
|
+
|
|
153
|
+
if (formatObj.hasVideo && formatObj.hasAudio) video.push(formatObj);
|
|
154
|
+
else if (formatObj.hasVideo) videoOnly.push(formatObj);
|
|
155
|
+
else if (formatObj.hasAudio) audio.push(formatObj);
|
|
79
156
|
}
|
|
80
157
|
}
|
|
81
158
|
|
|
82
159
|
return {
|
|
83
160
|
title: details.title || '',
|
|
84
161
|
author: details.author || '',
|
|
162
|
+
subscribers: subscribers,
|
|
85
163
|
description: details.shortDescription || '',
|
|
86
164
|
views: details.viewCount || '0',
|
|
165
|
+
likes: likes,
|
|
166
|
+
comments: comments,
|
|
87
167
|
durationSeconds: details.lengthSeconds || '0',
|
|
88
168
|
thumbnail: details.thumbnail?.thumbnails?.[details.thumbnail.thumbnails.length - 1]?.url || '',
|
|
89
|
-
formats
|
|
169
|
+
formats: {
|
|
170
|
+
video,
|
|
171
|
+
videoOnly,
|
|
172
|
+
audio
|
|
173
|
+
}
|
|
90
174
|
};
|
|
91
175
|
}
|
|
92
176
|
}
|
package/straw/youtube.py
CHANGED
|
@@ -8,38 +8,133 @@ class YouTubeScraper:
|
|
|
8
8
|
self.client = StrawClient(**client_options)
|
|
9
9
|
|
|
10
10
|
async def scrape_video(self, url: str) -> Dict[str, Any]:
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
11
|
+
match = re.search(r'(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))([^"&?\/\s]{11})', url)
|
|
12
|
+
if not match:
|
|
13
|
+
raise Exception("Invalid YouTube URL")
|
|
14
|
+
video_id = match.group(1)
|
|
15
|
+
|
|
16
|
+
headers = {'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430', 'Accept-Language': 'en-US,en;q=0.9'}
|
|
14
17
|
html = await self.client.get_text(url, headers=headers)
|
|
18
|
+
|
|
19
|
+
visitor_data = ""
|
|
20
|
+
details = {}
|
|
21
|
+
initial_data = {}
|
|
22
|
+
|
|
23
|
+
player_match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)', html)
|
|
24
|
+
if player_match:
|
|
25
|
+
try:
|
|
26
|
+
data_html = json.loads(player_match.group(1))
|
|
27
|
+
details = data_html.get('videoDetails', {})
|
|
28
|
+
visitor_data = data_html.get('responseContext', {}).get('visitorData', '')
|
|
29
|
+
except:
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
data_match = re.search(r'var ytInitialData\s*=\s*({.*?});(?:<\/script>)', html)
|
|
33
|
+
if data_match:
|
|
34
|
+
try:
|
|
35
|
+
initial_data = json.loads(data_match.group(1))
|
|
36
|
+
except:
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
if not visitor_data:
|
|
40
|
+
vd_match = re.search(r'"visitorData"\s*:\s*"([^"]+)"', html)
|
|
41
|
+
if vd_match:
|
|
42
|
+
visitor_data = vd_match.group(1)
|
|
15
43
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
44
|
+
payload = {
|
|
45
|
+
"context": {
|
|
46
|
+
"client": {
|
|
47
|
+
"hl": "en",
|
|
48
|
+
"gl": "US",
|
|
49
|
+
"clientName": "IOS",
|
|
50
|
+
"clientVersion": "19.28.1",
|
|
51
|
+
"osName": "iOS",
|
|
52
|
+
"osVersion": "17.5.1",
|
|
53
|
+
"deviceMake": "Apple",
|
|
54
|
+
"deviceModel": "iPhone16,2",
|
|
55
|
+
"visitorData": visitor_data
|
|
56
|
+
}
|
|
57
|
+
},
|
|
58
|
+
"videoId": video_id
|
|
59
|
+
}
|
|
19
60
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
61
|
+
api_headers = {
|
|
62
|
+
'Accept': 'application/json',
|
|
63
|
+
'Content-Type': 'application/json',
|
|
64
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
response = await self.client.request('POST', 'https://www.youtube.com/youtubei/v1/player', json=payload, headers=api_headers)
|
|
68
|
+
api_data = response.json()
|
|
69
|
+
|
|
70
|
+
if not details.get('title'):
|
|
71
|
+
details = api_data.get('videoDetails', {})
|
|
72
|
+
|
|
73
|
+
streaming_data = api_data.get('streamingData', {})
|
|
23
74
|
|
|
24
75
|
if not details:
|
|
25
76
|
raise Exception("Video details not found inside player response.")
|
|
26
77
|
|
|
27
|
-
|
|
78
|
+
subscribers = ""
|
|
79
|
+
likes = ""
|
|
80
|
+
comments = ""
|
|
81
|
+
|
|
82
|
+
try:
|
|
83
|
+
contents = initial_data.get('contents', {}).get('twoColumnWatchNextResults', {}).get('results', {}).get('results', {}).get('contents', [])
|
|
84
|
+
for c in contents:
|
|
85
|
+
sec_info = c.get('videoSecondaryInfoRenderer')
|
|
86
|
+
if sec_info:
|
|
87
|
+
stext = sec_info.get('owner', {}).get('videoOwnerRenderer', {}).get('subscriberCountText', {}).get('simpleText')
|
|
88
|
+
if stext: subscribers = stext
|
|
89
|
+
|
|
90
|
+
panels = initial_data.get('engagementPanels', [])
|
|
91
|
+
for p in panels:
|
|
92
|
+
sr = p.get('engagementPanelSectionListRenderer', {})
|
|
93
|
+
if sr.get('targetId') == 'engagement-panel-structured-description':
|
|
94
|
+
items = sr.get('content', {}).get('structuredDescriptionContentRenderer', {}).get('items', [])
|
|
95
|
+
for i in items:
|
|
96
|
+
factoids = i.get('videoDescriptionHeaderRenderer', {}).get('factoid', [])
|
|
97
|
+
for f in factoids:
|
|
98
|
+
acc = f.get('factoidRenderer', {}).get('accessibilityText', '')
|
|
99
|
+
if 'like' in acc.lower():
|
|
100
|
+
likes = acc
|
|
101
|
+
|
|
102
|
+
if sr.get('panelIdentifier') == 'engagement-panel-comments-section':
|
|
103
|
+
runs = sr.get('header', {}).get('engagementPanelTitleHeaderRenderer', {}).get('contextualInfo', {}).get('runs', [])
|
|
104
|
+
if runs:
|
|
105
|
+
comments = runs[0].get('text', '')
|
|
106
|
+
except:
|
|
107
|
+
pass
|
|
108
|
+
|
|
109
|
+
video_combined = []
|
|
110
|
+
video_only = []
|
|
111
|
+
audio_only = []
|
|
112
|
+
|
|
28
113
|
raw_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
|
|
29
114
|
|
|
30
115
|
for f in raw_formats:
|
|
31
116
|
if 'url' in f:
|
|
32
117
|
mime_type = f.get('mimeType', '')
|
|
33
|
-
|
|
118
|
+
has_audio = 'audio/' in mime_type
|
|
119
|
+
has_video = 'video/' in mime_type
|
|
120
|
+
|
|
121
|
+
f_obj = {
|
|
34
122
|
'url': f['url'],
|
|
35
123
|
'mimeType': mime_type,
|
|
36
124
|
'width': f.get('width'),
|
|
37
125
|
'height': f.get('height'),
|
|
38
126
|
'quality': f.get('qualityLabel') or f.get('quality'),
|
|
39
127
|
'bitrate': f.get('bitrate'),
|
|
40
|
-
'hasAudio':
|
|
41
|
-
'hasVideo':
|
|
42
|
-
}
|
|
128
|
+
'hasAudio': has_audio,
|
|
129
|
+
'hasVideo': has_video
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
if has_video and has_audio:
|
|
133
|
+
video_combined.append(f_obj)
|
|
134
|
+
elif has_video:
|
|
135
|
+
video_only.append(f_obj)
|
|
136
|
+
elif has_audio:
|
|
137
|
+
audio_only.append(f_obj)
|
|
43
138
|
|
|
44
139
|
thumbnails = details.get('thumbnail', {}).get('thumbnails', [])
|
|
45
140
|
best_thumbnail = thumbnails[-1]['url'] if thumbnails else ''
|
|
@@ -47,9 +142,16 @@ class YouTubeScraper:
|
|
|
47
142
|
return {
|
|
48
143
|
'title': details.get('title', ''),
|
|
49
144
|
'author': details.get('author', ''),
|
|
145
|
+
'subscribers': subscribers,
|
|
50
146
|
'description': details.get('shortDescription', ''),
|
|
51
147
|
'views': details.get('viewCount', '0'),
|
|
148
|
+
'likes': likes,
|
|
149
|
+
'comments': comments,
|
|
52
150
|
'durationSeconds': details.get('lengthSeconds', '0'),
|
|
53
151
|
'thumbnail': best_thumbnail,
|
|
54
|
-
'formats':
|
|
152
|
+
'formats': {
|
|
153
|
+
'video': video_combined,
|
|
154
|
+
'videoOnly': video_only,
|
|
155
|
+
'audio': audio_only
|
|
156
|
+
}
|
|
55
157
|
}
|
package/test_aqz.js
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
const undici = require('undici');
|
|
2
|
+
const fs = require('fs');
|
|
3
|
+
|
|
4
|
+
async function testNextApi() {
|
|
5
|
+
const videoId = 'aqz-KE-bpKQ';
|
|
6
|
+
const url = `https://www.youtube.com/watch?v=${videoId}`;
|
|
7
|
+
|
|
8
|
+
const htmlRes = await undici.request(url, {
|
|
9
|
+
method: 'GET',
|
|
10
|
+
headers: {
|
|
11
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/115.0.0.0 Safari/537.36',
|
|
12
|
+
'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
|
|
13
|
+
}
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
const html = await htmlRes.body.text();
|
|
17
|
+
let visitorData = '';
|
|
18
|
+
const vdMatch = html.match(/"visitorData"\s*:\s*"([^"]+)"/);
|
|
19
|
+
if (vdMatch) visitorData = vdMatch[1];
|
|
20
|
+
|
|
21
|
+
const payload = {
|
|
22
|
+
context: {
|
|
23
|
+
client: {
|
|
24
|
+
hl: 'en',
|
|
25
|
+
gl: 'US',
|
|
26
|
+
clientName: 'IOS',
|
|
27
|
+
clientVersion: '19.28.1',
|
|
28
|
+
osName: 'iOS',
|
|
29
|
+
osVersion: '17.5.1',
|
|
30
|
+
deviceMake: 'Apple',
|
|
31
|
+
deviceModel: 'iPhone16,2',
|
|
32
|
+
visitorData: visitorData
|
|
33
|
+
}
|
|
34
|
+
},
|
|
35
|
+
videoId: videoId
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
const res = await undici.request('https://www.youtube.com/youtubei/v1/player', {
|
|
39
|
+
method: 'POST',
|
|
40
|
+
headers: {
|
|
41
|
+
'Accept': 'application/json',
|
|
42
|
+
'Content-Type': 'application/json',
|
|
43
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
44
|
+
},
|
|
45
|
+
body: JSON.stringify(payload)
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
const apiData = await res.body.json();
|
|
49
|
+
fs.writeFileSync('player_api_dump.json', JSON.stringify(apiData, null, 2));
|
|
50
|
+
console.log('Saved dump to player_api_dump.json, Playable:', apiData?.playabilityStatus?.status);
|
|
51
|
+
console.log('Formats:', apiData?.streamingData?.formats?.length || 0, 'Adaptive', apiData?.streamingData?.adaptiveFormats?.length || 0);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
testNextApi().catch(console.error);
|
package/test_extract.js
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
|
|
3
|
+
const data = JSON.parse(fs.readFileSync('ytInitialData_dump.json', 'utf8'));
|
|
4
|
+
|
|
5
|
+
let subscribers = '';
|
|
6
|
+
let likes = '';
|
|
7
|
+
let comments = '';
|
|
8
|
+
|
|
9
|
+
try {
|
|
10
|
+
const videoPrimaryInfo = data?.contents?.twoColumnWatchNextResults?.results?.results?.contents?.find(c => c.videoPrimaryInfoRenderer)?.videoPrimaryInfoRenderer;
|
|
11
|
+
const videoSecondaryInfo = data?.contents?.twoColumnWatchNextResults?.results?.results?.contents?.find(c => c.videoSecondaryInfoRenderer)?.videoSecondaryInfoRenderer;
|
|
12
|
+
|
|
13
|
+
// Subscribers
|
|
14
|
+
if (videoSecondaryInfo?.owner?.videoOwnerRenderer?.subscriberCountText?.simpleText) {
|
|
15
|
+
subscribers = videoSecondaryInfo.owner.videoOwnerRenderer.subscriberCountText.simpleText;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
// Try to get likes from factoids (modern UI)
|
|
19
|
+
const factoids = data?.engagementPanels?.find(p => p.engagementPanelSectionListRenderer?.targetId === 'engagement-panel-structured-description')
|
|
20
|
+
?.engagementPanelSectionListRenderer?.content?.structuredDescriptionContentRenderer?.items?.find(i => i.videoDescriptionHeaderRenderer)?.videoDescriptionHeaderRenderer?.factoid || [];
|
|
21
|
+
|
|
22
|
+
const likesFactoid = factoids.find(f => f.factoidRenderer?.accessibilityText?.toLowerCase().includes('like'));
|
|
23
|
+
if (likesFactoid) {
|
|
24
|
+
likes = likesFactoid.factoidRenderer.accessibilityText;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Try to get comments
|
|
28
|
+
// Usually comments count is harder to find in initialData without scrolling, but let's check engagement panels
|
|
29
|
+
const commentsPanel = data?.engagementPanels?.find(p => p.engagementPanelSectionListRenderer?.panelIdentifier === 'engagement-panel-comments-section');
|
|
30
|
+
if (commentsPanel) {
|
|
31
|
+
comments = commentsPanel.engagementPanelSectionListRenderer.header.engagementPanelTitleHeaderRenderer.contextualInfo?.runs?.[0]?.text || '';
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
} catch (e) {
|
|
36
|
+
console.error(e);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
console.log('Subscribers:', subscribers);
|
|
40
|
+
console.log('Likes:', likes);
|
|
41
|
+
console.log('Comments:', comments);
|
package/test_metadata.js
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
const undici = require('undici');
|
|
3
|
+
|
|
4
|
+
async function extractMetadata() {
|
|
5
|
+
const videoId = '_4j1Abt_AiM';
|
|
6
|
+
const url = `https://www.youtube.com/watch?v=${videoId}`;
|
|
7
|
+
|
|
8
|
+
const htmlRes = await undici.request(url, {
|
|
9
|
+
method: 'GET',
|
|
10
|
+
headers: {
|
|
11
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/115.0.0.0 Safari/537.36',
|
|
12
|
+
'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430',
|
|
13
|
+
'Accept-Language': 'en-US,en;q=0.9'
|
|
14
|
+
}
|
|
15
|
+
});
|
|
16
|
+
|
|
17
|
+
const html = await htmlRes.body.text();
|
|
18
|
+
const match = html.match(/var ytInitialData\s*=\s*({.*?});(?:<\/script>)/);
|
|
19
|
+
|
|
20
|
+
if (match) {
|
|
21
|
+
const data = JSON.parse(match[1]);
|
|
22
|
+
fs.writeFileSync('ytInitialData_dump.json', JSON.stringify(data, null, 2));
|
|
23
|
+
console.log('Saved ytInitialData to ytInitialData_dump.json');
|
|
24
|
+
|
|
25
|
+
// Attempt to drill down to find metadata.
|
|
26
|
+
// Subscribers usually in secondaryResults or owner item
|
|
27
|
+
|
|
28
|
+
} else {
|
|
29
|
+
console.log('ytInitialData not found!');
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
extractMetadata().catch(console.error);
|
package/test_next.js
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
const undici = require('undici');
|
|
2
|
+
const fs = require('fs');
|
|
3
|
+
|
|
4
|
+
async function testNextApi() {
|
|
5
|
+
const videoId = '_4j1Abt_AiM';
|
|
6
|
+
const url = `https://www.youtube.com/watch?v=${videoId}`;
|
|
7
|
+
|
|
8
|
+
const htmlRes = await undici.request(url, {
|
|
9
|
+
method: 'GET',
|
|
10
|
+
headers: {
|
|
11
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/115.0.0.0 Safari/537.36',
|
|
12
|
+
'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
|
|
13
|
+
}
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
const html = await htmlRes.body.text();
|
|
17
|
+
let visitorData = '';
|
|
18
|
+
const vdMatch = html.match(/"visitorData"\s*:\s*"([^"]+)"/);
|
|
19
|
+
if (vdMatch) visitorData = vdMatch[1];
|
|
20
|
+
|
|
21
|
+
const payload = {
|
|
22
|
+
context: {
|
|
23
|
+
client: {
|
|
24
|
+
hl: 'en',
|
|
25
|
+
gl: 'US',
|
|
26
|
+
clientName: 'IOS',
|
|
27
|
+
clientVersion: '19.28.1',
|
|
28
|
+
osName: 'iOS',
|
|
29
|
+
osVersion: '17.5.1',
|
|
30
|
+
deviceMake: 'Apple',
|
|
31
|
+
deviceModel: 'iPhone16,2',
|
|
32
|
+
visitorData: visitorData
|
|
33
|
+
}
|
|
34
|
+
},
|
|
35
|
+
videoId: videoId
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
const res = await undici.request('https://www.youtube.com/youtubei/v1/next', {
|
|
39
|
+
method: 'POST',
|
|
40
|
+
headers: {
|
|
41
|
+
'Accept': 'application/json',
|
|
42
|
+
'Content-Type': 'application/json',
|
|
43
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
44
|
+
},
|
|
45
|
+
body: JSON.stringify(payload)
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
const apiData = await res.body.json();
|
|
49
|
+
fs.writeFileSync('next_api_dump.json', JSON.stringify(apiData, null, 2));
|
|
50
|
+
console.log('Saved dump to next_api_dump.json');
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
testNextApi().catch(console.error);
|
package/tests/test.py
CHANGED
|
@@ -24,9 +24,13 @@ async def run_tests():
|
|
|
24
24
|
print("2. Testing YouTube Scraper...")
|
|
25
25
|
yt = YouTubeScraper()
|
|
26
26
|
yt_res = await yt.scrape_video("https://www.youtube.com/watch?v=aqz-KE-bpKQ")
|
|
27
|
-
print(f"YouTube Scraper Output: Title = {yt_res
|
|
28
|
-
print(f"YouTube Scraper Output:
|
|
29
|
-
print(f"YouTube Scraper Output:
|
|
27
|
+
print(f"YouTube Scraper Output: Title = {yt_res.get('title')}")
|
|
28
|
+
print(f"YouTube Scraper Output: Subscribers = {yt_res.get('subscribers')}")
|
|
29
|
+
print(f"YouTube Scraper Output: Likes = {yt_res.get('likes')}")
|
|
30
|
+
print(f"YouTube Scraper Output: Comments = {yt_res.get('comments')}")
|
|
31
|
+
print(f"YouTube Scraper Output: Duration = {yt_res.get('durationSeconds')} seconds")
|
|
32
|
+
formats = yt_res.get('formats', {})
|
|
33
|
+
print(f"YouTube Scraper Output: Found {len(formats.get('video', []))} video, {len(formats.get('videoOnly', []))} video-only, and {len(formats.get('audio', []))} audio formats")
|
|
30
34
|
await yt.client.close()
|
|
31
35
|
|
|
32
36
|
print("\n" + "-" * 33)
|
package/tests/test.ts
CHANGED
|
@@ -19,9 +19,12 @@ async function runTests() {
|
|
|
19
19
|
const ytClient = straw.youtube();
|
|
20
20
|
// Use a generic test video like Big Buck Bunny
|
|
21
21
|
const ytResult = await ytClient.scrapeVideo('https://www.youtube.com/watch?v=aqz-KE-bpKQ');
|
|
22
|
-
console.log(
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
console.log('YouTube Scraper Output: Title =', ytResult.title);
|
|
23
|
+
console.log('YouTube Scraper Output: Subscribers =', ytResult.subscribers);
|
|
24
|
+
console.log('YouTube Scraper Output: Likes =', ytResult.likes);
|
|
25
|
+
console.log('YouTube Scraper Output: Comments =', ytResult.comments);
|
|
26
|
+
console.log('YouTube Scraper Output: Duration =', ytResult.durationSeconds, 'seconds');
|
|
27
|
+
console.log(`YouTube Scraper Output: Found ${ytResult.formats.video.length} video (combined), ${ytResult.formats.videoOnly.length} video-only, and ${ytResult.formats.audio.length} audio formats.`);
|
|
25
28
|
|
|
26
29
|
console.log('\n---------------------------------');
|
|
27
30
|
|