@zetagoaurum-dev/straw 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +3 -2
- package/dist/index.js +46 -8
- package/dist/index.mjs +46 -8
- package/docs/api_reference.md +42 -0
- package/docs/getting_started.md +42 -0
- package/package.json +6 -2
- package/release.bat +4 -0
- package/src/scrapers/youtube.ts +50 -9
- package/straw/media.py +1 -1
- package/straw/youtube.py +49 -9
- package/test_api.js +42 -0
- package/test_api_clients.js +39 -0
- package/test_client.js +37 -0
- package/test_embed.js +26 -0
- package/test_html.js +26 -0
- package/test_visitor.js +56 -0
- package/test_vr.js +27 -0
- package/test_yt.js +17 -0
- package/straw/__pycache__/__init__.cpython-311.pyc +0 -0
- package/straw/__pycache__/client.cpython-311.pyc +0 -0
- package/straw/__pycache__/helpers.cpython-311.pyc +0 -0
- package/straw/__pycache__/media.cpython-311.pyc +0 -0
- package/straw/__pycache__/web.cpython-311.pyc +0 -0
- package/straw/__pycache__/youtube.cpython-311.pyc +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [1.1.1] - "Performance Patch" - 2026-02-27
|
|
6
|
+
- **Perf:** Re-engineered the YouTube scraper in Node.js and Python to call the InnerTube API directly as the `IOS` client, injecting the `visitorData` token taken from the watch page to bypass bot checks and cipher encryption. Video format lists are returned immediately, ready for downstream download tooling.
|
|
7
|
+
- **Fix:** Fixed the HTML parser being blocked on high-volume deployed servers by upgrading to the direct `POST /youtubei/v1/player` endpoint.
|
|
8
|
+
|
|
9
|
+
## [1.1.0] - "Milk Tea" Release - 2026-02-27
|
|
10
|
+
|
|
11
|
+
### Changed
|
|
12
|
+
- Fixed Python `media.py` RegExp syntax causing import failures.
|
|
13
|
+
- Updated README.md with functional badges and version codename.
|
|
14
|
+
- Linked package.json to the correct Git metadata and License.
|
|
15
|
+
- Added comprehensive structured documentation inside `/docs` folder.
|
|
16
|
+
|
|
5
17
|
## [1.0.0] - 2026-02-27
|
|
6
18
|
|
|
7
19
|
### Added
|
package/README.md
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
<div align="center">
|
|
2
2
|
<img src="https://raw.githubusercontent.com/ZetaGo-Aurum/straw/main/assets/logo.png" alt="Straw Logo" width="200" height="200" />
|
|
3
3
|
<h1>🚀 Straw - The Enterprise-Grade Scraper</h1>
|
|
4
|
+
<p><strong>Version: 1.1.0 (Codename: Milk Tea)</strong></p>
|
|
4
5
|
<p><strong>A blazingly fast, multi-platform, unified JS/TS and Python scraping library for Web, YouTube, and Media (Images, Audio, Video, Documents).</strong></p>
|
|
5
6
|
|
|
6
7
|
[npm package](https://npmjs.org/package/@zetagoaurum-dev/straw)
|
|
7
|
-
[](https://github.com/ZetaGo-Aurum/straw/blob/main/LICENSE)
|
|
9
|
+
[]()
|
|
9
10
|
</div>
|
|
10
11
|
|
|
11
12
|
---
|
package/dist/index.js
CHANGED
|
@@ -185,19 +185,57 @@ var YouTubeScraper = class {
|
|
|
185
185
|
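* Resolves the video ID from the URL, reads videoDetails and visitorData from the ytInitialPlayerResponse object embedded in the watch HTML, then requests streaming data from the InnerTube player endpoint.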
|
|
186
186
|
*/
|
|
187
187
|
async scrapeVideo(url) {
|
|
188
|
+
const videoIdMatch = url.match(/(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))([^"&?\/\s]{11})/);
|
|
189
|
+
if (!videoIdMatch || !videoIdMatch[1]) {
|
|
190
|
+
throw new Error("Invalid YouTube URL");
|
|
191
|
+
}
|
|
192
|
+
const videoId = videoIdMatch[1];
|
|
188
193
|
const html = await this.client.getText(url, {
|
|
189
|
-
headers: {
|
|
190
|
-
"Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430"
|
|
191
|
-
}
|
|
194
|
+
headers: { "Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430" }
|
|
192
195
|
});
|
|
193
196
|
const regex = /ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/;
|
|
194
197
|
const match = html.match(regex);
|
|
195
|
-
|
|
196
|
-
|
|
198
|
+
let visitorData = "";
|
|
199
|
+
let details = {};
|
|
200
|
+
if (match && match[1]) {
|
|
201
|
+
const data = JSON.parse(match[1]);
|
|
202
|
+
details = data?.videoDetails || {};
|
|
203
|
+
visitorData = data?.responseContext?.visitorData || "";
|
|
204
|
+
}
|
|
205
|
+
if (!visitorData) {
|
|
206
|
+
const vdMatch = html.match(/"visitorData"\s*:\s*"([^"]+)"/);
|
|
207
|
+
if (vdMatch) visitorData = vdMatch[1];
|
|
208
|
+
}
|
|
209
|
+
const payload = {
|
|
210
|
+
context: {
|
|
211
|
+
client: {
|
|
212
|
+
hl: "en",
|
|
213
|
+
gl: "US",
|
|
214
|
+
clientName: "IOS",
|
|
215
|
+
clientVersion: "19.28.1",
|
|
216
|
+
osName: "iOS",
|
|
217
|
+
osVersion: "17.5.1",
|
|
218
|
+
deviceMake: "Apple",
|
|
219
|
+
deviceModel: "iPhone16,2",
|
|
220
|
+
visitorData
|
|
221
|
+
}
|
|
222
|
+
},
|
|
223
|
+
videoId
|
|
224
|
+
};
|
|
225
|
+
const res = await this.client.request("https://www.youtube.com/youtubei/v1/player", {
|
|
226
|
+
method: "POST",
|
|
227
|
+
headers: {
|
|
228
|
+
"Accept": "application/json",
|
|
229
|
+
"Content-Type": "application/json",
|
|
230
|
+
"User-Agent": "com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)"
|
|
231
|
+
},
|
|
232
|
+
body: JSON.stringify(payload)
|
|
233
|
+
});
|
|
234
|
+
const apiData = await res.json();
|
|
235
|
+
if (!details.title) {
|
|
236
|
+
details = apiData?.videoDetails || {};
|
|
197
237
|
}
|
|
198
|
-
const
|
|
199
|
-
const details = data?.videoDetails;
|
|
200
|
-
const streamingData = data?.streamingData;
|
|
238
|
+
const streamingData = apiData?.streamingData;
|
|
201
239
|
if (!details) {
|
|
202
240
|
throw new Error("Video details not found inside player response.");
|
|
203
241
|
}
|
package/dist/index.mjs
CHANGED
|
@@ -145,19 +145,57 @@ var YouTubeScraper = class {
|
|
|
145
145
|
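* Resolves the video ID from the URL, reads videoDetails and visitorData from the ytInitialPlayerResponse object embedded in the watch HTML, then requests streaming data from the InnerTube player endpoint.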
|
|
146
146
|
*/
|
|
147
147
|
async scrapeVideo(url) {
|
|
148
|
+
const videoIdMatch = url.match(/(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))([^"&?\/\s]{11})/);
|
|
149
|
+
if (!videoIdMatch || !videoIdMatch[1]) {
|
|
150
|
+
throw new Error("Invalid YouTube URL");
|
|
151
|
+
}
|
|
152
|
+
const videoId = videoIdMatch[1];
|
|
148
153
|
const html = await this.client.getText(url, {
|
|
149
|
-
headers: {
|
|
150
|
-
"Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430"
|
|
151
|
-
}
|
|
154
|
+
headers: { "Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430" }
|
|
152
155
|
});
|
|
153
156
|
const regex = /ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/;
|
|
154
157
|
const match = html.match(regex);
|
|
155
|
-
|
|
156
|
-
|
|
158
|
+
let visitorData = "";
|
|
159
|
+
let details = {};
|
|
160
|
+
if (match && match[1]) {
|
|
161
|
+
const data = JSON.parse(match[1]);
|
|
162
|
+
details = data?.videoDetails || {};
|
|
163
|
+
visitorData = data?.responseContext?.visitorData || "";
|
|
164
|
+
}
|
|
165
|
+
if (!visitorData) {
|
|
166
|
+
const vdMatch = html.match(/"visitorData"\s*:\s*"([^"]+)"/);
|
|
167
|
+
if (vdMatch) visitorData = vdMatch[1];
|
|
168
|
+
}
|
|
169
|
+
const payload = {
|
|
170
|
+
context: {
|
|
171
|
+
client: {
|
|
172
|
+
hl: "en",
|
|
173
|
+
gl: "US",
|
|
174
|
+
clientName: "IOS",
|
|
175
|
+
clientVersion: "19.28.1",
|
|
176
|
+
osName: "iOS",
|
|
177
|
+
osVersion: "17.5.1",
|
|
178
|
+
deviceMake: "Apple",
|
|
179
|
+
deviceModel: "iPhone16,2",
|
|
180
|
+
visitorData
|
|
181
|
+
}
|
|
182
|
+
},
|
|
183
|
+
videoId
|
|
184
|
+
};
|
|
185
|
+
const res = await this.client.request("https://www.youtube.com/youtubei/v1/player", {
|
|
186
|
+
method: "POST",
|
|
187
|
+
headers: {
|
|
188
|
+
"Accept": "application/json",
|
|
189
|
+
"Content-Type": "application/json",
|
|
190
|
+
"User-Agent": "com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)"
|
|
191
|
+
},
|
|
192
|
+
body: JSON.stringify(payload)
|
|
193
|
+
});
|
|
194
|
+
const apiData = await res.json();
|
|
195
|
+
if (!details.title) {
|
|
196
|
+
details = apiData?.videoDetails || {};
|
|
157
197
|
}
|
|
158
|
-
const
|
|
159
|
-
const details = data?.videoDetails;
|
|
160
|
-
const streamingData = data?.streamingData;
|
|
198
|
+
const streamingData = apiData?.streamingData;
|
|
161
199
|
if (!details) {
|
|
162
200
|
throw new Error("Video details not found inside player response.");
|
|
163
201
|
}
|
package/docs/api_reference.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# API Reference
|
|
2
|
+
|
|
3
|
+
This module exports the exact same interfaces across both JS and Python.
|
|
4
|
+
|
|
5
|
+
## `WebScraper`
|
|
6
|
+
Extracts high-level semantics from any standard webpage.
|
|
7
|
+
|
|
8
|
+
- `scrape(url: string)`: Returns the following schema:
|
|
9
|
+
- `title`: The `<title>` of the page.
|
|
10
|
+
- `description`: The meta-description or OG-description.
|
|
11
|
+
- `text`: All text content of the `<body>` element, separated by spaces (useful for LLM RAG pipelines).
|
|
12
|
+
- `links`: Array of dictionaries containing `href` and `text` for every `<a>` tag.
|
|
13
|
+
- `meta`: Key-value pair of all `<meta>` tags present on the page.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## `YouTubeScraper`
|
|
18
|
+
Extracts video metadata and media formats from the YouTube Player Response JSON, without relying on rate-limit-heavy JS scrapers like `ytdl-core`.
|
|
19
|
+
|
|
20
|
+
- `scrapeVideo(url: string)` / `scrape_video(url: str)`: Returns:
|
|
21
|
+
- `title`, `author`, `description`, `views`, `durationSeconds`, `thumbnail`.
|
|
22
|
+
- `formats`: An array of media formats containing `url`, `mimeType`, `quality`, `hasAudio`, and `hasVideo`. You can directly stream from these URLs or pass them to `ffmpeg`.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## `MediaScraper`
|
|
27
|
+
Extracts raw media files embedded in web pages. Identifies raw paths from `<video>`, `<img>`, and HTML `<source>` tags, as well as through general deep URL sniffing.
|
|
28
|
+
- Extracted Extensions: `mp4, mp3, pdf, docx, png, jpg, webm, wav, ogg` and more.
|
|
29
|
+
|
|
30
|
+
- `extractMedia(url: string)` / `extract_media(url: str)`: Returns:
|
|
31
|
+
- `pageTitle`: Title of the scraped page.
|
|
32
|
+
- `mediaLinks`: Array of absolute HTTP/HTTPS strings directly leading to files.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## `StrawClient`
|
|
37
|
+
The core engine. If you want to build custom scrapers, instantiate the base client!
|
|
38
|
+
- **Options / Config**:
|
|
39
|
+
- `timeout`: Request timeout in milliseconds (JS) or seconds (Py). Default `10000` / `10`.
|
|
40
|
+
- `retries`: Number of exponential backoff retry attempts. Default `3`.
|
|
41
|
+
- `rotateUserAgent` / `rotate_user_agent`: `true` by default.
|
|
42
|
+
- `proxy`: An optional HTTP/HTTPS proxy string.
|
package/docs/getting_started.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Getting Started with Straw
|
|
2
|
+
|
|
3
|
+
Straw perfectly unifies JavaScript/TypeScript and Python by providing exactly the same class patterns across both languages.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
### Node.js Setup
|
|
8
|
+
Install the core scraper using npm:
|
|
9
|
+
```bash
|
|
10
|
+
npm install @zetagoaurum-dev/straw
|
|
11
|
+
```
|
|
12
|
+
Straw relies on `undici` and `cheerio` under the hood. For TypeScript projects, types are included right out of the box!
|
|
13
|
+
|
|
14
|
+
### Python Setup
|
|
15
|
+
Currently, `straw-py` is intended to be cloned or included directly alongside your code, though you can bundle it as a module easily. Ensure these dependencies are installed:
|
|
16
|
+
```bash
|
|
17
|
+
pip install httpx beautifulsoup4 lxml
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Basic Scraping
|
|
21
|
+
Both versions initialize scraper modules out of the box. The base scraper client (`StrawClient`) comes configured with anti-blocking headers and User-Agent rotation. You don't need to write custom rotation logic!
|
|
22
|
+
|
|
23
|
+
**TypeScript Example**:
|
|
24
|
+
```ts
|
|
25
|
+
import straw from '@zetagoaurum-dev/straw';
|
|
26
|
+
|
|
27
|
+
const web = straw.web();
|
|
28
|
+
const dataset = await web.scrape('https://wikipedia.org');
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
**Python Example**:
|
|
32
|
+
```py
|
|
33
|
+
import asyncio
|
|
34
|
+
from straw import WebScraper
|
|
35
|
+
|
|
36
|
+
async def run():
|
|
37
|
+
web = WebScraper()
|
|
38
|
+
dataset = await web.scrape('https://wikipedia.org')
|
|
39
|
+
await web.client.close()
|
|
40
|
+
|
|
41
|
+
asyncio.run(run())
|
|
42
|
+
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zetagoaurum-dev/straw",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.1.1",
|
|
4
4
|
"description": "Enterprise-grade unified JS/TS and Python scraping library for Web, YouTube, and Media (Images, Audio, Video, Documents)",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|
|
@@ -25,7 +25,11 @@
|
|
|
25
25
|
"anti-cors"
|
|
26
26
|
],
|
|
27
27
|
"author": "ZetaGo-Aurum",
|
|
28
|
-
"license": "
|
|
28
|
+
"license": "MIT",
|
|
29
|
+
"repository": {
|
|
30
|
+
"type": "git",
|
|
31
|
+
"url": "https://github.com/ZetaGo-Aurum/straw.git"
|
|
32
|
+
},
|
|
29
33
|
"devDependencies": {
|
|
30
34
|
"@types/node": "^25.3.2",
|
|
31
35
|
"ts-node": "^10.9.2",
|
package/release.bat
ADDED
package/src/scrapers/youtube.ts
CHANGED
|
@@ -33,23 +33,64 @@ export class YouTubeScraper {
|
|
|
33
33
|
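* Resolves the video ID from the URL, reads videoDetails and visitorData from the ytInitialPlayerResponse object embedded in the watch HTML, then requests streaming data from the InnerTube player endpoint.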
|
|
34
34
|
*/
|
|
35
35
|
public async scrapeVideo(url: string): Promise<YouTubeResult> {
|
|
36
|
+
const videoIdMatch = url.match(/(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))([^"&?\/\s]{11})/);
|
|
37
|
+
if (!videoIdMatch || !videoIdMatch[1]) {
|
|
38
|
+
throw new Error('Invalid YouTube URL');
|
|
39
|
+
}
|
|
40
|
+
const videoId = videoIdMatch[1];
|
|
41
|
+
|
|
36
42
|
const html = await this.client.getText(url, {
|
|
37
|
-
headers: {
|
|
38
|
-
'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
|
|
39
|
-
}
|
|
43
|
+
headers: { 'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430' }
|
|
40
44
|
});
|
|
41
45
|
|
|
42
|
-
// Find ytInitialPlayerResponse JSON fragment in the HTML
|
|
43
46
|
const regex = /ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/;
|
|
44
47
|
const match = html.match(regex);
|
|
48
|
+
let visitorData = '';
|
|
49
|
+
let details: any = {};
|
|
50
|
+
|
|
51
|
+
if (match && match[1]) {
|
|
52
|
+
const data = JSON.parse(match[1]);
|
|
53
|
+
details = data?.videoDetails || {};
|
|
54
|
+
visitorData = data?.responseContext?.visitorData || '';
|
|
55
|
+
}
|
|
45
56
|
|
|
46
|
-
if (!
|
|
47
|
-
|
|
57
|
+
if (!visitorData) {
|
|
58
|
+
const vdMatch = html.match(/"visitorData"\s*:\s*"([^"]+)"/);
|
|
59
|
+
if (vdMatch) visitorData = vdMatch[1];
|
|
48
60
|
}
|
|
49
61
|
|
|
50
|
-
const
|
|
51
|
-
|
|
52
|
-
|
|
62
|
+
const payload = {
|
|
63
|
+
context: {
|
|
64
|
+
client: {
|
|
65
|
+
hl: 'en',
|
|
66
|
+
gl: 'US',
|
|
67
|
+
clientName: 'IOS',
|
|
68
|
+
clientVersion: '19.28.1',
|
|
69
|
+
osName: 'iOS',
|
|
70
|
+
osVersion: '17.5.1',
|
|
71
|
+
deviceMake: 'Apple',
|
|
72
|
+
deviceModel: 'iPhone16,2',
|
|
73
|
+
visitorData: visitorData
|
|
74
|
+
}
|
|
75
|
+
},
|
|
76
|
+
videoId: videoId
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
const res = await this.client.request('https://www.youtube.com/youtubei/v1/player', {
|
|
80
|
+
method: 'POST',
|
|
81
|
+
headers: {
|
|
82
|
+
'Accept': 'application/json',
|
|
83
|
+
'Content-Type': 'application/json',
|
|
84
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
85
|
+
},
|
|
86
|
+
body: JSON.stringify(payload)
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
const apiData = await res.json() as any;
|
|
90
|
+
if (!details.title) {
|
|
91
|
+
details = apiData?.videoDetails || {};
|
|
92
|
+
}
|
|
93
|
+
const streamingData = apiData?.streamingData;
|
|
53
94
|
|
|
54
95
|
if (!details) {
|
|
55
96
|
throw new Error('Video details not found inside player response.');
|
package/straw/media.py
CHANGED
|
@@ -17,7 +17,7 @@ class MediaScraper:
|
|
|
17
17
|
for tag in soup.find_all(['video', 'audio', 'source', 'img']):
|
|
18
18
|
src = tag.get('src') or tag.get('srcset')
|
|
19
19
|
if src:
|
|
20
|
-
urls = re.findall(r'https?:\/\/[^\s"',]+', src)
|
|
20
|
+
urls = re.findall(r'''https?:\/\/[^\s"',]+''', src)
|
|
21
21
|
for u in urls:
|
|
22
22
|
media_links.add(u)
|
|
23
23
|
if src.startswith('http') and src not in media_links:
|
package/straw/youtube.py
CHANGED
|
@@ -8,18 +8,58 @@ class YouTubeScraper:
|
|
|
8
8
|
self.client = StrawClient(**client_options)
|
|
9
9
|
|
|
10
10
|
async def scrape_video(self, url: str) -> Dict[str, Any]:
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
11
|
+
match = re.search(r'(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?v=|watch\?.+&v=))([^"&?\/\s]{11})', url)
|
|
12
|
+
if not match:
|
|
13
|
+
raise Exception("Invalid YouTube URL")
|
|
14
|
+
video_id = match.group(1)
|
|
15
|
+
|
|
16
|
+
headers = {'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'}
|
|
14
17
|
html = await self.client.get_text(url, headers=headers)
|
|
18
|
+
|
|
19
|
+
visitor_data = ""
|
|
20
|
+
details = {}
|
|
21
|
+
|
|
22
|
+
player_match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)', html)
|
|
23
|
+
if player_match:
|
|
24
|
+
data_html = json.loads(player_match.group(1))
|
|
25
|
+
details = data_html.get('videoDetails', {})
|
|
26
|
+
visitor_data = data_html.get('responseContext', {}).get('visitorData', '')
|
|
27
|
+
|
|
28
|
+
if not visitor_data:
|
|
29
|
+
vd_match = re.search(r'"visitorData"\s*:\s*"([^"]+)"', html)
|
|
30
|
+
if vd_match:
|
|
31
|
+
visitor_data = vd_match.group(1)
|
|
15
32
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
33
|
+
payload = {
|
|
34
|
+
"context": {
|
|
35
|
+
"client": {
|
|
36
|
+
"hl": "en",
|
|
37
|
+
"gl": "US",
|
|
38
|
+
"clientName": "IOS",
|
|
39
|
+
"clientVersion": "19.28.1",
|
|
40
|
+
"osName": "iOS",
|
|
41
|
+
"osVersion": "17.5.1",
|
|
42
|
+
"deviceMake": "Apple",
|
|
43
|
+
"deviceModel": "iPhone16,2",
|
|
44
|
+
"visitorData": visitor_data
|
|
45
|
+
}
|
|
46
|
+
},
|
|
47
|
+
"videoId": video_id
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
api_headers = {
|
|
51
|
+
'Accept': 'application/json',
|
|
52
|
+
'Content-Type': 'application/json',
|
|
53
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
54
|
+
}
|
|
19
55
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
56
|
+
response = await self.client.request('POST', 'https://www.youtube.com/youtubei/v1/player', json=payload, headers=api_headers)
|
|
57
|
+
api_data = response.json()
|
|
58
|
+
|
|
59
|
+
if not details.get('title'):
|
|
60
|
+
details = api_data.get('videoDetails', {})
|
|
61
|
+
|
|
62
|
+
streaming_data = api_data.get('streamingData', {})
|
|
23
63
|
|
|
24
64
|
if not details:
|
|
25
65
|
raise Exception("Video details not found inside player response.")
|
package/test_api.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
const undici = require('undici');
|
|
2
|
+
|
|
3
|
+
async function testInnerTube() {
|
|
4
|
+
const videoId = '_4j1Abt_AiM';
|
|
5
|
+
|
|
6
|
+
const payload = {
|
|
7
|
+
context: {
|
|
8
|
+
client: {
|
|
9
|
+
hl: 'en',
|
|
10
|
+
gl: 'US',
|
|
11
|
+
clientName: 'IOS',
|
|
12
|
+
clientVersion: '19.28.1',
|
|
13
|
+
osName: 'iOS',
|
|
14
|
+
osVersion: '17.5.1',
|
|
15
|
+
deviceMake: 'Apple',
|
|
16
|
+
deviceModel: 'iPhone16,2'
|
|
17
|
+
}
|
|
18
|
+
},
|
|
19
|
+
videoId: videoId
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
const res = await undici.request('https://www.youtube.com/youtubei/v1/player', {
|
|
23
|
+
method: 'POST',
|
|
24
|
+
headers: {
|
|
25
|
+
'Content-Type': 'application/json',
|
|
26
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
27
|
+
},
|
|
28
|
+
body: JSON.stringify(payload)
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
const body = await res.body.json();
|
|
32
|
+
console.log('Full JSON Response Keys:', Object.keys(body));
|
|
33
|
+
console.log('Raw JSON String (Truncated):', JSON.stringify(body).slice(0, 1000));
|
|
34
|
+
console.log('Playability:', body.playabilityStatus);
|
|
35
|
+
console.log('Title:', body.videoDetails?.title);
|
|
36
|
+
|
|
37
|
+
const formats = [...(body.streamingData?.formats || []), ...(body.streamingData?.adaptiveFormats || [])];
|
|
38
|
+
console.log('Total Formats:', formats.length);
|
|
39
|
+
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
testInnerTube();
|
package/test_api_clients.js
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
const undici = require('undici');
|
|
2
|
+
|
|
3
|
+
async function testClient(clientName, clientVersion, userAgent, osName='', osVersion='') {
|
|
4
|
+
const payload = {
|
|
5
|
+
context: {
|
|
6
|
+
client: {
|
|
7
|
+
hl: 'en',
|
|
8
|
+
gl: 'US',
|
|
9
|
+
clientName,
|
|
10
|
+
clientVersion,
|
|
11
|
+
osName,
|
|
12
|
+
osVersion
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
videoId: '_4j1Abt_AiM'
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
const res = await undici.request('https://www.youtube.com/youtubei/v1/player', {
|
|
19
|
+
method: 'POST',
|
|
20
|
+
headers: {
|
|
21
|
+
'Content-Type': 'application/json',
|
|
22
|
+
'User-Agent': userAgent
|
|
23
|
+
},
|
|
24
|
+
body: JSON.stringify(payload)
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
const body = await res.body.json();
|
|
28
|
+
const formats = [...(body.streamingData?.formats || []), ...(body.streamingData?.adaptiveFormats || [])];
|
|
29
|
+
console.log(`[${clientName}] Playability:`, body.playabilityStatus?.status, '| Formats:', formats.length);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
async function runAll() {
|
|
33
|
+
await testClient('WEB_EMBED', '1.20230209.00.00', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
|
|
34
|
+
await testClient('TVHTML5', '7.20230209.00.00', 'Mozilla/5.0 (Web0S; Linux/SmartTV) AppleWebKit/537.36 (KHTML, like Gecko)');
|
|
35
|
+
await testClient('ANDROID', '17.31.35', 'com.google.android.youtube/17.31.35 (Linux; U; Android 11)', 'Android', '11');
|
|
36
|
+
await testClient('IOS', '19.28.1', 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)', 'iOS', '17.5.1');
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
runAll();
|
package/test_client.js
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
const { StrawClient } = require('./dist/core/client.js');
|
|
2
|
+
|
|
3
|
+
async function test() {
|
|
4
|
+
const client = new StrawClient();
|
|
5
|
+
const payload = {
|
|
6
|
+
context: {
|
|
7
|
+
client: {
|
|
8
|
+
hl: 'en',
|
|
9
|
+
gl: 'US',
|
|
10
|
+
clientName: 'IOS',
|
|
11
|
+
clientVersion: '19.28.1',
|
|
12
|
+
osName: 'iOS',
|
|
13
|
+
osVersion: '17.5.1',
|
|
14
|
+
deviceMake: 'Apple',
|
|
15
|
+
deviceModel: 'iPhone16,2'
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
videoId: '_4j1Abt_AiM'
|
|
19
|
+
};
|
|
20
|
+
|
|
21
|
+
const res = await client.request('https://www.youtube.com/youtubei/v1/player', {
|
|
22
|
+
method: 'POST',
|
|
23
|
+
headers: {
|
|
24
|
+
'Content-Type': 'application/json',
|
|
25
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
26
|
+
},
|
|
27
|
+
body: JSON.stringify(payload)
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
const data = await res.json();
|
|
31
|
+
console.log(Object.keys(data));
|
|
32
|
+
if (data.playabilityStatus) {
|
|
33
|
+
console.log('Playability:', data.playabilityStatus);
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
test();
|
package/test_embed.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
const undici = require('undici');
|
|
2
|
+
|
|
3
|
+
async function testEmbed() {
|
|
4
|
+
const url = 'https://www.youtube.com/embed/_4j1Abt_AiM';
|
|
5
|
+
const res = await undici.request(url, {
|
|
6
|
+
headers: {
|
|
7
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
|
8
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
9
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
|
10
|
+
}
|
|
11
|
+
});
|
|
12
|
+
const html = await res.body.text();
|
|
13
|
+
|
|
14
|
+
const regex = /ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/;
|
|
15
|
+
const match = html.match(regex);
|
|
16
|
+
if (match) {
|
|
17
|
+
const data = JSON.parse(match[1]);
|
|
18
|
+
const formats = [...(data.streamingData?.formats || []), ...(data.streamingData?.adaptiveFormats || [])];
|
|
19
|
+
console.log('Embed playability:', data.playabilityStatus?.status);
|
|
20
|
+
console.log('Formats found:', formats.length);
|
|
21
|
+
} else {
|
|
22
|
+
console.log('No ytInitialPlayerResponse found in embed HTML');
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
testEmbed();
|
package/test_html.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
const undici = require('undici');
|
|
2
|
+
|
|
3
|
+
async function testHtml() {
|
|
4
|
+
const url = 'https://www.youtube.com/watch?v=_4j1Abt_AiM';
|
|
5
|
+
const res = await undici.request(url, {
|
|
6
|
+
method: 'GET',
|
|
7
|
+
headers: {
|
|
8
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0',
|
|
9
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
10
|
+
'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
|
|
11
|
+
}
|
|
12
|
+
});
|
|
13
|
+
|
|
14
|
+
const html = await res.body.text();
|
|
15
|
+
const match = html.match(/ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/);
|
|
16
|
+
if (match) {
|
|
17
|
+
const data = JSON.parse(match[1]);
|
|
18
|
+
const formats = [...(data.streamingData?.formats || []), ...(data.streamingData?.adaptiveFormats || [])];
|
|
19
|
+
console.log('Got HTML Response with Player:', data.playabilityStatus?.status);
|
|
20
|
+
console.log('Formats:', formats.length);
|
|
21
|
+
} else {
|
|
22
|
+
console.log('No ytInitialPlayerResponse found in direct HTML fetching.');
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
testHtml();
|
package/test_visitor.js
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
const undici = require('undici');
|
|
2
|
+
|
|
3
|
+
async function testVisitor() {
|
|
4
|
+
const videoId = '_4j1Abt_AiM';
|
|
5
|
+
const url = `https://www.youtube.com/watch?v=${videoId}`;
|
|
6
|
+
|
|
7
|
+
const htmlRes = await undici.request(url, {
|
|
8
|
+
method: 'GET',
|
|
9
|
+
headers: {
|
|
10
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/115.0.0.0 Safari/537.36',
|
|
11
|
+
'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
|
|
12
|
+
}
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
const html = await htmlRes.body.text();
|
|
16
|
+
|
|
17
|
+
let visitorData = '';
|
|
18
|
+
const match = html.match(/"visitorData"\s*:\s*"([^"]+)"/);
|
|
19
|
+
if (match) visitorData = match[1];
|
|
20
|
+
|
|
21
|
+
console.log('Got Visitor Data:', visitorData);
|
|
22
|
+
|
|
23
|
+
const payload = {
|
|
24
|
+
context: {
|
|
25
|
+
client: {
|
|
26
|
+
hl: 'en',
|
|
27
|
+
gl: 'US',
|
|
28
|
+
clientName: 'IOS',
|
|
29
|
+
clientVersion: '19.28.1',
|
|
30
|
+
osName: 'iOS',
|
|
31
|
+
osVersion: '17.5.1',
|
|
32
|
+
deviceMake: 'Apple',
|
|
33
|
+
deviceModel: 'iPhone16,2',
|
|
34
|
+
visitorData: visitorData
|
|
35
|
+
}
|
|
36
|
+
},
|
|
37
|
+
videoId: videoId
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
const res = await undici.request('https://www.youtube.com/youtubei/v1/player', {
|
|
41
|
+
method: 'POST',
|
|
42
|
+
headers: {
|
|
43
|
+
'Content-Type': 'application/json',
|
|
44
|
+
'User-Agent': 'com.google.ios.youtube/19.28.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X; en_US)'
|
|
45
|
+
},
|
|
46
|
+
body: JSON.stringify(payload)
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
const body = await res.body.json();
|
|
50
|
+
const formats = [...(body.streamingData?.formats || []), ...(body.streamingData?.adaptiveFormats || [])];
|
|
51
|
+
|
|
52
|
+
console.log('Target Playability:', body.playabilityStatus?.status);
|
|
53
|
+
console.log('Target Formats:', formats.length);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
testVisitor();
|
package/test_vr.js
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
const undici = require('undici');
|
|
2
|
+
|
|
3
|
+
async function testVR() {
|
|
4
|
+
const payload = {
|
|
5
|
+
context: {
|
|
6
|
+
client: {
|
|
7
|
+
clientName: 'ANDROID_TESTSUITE',
|
|
8
|
+
clientVersion: '1.9',
|
|
9
|
+
androidSdkVersion: 30,
|
|
10
|
+
hl: 'en',
|
|
11
|
+
gl: 'US',
|
|
12
|
+
utcOffsetMinutes: 0
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
videoId: '_4j1Abt_AiM'
|
|
16
|
+
};
|
|
17
|
+
const res = await undici.request('https://www.youtube.com/youtubei/v1/player', {
|
|
18
|
+
method: 'POST',
|
|
19
|
+
headers: { 'Content-Type': 'application/json', 'User-Agent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11)' },
|
|
20
|
+
body: JSON.stringify(payload)
|
|
21
|
+
});
|
|
22
|
+
const body = await res.body.json();
|
|
23
|
+
const formats = [...(body.streamingData?.formats || []), ...(body.streamingData?.adaptiveFormats || [])];
|
|
24
|
+
console.log('Playability:', body.playabilityStatus?.status);
|
|
25
|
+
console.log('Formats:', formats.length);
|
|
26
|
+
}
|
|
27
|
+
testVR();
|
package/test_yt.js
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
const straw = require('./dist/index.js');
|
|
2
|
+
|
|
3
|
+
async function run() {
|
|
4
|
+
console.time('YouTube Scrape');
|
|
5
|
+
const yt = new straw.YouTubeScraper();
|
|
6
|
+
try {
|
|
7
|
+
const res = await yt.scrapeVideo('https://youtu.be/_4j1Abt_AiM?si=qJY_gv4F_adBYMYP');
|
|
8
|
+
console.log('Title:', res.title);
|
|
9
|
+
console.log('Formats:', res.formats.length);
|
|
10
|
+
console.log('First format URL (truncated):', res.formats[0]?.url?.substring(0, 100));
|
|
11
|
+
} catch (e) {
|
|
12
|
+
console.error('Scrape failed:', e);
|
|
13
|
+
}
|
|
14
|
+
console.timeEnd('YouTube Scrape');
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
run();
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|