@zetagoaurum-dev/straw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,251 @@
1
+ // src/core/client.ts
2
+ import { fetch, Agent } from "undici";
3
+
4
// src/utils/helpers.ts
// Pool of realistic desktop browser User-Agent strings (Chrome, Firefox,
// Safari, Edge) used to vary the request fingerprint.
var USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:109.0) Gecko/20100101 Firefox/121.0",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
];
// Pick one of the pooled User-Agent strings uniformly at random.
function getRandomUserAgent() {
  const index = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[index];
}
// Promise-based delay used for retry backoff.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
20
+
21
// src/core/client.ts
var StrawClient = class {
  options;
  dispatcher;
  constructor(options = {}) {
    this.options = {
      timeout: 1e4,
      retries: 3,
      rotateUserAgent: true,
      ...options
    };
    // SECURITY FIX: keep TLS certificate verification enabled. The previous
    // `connect: { rejectUnauthorized: false }` disabled it for every request,
    // exposing callers to man-in-the-middle attacks.
    this.dispatcher = new Agent();
  }
  /**
   * Fetch a URL with built-in retries, timeout, and User-Agent rotation.
   * Retries on network errors and on HTTP 429/500/502/503/504 with
   * exponential backoff; throws after `retries` failed attempts.
   */
  async request(url, init) {
    let attempts = 0;
    const maxRetries = this.options.retries || 1;
    while (attempts < maxRetries) {
      // Per-attempt abort controller implementing the timeout.
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), this.options.timeout);
      try {
        const headers = new Headers(init?.headers);
        if (this.options.rotateUserAgent && !headers.has("User-Agent")) {
          headers.set("User-Agent", getRandomUserAgent());
        }
        // Default headers so the request resembles a normal browser.
        if (!headers.has("Accept")) {
          headers.set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
        }
        if (!headers.has("Accept-Language")) {
          headers.set("Accept-Language", "en-US,en;q=0.9");
        }
        const response = await fetch(url, {
          ...init,
          headers,
          signal: controller.signal,
          // NOTE(review): `proxy` is accepted but never turned into a proxy
          // dispatcher; setting it currently just bypasses our Agent.
          // TODO: wire up undici's ProxyAgent when a proxy is configured.
          dispatcher: this.options.proxy ? void 0 : this.dispatcher
        });
        // Rate limiting or transient server errors are retried.
        if ([429, 500, 502, 503, 504].includes(response.status)) {
          throw new Error(`HTTP Error ${response.status}`);
        }
        return response;
      } catch (error) {
        attempts++;
        if (attempts >= maxRetries) {
          const cause = error.cause ? String(error.cause) : "No cause provided";
          throw new Error(`Failed to fetch ${url} after ${maxRetries} attempts. Last error: ${error.message} - Cause: ${cause}`);
        }
        // Exponential backoff between attempts.
        await sleep(1e3 * Math.pow(2, attempts));
      } finally {
        // BUGFIX: previously cleared only on success, leaking a live timer
        // (which keeps the event loop alive) on every failed attempt.
        clearTimeout(timeoutId);
      }
    }
    throw new Error("Unreachable");
  }
  // Fetch and return the response body as text.
  async getText(url, init) {
    const response = await this.request(url, init);
    return await response.text();
  }
  // Fetch and parse the response body as JSON.
  async getJson(url, init) {
    const response = await this.request(url, init);
    return await response.json();
  }
};
89
+
90
// src/scrapers/web.ts
import * as cheerio from "cheerio";
var WebScraper = class {
  client;
  constructor(options) {
    this.client = new StrawClient(options);
  }
  /**
   * Scrape a webpage into structured data: title, description,
   * visible body text, absolute links, and all meta tags.
   */
  async scrape(url) {
    const html = await this.client.getText(url);
    const $ = cheerio.load(html);
    const title = $("title").text().trim();
    // Prefer the standard description tag, fall back to Open Graph.
    const description = $('meta[name="description"]').attr("content") || $('meta[property="og:description"]').attr("content") || "";
    const meta = {};
    for (const el of $("meta").toArray()) {
      const key = $(el).attr("name") || $(el).attr("property");
      const value = $(el).attr("content");
      if (key && value) {
        meta[key] = value;
      }
    }
    const links = [];
    for (const el of $("a").toArray()) {
      const href = $(el).attr("href");
      if (href && href.startsWith("http")) {
        links.push({ text: $(el).text().trim(), href });
      }
    }
    // Strip non-content nodes before extracting the visible text.
    $("script, style, noscript, iframe, svg").remove();
    const text = $("body").text().replace(/\s+/g, " ").trim();
    return { title, description, text, links, meta };
  }
};
136
+
137
// src/scrapers/youtube.ts
var YouTubeScraper = class {
  client;
  constructor(options) {
    this.client = new StrawClient(options);
  }
  /**
   * Extract video metadata and direct stream URLs by parsing the
   * ytInitialPlayerResponse JSON embedded in the watch-page HTML.
   */
  async scrapeVideo(url) {
    // The consent cookie skips the EU consent interstitial page.
    const html = await this.client.getText(url, {
      headers: {
        "Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430"
      }
    });
    const match = html.match(/ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/);
    if (!match || !match[1]) {
      throw new Error("ytInitialPlayerResponse not found. YouTube might have changed their layout or the IP is blocked.");
    }
    const data = JSON.parse(match[1]);
    const details = data?.videoDetails;
    const streamingData = data?.streamingData;
    if (!details) {
      throw new Error("Video details not found inside player response.");
    }
    const formats = [];
    const rawFormats = [...streamingData?.formats || [], ...streamingData?.adaptiveFormats || []];
    for (const entry of rawFormats) {
      // Formats protected by signatureCipher require youtube-dl-style
      // deciphering, which is intentionally out of scope — skip them.
      if (!entry.url) continue;
      const mimeType = entry.mimeType || "";
      formats.push({
        url: entry.url,
        mimeType,
        width: entry.width,
        height: entry.height,
        quality: entry.qualityLabel || entry.quality,
        bitrate: entry.bitrate,
        hasAudio: mimeType.includes("audio/"),
        hasVideo: mimeType.includes("video/")
      });
    }
    const thumbs = details.thumbnail?.thumbnails;
    return {
      title: details.title || "",
      author: details.author || "",
      description: details.shortDescription || "",
      views: details.viewCount || "0",
      durationSeconds: details.lengthSeconds || "0",
      // Last thumbnail entry is the highest resolution.
      thumbnail: thumbs?.[thumbs.length - 1]?.url || "",
      formats
    };
  }
};
194
+
195
// src/scrapers/media.ts
import * as cheerio2 from "cheerio";
var MediaScraper = class {
  client;
  constructor(options) {
    this.client = new StrawClient(options);
  }
  /**
   * Collect direct media/document URLs referenced anywhere in a page:
   * media elements, download links, and raw URLs embedded in markup.
   */
  async extractMedia(url) {
    const html = await this.client.getText(url);
    const $ = cheerio2.load(html);
    const pageTitle = $("title").text().trim();
    const found = /* @__PURE__ */ new Set();
    // 1. <video>/<audio>/<img>/<source> elements (src or srcset).
    for (const el of $("video, audio, img, source").toArray()) {
      const src = $(el).attr("src") || $(el).attr("srcset");
      if (!src) continue;
      const absolute = src.match(/https?:\/\/[^\s"',]+/g);
      if (absolute) {
        for (const u of absolute) found.add(u);
      } else if (src.startsWith("http")) {
        found.add(src);
      }
    }
    // 2. Anchors pointing at known media/document extensions.
    for (const el of $("a").toArray()) {
      const href = $(el).attr("href");
      if (href && href.startsWith("http") && href.match(/\.(pdf|doc|docx|xls|xlsx|ppt|pptx|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv|png|jpg|jpeg|gif|svg|webp|avif|ico|bmp)(\?.*)?$/i)) {
        found.add(href);
      }
    }
    // 3. Fallback: raw URLs inside scripts or inline JSON.
    const rawLinks = html.match(/https?:\/\/[^\s"',]+\.(png|jpg|jpeg|gif|svg|webp|avif|ico|bmp|pdf|doc|docx|xls|xlsx|ppt|pptx|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv)/gi);
    if (rawLinks) {
      for (const link of rawLinks) found.add(link);
    }
    return {
      pageTitle,
      mediaLinks: Array.from(found)
    };
  }
};
236
+
237
+ // src/index.ts
238
+ var straw = {
239
+ client: (options) => new StrawClient(options),
240
+ web: (options) => new WebScraper(options),
241
+ youtube: (options) => new YouTubeScraper(options),
242
+ media: (options) => new MediaScraper(options)
243
+ };
244
+ var index_default = straw;
245
+ export {
246
+ MediaScraper,
247
+ StrawClient,
248
+ WebScraper,
249
+ YouTubeScraper,
250
+ index_default as default
251
+ };
package/package.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "name": "@zetagoaurum-dev/straw",
3
+ "version": "1.0.0",
4
+ "description": "Enterprise-grade unified JS/TS and Python scraping library for Web, YouTube, and Media (Images, Audio, Video, Documents)",
5
+ "main": "dist/index.js",
6
+ "module": "dist/index.mjs",
7
+ "types": "dist/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "require": "./dist/index.js",
11
+ "import": "./dist/index.mjs",
12
+ "types": "./dist/index.d.ts"
13
+ }
14
+ },
15
+ "scripts": {
16
+ "build": "tsup src/index.ts --format cjs,esm --dts --clean",
17
+ "dev": "tsup src/index.ts --format cjs,esm --dts --watch",
18
+ "test": "tsx tests/test.ts"
19
+ },
20
+ "keywords": [
21
+ "scraping",
22
+ "scraper",
23
+ "youtube-scraper",
24
+ "media-extractor",
25
+ "anti-cors"
26
+ ],
27
+ "author": "ZetaGo-Aurum",
28
+ "license": "ISC",
29
+ "devDependencies": {
30
+ "@types/node": "^25.3.2",
31
+ "ts-node": "^10.9.2",
32
+ "tsup": "^8.5.1",
33
+ "tsx": "^4.21.0",
34
+ "typescript": "^5.9.3"
35
+ },
36
+ "dependencies": {
37
+ "cheerio": "^1.2.0",
38
+ "undici": "^7.22.0"
39
+ }
40
+ }
package/pyproject.toml ADDED
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "straw-scraper"
7
+ version = "1.0.0"
8
+ description = "A high-performance, enterprise-grade scraping library for Python."
9
+ authors = [
10
+ { name = "ZetaGo-Aurum" }
11
+ ]
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ dependencies = [
15
+ "httpx>=0.25.0",
16
+ "beautifulsoup4>=4.12.0",
17
+ "lxml>=4.9.0"
18
+ ]
19
+ keywords = ["scraping", "scraper", "youtube-scraper", "media-extractor", "anti-cors"]
20
+ license = { text = "ISC" }
21
+
22
+ [project.urls]
23
+ Homepage = "https://github.com/ZetaGo-Aurum/straw"
@@ -0,0 +1,94 @@
1
+ import { fetch, RequestInit, Response, Agent } from 'undici';
2
+ import { getRandomUserAgent, sleep } from '../utils/helpers';
3
+
4
+ export interface StrawClientOptions {
5
+ proxy?: string;
6
+ timeout?: number;
7
+ retries?: number;
8
+ rotateUserAgent?: boolean;
9
+ }
10
+
11
+ export class StrawClient {
12
+ private options: StrawClientOptions;
13
+ private dispatcher: Agent;
14
+
15
+ constructor(options: StrawClientOptions = {}) {
16
+ this.options = {
17
+ timeout: 10000,
18
+ retries: 3,
19
+ rotateUserAgent: true,
20
+ ...options
21
+ };
22
+
23
+ this.dispatcher = new Agent({
24
+ connect: {
25
+ rejectUnauthorized: false
26
+ }
27
+ });
28
+ }
29
+
30
+ /**
31
+ * Fetch a URL with built-in retries, timeout, and User-Agent rotation.
32
+ */
33
+ public async request(url: string, init?: RequestInit): Promise<Response> {
34
+ let attempts = 0;
35
+ const maxRetries = this.options.retries || 1;
36
+
37
+ while (attempts < maxRetries) {
38
+ try {
39
+ const headers = new Headers(init?.headers as any);
40
+
41
+ if (this.options.rotateUserAgent && !headers.has('User-Agent')) {
42
+ headers.set('User-Agent', getRandomUserAgent());
43
+ }
44
+
45
+ // Default headers to masquerade as a normal browser
46
+ if (!headers.has('Accept')) {
47
+ headers.set('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7');
48
+ }
49
+ if (!headers.has('Accept-Language')) {
50
+ headers.set('Accept-Language', 'en-US,en;q=0.9');
51
+ }
52
+
53
+ // Setup AbortController for timeout
54
+ const controller = new AbortController();
55
+ const timeoutId = setTimeout(() => controller.abort(), this.options.timeout);
56
+
57
+ const response = await fetch(url, {
58
+ ...init,
59
+ headers,
60
+ signal: controller.signal as RequestInit['signal'],
61
+ dispatcher: this.options.proxy ? undefined : this.dispatcher
62
+ });
63
+
64
+ clearTimeout(timeoutId);
65
+
66
+ // If rate limited or standard server error, retry
67
+ if ([429, 500, 502, 503, 504].includes(response.status)) {
68
+ throw new Error(`HTTP Error ${response.status}`);
69
+ }
70
+
71
+ return response;
72
+ } catch (error: any) {
73
+ attempts++;
74
+ if (attempts >= maxRetries) {
75
+ const cause = error.cause ? String(error.cause) : 'No cause provided';
76
+ throw new Error(`Failed to fetch ${url} after ${maxRetries} attempts. Last error: ${error.message} - Cause: ${cause}`);
77
+ }
78
+ // Exponential backoff
79
+ await sleep(1000 * Math.pow(2, attempts));
80
+ }
81
+ }
82
+ throw new Error('Unreachable');
83
+ }
84
+
85
+ public async getText(url: string, init?: RequestInit): Promise<string> {
86
+ const response = await this.request(url, init);
87
+ return await response.text();
88
+ }
89
+
90
+ public async getJson<T>(url: string, init?: RequestInit): Promise<T> {
91
+ const response = await this.request(url, init);
92
+ return await response.json() as T;
93
+ }
94
+ }
package/src/index.ts ADDED
@@ -0,0 +1,26 @@
1
+ import { StrawClient, StrawClientOptions } from './core/client';
2
+ import { WebScraper, WebScrapeResult } from './scrapers/web';
3
+ import { YouTubeScraper, YouTubeResult, YouTubeFormats } from './scrapers/youtube';
4
+ import { MediaScraper, MediaScrapeResult } from './scrapers/media';
5
+
6
+ export type { StrawClientOptions };
7
+ export type { WebScrapeResult };
8
+ export type { YouTubeResult, YouTubeFormats };
9
+ export type { MediaScrapeResult };
10
+
11
+ export {
12
+ StrawClient,
13
+ WebScraper,
14
+ YouTubeScraper,
15
+ MediaScraper
16
+ };
17
+
18
+ // Default export wrapper
19
+ const straw = {
20
+ client: (options?: StrawClientOptions) => new StrawClient(options),
21
+ web: (options?: StrawClientOptions) => new WebScraper(options),
22
+ youtube: (options?: StrawClientOptions) => new YouTubeScraper(options),
23
+ media: (options?: StrawClientOptions) => new MediaScraper(options),
24
+ };
25
+
26
+ export default straw;
@@ -0,0 +1,58 @@
1
+ import * as cheerio from 'cheerio';
2
+ import { StrawClient, StrawClientOptions } from '../core/client';
3
+
4
+ export interface MediaScrapeResult {
5
+ pageTitle: string;
6
+ mediaLinks: string[];
7
+ }
8
+
9
+ export class MediaScraper {
10
+ private client: StrawClient;
11
+
12
+ constructor(options?: StrawClientOptions) {
13
+ this.client = new StrawClient(options);
14
+ }
15
+
16
+ /**
17
+ * Attempts to find direct media files (Images/Audio/Video/Documents) referenced in any generic webpage HTML.
18
+ */
19
+ public async extractMedia(url: string): Promise<MediaScrapeResult> {
20
+ const html = await this.client.getText(url);
21
+ const $ = cheerio.load(html);
22
+
23
+ const pageTitle = $('title').text().trim();
24
+ const mediaLinks = new Set<string>();
25
+
26
+ // 1. Check <video>, <audio>, <img>, and <source> tags
27
+ $('video, audio, img, source').each((_, el) => {
28
+ const src = $(el).attr('src') || $(el).attr('srcset');
29
+ if (src) {
30
+ // handle srcset parsing simply by grabbing the first URL if needed, or just finding http links
31
+ const urls = src.match(/https?:\/\/[^\s"',]+/g);
32
+ if (urls) urls.forEach(u => mediaLinks.add(u));
33
+ else if (src.startsWith('http')) mediaLinks.add(src);
34
+ }
35
+ });
36
+
37
+ // 2. Check <a> tags for document/media links
38
+ $('a').each((_, el) => {
39
+ const href = $(el).attr('href');
40
+ if (href && href.startsWith('http') && href.match(/\.(pdf|doc|docx|xls|xlsx|ppt|pptx|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv|png|jpg|jpeg|gif|svg|webp|avif|ico|bmp)(\?.*)?$/i)) {
41
+ mediaLinks.add(href);
42
+ }
43
+ });
44
+
45
+ // 3. Fallback: Check regex for embedded JSON or JS containing media/document links
46
+ const rawLinksMatch = html.match(/https?:\/\/[^\s"',]+\.(png|jpg|jpeg|gif|svg|webp|avif|ico|bmp|pdf|doc|docx|xls|xlsx|ppt|pptx|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv)/gi);
47
+ if (rawLinksMatch) {
48
+ for (const link of rawLinksMatch) {
49
+ mediaLinks.add(link);
50
+ }
51
+ }
52
+
53
+ return {
54
+ pageTitle,
55
+ mediaLinks: Array.from(mediaLinks)
56
+ };
57
+ }
58
+ }
@@ -0,0 +1,64 @@
1
+ import * as cheerio from 'cheerio';
2
+ import { StrawClient, StrawClientOptions } from '../core/client';
3
+
4
+ export interface WebScrapeResult {
5
+ title: string;
6
+ description: string;
7
+ text: string;
8
+ links: { text: string; href: string }[];
9
+ meta: Record<string, string>;
10
+ }
11
+
12
+ export class WebScraper {
13
+ private client: StrawClient;
14
+
15
+ constructor(options?: StrawClientOptions) {
16
+ this.client = new StrawClient(options);
17
+ }
18
+
19
+ /**
20
+ * Scrape a webpage and return structured data.
21
+ * Extracts title, generic text, metadata, and all links.
22
+ */
23
+ public async scrape(url: string): Promise<WebScrapeResult> {
24
+ const html = await this.client.getText(url);
25
+ const $ = cheerio.load(html);
26
+
27
+ const title = $('title').text().trim();
28
+ let description = $('meta[name="description"]').attr('content') || '';
29
+
30
+ if (!description) {
31
+ description = $('meta[property="og:description"]').attr('content') || '';
32
+ }
33
+
34
+ const meta: Record<string, string> = {};
35
+ $('meta').each((_, el) => {
36
+ const name = $(el).attr('name') || $(el).attr('property');
37
+ const content = $(el).attr('content');
38
+ if (name && content) {
39
+ meta[name] = content;
40
+ }
41
+ });
42
+
43
+ const links: { text: string; href: string }[] = [];
44
+ $('a').each((_, el) => {
45
+ const href = $(el).attr('href');
46
+ const text = $(el).text().trim();
47
+ if (href && href.startsWith('http')) {
48
+ links.push({ text, href });
49
+ }
50
+ });
51
+
52
+ // Remove scripts and styles for cleaner text extraction
53
+ $('script, style, noscript, iframe, svg').remove();
54
+ const text = $('body').text().replace(/\s+/g, ' ').trim();
55
+
56
+ return {
57
+ title,
58
+ description,
59
+ text,
60
+ links,
61
+ meta
62
+ };
63
+ }
64
+ }
@@ -0,0 +1,92 @@
1
+ import { StrawClient, StrawClientOptions } from '../core/client';
2
+
3
+ export interface YouTubeFormats {
4
+ url: string;
5
+ mimeType: string;
6
+ width?: number;
7
+ height?: number;
8
+ quality?: string;
9
+ bitrate?: number;
10
+ hasAudio: boolean;
11
+ hasVideo: boolean;
12
+ }
13
+
14
+ export interface YouTubeResult {
15
+ title: string;
16
+ author: string;
17
+ description: string;
18
+ views: string;
19
+ durationSeconds: string;
20
+ thumbnail: string;
21
+ formats: YouTubeFormats[];
22
+ }
23
+
24
+ export class YouTubeScraper {
25
+ private client: StrawClient;
26
+
27
+ constructor(options?: StrawClientOptions) {
28
+ this.client = new StrawClient(options);
29
+ }
30
+
31
+ /**
32
+ * Extracts YouTube video metadata and direct stream URLs without external bloatware.
33
+ * Parses the ytInitialPlayerResponse object embedded in the watch HTML.
34
+ */
35
+ public async scrapeVideo(url: string): Promise<YouTubeResult> {
36
+ const html = await this.client.getText(url, {
37
+ headers: {
38
+ 'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
39
+ }
40
+ });
41
+
42
+ // Find ytInitialPlayerResponse JSON fragment in the HTML
43
+ const regex = /ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)/;
44
+ const match = html.match(regex);
45
+
46
+ if (!match || !match[1]) {
47
+ throw new Error('ytInitialPlayerResponse not found. YouTube might have changed their layout or the IP is blocked.');
48
+ }
49
+
50
+ const data = JSON.parse(match[1]);
51
+ const details = data?.videoDetails;
52
+ const streamingData = data?.streamingData;
53
+
54
+ if (!details) {
55
+ throw new Error('Video details not found inside player response.');
56
+ }
57
+
58
+ const formats: YouTubeFormats[] = [];
59
+ const rawFormats = [...(streamingData?.formats || []), ...(streamingData?.adaptiveFormats || [])];
60
+
61
+ for (const format of rawFormats) {
62
+ if (format.url) {
63
+ const mimeType = format.mimeType || '';
64
+ formats.push({
65
+ url: format.url,
66
+ mimeType: mimeType,
67
+ width: format.width,
68
+ height: format.height,
69
+ quality: format.qualityLabel || format.quality,
70
+ bitrate: format.bitrate,
71
+ hasAudio: mimeType.includes('audio/'),
72
+ hasVideo: mimeType.includes('video/')
73
+ });
74
+ } else if (format.signatureCipher) {
75
+ // To avoid bloatware, we do not implement the complex decipher algorithm here.
76
+ // Modern APIs usually provide the URL directly for lower qualities or we can fallback to other APIs.
77
+ // Implementing decipher requires porting youtube-dl's sig logic or using ytdl-core.
78
+ continue;
79
+ }
80
+ }
81
+
82
+ return {
83
+ title: details.title || '',
84
+ author: details.author || '',
85
+ description: details.shortDescription || '',
86
+ views: details.viewCount || '0',
87
+ durationSeconds: details.lengthSeconds || '0',
88
+ thumbnail: details.thumbnail?.thumbnails?.[details.thumbnail.thumbnails.length - 1]?.url || '',
89
+ formats
90
+ };
91
+ }
92
+ }
@@ -0,0 +1,17 @@
1
+ export const USER_AGENTS = [
2
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
3
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
4
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
5
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
6
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:109.0) Gecko/20100101 Firefox/121.0',
7
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
8
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
9
+ ];
10
+
11
+ export function getRandomUserAgent(): string {
12
+ return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
13
+ }
14
+
15
+ export function sleep(ms: number): Promise<void> {
16
+ return new Promise(resolve => setTimeout(resolve, ms));
17
+ }
@@ -0,0 +1,11 @@
1
"""Public package surface for the straw scraper (Python port)."""

from .client import StrawClient
from .web import WebScraper
from .youtube import YouTubeScraper
from .media import MediaScraper

# Names exported via `from <package> import *`.
__all__ = [
    "StrawClient",
    "WebScraper",
    "YouTubeScraper",
    "MediaScraper",
]