@zetagoaurum-dev/straw 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/LICENSE +21 -0
- package/README.md +136 -0
- package/dist/index.d.mts +90 -0
- package/dist/index.d.ts +90 -0
- package/dist/index.js +291 -0
- package/dist/index.mjs +251 -0
- package/package.json +40 -0
- package/pyproject.toml +23 -0
- package/src/core/client.ts +94 -0
- package/src/index.ts +26 -0
- package/src/scrapers/media.ts +58 -0
- package/src/scrapers/web.ts +64 -0
- package/src/scrapers/youtube.ts +92 -0
- package/src/utils/helpers.ts +17 -0
- package/straw/__init__.py +11 -0
- package/straw/__pycache__/__init__.cpython-311.pyc +0 -0
- package/straw/__pycache__/client.cpython-311.pyc +0 -0
- package/straw/__pycache__/helpers.cpython-311.pyc +0 -0
- package/straw/__pycache__/media.cpython-311.pyc +0 -0
- package/straw/__pycache__/web.cpython-311.pyc +0 -0
- package/straw/__pycache__/youtube.cpython-311.pyc +0 -0
- package/straw/client.py +64 -0
- package/straw/helpers.py +18 -0
- package/straw/media.py +38 -0
- package/straw/web.py +51 -0
- package/straw/youtube.py +55 -0
- package/tests/test.py +52 -0
- package/tests/test.ts +45 -0
- package/tsconfig.json +13 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
// src/core/client.ts
|
|
2
|
+
import { fetch, Agent } from "undici";
|
|
3
|
+
|
|
4
|
+
// src/utils/helpers.ts
|
|
5
|
+
/**
 * Pool of desktop browser User-Agent strings that `getRandomUserAgent`
 * picks from.
 */
var USER_AGENTS = [
  // Chrome 120 on Windows, macOS and Linux
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  // Firefox 121 on Windows and macOS
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:109.0) Gecko/20100101 Firefox/121.0",
  // Safari 17.2 on macOS
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
  // Edge 120 on Windows
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
];
|
|
14
|
+
/**
 * Returns one entry of `USER_AGENTS`, chosen uniformly at random.
 */
function getRandomUserAgent() {
  const index = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[index];
}
|
|
17
|
+
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
20
|
+
|
|
21
|
+
// src/core/client.ts
|
|
22
|
+
/**
 * HTTP client shared by the scrapers. Wraps undici's `fetch` with a
 * per-attempt timeout, retries with exponential backoff and browser-like
 * default headers.
 */
var StrawClient = class {
  options;
  dispatcher;
  /**
   * @param options Optional overrides for `timeout` (ms, default 10000),
   *   `retries` (default 3), `rotateUserAgent` (default true) and `proxy`.
   */
  constructor(options = {}) {
    this.options = {
      timeout: 1e4,
      retries: 3,
      rotateUserAgent: true,
      ...options
    };
    // SECURITY NOTE(review): `rejectUnauthorized: false` turns off TLS
    // certificate verification for every request made through this
    // dispatcher. Kept for backward compatibility; confirm it is intended.
    this.dispatcher = new Agent({
      connect: {
        rejectUnauthorized: false
      }
    });
  }
  /**
   * Fetch a URL with built-in retries, timeout, and User-Agent rotation.
   *
   * The timeout covers the time until `fetch` resolves (response headers);
   * reading the body afterwards is not bounded by it. Responses with status
   * 429/500/502/503/504 are treated as failures and retried.
   *
   * @param url Absolute URL to request.
   * @param init Optional undici `RequestInit`; caller headers win over defaults.
   * @returns The first response whose status is not in the retry list.
   * @throws Error once every attempt has failed.
   */
  async request(url, init) {
    // Always perform at least one attempt, even for 0, negative or NaN
    // `retries` (previously a negative value skipped the loop entirely).
    const configured = Number(this.options.retries);
    const maxRetries = Number.isFinite(configured) && configured >= 1 ? Math.ceil(configured) : 1;
    for (let attempts = 1; attempts <= maxRetries; attempts++) {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), this.options.timeout);
      try {
        const response = await fetch(url, {
          ...init,
          headers: this.buildHeaders(init),
          signal: controller.signal,
          // NOTE(review): `options.proxy` only switches off the custom
          // dispatcher; no proxy agent is created in this class, so the
          // proxy value itself is never used to route traffic.
          dispatcher: this.options.proxy ? void 0 : this.dispatcher
        });
        if ([429, 500, 502, 503, 504].includes(response.status)) {
          // Release the unread body so the connection is not kept busy.
          if (response.body) {
            await response.body.cancel().catch(() => void 0);
          }
          throw new Error(`HTTP Error ${response.status}`);
        }
        return response;
      } catch (error) {
        if (attempts >= maxRetries) {
          const cause = error && error.cause ? String(error.cause) : "No cause provided";
          const message = error && error.message !== void 0 ? error.message : String(error);
          throw new Error(`Failed to fetch ${url} after ${maxRetries} attempts. Last error: ${message} - Cause: ${cause}`);
        }
      } finally {
        // Clear the timer on success AND failure so it never outlives the attempt.
        clearTimeout(timeoutId);
      }
      // Exponential backoff: 2s, 4s, 8s, ...
      await sleep(1e3 * Math.pow(2, attempts));
    }
    throw new Error("Unreachable");
  }
  /** Builds the request headers: caller-supplied values plus browser-like defaults. */
  buildHeaders(init) {
    const headers = new Headers(init?.headers);
    if (this.options.rotateUserAgent && !headers.has("User-Agent")) {
      headers.set("User-Agent", getRandomUserAgent());
    }
    if (!headers.has("Accept")) {
      headers.set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
    }
    if (!headers.has("Accept-Language")) {
      headers.set("Accept-Language", "en-US,en;q=0.9");
    }
    return headers;
  }
  /** Requests `url` and returns the response body as text. */
  async getText(url, init) {
    const response = await this.request(url, init);
    return await response.text();
  }
  /** Requests `url` and returns the response body parsed as JSON. */
  async getJson(url, init) {
    const response = await this.request(url, init);
    return await response.json();
  }
};
|
|
89
|
+
|
|
90
|
+
// src/scrapers/web.ts
|
|
91
|
+
import * as cheerio from "cheerio";
|
|
92
|
+
/**
 * Generic page scraper: downloads a page with `StrawClient` and extracts
 * title, description, meta tags, links and visible text with cheerio.
 */
var WebScraper = class {
  client;
  /** @param options Forwarded unchanged to `StrawClient`. */
  constructor(options) {
    this.client = new StrawClient(options);
  }
  /**
   * Scrape a webpage and return structured data.
   * Extracts title, generic text, metadata, and all links.
   *
   * @param url Page URL; also the base for resolving relative links.
   * @returns `{ title, description, text, links, meta }`.
   */
  async scrape(url) {
    const html = await this.client.getText(url);
    const $ = cheerio.load(html);
    // Prefer the document <title>; a bare `$("title")` would also pick up
    // <title> elements nested in inline SVG and concatenate their text.
    const headTitle = $("head > title").first();
    const title = (headTitle.length > 0 ? headTitle : $("title").first()).text().trim();
    const description = $('meta[name="description"]').attr("content") || $('meta[property="og:description"]').attr("content") || "";
    const meta = {};
    $("meta").each((_, el) => {
      const tag = $(el);
      const name = tag.attr("name") || tag.attr("property");
      const content = tag.attr("content");
      if (name && content) {
        meta[name] = content;
      }
    });
    // Honour <base href> when present, otherwise resolve against the page URL.
    const baseUrl = this.resolveHttpUrl($("base[href]").first().attr("href"), url) || url;
    const links = [];
    $("a[href]").each((_, el) => {
      const anchor = $(el);
      const href = this.resolveHttpUrl(anchor.attr("href"), baseUrl);
      if (href) {
        links.push({ text: anchor.text().trim(), href });
      }
    });
    // Strip non-content elements before collecting the visible text.
    $("script, style, noscript, iframe, svg").remove();
    const text = $("body").text().replace(/\s+/g, " ").trim();
    return {
      title,
      description,
      text,
      links,
      meta
    };
  }
  /**
   * Turns an attribute value into an absolute http(s) URL.
   * Absolute http(s) values are returned as written; relative ones are
   * resolved against `base`. Returns undefined for empty values, other
   * schemes (mailto:, javascript:, data:, ...) and unparsable input.
   */
  resolveHttpUrl(raw, base) {
    const candidate = raw ? raw.trim() : "";
    if (!candidate) return void 0;
    if (/^https?:\/\//i.test(candidate)) return candidate;
    try {
      const resolved = new URL(candidate, base);
      return resolved.protocol === "http:" || resolved.protocol === "https:" ? resolved.href : void 0;
    } catch {
      return void 0;
    }
  }
};
|
|
136
|
+
|
|
137
|
+
// src/scrapers/youtube.ts
|
|
138
|
+
/**
 * Scraper for YouTube watch pages. Reads the `ytInitialPlayerResponse`
 * object embedded in the HTML; no signature deciphering is performed.
 */
var YouTubeScraper = class {
  client;
  /** @param options Forwarded unchanged to `StrawClient`. */
  constructor(options) {
    this.client = new StrawClient(options);
  }
  /**
   * Extracts YouTube video metadata and direct stream URLs without external bloatware.
   * Parses the ytInitialPlayerResponse object embedded in the watch HTML.
   *
   * Formats that only carry a `signatureCipher` (no plain `url`) are skipped.
   *
   * @param url Watch page URL.
   * @returns Video metadata plus every format that exposes a direct URL.
   * @throws Error when the player response or its `videoDetails` is missing.
   */
  async scrapeVideo(url) {
    const html = await this.client.getText(url, {
      headers: {
        "Cookie": "CONSENT=YES+cb.20230501-14-p0.en+FX+430"
      }
    });
    const data = this.findPlayerResponse(html);
    if (!data) {
      throw new Error("ytInitialPlayerResponse not found. YouTube might have changed their layout or the IP is blocked.");
    }
    const details = data.videoDetails;
    const streamingData = data.streamingData;
    if (!details) {
      throw new Error("Video details not found inside player response.");
    }
    const rawFormats = [...streamingData?.formats || [], ...streamingData?.adaptiveFormats || []];
    const formats = [];
    for (const format of rawFormats) {
      if (!format.url) continue;
      const mimeType = format.mimeType || "";
      formats.push({
        url: format.url,
        mimeType,
        width: format.width,
        height: format.height,
        quality: format.qualityLabel || format.quality,
        bitrate: format.bitrate,
        // Muxed streams report a `video/*` mime type, so the mime type alone
        // cannot tell whether audio is present; YouTube additionally sets
        // audioQuality/audioChannels on formats that carry an audio track.
        hasAudio: mimeType.includes("audio/") || format.audioQuality !== void 0 || format.audioChannels !== void 0,
        hasVideo: mimeType.includes("video/")
      });
    }
    const thumbnails = details.thumbnail?.thumbnails || [];
    return {
      title: details.title || "",
      author: details.author || "",
      description: details.shortDescription || "",
      views: details.viewCount || "0",
      durationSeconds: details.lengthSeconds || "0",
      // The last entry of the thumbnails list is used.
      thumbnail: thumbnails[thumbnails.length - 1]?.url || "",
      formats
    };
  }
  /**
   * Locates `ytInitialPlayerResponse = {...}` in the page and parses it.
   * Every assignment is tried in document order; the first one that yields
   * a JSON object wins. Returns undefined when none can be parsed.
   */
  findPlayerResponse(html) {
    const marker = /ytInitialPlayerResponse\s*=\s*\{/g;
    let hit;
    while ((hit = marker.exec(html)) !== null) {
      const raw = this.sliceBalancedObject(html, hit.index + hit[0].length - 1);
      if (!raw) continue;
      try {
        const parsed = JSON.parse(raw);
        if (parsed !== null && typeof parsed === "object") return parsed;
      } catch {
        // Not valid JSON (e.g. a truncated candidate) - try the next one.
      }
    }
    return void 0;
  }
  /**
   * Returns the substring starting at the `{` at `start` and ending at its
   * matching `}`. Braces inside double-quoted strings are ignored, so the
   * result does not depend on what follows the object or on newlines.
   */
  sliceBalancedObject(text, start) {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === "\\") escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') inString = true;
      else if (ch === "{") depth++;
      else if (ch === "}" && --depth === 0) return text.slice(start, i + 1);
    }
    return void 0;
  }
};
|
|
194
|
+
|
|
195
|
+
// src/scrapers/media.ts
|
|
196
|
+
import * as cheerio2 from "cheerio";
|
|
197
|
+
/**
 * Finds direct links to media and document files referenced by a page.
 */
var MediaScraper = class {
  // File extensions treated as media/documents. Written with optional
  // suffixes (docx?, xlsx?, pptx?, jpe?g) so the longer form is matched
  // instead of being cut to the shorter one.
  static EXTENSIONS = "pdf|docx?|xlsx?|pptx?|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv|png|jpe?g|gif|svg|webp|avif|ico|bmp";
  client;
  /** @param options Forwarded unchanged to `StrawClient`. */
  constructor(options) {
    this.client = new StrawClient(options);
  }
  /**
   * Attempts to find direct media files (Images/Audio/Video/Documents) referenced in any generic webpage HTML.
   *
   * Sources, in order: `src`/`srcset` of video/audio/img/source elements,
   * anchors whose target ends in a known extension, and a regex sweep of the
   * raw HTML for absolute URLs embedded in scripts or JSON.
   *
   * @param url Page URL; also the base for resolving relative references.
   * @returns `{ pageTitle, mediaLinks }` with de-duplicated links.
   */
  async extractMedia(url) {
    const html = await this.client.getText(url);
    const $ = cheerio2.load(html);
    // Prefer the document <title>; inline SVGs may contain their own <title>.
    const headTitle = $("head > title").first();
    const pageTitle = (headTitle.length > 0 ? headTitle : $("title").first()).text().trim();
    const baseUrl = this.resolveHttpUrl($("base[href]").first().attr("href"), url) || url;
    const mediaLinks = /* @__PURE__ */ new Set();
    const addLink = (raw) => {
      const resolved = this.resolveHttpUrl(raw, baseUrl);
      if (resolved) mediaLinks.add(resolved);
    };
    // 1. Media elements: take `src` AND every `srcset` candidate.
    $("video, audio, img, source").each((_, el) => {
      const node = $(el);
      addLink(node.attr("src"));
      for (const candidate of this.parseSrcset(node.attr("srcset") || "")) {
        addLink(candidate);
      }
    });
    // 2. Anchors that point at a file with a known extension
    //    (optionally followed by a query string or fragment).
    const fileLink = new RegExp(`\\.(?:${MediaScraper.EXTENSIONS})(?:[?#].*)?$`, "i");
    $("a[href]").each((_, el) => {
      const resolved = this.resolveHttpUrl($(el).attr("href"), baseUrl);
      if (resolved && fileLink.test(resolved)) {
        mediaLinks.add(resolved);
      }
    });
    // 3. Fallback: absolute URLs anywhere in the raw HTML. The lookahead stops
    //    a match from ending in the middle of a longer extension, and the
    //    optional query part keeps signed URLs intact.
    const embedded = new RegExp(`https?://[^\\s"'<>,]+\\.(?:${MediaScraper.EXTENSIONS})(?![A-Za-z0-9_])(?:\\?[^\\s"'<>,]*)?`, "gi");
    for (const link of html.match(embedded) || []) {
      // Raw HTML still has `&` entity-encoded; decode it so the URL matches
      // the attribute-derived form and de-duplicates correctly.
      mediaLinks.add(link.replace(/&amp;/g, "&"));
    }
    return {
      pageTitle,
      mediaLinks: Array.from(mediaLinks)
    };
  }
  /**
   * Extracts the URLs from a `srcset` value ("url descriptor, url descriptor").
   * Commas inside a URL are preserved; only a trailing comma or a comma
   * after a descriptor separates candidates.
   */
  parseSrcset(srcset) {
    const urls = [];
    let expectUrl = true;
    for (let token of srcset.trim().split(/\s+/)) {
      if (!expectUrl) {
        const comma = token.indexOf(",");
        if (comma === -1) continue; // still inside the descriptor
        token = token.slice(comma + 1); // e.g. "1x,next.png"
        expectUrl = true;
      }
      if (token === "") continue;
      const candidate = token.replace(/^,+|,+$/g, "");
      if (candidate) urls.push(candidate);
      // A URL token ending in "," has no descriptor; the next token is a URL.
      expectUrl = token.endsWith(",");
    }
    return urls;
  }
  /**
   * Turns an attribute value into an absolute http(s) URL.
   * Absolute http(s) values are returned as written; relative ones are
   * resolved against `base`. Returns undefined for empty values, other
   * schemes (data:, blob:, ...) and unparsable input.
   */
  resolveHttpUrl(raw, base) {
    const candidate = raw ? raw.trim() : "";
    if (!candidate) return void 0;
    if (/^https?:\/\//i.test(candidate)) return candidate;
    try {
      const resolved = new URL(candidate, base);
      return resolved.protocol === "http:" || resolved.protocol === "https:" ? resolved.href : void 0;
    } catch {
      return void 0;
    }
  }
};
|
|
236
|
+
|
|
237
|
+
// src/index.ts
|
|
238
|
+
/**
 * Default export: factory helpers that construct each scraper/client with
 * the given options.
 */
var straw = {
  /** Creates a `StrawClient`. */
  client(options) {
    return new StrawClient(options);
  },
  /** Creates a `WebScraper`. */
  web(options) {
    return new WebScraper(options);
  },
  /** Creates a `YouTubeScraper`. */
  youtube(options) {
    return new YouTubeScraper(options);
  },
  /** Creates a `MediaScraper`. */
  media(options) {
    return new MediaScraper(options);
  }
};
var index_default = straw;
|
|
245
|
+
export {
|
|
246
|
+
MediaScraper,
|
|
247
|
+
StrawClient,
|
|
248
|
+
WebScraper,
|
|
249
|
+
YouTubeScraper,
|
|
250
|
+
index_default as default
|
|
251
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@zetagoaurum-dev/straw",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Enterprise-grade unified JS/TS and Python scraping library for Web, YouTube, and Media (Images, Audio, Video, Documents)",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"module": "dist/index.mjs",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"require": "./dist/index.js",
|
|
11
|
+
"import": "./dist/index.mjs",
|
|
12
|
+
"types": "./dist/index.d.ts"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"scripts": {
|
|
16
|
+
"build": "tsup src/index.ts --format cjs,esm --dts --clean",
|
|
17
|
+
"dev": "tsup src/index.ts --format cjs,esm --dts --watch",
|
|
18
|
+
"test": "tsx tests/test.ts"
|
|
19
|
+
},
|
|
20
|
+
"keywords": [
|
|
21
|
+
"scraping",
|
|
22
|
+
"scraper",
|
|
23
|
+
"youtube-scraper",
|
|
24
|
+
"media-extractor",
|
|
25
|
+
"anti-cors"
|
|
26
|
+
],
|
|
27
|
+
"author": "ZetaGo-Aurum",
|
|
28
|
+
"license": "ISC",
|
|
29
|
+
"devDependencies": {
|
|
30
|
+
"@types/node": "^25.3.2",
|
|
31
|
+
"ts-node": "^10.9.2",
|
|
32
|
+
"tsup": "^8.5.1",
|
|
33
|
+
"tsx": "^4.21.0",
|
|
34
|
+
"typescript": "^5.9.3"
|
|
35
|
+
},
|
|
36
|
+
"dependencies": {
|
|
37
|
+
"cheerio": "^1.2.0",
|
|
38
|
+
"undici": "^7.22.0"
|
|
39
|
+
}
|
|
40
|
+
}
|
package/pyproject.toml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "straw-scraper"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "A high-performance, enterprise-grade scraping library for Python."
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "ZetaGo-Aurum" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"httpx>=0.25.0",
|
|
16
|
+
"beautifulsoup4>=4.12.0",
|
|
17
|
+
"lxml>=4.9.0"
|
|
18
|
+
]
|
|
19
|
+
keywords = ["scraping", "scraper", "youtube-scraper", "media-extractor", "anti-cors"]
|
|
20
|
+
license = { text = "ISC" }
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/ZetaGo-Aurum/straw"
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { fetch, RequestInit, Response, Agent } from 'undici';
|
|
2
|
+
import { getRandomUserAgent, sleep } from '../utils/helpers';
|
|
3
|
+
|
|
4
|
+
/**
 * Options accepted by `StrawClient` and by every scraper constructor.
 */
export interface StrawClientOptions {
    /**
     * Proxy setting. In `StrawClient.request` a truthy value only disables
     * the client's custom dispatcher; the value is not otherwise read there.
     */
    proxy?: string;
    /** Milliseconds before a single attempt is aborted (client default: 10000). */
    timeout?: number;
    /** Maximum number of attempts per request (client default: 3). */
    retries?: number;
    /** Set a random User-Agent when the caller did not supply one (client default: true). */
    rotateUserAgent?: boolean;
}
|
|
10
|
+
|
|
11
|
+
/**
 * HTTP client shared by the scrapers. Wraps undici's `fetch` with a
 * per-attempt timeout, retries with exponential backoff and browser-like
 * default headers.
 */
export class StrawClient {
    private options: StrawClientOptions;
    private dispatcher: Agent;

    /**
     * @param options Optional overrides for `timeout` (ms, default 10000),
     *   `retries` (default 3), `rotateUserAgent` (default true) and `proxy`.
     */
    constructor(options: StrawClientOptions = {}) {
        this.options = {
            timeout: 10000,
            retries: 3,
            rotateUserAgent: true,
            ...options
        };

        // SECURITY NOTE(review): `rejectUnauthorized: false` turns off TLS
        // certificate verification for every request made through this
        // dispatcher. Kept for backward compatibility; confirm it is intended.
        this.dispatcher = new Agent({
            connect: {
                rejectUnauthorized: false
            }
        });
    }

    /**
     * Fetch a URL with built-in retries, timeout, and User-Agent rotation.
     *
     * The timeout covers the time until `fetch` resolves (response headers);
     * reading the body afterwards is not bounded by it. Responses with status
     * 429/500/502/503/504 are treated as failures and retried.
     *
     * @param url Absolute URL to request.
     * @param init Optional undici `RequestInit`; caller headers win over defaults.
     * @returns The first response whose status is not in the retry list.
     * @throws Error once every attempt has failed.
     */
    public async request(url: string, init?: RequestInit): Promise<Response> {
        // Always perform at least one attempt, even for 0, negative or NaN
        // `retries` (previously a negative value skipped the loop entirely).
        const configured = Number(this.options.retries);
        const maxRetries = Number.isFinite(configured) && configured >= 1 ? Math.ceil(configured) : 1;

        for (let attempts = 1; attempts <= maxRetries; attempts++) {
            // Setup AbortController for timeout
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), this.options.timeout);

            try {
                const response = await fetch(url, {
                    ...init,
                    headers: this.buildHeaders(init),
                    signal: controller.signal as RequestInit['signal'],
                    // NOTE(review): `options.proxy` only switches off the custom
                    // dispatcher; no proxy agent is created in this class, so the
                    // proxy value itself is never used to route traffic.
                    dispatcher: this.options.proxy ? undefined : this.dispatcher
                });

                // If rate limited or standard server error, retry
                if ([429, 500, 502, 503, 504].includes(response.status)) {
                    // Release the unread body so the connection is not kept busy.
                    if (response.body) {
                        await response.body.cancel().catch(() => undefined);
                    }
                    throw new Error(`HTTP Error ${response.status}`);
                }

                return response;
            } catch (error: unknown) {
                if (attempts >= maxRetries) {
                    const failure = (error ?? {}) as { message?: unknown; cause?: unknown };
                    const cause = failure.cause ? String(failure.cause) : 'No cause provided';
                    const message = failure.message !== undefined ? failure.message : String(error);
                    throw new Error(`Failed to fetch ${url} after ${maxRetries} attempts. Last error: ${message} - Cause: ${cause}`);
                }
            } finally {
                // Clear the timer on success AND failure so it never outlives the attempt.
                clearTimeout(timeoutId);
            }

            // Exponential backoff: 2s, 4s, 8s, ...
            await sleep(1000 * Math.pow(2, attempts));
        }
        throw new Error('Unreachable');
    }

    /** Builds the request headers: caller-supplied values plus browser-like defaults. */
    private buildHeaders(init?: RequestInit): Headers {
        // undici's HeadersInit and the global Headers constructor type differ
        // nominally, hence the cast (same as the original implementation).
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const headers = new Headers(init?.headers as any);

        if (this.options.rotateUserAgent && !headers.has('User-Agent')) {
            headers.set('User-Agent', getRandomUserAgent());
        }

        // Default headers to masquerade as a normal browser
        if (!headers.has('Accept')) {
            headers.set('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7');
        }
        if (!headers.has('Accept-Language')) {
            headers.set('Accept-Language', 'en-US,en;q=0.9');
        }
        return headers;
    }

    /** Requests `url` and returns the response body as text. */
    public async getText(url: string, init?: RequestInit): Promise<string> {
        const response = await this.request(url, init);
        return await response.text();
    }

    /** Requests `url` and returns the response body parsed as JSON, typed as `T` (not validated). */
    public async getJson<T>(url: string, init?: RequestInit): Promise<T> {
        const response = await this.request(url, init);
        return await response.json() as T;
    }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { StrawClient, StrawClientOptions } from './core/client';
|
|
2
|
+
import { WebScraper, WebScrapeResult } from './scrapers/web';
|
|
3
|
+
import { YouTubeScraper, YouTubeResult, YouTubeFormats } from './scrapers/youtube';
|
|
4
|
+
import { MediaScraper, MediaScrapeResult } from './scrapers/media';
|
|
5
|
+
|
|
6
|
+
export type { StrawClientOptions };
|
|
7
|
+
export type { WebScrapeResult };
|
|
8
|
+
export type { YouTubeResult, YouTubeFormats };
|
|
9
|
+
export type { MediaScrapeResult };
|
|
10
|
+
|
|
11
|
+
export {
|
|
12
|
+
StrawClient,
|
|
13
|
+
WebScraper,
|
|
14
|
+
YouTubeScraper,
|
|
15
|
+
MediaScraper
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
// Default export wrapper
|
|
19
|
+
/**
 * Default export wrapper: factory helpers that construct each
 * scraper/client with the given options.
 */
const straw = {
    /** Creates a `StrawClient`. */
    client(options?: StrawClientOptions) {
        return new StrawClient(options);
    },
    /** Creates a `WebScraper`. */
    web(options?: StrawClientOptions) {
        return new WebScraper(options);
    },
    /** Creates a `YouTubeScraper`. */
    youtube(options?: StrawClientOptions) {
        return new YouTubeScraper(options);
    },
    /** Creates a `MediaScraper`. */
    media(options?: StrawClientOptions) {
        return new MediaScraper(options);
    },
};
|
|
25
|
+
|
|
26
|
+
export default straw;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
import { StrawClient, StrawClientOptions } from '../core/client';
|
|
3
|
+
|
|
4
|
+
/** Result returned by `MediaScraper.extractMedia`. */
export interface MediaScrapeResult {
    /** Trimmed text of the page's `<title>`. */
    pageTitle: string;
    /** De-duplicated media/document URLs found on the page. */
    mediaLinks: string[];
}
|
|
8
|
+
|
|
9
|
+
/**
 * Finds direct links to media and document files referenced by a page.
 */
export class MediaScraper {
    /**
     * File extensions treated as media/documents. Written with optional
     * suffixes (docx?, xlsx?, pptx?, jpe?g) so the longer form is matched
     * instead of being cut to the shorter one.
     */
    private static readonly EXTENSIONS =
        'pdf|docx?|xlsx?|pptx?|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv|png|jpe?g|gif|svg|webp|avif|ico|bmp';

    private client: StrawClient;

    /** @param options Forwarded unchanged to `StrawClient`. */
    constructor(options?: StrawClientOptions) {
        this.client = new StrawClient(options);
    }

    /**
     * Attempts to find direct media files (Images/Audio/Video/Documents) referenced in any generic webpage HTML.
     *
     * Sources, in order: `src`/`srcset` of video/audio/img/source elements,
     * anchors whose target ends in a known extension, and a regex sweep of the
     * raw HTML for absolute URLs embedded in scripts or JSON.
     *
     * @param url Page URL; also the base for resolving relative references.
     * @returns The page title and the de-duplicated list of links.
     */
    public async extractMedia(url: string): Promise<MediaScrapeResult> {
        const html = await this.client.getText(url);
        const $ = cheerio.load(html);

        // Prefer the document <title>; inline SVGs may contain their own <title>.
        const headTitle = $('head > title').first();
        const pageTitle = (headTitle.length > 0 ? headTitle : $('title').first()).text().trim();

        const baseUrl = this.resolveHttpUrl($('base[href]').first().attr('href'), url) ?? url;
        const mediaLinks = new Set<string>();
        const addLink = (raw: string | undefined): void => {
            const resolved = this.resolveHttpUrl(raw, baseUrl);
            if (resolved) mediaLinks.add(resolved);
        };

        // 1. Check <video>, <audio>, <img>, and <source> tags: `src` AND every `srcset` candidate
        $('video, audio, img, source').each((_, el) => {
            const node = $(el);
            addLink(node.attr('src'));
            for (const candidate of this.parseSrcset(node.attr('srcset') ?? '')) {
                addLink(candidate);
            }
        });

        // 2. Check <a> tags for document/media links
        //    (known extension, optionally followed by a query string or fragment)
        const fileLink = new RegExp(`\\.(?:${MediaScraper.EXTENSIONS})(?:[?#].*)?$`, 'i');
        $('a[href]').each((_, el) => {
            const resolved = this.resolveHttpUrl($(el).attr('href'), baseUrl);
            if (resolved && fileLink.test(resolved)) {
                mediaLinks.add(resolved);
            }
        });

        // 3. Fallback: Check regex for embedded JSON or JS containing media/document links.
        //    The lookahead stops a match from ending in the middle of a longer
        //    extension, and the optional query part keeps signed URLs intact.
        const embedded = new RegExp(
            `https?://[^\\s"'<>,]+\\.(?:${MediaScraper.EXTENSIONS})(?![A-Za-z0-9_])(?:\\?[^\\s"'<>,]*)?`,
            'gi'
        );
        for (const link of html.match(embedded) ?? []) {
            // Raw HTML still has `&` entity-encoded; decode it so the URL matches
            // the attribute-derived form and de-duplicates correctly.
            mediaLinks.add(link.replace(/&amp;/g, '&'));
        }

        return {
            pageTitle,
            mediaLinks: Array.from(mediaLinks)
        };
    }

    /**
     * Extracts the URLs from a `srcset` value ("url descriptor, url descriptor").
     * Commas inside a URL are preserved; only a trailing comma or a comma
     * after a descriptor separates candidates.
     */
    private parseSrcset(srcset: string): string[] {
        const urls: string[] = [];
        let expectUrl = true;
        for (let token of srcset.trim().split(/\s+/)) {
            if (!expectUrl) {
                const comma = token.indexOf(',');
                if (comma === -1) continue; // still inside the descriptor
                token = token.slice(comma + 1); // e.g. "1x,next.png"
                expectUrl = true;
            }
            if (token === '') continue;
            const candidate = token.replace(/^,+|,+$/g, '');
            if (candidate) urls.push(candidate);
            // A URL token ending in "," has no descriptor; the next token is a URL.
            expectUrl = token.endsWith(',');
        }
        return urls;
    }

    /**
     * Turns an attribute value into an absolute http(s) URL.
     * Absolute http(s) values are returned as written; relative ones are
     * resolved against `base`. Returns undefined for empty values, other
     * schemes (data:, blob:, ...) and unparsable input.
     */
    private resolveHttpUrl(raw: string | undefined, base: string): string | undefined {
        const candidate = raw?.trim();
        if (!candidate) return undefined;
        if (/^https?:\/\//i.test(candidate)) return candidate;
        try {
            const resolved = new URL(candidate, base);
            return resolved.protocol === 'http:' || resolved.protocol === 'https:' ? resolved.href : undefined;
        } catch {
            return undefined;
        }
    }
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
import { StrawClient, StrawClientOptions } from '../core/client';
|
|
3
|
+
|
|
4
|
+
/** Result returned by `WebScraper.scrape`. */
export interface WebScrapeResult {
    /** Trimmed text of the page's `<title>`. */
    title: string;
    /** `meta[name="description"]` content, falling back to `og:description`; empty when neither exists. */
    description: string;
    /** Body text with scripts/styles removed and whitespace collapsed. */
    text: string;
    /** Anchors found on the page: trimmed anchor text plus target URL. */
    links: { text: string; href: string }[];
    /** `<meta>` tags keyed by their `name` (or `property`) attribute. */
    meta: Record<string, string>;
}
|
|
11
|
+
|
|
12
|
+
/**
 * Generic page scraper: downloads a page with `StrawClient` and extracts
 * title, description, meta tags, links and visible text with cheerio.
 */
export class WebScraper {
    private client: StrawClient;

    /** @param options Forwarded unchanged to `StrawClient`. */
    constructor(options?: StrawClientOptions) {
        this.client = new StrawClient(options);
    }

    /**
     * Scrape a webpage and return structured data.
     * Extracts title, generic text, metadata, and all links.
     *
     * @param url Page URL; also the base for resolving relative links.
     * @returns Title, description, visible text, links and meta tags.
     */
    public async scrape(url: string): Promise<WebScrapeResult> {
        const html = await this.client.getText(url);
        const $ = cheerio.load(html);

        // Prefer the document <title>; a bare `$('title')` would also pick up
        // <title> elements nested in inline SVG and concatenate their text.
        const headTitle = $('head > title').first();
        const title = (headTitle.length > 0 ? headTitle : $('title').first()).text().trim();

        const description =
            $('meta[name="description"]').attr('content') ||
            $('meta[property="og:description"]').attr('content') ||
            '';

        const meta: Record<string, string> = {};
        $('meta').each((_, el) => {
            const tag = $(el);
            const name = tag.attr('name') || tag.attr('property');
            const content = tag.attr('content');
            if (name && content) {
                meta[name] = content;
            }
        });

        // Honour <base href> when present, otherwise resolve against the page URL.
        const baseUrl = this.resolveHttpUrl($('base[href]').first().attr('href'), url) ?? url;

        const links: { text: string; href: string }[] = [];
        $('a[href]').each((_, el) => {
            const anchor = $(el);
            const href = this.resolveHttpUrl(anchor.attr('href'), baseUrl);
            if (href) {
                links.push({ text: anchor.text().trim(), href });
            }
        });

        // Remove scripts and styles for cleaner text extraction
        $('script, style, noscript, iframe, svg').remove();
        const text = $('body').text().replace(/\s+/g, ' ').trim();

        return {
            title,
            description,
            text,
            links,
            meta
        };
    }

    /**
     * Turns an attribute value into an absolute http(s) URL.
     * Absolute http(s) values are returned as written; relative ones are
     * resolved against `base`. Returns undefined for empty values, other
     * schemes (mailto:, javascript:, data:, ...) and unparsable input.
     */
    private resolveHttpUrl(raw: string | undefined, base: string): string | undefined {
        const candidate = raw?.trim();
        if (!candidate) return undefined;
        if (/^https?:\/\//i.test(candidate)) return candidate;
        try {
            const resolved = new URL(candidate, base);
            return resolved.protocol === 'http:' || resolved.protocol === 'https:' ? resolved.href : undefined;
        } catch {
            return undefined;
        }
    }
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { StrawClient, StrawClientOptions } from '../core/client';
|
|
2
|
+
|
|
3
|
+
/** One directly downloadable stream returned by `YouTubeScraper.scrapeVideo`. */
export interface YouTubeFormats {
    /** Direct stream URL taken from the player response. */
    url: string;
    /** Mime type string from the player response; empty when absent. */
    mimeType: string;
    /** Frame width as reported by the player response, if any. */
    width?: number;
    /** Frame height as reported by the player response, if any. */
    height?: number;
    /** `qualityLabel` when present, otherwise `quality`. */
    quality?: string;
    /** Bitrate as reported by the player response, if any. */
    bitrate?: number;
    /** Whether the format is considered to carry audio. */
    hasAudio: boolean;
    /** Whether the format is considered to carry video. */
    hasVideo: boolean;
}
|
|
13
|
+
|
|
14
|
+
/** Result returned by `YouTubeScraper.scrapeVideo`. */
export interface YouTubeResult {
    /** `videoDetails.title`, or an empty string. */
    title: string;
    /** `videoDetails.author`, or an empty string. */
    author: string;
    /** `videoDetails.shortDescription`, or an empty string. */
    description: string;
    /** `videoDetails.viewCount` as given by the player response; "0" when absent. */
    views: string;
    /** `videoDetails.lengthSeconds` as given by the player response; "0" when absent. */
    durationSeconds: string;
    /** URL of the last entry in the thumbnails list, or an empty string. */
    thumbnail: string;
    /** Formats that expose a direct `url`. */
    formats: YouTubeFormats[];
}
|
|
23
|
+
|
|
24
|
+
/** Fields of a player-response format entry that this scraper reads. */
interface RawPlayerFormat {
    url?: string;
    mimeType?: string;
    width?: number;
    height?: number;
    qualityLabel?: string;
    quality?: string;
    bitrate?: number;
    audioQuality?: string;
    audioChannels?: number;
    signatureCipher?: string;
}

/** Fields of `ytInitialPlayerResponse` that this scraper reads (shape is not validated). */
interface RawPlayerResponse {
    videoDetails?: {
        title?: string;
        author?: string;
        shortDescription?: string;
        viewCount?: string;
        lengthSeconds?: string;
        thumbnail?: { thumbnails?: { url?: string }[] };
    };
    streamingData?: {
        formats?: RawPlayerFormat[];
        adaptiveFormats?: RawPlayerFormat[];
    };
}

/**
 * Scraper for YouTube watch pages. Reads the `ytInitialPlayerResponse`
 * object embedded in the HTML; no signature deciphering is performed.
 */
export class YouTubeScraper {
    private client: StrawClient;

    /** @param options Forwarded unchanged to `StrawClient`. */
    constructor(options?: StrawClientOptions) {
        this.client = new StrawClient(options);
    }

    /**
     * Extracts YouTube video metadata and direct stream URLs without external bloatware.
     * Parses the ytInitialPlayerResponse object embedded in the watch HTML.
     *
     * Formats that only carry a `signatureCipher` (no plain `url`) are skipped:
     * to avoid bloatware, the decipher algorithm is not implemented here.
     *
     * @param url Watch page URL.
     * @returns Video metadata plus every format that exposes a direct URL.
     * @throws Error when the player response or its `videoDetails` is missing.
     */
    public async scrapeVideo(url: string): Promise<YouTubeResult> {
        const html = await this.client.getText(url, {
            headers: {
                'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
            }
        });

        // Find ytInitialPlayerResponse JSON fragment in the HTML
        const data = this.findPlayerResponse(html);
        if (!data) {
            throw new Error('ytInitialPlayerResponse not found. YouTube might have changed their layout or the IP is blocked.');
        }

        const details = data.videoDetails;
        const streamingData = data.streamingData;

        if (!details) {
            throw new Error('Video details not found inside player response.');
        }

        const rawFormats = [...(streamingData?.formats || []), ...(streamingData?.adaptiveFormats || [])];
        const formats: YouTubeFormats[] = [];

        for (const format of rawFormats) {
            if (!format.url) continue;
            const mimeType = format.mimeType || '';
            formats.push({
                url: format.url,
                mimeType,
                width: format.width,
                height: format.height,
                quality: format.qualityLabel || format.quality,
                bitrate: format.bitrate,
                // Muxed streams report a `video/*` mime type, so the mime type alone
                // cannot tell whether audio is present; YouTube additionally sets
                // audioQuality/audioChannels on formats that carry an audio track.
                hasAudio:
                    mimeType.includes('audio/') ||
                    format.audioQuality !== undefined ||
                    format.audioChannels !== undefined,
                hasVideo: mimeType.includes('video/')
            });
        }

        const thumbnails = details.thumbnail?.thumbnails ?? [];

        return {
            title: details.title || '',
            author: details.author || '',
            description: details.shortDescription || '',
            views: details.viewCount || '0',
            durationSeconds: details.lengthSeconds || '0',
            // The last entry of the thumbnails list is used.
            thumbnail: thumbnails[thumbnails.length - 1]?.url || '',
            formats
        };
    }

    /**
     * Locates `ytInitialPlayerResponse = {...}` in the page and parses it.
     * Every assignment is tried in document order; the first one that yields
     * a JSON object wins. Returns undefined when none can be parsed.
     */
    private findPlayerResponse(html: string): RawPlayerResponse | undefined {
        const marker = /ytInitialPlayerResponse\s*=\s*\{/g;
        let hit: RegExpExecArray | null;
        while ((hit = marker.exec(html)) !== null) {
            const raw = this.sliceBalancedObject(html, hit.index + hit[0].length - 1);
            if (!raw) continue;
            try {
                const parsed: unknown = JSON.parse(raw);
                if (parsed !== null && typeof parsed === 'object') {
                    return parsed as RawPlayerResponse;
                }
            } catch {
                // Not valid JSON (e.g. a truncated candidate) - try the next one.
            }
        }
        return undefined;
    }

    /**
     * Returns the substring starting at the `{` at `start` and ending at its
     * matching `}`. Braces inside double-quoted strings are ignored, so the
     * result does not depend on what follows the object or on newlines.
     */
    private sliceBalancedObject(text: string, start: number): string | undefined {
        let depth = 0;
        let inString = false;
        let escaped = false;
        for (let i = start; i < text.length; i++) {
            const ch = text[i];
            if (inString) {
                if (escaped) escaped = false;
                else if (ch === '\\') escaped = true;
                else if (ch === '"') inString = false;
                continue;
            }
            if (ch === '"') inString = true;
            else if (ch === '{') depth++;
            else if (ch === '}' && --depth === 0) return text.slice(start, i + 1);
        }
        return undefined;
    }
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * Pool of desktop browser User-Agent strings that `getRandomUserAgent`
 * picks from.
 */
export const USER_AGENTS = [
    // Chrome 120 on Windows, macOS and Linux
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    // Firefox 121 on Windows and macOS
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:109.0) Gecko/20100101 Firefox/121.0',
    // Safari 17.2 on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    // Edge 120 on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
];
|
|
10
|
+
|
|
11
|
+
/**
 * Returns one entry of `USER_AGENTS`, chosen uniformly at random.
 */
export function getRandomUserAgent(): string {
    const index = Math.floor(Math.random() * USER_AGENTS.length);
    return USER_AGENTS[index];
}
|
|
14
|
+
|
|
15
|
+
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 */
export function sleep(ms: number): Promise<void> {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|