pin-dl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +326 -0
- package/dist/index.d.mts +2 -0
- package/dist/index.mjs +1442 -0
- package/package.json +37 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,1442 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { Args, Command, Options } from "@effect/cli";
|
|
3
|
+
import { NodeContext, NodeRuntime } from "@effect/platform-node";
|
|
4
|
+
import { Console, Data, Effect, Option } from "effect";
|
|
5
|
+
import * as fs from "node:fs/promises";
|
|
6
|
+
import * as readline from "node:readline/promises";
|
|
7
|
+
import { execFile, spawn } from "node:child_process";
|
|
8
|
+
import * as path from "node:path";
|
|
9
|
+
import { promisify } from "node:util";
|
|
10
|
+
import { createWriteStream } from "node:fs";
|
|
11
|
+
|
|
12
|
+
//#region src/domain/errors.ts
|
|
13
|
+
/** Tagged error used when an HTTP request to Pinterest fails. */
class PinterestHttpError extends Data.TaggedError("PinterestHttpError") {}

/** Tagged error used when a Pinterest API response has an unexpected shape. */
class PinterestParseError extends Data.TaggedError("PinterestParseError") {}

/** Tagged error used when a URL cannot be recognised as a supported Pinterest URL. */
class InvalidUrlError extends Data.TaggedError("InvalidUrlError") {}

/** NOTE(review): presumably raised by the downloader; no use is visible in this part of the file. */
class DownloadError extends Data.TaggedError("DownloadError") {}

/** Tagged error used when a cache or cookies file cannot be read or written. */
class CacheError extends Data.TaggedError("CacheError") {}

/** Tagged error used by the interactive browser login flow. */
class LoginError extends Data.TaggedError("LoginError") {}

/** NOTE(review): presumably raised while writing captions; no use is visible in this part of the file. */
class CaptionError extends Data.TaggedError("CaptionError") {}
|
|
20
|
+
|
|
21
|
+
//#endregion
|
|
22
|
+
//#region src/domain/media.ts
|
|
23
|
+
/**
 * Tell whether `s` is one of the recognised caption modes.
 *
 * @param {unknown} s - Candidate value.
 * @returns {boolean} True for "txt", "json", "metadata" or "none".
 */
const isCaptionMode = (s) => ["txt", "json", "metadata", "none"].includes(s);
|
|
24
|
+
/**
 * Project a media item onto the plain object stored in cache files.
 *
 * Only the known fields are copied; `media_stream` is included only when it is truthy.
 *
 * @param {object} m - Media item.
 * @returns {object} Plain object with id, src, alt, origin, resolution and optionally media_stream.
 */
const mediaToDict = (m) => {
  const dict = {
    id: m.id,
    src: m.src,
    alt: m.alt,
    origin: m.origin,
    resolution: m.resolution
  };
  if (m.media_stream) {
    dict.media_stream = m.media_stream;
  }
  return dict;
};
|
|
32
|
+
/**
 * Rebuild a media item from a cached plain object.
 *
 * Only the known fields are copied; `media_stream` is included only when it is truthy.
 *
 * @param {object} d - Cached entry.
 * @returns {object} Media item with id, src, alt, origin, resolution and optionally media_stream.
 */
const dictToMedia = (d) => {
  const media = {
    id: d.id,
    src: d.src,
    alt: d.alt,
    origin: d.origin,
    resolution: d.resolution
  };
  if (d.media_stream) {
    media.media_stream = d.media_stream;
  }
  return media;
};
|
|
40
|
+
|
|
41
|
+
//#endregion
|
|
42
|
+
//#region src/storage/cache.ts
|
|
43
|
+
/**
 * Serialise media items (via `mediaToDict`) as 4-space-indented JSON and write them to `cachePath`.
 *
 * @param {object[]} items - Media items to store.
 * @param {string} cachePath - Destination file.
 * @returns Effect that fails with a CacheError when serialising or writing throws.
 */
const writeCacheFile = (items, cachePath) => {
  const persist = async () => {
    const dicts = items.map(mediaToDict);
    const json = JSON.stringify(dicts, null, 4);
    await fs.writeFile(cachePath, json, "utf-8");
  };
  const toCacheError = (e) => new CacheError({
    path: cachePath,
    message: `Failed to write cache: ${String(e)}`,
    cause: e
  });
  return Effect.tryPromise({ try: persist, catch: toCacheError });
};
|
|
55
|
+
/**
 * Load media items from a JSON cache file.
 *
 * The file must contain a JSON array; each entry is converted with `dictToMedia`.
 *
 * @param {string} cachePath - File to read.
 * @returns Effect yielding the media items, or failing with a CacheError on read, parse or shape errors.
 */
const readCacheFile = (cachePath) => {
  const load = async () => {
    const text = await fs.readFile(cachePath, "utf-8");
    const parsed = JSON.parse(text);
    if (!Array.isArray(parsed)) {
      throw new Error("Cache file must contain a JSON array");
    }
    return parsed.map((entry) => dictToMedia(entry));
  };
  const toCacheError = (e) => new CacheError({
    path: cachePath,
    message: `Failed to read cache: ${String(e)}`,
    cause: e
  });
  return Effect.tryPromise({ try: load, catch: toCacheError });
};
|
|
69
|
+
/**
 * Load a cookies JSON file (Selenium format) and return its array unchanged.
 *
 * @param {string} cookiesPath - File to read.
 * @returns Effect yielding the parsed array, or failing with a CacheError when the
 *   file cannot be read, is not valid JSON, or does not contain an array.
 */
const readCookiesFile = (cookiesPath) => {
  const load = async () => {
    const text = await fs.readFile(cookiesPath, "utf-8");
    const parsed = JSON.parse(text);
    if (Array.isArray(parsed)) {
      return parsed;
    }
    throw new Error("Cookies file must contain a JSON array");
  };
  const toCacheError = (e) => new CacheError({
    path: cookiesPath,
    message: `Failed to read cookies file: ${String(e)}`,
    cause: e
  });
  return Effect.tryPromise({ try: load, catch: toCacheError });
};
|
|
83
|
+
/**
 * Write cookies to `outputPath` as 4-space-indented JSON.
 *
 * @param {object[]} cookies - Cookies to store (Selenium format).
 * @param {string} outputPath - Destination file.
 * @returns Effect that fails with a CacheError when serialising or writing throws.
 */
const writeCookiesFile = (cookies, outputPath) => {
  const persist = async () => {
    const json = JSON.stringify(cookies, null, 4);
    await fs.writeFile(outputPath, json, "utf-8");
  };
  const toCacheError = (e) => new CacheError({
    path: outputPath,
    message: `Failed to write cookies file: ${String(e)}`,
    cause: e
  });
  return Effect.tryPromise({ try: persist, catch: toCacheError });
};
|
|
94
|
+
|
|
95
|
+
//#endregion
|
|
96
|
+
//#region src/cli/commands/login.ts
|
|
97
|
+
/**
 * Ask a question on stderr and resolve with the line typed on stdin.
 *
 * The readline interface is always closed, whether the question resolves or rejects.
 *
 * @param {string} prompt - Text shown to the user.
 * @returns {Promise<string>} The answer.
 */
const promptText = async (prompt) => {
  const streams = { input: process.stdin, output: process.stderr };
  const iface = readline.createInterface(streams);
  try {
    const answer = await iface.question(prompt);
    return answer;
  } finally {
    iface.close();
  }
};
|
|
108
|
+
/**
 * Write `prompt` to stderr and read a password from stdin without echoing it.
 *
 * Input is processed character by character so that pasted text, or piped input
 * arriving as one chunk, is handled the same way as individual key presses:
 * - Enter (CR or LF) finishes the prompt and resolves with the typed text;
 * - Ctrl-C restores the terminal and exits the process with status 1;
 * - Backspace/Delete removes the last typed character;
 * - end of stdin resolves with whatever was typed so far.
 *
 * Control characters are spelled as escapes so they stay visible in the source.
 *
 * @param {string} prompt - Text shown before reading.
 * @returns {Promise<string>} The password.
 */
const promptPassword = async (prompt) => {
  process.stderr.write(prompt);
  return new Promise((resolve) => {
    const stdin = process.stdin;
    const typed = [];
    // Undo every change made to stdin so later prompts start from a clean state.
    const restore = () => {
      stdin.setRawMode?.(false);
      stdin.pause();
      stdin.removeListener("data", onData);
      stdin.removeListener("end", finish);
    };
    const finish = () => {
      restore();
      process.stderr.write("\n");
      resolve(typed.join(""));
    };
    const onData = (chunk) => {
      for (const ch of chunk) {
        if (ch === "\r" || ch === "\n") {
          finish();
          return;
        }
        if (ch === "\u0003") {
          // Ctrl-C: raw mode suppresses the usual SIGINT, so exit explicitly.
          restore();
          process.exit(1);
        }
        if (ch === "\u007f" || ch === "\b") typed.pop();
        else typed.push(ch);
      }
    };
    // setRawMode only exists when stdin is a TTY.
    stdin.setRawMode?.(true);
    stdin.resume();
    stdin.setEncoding("utf-8");
    stdin.on("data", onData);
    stdin.on("end", finish);
  });
};
|
|
129
|
+
/**
 * Interactive login: prompt for credentials, drive a Playwright browser through
 * Pinterest's login form, wait `options.wait` seconds, then save the captured
 * cookies to `options.output`.
 *
 * Reads `options.client`, `options.browserExecutable`, `options.headful`,
 * `options.wait` and `options.output`.
 *
 * Behaviour notes:
 * - The browser is closed on success and on every failure path after launch.
 * - A failure to close the browser is ignored and never becomes the result.
 * - Any answer starting with "n" (e.g. "n", "no") declines the browser install.
 * - An empty executable path (option or environment variable) is treated as unset.
 *
 * @param {object} options - Parsed CLI options.
 * @returns Effect that fails with LoginError, or with the error from writeCookiesFile.
 */
const runLogin = (options) => Effect.gen(function* () {
  // Builds a `catch` mapper producing "<prefix>: <error>" LoginErrors.
  const loginError = (prefix) => (e) => new LoginError({
    message: `${prefix}: ${String(e)}`,
    cause: e
  });
  const email = yield* Effect.tryPromise({
    try: () => promptText("Enter Pinterest email: "),
    catch: loginError("Failed to read email")
  });
  const password = yield* Effect.tryPromise({
    try: () => promptPassword("Enter Pinterest password: "),
    catch: loginError("Failed to read password")
  });
  yield* Console.log(`\nOpening browser — log in manually, then wait ${options.wait}s for cookies to be captured...`);
  const { chromium, firefox, webkit } = yield* Effect.tryPromise({
    try: () => import("playwright"),
    catch: (e) => new LoginError({
      message: "playwright is not installed. Run: pnpm add playwright && npx playwright install chromium",
      cause: e
    })
  });
  const browserType = options.client === "firefox" ? firefox : options.client === "webkit" ? webkit : chromium;
  const browserName = browserType.name();
  // `||` so that an empty string counts as "not provided" everywhere below.
  const customExecutable = options.browserExecutable || process.env["pin-dl_BROWSER_EXECUTABLE"] || void 0;
  if (customExecutable) yield* Console.log(`[info] Using custom browser executable: ${customExecutable}`);
  const playwrightExecutable = browserType.executablePath();
  let binaryAvailable = customExecutable !== void 0;
  if (!binaryAvailable) {
    binaryAvailable = yield* Effect.promise(() => fs.access(playwrightExecutable).then(() => true, () => false));
  }
  if (!binaryAvailable) {
    const answer = yield* Effect.tryPromise({
      try: () => promptText(`\n[!] The ${browserName} browser binary is not installed.\n Expected at: ${playwrightExecutable}\n Install it now? (may download several hundred MB) [Y/n]: `),
      catch: loginError("Failed to read install prompt")
    });
    if (answer.trim().toLowerCase().startsWith("n")) return yield* Effect.fail(new LoginError({
      message: `${browserName} is not installed. Run manually:\n npx playwright install ${browserName}`,
      cause: null
    }));
    yield* Console.log(`\nInstalling ${browserName}...`);
    yield* Effect.tryPromise({
      try: () => new Promise((resolve, reject) => {
        // A single command string is what `shell: true` runs anyway; browserName comes from Playwright itself.
        const child = spawn(`npx playwright install ${browserName}`, {
          stdio: "inherit",
          shell: true
        });
        // Without this handler a spawn failure would leave the promise pending forever.
        child.on("error", reject);
        child.on("close", (code) => {
          if (code === 0) resolve();
          else reject(new Error(`playwright install exited with code ${code}`));
        });
      }),
      catch: loginError(`Failed to install ${browserName}`)
    });
  }
  const chromiumArgs = options.client === "chromium" ? [
    "--disable-blink-features=AutomationControlled",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--disable-web-security",
    "--disable-features=IsolateOrigins,site-per-process"
  ] : void 0;
  const launchBrowser = Effect.tryPromise({
    try: () => browserType.launch({
      headless: !options.headful,
      args: chromiumArgs,
      ...customExecutable ? { executablePath: customExecutable } : {}
    }),
    catch: loginError("Failed to launch browser")
  });
  // Fill in the login form and return the cookies of the resulting session.
  const captureCookies = (browser) => Effect.gen(function* () {
    const context = yield* Effect.tryPromise({
      try: () => browser.newContext({
        userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        viewport: {
          width: 1366,
          height: 768
        },
        locale: "en-US",
        timezoneId: "America/New_York",
        colorScheme: "light"
      }),
      catch: loginError("Failed to create browser context")
    });
    const page = yield* Effect.tryPromise({
      try: () => context.newPage(),
      catch: loginError("Failed to open page")
    });
    yield* Effect.tryPromise({
      try: async () => {
        await page.goto("https://www.pinterest.com/login/");
        await page.waitForSelector("input[id=\"email\"]", { timeout: 1e4 });
        await page.fill("input[id=\"email\"]", email);
        await page.fill("input[id=\"password\"]", password);
        await page.click("button[type=\"submit\"]");
        // Fixed wait: gives the site (and the user, in headful mode) time to complete the login.
        await new Promise((r) => setTimeout(r, options.wait * 1e3));
      },
      catch: loginError("Browser login failed")
    });
    return yield* Effect.tryPromise({
      try: () => context.cookies(),
      catch: loginError("Failed to capture cookies")
    });
  });
  // Closing is best-effort: a rejected close() must not replace the real outcome.
  const closeBrowser = (browser) => Effect.promise(() => browser.close().catch(() => void 0));
  const cookies = yield* Effect.acquireUseRelease(launchBrowser, captureCookies, closeBrowser);
  const isAuthenticated = cookies.find((c) => c.name === "_auth")?.value === "1";
  yield* writeCookiesFile(cookies, options.output);
  if (!isAuthenticated) {
    yield* Console.log("\n[WARNING] Login may have failed!");
    yield* Console.log("The captured cookies do not indicate an authenticated session (_auth != 1).");
    yield* Console.log("This usually means credentials were wrong, a captcha was shown,");
    yield* Console.log(`or not enough time was given (try --headful and --wait ${options.wait + 20}).`);
    yield* Console.log(`\nCookies saved to '${options.output}' but may not work for private boards.`);
  } else {
    yield* Console.log(`\n[SUCCESS] Authenticated cookies saved to '${options.output}'`);
    yield* Console.log("\nNote: Keep your cookies file safe — do not share it.");
    yield* Console.log(`Use: pin-dl scrape <url> -o ./output --cookies ${options.output}`);
  }
});
|
|
269
|
+
/**
 * `login` sub-command: declares the CLI options and runs `runLogin` with them.
 *
 * The optional `--browser-executable` value is unwrapped to a string or undefined
 * before being handed over. Any failure from `runLogin` is printed with
 * `Console.error` rather than propagated.
 */
const loginCommand = Command.make("login", {
  output: Options.text("output").pipe(
    Options.withAlias("o"),
    Options.withDefault("cookies.json")
  ),
  client: Options.text("client").pipe(
    Options.withDescription("Browser engine to use: chromium | firefox | webkit (default: chromium)"),
    Options.withDefault("chromium")
  ),
  browserExecutable: Options.text("browser-executable").pipe(
    Options.withDescription("Path to a custom browser executable instead of Playwright's managed binary. Skips the install check. Also readable from the pin-dl_BROWSER_EXECUTABLE environment variable."),
    Options.optional
  ),
  headful: Options.boolean("headful"),
  incognito: Options.boolean("incognito"),
  wait: Options.integer("wait").pipe(Options.withDefault(10)),
  verbose: Options.boolean("verbose")
}, (opts) => {
  const executable = opts.browserExecutable;
  const login = runLogin({
    ...opts,
    browserExecutable: executable._tag === "Some" ? executable.value : void 0
  });
  const report = (e) => Console.error(`Error: ${String(e)}`);
  return Effect.catchAll(login, report);
});
|
|
281
|
+
|
|
282
|
+
//#endregion
|
|
283
|
+
//#region src/api/client.ts
|
|
284
|
+
/** Value sent in the `User-Agent` header of requests made by this module. */
const USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36";

/** Value sent in the `x-pinterest-pws-handler` header of API requests. */
const PWS_HANDLER = "www/pin/[id].js";
|
|
286
|
+
/**
 * Render cookies as a `Cookie` header value: `name=value` pairs joined by "; ".
 *
 * @param {{ name: string, value: string }[]} cookies - Cookies to render.
 * @returns {string} Header value; empty string for an empty list.
 */
const cookiesToString = (cookies) => {
  const toPair = (cookie) => `${cookie.name}=${cookie.value}`;
  return cookies.map(toPair).join("; ");
};
|
|
288
|
+
/**
 * Request Pinterest's homepage and return the cookies it sets, as a `Cookie`
 * header value (`name=value` pairs joined by "; ").
 *
 * The request is aborted after `timeout` milliseconds so a stalled connection
 * cannot hang the caller, and the response body is discarded because only the
 * headers are used.
 *
 * @param {number} [timeout=10000] - Abort the request after this many milliseconds.
 * @returns Effect yielding the cookie string, or failing with PinterestHttpError.
 */
const getDefaultCookies = (timeout = 1e4) => Effect.tryPromise({
  try: async () => {
    const response = await fetch("https://www.pinterest.com", {
      headers: { "User-Agent": USER_AGENT },
      redirect: "follow",
      signal: AbortSignal.timeout(timeout)
    });
    // Release the connection right away; a failure to cancel is irrelevant to the result.
    await response.body?.cancel().catch(() => void 0);
    return response.headers.getSetCookie().map((h) => h.split(";")[0].trim()).join("; ");
  },
  catch: (e) => new PinterestHttpError({
    message: `Failed to get default cookies from Pinterest: ${String(e)}`,
    cause: e
  })
});
|
|
301
|
+
/**
 * Create a Pinterest session.
 *
 * Fetches the default cookies from pinterest.com and merges in the user-provided
 * cookies (Selenium format, e.g. loaded from a cookies.json file). Cookies are
 * merged by name: a user-provided cookie replaces a default cookie of the same
 * name, so the header never carries two conflicting values. Empty fragments are
 * skipped so the header never starts with a stray "; ".
 *
 * @param {{ name: string, value: string }[]} [customCookies] - Optional user cookies.
 * @returns Effect yielding `{ cookies }`, where `cookies` is a `Cookie` header value.
 */
const makeSession = (customCookies) => Effect.gen(function* () {
  const jar = new Map();
  const defaults = yield* getDefaultCookies();
  for (const pair of defaults.split("; ")) {
    if (pair === "") continue;
    const eq = pair.indexOf("=");
    // A fragment without "=" is kept as-is, keyed by its whole text.
    jar.set(eq === -1 ? pair : pair.slice(0, eq), pair);
  }
  for (const cookie of customCookies ?? []) {
    jar.set(cookie.name, `${cookie.name}=${cookie.value}`);
  }
  return { cookies: [...jar.values()].join("; ") };
});
|
|
311
|
+
/**
 * Send a GET request to a Pinterest API endpoint using the session's cookies.
 *
 * The body is parsed as JSON; an unparsable body yields `null`. On a non-2xx
 * status the request fails with the API's own error message when the body has
 * one (`resource_response.error.message`), otherwise with the HTTP status.
 *
 * @param {{ cookies: string }} session - Session providing the `Cookie` header.
 * @param {string} url - Full request URL.
 * @param {number} [timeout=10000] - Abort the request after this many milliseconds.
 * @returns Effect yielding the parsed body (or null), or failing with PinterestHttpError.
 */
const apiGet = (session, url, timeout = 1e4) => Effect.tryPromise({
  try: async () => {
    const response = await fetch(url, {
      signal: AbortSignal.timeout(timeout),
      headers: {
        Cookie: session.cookies,
        "User-Agent": USER_AGENT,
        "x-pinterest-pws-handler": PWS_HANDLER,
        "X-Requested-With": "XMLHttpRequest",
        // The quality value is a parameter of "*/*" and must be attached with ";".
        Accept: "application/json, text/javascript, */*; q=0.01"
      }
    });
    const body = await response.json().catch(() => null);
    if (!response.ok) {
      const apiMsg = body?.resource_response?.error?.message;
      throw new Error(apiMsg ?? `HTTP ${response.status} ${response.statusText}`);
    }
    return body;
  },
  catch: (e) => new PinterestHttpError({
    message: `API request failed: ${String(e)}`,
    cause: e
  })
});
|
|
336
|
+
|
|
337
|
+
//#endregion
|
|
338
|
+
//#region src/api/endpoints.ts
|
|
339
|
+
/** Origin shared by every Pinterest endpoint below. */
const BASE = "https://www.pinterest.com";

/** Pinterest resource endpoints, each of the form `${BASE}/resource/<Name>/get/`. */
const Endpoints = (() => {
  const resource = (name) => `${BASE}/resource/${name}/get/`;
  return {
    BASE,
    GET_RELATED_MODULES: resource("RelatedModulesResource"),
    GET_PIN: resource("PinResource"),
    GET_MAIN_IMAGE: resource("ApiResource"),
    GET_BOARD: resource("BoardResource"),
    GET_BOARD_PIN: resource("BoardFeedResource"),
    GET_BOARD_SECTIONS: resource("BoardSectionsResource"),
    GET_BOARD_SECTION_PINS: resource("BoardSectionPinsResource"),
    GET_SEARCH: resource("BaseSearchResource")
  };
})();
|
|
351
|
+
|
|
352
|
+
//#endregion
|
|
353
|
+
//#region src/api/request-builder.ts
|
|
354
|
+
/**
 * Build a GET URL in the query format used for Pinterest resource endpoints:
 *   {endpoint}?source_url={url}&data={JSON}&_={timestamp}
 *
 * `data` is the JSON encoding of `{ options, context }` and `_` is the current
 * time in milliseconds. Spaces are encoded as %20 rather than "+".
 *
 * @param {string} endpoint - Endpoint URL without a query string.
 * @param {object} options - Resource options placed under `data.options`.
 * @param {string} [sourceUrl="/"] - Value of the `source_url` parameter.
 * @param {object} [context={}] - Value placed under `data.context`.
 * @returns {string} The complete request URL.
 */
const buildGetUrl = (endpoint, options, sourceUrl = "/", context = {}) => {
  const payload = JSON.stringify({ options, context });
  const params = new URLSearchParams({
    source_url: sourceUrl,
    data: payload,
    _: Date.now().toString()
  });
  // URLSearchParams writes spaces as "+"; literal plus signs are already "%2B".
  const query = params.toString().replace(/\+/g, "%20");
  return `${endpoint}?${query}`;
};
|
|
371
|
+
|
|
372
|
+
//#endregion
|
|
373
|
+
//#region src/api/response-parser.ts
|
|
374
|
+
/**
 * Return `val` when it is a non-null, non-array object; otherwise return null.
 *
 * @param {unknown} val - Value to narrow.
 * @returns {object|null} The same object, or null.
 */
const asRecord = (val) => {
  if (val === null || typeof val !== "object") return null;
  return Array.isArray(val) ? null : val;
};
|
|
376
|
+
/**
 * Pick the video variant with the largest pixel area (width × height) from a pin.
 *
 * Variants come from `pin.videos.video_list`, or, when that is absent, from the
 * first block of the first story page. On equal areas the earlier variant wins.
 *
 * @param {object} pin - Raw pin item.
 * @returns {{ url: string, resolution: [number, number], duration: number } | null}
 *   The best variant, or null when there are no variants or the best one has no URL.
 */
const extractBestVideoVariant = (pin) => {
  const videoList = pin.videos?.video_list ?? pin.story_pin_data?.pages?.[0]?.blocks?.[0]?.video?.video_list ?? {};
  const variants = Object.values(videoList);
  if (variants.length === 0) return null;
  const pixels = (variant) => (variant.width ?? 0) * (variant.height ?? 0);
  let best = variants[0];
  for (let i = 1; i < variants.length; i += 1) {
    const bestPixels = pixels(best);
    if (pixels(variants[i]) > bestPixels) best = variants[i];
  }
  if (!best.url) return null;
  return {
    url: best.url,
    resolution: [best.width ?? 0, best.height ?? 0],
    duration: best.duration ?? 0
  };
};
|
|
392
|
+
/**
 * Convert raw pin items from the Pinterest API into media objects.
 *
 * An item is skipped when it is not an object, has no `images.orig.url`, or its
 * original image is smaller than `minResolution` in either dimension.
 *
 * The caption (`alt`) is `auto_alt_text`, or, when `captionFromTitle` is set,
 * the title with `auto_alt_text` as fallback. An empty title counts as missing,
 * so the fallback still applies. Empty captions become null.
 *
 * @param {Iterable<unknown>} items - Raw items.
 * @param {[number, number]} [minResolution=[0, 0]] - Minimum [width, height].
 * @param {boolean} [captionFromTitle=false] - Prefer the pin title as caption.
 * @returns {object[]} Media objects: id, src, alt, origin, resolution and, when a
 *   video variant exists, media_stream.video.
 */
const parseResponseItems = (items, minResolution = [0, 0], captionFromTitle = false) => {
  const [minWidth, minHeight] = minResolution;
  const result = [];
  for (const item of items) {
    if (typeof item !== "object" || item === null) continue;
    const pin = item;
    const orig = pin.images?.orig;
    if (!orig?.url) continue;
    const width = orig.width ?? 0;
    const height = orig.height ?? 0;
    if (width < minWidth || height < minHeight) continue;
    const id = String(pin.id ?? "0");
    // `||` rather than `??`: an empty title must not hide the auto-generated alt text.
    const alt = (captionFromTitle ? pin.title || pin.auto_alt_text : pin.auto_alt_text) || null;
    const videoStream = extractBestVideoVariant(pin);
    result.push({
      id,
      src: orig.url,
      alt,
      origin: `https://www.pinterest.com/pin/${id}/`,
      resolution: {
        x: width,
        y: height
      },
      ...videoStream ? { media_stream: { video: videoStream } } : {}
    });
  }
  return result;
};
|
|
426
|
+
/**
 * Read `resource_response.data` from a raw response and return it as an array
 * (used for PinResource, whose data is a single object).
 *
 * @param {unknown} raw - Parsed response body.
 * @returns Effect yielding `[]` when data is null/undefined, the data itself when
 *   it is already an array, otherwise a one-element array; fails with
 *   PinterestParseError when the response or `resource_response` is not an object.
 */
const extractPinData = (raw) => Effect.gen(function* () {
  const envelope = asRecord(raw);
  if (envelope === null) {
    return yield* Effect.fail(new PinterestParseError({ message: "Response is not an object" }));
  }
  const resource = asRecord(envelope["resource_response"]);
  if (resource === null) {
    return yield* Effect.fail(new PinterestParseError({ message: "Missing 'resource_response' key" }));
  }
  const payload = resource["data"];
  if (payload === null || payload === void 0) {
    return [];
  }
  if (Array.isArray(payload)) {
    return payload;
  }
  return [payload];
});
|
|
436
|
+
/**
 * Read `resource_response.data` from a raw response when it is an array
 * (the form used by board and pin feeds).
 *
 * @param {unknown} raw - Parsed response body.
 * @returns Effect yielding the data array, or `[]` when data is not an array;
 *   fails with PinterestParseError when the response or `resource_response` is
 *   not an object.
 */
const extractDataArray = (raw) => Effect.gen(function* () {
  const envelope = asRecord(raw);
  if (envelope === null) {
    return yield* Effect.fail(new PinterestParseError({ message: "Response is not an object" }));
  }
  const resource = asRecord(envelope["resource_response"]);
  if (resource === null) {
    return yield* Effect.fail(new PinterestParseError({ message: "Missing 'resource_response' key" }));
  }
  const payload = resource["data"];
  if (!Array.isArray(payload)) {
    return [];
  }
  return payload;
});
|
|
445
|
+
/**
 * Extract search results from a raw response.
 *
 * Uses `resource_response.data` when it is a non-empty array; otherwise, when
 * data is an object, uses its `results` array (or `[]` if that is not an array).
 *
 * @param {unknown} raw - Parsed response body.
 * @returns Effect yielding the result items; fails like `extractDataArray`.
 */
const extractSearchResults = (raw) => Effect.gen(function* () {
  const direct = yield* extractDataArray(raw);
  if (direct.length > 0) {
    return direct;
  }
  const payload = asRecord(asRecord(asRecord(raw)?.["resource_response"])?.["data"]);
  if (payload === null) {
    return direct;
  }
  const nested = payload["results"];
  return Array.isArray(nested) ? nested : [];
});
|
|
457
|
+
/**
 * Read the pagination bookmarks at `resource.options.bookmarks` of a raw response.
 *
 * @param {unknown} raw - Parsed response body.
 * @returns {string[]} The string entries of the bookmarks array; `[]` when the
 *   path is missing or is not an array.
 */
const extractBookmarks = (raw) => {
  const resource = asRecord(asRecord(raw)?.["resource"]);
  const requestOptions = asRecord(resource?.["options"]);
  const found = requestOptions?.["bookmarks"];
  if (!Array.isArray(found)) {
    return [];
  }
  const isString = (entry) => typeof entry === "string";
  return found.filter(isString);
};
|
|
463
|
+
/**
 * Read the board id at `resource_response.data.id` of a raw board response.
 *
 * @param {unknown} raw - Parsed response body.
 * @returns Effect yielding the id; fails with PinterestParseError when
 *   `resource_response` or its data is not an object, or the id is not a
 *   non-empty string.
 */
const extractBoardId = (raw) => Effect.gen(function* () {
  const parseFailure = (message) => Effect.fail(new PinterestParseError({ message }));
  const resource = asRecord(asRecord(raw)?.["resource_response"]);
  if (resource === null) {
    return yield* parseFailure("Missing resource_response");
  }
  const board = asRecord(resource["data"]);
  if (board === null) {
    return yield* parseFailure("Expected board data object");
  }
  const boardId = board["id"];
  const isValid = typeof boardId === "string" && Boolean(boardId);
  if (!isValid) {
    return yield* parseFailure(`Invalid board id: ${String(boardId)}`);
  }
  return boardId;
});
|
|
473
|
+
/**
 * Read `resource_response.data.pin_count` from a raw board response.
 *
 * @param {unknown} raw - Parsed response body.
 * @returns {number} The pin count, or 0 when it is missing or not a number.
 */
const extractPinCount = (raw) => {
  const resource = asRecord(asRecord(raw)?.["resource_response"]);
  const board = asRecord(resource?.["data"]);
  const pinCount = board?.["pin_count"];
  if (typeof pinCount !== "number") {
    return 0;
  }
  return pinCount;
};
|
|
478
|
+
/**
 * Read the board sections at `resource_response.data` of a raw sections response.
 *
 * @param {unknown} raw - Parsed response body.
 * @returns {object[]} The entries that are objects with a string `id`; `[]` when
 *   data is not an array.
 */
const extractSections = (raw) => {
  const resource = asRecord(asRecord(raw)?.["resource_response"]);
  const entries = resource?.["data"];
  if (!Array.isArray(entries)) {
    return [];
  }
  const isSection = (entry) => entry !== null && typeof entry === "object" && typeof entry["id"] === "string";
  return entries.filter(isSection);
};
|
|
484
|
+
|
|
485
|
+
//#endregion
|
|
486
|
+
//#region src/scraper/bookmark-manager.ts
|
|
487
|
+
/**
 * Manages pagination bookmarks for Pinterest API requests.
 *
 * Bookmarks are accumulated in arrival order; `get()` exposes only the most
 * recent ones for the next request.
 *
 * @param last - Number of bookmarks to keep from the tail (0–4). Controls how
 *   many recent bookmarks `get()` returns; 0 means none.
 */
class BookmarkManager {
  bookmarks = [];
  /**
   * @param {number} last - Tail size, 0–4.
   * @throws {Error} When `last` is outside 0–4.
   */
  constructor(last) {
    if (last < 0 || last > 4) throw new Error("last must be 0–4");
    this.last = last;
  }
  /** Append a single bookmark. */
  add(bookmark) {
    this.bookmarks.push(bookmark);
  }
  /** Append several bookmarks, preserving their order. */
  addAll(bookmarks) {
    this.bookmarks.push(...bookmarks);
  }
  /** Forget every stored bookmark. */
  clear() {
    this.bookmarks = [];
  }
  /** Returns a copy of the last `N` bookmarks (or all if fewer than N exist). */
  get() {
    // slice(-0) is slice(0) and would return everything, so handle 0 explicitly.
    if (this.last === 0) return [];
    // A negative start beyond the array length is clamped, so short lists are returned whole.
    return this.bookmarks.slice(-this.last);
  }
  /** Returns a copy of every stored bookmark. */
  getAll() {
    return [...this.bookmarks];
  }
  /** Returns true when any stored bookmark contains the "-end-" marker. */
  isAtEnd() {
    return this.bookmarks.some((b) => b.includes("-end-"));
  }
}
|
|
522
|
+
|
|
523
|
+
//#endregion
|
|
524
|
+
//#region src/scraper/url-parser.ts
|
|
525
|
+
/**
 * Ensure a URL ends with a slash.
 *
 * @param {string} url - URL to normalise.
 * @returns {string} `url` unchanged when it already ends with "/", otherwise `url` plus "/".
 */
const sanitizeUrl = (url) => {
  if (url.endsWith("/")) {
    return url;
  }
  return `${url}/`;
};
|
|
527
|
+
/**
 * Parse a Pinterest URL into a structured description of what it points at.
 *
 * Recognised forms, checked in this order:
 * - pin:     a `pin/<digits>` path segment                 → { kind: "pin", pinId }
 * - search:  `/search/pins/?…q=<query>…`                   → { kind: "search", query }
 * - section: https://[sub.]pinterest.com/<user>/<board>/<section>[/]
 * - board:   https://[sub.]pinterest.com/<user>/<board>[/]
 *
 * `pin` must be a whole path segment, so a user or board whose name merely ends
 * in "pin" is not mistaken for a pin. The `q` parameter may appear anywhere in
 * the query string. A query with invalid percent-encoding fails the Effect
 * instead of throwing.
 *
 * @param {string} url - URL to parse.
 * @returns Effect yielding the parsed description, or failing with InvalidUrlError.
 */
const parsePinterestUrl = (url) => {
  const pinMatch = /(?:^|\/)pin\/(\d+)\/?/.exec(url);
  if (pinMatch) return Effect.succeed({
    kind: "pin",
    pinId: pinMatch[1]
  });
  const searchMatch = /\/search\/pins\/?\?(?:[^#]*?&)?q=([^&#]+)/.exec(url);
  if (searchMatch) {
    let query;
    try {
      query = decodeURIComponent(searchMatch[1].replace(/\+/g, " "));
    } catch {
      // decodeURIComponent throws URIError on malformed escapes such as "%E0%A4%A".
      return Effect.fail(new InvalidUrlError({
        url,
        message: `Cannot parse Pinterest URL: ${url}\n Hint: The search query contains invalid percent-encoding.`
      }));
    }
    return Effect.succeed({
      kind: "search",
      query
    });
  }
  const sectionMatch = /https:\/\/(?:[a-z0-9-]+\.)?pinterest\.com\/([A-Za-z0-9_-]+)\/([A-Za-z0-9_-]+)\/([A-Za-z0-9_-]+)\/?$/.exec(url);
  if (sectionMatch) return Effect.succeed({
    kind: "section",
    username: sectionMatch[1],
    boardname: sectionMatch[2],
    sectionSlug: sectionMatch[3]
  });
  const boardMatch = /https:\/\/(?:[a-z0-9-]+\.)?pinterest\.com\/([A-Za-z0-9_-]+)\/([A-Za-z0-9_-]+)\/?$/.exec(url);
  if (boardMatch) return Effect.succeed({
    kind: "board",
    username: boardMatch[1],
    boardname: boardMatch[2]
  });
  // From here on the URL is rejected; the checks below only pick the most helpful hint.
  if (/^(?!https?:\/\/).*pinterest\./i.test(url)) return Effect.fail(new InvalidUrlError({
    url,
    message: `Cannot parse Pinterest URL: ${url}\n Hint: URL must start with https:// (e.g. https://www.pinterest.com/user/board/)`
  }));
  if (/https?:\/\/(?:[a-z0-9-]+\.)?pinterest\.(?!com)[a-z.]+/i.test(url)) return Effect.fail(new InvalidUrlError({
    url,
    message: `Cannot parse Pinterest URL: ${url}\n Hint: Only pinterest.com URLs are supported. Try converting to https://www.pinterest.com/...`
  }));
  return Effect.fail(new InvalidUrlError({
    url,
    message: `Cannot parse Pinterest URL: ${url}`
  }));
};
|
|
568
|
+
|
|
569
|
+
//#endregion
|
|
570
|
+
//#region src/scraper/api-scraper.ts
|
|
571
|
+
/**
 * Default scraper options.
 * NOTE(review): units are inferred from usage in this file — `delay` is passed to
 * `sleep` (seconds) and `timeout` to `apiGet` (milliseconds).
 */
const DEFAULT_OPTS = {
  num: 100,
  minResolution: [0, 0],
  delay: 0.2,
  captionFromTitle: false,
  ensureAlt: false,
  timeout: 10000,
  verbose: false,
  dump: undefined,
  related: false
};
|
|
582
|
+
/** Sequence number used to name dump files; shared by every call in this module. */
let _dumpSeq = 0;

/**
 * Write a request URL and its response to `dumpDir` as two JSON files named
 * `dump-NNN-request.json` and `dump-NNN-response.json`, where NNN is a
 * zero-padded, ever-increasing sequence number. The directory is created if
 * needed. Exported for testing.
 *
 * @param {string} url - Request URL.
 * @param {unknown} response - Response body to serialise.
 * @param {string} dumpDir - Target directory.
 * @returns Effect completing once both files are written.
 */
const dumpPair = (url, response, dumpDir) => Effect.promise(async () => {
  _dumpSeq += 1;
  const seq = String(_dumpSeq).padStart(3, "0");
  await fs.mkdir(dumpDir, { recursive: true });
  const writeJson = (kind, payload) => {
    const target = path.join(dumpDir, `dump-${seq}-${kind}.json`);
    return fs.writeFile(target, JSON.stringify(payload, null, 2), "utf-8");
  };
  await writeJson("request", { url });
  await writeJson("response", response);
});
|
|
591
|
+
/**
 * Effect that completes after the given number of seconds.
 *
 * @param {number} seconds - Delay in seconds (fractions allowed).
 */
const sleep = (seconds) => Effect.promise(() => new Promise((resolve) => {
  setTimeout(resolve, seconds * 1e3);
}));
|
|
592
|
+
/**
 * Remove media items whose `src` was already seen, keeping the first occurrence
 * and the original order.
 *
 * @param {object[]} items - Media items.
 * @returns {object[]} New array without duplicate sources.
 */
const dedup = (items) => {
  const seenSources = new Set();
  const unique = [];
  for (const media of items) {
    if (seenSources.has(media.src)) continue;
    seenSources.add(media.src);
    unique.push(media);
  }
  return unique;
};
|
|
600
|
+
/**
 * Keep only the media items that have a caption with non-whitespace content.
 *
 * @param {object[]} items - Media items.
 * @returns {object[]} New array of items whose `alt` is truthy and not blank.
 */
const filterAlt = (items) => {
  const hasCaption = (media) => Boolean(media.alt) && media.alt.trim() !== "";
  return items.filter(hasCaption);
};
|
|
601
|
+
/**
 * Run an Effect, retrying when it fails with PinterestHttpError.
 *
 * Up to `maxAttempts` attempts are made. Before retry number k the helper sleeps
 * k seconds (1s, 2s, …). Failures with any other tag propagate immediately.
 *
 * @param {() => Effect} makeEffect - Factory invoked once per attempt.
 * @param {number} [maxAttempts=3] - Total number of attempts.
 * @returns Effect with the first successful result, or the last PinterestHttpError.
 */
const withNetworkRetry = (makeEffect, maxAttempts = 3) => {
  const run = (attemptNo) => {
    const onHttpError = (e) => {
      if (attemptNo >= maxAttempts) {
        return Effect.fail(e);
      }
      return Effect.andThen(sleep(attemptNo), run(attemptNo + 1));
    };
    return Effect.catchTag(makeEffect(), "PinterestHttpError", onHttpError);
  };
  return run(1);
};
|
|
612
|
+
/**
 * Scrape pins related to `pinId`, paginating until `opts.num` unique items are
 * collected, Pinterest signals the end of the feed, or several pages in a row
 * add nothing new.
 *
 * Items are unique by `src`, keeping the first occurrence. `opts.onProgress`, when
 * provided, is called after every page with (collected, requested).
 *
 * @param {{ cookies: string }} session - Session used for API requests.
 * @param {string} pinId - Pin whose related feed is read.
 * @param {object} opts - Scraper options (num, minResolution, captionFromTitle,
 *   ensureAlt, timeout, delay, dump, onProgress).
 * @returns Effect yielding at most `opts.num` media items.
 */
const scrapeRelatedPins = (session, pinId, opts) => Effect.gen(function* () {
  // Stop after this many consecutive pages without a new item; otherwise a feed
  // that keeps returning duplicates (or nothing) without an end marker would loop forever.
  const MAX_EMPTY = 3;
  const bookmarks = new BookmarkManager(3);
  const results = [];
  const seenSources = new Set();
  let remains = opts.num;
  let consecutiveEmpty = 0;
  while (remains > 0) {
    const batchSize = Math.min(50, remains);
    const pinUrl = buildGetUrl(Endpoints.GET_RELATED_MODULES, {
      pin_id: pinId,
      context_pin_ids: [],
      page_size: batchSize,
      bookmarks: bookmarks.get(),
      search_query: "",
      source: "deep_linking",
      top_level_source: "deep_linking",
      top_level_source_depth: 1,
      is_pdp: false
    }, `/pin/${pinId}/`);
    const raw = yield* withNetworkRetry(() => apiGet(session, pinUrl, opts.timeout));
    if (opts.dump) yield* dumpPair(pinUrl, raw, opts.dump);
    bookmarks.addAll(extractBookmarks(raw));
    const batch = parseResponseItems(yield* extractDataArray(raw), opts.minResolution, opts.captionFromTitle);
    const candidates = opts.ensureAlt ? filterAlt(batch) : batch;
    // Track seen sources across pages instead of re-deduplicating the whole list each time.
    let added = 0;
    for (const media of candidates) {
      if (seenSources.has(media.src)) continue;
      seenSources.add(media.src);
      results.push(media);
      added += 1;
    }
    remains -= added;
    opts.onProgress?.(results.length, opts.num);
    if (bookmarks.isAtEnd()) break;
    consecutiveEmpty = added === 0 ? consecutiveEmpty + 1 : 0;
    if (consecutiveEmpty >= MAX_EMPTY) break;
    if (remains > 0) yield* sleep(opts.delay);
  }
  return results.slice(0, opts.num);
});
|
|
647
|
+
/**
 * Fetch one pin by id through PinResource and convert it to media items.
 *
 * The request is retried on HTTP errors, optionally dumped to `opts.dump`, and the
 * result is filtered by `opts.ensureAlt`. `opts.onProgress`, when provided, is
 * called once with (itemsFound, 1).
 *
 * @param {{ cookies: string }} session - Session used for the API request.
 * @param {string} pinId - Pin to fetch.
 * @param {object} opts - Scraper options (timeout, dump, minResolution,
 *   captionFromTitle, ensureAlt, onProgress).
 * @returns Effect yielding the media items parsed from the pin.
 */
const scrapeSinglePin = (session, pinId, opts) => Effect.gen(function* () {
  const requestOptions = {
    id: pinId,
    field_set_key: "detailed"
  };
  const pinUrl = buildGetUrl(Endpoints.GET_PIN, requestOptions, `/pin/${pinId}/`);
  const fetchPin = () => apiGet(session, pinUrl, opts.timeout);
  const raw = yield* withNetworkRetry(fetchPin);
  if (opts.dump) {
    yield* dumpPair(pinUrl, raw, opts.dump);
  }
  const rawItems = yield* extractPinData(raw);
  let media = parseResponseItems(rawItems, opts.minResolution, opts.captionFromTitle);
  if (opts.ensureAlt) {
    media = filterAlt(media);
  }
  opts.onProgress?.(media.length, 1);
  return media;
});
|
|
660
|
+
/**
 * Scrape the pins of a board identified by `username` and `boardname`.
 *
 * First fetches the board record to obtain its ID and pin count, then pages
 * through the board feed (at most 50 items per request). The target is the
 * smaller of `opts.num` and the reported pin count (when that count is truthy).
 * Paging stops when the target is met, the bookmark manager reports the end,
 * or three consecutive pages yield no usable items.
 *
 * @param session - Session handed to `apiGet`.
 * @param username - Board owner's username (first URL path segment).
 * @param boardname - Board slug (second URL path segment).
 * @param opts - Reads `num`, `timeout`, `dump`, `minResolution`,
 *   `captionFromTitle`, `ensureAlt`, `onProgress` and `delay`.
 * @returns Effect yielding at most `opts.num` media items.
 */
const scrapeBoard = (session, username, boardname, opts) => Effect.gen(function* () {
  const boardPath = `/${username}/${boardname}/`;
  const infoUrl = buildGetUrl(Endpoints.GET_BOARD, {
    username,
    slug: boardname,
    field_set_key: "detailed"
  }, boardPath);
  const boardPayload = yield* withNetworkRetry(() => apiGet(session, infoUrl, opts.timeout));
  if (opts.dump) yield* dumpPair(infoUrl, boardPayload, opts.dump);
  const boardId = yield* extractBoardId(boardPayload);
  const pinCount = extractPinCount(boardPayload);
  const pager = new BookmarkManager(3);
  const MAX_EMPTY_PAGES = 3;
  let emptyStreak = 0;
  let collected = [];
  let outstanding = Math.min(opts.num, pinCount || opts.num);
  while (outstanding > 0) {
    const feedUrl = buildGetUrl(Endpoints.GET_BOARD_PIN, {
      board_id: boardId,
      board_url: boardPath,
      page_size: Math.min(50, outstanding),
      bookmarks: pager.get(),
      currentFilter: -1,
      field_set_key: "react_grid_pin",
      filter_section_pins: true,
      sort: "default",
      layout: "default",
      redux_normalize_feed: true
    }, boardPath);
    const payload = yield* withNetworkRetry(() => apiGet(session, feedUrl, opts.timeout));
    if (opts.dump) yield* dumpPair(feedUrl, payload, opts.dump);
    pager.addAll(extractBookmarks(payload));
    const rawItems = yield* extractDataArray(payload);
    const parsed = parseResponseItems(rawItems, opts.minResolution, opts.captionFromTitle);
    const usable = opts.ensureAlt ? filterAlt(parsed) : parsed;
    // Give up after several empty pages in a row; any non-empty page resets the streak.
    if (usable.length > 0) emptyStreak = 0;
    else if (++emptyStreak >= MAX_EMPTY_PAGES) break;
    const countBefore = collected.length;
    collected = dedup([...collected, ...usable]);
    outstanding -= collected.length - countBefore;
    opts.onProgress?.(collected.length, opts.num);
    if (pager.isAtEnd()) break;
    if (outstanding > 0) yield* sleep(opts.delay);
  }
  return collected.slice(0, opts.num);
});
|
|
712
|
+
/**
 * Scrape the pins of one board section, selected by slug.
 *
 * Resolves the board ID, lists the board's sections, and picks the first
 * section whose `slug` equals `sectionSlug` or whose lower-cased title (with
 * whitespace runs turned into `-`) equals the lower-cased `sectionSlug`.
 * Fails with `InvalidUrlError` when no section matches. Paging stops when
 * `opts.num` items are collected, the bookmark manager reports the end, or a
 * page yields no usable items.
 *
 * @param session - Session handed to `apiGet`.
 * @param username - Board owner's username.
 * @param boardname - Board slug.
 * @param sectionSlug - Section slug to look for.
 * @param opts - Reads `num`, `timeout`, `dump`, `minResolution`,
 *   `captionFromTitle`, `ensureAlt`, `onProgress` and `delay`.
 * @returns Effect yielding at most `opts.num` media items.
 */
const scrapeSection = (session, username, boardname, sectionSlug, opts) => Effect.gen(function* () {
  const boardPath = `/${username}/${boardname}/`;
  const infoUrl = buildGetUrl(Endpoints.GET_BOARD, {
    username,
    slug: boardname,
    field_set_key: "detailed"
  }, boardPath);
  const boardPayload = yield* withNetworkRetry(() => apiGet(session, infoUrl, opts.timeout));
  if (opts.dump) yield* dumpPair(infoUrl, boardPayload, opts.dump);
  const boardId = yield* extractBoardId(boardPayload);
  const listUrl = buildGetUrl(Endpoints.GET_BOARD_SECTIONS, { board_id: boardId });
  const listPayload = yield* withNetworkRetry(() => apiGet(session, listUrl, opts.timeout));
  if (opts.dump) yield* dumpPair(listUrl, listPayload, opts.dump);
  const matchesSlug = (candidate) => {
    if (candidate.slug === sectionSlug) return true;
    const titleSlug = (candidate.title ?? "").toLowerCase().replace(/\s+/g, "-");
    return titleSlug === sectionSlug.toLowerCase();
  };
  const section = extractSections(listPayload).find(matchesSlug);
  if (!section) return yield* Effect.fail(new InvalidUrlError({
    url: `/${username}/${boardname}/${sectionSlug}/`,
    message: `Section '${sectionSlug}' not found in board '${boardname}'`
  }));
  const sectionId = section.id;
  const sectionPath = `/board/section/${sectionId}/`;
  const pager = new BookmarkManager(3);
  let collected = [];
  let outstanding = opts.num;
  while (outstanding > 0) {
    const pinsUrl = buildGetUrl(Endpoints.GET_BOARD_SECTION_PINS, {
      section_id: sectionId,
      page_size: Math.min(50, outstanding),
      bookmarks: pager.get(),
      field_set_key: "react_grid_pin",
      redux_normalize_feed: true
    }, sectionPath);
    const payload = yield* withNetworkRetry(() => apiGet(session, pinsUrl, opts.timeout));
    if (opts.dump) yield* dumpPair(pinsUrl, payload, opts.dump);
    pager.addAll(extractBookmarks(payload));
    const rawItems = yield* extractDataArray(payload);
    const parsed = parseResponseItems(rawItems, opts.minResolution, opts.captionFromTitle);
    const usable = opts.ensureAlt ? filterAlt(parsed) : parsed;
    const countBefore = collected.length;
    collected = dedup([...collected, ...usable]);
    outstanding -= collected.length - countBefore;
    opts.onProgress?.(collected.length, opts.num);
    // A page with nothing usable ends the scrape, as does the final bookmark.
    if (pager.isAtEnd() || usable.length === 0) break;
    if (outstanding > 0) yield* sleep(opts.delay);
  }
  return collected.slice(0, opts.num);
});
|
|
760
|
+
/**
 * Search Pinterest for `query` and collect the resulting pins.
 *
 * Pages through the `GET_SEARCH` endpoint (at most 50 items per request) until
 * `opts.num` unique items are collected, the bookmark manager reports the end,
 * or a page yields no usable items.
 *
 * @param session - Session handed to `apiGet`.
 * @param query - Search text; it is URL-encoded when building the source URL.
 * @param opts - Reads `num`, `timeout`, `dump`, `minResolution`,
 *   `captionFromTitle`, `ensureAlt`, `onProgress` and `delay`.
 * @returns Effect yielding at most `opts.num` media items.
 */
const scrapeSearch = (session, query, opts) => Effect.gen(function* () {
  const bookmarks = new BookmarkManager(1);
  const results = [];
  let remains = opts.num;
  // The `&` separator is required: without it `rs=typed` is glued onto the
  // query value (`?q=catsrs=typed`) instead of being its own parameter.
  const apiSourceUrl = `/search/pins/?q=${encodeURIComponent(query)}&rs=typed`;
  while (remains > 0) {
    const batchSize = Math.min(50, remains);
    const searchUrl = buildGetUrl(Endpoints.GET_SEARCH, {
      appliedProductFilters: "---",
      auto_correction_disabled: false,
      bookmarks: bookmarks.get(),
      page_size: batchSize,
      query,
      redux_normalize_feed: true,
      rs: "typed",
      scope: "pins",
      source_url: apiSourceUrl
    }, apiSourceUrl);
    const raw = yield* withNetworkRetry(() => apiGet(session, searchUrl, opts.timeout));
    if (opts.dump) yield* dumpPair(searchUrl, raw, opts.dump);
    bookmarks.addAll(extractBookmarks(raw));
    const batch = parseResponseItems(yield* extractSearchResults(raw), opts.minResolution, opts.captionFromTitle);
    const filtered = opts.ensureAlt ? filterAlt(batch) : batch;
    const before = results.length;
    results.push(...filtered);
    // Re-deduplicate the whole list in place so `results` keeps its identity.
    const dedupd = dedup(results);
    results.length = 0;
    results.push(...dedupd);
    // Only genuinely new items count toward the requested total.
    remains -= results.length - before;
    opts.onProgress?.(results.length, opts.num);
    if (bookmarks.isAtEnd() || filtered.length === 0) break;
    if (remains > 0) yield* sleep(opts.delay);
  }
  return results.slice(0, opts.num);
});
|
|
796
|
+
/**
 * Scrape any supported Pinterest URL by dispatching on its parsed kind.
 *
 * `opts` is layered over `DEFAULT_OPTS`. A pin URL goes to
 * `scrapeRelatedPins` when `related` is set and to `scrapeSinglePin`
 * otherwise; board, section and search URLs go to their matching scraper.
 *
 * @param session - Session forwarded to the selected scraper.
 * @param url - URL to scrape; it is sanitized and parsed first.
 * @param opts - Optional overrides for `DEFAULT_OPTS`.
 * @returns Effect yielding the scraped media items.
 */
const scrapeUrl = (session, url, opts = {}) => Effect.gen(function* () {
  const settings = {
    ...DEFAULT_OPTS,
    ...opts
  };
  const target = yield* parsePinterestUrl(sanitizeUrl(url));
  if (target.kind === "pin") {
    if (settings.related) return yield* scrapeRelatedPins(session, target.pinId, settings);
    return yield* scrapeSinglePin(session, target.pinId, settings);
  }
  if (target.kind === "board") return yield* scrapeBoard(session, target.username, target.boardname, settings);
  if (target.kind === "section") return yield* scrapeSection(session, target.username, target.boardname, target.sectionSlug, settings);
  if (target.kind === "search") return yield* scrapeSearch(session, target.query, settings);
});
|
|
810
|
+
/**
 * Run a search scrape for `query`, with `opts` layered over `DEFAULT_OPTS`.
 *
 * @param session - Session forwarded to `scrapeSearch`.
 * @param query - Search text.
 * @param opts - Optional overrides for `DEFAULT_OPTS`.
 * @returns The Effect produced by `scrapeSearch`.
 */
const scrapeQuery = (session, query, opts = {}) => {
  const settings = {
    ...DEFAULT_OPTS,
    ...opts
  };
  return scrapeSearch(session, query, settings);
};
|
|
817
|
+
|
|
818
|
+
//#endregion
|
|
819
|
+
//#region src/download/downloader.ts
|
|
820
|
+
const execFileAsync = promisify(execFile);
|
|
821
|
+
/**
 * Expand a filename template for one media item.
 *
 * Placeholders: `{id}` (the item's ID), `{alt}` (sanitized alt text) and
 * `{index}` (1-based position). The alt text is trimmed, stripped of every
 * character that is not a word character, whitespace or `-`, has whitespace
 * runs replaced by `_`, and is cut to 100 characters. When nothing remains,
 * the item's ID is used in its place.
 * Exported for testing.
 *
 * @param template - Template string containing the placeholders.
 * @param media - Item providing `id` and optionally `alt`.
 * @param index - Zero-based position of the item.
 * @returns The rendered filename, without extension.
 */
const renderFilename = (template, media, index) => {
  const cleanedAlt = (media.alt ?? "")
    .trim()
    .replace(/[^\w\s-]/g, "")
    .replace(/\s+/g, "_")
    .slice(0, 100)
    .trim();
  const substitutions = [
    [/\{id\}/g, media.id],
    [/\{alt\}/g, cleanedAlt || media.id],
    [/\{index\}/g, String(index + 1)]
  ];
  // Applied in order: id, then alt, then index.
  return substitutions.reduce((name, [pattern, value]) => name.replace(pattern, value), template);
};
|
|
830
|
+
/**
 * Assemble the ffmpeg argument list for converting a downloaded stream.
 * Exported for testing.
 *
 * @param inputPath - File ffmpeg reads from.
 * @param outputPath - File ffmpeg writes to (`-y` overwrites it).
 * @param reencode - When truthy, no codec flags are passed; otherwise
 *   `-c copy` is added so streams are copied without re-encoding.
 * @returns The argument array.
 */
const buildFfmpegArgs = (inputPath, outputPath, reencode) => {
  const args = ["-y", "-i", inputPath];
  if (!reencode) args.push("-c", "copy");
  args.push(outputPath);
  return args;
};
|
|
844
|
+
/**
 * Fetch `url` and write the whole response body to `dest`.
 *
 * @param url - Resource to download.
 * @param dest - Destination file path.
 * @param timeout - Abort the request after this many milliseconds (default 30000).
 * @throws {Error} `HTTP <status>` when the response is not OK.
 */
const downloadBlob = async (url, dest, timeout = 3e4) => {
  const requestInit = {
    signal: AbortSignal.timeout(timeout),
    headers: { "User-Agent": USER_AGENT }
  };
  const res = await fetch(url, requestInit);
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  // The body is read fully into memory before it is written out.
  const payload = await res.arrayBuffer();
  await fs.writeFile(dest, new Uint8Array(payload));
};
|
|
852
|
+
/**
 * Extract the media entry URLs from an M3U8 playlist.
 *
 * Blank lines and lines starting with `#` (tags and comments) are skipped.
 * Entries that already start with `http://` or `https://` are returned
 * verbatim. Every other entry is resolved against `baseUrl` with standard URL
 * resolution, so sub-directories, absolute paths and per-segment query
 * strings all resolve correctly and independently of one another.
 *
 * @param content - Raw playlist text.
 * @param baseUrl - Absolute URL the playlist was fetched from.
 * @returns Absolute URLs in playlist order.
 * @throws {TypeError} When `baseUrl` is not a valid absolute URL.
 */
const parseM3U8Segments = (content, baseUrl) => {
  // Parsed up front so an invalid base fails even for an empty playlist.
  const base = new URL(baseUrl);
  const segments = [];
  for (const line of content.split("\n")) {
    const entry = line.trim();
    if (!entry || entry.startsWith("#")) continue;
    if (/^https?:\/\//.test(entry)) segments.push(entry);
    // Resolve each entry against the untouched base. Rewriting a shared
    // pathname made later entries depend on earlier ones and mangled `?`.
    else segments.push(new URL(entry, base).href);
  }
  return segments;
};
|
|
867
|
+
/**
 * Download an HLS stream by concatenating its segments, then optionally
 * convert the result to MP4 with ffmpeg.
 *
 * Segments are fetched one after another and held in memory. When
 * `skipRemux` is set and `reencode` is not, the raw transport stream is
 * written next to `targetPath` with a `.ts` extension. Otherwise the data is
 * written to a temporary `.tmp.ts` file and handed to ffmpeg. If ffmpeg is
 * missing or fails, any partial output is removed and the raw `.ts` file is
 * kept instead.
 *
 * NOTE(review): every non-comment manifest line is treated as a segment, so a
 * master playlist that lists variant playlists is presumably not handled here
 * — confirm against the streams Pinterest serves.
 *
 * @param streamUrl - URL of the M3U8 manifest.
 * @param targetPath - Desired output path (expected to end in `.mp4`).
 * @param skipRemux - Keep the raw `.ts` file instead of running ffmpeg.
 * @param reencode - Re-encode instead of stream-copying; overrides `skipRemux`.
 * @param timeout - Per-request timeout in milliseconds (default 60000).
 * @returns Path of the file actually written (`.mp4` or `.ts`).
 * @throws {Error} When the manifest or a segment cannot be fetched, or the
 *   manifest contains no segments.
 */
const downloadHls = async (streamUrl, targetPath, skipRemux, reencode, timeout = 6e4) => {
  const request = (url) => fetch(url, {
    signal: AbortSignal.timeout(timeout),
    headers: { "User-Agent": USER_AGENT }
  });
  const response = await request(streamUrl);
  if (!response.ok) throw new Error(`Failed to fetch HLS manifest: HTTP ${response.status}`);
  const segments = parseM3U8Segments(await response.text(), streamUrl);
  if (segments.length === 0) throw new Error("No segments found in HLS manifest");
  const buffers = [];
  for (const segUrl of segments) {
    const seg = await request(segUrl);
    if (!seg.ok) throw new Error(`Failed to fetch HLS segment: ${segUrl}`);
    buffers.push(new Uint8Array(await seg.arrayBuffer()));
  }
  const tsBuffer = Buffer.concat(buffers);
  const rawTsPath = targetPath.replace(/\.mp4$/, ".ts");
  if (skipRemux && !reencode) {
    await fs.writeFile(rawTsPath, tsBuffer);
    return rawTsPath;
  }
  const tempTsPath = `${targetPath}.tmp.ts`;
  await fs.writeFile(tempTsPath, tsBuffer);
  try {
    await execFileAsync("ffmpeg", buildFfmpegArgs(tempTsPath, targetPath, reencode));
  } catch {
    // ffmpeg may have created a truncated output before failing. Remove it so
    // a later --skip-existing run does not mistake it for a finished download.
    if (rawTsPath !== targetPath) await fs.rm(targetPath, { force: true }).catch(() => {});
    // The temp file already holds the stream, so rename it rather than rewrite it.
    await fs.rename(tempTsPath, rawTsPath);
    console.warn(`[warn] ffmpeg not found or failed — saved raw .ts file instead: ${rawTsPath}`);
    return rawTsPath;
  }
  // A leftover temp file is harmless, so a cleanup failure must not fail the download.
  await fs.rm(tempTsPath, { force: true }).catch(() => {});
  return targetPath;
};
|
|
905
|
+
/**
 * Download one media item into `outputDir` and return it with `localPath` set.
 *
 * When `downloadStreams` is set and the item has a `media_stream`, the video
 * is fetched: directly if its URL path ends in `.mp4`, via `downloadHls`
 * otherwise. In every other case the image at `media.src` is fetched, using
 * the extension from its URL path (`.jpg` when there is none). With
 * `skipExisting`, an already present destination file is returned untouched.
 *
 * @param media - Item to download.
 * @param outputDir - Target directory; created if missing.
 * @param downloadStreams - Prefer the video stream when available.
 * @param skipRemux - Forwarded to `downloadHls`.
 * @param reencode - Forwarded to `downloadHls`.
 * @param skipExisting - Skip the download when the destination exists.
 * @param filenameTemplate - Template passed to `renderFilename`.
 * @param index - Zero-based item position passed to `renderFilename`.
 * @returns A copy of `media` with `localPath` added.
 */
const downloadOne = async (media, outputDir, downloadStreams, skipRemux, reencode, skipExisting, filenameTemplate, index) => {
  await fs.mkdir(outputDir, { recursive: true });
  const stem = renderFilename(filenameTemplate, media, index);
  const withLocalPath = (localPath) => ({
    ...media,
    localPath
  });
  // True only when skipping is requested and the file can be accessed.
  const alreadyThere = async (candidate) => {
    if (!skipExisting) return false;
    try {
      await fs.access(candidate);
      return true;
    } catch {
      return false;
    }
  };
  if (downloadStreams && media.media_stream) {
    const streamUrl = media.media_stream.video.url;
    const videoDest = path.join(outputDir, `${stem}.mp4`);
    if (await alreadyThere(videoDest)) return withLocalPath(videoDest);
    const isDirectMp4 = new URL(streamUrl).pathname.toLowerCase().endsWith(".mp4");
    if (!isDirectMp4) {
      // downloadHls may fall back to a `.ts` file, so use the path it reports.
      const writtenPath = await downloadHls(streamUrl, videoDest, skipRemux, reencode);
      return withLocalPath(writtenPath);
    }
    await downloadBlob(streamUrl, videoDest);
    return withLocalPath(videoDest);
  }
  const imageUrl = media.src;
  const extension = path.extname(new URL(imageUrl).pathname).toLowerCase() || ".jpg";
  const imageDest = path.join(outputDir, `${stem}${extension}`);
  if (await alreadyThere(imageDest)) return withLocalPath(imageDest);
  await downloadBlob(imageUrl, imageDest);
  return withLocalPath(imageDest);
};
|
|
949
|
+
/**
 * Download a list of media items with bounded concurrency.
 *
 * A failed download is not fatal: it is reported with a `[warn]` line and the
 * item is returned unchanged (without `localPath`). Progress is written to
 * stderr after every finished or failed item.
 *
 * @param items - Media items to download.
 * @param outputDir - Target directory.
 * @param downloadStreams - Prefer video streams when available.
 * @param skipRemux - Forwarded to `downloadOne`.
 * @param reencode - Forwarded to `downloadOne`.
 * @param concurrency - Maximum number of parallel downloads (default 8).
 * @param skipExisting - Skip files that already exist.
 * @param filenameTemplate - Filename template (default `{id}`).
 * @returns Effect yielding the items, with `localPath` set on successes.
 */
const downloadMedia = (items, outputDir, downloadStreams = false, skipRemux = false, reencode = false, concurrency = 8, skipExisting = false, filenameTemplate = "{id}") => Effect.gen(function* () {
  const total = items.length;
  let finished = 0;
  const reportProgress = () => {
    finished++;
    process.stderr.write(`\r Downloading ${finished}/${total}...`);
  };
  const downloadTask = ({ media, position }) => Effect.tryPromise({
    try: async () => {
      const saved = await downloadOne(media, outputDir, downloadStreams, skipRemux, reencode, skipExisting, filenameTemplate, position);
      reportProgress();
      return saved;
    },
    catch: (e) => new DownloadError({
      url: media.src,
      message: `Failed to download ${media.id}: ${String(e)}`,
      cause: e
    })
  }).pipe(Effect.catchTag("DownloadError", (err) => {
    // Count the failure as progress and keep the original item in the result.
    reportProgress();
    console.warn(`\n[warn] ${err.message}`);
    return Effect.succeed(media);
  }));
  const queue = items.map((media, position) => ({ media, position }));
  const results = yield* Effect.forEach(queue, downloadTask, { concurrency });
  // Finish the carriage-return progress line.
  if (total > 0) process.stderr.write("\n");
  return results;
});
|
|
978
|
+
|
|
979
|
+
//#endregion
|
|
980
|
+
//#region src/storage/captions.ts
|
|
981
|
+
/**
 * Write one caption file per downloaded item into `outputDir`.
 *
 * Items without `localPath` are skipped. The caption file shares the
 * downloaded file's base name: `txt` mode writes the alt text (empty string
 * when absent), `json` mode writes `mediaToDict(item)` pretty-printed. Any
 * other mode writes nothing.
 *
 * @param items - Media items, normally the output of `downloadMedia`.
 * @param outputDir - Directory for the caption files; created if missing.
 * @param mode - `"txt"` or `"json"`.
 * @returns Effect that fails with `CaptionError` when a write fails.
 */
const writeCaptionFiles = (items, outputDir, mode) => Effect.tryPromise({
  try: async () => {
    await fs.mkdir(outputDir, { recursive: true });
    for (const item of items) {
      if (!item.localPath) continue;
      const stem = path.basename(item.localPath, path.extname(item.localPath));
      let body;
      switch (mode) {
        case "txt":
          body = item.alt ?? "";
          break;
        case "json":
          body = JSON.stringify(mediaToDict(item), null, 2);
          break;
        default:
          continue;
      }
      // The mode name doubles as the file extension.
      await fs.writeFile(path.join(outputDir, `${stem}.${mode}`), body, "utf-8");
    }
  },
  catch: (e) => new CaptionError({
    path: outputDir,
    message: `Failed to write caption files: ${String(e)}`,
    cause: e
  })
});
|
|
1004
|
+
/**
 * Dispatch caption handling by mode.
 *
 * `txt` and `json` delegate to `writeCaptionFiles`. `metadata` (EXIF
 * embedding) is not implemented and only prints a warning. Any other mode is
 * a no-op.
 *
 * @param items - Downloaded media items.
 * @param outputDir - Directory for caption files.
 * @param mode - Caption mode.
 * @returns Effect performing the selected action.
 */
const applyCaption = (items, outputDir, mode) => {
  switch (mode) {
    case "txt":
    case "json":
      return writeCaptionFiles(items, outputDir, mode);
    case "metadata":
      return Effect.sync(() => {
        console.warn("[warn] EXIF metadata embedding is not supported in this implementation. Use --caption txt or --caption json instead.");
      });
    default:
      return Effect.void;
  }
};
|
|
1016
|
+
/**
 * Keep only items whose alt text is present and not blank (for --ensure-cap).
 *
 * @param items - Media items to filter.
 * @returns New array of the items with non-empty, non-whitespace `alt`.
 */
const filterWithAlt = (items) => items.filter((m) => {
  if (!m.alt) return false;
  return m.alt.trim() !== "";
});
|
|
1018
|
+
|
|
1019
|
+
//#endregion
|
|
1020
|
+
//#region src/cli/shared-options.ts
|
|
1021
|
+
/**
 * CLI option definitions shared by the scrape, search and download commands.
 * Each option is built as a pipeline: alias and description first, then the
 * optional / default wrapper last.
 */
const outputOption = Options.text("output").pipe(
  Options.withAlias("o"),
  Options.withDescription("Output directory for downloaded files"),
  Options.optional
);
const cookiesOption = Options.text("cookies").pipe(
  Options.withAlias("c"),
  Options.withDescription("Path to cookies JSON file for private boards"),
  Options.optional
);
const numOption = Options.integer("limit").pipe(
  Options.withAlias("n"),
  Options.withDescription("Maximum number of images to scrape (default: 100)"),
  Options.withDefault(100)
);
const resolutionOption = Options.text("min-res").pipe(
  Options.withAlias("r"),
  Options.withDescription("Minimum resolution filter, e.g. `512x512` — skips images smaller than this"),
  Options.optional
);
const videoOption = Options.boolean("video").pipe(
  Options.withDescription("Download video streams if available")
);
const skipRemuxOption = Options.boolean("skip-remux").pipe(
  Options.withDescription("Skip ffmpeg remux; output raw .ts file")
);
const timeoutOption = Options.integer("timeout").pipe(
  Options.withDescription("Request timeout in seconds (default: 10)"),
  Options.withDefault(10)
);
// Delay is taken as text and converted later by parseDelay.
const delayOption = Options.text("delay").pipe(
  Options.withDescription("Delay between requests in seconds (default: 0.2)"),
  Options.withDefault("0.2")
);
const cacheOption = Options.text("save-urls").pipe(
  Options.withDescription("Save scraped URLs to a JSON file for later download with `pin-dl download`"),
  Options.optional
);
const dryRunOption = Options.boolean("dry-run").pipe(
  Options.withDescription("Print found URLs to stdout without downloading anything")
);
const skipExistingOption = Options.boolean("skip-existing").pipe(
  Options.withDescription("Skip files that already exist in the output directory")
);
const captionOption = Options.text("caption").pipe(
  Options.withDescription("Caption format: txt | json | metadata | none (default: none)"),
  Options.withDefault("none")
);
const ensureCapOption = Options.boolean("ensure-cap").pipe(
  Options.withDescription("Only include images that have alt text")
);
const capFromTitleOption = Options.boolean("cap-from-title").pipe(
  Options.withDescription("Use image title as the caption instead of auto_alt_text")
);
const verboseOption = Options.boolean("verbose").pipe(
  Options.withDescription("Enable verbose output")
);
const dumpOption = Options.text("dump").pipe(
  Options.withDescription("Save raw API request/response pairs to this directory for debugging"),
  Options.optional
);
const relatedOption = Options.boolean("related").pipe(
  Options.withDescription("For pin URLs: scrape related pins instead of just the linked pin")
);
const filenameOption = Options.text("filename").pipe(
  Options.withDescription("Filename template for downloaded files. Variables: {id} (pin ID), {alt} (sanitized alt text), {index} (1-based). Extension is always appended automatically. Default: {id}"),
  Options.withDefault("{id}")
);
const sectionOption = Options.text("section").pipe(
  Options.withDescription("Board section to scrape, by name. Slugified and appended to the board URL. E.g. --section 'Kitchen Ideas' → .../kitchen-ideas/. Pass the section URL directly for guaranteed accuracy."),
  Options.optional
);
const reencodeOption = Options.boolean("reencode").pipe(
  Options.withDescription("Force re-encode HLS video to MP4 (slower but guaranteed compatibility; overrides --skip-remux)")
);
const logFileOption = Options.text("log-file").pipe(
  Options.withDescription("Append all log output to this file in addition to the console"),
  Options.optional
);
|
|
1045
|
+
/**
 * Tee `console.log`, `console.warn` and `console.error` to a file.
 *
 * The file is opened in append mode (`flags: "a"`). Each call is still
 * forwarded to the original console method; in addition its arguments are
 * stringified, joined with single spaces and written as one line.
 *
 * A stream failure (for example an unwritable path) is reported once on
 * stderr and file logging is then disabled. Without an `'error'` listener the
 * stream's error event would be thrown as an uncaught exception.
 *
 * @param logPath - Path of the log file to append to.
 */
const setupLogFile = (logPath) => {
  const stream = createWriteStream(logPath, { flags: "a" });
  let fileUsable = true;
  stream.on("error", (err) => {
    if (!fileUsable) return;
    fileUsable = false;
    process.stderr.write(`[warn] Cannot write log file '${logPath}': ${err.message}\n`);
  });
  const write = (...args) => {
    if (fileUsable) stream.write(args.map(String).join(" ") + "\n");
  };
  for (const level of ["log", "warn", "error"]) {
    // Capture the real method before replacing it so console output is unchanged.
    const original = console[level].bind(console);
    console[level] = (...args) => {
      original(...args);
      write(...args);
    };
  }
};
|
|
1070
|
+
const fileOption = Options.optional(Options.text("file").pipe(Options.withAlias("f"), Options.withDescription("Path to a file containing URLs/queries (one per line). Use - for stdin.")));
|
|
1071
|
+
/**
 * Parse a `WxH` resolution string, e.g. "512x512" → [512, 512].
 *
 * The separator may be `x` or `X`, and whitespace around each number is
 * ignored. Both sides must consist of decimal digits only, so input such as
 * "512x512px" or "-1x5" is rejected instead of being silently truncated or
 * accepted as a negative size.
 *
 * @param res - Resolution string.
 * @returns `[width, height]` as non-negative integers.
 * @throws {Error} When the string is not two digit groups separated by `x`.
 */
const parseResolution = (res) => {
  const parts = res.split(/x/i);
  if (parts.length !== 2) throw new Error(`Invalid resolution format: ${res}. Use WxH, e.g. 512x512`);
  const [w, h] = parts.map((part) => {
    const digits = part.trim();
    // parseInt alone would accept "512px" and "-5".
    return /^\d+$/.test(digits) ? Number.parseInt(digits, 10) : Number.NaN;
  });
  if (Number.isNaN(w) || Number.isNaN(h)) throw new Error(`Invalid resolution: ${res}`);
  return [w, h];
};
|
|
1080
|
+
/**
 * Parse the `--delay` value into seconds.
 *
 * Falls back to the default of 0.2 when the input does not parse to a finite,
 * non-negative number. Unparseable text, negative values and `Infinity` all
 * take the fallback, so a nonsensical duration never reaches `sleep`.
 *
 * @param delay - Delay as entered on the command line.
 * @returns Delay in seconds (>= 0).
 */
const parseDelay = (delay) => {
  const seconds = Number.parseFloat(delay);
  return Number.isFinite(seconds) && seconds >= 0 ? seconds : .2;
};
|
|
1085
|
+
/**
 * Read non-empty, trimmed lines from a file, or from stdin when the path is `-`.
 *
 * @param filePath - File to read as UTF-8, or `-` for standard input.
 * @returns The trimmed lines, with blank lines removed.
 */
const readLines = async (filePath) => {
  let text;
  if (filePath === "-") {
    const pieces = [];
    for await (const piece of process.stdin) pieces.push(String(piece));
    text = pieces.join("");
  } else {
    text = await fs.readFile(filePath, "utf-8");
  }
  // Trimming also strips a trailing "\r" from CRLF input.
  return text.split("\n").map((line) => line.trim()).filter(Boolean);
};
|
|
1099
|
+
/**
 * Report whether a cookie list looks authenticated.
 *
 * Only the first cookie named `_auth` is considered; it must have the exact
 * string value "1".
 *
 * @param cookies - Array of cookie objects with `name` and `value`.
 * @returns `true` when the first `_auth` cookie has value "1".
 */
const checkCookiesAuthenticated = (cookies) => {
  const authCookie = cookies.find(({ name }) => name === "_auth");
  if (authCookie === undefined) return false;
  return authCookie.value === "1";
};
|
|
1103
|
+
|
|
1104
|
+
//#endregion
|
|
1105
|
+
//#region src/cli/commands/scrape.ts
|
|
1106
|
+
/**
 * Scrape one URL and then print, cache and/or download the results, depending
 * on the options.
 *
 * Flow:
 * 1. Scrape the URL with the CLI options mapped onto scraper options.
 * 2. `--dry-run`: print every source URL, optionally save the cache file, stop.
 * 3. Optionally write the cache file.
 * 4. Without `--output`: print a JSON summary (unless a cache file was
 *    written) and stop.
 * 5. Otherwise download the media, write captions, and note any shortfall
 *    against the requested count.
 *
 * An invalid `--min-res` value is surfaced as an ordinary Effect failure so
 * the caller's `Effect.catchAll` can report it. A plain `throw` inside
 * `Effect.gen` would become a defect, which `catchAll` does not handle.
 *
 * @param url - URL to scrape.
 * @param opts - Parsed command options.
 * @param session - Session forwarded to `scrapeUrl`.
 * @returns Effect that completes when the URL has been fully processed.
 */
const processUrl = (url, opts, session) => Effect.gen(function* () {
  let minRes = [0, 0];
  if (Option.isSome(opts.resolution)) {
    const requested = opts.resolution.value;
    minRes = yield* Effect.try({
      try: () => parseResolution(requested),
      catch: (e) => e instanceof Error ? e : new Error(String(e))
    });
  }
  // A plain pin URL without --related yields a single item, so progress
  // output and the shortfall note are suppressed for it.
  const singlePinOnly = /pin\/\d+/.test(url) && !opts.related;
  yield* Console.log(`Scraping ${url}...`);
  const media = yield* scrapeUrl(session, url, {
    num: opts.num,
    minResolution: minRes,
    delay: parseDelay(opts.delay),
    captionFromTitle: opts.capFromTitle,
    ensureAlt: opts.ensureCap,
    // The CLI takes seconds; the scraper receives milliseconds.
    timeout: opts.timeout * 1e3,
    verbose: opts.verbose,
    dump: Option.getOrUndefined(opts.dump),
    related: opts.related,
    onProgress: singlePinOnly ? void 0 : (collected, total) => {
      process.stderr.write(`\r Scraped ${collected}/${total}...`);
    }
  });
  // Finish the carriage-return progress line.
  if (!singlePinOnly) process.stderr.write("\n");
  if (opts.dryRun) {
    for (const m of media) yield* Console.log(m.src);
    yield* Console.log(`Found ${media.length} item(s).`);
    if (Option.isSome(opts.cache)) {
      yield* writeCacheFile(media, opts.cache.value);
      yield* Console.log(`Saved URLs → ${opts.cache.value}`);
    }
    return;
  }
  if (opts.verbose) for (const m of media) yield* Console.log(` ${m.src}`);
  if (Option.isSome(opts.cache)) {
    yield* writeCacheFile(media, opts.cache.value);
    yield* Console.log(`Cached ${media.length} items → ${opts.cache.value}`);
  }
  if (Option.isNone(opts.output)) {
    // With neither an output directory nor a cache file, print the results.
    if (Option.isNone(opts.cache)) yield* Console.log(JSON.stringify(media.map((m) => ({
      src: m.src,
      alt: m.alt,
      origin: m.origin
    })), null, 2));
    return;
  }
  const outputDir = opts.output.value;
  const downloaded = yield* downloadMedia(media, outputDir, opts.video, opts.skipRemux, opts.reencode, 8, opts.skipExisting, opts.filename);
  // Unrecognized caption modes are treated as "none".
  const captionMode = isCaptionMode(opts.caption) ? opts.caption : "none";
  if (captionMode !== "none") yield* applyCaption(downloaded, outputDir, captionMode);
  if (!singlePinOnly && downloaded.length !== opts.num) yield* Console.log(`Note: Got ${downloaded.length} items (requested ${opts.num}). Some may have been filtered, duplicated, or unavailable.`);
});
|
|
1152
|
+
/**
 * `scrape` command: scrape one or more Pinterest URLs.
 *
 * URLs come from positional arguments and/or `--file`; lines from the file
 * are placed before the arguments. When `--output` is given without
 * `--save-urls`, results are also cached to `<output>.json`. `--section`
 * appends a slugified section name to every URL. Errors for a single URL are
 * reported and the remaining URLs are still processed.
 */
const scrapeCommand = Command.make("scrape", {
  urls: Args.between(Args.text({ name: "url" }), 0, 999),
  output: outputOption,
  file: fileOption,
  cookies: cookiesOption,
  num: numOption,
  resolution: resolutionOption,
  video: videoOption,
  skipRemux: skipRemuxOption,
  reencode: reencodeOption,
  timeout: timeoutOption,
  delay: delayOption,
  cache: cacheOption,
  caption: captionOption,
  ensureCap: ensureCapOption,
  capFromTitle: capFromTitleOption,
  related: relatedOption,
  section: sectionOption,
  filename: filenameOption,
  verbose: verboseOption,
  dump: dumpOption,
  logFile: logFileOption,
  dryRun: dryRunOption,
  skipExisting: skipExistingOption
}, (opts) => Effect.gen(function* () {
  if (Option.isSome(opts.logFile)) setupLogFile(opts.logFile.value);
  // Auto-save next to the output directory unless the user chose a cache path.
  const autoSavePath = Option.isNone(opts.cache) && Option.isSome(opts.output) ? `${opts.output.value}.json` : null;
  const effectiveOpts = autoSavePath === null ? opts : {
    ...opts,
    cache: Option.some(autoSavePath)
  };
  process.once("SIGINT", () => {
    const notice = autoSavePath
      ? `\nInterrupted. If URLs were already collected, they were saved to '${autoSavePath}'.\nResume: pin-dl download ${autoSavePath} -o ${Option.getOrElse(opts.output, () => ".")} --skip-existing\n`
      : "\nInterrupted. Re-run the same command with --skip-existing to continue where you left off.\n";
    process.stderr.write(notice);
    process.exit(130);
  });
  let targets = [...opts.urls];
  if (Option.isSome(opts.file)) {
    const listPath = opts.file.value;
    const fromFile = yield* Effect.tryPromise({
      try: () => readLines(listPath),
      catch: (e) => /* @__PURE__ */ new Error(`Failed to read file: ${String(e)}`)
    });
    // File entries come first, followed by the positional arguments.
    targets = [...fromFile, ...targets];
  }
  if (targets.length === 0) {
    yield* Console.error("Error: No URLs provided. Pass URLs as arguments or use --file.");
    return;
  }
  if (Option.isSome(opts.section)) {
    const slug = opts.section.value
      .toLowerCase()
      .replace(/\s+/g, "-")
      .replace(/[^a-z0-9-]/g, "")
      .replace(/-+/g, "-")
      .replace(/^-|-$/g, "");
    targets = targets.map((target) => {
      const withSlash = target.endsWith("/") ? target : target + "/";
      return `${withSlash}${slug}/`;
    });
  }
  let session;
  if (Option.isNone(opts.cookies)) session = yield* makeSession();
  else {
    const cookiesPath = opts.cookies.value;
    const savedCookies = yield* readCookiesFile(cookiesPath);
    if (!checkCookiesAuthenticated(savedCookies)) yield* Console.log(`[WARNING] Cookies in '${cookiesPath}' are NOT authenticated (_auth != 1). Private boards may fail. Run: pin-dl login -o cookies.json`);
    session = yield* makeSession(savedCookies);
  }
  for (const target of targets) {
    // A failure for one URL is reported and does not stop the remaining ones.
    const reportFailure = (e) => Console.error(`Error scraping ${target}: ${e.message ?? String(e)}`);
    yield* processUrl(target, effectiveOpts, session).pipe(Effect.catchAll(reportFailure));
  }
  yield* Console.log("\nDone.");
}).pipe(Effect.catchAll((e) => Console.error(`Fatal error: ${e.message ?? String(e)}`))));
|
|
1216
|
+
|
|
1217
|
+
//#endregion
|
|
1218
|
+
//#region src/cli/commands/search.ts
|
|
1219
|
+
/**
 * `search` subcommand.
 *
 * For every query (positional arguments plus, when `--file` is given, the
 * lines read from that file) this scrapes matching media via `scrapeQuery`
 * and then, depending on the options:
 *   - dry run: prints each `src`, the item count, and writes the cache file
 *     when a cache path is in effect;
 *   - no output directory: writes the cache file if one is in effect,
 *     otherwise prints `{src, alt, origin}` records as JSON;
 *   - output directory: downloads the media and applies captions.
 *
 * When no cache option is given but an output directory is, the cache path
 * defaults to `<output>.json`.
 *
 * A failure while scraping one query is reported on stderr and treated as an
 * empty result, so the remaining queries still run. Any other failure is
 * reported as a "Fatal error".
 */
const searchCommand = Command.make("search", {
  queries: Args.between(Args.text({ name: "query" }), 0, 999),
  output: outputOption,
  file: fileOption,
  cookies: cookiesOption,
  num: numOption,
  resolution: resolutionOption,
  video: videoOption,
  skipRemux: skipRemuxOption,
  reencode: reencodeOption,
  timeout: timeoutOption,
  delay: delayOption,
  cache: cacheOption,
  caption: captionOption,
  ensureCap: ensureCapOption,
  capFromTitle: capFromTitleOption,
  verbose: verboseOption,
  dump: dumpOption,
  logFile: logFileOption,
  dryRun: dryRunOption,
  skipExisting: skipExistingOption,
  filename: filenameOption
}, (opts) => Effect.gen(function* () {
  if (Option.isSome(opts.logFile)) setupLogFile(opts.logFile.value);

  // Without an explicit cache option, results are auto-saved next to the output directory.
  const autoSavePath = Option.isNone(opts.cache) && Option.isSome(opts.output) ? `${opts.output.value}.json` : null;
  const effectiveCache = autoSavePath !== null ? Option.some(autoSavePath) : opts.cache;

  // On Ctrl-C, tell the user how to resume, then exit with status 130.
  process.once("SIGINT", () => {
    if (autoSavePath) process.stderr.write(`\nInterrupted. If results were already scraped, they were saved to '${autoSavePath}'.\nResume: pin-dl download ${autoSavePath} -o ${Option.getOrElse(opts.output, () => ".")} --skip-existing\n`);
    else process.stderr.write("\nInterrupted. Re-run the same command with --skip-existing to continue where you left off.\n");
    process.exit(130);
  });

  // Queries read from the file come first, followed by the positional ones.
  let queries = [...opts.queries];
  if (Option.isSome(opts.file)) {
    const filePath = opts.file.value;
    const fileQueries = yield* Effect.tryPromise({
      try: () => readLines(filePath),
      catch: (e) => new Error(`Failed to read file: ${String(e)}`)
    });
    queries = [...fileQueries, ...queries];
  }
  if (queries.length === 0) {
    yield* Console.error("Error: No queries provided. Pass queries as arguments or use --file.");
    return;
  }

  // Build the session, seeded with saved cookies when a cookies file is given.
  let session;
  if (Option.isSome(opts.cookies)) {
    const cookiesPath = opts.cookies.value;
    const savedCookies = yield* readCookiesFile(cookiesPath);
    if (!checkCookiesAuthenticated(savedCookies)) yield* Console.log(`[WARNING] Cookies in '${cookiesPath}' may not be authenticated.`);
    session = yield* makeSession(savedCookies);
  } else session = yield* makeSession();

  const minRes = Option.isSome(opts.resolution) ? parseResolution(opts.resolution.value) : [0, 0];

  for (const query of queries) {
    yield* Console.log(`Searching '${query}'...`);
    const media = yield* scrapeQuery(session, query, {
      num: opts.num,
      minResolution: minRes,
      delay: parseDelay(opts.delay),
      captionFromTitle: opts.capFromTitle,
      ensureAlt: opts.ensureCap,
      // presumably seconds -> milliseconds; confirm against timeoutOption
      timeout: opts.timeout * 1e3,
      verbose: opts.verbose,
      dump: Option.getOrUndefined(opts.dump),
      onProgress: (collected, total) => {
        // "\r" keeps the progress counter on a single terminal line.
        process.stderr.write(`\r  Scraped ${collected}/${total}...`);
      }
    }).pipe(Effect.catchAll((e) => {
      // Terminate the in-place progress line before printing the error.
      process.stderr.write("\n");
      // Effects are lazy: the logging effect must be part of the returned
      // effect, otherwise the message is never printed.
      return Console.error(`Error searching '${query}': ${e.message ?? String(e)}`).pipe(Effect.as([]));
    }));
    process.stderr.write("\n");

    if (opts.verbose) for (const m of media) yield* Console.log(`  ${m.src}`);

    if (opts.dryRun) {
      for (const m of media) yield* Console.log(m.src);
      yield* Console.log(`Found ${media.length} item(s).`);
      if (Option.isSome(effectiveCache)) {
        yield* writeCacheFile(media, effectiveCache.value);
        yield* Console.log(`Saved URLs → ${effectiveCache.value}`);
      }
      continue;
    }

    // NOTE(review): the same cache path is written once per query; if
    // writeCacheFile overwrites, only the last query's results survive. Confirm.
    if (Option.isSome(effectiveCache)) {
      yield* writeCacheFile(media, effectiveCache.value);
      yield* Console.log(`Cached ${media.length} items → ${effectiveCache.value}`);
    }

    if (Option.isNone(opts.output)) {
      // Nowhere to download to: print JSON unless the results went to a cache file.
      if (Option.isNone(effectiveCache)) yield* Console.log(JSON.stringify(media.map((m) => ({
        src: m.src,
        alt: m.alt,
        origin: m.origin
      })), null, 2));
      continue;
    }

    const outputDir = opts.output.value;
    const downloaded = yield* downloadMedia(media, outputDir, opts.video, opts.skipRemux, opts.reencode, 8, opts.skipExisting, opts.filename);
    // Unrecognised caption modes fall back to "none".
    const captionMode = isCaptionMode(opts.caption) ? opts.caption : "none";
    if (captionMode !== "none") yield* applyCaption(downloaded, outputDir, captionMode);
    if (downloaded.length !== opts.num) yield* Console.log(`Note: Got ${downloaded.length} items (requested ${opts.num}).`);
  }
  yield* Console.log("\nDone.");
}).pipe(Effect.catchAll((e) => Console.error(`Fatal error: ${e.message ?? String(e)}`))));
|
|
1320
|
+
|
|
1321
|
+
//#endregion
|
|
1322
|
+
//#region src/cli/commands/download.ts
|
|
1323
|
+
/**
 * `download` subcommand.
 *
 * Reads a previously saved media list from the `input` file, optionally
 * narrows it by minimum resolution and by `filterWithAlt`, downloads the
 * remaining items and applies captions.
 *
 * When no output directory is given, the input file's base name without its
 * extension is used. An `input` that looks like an http(s) URL is rejected
 * with a usage hint. Any failure is reported on stderr as "Error: ...".
 */
const downloadCommand = Command.make("download", {
  input: Args.text({ name: "input" }),
  output: outputOption,
  resolution: resolutionOption,
  video: videoOption,
  skipRemux: skipRemuxOption,
  reencode: reencodeOption,
  caption: captionOption,
  ensureCap: ensureCapOption,
  verbose: verboseOption,
  logFile: logFileOption,
  skipExisting: skipExistingOption,
  filename: filenameOption
}, (opts) => {
  const program = Effect.gen(function* () {
    if (Option.isSome(opts.logFile)) {
      setupLogFile(opts.logFile.value);
    }

    // Guard: this command consumes a saved list, not a live URL.
    const { input } = opts;
    const looksLikeUrl = input.startsWith("http://") || input.startsWith("https://");
    if (looksLikeUrl) {
      yield* Console.error(`Error: 'download' expects a saved URLs file (from --save-urls), not a URL.\nTo scrape and download directly, run:\npin-dl ${opts.input} -o <dir>`);
      return;
    }

    let items = yield* readCacheFile(input);

    // Keep only items at least as large as the requested resolution;
    // missing dimensions count as 0.
    if (Option.isSome(opts.resolution)) {
      const [minWidth, minHeight] = parseResolution(opts.resolution.value);
      items = items.filter((item) => {
        const wideEnough = (item.resolution.x ?? 0) >= minWidth;
        return wideEnough && (item.resolution.y ?? 0) >= minHeight;
      });
    }
    if (opts.ensureCap) {
      items = filterWithAlt(items);
    }

    // Default target directory: the input file name minus its extension.
    const targetDir = Option.getOrElse(opts.output, () => {
      const extension = path.extname(opts.input);
      return path.basename(opts.input, extension);
    });
    yield* Console.log(`Downloading ${items.length} items to '${targetDir}'...`);

    const fetched = yield* downloadMedia(items, targetDir, opts.video, opts.skipRemux, opts.reencode, 8, opts.skipExisting, opts.filename);

    // Unrecognised caption modes fall back to "none".
    const mode = isCaptionMode(opts.caption) ? opts.caption : "none";
    if (mode !== "none") {
      yield* applyCaption(fetched, targetDir, mode);
    }

    if (opts.verbose) {
      for (const item of fetched) {
        if (!item.localPath) continue;
        yield* Console.log(`  ${item.localPath}`);
      }
    }
    yield* Console.log(`\nDownloaded ${fetched.length} items.\nDone.`);
  });

  return program.pipe(Effect.catchAll((e) => Console.error(`Error: ${e.message ?? String(e)}`)));
});
|
|
1360
|
+
|
|
1361
|
+
//#endregion
|
|
1362
|
+
//#region src/cli/dispatch.ts
|
|
1363
|
+
// Subcommand names the CLI recognises; used to decide whether one must be injected.
const knownSubcommands = new Set(["login", "scrape", "search", "download"]);
|
|
1369
|
+
/**
 * Rewrite convenience spellings in raw argv before subcommand dispatch.
 *
 * Rewrites applied, in order:
 * 1. the first `--stdin` becomes the two tokens `--file -`;
 * 2. every `--resume` or `--continue` becomes `--skip-existing`;
 * 3. a leading `get` becomes `scrape`, and a leading `similar` becomes
 *    `scrape --related`.
 *
 * @param {string[]} args - Arguments after the node binary and script path.
 * @returns {string[]} A new array; the input is not mutated.
 */
const preprocessArgv = (args) => {
  const expanded = [...args];
  const stdinAt = expanded.indexOf("--stdin");
  if (stdinAt >= 0) expanded.splice(stdinAt, 1, "--file", "-");

  const resumeAliases = ["--resume", "--continue"];
  const normalised = expanded.map((token) => resumeAliases.includes(token) ? "--skip-existing" : token);

  // Only the very first token is treated as a subcommand alias.
  switch (normalised[0]) {
    case "get":
      return ["scrape", ...normalised.slice(1)];
    case "similar":
      return ["scrape", "--related", ...normalised.slice(1)];
    default:
      return normalised;
  }
};
|
|
1395
|
+
/**
 * Decide which subcommand, if any, should be injected into preprocessed args.
 *
 * @param {string[]} args - Arguments after preprocessing.
 * @param {Set<string>} [subcommands] - Recognised subcommand names.
 * @returns {"scrape" | "search" | null}
 *   - `null` when any argument is already a subcommand name;
 *   - `"scrape"` when any argument starts with `http://` or `https://`, or is
 *     `-f` / `--file`;
 *   - `"search"` when the first argument exists and does not start with `-`;
 *   - `null` otherwise.
 */
const resolveDispatch = (args, subcommands = knownSubcommands) => {
  for (const arg of args) {
    if (subcommands.has(arg)) return null;
  }

  const wantsScrape = (arg) => /^https?:\/\//.test(arg) || arg === "-f" || arg === "--file";
  if (args.some(wantsScrape)) return "scrape";

  // No argument is a subcommand at this point, so a bare leading word is a search query.
  const leading = args[0];
  const isBareWord = leading !== undefined && !leading.startsWith("-");
  return isBareWord ? "search" : null;
};
|
|
1407
|
+
|
|
1408
|
+
//#endregion
|
|
1409
|
+
//#region src/index.ts
|
|
1410
|
+
// Entry point: normalise argv, inject an implicit subcommand when needed,
// then hand the final argv to the CLI runner.
const invokedArgs = process.argv.slice(2);
const preprocessed = preprocessArgv(invokedArgs);

// Replace process.argv only when preprocessing actually changed something.
const argvWasRewritten = preprocessed.length !== invokedArgs.length || preprocessed.some((value, index) => value !== invokedArgs[index]);
if (argvWasRewritten) {
  process.argv = [...process.argv.slice(0, 2), ...preprocessed];
}

const rawArgs = process.argv.slice(2);
const dispatched = resolveDispatch(rawArgs);
if (dispatched === "search") {
  // Everything before the first flag is joined into a single query string.
  const firstFlagAt = rawArgs.findIndex((a) => a.startsWith("-"));
  const splitAt = firstFlagAt === -1 ? rawArgs.length : firstFlagAt;
  const query = rawArgs.slice(0, splitAt).join(" ");
  const rest = rawArgs.slice(splitAt);
  process.argv = [...process.argv.slice(0, 2), "search", query, ...rest];
} else if (dispatched === "scrape") {
  process.argv = [...process.argv.slice(0, 2), "scrape", ...rawArgs];
}

const root = Command.make("pin-dl").pipe(Command.withSubcommands([
  loginCommand,
  scrapeCommand,
  searchCommand,
  downloadCommand
]));
const cli = Command.run(root, {
  name: "pin-dl",
  version: "0.1.0"
});

// Effect.suspend defers reading process.argv until the program actually runs.
NodeRuntime.runMain(Effect.suspend(() => cli(process.argv)).pipe(Effect.provide(NodeContext.layer)));
|
|
1440
|
+
|
|
1441
|
+
//#endregion
|
|
1442
|
+
export { };
|