rubycrawl 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +167 -432
- data/lib/rubycrawl/browser/extraction.rb +106 -0
- data/lib/rubycrawl/browser.rb +106 -0
- data/lib/rubycrawl/errors.rb +1 -1
- data/lib/rubycrawl/helpers.rb +8 -44
- data/lib/rubycrawl/markdown_converter.rb +2 -2
- data/lib/rubycrawl/result.rb +49 -18
- data/lib/rubycrawl/site_crawler.rb +40 -22
- data/lib/rubycrawl/tasks/install.rake +17 -56
- data/lib/rubycrawl/url_normalizer.rb +5 -1
- data/lib/rubycrawl/version.rb +1 -1
- data/lib/rubycrawl.rb +35 -90
- data/rubycrawl.gemspec +3 -4
- metadata +19 -10
- data/lib/rubycrawl/service_client.rb +0 -108
- data/node/.gitignore +0 -2
- data/node/.npmrc +0 -1
- data/node/README.md +0 -19
- data/node/package-lock.json +0 -72
- data/node/package.json +0 -14
- data/node/src/index.js +0 -389
data/node/src/index.js
DELETED
|
@@ -1,389 +0,0 @@
|
|
|
1
|
-
import "dotenv/config";
|
|
2
|
-
import http from "node:http";
|
|
3
|
-
import crypto from "node:crypto";
|
|
4
|
-
import { chromium } from "playwright";
|
|
5
|
-
|
|
6
|
-
// Bind only to loopback: this service is a private sidecar for the Ruby gem.
const HOST = "127.0.0.1";
// RUBYCRAWL_NODE_PORT arrives as a string from the environment; parse it to a
// number so a missing or non-numeric value falls back to 3344 instead of
// making server.listen() throw at startup. An explicit numeric value
// (including 0 = ephemeral port) is preserved.
const envPort = Number.parseInt(process.env.RUBYCRAWL_NODE_PORT ?? "", 10);
const PORT = Number.isNaN(envPort) ? 3344 : envPort;
// Whether /crawl aborts heavy resource requests by default; each request can
// override this via its block_resources field.
const DEFAULT_BLOCK_RESOURCES = true;
// Playwright resource types aborted when blocking is enabled.
const BLOCKED_RESOURCE_TYPES = new Set([
  "image",
  "media",
  "font",
  "stylesheet",
]);
|
|
15
|
-
|
|
16
|
-
/**
 * Serialize `body` as JSON and finish the response with the given status.
 * Sets content-type and an exact content-length (byte length, not character
 * count) so clients can rely on the framing.
 */
function json(res, statusCode, body) {
  const encoded = JSON.stringify(body);
  const headers = {
    "content-type": "application/json",
    "content-length": Buffer.byteLength(encoded),
  };
  res.writeHead(statusCode, headers);
  res.end(encoded);
}
|
|
24
|
-
|
|
25
|
-
// Hard cap on accepted request bodies; larger payloads abort the request.
const MAX_BODY_SIZE = 1 * 1024 * 1024; // 1 MB

/**
 * Read and JSON-parse the request body.
 *
 * Buffers raw chunks and decodes them once at the end: the previous
 * implementation did `data += chunk`, which stringifies each Buffer
 * independently and corrupts any multi-byte UTF-8 character that straddles
 * a chunk boundary.
 *
 * Resolves with {} for an empty body. Rejects on an oversized body (the
 * request is destroyed and the later "end" event ignored), invalid JSON
 * (SyntaxError), or a stream error.
 */
function readJson(req) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let size = 0;
    let aborted = false;

    req.on("data", (chunk) => {
      size += chunk.length;
      if (size > MAX_BODY_SIZE) {
        aborted = true;
        reject(new Error("Request body too large"));
        req.destroy();
        return;
      }
      chunks.push(chunk);
    });

    req.on("end", () => {
      // Guard against a stray "end" after the oversized-body rejection.
      if (aborted) return;
      const data = Buffer.concat(chunks).toString("utf8");
      if (!data) return resolve({});
      try {
        resolve(JSON.parse(data));
      } catch (error) {
        reject(error);
      }
    });

    req.on("error", reject);
  });
}
|
|
51
|
-
|
|
52
|
-
/**
 * Validate a /crawl request body: `url` must be a non-empty, non-blank
 * string. Returns { ok: true } on success and { ok: false, error } on
 * failure.
 */
function validateRequest(body) {
  const url = body?.url;
  const hasUrl = typeof url === "string" && url.trim() !== "";
  return hasUrl ? { ok: true } : { ok: false, error: "url is required" };
}
|
|
58
|
-
|
|
59
|
-
// Shared Chromium instance, launched lazily by getBrowser(); null until the
// first crawl or session needs it.
let browser = null;

// Session storage: session_id -> { context, createdAt, lastUsedAt }
const sessions = new Map();

// Session TTL: 30 minutes of inactivity
const SESSION_TTL_MS = 30 * 60 * 1000;
// Cleanup interval: every 5 minutes
const CLEANUP_INTERVAL_MS = 5 * 60 * 1000;
|
|
68
|
-
|
|
69
|
-
/**
 * Build an opaque session identifier: "sess_" followed by 32 hex characters
 * (16 cryptographically random bytes).
 */
function generateSessionId() {
  const token = crypto.randomBytes(16).toString("hex");
  return "sess_".concat(token);
}
|
|
72
|
-
|
|
73
|
-
/**
 * Return the shared Chromium instance, launching a headless one on first
 * use or after the previous instance has disconnected.
 */
async function getBrowser() {
  const alive = browser !== null && browser.isConnected();
  if (!alive) {
    browser = await chromium.launch({ headless: true });
  }
  return browser;
}
|
|
78
|
-
|
|
79
|
-
/**
 * Create a fresh, isolated browser context on the shared browser instance.
 */
async function createContext() {
  const instance = await getBrowser();
  return instance.newContext();
}
|
|
86
|
-
|
|
87
|
-
/**
 * Resolve the browser context to use for a crawl.
 *
 * - Known session_id: reuse its context and bump lastUsedAt.
 * - Unknown session_id (expired or destroyed): transparently recreate the
 *   session under the same id so job retries keep working.
 * - No session_id: hand back a throwaway context the caller must close.
 *
 * Returns { context, isSession }.
 */
async function getContext(sessionId) {
  if (!sessionId) {
    return { context: await createContext(), isSession: false };
  }

  const existing = sessions.get(sessionId);
  if (existing) {
    existing.lastUsedAt = Date.now();
    return { context: existing.context, isSession: true };
  }

  // Session id was provided but no longer exists: recreate it in place.
  const context = await createContext();
  const now = Date.now();
  sessions.set(sessionId, { context, createdAt: now, lastUsedAt: now });
  // eslint-disable-next-line no-console
  console.log(
    `[rubycrawl] session recreated ${sessionId} (was expired or destroyed)`,
  );
  return { context, isSession: true };
}
|
|
116
|
-
|
|
117
|
-
/**
 * Close and forget sessions idle for longer than SESSION_TTL_MS.
 * Runs on a timer; context-close failures are deliberately ignored.
 */
async function cleanupExpiredSessions() {
  const cutoff = Date.now() - SESSION_TTL_MS;
  const expiredIds = [...sessions]
    .filter(([, session]) => session.lastUsedAt < cutoff)
    .map(([sessionId]) => sessionId);

  for (const sessionId of expiredIds) {
    const session = sessions.get(sessionId);
    await session.context.close().catch(() => {});
    sessions.delete(sessionId);
    // eslint-disable-next-line no-console
    console.log(
      `[rubycrawl] session expired ${sessionId} (inactive for ${SESSION_TTL_MS / 60000} min)`,
    );
  }

  if (expiredIds.length > 0) {
    // eslint-disable-next-line no-console
    console.log(
      `[rubycrawl] cleanup: ${expiredIds.length} expired, ${sessions.size} active`,
    );
  }
}
|
|
147
|
-
|
|
148
|
-
// Start cleanup interval. cleanupExpiredSessions is async, so its rejection
// must be caught here: an unhandled promise rejection terminates modern Node
// processes, which would kill the whole service on a transient cleanup error.
setInterval(() => {
  cleanupExpiredSessions().catch((error) => {
    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] cleanup error ${error?.message || ""}`);
  });
}, CLEANUP_INTERVAL_MS);
|
|
150
|
-
|
|
151
|
-
/**
 * Collect document metadata inside the page: title, standard/OpenGraph/
 * Twitter meta tags, canonical link, document language, and charset.
 * Every missing or empty value is reported as null.
 */
async function extractMetadata(page) {
  return page.evaluate(() => {
    function readMeta(name) {
      const node = document.querySelector(
        `meta[name="${name}"], meta[property="${name}"]`,
      );
      return (node && node.getAttribute("content")) || null;
    }

    function readLink(rel) {
      const node = document.querySelector(`link[rel="${rel}"]`);
      return (node && node.getAttribute("href")) || null;
    }

    return {
      title: document.title || null,
      description: readMeta("description") || readMeta("og:description") || null,
      keywords: readMeta("keywords"),
      author: readMeta("author"),
      og_title: readMeta("og:title"),
      og_description: readMeta("og:description"),
      og_image: readMeta("og:image"),
      og_url: readMeta("og:url"),
      og_type: readMeta("og:type"),
      twitter_card: readMeta("twitter:card"),
      twitter_title: readMeta("twitter:title"),
      twitter_description: readMeta("twitter:description"),
      twitter_image: readMeta("twitter:image"),
      canonical: readLink("canonical"),
      lang: document.documentElement.lang || null,
      charset: document.characterSet || null,
    };
  });
}
|
|
188
|
-
|
|
189
|
-
/**
 * Collect every anchor with an href: resolved absolute URL, trimmed link
 * text, and the title/rel attributes (null when absent).
 */
async function extractLinks(page) {
  return page.evaluate(() =>
    Array.from(document.querySelectorAll("a[href]"), (anchor) => ({
      url: anchor.href,
      text: (anchor.textContent || "").trim(),
      title: anchor.getAttribute("title") || null,
      rel: anchor.getAttribute("rel") || null,
    })),
  );
}
|
|
203
|
-
|
|
204
|
-
/**
 * Return the rendered text of the page body (innerText), trimmed; yields an
 * empty string when the document has no body.
 */
async function extractText(page) {
  return page.evaluate(() => {
    const body = document.body;
    if (!body) return "";
    return (body.innerText || "").trim();
  });
}
|
|
210
|
-
|
|
211
|
-
/**
 * POST /crawl — navigate to body.url in a Playwright page and respond with
 * the rendered HTML, plain text, links, and metadata as JSON.
 *
 * Body fields used: url (required), wait_until (Playwright waitUntil value,
 * default "load"), block_resources (boolean, default
 * DEFAULT_BLOCK_RESOURCES), session_id (optional; reuses or recreates a
 * persistent context via getContext).
 *
 * Responds 422 when url is missing; 400 with { error, message } on any
 * other failure ("invalid_json" for a malformed body, "crawl_failed"
 * otherwise).
 */
async function handleCrawl(req, res) {
  // Tracked outside try so the outer finally can dispose the context.
  let context = null;
  let isSession = false;

  try {
    const body = await readJson(req);
    const validation = validateRequest(body);
    if (!validation.ok) {
      return json(res, 422, { error: validation.error });
    }

    const waitUntil = body.wait_until || "load";
    const blockResources =
      typeof body.block_resources === "boolean"
        ? body.block_resources
        : DEFAULT_BLOCK_RESOURCES;

    const start = Date.now();
    // eslint-disable-next-line no-console
    console.log(
      `[rubycrawl] crawl start ${body.url}${body.session_id ? ` (session=${body.session_id})` : ""}`,
    );

    // Get context (reuse if session_id provided)
    const ctxResult = await getContext(body.session_id);
    context = ctxResult.context;
    isSession = ctxResult.isSession;

    const page = await context.newPage();

    try {
      if (blockResources) {
        // Abort heavy resource requests (types in BLOCKED_RESOURCE_TYPES)
        // to speed up navigation; everything else passes through.
        await page.route("**/*", (route) => {
          const type = route.request().resourceType();
          if (BLOCKED_RESOURCE_TYPES.has(type)) {
            return route.abort();
          }
          return route.continue();
        });
      }

      const response = await page.goto(body.url, {
        waitUntil,
        timeout: 30_000,
      });

      const html = await page.content();
      // page.url() reflects the post-redirect location, not body.url.
      const finalUrl = page.url();
      // response can be null (e.g. navigation to about:blank).
      const status = response ? response.status() : null;
      const htmlMetadata = await extractMetadata(page);
      const links = await extractLinks(page);
      const text = await extractText(page);

      // eslint-disable-next-line no-console
      console.log(
        `[rubycrawl] crawl done ${body.url} status=${status} ms=${Date.now() - start}`,
      );

      return json(res, 200, {
        ok: true,
        url: body.url,
        html,
        text,
        links,
        metadata: {
          status,
          final_url: finalUrl,
          ...htmlMetadata,
        },
      });
    } finally {
      // Always close the page, even when navigation/extraction throws.
      await page.close();
    }
  } catch (error) {
    const code =
      error?.name === "SyntaxError" ? "invalid_json" : "crawl_failed";
    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] crawl error ${code} ${error?.message || ""}`);
    return json(res, 400, { error: code, message: error?.message });
  } finally {
    // Only close context if not a session (sessions are managed separately)
    if (context && !isSession) {
      await context.close().catch(() => {});
    }
  }
}
|
|
297
|
-
|
|
298
|
-
/**
 * POST /session/create — allocate a persistent browser context, register it
 * in the session table, and return its generated session_id. Responds 400
 * with session_create_failed when the context cannot be created.
 */
async function handleSessionCreate(req, res) {
  try {
    const sessionId = generateSessionId();
    const context = await createContext();
    const createdAt = Date.now();
    sessions.set(sessionId, {
      context,
      createdAt,
      lastUsedAt: createdAt,
    });

    // eslint-disable-next-line no-console
    console.log(
      `[rubycrawl] session created ${sessionId} (active=${sessions.size})`,
    );

    return json(res, 200, { ok: true, session_id: sessionId });
  } catch (error) {
    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] session create error ${error?.message || ""}`);
    return json(res, 400, {
      error: "session_create_failed",
      message: error?.message,
    });
  }
}
|
|
323
|
-
|
|
324
|
-
/**
 * POST /session/destroy — close a session's browser context and forget it.
 * Idempotent: an unknown or already-expired session_id still answers ok so
 * job retries succeed. Responds 422 when session_id is missing.
 */
async function handleSessionDestroy(req, res) {
  try {
    const { session_id: sessionId } = await readJson(req);

    if (!sessionId) {
      return json(res, 422, { error: "session_id required" });
    }

    const session = sessions.get(sessionId);
    if (!session) {
      // Idempotent: if session doesn't exist, still return success
      return json(res, 200, {
        ok: true,
        message: "session already destroyed or expired",
      });
    }

    await session.context.close().catch(() => {});
    sessions.delete(sessionId);

    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] session destroyed ${sessionId}`);

    return json(res, 200, { ok: true });
  } catch (error) {
    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] session destroy error ${error?.message || ""}`);
    return json(res, 400, {
      error: "session_destroy_failed",
      message: error?.message,
    });
  }
}
|
|
362
|
-
|
|
363
|
-
// HTTP front door: dispatch by "METHOD url" via a route table; anything
// unmatched gets a 404. Keys always start with "METHOD ", so user-supplied
// URLs cannot collide with Object.prototype property names.
const server = http.createServer((req, res) => {
  // eslint-disable-next-line no-console
  console.log(`[rubycrawl] request ${req.method} ${req.url}`);

  const routes = {
    "POST /crawl": handleCrawl,
    "POST /session/create": handleSessionCreate,
    "POST /session/destroy": handleSessionDestroy,
    "GET /health": (request, response) => json(response, 200, { ok: true }),
  };

  const handler = routes[`${req.method} ${req.url}`];
  if (handler) {
    return handler(req, res);
  }

  return json(res, 404, { error: "not_found" });
});
|
|
385
|
-
|
|
386
|
-
// Start accepting requests; the callback fires once the port is bound.
server.listen(PORT, HOST, () => {
  // eslint-disable-next-line no-console
  console.log(`rubycrawl node service listening on http://${HOST}:${PORT}`);
});
|