rubycrawl 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/node/src/index.js ADDED
@@ -0,0 +1,389 @@
1
+ import "dotenv/config";
2
+ import http from "node:http";
3
+ import crypto from "node:crypto";
4
+ import { chromium } from "playwright";
5
+
6
// Bind to loopback only: this service is a local sidecar, not a public API.
const HOST = "127.0.0.1";
const PORT = process.env.RUBYCRAWL_NODE_PORT || 3344;
// By default, abort requests for heavy static assets to speed up crawls.
const DEFAULT_BLOCK_RESOURCES = true;
// Playwright resource types that are aborted when blocking is enabled.
const BLOCKED_RESOURCE_TYPES = new Set([
  "image",
  "media",
  "font",
  "stylesheet",
]);
15
+
16
/**
 * Serialize `body` as JSON and finish the HTTP response.
 *
 * @param {import("node:http").ServerResponse} res - response to write to
 * @param {number} statusCode - HTTP status code
 * @param {object} body - JSON-serializable response payload
 */
function json(res, statusCode, body) {
  const encoded = JSON.stringify(body);
  const headers = {
    "content-type": "application/json",
    "content-length": Buffer.byteLength(encoded),
  };
  res.writeHead(statusCode, headers);
  res.end(encoded);
}
24
+
25
// Reject request bodies larger than this to bound memory use per request.
const MAX_BODY_SIZE = 1 * 1024 * 1024; // 1 MB
26
+
27
/**
 * Read and parse a JSON request body.
 *
 * Accumulates raw Buffer chunks and decodes them in a single pass at the
 * end. (Concatenating per-chunk `toString()` results — the previous
 * approach — corrupts multi-byte UTF-8 characters that happen to be split
 * across chunk boundaries, yielding U+FFFD and an unparseable body.)
 *
 * Resolves with `{}` for an empty body. Rejects when the body exceeds
 * MAX_BODY_SIZE (and destroys the request) or when it is not valid JSON.
 *
 * @param {import("node:http").IncomingMessage} req
 * @returns {Promise<object>}
 */
function readJson(req) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let size = 0;
    req.on("data", (chunk) => {
      size += chunk.length;
      if (size > MAX_BODY_SIZE) {
        reject(new Error("Request body too large"));
        req.destroy();
        return;
      }
      chunks.push(chunk);
    });
    req.on("end", () => {
      // Decode once so split multi-byte sequences reassemble correctly.
      const data = Buffer.concat(chunks).toString("utf8");
      if (!data) return resolve({});
      try {
        resolve(JSON.parse(data));
      } catch (error) {
        reject(error);
      }
    });
    req.on("error", reject);
  });
}
51
+
52
/**
 * Validate an incoming crawl request body.
 *
 * @param {object} body - parsed JSON request body
 * @returns {{ok: boolean, error?: string}} ok:false carries a human-readable error
 */
function validateRequest(body) {
  const url = body?.url;
  if (typeof url !== "string" || url.trim() === "") {
    return { ok: false, error: "url is required" };
  }
  return { ok: true };
}
58
+
59
// Shared headless browser instance, launched lazily by getBrowser().
let browser = null;

// Session storage: session_id -> { context, createdAt, lastUsedAt }
const sessions = new Map();

// Session TTL: 30 minutes of inactivity
const SESSION_TTL_MS = 30 * 60 * 1000;
// Cleanup interval: every 5 minutes
const CLEANUP_INTERVAL_MS = 5 * 60 * 1000;
68
+
69
/**
 * Generate an unguessable session identifier: "sess_" + 32 hex chars
 * (16 random bytes from the CSPRNG).
 *
 * @returns {string}
 */
function generateSessionId() {
  const token = crypto.randomBytes(16).toString("hex");
  return "sess_" + token;
}
72
+
73
/**
 * Lazily launch (or reuse) the shared headless Chromium instance.
 * Relaunches if the previous browser has disconnected.
 *
 * @returns {Promise<import("playwright").Browser>}
 */
async function getBrowser() {
  const alive = browser?.isConnected();
  if (!alive) {
    browser = await chromium.launch({ headless: true });
  }
  return browser;
}
78
+
79
/**
 * Create a fresh, isolated browser context on the shared browser.
 *
 * @returns {Promise<import("playwright").BrowserContext>}
 */
async function createContext() {
  const sharedBrowser = await getBrowser();
  return sharedBrowser.newContext();
}
86
+
87
/**
 * Resolve the browser context to use for a request.
 *
 * - Known session_id: reuse its context and refresh the activity timestamp.
 * - Unknown session_id (expired or destroyed): transparently recreate the
 *   session under the same id, so job retries keep working.
 * - No session_id: hand back a throwaway one-off context.
 *
 * @param {string|undefined} sessionId
 * @returns {Promise<{context: import("playwright").BrowserContext, isSession: boolean}>}
 */
async function getContext(sessionId) {
  if (!sessionId) {
    return { context: await createContext(), isSession: false };
  }

  const existing = sessions.get(sessionId);
  if (existing) {
    existing.lastUsedAt = Date.now();
    return { context: existing.context, isSession: true };
  }

  // Session id given but unknown — recreate it for retry-friendliness.
  const context = await createContext();
  const now = Date.now();
  sessions.set(sessionId, { context, createdAt: now, lastUsedAt: now });
  // eslint-disable-next-line no-console
  console.log(
    `[rubycrawl] session recreated ${sessionId} (was expired or destroyed)`,
  );
  return { context, isSession: true };
}
116
+
117
/**
 * Close and forget sessions that have been idle longer than SESSION_TTL_MS.
 * Context-close failures are ignored (best-effort cleanup).
 */
async function cleanupExpiredSessions() {
  const cutoff = Date.now() - SESSION_TTL_MS;
  const expiredIds = [...sessions]
    .filter(([, session]) => session.lastUsedAt < cutoff)
    .map(([sessionId]) => sessionId);

  for (const sessionId of expiredIds) {
    const { context } = sessions.get(sessionId);
    await context.close().catch(() => {});
    sessions.delete(sessionId);
    // eslint-disable-next-line no-console
    console.log(
      `[rubycrawl] session expired ${sessionId} (inactive for ${SESSION_TTL_MS / 60000} min)`,
    );
  }

  if (expiredIds.length > 0) {
    // eslint-disable-next-line no-console
    console.log(
      `[rubycrawl] cleanup: ${expiredIds.length} expired, ${sessions.size} active`,
    );
  }
}
147
+
148
// Periodically sweep idle sessions in the background.
setInterval(() => {
  cleanupExpiredSessions();
}, CLEANUP_INTERVAL_MS);
150
+
151
/**
 * Extract standard HTML metadata from the current page: title, common
 * meta/OpenGraph/Twitter tags, canonical link, lang attribute, and charset.
 * Missing values come back as null.
 *
 * @param {import("playwright").Page} page
 * @returns {Promise<object>}
 */
async function extractMetadata(page) {
  return page.evaluate(() => {
    const readMeta = (name) => {
      const el = document.querySelector(
        `meta[name="${name}"], meta[property="${name}"]`,
      );
      return el?.getAttribute("content") || null;
    };

    const readLink = (rel) => {
      const el = document.querySelector(`link[rel="${rel}"]`);
      return el?.getAttribute("href") || null;
    };

    return {
      title: document.title || null,
      description: readMeta("description") || readMeta("og:description") || null,
      keywords: readMeta("keywords"),
      author: readMeta("author"),
      og_title: readMeta("og:title"),
      og_description: readMeta("og:description"),
      og_image: readMeta("og:image"),
      og_url: readMeta("og:url"),
      og_type: readMeta("og:type"),
      twitter_card: readMeta("twitter:card"),
      twitter_title: readMeta("twitter:title"),
      twitter_description: readMeta("twitter:description"),
      twitter_image: readMeta("twitter:image"),
      canonical: readLink("canonical"),
      lang: document.documentElement.lang || null,
      charset: document.characterSet || null,
    };
  });
}
188
+
189
/**
 * Collect every anchor with an href on the page.
 *
 * @param {import("playwright").Page} page
 * @returns {Promise<Array<{url: string, text: string, title: string|null, rel: string|null}>>}
 */
async function extractLinks(page) {
  return page.evaluate(() => {
    const anchors = document.querySelectorAll("a[href]");
    return Array.from(anchors, (anchor) => ({
      url: anchor.href,
      text: (anchor.textContent || "").trim(),
      title: anchor.getAttribute("title") || null,
      rel: anchor.getAttribute("rel") || null,
    }));
  });
}
203
+
204
/**
 * Extract the page's visible plain text (body innerText, trimmed).
 * Returns "" when there is no body.
 *
 * @param {import("playwright").Page} page
 * @returns {Promise<string>}
 */
async function extractText(page) {
  return page.evaluate(() => {
    const raw = document.body?.innerText || "";
    return raw.trim();
  });
}
210
+
211
/**
 * POST /crawl — navigate to body.url with Playwright and respond with the
 * page's HTML, plain text, links, and metadata.
 *
 * Body fields: url (required), wait_until (Playwright waitUntil, default
 * "load"), block_resources (boolean, defaults to DEFAULT_BLOCK_RESOURCES),
 * session_id (optional — reuse a persistent context).
 *
 * Responses: 422 on validation failure; 400 with error code "invalid_json"
 * (body parse failure) or "crawl_failed" (anything else); 200 with the
 * crawl result on success.
 */
async function handleCrawl(req, res) {
  // Tracked outside try so the finally block can decide whether to close.
  let context = null;
  let isSession = false;

  try {
    const body = await readJson(req);
    const validation = validateRequest(body);
    if (!validation.ok) {
      return json(res, 422, { error: validation.error });
    }

    const waitUntil = body.wait_until || "load";
    // Explicit boolean overrides the default; anything else falls back.
    const blockResources =
      typeof body.block_resources === "boolean"
        ? body.block_resources
        : DEFAULT_BLOCK_RESOURCES;

    const start = Date.now();
    // eslint-disable-next-line no-console
    console.log(
      `[rubycrawl] crawl start ${body.url}${body.session_id ? ` (session=${body.session_id})` : ""}`,
    );

    // Get context (reuse if session_id provided)
    const ctxResult = await getContext(body.session_id);
    context = ctxResult.context;
    isSession = ctxResult.isSession;

    const page = await context.newPage();

    try {
      // Install the resource-blocking route BEFORE navigating, so asset
      // requests triggered by the page load are intercepted.
      if (blockResources) {
        await page.route("**/*", (route) => {
          const type = route.request().resourceType();
          if (BLOCKED_RESOURCE_TYPES.has(type)) {
            return route.abort();
          }
          return route.continue();
        });
      }

      const response = await page.goto(body.url, {
        waitUntil,
        timeout: 30_000,
      });

      const html = await page.content();
      // page.url() reflects the post-redirect URL, which may differ from body.url.
      const finalUrl = page.url();
      // goto can resolve with null (e.g. same-document navigation) — report null status.
      const status = response ? response.status() : null;
      const htmlMetadata = await extractMetadata(page);
      const links = await extractLinks(page);
      const text = await extractText(page);

      // eslint-disable-next-line no-console
      console.log(
        `[rubycrawl] crawl done ${body.url} status=${status} ms=${Date.now() - start}`,
      );

      return json(res, 200, {
        ok: true,
        url: body.url,
        html,
        text,
        links,
        metadata: {
          status,
          final_url: finalUrl,
          ...htmlMetadata,
        },
      });
    } finally {
      // Always close the page, even on navigation/extraction failure.
      await page.close();
    }
  } catch (error) {
    // JSON.parse failures surface as SyntaxError; everything else is a crawl failure.
    const code =
      error?.name === "SyntaxError" ? "invalid_json" : "crawl_failed";
    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] crawl error ${code} ${error?.message || ""}`);
    return json(res, 400, { error: code, message: error?.message });
  } finally {
    // Only close context if not a session (sessions are managed separately)
    if (context && !isSession) {
      await context.close().catch(() => {});
    }
  }
}
297
+
298
/**
 * POST /session/create — allocate a persistent browser context, register
 * it in the session map, and return its generated session_id.
 */
async function handleSessionCreate(req, res) {
  try {
    const sessionId = generateSessionId();
    const context = await createContext();
    const timestamp = Date.now();
    sessions.set(sessionId, {
      context,
      createdAt: timestamp,
      lastUsedAt: timestamp,
    });

    // eslint-disable-next-line no-console
    console.log(
      `[rubycrawl] session created ${sessionId} (active=${sessions.size})`,
    );

    return json(res, 200, { ok: true, session_id: sessionId });
  } catch (err) {
    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] session create error ${err?.message || ""}`);
    return json(res, 400, {
      error: "session_create_failed",
      message: err?.message,
    });
  }
}
323
+
324
/**
 * POST /session/destroy — close a session's browser context and remove it.
 * Idempotent: succeeds even when the session is already gone, so job
 * retries never fail here.
 */
async function handleSessionDestroy(req, res) {
  try {
    const { session_id: sessionId } = await readJson(req);

    if (!sessionId) {
      return json(res, 422, { error: "session_id required" });
    }

    const session = sessions.get(sessionId);
    if (!session) {
      // Idempotent: if session doesn't exist, still return success
      return json(res, 200, {
        ok: true,
        message: "session already destroyed or expired",
      });
    }

    await session.context.close().catch(() => {});
    sessions.delete(sessionId);

    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] session destroyed ${sessionId}`);

    return json(res, 200, { ok: true });
  } catch (err) {
    // eslint-disable-next-line no-console
    console.log(`[rubycrawl] session destroy error ${err?.message || ""}`);
    return json(res, 400, {
      error: "session_destroy_failed",
      message: err?.message,
    });
  }
}
362
+
363
/**
 * Minimal HTTP router for the sidecar:
 *   POST /crawl, POST /session/create, POST /session/destroy, GET /health.
 * Anything else gets a 404.
 */
const server = http.createServer((req, res) => {
  // eslint-disable-next-line no-console
  console.log(`[rubycrawl] request ${req.method} ${req.url}`);

  const route = `${req.method} ${req.url}`;
  switch (route) {
    case "POST /crawl":
      return handleCrawl(req, res);
    case "POST /session/create":
      return handleSessionCreate(req, res);
    case "POST /session/destroy":
      return handleSessionDestroy(req, res);
    case "GET /health":
      return json(res, 200, { ok: true });
    default:
      return json(res, 404, { error: "not_found" });
  }
});

server.listen(PORT, HOST, () => {
  // eslint-disable-next-line no-console
  console.log(`rubycrawl node service listening on http://${HOST}:${PORT}`);
});
data/rubycrawl.gemspec CHANGED
@@ -15,8 +15,9 @@ Gem::Specification.new do |spec|
15
15
 
16
16
  spec.required_ruby_version = '>= 3.0'
17
17
 
18
- spec.files = Dir.glob('{lib}/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) }
19
- spec.files += %w[README.md LICENSE Gemfile Rakefile rubycrawl.gemspec .rspec]
18
+ spec.files = Dir.glob('{lib}/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) }
19
+ spec.files += Dir.glob('node/**/*', File::FNM_DOTMATCH).reject { |f| File.directory?(f) || f.include?('node_modules') }
20
+ spec.files += %w[README.md LICENSE Rakefile rubycrawl.gemspec .rspec]
20
21
 
21
22
  spec.bindir = 'bin'
22
23
  spec.executables = []
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubycrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - RubyCrawl contributors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-01-26 00:00:00.000000000 Z
11
+ date: 2026-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: reverse_markdown
@@ -32,7 +32,6 @@ extensions: []
32
32
  extra_rdoc_files: []
33
33
  files:
34
34
  - ".rspec"
35
- - Gemfile
36
35
  - LICENSE
37
36
  - README.md
38
37
  - Rakefile
@@ -47,6 +46,12 @@ files:
47
46
  - lib/rubycrawl/tasks/install.rake
48
47
  - lib/rubycrawl/url_normalizer.rb
49
48
  - lib/rubycrawl/version.rb
49
+ - node/.gitignore
50
+ - node/.npmrc
51
+ - node/README.md
52
+ - node/package-lock.json
53
+ - node/package.json
54
+ - node/src/index.js
50
55
  - rubycrawl.gemspec
51
56
  homepage: https://github.com/craft-wise/rubycrawl
52
57
  licenses:
data/Gemfile DELETED
@@ -1,11 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- source 'https://rubygems.org'
4
-
5
- gemspec
6
-
7
- group :development do
8
- gem 'rake', '>= 13.0'
9
- gem 'rspec', '>= 3.12'
10
- gem 'rubocop', '>= 1.50'
11
- end