scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,2591 @@
1
+ /*
2
+ * scrapetor_http.c
3
+ *
4
+ * Native HTTP/2-capable fetch layer. Wraps libcurl's easy interface
5
+ * with sensible scraping defaults: HTTP/2 over TLS, automatic
6
+ * Accept-Encoding (gzip + brotli + zstd when the linked libcurl was
7
+ * built with them), connection reuse via a per-thread persistent
8
+ * handle, redirect following with a cap, and total-time timeout.
9
+ *
10
+ * Build-time conditional. If pkg-config can't find libcurl (or the
11
+ * caller passes --without-libcurl to extconf), this whole file
12
+ * collapses to a stub that raises a clear error at fetch time —
13
+ * Scrapetor still loads, only the HTTP surface is unavailable.
14
+ */
15
+
16
+ #include <ruby.h>
17
+ #include <ruby/encoding.h>
18
+ #include <ruby/thread.h>
19
+
20
+ #ifdef HAVE_LIBCURL
21
+
22
+ #include <curl/curl.h>
23
+ #include <pthread.h>
24
+ #include <string.h>
25
+ #include <stdlib.h>
26
+ #include <time.h>
27
+ #include <errno.h>
28
+ #include <iconv.h>
29
+ #include <sys/stat.h>
30
+ #include <unistd.h>
31
+
32
+ #ifdef HAVE_ZLIB
33
+ #include <zlib.h>
34
+ #endif
35
+
36
+ #ifdef HAVE_BROTLI
37
+ #include <brotli/decode.h>
38
+ #endif
39
+
40
+ #ifdef HAVE_ZSTD
41
+ #include <zstd.h>
42
+ #endif
43
+
44
+ extern rb_encoding *enc_utf8;
45
+
46
+ /* Hooks into scrapetor_dom.c so we can run dom_parse on each
47
+ * response body inside the same no-GVL worker that fetched it. */
48
+ typedef struct dom_doc dom_doc_t;
49
+ extern dom_doc_t *scrap_dom_make_owned_doc(char *bytes, size_t len);
50
+ extern void scrap_dom_parse_eager_nocache(dom_doc_t *d);
51
+ extern VALUE scrap_dom_wrap_doc(VALUE klass, dom_doc_t *d);
52
+
53
+ /* ---- Accept-Encoding negotiation --------------------------------- *
54
+ * Returns the comma-separated list of content codings this build can
55
+ * decode. We own decompression end-to-end — CURLOPT_ACCEPT_ENCODING is
56
+ * intentionally left unset so libcurl doesn't reject responses whose
57
+ * encoding it wasn't compiled for. */
58
/* Build the Accept-Encoding value at compile time from the codecs that
 * were linked in. The previous implementation filled a static buffer on
 * first use behind a plain `int` flag, which is a data race when several
 * threads make their first call concurrently; a string literal has no
 * such window and yields the same text.
 *
 * Each stage appends its token to the previous stage and records the
 * separator the next stage must emit. */
#ifdef HAVE_ZLIB
#  define SCRAP_AE_ZLIB     "gzip, deflate"
#  define SCRAP_AE_ZLIB_SEP ", "
#else
#  define SCRAP_AE_ZLIB     ""
#  define SCRAP_AE_ZLIB_SEP ""
#endif
#ifdef HAVE_BROTLI
#  define SCRAP_AE_BR       SCRAP_AE_ZLIB SCRAP_AE_ZLIB_SEP "br"
#  define SCRAP_AE_BR_SEP   ", "
#else
#  define SCRAP_AE_BR       SCRAP_AE_ZLIB
#  define SCRAP_AE_BR_SEP   SCRAP_AE_ZLIB_SEP
#endif
#ifdef HAVE_ZSTD
#  define SCRAP_AE_ALL      SCRAP_AE_BR SCRAP_AE_BR_SEP "zstd"
#else
#  define SCRAP_AE_ALL      SCRAP_AE_BR
#endif

/* Returns the comma-separated list of content codings this build can
 * decode, or "identity" when no codec is linked so servers know not to
 * compress. The returned pointer refers to static storage and must not
 * be freed or modified. */
static const char *scrap_accept_encoding(void) {
#if defined(HAVE_ZLIB) || defined(HAVE_BROTLI) || defined(HAVE_ZSTD)
    return SCRAP_AE_ALL;
#else
    return "identity";
#endif
}
81
+
82
+ /* ---- in-process zlib / brotli / zstd decoders -------------------- */
83
+
84
+ #ifdef HAVE_ZLIB
85
+ /* `gzip` and `deflate`. window_bits selects which: 31 = gzip wrapper,
86
+ * 15 = zlib wrapper, -15 = raw deflate. The 47 path auto-detects gzip
87
+ * vs zlib, which is what we want since some servers send Content-
88
+ * Encoding: deflate with the zlib wrapper and others without. */
89
+ static int scrap_zlib_decode(const char *in, size_t in_len,
90
+ int window_bits,
91
+ char **out, size_t *out_len) {
92
+ z_stream s; memset(&s, 0, sizeof(s));
93
+ if (inflateInit2(&s, window_bits) != Z_OK) return 0;
94
+ size_t cap = in_len * 4 + 4096;
95
+ char *buf = (char *)malloc(cap);
96
+ size_t total = 0;
97
+ s.next_in = (Bytef *)in;
98
+ s.avail_in = (uInt)in_len;
99
+ while (1) {
100
+ if (cap - total < 4096) {
101
+ cap *= 2;
102
+ buf = (char *)realloc(buf, cap);
103
+ }
104
+ s.next_out = (Bytef *)(buf + total);
105
+ s.avail_out = (uInt)(cap - total);
106
+ int r = inflate(&s, Z_NO_FLUSH);
107
+ total = cap - s.avail_out;
108
+ if (r == Z_STREAM_END) break;
109
+ if (r != Z_OK) { inflateEnd(&s); free(buf); return 0; }
110
+ if (s.avail_in == 0 && s.avail_out > 0) break;
111
+ }
112
+ inflateEnd(&s);
113
+ *out = buf; *out_len = total;
114
+ return 1;
115
+ }
116
+ #endif
117
+
118
+ /* ---- in-process brotli / zstd decoders --------------------------- */
119
+
120
+ #ifdef HAVE_BROTLI
121
+ static int scrap_brotli_decode(const char *in, size_t in_len,
122
+ char **out, size_t *out_len) {
123
+ BrotliDecoderState *st = BrotliDecoderCreateInstance(NULL, NULL, NULL);
124
+ if (!st) return 0;
125
+ size_t cap = in_len * 4 + 1024;
126
+ char *buf = (char *)malloc(cap);
127
+ size_t total = 0;
128
+ const uint8_t *next_in = (const uint8_t *)in;
129
+ size_t avail_in = in_len;
130
+ BrotliDecoderResult r;
131
+ do {
132
+ uint8_t *next_out = (uint8_t *)(buf + total);
133
+ size_t avail_out = cap - total;
134
+ r = BrotliDecoderDecompressStream(st, &avail_in, &next_in,
135
+ &avail_out, &next_out, NULL);
136
+ total = (size_t)((char *)next_out - buf);
137
+ if (r == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
138
+ cap *= 2;
139
+ buf = (char *)realloc(buf, cap);
140
+ }
141
+ } while (r == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT ||
142
+ r == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT);
143
+ BrotliDecoderDestroyInstance(st);
144
+ if (r != BROTLI_DECODER_RESULT_SUCCESS) { free(buf); return 0; }
145
+ *out = buf; *out_len = total;
146
+ return 1;
147
+ }
148
+ #endif
149
+
150
+ #ifdef HAVE_ZSTD
151
+ static int scrap_zstd_decode(const char *in, size_t in_len,
152
+ char **out, size_t *out_len) {
153
+ /* Use streaming zstd so we don't have to trust the frame's
154
+ * declared size. */
155
+ ZSTD_DStream *zds = ZSTD_createDStream();
156
+ if (!zds) return 0;
157
+ ZSTD_initDStream(zds);
158
+ size_t cap = ZSTD_DStreamOutSize();
159
+ if (cap < in_len * 4) cap = in_len * 4 + 4096;
160
+ char *buf = (char *)malloc(cap);
161
+ size_t total = 0;
162
+ ZSTD_inBuffer zin = { in, in_len, 0 };
163
+ while (zin.pos < zin.size) {
164
+ if (cap - total < ZSTD_DStreamOutSize()) {
165
+ cap *= 2;
166
+ buf = (char *)realloc(buf, cap);
167
+ }
168
+ ZSTD_outBuffer zout = { buf + total, cap - total, 0 };
169
+ size_t r = ZSTD_decompressStream(zds, &zout, &zin);
170
+ if (ZSTD_isError(r)) { ZSTD_freeDStream(zds); free(buf); return 0; }
171
+ total += zout.pos;
172
+ if (r == 0) break; /* frame complete */
173
+ }
174
+ ZSTD_freeDStream(zds);
175
+ *out = buf; *out_len = total;
176
+ return 1;
177
+ }
178
+ #endif
179
+
180
+ /* ---- per-host throttle table ------------------------------------- *
181
+ * Global, thread-safe map from host name to last-request timestamp.
182
+ * Drives polite-scraping rate limits across both single-fetch and
183
+ * parallel-fetch paths — a parallel batch of 32 URLs against the same
184
+ * host with rate_limit_ms=500 will serialise at that host through this
185
+ * table even though the worker threads themselves are independent.
186
+ *
187
+ * 256 slots is plenty: scrapers target dozens of hosts max in practice;
188
+ * past that we round-robin LRU evictions.
189
+ */
190
/* One entry of the per-host throttle table. */
typedef struct {
    /* Heap-allocated host name; the throttle code stores a strdup()
     * of the key it was called with. */
    char     *host;
    /* Monotonic-clock nanosecond value; scrap_throttle_wait treats it
     * as the earliest moment the next request to this host may start. */
    uint64_t  last_ns;
    /* Incremented on every lookup; the slot with the smallest count is
     * the one reused when the table is full. */
    uint32_t  hits;
} throttle_slot_t;

/* Fixed capacity of the table. */
#define THROTTLE_CAP 256

/* Table storage, number of slots in use, and the mutex guarding both. */
static throttle_slot_t g_throttle[THROTTLE_CAP];
static int             g_throttle_n = 0;
static pthread_mutex_t g_throttle_mu = PTHREAD_MUTEX_INITIALIZER;
200
+
201
/* Current CLOCK_MONOTONIC reading expressed in nanoseconds. */
static uint64_t mono_ns(void) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    const uint64_t whole_seconds_ns = (uint64_t)now.tv_sec * 1000000000ull;
    return whole_seconds_ns + (uint64_t)now.tv_nsec;
}
206
+
207
/* Pull the lowercase host out of a URL.
 *
 * The authority is the text between "://" and the first '/', '?' or
 * '#'. Within it, any "userinfo@" prefix is skipped and the port is
 * dropped. A bracketed IPv6 literal is returned with its brackets
 * ("[::1]"). Previously "http://user:pw@host/" yielded "user" and
 * "http://[::1]/" yielded "[", so per-host throttling keyed on the
 * wrong string.
 *
 * url  NUL-terminated URL.
 * out  receives the NUL-terminated host.
 * cap  size of out in bytes.
 *
 * Returns 1 on success; 0 when there is no "://", the host is empty,
 * an IPv6 bracket is unterminated, or the host does not fit in out. */
static int scrap_extract_host(const char *url, char *out, size_t cap) {
    if (!url || !out || cap == 0) return 0;

    const char *p = strstr(url, "://");
    if (!p) return 0;
    p += 3;

    const char *auth_end = p + strcspn(p, "/?#");

    /* The last '@' inside the authority ends the userinfo part. */
    for (const char *q = p; q < auth_end; q++) {
        if (*q == '@') p = q + 1;
    }

    const char *e;
    if (p < auth_end && *p == '[') {
        /* IPv6 literal: colons are part of the address, so the host
         * runs through the closing bracket. */
        const char *close = memchr(p, ']', (size_t)(auth_end - p));
        if (!close) return 0;
        e = close + 1;
    } else {
        e = p;
        while (e < auth_end && *e != ':') e++;
    }

    size_t l = (size_t)(e - p);
    if (l == 0 || l + 1 > cap) return 0;
    for (size_t i = 0; i < l; i++) {
        char c = p[i];
        out[i] = (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
    }
    out[l] = 0;
    return 1;
}
224
+
225
+ /* Wait long enough since the last request to `host` to honour the
226
+ * min interval, then mark the new "last" time. The sleep happens
227
+ * outside the mutex so concurrent workers for different hosts don't
228
+ * block each other.
229
+ *
230
+ * Safe to call from a no-GVL worker — uses nanosleep, no Ruby state. */
231
+ static void scrap_throttle_wait(const char *host, uint64_t min_interval_ns) {
232
+ if (!host || !*host || min_interval_ns == 0) return;
233
+ pthread_mutex_lock(&g_throttle_mu);
234
+ int idx = -1;
235
+ for (int i = 0; i < g_throttle_n; i++) {
236
+ if (strcmp(g_throttle[i].host, host) == 0) { idx = i; break; }
237
+ }
238
+ if (idx < 0) {
239
+ if (g_throttle_n < THROTTLE_CAP) {
240
+ idx = g_throttle_n++;
241
+ } else {
242
+ /* Evict the least-recently-used slot. */
243
+ int lru = 0;
244
+ for (int i = 1; i < THROTTLE_CAP; i++) {
245
+ if (g_throttle[i].hits < g_throttle[lru].hits) lru = i;
246
+ }
247
+ idx = lru;
248
+ free(g_throttle[idx].host);
249
+ g_throttle[idx].host = NULL;
250
+ }
251
+ g_throttle[idx].host = strdup(host);
252
+ g_throttle[idx].last_ns = 0;
253
+ g_throttle[idx].hits = 0;
254
+ }
255
+ g_throttle[idx].hits++;
256
+ uint64_t now = mono_ns();
257
+ /* last_ns is the "earliest allowed start" for the next request to
258
+ * this host. Each worker reserves its slot by advancing
259
+ * last_ns = max(now, last_ns) + min_interval_ns. Concurrent workers
260
+ * to the same host see ever-increasing reservations and serialise
261
+ * cleanly; concurrent workers to *different* hosts hit different
262
+ * slots and don't block each other. */
263
+ uint64_t earliest = g_throttle[idx].last_ns;
264
+ uint64_t start = (earliest <= now) ? now : earliest;
265
+ uint64_t wait_ns = start - now;
266
+ g_throttle[idx].last_ns = start + min_interval_ns;
267
+ pthread_mutex_unlock(&g_throttle_mu);
268
+
269
+ if (wait_ns > 0) {
270
+ struct timespec ts;
271
+ ts.tv_sec = (time_t)(wait_ns / 1000000000ull);
272
+ ts.tv_nsec = (long)(wait_ns % 1000000000ull);
273
+ nanosleep(&ts, NULL);
274
+ }
275
+ }
276
+
277
+ /* ---- HTTP response cache (ETag / Last-Modified) ------------------ *
278
+ * Disk-backed cache of completed GET responses keyed by URL. Each entry
279
+ * stores status + ETag + Last-Modified + Content-Type + body in a
280
+ * tagged binary format:
281
+ *
282
+ * 8 bytes magic "SCRHV001"
283
+ * 4 bytes uint32_le status
284
+ * 4 + N etag (length-prefixed)
285
+ * 4 + N lastmod (length-prefixed)
286
+ * 4 + N ctype (length-prefixed)
287
+ * 8 + N body (uint64_le length-prefixed)
288
+ *
289
+ * When :cache_dir is set on a request and a cache entry exists for the
290
+ * URL, the fetch adds If-None-Match / If-Modified-Since automatically.
291
+ * A 304 response is served from the cached body with the cached
292
+ * Content-Type — the network round-trip stayed cheap (no body) but
293
+ * the caller sees a fully-formed 200-shaped response.
294
+ *
295
+ * Cache key is a 128-bit FNV1a-double (two FNV64s with different
296
+ * seeds) — collision probability for any realistic crawl is
297
+ * effectively zero, and avoiding a SHA-2 dep keeps the binary lean.
298
+ */
299
+
300
/* Derive the 32-hex-digit cache key for `url`.
 *
 * Two 64-bit xor-then-multiply hashes are run over the URL bytes with
 * different seeds and multipliers; the result is written to `out` as
 * 16 hex digits of the first followed by 16 of the second, plus a NUL
 * (33 bytes in total). */
static void scrap_cache_key(const char *url, char out[33]) {
    uint64_t first  = 0xcbf29ce484222325ull;
    uint64_t second = 0x84222325cbf29ce4ull;

    const unsigned char *cursor = (const unsigned char *)url;
    while (*cursor != 0) {
        const uint64_t byte = *cursor++;
        first  = (first  ^ byte) * 0x100000001b3ull;
        second = (second ^ byte) * 0x9e3779b97f4a7c15ull;
    }

    snprintf(out, 33, "%016llx%016llx",
             (unsigned long long)first, (unsigned long long)second);
}
311
+
312
/* In-memory form of one cached response. Each pointer is either NULL or
 * a heap buffer released by scrap_cache_entry_free(); the matching
 * *_len field holds its byte length. */
typedef struct {
    long    status;
    char   *etag;
    size_t  etag_len;
    char   *lastmod;
    size_t  lastmod_len;
    char   *ctype;
    size_t  ctype_len;
    char   *body;
    size_t  body_len;
} scrap_cache_entry_t;

/* Release every buffer owned by `e` and reset the whole struct to
 * zero, leaving it safe to reuse or free again. */
static void scrap_cache_entry_free(scrap_cache_entry_t *e) {
    char *owned[] = { e->etag, e->lastmod, e->ctype, e->body };
    for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); i++) {
        free(owned[i]);
    }
    memset(e, 0, sizeof(*e));
}
324
+
325
/* Compute the on-disk path of the cache entry for `url` under `dir`:
 *
 *     <dir>/<first two key hex digits>/<key>.cache
 *
 * Side effect: attempts to create <dir> and the two-character shard
 * directory (mode 0755). mkdir failures are deliberately ignored here;
 * a directory that is really missing shows up when the caller opens
 * the file.
 *
 * Returns 1 when the full path fits in `out` (capacity `cap`), 0 when
 * either formatting step fails or would be truncated. The final
 * snprintf used to be checked only for `> 0`, so an over-long `dir`
 * produced a silently truncated path. */
static int scrap_cache_path(const char *dir, const char *url,
                            char *out, size_t cap) {
    char key[33];
    scrap_cache_key(url, key);

    int n = snprintf(out, cap, "%s/%c%c", dir, key[0], key[1]);
    if (n <= 0 || (size_t)n >= cap) return 0;

    mkdir(dir, 0755);   /* best-effort; the shard mkdir is what matters */
    mkdir(out, 0755);

    n = snprintf(out, cap, "%s/%c%c/%s.cache", dir, key[0], key[1], key);
    return n > 0 && (size_t)n < cap;
}
335
+
336
/* Read a little-endian 32-bit unsigned integer from `f`.
 * Returns 1 and stores the value in *out, or 0 (leaving *out
 * untouched) when fewer than four bytes could be read. */
static int read_u32_le(FILE *f, uint32_t *out) {
    uint8_t raw[4];
    if (fread(raw, 1, sizeof(raw), f) != sizeof(raw)) return 0;

    /* Fold from the most significant byte (stored last) downwards. */
    uint32_t value = 0;
    for (int i = 3; i >= 0; i--) {
        value = (value << 8) | (uint32_t)raw[i];
    }
    *out = value;
    return 1;
}
343
/* Read a little-endian 64-bit unsigned integer from `f`.
 * Returns 1 and stores the value in *out, or 0 (leaving *out
 * untouched) when fewer than eight bytes could be read. */
static int read_u64_le(FILE *f, uint64_t *out) {
    uint8_t raw[8];
    if (fread(raw, 1, sizeof(raw), f) != sizeof(raw)) return 0;

    uint64_t value = 0;
    unsigned shift = 0;
    for (size_t i = 0; i < sizeof(raw); i++, shift += 8) {
        value |= (uint64_t)raw[i] << shift;
    }
    *out = value;
    return 1;
}
350
/* Read one length-prefixed string: a little-endian uint32 byte count
 * followed by that many bytes.
 *
 * On success returns 1; *out is a malloc'd, NUL-terminated copy (caller
 * frees) and *out_len its length. A zero-length string yields
 * *out == NULL with *out_len == 0. On failure returns 0 with
 * *out == NULL and *out_len == 0.
 *
 * The length comes straight from the cache file, so it is treated as
 * untrusted: the allocation size is computed in size_t (the previous
 * `l + 1` was evaluated as uint32_t and wrapped to 0 for 0xFFFFFFFF)
 * and the malloc result is checked before fread writes through it. */
static int read_lenstr(FILE *f, char **out, size_t *out_len) {
    uint32_t l;
    *out = NULL;
    *out_len = 0;
    if (!read_u32_le(f, &l)) return 0;
    if (l == 0) return 1;

    size_t need = (size_t)l + 1;
    if (need <= (size_t)l) return 0;    /* size_t narrower than needed */

    char *s = malloc(need);
    if (!s) return 0;
    if (fread(s, 1, l, f) != l) { free(s); return 0; }
    s[l] = 0;

    *out = s;
    *out_len = l;
    return 1;
}
360
+
361
+ static int scrap_cache_load(const char *dir, const char *url,
362
+ scrap_cache_entry_t *e) {
363
+ memset(e, 0, sizeof(*e));
364
+ char path[1024];
365
+ if (!scrap_cache_path(dir, url, path, sizeof(path))) return 0;
366
+ FILE *f = fopen(path, "rb");
367
+ if (!f) return 0;
368
+ char magic[8];
369
+ if (fread(magic, 1, 8, f) != 8 || memcmp(magic, "SCRHV001", 8) != 0) {
370
+ fclose(f); return 0;
371
+ }
372
+ uint32_t status;
373
+ if (!read_u32_le(f, &status)) { fclose(f); return 0; }
374
+ e->status = (long)status;
375
+ if (!read_lenstr(f, &e->etag, &e->etag_len)) { fclose(f); return 0; }
376
+ if (!read_lenstr(f, &e->lastmod, &e->lastmod_len)) { fclose(f); return 0; }
377
+ if (!read_lenstr(f, &e->ctype, &e->ctype_len)) { fclose(f); return 0; }
378
+ uint64_t body_len;
379
+ if (!read_u64_le(f, &body_len)) { fclose(f); return 0; }
380
+ e->body_len = body_len;
381
+ if (body_len > 0) {
382
+ e->body = (char *)malloc(body_len + 1);
383
+ if (fread(e->body, 1, body_len, f) != body_len) {
384
+ fclose(f); scrap_cache_entry_free(e); return 0;
385
+ }
386
+ e->body[body_len] = 0;
387
+ }
388
+ fclose(f);
389
+ return 1;
390
+ }
391
+
392
/* Write `v` to `f` as four bytes, least significant first. The fwrite
 * result is not inspected here. */
static void write_u32_le(FILE *f, uint32_t v) {
    uint8_t raw[4];
    for (size_t i = 0; i < sizeof(raw); i++) {
        raw[i] = (uint8_t)(v & 0xFF);
        v >>= 8;
    }
    fwrite(raw, 1, sizeof(raw), f);
}
396
/* Write `v` to `f` as eight bytes, least significant first. The fwrite
 * result is not inspected here. */
static void write_u64_le(FILE *f, uint64_t v) {
    uint8_t raw[8];
    uint8_t *dst = raw;
    while (dst < raw + sizeof(raw)) {
        *dst++ = (uint8_t)(v & 0xFF);
        v >>= 8;
    }
    fwrite(raw, 1, sizeof(raw), f);
}
401
/* Write a length-prefixed string: `l` as a little-endian uint32 (the
 * value is cast, so lengths above 32 bits are truncated in the prefix)
 * followed by the `l` bytes at `p`. Nothing follows the prefix when
 * `l` is zero, so `p` may be NULL in that case. */
static void write_lenstr(FILE *f, const char *p, size_t l) {
    write_u32_le(f, (uint32_t)l);
    if (l == 0) return;
    fwrite(p, 1, l, f);
}
405
+
406
/* Persist one response as the cache entry for `url` under `dir`.
 *
 * The entry is written to "<path>.tmp.<pid>" and then renamed over the
 * final path, so readers never observe a half-written file.
 *
 * Returns 1 only when every byte was written, the stream closed
 * cleanly and the rename succeeded. On any failure returns 0 and
 * removes the temporary file. Previously write and close errors were
 * ignored, so a short write (e.g. disk full) could be renamed into
 * place as a corrupt entry.
 *
 * NOTE(review): the temporary name is unique per process only; two
 * threads storing the same URL at once would share it. Confirm whether
 * callers can do that. */
static int scrap_cache_store(const char *dir, const char *url,
                             long status, const char *etag, size_t etag_len,
                             const char *lastmod, size_t lastmod_len,
                             const char *ctype, size_t ctype_len,
                             const char *body, size_t body_len) {
    char path[1024];
    if (!scrap_cache_path(dir, url, path, sizeof(path))) return 0;

    char tmp[1100];
    int n = snprintf(tmp, sizeof(tmp), "%s.tmp.%d", path, (int)getpid());
    if (n <= 0 || (size_t)n >= sizeof(tmp)) return 0;

    FILE *f = fopen(tmp, "wb");
    if (!f) return 0;

    int ok = (fwrite("SCRHV001", 1, 8, f) == 8);
    write_u32_le(f, (uint32_t)status);
    write_lenstr(f, etag, etag_len);
    write_lenstr(f, lastmod, lastmod_len);
    write_lenstr(f, ctype, ctype_len);
    write_u64_le(f, (uint64_t)body_len);
    if (body_len && fwrite(body, 1, body_len, f) != body_len) ok = 0;

    /* The write_* helpers return nothing; the stream's sticky error
     * flag reports any failure inside them. */
    if (ferror(f)) ok = 0;
    /* fclose flushes buffered data, so it can be the call that fails. */
    if (fclose(f) != 0) ok = 0;

    if (ok && rename(tmp, path) == 0) return 1;

    remove(tmp);
    return 0;
}
427
+
428
+ /* ---- charset detection + iconv transcode to UTF-8 ----------------- *
429
+ * Find charset=... in a Content-Type header value (length-bounded), then
430
+ * iconv-transcode the body buffer in place. Replacement bytes are used
431
+ * for invalid sequences so the parse layer never trips on undecodable
432
+ * input. UTF-8 / utf8 / absent charset all skip the conversion.
433
+ */
434
/* Extract the charset parameter from a Content-Type value.
 *
 * ct / ct_len  header value; need not be NUL-terminated.
 * out / cap    receives the NUL-terminated charset name.
 *
 * Scans for the word "charset" (ASCII case-insensitive, matched
 * anywhere in the value), allows spaces/tabs around '=', and accepts a
 * value wrapped in single or double quotes. An unquoted value ends at
 * ';', whitespace or a line break; a quoted one ends at the matching
 * quote or the end of the input.
 *
 * Returns 1 on success. Returns 0 when no "charset=" is present, or
 * when the first one found has an empty value or one too long for
 * `out`. */
static int scrap_extract_charset(const char *ct, size_t ct_len,
                                 char *out, size_t cap) {
    static const char key[] = "charset";
    const size_t key_len = sizeof(key) - 1;

    for (size_t start = 0; start + key_len < ct_len; start++) {
        /* Case-insensitive comparison of the key at this offset. */
        size_t matched = 0;
        while (matched < key_len) {
            char ch = ct[start + matched];
            if (ch >= 'A' && ch <= 'Z') ch += 32;
            if (ch != key[matched]) break;
            matched++;
        }
        if (matched != key_len) continue;

        size_t pos = start + key_len;
        while (pos < ct_len && (ct[pos] == ' ' || ct[pos] == '\t')) pos++;
        if (pos >= ct_len || ct[pos] != '=') continue;
        pos++;
        while (pos < ct_len && (ct[pos] == ' ' || ct[pos] == '\t')) pos++;

        char quote = 0;
        if (pos < ct_len && (ct[pos] == '"' || ct[pos] == '\'')) {
            quote = ct[pos];
            pos++;
        }

        const size_t value_start = pos;
        for (; pos < ct_len; pos++) {
            const char ch = ct[pos];
            if (quote) {
                if (ch == quote) break;
            } else if (ch == ';' || ch == ' ' || ch == '\t' ||
                       ch == '\r' || ch == '\n') {
                break;
            }
        }

        const size_t value_len = pos - value_start;
        if (value_len == 0 || value_len + 1 > cap) return 0;
        memcpy(out, ct + value_start, value_len);
        out[value_len] = 0;
        return 1;
    }
    return 0;
}
466
+
467
/* Convert *body from `charset` to UTF-8 via iconv, replacing the buffer.
 *
 * body      in/out: malloc'd buffer. On success the old buffer is freed
 *           and *body points at the converted one.
 * body_len  in/out: byte length of the content.
 * body_cap  out: allocated size of the new buffer.
 * charset   source encoding name.
 *
 * Returns 1 when a conversion was performed. Returns 0, leaving the
 * body untouched, when charset is NULL/empty, is already
 * UTF-8/utf8/us-ascii/ascii, is unknown to iconv, on an unexpected
 * iconv error, or when memory cannot be allocated.
 *
 * Input bytes iconv rejects (EILSEQ) or that form an incomplete
 * trailing sequence (EINVAL) are replaced one at a time with '?'.
 *
 * Changes from the previous version: every allocation is checked (a
 * failed realloc used to be dereferenced and leaked the old block), and
 * the result is always NUL-terminated at (*body)[*body_len], matching
 * the invariant buf_append maintains for response buffers. */
static int scrap_transcode_to_utf8(char **body, size_t *body_len, size_t *body_cap,
                                   const char *charset) {
    if (!charset || !*charset) return 0;
    if (strcasecmp(charset, "utf-8") == 0 ||
        strcasecmp(charset, "utf8") == 0 ||
        strcasecmp(charset, "us-ascii") == 0 ||
        strcasecmp(charset, "ascii") == 0) return 0;
    if (*body_len > (SIZE_MAX - 16) / 2) return 0;

    iconv_t cd = iconv_open("UTF-8", charset);
    if (cd == (iconv_t)-1) return 0;

    size_t in_left = *body_len;
    char *in_ptr = *body;
    size_t out_cap = (*body_len) * 2 + 16;
    char *out = malloc(out_cap);
    if (!out) { iconv_close(cd); return 0; }
    char *out_ptr = out;
    size_t out_left = out_cap - 1;      /* one byte reserved for the NUL */

    while (in_left > 0) {
        size_t r = iconv(cd, &in_ptr, &in_left, &out_ptr, &out_left);
        if (r != (size_t)-1) continue;

        const int err = errno;          /* saved before any libc call */
        const int bad_input = (err == EILSEQ || err == EINVAL);
        if (!bad_input && err != E2BIG) goto fail;

        if (err == E2BIG || out_left < 1) {
            size_t used = (size_t)(out_ptr - out);
            if (out_cap > SIZE_MAX / 2) goto fail;
            char *grown = realloc(out, out_cap * 2);
            if (!grown) goto fail;      /* `out` still valid; freed below */
            out = grown;
            out_cap *= 2;
            out_ptr = out + used;
            out_left = out_cap - 1 - used;
        }
        if (bad_input) {
            /* Substitute the offending byte and step past it. */
            *out_ptr++ = '?'; out_left--;
            in_ptr++; in_left--;
        }
    }

    iconv_close(cd);
    *out_ptr = '\0';
    free(*body);
    *body = out;
    *body_len = (size_t)(out_ptr - out);
    *body_cap = out_cap;
    return 1;

fail:
    iconv_close(cd);
    free(out);
    return 0;
}
517
+
518
+ /* Pull Content-Type out of a header blob and run transcode if its
519
+ * charset is non-UTF-8. No-op when there's no header, no charset, or
520
+ * the charset is already UTF-8 / ASCII. */
521
/* Locate Content-Type in a raw header blob and transcode the body to
 * UTF-8 according to its charset parameter.
 *
 * The blob is scanned line by line (LF-terminated, optional CR
 * stripped). Every line is examined, so when several Content-Type
 * headers are present the last one is the one used. The header name
 * must be exactly 12 characters before the first ':' and is matched
 * ASCII case-insensitively.
 *
 * Returns whatever scrap_transcode_to_utf8 returns, or 0 when there is
 * no Content-Type header or it carries no usable charset. */
static int scrap_apply_charset(const char *headers_blob, size_t headers_len,
                               char **body, size_t *body_len, size_t *body_cap) {
    static const char wanted[] = "content-type";
    const size_t wanted_len = sizeof(wanted) - 1;

    const char *value = NULL;
    size_t value_len = 0;

    size_t pos = 0;
    while (pos < headers_len) {
        /* Delimit the current line as [begin, end), without CR/LF. */
        const size_t begin = pos;
        while (pos < headers_len && headers_blob[pos] != '\n') pos++;
        size_t end = pos;
        if (end > begin && headers_blob[end - 1] == '\r') end--;
        if (pos < headers_len) pos++;
        if (end == begin) continue;

        const char *line = headers_blob + begin;
        const char *line_end = headers_blob + end;
        const char *sep = memchr(line, ':', end - begin);
        if (!sep) continue;
        if ((size_t)(sep - line) != wanted_len) continue;

        size_t k = 0;
        for (; k < wanted_len; k++) {
            char ch = line[k];
            if (ch >= 'A' && ch <= 'Z') ch += 32;
            if (ch != wanted[k]) break;
        }
        if (k < wanted_len) continue;

        const char *v = sep + 1;
        while (v < line_end && (*v == ' ' || *v == '\t')) v++;
        value = v;
        value_len = (size_t)(line_end - v);
    }

    if (!value) return 0;

    char charset[64];
    if (!scrap_extract_charset(value, value_len, charset, sizeof(charset))) return 0;
    return scrap_transcode_to_utf8(body, body_len, body_cap, charset);
}
555
+
556
+ /* ---- shared connection cache (CURLSH) ---------------------------- *
557
+ * libcurl easy handles each carry a private connection cache by
558
+ * default. With per-thread handles that means N pthread workers
559
+ * hitting one host open N independent TLS connections. CURLSH lets
560
+ * them all share one connection pool, one DNS cache, and one TLS
561
+ * session cache — so 8 workers against the same HTTP/2 origin
562
+ * settle on one (or a few) multiplexed connections instead of
563
+ * eight handshakes.
564
+ *
565
+ * libcurl requires user-provided locks for the share since the
566
+ * shared data structures can be touched concurrently. We use one
567
+ * pthread mutex per shared resource class. */
568
+
569
+ static CURLSH *g_share = NULL;
570
+ /* One mutex per curl_lock_data class. curl_lock_data values run from
571
+ * 0 (NONE) up to CURL_LOCK_DATA_LAST; sizing the array to 16 covers
572
+ * present + future entries comfortably without an unbounded VLA. */
573
+ #define SCRAP_SHARE_LOCKS 16
574
+ static pthread_mutex_t g_share_locks[SCRAP_SHARE_LOCKS];
575
+
576
+ static void scrap_share_lock(CURL *h, curl_lock_data data,
577
+ curl_lock_access access, void *user) {
578
+ (void)h; (void)access; (void)user;
579
+ if ((int)data >= 0 && (int)data < SCRAP_SHARE_LOCKS) {
580
+ pthread_mutex_lock(&g_share_locks[(int)data]);
581
+ }
582
+ }
583
+ static void scrap_share_unlock(CURL *h, curl_lock_data data, void *user) {
584
+ (void)h; (void)user;
585
+ if ((int)data >= 0 && (int)data < SCRAP_SHARE_LOCKS) {
586
+ pthread_mutex_unlock(&g_share_locks[(int)data]);
587
+ }
588
+ }
589
+ static void scrap_share_init(void) {
590
+ if (g_share) return;
591
+ for (int i = 0; i < SCRAP_SHARE_LOCKS; i++) {
592
+ pthread_mutex_init(&g_share_locks[i], NULL);
593
+ }
594
+ g_share = curl_share_init();
595
+ curl_share_setopt(g_share, CURLSHOPT_LOCKFUNC, scrap_share_lock);
596
+ curl_share_setopt(g_share, CURLSHOPT_UNLOCKFUNC, scrap_share_unlock);
597
+ curl_share_setopt(g_share, CURLSHOPT_SHARE, CURL_LOCK_DATA_CONNECT);
598
+ curl_share_setopt(g_share, CURLSHOPT_SHARE, CURL_LOCK_DATA_DNS);
599
+ curl_share_setopt(g_share, CURLSHOPT_SHARE, CURL_LOCK_DATA_SSL_SESSION);
600
+ }
601
+
602
+ /* Lazy `curl_global_init` so `require "scrapetor"` doesn't kick libcurl
603
+ * — and through it Apple's SystemConfiguration framework — into spinning
604
+ * up Cocoa class +initialize methods on background threads. When the
605
+ * host process (Puma master, Spring preloader, Foreman + sidekiq) forks
606
+ * a worker before that init finishes, the child trips
607
+ * +[NSCharacterSet initialize] may have been in progress in another thread
608
+ * when fork() was called. We cannot safely call it or ignore it ... Crashing.
609
+ * Deferring the init until the first actual fetch lets the master fork
610
+ * cleanly; each post-fork worker then runs the init itself the first
611
+ * time it touches the network. pthread_once gives us safe one-time
612
+ * execution even if multiple worker threads race the first call. */
613
+ static pthread_once_t g_curl_init_once = PTHREAD_ONCE_INIT;
614
+ static void scrap_global_init_once_fn(void) {
615
+ curl_global_init(CURL_GLOBAL_DEFAULT);
616
+ scrap_share_init();
617
+ }
618
+ static inline void scrap_ensure_global_init(void) {
619
+ pthread_once(&g_curl_init_once, scrap_global_init_once_fn);
620
+ }
621
+
622
+ /* ---- per-thread curl handle pool ---------------------------------- *
623
+ * Re-creating an easy handle costs ~30 µs and discards connection
624
+ * cache. Holding one handle per OS thread (via pthread_specific) lets
625
+ * back-to-back fetches against the same host reuse the TLS/HTTP-2
626
+ * session. Cleared automatically on thread exit. */
627
+
628
+ static pthread_key_t g_curl_tls_key;
629
+ static pthread_once_t g_curl_tls_once = PTHREAD_ONCE_INIT;
630
+
631
+ static void curl_tls_dtor(void *p) {
632
+ if (p) curl_easy_cleanup((CURL *)p);
633
+ }
634
+ static void curl_tls_init(void) {
635
+ pthread_key_create(&g_curl_tls_key, curl_tls_dtor);
636
+ }
637
+
638
+ static CURL *get_thread_curl(void) {
639
+ pthread_once(&g_curl_tls_once, curl_tls_init);
640
+ CURL *h = (CURL *)pthread_getspecific(g_curl_tls_key);
641
+ if (!h) {
642
+ h = curl_easy_init();
643
+ pthread_setspecific(g_curl_tls_key, h);
644
+ } else {
645
+ curl_easy_reset(h);
646
+ /* curl_easy_reset does NOT clear cookie engine state. Wipe it
647
+ * explicitly so a previous call's cookies don't leak into the
648
+ * next one on the same per-thread handle. Callers who want
649
+ * cross-request cookie persistence opt in via :cookiejar /
650
+ * :cookiefile, which re-enables the engine for that request. */
651
+ curl_easy_setopt(h, CURLOPT_COOKIELIST, "ALL");
652
+ }
653
+ /* Attach the global share so this handle pulls connections, DNS
654
+ * results, and TLS sessions from the shared pool. Must be set
655
+ * after every reset because curl_easy_reset clears it. */
656
+ if (g_share) curl_easy_setopt(h, CURLOPT_SHARE, g_share);
657
+ return h;
658
+ }
659
+
660
+ /* ---- response buffer --------------------------------------------- */
661
+
662
/* Growable byte buffer. `data` is NULL or a heap block of `cap` bytes
 * of which the first `len` hold content; buf_append keeps a NUL at
 * data[len]. */
typedef struct {
    char   *data;
    size_t  len;
    size_t  cap;
} buf_t;

/* Append `n` bytes from `src` to `b`, growing the storage as needed.
 *
 * Capacity starts at 16 KiB and doubles until the content plus a
 * trailing NUL fits. Returns `n` on success, or 0 when the reallocation
 * fails (the buffer is left exactly as it was). */
static size_t buf_append(buf_t *b, const char *src, size_t n) {
    const size_t needed = b->len + n + 1;

    if (needed > b->cap) {
        size_t grown = b->cap ? b->cap * 2 : 16 * 1024;
        while (grown < needed) grown *= 2;

        char *moved = (char *)realloc(b->data, grown);
        if (moved == NULL) return 0;
        b->data = moved;
        b->cap = grown;
    }

    memcpy(b->data + b->len, src, n);
    b->len += n;
    b->data[b->len] = 0;
    return n;
}
681
+
682
+ static size_t cb_body(char *ptr, size_t size, size_t nmemb, void *userdata) {
683
+ buf_t *b = (buf_t *)userdata;
684
+ return buf_append(b, ptr, size * nmemb);
685
+ }
686
+
687
+ static size_t cb_header(char *ptr, size_t size, size_t nmemb, void *userdata) {
688
+ buf_t *b = (buf_t *)userdata;
689
+ return buf_append(b, ptr, size * nmemb);
690
+ }
691
+
692
+ /* ---- fetch context ----------------------------------------------- *
693
+ * Built under GVL, then handed to the no-GVL worker which runs
694
+ * curl_easy_perform. */
695
+
696
+ typedef struct {
697
+ CURL *handle;
698
+ buf_t body;
699
+ buf_t headers;
700
+ struct curl_slist *req_headers; /* freed by caller */
701
+ CURLcode rc;
702
+ } fetch_ctx_t;
703
+
704
+ static void *do_fetch_nogvl(void *arg) {
705
+ fetch_ctx_t *fc = (fetch_ctx_t *)arg;
706
+ fc->rc = curl_easy_perform(fc->handle);
707
+ return NULL;
708
+ }
709
+
710
+ /* Parse a HTTP header blob ("HTTP/2 200\r\nHeader: value\r\n...") into
711
+ * a Ruby Hash. Multi-value headers get concatenated. Status lines are
712
+ * filtered out so the Hash only carries response headers. */
713
+ static VALUE parse_headers_blob(const char *data, size_t len) {
714
+ VALUE h = rb_hash_new();
715
+ size_t i = 0;
716
+ while (i < len) {
717
+ size_t line_start = i;
718
+ while (i < len && data[i] != '\n') i++;
719
+ size_t line_end = i;
720
+ if (line_end > line_start && data[line_end - 1] == '\r') line_end--;
721
+ if (i < len) i++;
722
+ if (line_end == line_start) continue;
723
+
724
+ /* Skip the "HTTP/x.y NNN ..." status line; curl emits one per
725
+ * redirect step. Real headers always contain ':'. */
726
+ size_t colon = (size_t)-1;
727
+ for (size_t k = line_start; k < line_end; k++) {
728
+ if (data[k] == ':') { colon = k; break; }
729
+ }
730
+ if (colon == (size_t)-1) continue;
731
+
732
+ size_t name_s = line_start;
733
+ size_t name_e = colon;
734
+ size_t val_s = colon + 1;
735
+ while (val_s < line_end && (data[val_s] == ' ' || data[val_s] == '\t')) val_s++;
736
+ size_t val_e = line_end;
737
+
738
+ VALUE name = rb_str_new(data + name_s, (long)(name_e - name_s));
739
+ VALUE val = rb_str_new(data + val_s, (long)(val_e - val_s));
740
+ rb_enc_associate(name, enc_utf8);
741
+ rb_enc_associate(val, enc_utf8);
742
+ /* Header names are ASCII case-insensitive; downcase for lookup
743
+ * ergonomics on the Ruby side. */
744
+ rb_funcall(name, rb_intern("downcase!"), 0);
745
+ VALUE existing = rb_hash_lookup(h, name);
746
+ if (NIL_P(existing)) {
747
+ rb_hash_aset(h, name, val);
748
+ } else {
749
+ VALUE both = rb_str_dup(existing);
750
+ rb_str_cat_cstr(both, ", ");
751
+ rb_str_append(both, val);
752
+ rb_hash_aset(h, name, both);
753
+ }
754
+ }
755
+ return h;
756
+ }
757
+
758
+ static VALUE scrap_http_get(int argc, VALUE *argv, VALUE self) {
759
+ (void)self;
760
+ scrap_ensure_global_init();
761
+ VALUE url_v, opts_v;
762
+ rb_scan_args(argc, argv, "11", &url_v, &opts_v);
763
+ Check_Type(url_v, T_STRING);
764
+
765
+ long timeout_ms = 30000;
766
+ int follow = 1;
767
+ long max_redirs = 10;
768
+ const char *ua = "scrapetor/0.1 (libcurl)";
769
+ VALUE headers_v = Qnil;
770
+ int insecure = 0;
771
+ const char *method = NULL; /* NULL = GET */
772
+ const char *body = NULL;
773
+ long body_len = 0;
774
+ int nobody = 0; /* HEAD */
775
+ const char *cookiejar = NULL;
776
+ const char *cookiefile = NULL;
777
+ const char *proxy = NULL;
778
+ const char *basic_auth = NULL;
779
+ const char *bearer = NULL;
780
+ const char *ca_path = NULL;
781
+ long rate_limit_ms = 0;
782
+ int transcode_utf8 = 1;
783
+ const char *cache_dir = NULL;
784
+ VALUE multipart_v = Qnil;
785
+ /* mTLS client cert + key */
786
+ const char *ssl_cert = NULL;
787
+ const char *ssl_key = NULL;
788
+ const char *ssl_key_pass = NULL;
789
+ const char *ssl_cert_type = NULL; /* "PEM" / "DER"; NULL = libcurl default */
790
+ /* Proxy auth + type */
791
+ const char *proxy_auth = NULL; /* "user:pass" */
792
+ const char *proxy_type = NULL; /* "http", "socks5", "socks5h", ... */
793
+ /* Stream body to disk instead of buffering. When set, the
794
+ * response :body in the returned hash is an empty String; the
795
+ * caller reads from the file. */
796
+ const char *download_to = NULL;
797
+ long max_recv_bps = 0; /* CURLOPT_MAX_RECV_SPEED_LARGE */
798
+ long max_send_bps = 0; /* CURLOPT_MAX_SEND_SPEED_LARGE */
799
+ /* HTTP version selection. NULL = default (HTTP/2 over TLS with
800
+ * 1.1 fallback). "1.0" / "1.1" / "2" / "3" force the negotiated
801
+ * version. "3" requires libcurl with HTTP/3 support; otherwise
802
+ * curl errors. */
803
+ const char *http_version = NULL;
804
+
805
+ if (!NIL_P(opts_v)) {
806
+ Check_Type(opts_v, T_HASH);
807
+ VALUE v;
808
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("timeout_ms")));
809
+ if (!NIL_P(v)) timeout_ms = NUM2LONG(v);
810
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("follow_redirects")));
811
+ if (!NIL_P(v)) follow = RTEST(v) ? 1 : 0;
812
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_redirects")));
813
+ if (!NIL_P(v)) max_redirs = NUM2LONG(v);
814
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("user_agent")));
815
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ua = RSTRING_PTR(v); }
816
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("headers")));
817
+ if (!NIL_P(v)) { Check_Type(v, T_HASH); headers_v = v; }
818
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("insecure")));
819
+ if (!NIL_P(v)) insecure = RTEST(v) ? 1 : 0;
820
+
821
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("method")));
822
+ if (!NIL_P(v)) {
823
+ if (SYMBOL_P(v)) v = rb_sym2str(v);
824
+ Check_Type(v, T_STRING);
825
+ method = RSTRING_PTR(v);
826
+ if (strcasecmp(method, "head") == 0) { nobody = 1; method = NULL; }
827
+ else if (strcasecmp(method, "get") == 0) method = NULL;
828
+ else {
829
+ /* HTTP methods are case-sensitive; uppercase so
830
+ * picky servers (RFC 7231 strict) accept them. */
831
+ static char method_buf[24];
832
+ size_t mi = 0;
833
+ for (; mi < sizeof(method_buf) - 1 && method[mi]; mi++) {
834
+ char c = method[mi];
835
+ method_buf[mi] = (c >= 'a' && c <= 'z') ? (char)(c - 32) : c;
836
+ }
837
+ method_buf[mi] = 0;
838
+ method = method_buf;
839
+ }
840
+ }
841
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("body")));
842
+ if (!NIL_P(v)) {
843
+ Check_Type(v, T_STRING);
844
+ body = RSTRING_PTR(v);
845
+ body_len = RSTRING_LEN(v);
846
+ }
847
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cookiejar")));
848
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); cookiejar = RSTRING_PTR(v); }
849
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cookiefile")));
850
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); cookiefile = RSTRING_PTR(v); }
851
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("proxy")));
852
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); proxy = RSTRING_PTR(v); }
853
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("basic_auth")));
854
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); basic_auth = RSTRING_PTR(v); }
855
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("bearer_token")));
856
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); bearer = RSTRING_PTR(v); }
857
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ca_path")));
858
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ca_path = RSTRING_PTR(v); }
859
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("rate_limit_ms")));
860
+ if (!NIL_P(v)) rate_limit_ms = NUM2LONG(v);
861
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("transcode_utf8")));
862
+ if (!NIL_P(v)) transcode_utf8 = RTEST(v) ? 1 : 0;
863
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cache_dir")));
864
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); cache_dir = RSTRING_PTR(v); }
865
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("multipart")));
866
+ if (!NIL_P(v)) { Check_Type(v, T_HASH); multipart_v = v; }
867
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ssl_cert")));
868
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ssl_cert = RSTRING_PTR(v); }
869
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ssl_key")));
870
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ssl_key = RSTRING_PTR(v); }
871
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ssl_key_password")));
872
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ssl_key_pass = RSTRING_PTR(v); }
873
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ssl_cert_type")));
874
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ssl_cert_type = RSTRING_PTR(v); }
875
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("proxy_auth")));
876
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); proxy_auth = RSTRING_PTR(v); }
877
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("proxy_type")));
878
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); proxy_type = RSTRING_PTR(v); }
879
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("download_to")));
880
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); download_to = RSTRING_PTR(v); }
881
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_recv_bps")));
882
+ if (!NIL_P(v)) max_recv_bps = NUM2LONG(v);
883
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_send_bps")));
884
+ if (!NIL_P(v)) max_send_bps = NUM2LONG(v);
885
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("http_version")));
886
+ if (!NIL_P(v)) {
887
+ if (SYMBOL_P(v)) v = rb_sym2str(v);
888
+ Check_Type(v, T_STRING);
889
+ http_version = RSTRING_PTR(v);
890
+ }
891
+ }
892
+
893
+ CURL *h = get_thread_curl();
894
+ if (!h) rb_raise(rb_eRuntimeError, "curl_easy_init failed");
895
+
896
+ fetch_ctx_t fc;
897
+ memset(&fc, 0, sizeof(fc));
898
+ fc.handle = h;
899
+
900
+ curl_easy_setopt(h, CURLOPT_URL, RSTRING_PTR(url_v));
901
+ /* HTTP version. Default: HTTP/2 over TLS with 1.1 fallback via
902
+ * ALPN — non-HTTPS targets fall back to HTTP/1.1 automatically.
903
+ * Opt in to "3" if the linked libcurl was built with HTTP/3. */
904
+ {
905
+ long ver = (long)CURL_HTTP_VERSION_2TLS;
906
+ if (http_version) {
907
+ if (strcmp(http_version, "1.0") == 0) ver = CURL_HTTP_VERSION_1_0;
908
+ else if (strcmp(http_version, "1.1") == 0) ver = CURL_HTTP_VERSION_1_1;
909
+ else if (strcmp(http_version, "2") == 0) ver = CURL_HTTP_VERSION_2TLS;
910
+ #ifdef CURL_HTTP_VERSION_3
911
+ else if (strcmp(http_version, "3") == 0) ver = CURL_HTTP_VERSION_3;
912
+ #endif
913
+ }
914
+ curl_easy_setopt(h, CURLOPT_HTTP_VERSION, ver);
915
+ }
916
+ /* Tell curl to wait briefly for an existing HTTP/2 connection to
917
+ * the target to become available rather than opening a fresh
918
+ * TCP+TLS handshake. Combined with the shared CONNECT pool this
919
+ * lets N workers multiplex through one connection per host. */
920
+ #ifdef CURLOPT_PIPEWAIT
921
+ curl_easy_setopt(h, CURLOPT_PIPEWAIT, 1L);
922
+ #endif
923
+ /* Accept-Encoding goes through CURLOPT_HTTPHEADER below, not
924
+ * CURLOPT_ACCEPT_ENCODING. The latter binds decompression to
925
+ * libcurl's compile-time codec set and aborts the response on
926
+ * encodings curl wasn't built for — which would defeat our
927
+ * point of shipping in-process brotli/zstd. */
928
+ fc.req_headers = curl_slist_append(
929
+ fc.req_headers, "Accept-Encoding: identity");
930
+ /* Replaced just below if any codec is linked. */
931
+ if (scrap_accept_encoding()[0] && strcmp(scrap_accept_encoding(), "identity") != 0) {
932
+ char ae_line[160];
933
+ snprintf(ae_line, sizeof(ae_line), "Accept-Encoding: %s",
934
+ scrap_accept_encoding());
935
+ /* Pop the identity line and replace. curl_slist has no
936
+ * direct replace, so we rebuild from scratch. */
937
+ curl_slist_free_all(fc.req_headers);
938
+ fc.req_headers = NULL;
939
+ fc.req_headers = curl_slist_append(fc.req_headers, ae_line);
940
+ }
941
+ curl_easy_setopt(h, CURLOPT_USERAGENT, ua);
942
+ curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, (long)follow);
943
+ curl_easy_setopt(h, CURLOPT_MAXREDIRS, max_redirs);
944
+ curl_easy_setopt(h, CURLOPT_TIMEOUT_MS, timeout_ms);
945
+ curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L); /* required for use inside Ruby */
946
+ curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, cb_body);
947
+ curl_easy_setopt(h, CURLOPT_WRITEDATA, &fc.body);
948
+ curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, cb_header);
949
+ curl_easy_setopt(h, CURLOPT_HEADERDATA, &fc.headers);
950
+ curl_easy_setopt(h, CURLOPT_TCP_KEEPALIVE, 1L);
951
+ if (insecure) {
952
+ curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
953
+ curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
954
+ }
955
+ if (ca_path) {
956
+ curl_easy_setopt(h, CURLOPT_CAINFO, ca_path);
957
+ }
958
+
959
+ /* Method + body. CUSTOMREQUEST overrides the verb regardless of
960
+ * POSTFIELDS presence; libcurl auto-switches to POST when POSTFIELDS
961
+ * is set, so we force CUSTOMREQUEST for everything non-GET to be
962
+ * explicit. NOBODY for HEAD strips the response body. */
963
+ if (nobody) {
964
+ curl_easy_setopt(h, CURLOPT_NOBODY, 1L);
965
+ curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, "HEAD");
966
+ } else if (method) {
967
+ curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, method);
968
+ }
969
+ if (body) {
970
+ curl_easy_setopt(h, CURLOPT_POSTFIELDS, body);
971
+ curl_easy_setopt(h, CURLOPT_POSTFIELDSIZE_LARGE, (curl_off_t)body_len);
972
+ }
973
+
974
+ /* Multipart form upload. Each Hash entry becomes a form part:
975
+ * "field" => "string" - text field
976
+ * "field" => { path: "...", filename: ..., content_type: ... }
977
+ * "field" => { data: "...bytes...", filename: ..., content_type: ... }
978
+ * Mixed in any combination. */
979
+ curl_mime *mime = NULL;
980
+ if (!NIL_P(multipart_v)) {
981
+ mime = curl_mime_init(h);
982
+ VALUE keys = rb_funcall(multipart_v, rb_intern("keys"), 0);
983
+ long nk = RARRAY_LEN(keys);
984
+ for (long i = 0; i < nk; i++) {
985
+ VALUE k = rb_ary_entry(keys, i);
986
+ VALUE pv = rb_hash_aref(multipart_v, k);
987
+ VALUE k_s = rb_obj_as_string(k);
988
+ curl_mimepart *part = curl_mime_addpart(mime);
989
+ curl_mime_name(part, RSTRING_PTR(k_s));
990
+ if (RB_TYPE_P(pv, T_STRING)) {
991
+ curl_mime_data(part, RSTRING_PTR(pv), (size_t)RSTRING_LEN(pv));
992
+ } else if (RB_TYPE_P(pv, T_HASH)) {
993
+ VALUE data_v = rb_hash_aref(pv, ID2SYM(rb_intern("data")));
994
+ VALUE path_v = rb_hash_aref(pv, ID2SYM(rb_intern("path")));
995
+ VALUE filename_v = rb_hash_aref(pv, ID2SYM(rb_intern("filename")));
996
+ VALUE ctype_v = rb_hash_aref(pv, ID2SYM(rb_intern("content_type")));
997
+ if (!NIL_P(path_v)) {
998
+ Check_Type(path_v, T_STRING);
999
+ curl_mime_filedata(part, RSTRING_PTR(path_v));
1000
+ } else if (!NIL_P(data_v)) {
1001
+ Check_Type(data_v, T_STRING);
1002
+ curl_mime_data(part, RSTRING_PTR(data_v), (size_t)RSTRING_LEN(data_v));
1003
+ }
1004
+ if (!NIL_P(filename_v)) {
1005
+ Check_Type(filename_v, T_STRING);
1006
+ curl_mime_filename(part, RSTRING_PTR(filename_v));
1007
+ }
1008
+ if (!NIL_P(ctype_v)) {
1009
+ Check_Type(ctype_v, T_STRING);
1010
+ curl_mime_type(part, RSTRING_PTR(ctype_v));
1011
+ }
1012
+ } else {
1013
+ rb_raise(rb_eArgError,
1014
+ "multipart values must be String or Hash with :path/:data");
1015
+ }
1016
+ }
1017
+ curl_easy_setopt(h, CURLOPT_MIMEPOST, mime);
1018
+ }
1019
+
1020
+ if (cookiefile) curl_easy_setopt(h, CURLOPT_COOKIEFILE, cookiefile);
1021
+ if (cookiejar) curl_easy_setopt(h, CURLOPT_COOKIEJAR, cookiejar);
1022
+ if (!cookiefile && cookiejar) {
1023
+ /* Tell curl to start with an empty in-memory jar (so writes
1024
+ * land somewhere) even when no input file is provided. */
1025
+ curl_easy_setopt(h, CURLOPT_COOKIEFILE, "");
1026
+ }
1027
+ if (proxy) curl_easy_setopt(h, CURLOPT_PROXY, proxy);
1028
+ if (proxy_auth) curl_easy_setopt(h, CURLOPT_PROXYUSERPWD, proxy_auth);
1029
+ if (proxy_type) {
1030
+ long pt = CURLPROXY_HTTP;
1031
+ if (strcasecmp(proxy_type, "http") == 0) pt = CURLPROXY_HTTP;
1032
+ else if (strcasecmp(proxy_type, "https") == 0) pt = CURLPROXY_HTTPS;
1033
+ else if (strcasecmp(proxy_type, "socks4") == 0) pt = CURLPROXY_SOCKS4;
1034
+ else if (strcasecmp(proxy_type, "socks4a") == 0) pt = CURLPROXY_SOCKS4A;
1035
+ else if (strcasecmp(proxy_type, "socks5") == 0) pt = CURLPROXY_SOCKS5;
1036
+ else if (strcasecmp(proxy_type, "socks5h") == 0) pt = CURLPROXY_SOCKS5_HOSTNAME;
1037
+ curl_easy_setopt(h, CURLOPT_PROXYTYPE, pt);
1038
+ }
1039
+ /* mTLS: present a client cert during TLS handshake. */
1040
+ if (ssl_cert) curl_easy_setopt(h, CURLOPT_SSLCERT, ssl_cert);
1041
+ if (ssl_cert_type) curl_easy_setopt(h, CURLOPT_SSLCERTTYPE, ssl_cert_type);
1042
+ if (ssl_key) curl_easy_setopt(h, CURLOPT_SSLKEY, ssl_key);
1043
+ if (ssl_key_pass) curl_easy_setopt(h, CURLOPT_KEYPASSWD, ssl_key_pass);
1044
+ /* Bandwidth caps. 0 means unlimited (libcurl default). */
1045
+ if (max_recv_bps > 0) {
1046
+ curl_easy_setopt(h, CURLOPT_MAX_RECV_SPEED_LARGE, (curl_off_t)max_recv_bps);
1047
+ }
1048
+ if (max_send_bps > 0) {
1049
+ curl_easy_setopt(h, CURLOPT_MAX_SEND_SPEED_LARGE, (curl_off_t)max_send_bps);
1050
+ }
1051
+ /* Streaming download to disk. Uses libcurl's default fwrite
1052
+ * callback, bypassing the in-memory body buffer entirely — no
1053
+ * RAM growth regardless of body size. Caller reads from
1054
+ * download_to after the request returns. */
1055
+ FILE *dl_fp = NULL;
1056
+ if (download_to) {
1057
+ dl_fp = fopen(download_to, "wb");
1058
+ if (!dl_fp) {
1059
+ rb_raise(rb_eIOError, "scrapetor http: cannot open download_to %s", download_to);
1060
+ }
1061
+ curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, NULL); /* libcurl's default = fwrite */
1062
+ curl_easy_setopt(h, CURLOPT_WRITEDATA, (void *)dl_fp);
1063
+ }
1064
+ if (basic_auth) {
1065
+ curl_easy_setopt(h, CURLOPT_HTTPAUTH, (long)CURLAUTH_BASIC);
1066
+ curl_easy_setopt(h, CURLOPT_USERPWD, basic_auth);
1067
+ }
1068
+ if (bearer) {
1069
+ #ifdef CURLAUTH_BEARER
1070
+ curl_easy_setopt(h, CURLOPT_HTTPAUTH, (long)CURLAUTH_BEARER);
1071
+ curl_easy_setopt(h, CURLOPT_XOAUTH2_BEARER, bearer);
1072
+ #else
1073
+ /* Older libcurl — fall back to a manual Authorization header. */
1074
+ char line[1024];
1075
+ snprintf(line, sizeof(line), "Authorization: Bearer %s", bearer);
1076
+ fc.req_headers = curl_slist_append(fc.req_headers, line);
1077
+ #endif
1078
+ }
1079
+
1080
+ if (!NIL_P(headers_v)) {
1081
+ VALUE keys = rb_funcall(headers_v, rb_intern("keys"), 0);
1082
+ long n = RARRAY_LEN(keys);
1083
+ for (long i = 0; i < n; i++) {
1084
+ VALUE k = rb_ary_entry(keys, i);
1085
+ VALUE v = rb_hash_aref(headers_v, k);
1086
+ VALUE line = rb_str_dup(k);
1087
+ rb_str_cat_cstr(line, ": ");
1088
+ rb_str_append(line, v);
1089
+ fc.req_headers = curl_slist_append(fc.req_headers, RSTRING_PTR(line));
1090
+ }
1091
+ }
1092
+ /* Always set the slist — at minimum it carries Accept-Encoding so
1093
+ * curl forwards our codec advertisement rather than its own
1094
+ * (which would let curl claim decompression responsibility we
1095
+ * mean to keep). */
1096
+ if (fc.req_headers) {
1097
+ curl_easy_setopt(h, CURLOPT_HTTPHEADER, fc.req_headers);
1098
+ }
1099
+
1100
+ /* HTTP response cache lookup + revalidation. If a cache entry
1101
+ * exists for this URL, attach If-None-Match / If-Modified-Since
1102
+ * so the server can answer 304 (no body) when nothing changed. */
1103
+ scrap_cache_entry_t cached;
1104
+ memset(&cached, 0, sizeof(cached));
1105
+ int have_cached = 0;
1106
+ if (cache_dir && !nobody && !method && !body) {
1107
+ /* Cache only safe-GETs. POST/PUT/DELETE responses aren't
1108
+ * eligible per RFC 7234, and HEAD has no body to serve. */
1109
+ have_cached = scrap_cache_load(cache_dir, RSTRING_PTR(url_v), &cached);
1110
+ if (have_cached) {
1111
+ if (cached.etag_len > 0) {
1112
+ char line[1024];
1113
+ snprintf(line, sizeof(line), "If-None-Match: %.*s",
1114
+ (int)cached.etag_len, cached.etag);
1115
+ fc.req_headers = curl_slist_append(fc.req_headers, line);
1116
+ }
1117
+ if (cached.lastmod_len > 0) {
1118
+ char line[1024];
1119
+ snprintf(line, sizeof(line), "If-Modified-Since: %.*s",
1120
+ (int)cached.lastmod_len, cached.lastmod);
1121
+ fc.req_headers = curl_slist_append(fc.req_headers, line);
1122
+ }
1123
+ }
1124
+ }
1125
+
1126
+ /* Per-host throttle. Honours rate_limit_ms before we even open
1127
+ * the socket; safe to call under GVL or no-GVL since it uses
1128
+ * only pthread + nanosleep. */
1129
+ if (rate_limit_ms > 0) {
1130
+ char host[256];
1131
+ if (scrap_extract_host(RSTRING_PTR(url_v), host, sizeof(host))) {
1132
+ scrap_throttle_wait(host, (uint64_t)rate_limit_ms * 1000000ull);
1133
+ }
1134
+ }
1135
+
1136
+ /* Drop the GVL while curl is on the network. Other Ruby threads
1137
+ * (background loaders, log writers, etc.) keep moving during the
1138
+ * round-trip. */
1139
+ rb_thread_call_without_gvl(do_fetch_nogvl, &fc, NULL, NULL);
1140
+
1141
+ if (fc.req_headers) curl_slist_free_all(fc.req_headers);
1142
+ if (mime) curl_mime_free(mime);
1143
+ if (dl_fp) { fclose(dl_fp); dl_fp = NULL; }
1144
+
1145
+ if (fc.rc != CURLE_OK) {
1146
+ const char *err = curl_easy_strerror(fc.rc);
1147
+ free(fc.body.data);
1148
+ free(fc.headers.data);
1149
+ rb_raise(rb_eIOError, "scrapetor http: %s", err);
1150
+ }
1151
+
1152
+ long status = 0;
1153
+ curl_easy_getinfo(h, CURLINFO_RESPONSE_CODE, &status);
1154
+ char *eff_url = NULL;
1155
+ curl_easy_getinfo(h, CURLINFO_EFFECTIVE_URL, &eff_url);
1156
+ long http_ver = 0;
1157
+ curl_easy_getinfo(h, CURLINFO_HTTP_VERSION, &http_ver);
1158
+
1159
+ /* Flush the cookie jar to disk now rather than at handle cleanup
1160
+ * (which happens on thread exit). Lets callers see Set-Cookie
1161
+ * values immediately after the request completes. */
1162
+ if (cookiejar) curl_easy_setopt(h, CURLOPT_COOKIELIST, "FLUSH");
1163
+
1164
+ /* HTTP cache revalidation: 304 -> serve from cache; 200 with
1165
+ * ETag/Last-Modified -> store new entry. */
1166
+ int served_from_cache = 0;
1167
+ if (cache_dir && have_cached && status == 304) {
1168
+ /* Replace body buffer with cached payload; bump status to 200
1169
+ * so the caller sees a fully-formed response. The actual 304
1170
+ * round-trip was cheap (no body) — this is the cache win. */
1171
+ free(fc.body.data);
1172
+ fc.body.data = (char *)malloc(cached.body_len + 1);
1173
+ memcpy(fc.body.data, cached.body, cached.body_len);
1174
+ fc.body.data[cached.body_len] = 0;
1175
+ fc.body.len = cached.body_len;
1176
+ fc.body.cap = cached.body_len;
1177
+ status = 200;
1178
+ served_from_cache = 1;
1179
+ }
1180
+
1181
+ VALUE headers_h = parse_headers_blob(fc.headers.data ? fc.headers.data : "",
1182
+ fc.headers.len);
1183
+
1184
+ /* When we served from cache, the network response was 304 with no
1185
+ * headers other than status/ETag. Overlay the cached
1186
+ * Content-Type so consumers see the right metadata for the
1187
+ * body they're getting. */
1188
+ if (served_from_cache && cached.ctype_len > 0) {
1189
+ rb_hash_aset(headers_h, rb_str_new_cstr("content-type"),
1190
+ rb_str_new(cached.ctype, (long)cached.ctype_len));
1191
+ rb_hash_aset(headers_h, rb_str_new_cstr("x-scrapetor-cache"),
1192
+ rb_str_new_cstr("hit"));
1193
+ } else if (cache_dir && have_cached) {
1194
+ rb_hash_aset(headers_h, rb_str_new_cstr("x-scrapetor-cache"),
1195
+ rb_str_new_cstr("miss-revalidated"));
1196
+ }
1197
+
1198
+ /* If a Content-Encoding header is still present, libcurl couldn't
1199
+ * decode it (it strips the header on successful auto-decompress).
1200
+ * Try our in-process decoders for brotli / zstd. On success,
1201
+ * remove the header so the body matches what callers see. */
1202
+ {
1203
+ VALUE ce_key = rb_str_new_cstr("content-encoding");
1204
+ VALUE ce_val = rb_hash_lookup(headers_h, ce_key);
1205
+ if (!NIL_P(ce_val)) {
1206
+ const char *ce = RSTRING_PTR(ce_val);
1207
+ long ce_len = RSTRING_LEN(ce_val);
1208
+ /* Trim surrounding whitespace + match the bare codec name. */
1209
+ while (ce_len > 0 && (*ce == ' ' || *ce == '\t')) { ce++; ce_len--; }
1210
+ while (ce_len > 0 && (ce[ce_len-1] == ' ' || ce[ce_len-1] == '\t' ||
1211
+ ce[ce_len-1] == '\r' || ce[ce_len-1] == '\n')) ce_len--;
1212
+ int decoded = 0;
1213
+ #ifdef HAVE_ZLIB
1214
+ if (ce_len == 4 &&
1215
+ (ce[0] == 'g' || ce[0] == 'G') &&
1216
+ (ce[1] == 'z' || ce[1] == 'Z') &&
1217
+ (ce[2] == 'i' || ce[2] == 'I') &&
1218
+ (ce[3] == 'p' || ce[3] == 'P')) {
1219
+ char *out = NULL; size_t out_len = 0;
1220
+ /* 47 = 15 + 32; +32 enables gzip+zlib auto-detect. */
1221
+ if (scrap_zlib_decode(fc.body.data, fc.body.len, 47,
1222
+ &out, &out_len)) {
1223
+ free(fc.body.data);
1224
+ fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
1225
+ decoded = 1;
1226
+ }
1227
+ }
1228
+ if (!decoded && ce_len == 7 &&
1229
+ (ce[0] == 'd' || ce[0] == 'D') &&
1230
+ (ce[1] == 'e' || ce[1] == 'E') &&
1231
+ (ce[2] == 'f' || ce[2] == 'F') &&
1232
+ (ce[3] == 'l' || ce[3] == 'L') &&
1233
+ (ce[4] == 'a' || ce[4] == 'A') &&
1234
+ (ce[5] == 't' || ce[5] == 'T') &&
1235
+ (ce[6] == 'e' || ce[6] == 'E')) {
1236
+ char *out = NULL; size_t out_len = 0;
1237
+ /* Try raw deflate first (-15), fall back to zlib wrapper (15).
1238
+ * Real-world Content-Encoding: deflate is sent both ways. */
1239
+ if (!scrap_zlib_decode(fc.body.data, fc.body.len, -15,
1240
+ &out, &out_len)) {
1241
+ if (scrap_zlib_decode(fc.body.data, fc.body.len, 15,
1242
+ &out, &out_len)) {
1243
+ free(fc.body.data);
1244
+ fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
1245
+ decoded = 1;
1246
+ }
1247
+ } else {
1248
+ free(fc.body.data);
1249
+ fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
1250
+ decoded = 1;
1251
+ }
1252
+ }
1253
+ #endif
1254
+ #ifdef HAVE_BROTLI
1255
+ if (!decoded && ce_len == 2 && (ce[0] == 'b' || ce[0] == 'B') &&
1256
+ (ce[1] == 'r' || ce[1] == 'R')) {
1257
+ char *out = NULL; size_t out_len = 0;
1258
+ if (scrap_brotli_decode(fc.body.data, fc.body.len, &out, &out_len)) {
1259
+ free(fc.body.data);
1260
+ fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
1261
+ decoded = 1;
1262
+ }
1263
+ }
1264
+ #endif
1265
+ #ifdef HAVE_ZSTD
1266
+ if (!decoded && ce_len == 4 &&
1267
+ (ce[0] == 'z' || ce[0] == 'Z') &&
1268
+ (ce[1] == 's' || ce[1] == 'S') &&
1269
+ (ce[2] == 't' || ce[2] == 'T') &&
1270
+ (ce[3] == 'd' || ce[3] == 'D')) {
1271
+ char *out = NULL; size_t out_len = 0;
1272
+ if (scrap_zstd_decode(fc.body.data, fc.body.len, &out, &out_len)) {
1273
+ free(fc.body.data);
1274
+ fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
1275
+ decoded = 1;
1276
+ }
1277
+ }
1278
+ #endif
1279
+ if (decoded) {
1280
+ rb_hash_delete(headers_h, ce_key);
1281
+ }
1282
+ }
1283
+ }
1284
+
1285
+ /* Charset transcode to UTF-8. Runs after content-encoding decode
1286
+ * so iconv sees the raw decoded text. */
1287
+ if (transcode_utf8 && fc.body.data && fc.body.len > 0) {
1288
+ if (scrap_apply_charset(fc.headers.data ? fc.headers.data : "", fc.headers.len,
1289
+ &fc.body.data, &fc.body.len, &fc.body.cap)) {
1290
+ /* Rewrite content-type so consumers see the new charset. */
1291
+ VALUE ct_key = rb_str_new_cstr("content-type");
1292
+ VALUE ct_val = rb_hash_lookup(headers_h, ct_key);
1293
+ if (!NIL_P(ct_val)) {
1294
+ VALUE replaced = rb_funcall(ct_val, rb_intern("sub"), 2,
1295
+ rb_reg_new_str(rb_str_new_cstr("charset\\s*=\\s*\"?[\\w\\-]+\"?"), 1 /* IGNORECASE */),
1296
+ rb_str_new_cstr("charset=utf-8"));
1297
+ rb_hash_aset(headers_h, ct_key, replaced);
1298
+ }
1299
+ }
1300
+ }
1301
+
1302
+ /* Update cache for 2xx responses with cache-relevant headers.
1303
+ * Skip when the body is empty (HEAD already exits earlier) or when
1304
+ * the response was already a cache-served 304. */
1305
+ if (cache_dir && !served_from_cache && status >= 200 && status < 300 &&
1306
+ fc.body.data && fc.body.len > 0) {
1307
+ VALUE etag_v = rb_hash_lookup(headers_h, rb_str_new_cstr("etag"));
1308
+ VALUE lastmod_v = rb_hash_lookup(headers_h, rb_str_new_cstr("last-modified"));
1309
+ VALUE ctype_v = rb_hash_lookup(headers_h, rb_str_new_cstr("content-type"));
1310
+ /* Only cache when there's *some* revalidation token. Otherwise the
1311
+ * entry would be useless (every fetch would always re-download). */
1312
+ if (!NIL_P(etag_v) || !NIL_P(lastmod_v)) {
1313
+ const char *etag_p = NIL_P(etag_v) ? "" : RSTRING_PTR(etag_v);
1314
+ size_t etag_l = NIL_P(etag_v) ? 0 : (size_t)RSTRING_LEN(etag_v);
1315
+ const char *lastmod_p = NIL_P(lastmod_v) ? "" : RSTRING_PTR(lastmod_v);
1316
+ size_t lastmod_l = NIL_P(lastmod_v) ? 0 : (size_t)RSTRING_LEN(lastmod_v);
1317
+ const char *ctype_p = NIL_P(ctype_v) ? "" : RSTRING_PTR(ctype_v);
1318
+ size_t ctype_l = NIL_P(ctype_v) ? 0 : (size_t)RSTRING_LEN(ctype_v);
1319
+ scrap_cache_store(cache_dir, RSTRING_PTR(url_v), status,
1320
+ etag_p, etag_l, lastmod_p, lastmod_l,
1321
+ ctype_p, ctype_l,
1322
+ fc.body.data, fc.body.len);
1323
+ }
1324
+ }
1325
+ scrap_cache_entry_free(&cached);
1326
+
1327
+ /* When download_to is set the body went straight to disk via
1328
+ * libcurl's default fwrite callback — fc.body is empty. Surface
1329
+ * an empty Ruby String + a :downloaded_to key pointing at the
1330
+ * file so the caller knows where to find the bytes. */
1331
+ VALUE body_s = download_to
1332
+ ? rb_enc_str_new("", 0, enc_utf8)
1333
+ : rb_str_new(fc.body.data ? fc.body.data : "", (long)fc.body.len);
1334
+ rb_enc_associate(body_s, enc_utf8);
1335
+
1336
+ free(fc.body.data);
1337
+ free(fc.headers.data);
1338
+
1339
+ VALUE result = rb_hash_new();
1340
+ rb_hash_aset(result, ID2SYM(rb_intern("status")), LONG2NUM(status));
1341
+ rb_hash_aset(result, ID2SYM(rb_intern("headers")), headers_h);
1342
+ rb_hash_aset(result, ID2SYM(rb_intern("body")), body_s);
1343
+ rb_hash_aset(result, ID2SYM(rb_intern("final_url")),
1344
+ rb_str_new_cstr(eff_url ? eff_url : RSTRING_PTR(url_v)));
1345
+ const char *hv_str = "1.1";
1346
+ switch (http_ver) {
1347
+ case CURL_HTTP_VERSION_1_0: hv_str = "1.0"; break;
1348
+ case CURL_HTTP_VERSION_1_1: hv_str = "1.1"; break;
1349
+ case CURL_HTTP_VERSION_2_0: hv_str = "2"; break;
1350
+ #ifdef CURL_HTTP_VERSION_3
1351
+ case CURL_HTTP_VERSION_3: hv_str = "3"; break;
1352
+ #endif
1353
+ }
1354
+ rb_hash_aset(result, ID2SYM(rb_intern("http_version")),
1355
+ rb_str_new_cstr(hv_str));
1356
+ if (download_to) {
1357
+ rb_hash_aset(result, ID2SYM(rb_intern("downloaded_to")),
1358
+ rb_str_new_cstr(download_to));
1359
+ }
1360
+
1361
+ return result;
1362
+ }
1363
+
1364
+ static VALUE scrap_http_features(VALUE self) {
1365
+ (void)self;
1366
+ VALUE h = rb_hash_new();
1367
+ curl_version_info_data *vi = curl_version_info(CURLVERSION_NOW);
1368
+ rb_hash_aset(h, ID2SYM(rb_intern("curl_version")),
1369
+ rb_str_new_cstr(vi->version));
1370
+ rb_hash_aset(h, ID2SYM(rb_intern("http2")),
1371
+ (vi->features & CURL_VERSION_HTTP2) ? Qtrue : Qfalse);
1372
+ /* HTTP/3 (QUIC) — only present when libcurl was built with
1373
+ * quiche / ngtcp2. Apple's system libcurl and most distro
1374
+ * defaults are HTTP/2-only; HTTP/3 requires a custom build. */
1375
+ #ifdef CURL_VERSION_HTTP3
1376
+ rb_hash_aset(h, ID2SYM(rb_intern("http3")),
1377
+ (vi->features & CURL_VERSION_HTTP3) ? Qtrue : Qfalse);
1378
+ #else
1379
+ rb_hash_aset(h, ID2SYM(rb_intern("http3")), Qfalse);
1380
+ #endif
1381
+ /* WebSocket support — libcurl 7.86+ via curl_ws_send/recv. */
1382
+ #ifdef CURLWS_BINARY
1383
+ rb_hash_aset(h, ID2SYM(rb_intern("websocket")), Qtrue);
1384
+ #else
1385
+ rb_hash_aset(h, ID2SYM(rb_intern("websocket")), Qfalse);
1386
+ #endif
1387
+
1388
+ /* "brotli" / "zstd" reflect what *we* can deliver, not what
1389
+ * curl can. True if either curl was built with it OR we link
1390
+ * the codec library directly (HAVE_BROTLI / HAVE_ZSTD) for
1391
+ * in-process decoding. */
1392
+ int has_brotli = 0;
1393
+ #ifdef CURL_VERSION_BROTLI
1394
+ has_brotli |= (vi->features & CURL_VERSION_BROTLI) ? 1 : 0;
1395
+ #endif
1396
+ #ifdef HAVE_BROTLI
1397
+ has_brotli = 1;
1398
+ #endif
1399
+ rb_hash_aset(h, ID2SYM(rb_intern("brotli")), has_brotli ? Qtrue : Qfalse);
1400
+ #ifdef HAVE_BROTLI
1401
+ rb_hash_aset(h, ID2SYM(rb_intern("brotli_inproc")), Qtrue);
1402
+ #else
1403
+ rb_hash_aset(h, ID2SYM(rb_intern("brotli_inproc")), Qfalse);
1404
+ #endif
1405
+
1406
+ int has_zstd = 0;
1407
+ #ifdef CURL_VERSION_ZSTD
1408
+ has_zstd |= (vi->features & CURL_VERSION_ZSTD) ? 1 : 0;
1409
+ #endif
1410
+ #ifdef HAVE_ZSTD
1411
+ has_zstd = 1;
1412
+ #endif
1413
+ rb_hash_aset(h, ID2SYM(rb_intern("zstd")), has_zstd ? Qtrue : Qfalse);
1414
+ #ifdef HAVE_ZSTD
1415
+ rb_hash_aset(h, ID2SYM(rb_intern("zstd_inproc")), Qtrue);
1416
+ #else
1417
+ rb_hash_aset(h, ID2SYM(rb_intern("zstd_inproc")), Qfalse);
1418
+ #endif
1419
+
1420
+ rb_hash_aset(h, ID2SYM(rb_intern("libz")),
1421
+ (vi->features & CURL_VERSION_LIBZ) ? Qtrue : Qfalse);
1422
+ rb_hash_aset(h, ID2SYM(rb_intern("accept_encoding")),
1423
+ rb_str_new_cstr(scrap_accept_encoding()));
1424
+ return h;
1425
+ }
1426
+
1427
+ /* ---- parallel fetch ---------------------------------------------- *
1428
+ * N concurrent libcurl GETs across pthread workers. Each worker uses
1429
+ * get_thread_curl() so its handle's connection cache persists for the
1430
+ * duration of the batch — back-to-back URLs against the same host on
1431
+ * the same worker reuse the TLS + HTTP/2 session.
1432
+ *
1433
+ * The whole batch runs under one rb_thread_call_without_gvl; Ruby's
1434
+ * other threads stay live for the entire pool of fetches. Header
1435
+ * parsing into Ruby Hashes is deferred to after-join (it needs GVL),
1436
+ * but the network and the in-process decompression all happen no-GVL.
1437
+ */
1438
+
1439
+ typedef struct {
1440
+ char *url;
1441
+ long status;
1442
+ long http_version;
1443
+ char *body; size_t body_len;
1444
+ char *headers_blob; size_t headers_len;
1445
+ char *final_url;
1446
+ CURLcode rc;
1447
+ char errstr[CURL_ERROR_SIZE];
1448
+ /* Per-item Accept-Encoding header — points at a shared slist for
1449
+ * the whole batch. Not owned. */
1450
+ struct curl_slist *shared_headers;
1451
+ long timeout_ms;
1452
+ long max_redirects;
1453
+ int follow_redirects;
1454
+ int insecure;
1455
+ int transcode_utf8;
1456
+ long rate_limit_ms;
1457
+ const char *user_agent;
1458
+ /* When non-zero, the worker runs dom_parse on the body and stores
1459
+ * the resulting Document in `parsed_doc`. Saves the main thread a
1460
+ * second serial pass over the batch. */
1461
+ int parse_after_fetch;
1462
+ dom_doc_t *parsed_doc;
1463
+ } pfetch_item_t;
1464
+
1465
+ static void pfetch_item_free(pfetch_item_t *it) {
1466
+ free(it->url);
1467
+ free(it->body);
1468
+ free(it->headers_blob);
1469
+ free(it->final_url);
1470
+ }
1471
+
1472
/* Case-insensitive comparison of the `len` bytes at `s` against the
 * NUL-terminated, already lower-case `lower`. Only ASCII A-Z is folded.
 * Returns 1 when both have the same length and content, else 0. */
static int scrap_ce_token_eq(const char *s, size_t len, const char *lower) {
    for (size_t k = 0; k < len; k++) {
        char c = s[k];
        if (c >= 'A' && c <= 'Z') c = (char)(c + ('a' - 'A'));
        if (lower[k] == '\0' || c != lower[k]) return 0;
    }
    return lower[len] == '\0';
}

/* Decode a response body according to its Content-Encoding header.
 *
 * headers_blob/headers_len: raw header bytes, one header per LF- or
 *   CRLF-terminated line. When several Content-Encoding lines are
 *   present the last one wins. The blob is only read, never modified.
 * body/body_len: in/out. When a decoder that was compiled in succeeds,
 *   the old buffer is freed and replaced by the decoded one.
 *
 * Recognised codings (each only when built with the matching HAVE_*
 * macro): gzip / x-gzip and deflate (zlib-wrapped or raw), br, zstd.
 * Anything else, a missing header, an empty body, or a decoder failure
 * leaves the body untouched.
 *
 * Always returns 1; callers cannot tell a failed decode from a no-op.
 * Uses no Ruby API, so it is callable from no-GVL workers. */
static int scrap_decode_content_encoding(const char *headers_blob, size_t headers_len,
                                         char **body, size_t *body_len) {
    if (!headers_blob) return 1;
    /* Nothing to decode; also keeps NULL buffers away from the decoders. */
    if (!body || !body_len || !*body || *body_len == 0) return 1;

    const char *ce_val = NULL;
    size_t ce_len = 0;
    size_t pos = 0;
    while (pos < headers_len) {
        /* Carve out one line, without its trailing CR/LF. */
        size_t line_start = pos;
        while (pos < headers_len && headers_blob[pos] != '\n') pos++;
        size_t line_end = pos;
        if (line_end > line_start && headers_blob[line_end - 1] == '\r') line_end--;
        if (pos < headers_len) pos++;
        if (line_end == line_start) continue;

        /* Split at the first colon; lines without one (status lines) are skipped. */
        size_t colon = line_start;
        while (colon < line_end && headers_blob[colon] != ':') colon++;
        if (colon == line_end) continue;
        if (!scrap_ce_token_eq(headers_blob + line_start, colon - line_start,
                               "content-encoding")) continue;

        size_t val_start = colon + 1;
        while (val_start < line_end &&
               (headers_blob[val_start] == ' ' || headers_blob[val_start] == '\t')) {
            val_start++;
        }
        ce_val = headers_blob + val_start;
        ce_len = line_end - val_start;
    }
    if (!ce_val) return 1;

    /* Trim trailing whitespace from the value. */
    while (ce_len > 0 && (ce_val[ce_len - 1] == ' ' || ce_val[ce_len - 1] == '\t' ||
                          ce_val[ce_len - 1] == '\r' || ce_val[ce_len - 1] == '\n')) {
        ce_len--;
    }

    char *out = NULL;
    size_t out_len = 0;
    int decoded = 0;
#ifdef HAVE_ZLIB
    if (scrap_ce_token_eq(ce_val, ce_len, "gzip") ||
        scrap_ce_token_eq(ce_val, ce_len, "x-gzip")) {
        /* RFC 9110 asks recipients to treat "x-gzip" as "gzip". */
        decoded = scrap_zlib_decode(*body, *body_len, 47, &out, &out_len);
    } else if (scrap_ce_token_eq(ce_val, ce_len, "deflate")) {
        /* Try raw deflate first, then the zlib-wrapped form. */
        if (!scrap_zlib_decode(*body, *body_len, -15, &out, &out_len)) {
            decoded = scrap_zlib_decode(*body, *body_len, 15, &out, &out_len);
        } else {
            decoded = 1;
        }
    }
#endif
#ifdef HAVE_BROTLI
    if (!decoded && scrap_ce_token_eq(ce_val, ce_len, "br")) {
        decoded = scrap_brotli_decode(*body, *body_len, &out, &out_len);
    }
#endif
#ifdef HAVE_ZSTD
    if (!decoded && scrap_ce_token_eq(ce_val, ce_len, "zstd")) {
        decoded = scrap_zstd_decode(*body, *body_len, &out, &out_len);
    }
#endif
    if (decoded) {
        free(*body);
        *body = out;
        *body_len = out_len;
    }
    return 1;
}
1543
+
1544
+ /* Strip Content-Encoding from a header blob (in place) and run the
1545
+ * matching in-process decoder against the body buffer. Returns 1 on
1546
+ * success or no-op, 0 on decoder failure. Thin wrapper for the
1547
+ * pfetch path which carries everything in a pfetch_item_t. */
1548
+ static int pfetch_decode_body(pfetch_item_t *it) {
1549
+ return scrap_decode_content_encoding(it->headers_blob, it->headers_len,
1550
+ &it->body, &it->body_len);
1551
+ }
1552
+
1553
+ static void pfetch_do_one(pfetch_item_t *it) {
1554
+ /* Per-host throttle. Gates each worker against the global slot
1555
+ * for this item's host — N parallel workers hitting one host
1556
+ * with rate_limit_ms=500 serialise at that gate while different
1557
+ * hosts run concurrently. */
1558
+ if (it->rate_limit_ms > 0) {
1559
+ char host[256];
1560
+ if (scrap_extract_host(it->url, host, sizeof(host))) {
1561
+ scrap_throttle_wait(host, (uint64_t)it->rate_limit_ms * 1000000ull);
1562
+ }
1563
+ }
1564
+
1565
+ CURL *h = get_thread_curl();
1566
+ if (!h) { it->rc = CURLE_FAILED_INIT; return; }
1567
+
1568
+ buf_t bbuf; memset(&bbuf, 0, sizeof(bbuf));
1569
+ buf_t hbuf; memset(&hbuf, 0, sizeof(hbuf));
1570
+
1571
+ curl_easy_setopt(h, CURLOPT_URL, it->url);
1572
+ curl_easy_setopt(h, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
1573
+ /* Tell curl to wait briefly for an existing HTTP/2 connection to
1574
+ * the target to become available rather than opening a fresh
1575
+ * TCP+TLS handshake. Combined with the shared CONNECT pool this
1576
+ * lets N workers multiplex through one connection per host. */
1577
+ #ifdef CURLOPT_PIPEWAIT
1578
+ curl_easy_setopt(h, CURLOPT_PIPEWAIT, 1L);
1579
+ #endif
1580
+ curl_easy_setopt(h, CURLOPT_USERAGENT, it->user_agent);
1581
+ curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, (long)it->follow_redirects);
1582
+ curl_easy_setopt(h, CURLOPT_MAXREDIRS, it->max_redirects);
1583
+ curl_easy_setopt(h, CURLOPT_TIMEOUT_MS, it->timeout_ms);
1584
+ curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L);
1585
+ curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, cb_body);
1586
+ curl_easy_setopt(h, CURLOPT_WRITEDATA, &bbuf);
1587
+ curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, cb_header);
1588
+ curl_easy_setopt(h, CURLOPT_HEADERDATA, &hbuf);
1589
+ curl_easy_setopt(h, CURLOPT_TCP_KEEPALIVE, 1L);
1590
+ curl_easy_setopt(h, CURLOPT_ERRORBUFFER, it->errstr);
1591
+ if (it->insecure) {
1592
+ curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
1593
+ curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
1594
+ }
1595
+ if (it->shared_headers) {
1596
+ curl_easy_setopt(h, CURLOPT_HTTPHEADER, it->shared_headers);
1597
+ }
1598
+
1599
+ it->rc = curl_easy_perform(h);
1600
+
1601
+ if (it->rc == CURLE_OK) {
1602
+ long status = 0, hv = 0;
1603
+ char *eff = NULL;
1604
+ curl_easy_getinfo(h, CURLINFO_RESPONSE_CODE, &status);
1605
+ curl_easy_getinfo(h, CURLINFO_HTTP_VERSION, &hv);
1606
+ curl_easy_getinfo(h, CURLINFO_EFFECTIVE_URL, &eff);
1607
+ it->status = status;
1608
+ it->http_version = hv;
1609
+ if (eff) {
1610
+ size_t l = strlen(eff);
1611
+ it->final_url = (char *)malloc(l + 1);
1612
+ memcpy(it->final_url, eff, l + 1);
1613
+ }
1614
+ }
1615
+ it->body = bbuf.data; it->body_len = bbuf.len;
1616
+ it->headers_blob = hbuf.data; it->headers_len = hbuf.len;
1617
+
1618
+ /* In-process decompression while we still hold no GVL. */
1619
+ if (it->rc == CURLE_OK) {
1620
+ pfetch_decode_body(it);
1621
+ if (it->transcode_utf8 && it->body && it->body_len > 0) {
1622
+ size_t cap = it->body_len; /* tracked separately just for the iconv path */
1623
+ scrap_apply_charset(it->headers_blob ? it->headers_blob : "", it->headers_len,
1624
+ &it->body, &it->body_len, &cap);
1625
+ }
1626
+ /* Optional in-worker parse. The body buffer is handed over to a
1627
+ * dom_doc that takes ownership; we clear our pointers so the
1628
+ * post-join Ruby hash doesn't see (and free) the same memory. */
1629
+ if (it->parse_after_fetch && it->body && it->body_len > 0) {
1630
+ char *owned = it->body;
1631
+ size_t owned_len = it->body_len;
1632
+ it->body = NULL;
1633
+ it->body_len = 0;
1634
+ it->parsed_doc = scrap_dom_make_owned_doc(owned, owned_len);
1635
+ scrap_dom_parse_eager_nocache(it->parsed_doc);
1636
+ }
1637
+ }
1638
+ }
1639
+
1640
+ typedef struct {
1641
+ pfetch_item_t *items;
1642
+ size_t n;
1643
+ int next_idx;
1644
+ } pfetch_ctx_t;
1645
+
1646
+ static void *pfetch_worker(void *arg) {
1647
+ pfetch_ctx_t *ctx = (pfetch_ctx_t *)arg;
1648
+ while (1) {
1649
+ int i = __atomic_fetch_add(&ctx->next_idx, 1, __ATOMIC_RELAXED);
1650
+ if (i >= (int)ctx->n) return NULL;
1651
+ pfetch_do_one(&ctx->items[i]);
1652
+ }
1653
+ }
1654
+
1655
+ typedef struct {
1656
+ pfetch_ctx_t *ctx;
1657
+ int n_threads;
1658
+ } pfetch_run_arg_t;
1659
+
1660
+ static void *pfetch_run(void *arg) {
1661
+ pfetch_run_arg_t *ra = (pfetch_run_arg_t *)arg;
1662
+ int nt = ra->n_threads;
1663
+ pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * (size_t)nt);
1664
+ int spawned = 0;
1665
+ for (int i = 0; i < nt; i++) {
1666
+ if (pthread_create(&threads[i], NULL, pfetch_worker, ra->ctx) == 0) spawned++;
1667
+ }
1668
+ if (spawned < nt) pfetch_worker(ra->ctx);
1669
+ for (int i = 0; i < spawned; i++) pthread_join(threads[i], NULL);
1670
+ free(threads);
1671
+ return NULL;
1672
+ }
1673
+
1674
+ static VALUE scrap_parallel_fetch(int argc, VALUE *argv, VALUE self) {
1675
+ (void)self;
1676
+ scrap_ensure_global_init();
1677
+ VALUE urls_v, opts_v;
1678
+ rb_scan_args(argc, argv, "11", &urls_v, &opts_v);
1679
+ Check_Type(urls_v, T_ARRAY);
1680
+ long n = RARRAY_LEN(urls_v);
1681
+ if (n == 0) return rb_ary_new();
1682
+
1683
+ int n_threads = 4;
1684
+ long timeout_ms = 30000;
1685
+ int follow = 1;
1686
+ long max_redirs = 10;
1687
+ const char *ua = "scrapetor/0.1 (libcurl)";
1688
+ int insecure = 0;
1689
+ int transcode_utf8 = 1;
1690
+ long rate_limit_ms = 0;
1691
+ int parse_after = 0;
1692
+ VALUE headers_v = Qnil;
1693
+ if (!NIL_P(opts_v)) {
1694
+ Check_Type(opts_v, T_HASH);
1695
+ VALUE v;
1696
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("threads")));
1697
+ if (!NIL_P(v)) n_threads = NUM2INT(v);
1698
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("timeout_ms")));
1699
+ if (!NIL_P(v)) timeout_ms = NUM2LONG(v);
1700
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("follow_redirects")));
1701
+ if (!NIL_P(v)) follow = RTEST(v) ? 1 : 0;
1702
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_redirects")));
1703
+ if (!NIL_P(v)) max_redirs = NUM2LONG(v);
1704
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("user_agent")));
1705
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ua = RSTRING_PTR(v); }
1706
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("insecure")));
1707
+ if (!NIL_P(v)) insecure = RTEST(v) ? 1 : 0;
1708
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("headers")));
1709
+ if (!NIL_P(v)) { Check_Type(v, T_HASH); headers_v = v; }
1710
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("transcode_utf8")));
1711
+ if (!NIL_P(v)) transcode_utf8 = RTEST(v) ? 1 : 0;
1712
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("rate_limit_ms")));
1713
+ if (!NIL_P(v)) rate_limit_ms = NUM2LONG(v);
1714
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("parse")));
1715
+ if (!NIL_P(v)) parse_after = RTEST(v) ? 1 : 0;
1716
+ }
1717
+ if (n_threads < 1) n_threads = 1;
1718
+ if (n_threads > (int)n) n_threads = (int)n;
1719
+
1720
+ /* One shared slist for the whole batch: Accept-Encoding + user
1721
+ * headers. All workers point at this; no mutation after build. */
1722
+ struct curl_slist *shared = NULL;
1723
+ {
1724
+ char ae_line[160];
1725
+ snprintf(ae_line, sizeof(ae_line), "Accept-Encoding: %s", scrap_accept_encoding());
1726
+ shared = curl_slist_append(shared, ae_line);
1727
+ }
1728
+ if (!NIL_P(headers_v)) {
1729
+ VALUE keys = rb_funcall(headers_v, rb_intern("keys"), 0);
1730
+ long nk = RARRAY_LEN(keys);
1731
+ for (long i = 0; i < nk; i++) {
1732
+ VALUE k = rb_ary_entry(keys, i);
1733
+ VALUE vv = rb_hash_aref(headers_v, k);
1734
+ VALUE line = rb_str_dup(k);
1735
+ rb_str_cat_cstr(line, ": ");
1736
+ rb_str_append(line, vv);
1737
+ shared = curl_slist_append(shared, RSTRING_PTR(line));
1738
+ }
1739
+ }
1740
+
1741
+ pfetch_item_t *items = (pfetch_item_t *)calloc((size_t)n, sizeof(pfetch_item_t));
1742
+ for (long i = 0; i < n; i++) {
1743
+ VALUE u = rb_ary_entry(urls_v, i);
1744
+ Check_Type(u, T_STRING);
1745
+ size_t ul = (size_t)RSTRING_LEN(u);
1746
+ items[i].url = (char *)malloc(ul + 1);
1747
+ memcpy(items[i].url, RSTRING_PTR(u), ul);
1748
+ items[i].url[ul] = 0;
1749
+ items[i].shared_headers = shared;
1750
+ items[i].timeout_ms = timeout_ms;
1751
+ items[i].follow_redirects = follow;
1752
+ items[i].max_redirects = max_redirs;
1753
+ items[i].user_agent = ua;
1754
+ items[i].insecure = insecure;
1755
+ items[i].transcode_utf8 = transcode_utf8;
1756
+ items[i].rate_limit_ms = rate_limit_ms;
1757
+ items[i].parse_after_fetch = parse_after;
1758
+ }
1759
+
1760
+ pfetch_ctx_t ctx; ctx.items = items; ctx.n = (size_t)n; ctx.next_idx = 0;
1761
+ pfetch_run_arg_t ra; ra.ctx = &ctx; ra.n_threads = n_threads;
1762
+ rb_thread_call_without_gvl(pfetch_run, &ra, NULL, NULL);
1763
+
1764
+ /* Re-acquired GVL — assemble Ruby Hashes from the C results. */
1765
+ VALUE doc_klass = Qnil;
1766
+ if (parse_after) {
1767
+ doc_klass = rb_path2class("Scrapetor::Native::Document");
1768
+ }
1769
+ VALUE result = rb_ary_new_capa(n);
1770
+ for (long i = 0; i < n; i++) {
1771
+ pfetch_item_t *it = &items[i];
1772
+ VALUE h = rb_hash_new();
1773
+ if (it->rc != CURLE_OK) {
1774
+ VALUE err = rb_hash_new();
1775
+ rb_hash_aset(err, ID2SYM(rb_intern("url")), rb_str_new_cstr(it->url));
1776
+ rb_hash_aset(err, ID2SYM(rb_intern("error")),
1777
+ rb_str_new_cstr(it->errstr[0] ? it->errstr : curl_easy_strerror(it->rc)));
1778
+ rb_hash_aset(h, ID2SYM(rb_intern("error")), err);
1779
+ rb_ary_push(result, h);
1780
+ pfetch_item_free(it);
1781
+ continue;
1782
+ }
1783
+ rb_hash_aset(h, ID2SYM(rb_intern("status")), LONG2NUM(it->status));
1784
+ /* When the worker parsed the body, body bytes were transferred to
1785
+ * the dom_doc — the item's own body pointer is NULL. Surface the
1786
+ * Document and emit an empty body string. */
1787
+ if (it->parsed_doc) {
1788
+ rb_hash_aset(h, ID2SYM(rb_intern("document")),
1789
+ scrap_dom_wrap_doc(doc_klass, it->parsed_doc));
1790
+ it->parsed_doc = NULL; /* ownership transferred to the wrap */
1791
+ rb_hash_aset(h, ID2SYM(rb_intern("body")), rb_enc_str_new("", 0, enc_utf8));
1792
+ } else {
1793
+ rb_hash_aset(h, ID2SYM(rb_intern("body")),
1794
+ rb_enc_str_new(it->body ? it->body : "", (long)it->body_len, enc_utf8));
1795
+ }
1796
+ VALUE headers_h = parse_headers_blob(it->headers_blob ? it->headers_blob : "",
1797
+ it->headers_len);
1798
+ /* Drop CE so headers + body stay consistent. */
1799
+ rb_hash_delete(headers_h, rb_str_new_cstr("content-encoding"));
1800
+ rb_hash_aset(h, ID2SYM(rb_intern("headers")), headers_h);
1801
+ rb_hash_aset(h, ID2SYM(rb_intern("final_url")),
1802
+ rb_str_new_cstr(it->final_url ? it->final_url : it->url));
1803
+ const char *hv_str = "1.1";
1804
+ switch (it->http_version) {
1805
+ case CURL_HTTP_VERSION_1_0: hv_str = "1.0"; break;
1806
+ case CURL_HTTP_VERSION_1_1: hv_str = "1.1"; break;
1807
+ case CURL_HTTP_VERSION_2_0: hv_str = "2"; break;
1808
+ #ifdef CURL_HTTP_VERSION_3
1809
+ case CURL_HTTP_VERSION_3: hv_str = "3"; break;
1810
+ #endif
1811
+ }
1812
+ rb_hash_aset(h, ID2SYM(rb_intern("http_version")), rb_str_new_cstr(hv_str));
1813
+ rb_ary_push(result, h);
1814
+ pfetch_item_free(it);
1815
+ }
1816
+ free(items);
1817
+ curl_slist_free_all(shared);
1818
+ return result;
1819
+ }
1820
+
1821
+ /* ---- curl_multi bulk fetch --------------------------------------- *
1822
+ * Single-handle curl_multi driving N concurrent transfers. Complements
1823
+ * the pthread+easy parallel_fetch path:
1824
+ * - parallel_fetch: N pthread workers, each running its own easy
1825
+ * handle blocking. Best when each transfer has meaningful CPU work
1826
+ * (decode + parse) since the GVL is released across the full batch
1827
+ * and CPU work scales with cores.
1828
+ * - multi_fetch: one driver thread, one multi handle, N concurrent
1829
+ * transfers multiplexed via curl_multi_perform. Best for
1830
+ * I/O-dominated high-fan-out fetches (hundreds of URLs across
1831
+ * diverse hosts) where the cost of pthread setup outweighs the
1832
+ * in-flight transfer count.
1833
+ *
1834
+ * Both share the same global CURLSH, so connection pool / DNS / TLS
1835
+ * sessions are shared across them too.
1836
+ */
1837
+
1838
+ typedef struct {
1839
+ char *url;
1840
+ CURL *easy;
1841
+ buf_t body;
1842
+ buf_t headers;
1843
+ long status;
1844
+ long http_version;
1845
+ char *final_url; /* strdup */
1846
+ CURLcode rc;
1847
+ char errstr[CURL_ERROR_SIZE];
1848
+ struct curl_slist *req_headers; /* per-easy slist, freed after harvest */
1849
+ /* In-loop decode/parse output. Populated by the perform thread
1850
+ * as each transfer completes — keeps the per-completion CPU work
1851
+ * (decompress + transcode + tokenise) inside the same no-GVL
1852
+ * window. */
1853
+ int decoded; /* 1 after we've drained the message for this slot */
1854
+ dom_doc_t *parsed_doc; /* optional, set when parse_after */
1855
+ /* HTTP cache: populated pre-perform from disk lookup; checked
1856
+ * post-perform for 304 revalidation. */
1857
+ scrap_cache_entry_t cached;
1858
+ int have_cached;
1859
+ int served_from_cache;
1860
+ } mfetch_slot_t;
1861
+
1862
+ typedef struct {
1863
+ CURLM *multi;
1864
+ mfetch_slot_t *slots;
1865
+ size_t n;
1866
+ CURLMcode multi_rc;
1867
+ int transcode_utf8;
1868
+ int parse_after;
1869
+ const char *cache_dir;
1870
+ } mfetch_ctx_t;
1871
+
1872
+ static void mfetch_finalize_slot_nogvl(mfetch_ctx_t *ctx, mfetch_slot_t *s,
1873
+ CURL *easy, CURLcode result) {
1874
+ s->rc = result;
1875
+ if (result != CURLE_OK) { s->decoded = 1; return; }
1876
+ curl_easy_getinfo(easy, CURLINFO_RESPONSE_CODE, &s->status);
1877
+ curl_easy_getinfo(easy, CURLINFO_HTTP_VERSION, &s->http_version);
1878
+ char *eff = NULL;
1879
+ curl_easy_getinfo(easy, CURLINFO_EFFECTIVE_URL, &eff);
1880
+ if (eff) {
1881
+ size_t l = strlen(eff);
1882
+ s->final_url = (char *)malloc(l + 1);
1883
+ memcpy(s->final_url, eff, l + 1);
1884
+ }
1885
+ /* 304 revalidation: server says cached body still valid. Swap
1886
+ * the body buffer for the cached payload and rewrite status to
1887
+ * 200 so consumers see a fully-formed response. */
1888
+ if (ctx->cache_dir && s->have_cached && s->status == 304) {
1889
+ free(s->body.data);
1890
+ s->body.data = (char *)malloc(s->cached.body_len + 1);
1891
+ memcpy(s->body.data, s->cached.body, s->cached.body_len);
1892
+ s->body.data[s->cached.body_len] = 0;
1893
+ s->body.len = s->cached.body_len;
1894
+ s->body.cap = s->cached.body_len;
1895
+ s->status = 200;
1896
+ s->served_from_cache = 1;
1897
+ }
1898
+ /* Decompress + transcode under no-GVL. */
1899
+ if (s->body.data && s->body.len > 0) {
1900
+ scrap_decode_content_encoding(s->headers.data ? s->headers.data : "",
1901
+ s->headers.len, &s->body.data, &s->body.len);
1902
+ }
1903
+ if (ctx->transcode_utf8 && s->body.data && s->body.len > 0) {
1904
+ size_t cap = s->body.len;
1905
+ scrap_apply_charset(s->headers.data ? s->headers.data : "", s->headers.len,
1906
+ &s->body.data, &s->body.len, &cap);
1907
+ s->body.cap = cap;
1908
+ }
1909
+ /* Optional in-loop parse — same trick as parallel_fetch: hand
1910
+ * ownership of the body buffer to a dom_doc_t and run
1911
+ * dom_parse_eager_nocache. */
1912
+ if (ctx->parse_after && s->body.data && s->body.len > 0) {
1913
+ char *owned = s->body.data;
1914
+ size_t owned_len = s->body.len;
1915
+ s->body.data = NULL;
1916
+ s->body.len = 0;
1917
+ s->parsed_doc = scrap_dom_make_owned_doc(owned, owned_len);
1918
+ scrap_dom_parse_eager_nocache(s->parsed_doc);
1919
+ }
1920
+ s->decoded = 1;
1921
+ }
1922
+
1923
+ static void *mfetch_run_nogvl(void *arg) {
1924
+ mfetch_ctx_t *ctx = (mfetch_ctx_t *)arg;
1925
+ int running = -1;
1926
+ while (1) {
1927
+ ctx->multi_rc = curl_multi_perform(ctx->multi, &running);
1928
+ if (ctx->multi_rc != CURLM_OK) break;
1929
+
1930
+ /* Drain completed messages now so decompression / transcode /
1931
+ * parse runs in parallel with other in-flight transfers
1932
+ * (still on this same driver thread, but interleaved with
1933
+ * curl_multi_perform). */
1934
+ CURLMsg *msg;
1935
+ int q;
1936
+ while ((msg = curl_multi_info_read(ctx->multi, &q))) {
1937
+ if (msg->msg != CURLMSG_DONE) continue;
1938
+ long idx = -1;
1939
+ curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, &idx);
1940
+ if (idx < 0 || idx >= (long)ctx->n) continue;
1941
+ mfetch_slot_t *s = &ctx->slots[idx];
1942
+ if (s->decoded) continue;
1943
+ mfetch_finalize_slot_nogvl(ctx, s, msg->easy_handle, msg->data.result);
1944
+ }
1945
+
1946
+ if (running == 0) break;
1947
+ int numfds = 0;
1948
+ curl_multi_poll(ctx->multi, NULL, 0, 200, &numfds);
1949
+ }
1950
+ return NULL;
1951
+ }
1952
+
1953
+ static VALUE scrap_multi_fetch(int argc, VALUE *argv, VALUE self) {
1954
+ (void)self;
1955
+ scrap_ensure_global_init();
1956
+ VALUE urls_v, opts_v;
1957
+ rb_scan_args(argc, argv, "11", &urls_v, &opts_v);
1958
+ Check_Type(urls_v, T_ARRAY);
1959
+ long n = RARRAY_LEN(urls_v);
1960
+ if (n == 0) return rb_ary_new();
1961
+
1962
+ long timeout_ms = 30000;
1963
+ int follow = 1;
1964
+ long max_redirs = 10;
1965
+ const char *ua = "scrapetor/0.1 (libcurl)";
1966
+ int insecure = 0;
1967
+ long max_concurrent = 0; /* 0 = no cap (let multi run as wide as needed) */
1968
+ int transcode_utf8 = 1;
1969
+ int parse_after = 0;
1970
+ const char *cache_dir = NULL;
1971
+ const char *method_opt = NULL;
1972
+ int nobody_opt = 0;
1973
+ VALUE headers_v = Qnil;
1974
+ if (!NIL_P(opts_v)) {
1975
+ Check_Type(opts_v, T_HASH);
1976
+ VALUE v;
1977
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("timeout_ms")));
1978
+ if (!NIL_P(v)) timeout_ms = NUM2LONG(v);
1979
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("follow_redirects")));
1980
+ if (!NIL_P(v)) follow = RTEST(v) ? 1 : 0;
1981
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_redirects")));
1982
+ if (!NIL_P(v)) max_redirs = NUM2LONG(v);
1983
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("user_agent")));
1984
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ua = RSTRING_PTR(v); }
1985
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("insecure")));
1986
+ if (!NIL_P(v)) insecure = RTEST(v) ? 1 : 0;
1987
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("headers")));
1988
+ if (!NIL_P(v)) { Check_Type(v, T_HASH); headers_v = v; }
1989
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_concurrent")));
1990
+ if (!NIL_P(v)) max_concurrent = NUM2LONG(v);
1991
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("transcode_utf8")));
1992
+ if (!NIL_P(v)) transcode_utf8 = RTEST(v) ? 1 : 0;
1993
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("parse")));
1994
+ if (!NIL_P(v)) parse_after = RTEST(v) ? 1 : 0;
1995
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cache_dir")));
1996
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); cache_dir = RSTRING_PTR(v); }
1997
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("method")));
1998
+ if (!NIL_P(v)) {
1999
+ if (SYMBOL_P(v)) v = rb_sym2str(v);
2000
+ Check_Type(v, T_STRING);
2001
+ method_opt = RSTRING_PTR(v);
2002
+ if (strcasecmp(method_opt, "head") == 0) { nobody_opt = 1; method_opt = NULL; }
2003
+ else if (strcasecmp(method_opt, "get") == 0) method_opt = NULL;
2004
+ }
2005
+ }
2006
+
2007
+ CURLM *multi = curl_multi_init();
2008
+ if (max_concurrent > 0) {
2009
+ curl_multi_setopt(multi, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_concurrent);
2010
+ }
2011
+ #ifdef CURLPIPE_MULTIPLEX
2012
+ /* Let multi pile new requests onto an existing HTTP/2 connection
2013
+ * to the same host. With CURLOPT_PIPEWAIT also set per-handle, the
2014
+ * multi pool tends to settle on one connection per origin. */
2015
+ curl_multi_setopt(multi, CURLMOPT_PIPELINING, (long)CURLPIPE_MULTIPLEX);
2016
+ #endif
2017
+
2018
+ mfetch_slot_t *slots = (mfetch_slot_t *)calloc((size_t)n, sizeof(mfetch_slot_t));
2019
+
2020
+ for (long i = 0; i < n; i++) {
2021
+ VALUE u = rb_ary_entry(urls_v, i);
2022
+ Check_Type(u, T_STRING);
2023
+ size_t ul = (size_t)RSTRING_LEN(u);
2024
+ slots[i].url = (char *)malloc(ul + 1);
2025
+ memcpy(slots[i].url, RSTRING_PTR(u), ul);
2026
+ slots[i].url[ul] = 0;
2027
+
2028
+ CURL *h = curl_easy_init();
2029
+ slots[i].easy = h;
2030
+ curl_easy_setopt(h, CURLOPT_URL, slots[i].url);
2031
+ curl_easy_setopt(h, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
2032
+ #ifdef CURLOPT_PIPEWAIT
2033
+ curl_easy_setopt(h, CURLOPT_PIPEWAIT, 1L);
2034
+ #endif
2035
+ curl_easy_setopt(h, CURLOPT_USERAGENT, ua);
2036
+ curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, (long)follow);
2037
+ curl_easy_setopt(h, CURLOPT_MAXREDIRS, max_redirs);
2038
+ curl_easy_setopt(h, CURLOPT_TIMEOUT_MS, timeout_ms);
2039
+ curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L);
2040
+ curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, cb_body);
2041
+ curl_easy_setopt(h, CURLOPT_WRITEDATA, &slots[i].body);
2042
+ curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, cb_header);
2043
+ curl_easy_setopt(h, CURLOPT_HEADERDATA, &slots[i].headers);
2044
+ curl_easy_setopt(h, CURLOPT_TCP_KEEPALIVE, 1L);
2045
+ curl_easy_setopt(h, CURLOPT_ERRORBUFFER, slots[i].errstr);
2046
+ curl_easy_setopt(h, CURLOPT_PRIVATE, (void *)(intptr_t)i);
2047
+ if (g_share) curl_easy_setopt(h, CURLOPT_SHARE, g_share);
2048
+ if (insecure) {
2049
+ curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
2050
+ curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
2051
+ }
2052
+ if (nobody_opt) {
2053
+ curl_easy_setopt(h, CURLOPT_NOBODY, 1L);
2054
+ curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, "HEAD");
2055
+ } else if (method_opt) {
2056
+ /* Upcase + custom-request for non-GET. */
2057
+ char mbuf[24];
2058
+ size_t mi = 0;
2059
+ for (; mi < sizeof(mbuf) - 1 && method_opt[mi]; mi++) {
2060
+ char c = method_opt[mi];
2061
+ mbuf[mi] = (c >= 'a' && c <= 'z') ? (char)(c - 32) : c;
2062
+ }
2063
+ mbuf[mi] = 0;
2064
+ curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, mbuf);
2065
+ }
2066
+ /* HTTP cache: pre-load entry for this URL so we can attach
2067
+ * If-None-Match / If-Modified-Since and identify 304s in the
2068
+ * worker. HEAD is allowed here because the revalidate flow
2069
+ * uses HEAD specifically to ping the server about freshness;
2070
+ * non-GET methods other than HEAD (POST/PUT/DELETE/...) are
2071
+ * skipped per RFC 7234. */
2072
+ if (cache_dir && !method_opt) {
2073
+ slots[i].have_cached = scrap_cache_load(cache_dir, slots[i].url, &slots[i].cached);
2074
+ }
2075
+ /* Per-handle Accept-Encoding + user headers slist. */
2076
+ {
2077
+ char ae_line[160];
2078
+ snprintf(ae_line, sizeof(ae_line), "Accept-Encoding: %s",
2079
+ scrap_accept_encoding());
2080
+ slots[i].req_headers = curl_slist_append(slots[i].req_headers, ae_line);
2081
+ }
2082
+ if (slots[i].have_cached) {
2083
+ if (slots[i].cached.etag_len > 0) {
2084
+ char line[1024];
2085
+ snprintf(line, sizeof(line), "If-None-Match: %.*s",
2086
+ (int)slots[i].cached.etag_len, slots[i].cached.etag);
2087
+ slots[i].req_headers = curl_slist_append(slots[i].req_headers, line);
2088
+ }
2089
+ if (slots[i].cached.lastmod_len > 0) {
2090
+ char line[1024];
2091
+ snprintf(line, sizeof(line), "If-Modified-Since: %.*s",
2092
+ (int)slots[i].cached.lastmod_len, slots[i].cached.lastmod);
2093
+ slots[i].req_headers = curl_slist_append(slots[i].req_headers, line);
2094
+ }
2095
+ }
2096
+ if (!NIL_P(headers_v)) {
2097
+ VALUE keys = rb_funcall(headers_v, rb_intern("keys"), 0);
2098
+ long nk = RARRAY_LEN(keys);
2099
+ for (long k = 0; k < nk; k++) {
2100
+ VALUE kk = rb_ary_entry(keys, k);
2101
+ VALUE vv = rb_hash_aref(headers_v, kk);
2102
+ VALUE line = rb_str_dup(kk);
2103
+ rb_str_cat_cstr(line, ": ");
2104
+ rb_str_append(line, vv);
2105
+ slots[i].req_headers = curl_slist_append(slots[i].req_headers, RSTRING_PTR(line));
2106
+ }
2107
+ }
2108
+ curl_easy_setopt(h, CURLOPT_HTTPHEADER, slots[i].req_headers);
2109
+ curl_multi_add_handle(multi, h);
2110
+ }
2111
+
2112
+ mfetch_ctx_t ctx;
2113
+ ctx.multi = multi;
2114
+ ctx.slots = slots;
2115
+ ctx.n = (size_t)n;
2116
+ ctx.multi_rc = CURLM_OK;
2117
+ ctx.transcode_utf8 = transcode_utf8;
2118
+ ctx.parse_after = parse_after;
2119
+ ctx.cache_dir = cache_dir;
2120
+ rb_thread_call_without_gvl(mfetch_run_nogvl, &ctx, NULL, NULL);
2121
+
2122
+ /* Sweep any final messages the worker didn't drain (defensive —
2123
+ * the worker loop normally consumes them all, but if the multi
2124
+ * exited via error or the exit condition fired between perform
2125
+ * and info_read, a message could still be queued). */
2126
+ {
2127
+ CURLMsg *msg;
2128
+ int q;
2129
+ while ((msg = curl_multi_info_read(multi, &q))) {
2130
+ if (msg->msg != CURLMSG_DONE) continue;
2131
+ long idx = -1;
2132
+ curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, &idx);
2133
+ if (idx < 0 || idx >= (long)n) continue;
2134
+ if (!slots[idx].decoded) {
2135
+ mfetch_finalize_slot_nogvl(&ctx, &slots[idx], msg->easy_handle, msg->data.result);
2136
+ }
2137
+ }
2138
+ }
2139
+
2140
+ /* All slots have been decoded by the worker (or by the sweep
2141
+ * above). The harvest pass just builds the Ruby surface. */
2142
+ VALUE doc_klass = parse_after ? rb_path2class("Scrapetor::Native::Document") : Qnil;
2143
+ VALUE result = rb_ary_new_capa(n);
2144
+ for (long i = 0; i < n; i++) {
2145
+ mfetch_slot_t *s = &slots[i];
2146
+ VALUE h = rb_hash_new();
2147
+ if (s->rc != CURLE_OK) {
2148
+ VALUE err = rb_hash_new();
2149
+ rb_hash_aset(err, ID2SYM(rb_intern("url")), rb_str_new_cstr(s->url));
2150
+ rb_hash_aset(err, ID2SYM(rb_intern("error")),
2151
+ rb_str_new_cstr(s->errstr[0] ? s->errstr : curl_easy_strerror(s->rc)));
2152
+ rb_hash_aset(h, ID2SYM(rb_intern("error")), err);
2153
+ } else {
2154
+ rb_hash_aset(h, ID2SYM(rb_intern("status")), LONG2NUM(s->status));
2155
+ if (s->parsed_doc) {
2156
+ rb_hash_aset(h, ID2SYM(rb_intern("document")),
2157
+ scrap_dom_wrap_doc(doc_klass, s->parsed_doc));
2158
+ s->parsed_doc = NULL;
2159
+ rb_hash_aset(h, ID2SYM(rb_intern("body")), rb_enc_str_new("", 0, enc_utf8));
2160
+ } else {
2161
+ rb_hash_aset(h, ID2SYM(rb_intern("body")),
2162
+ rb_enc_str_new(s->body.data ? s->body.data : "",
2163
+ (long)s->body.len, enc_utf8));
2164
+ }
2165
+ VALUE hh = parse_headers_blob(s->headers.data ? s->headers.data : "",
2166
+ s->headers.len);
2167
+ rb_hash_delete(hh, rb_str_new_cstr("content-encoding"));
2168
+ if (s->served_from_cache && s->cached.ctype_len > 0) {
2169
+ rb_hash_aset(hh, rb_str_new_cstr("content-type"),
2170
+ rb_str_new(s->cached.ctype, (long)s->cached.ctype_len));
2171
+ rb_hash_aset(hh, rb_str_new_cstr("x-scrapetor-cache"),
2172
+ rb_str_new_cstr("hit"));
2173
+ } else if (cache_dir && s->have_cached) {
2174
+ rb_hash_aset(hh, rb_str_new_cstr("x-scrapetor-cache"),
2175
+ rb_str_new_cstr("miss-revalidated"));
2176
+ }
2177
+ rb_hash_aset(h, ID2SYM(rb_intern("headers")), hh);
2178
+ rb_hash_aset(h, ID2SYM(rb_intern("final_url")),
2179
+ rb_str_new_cstr(s->final_url ? s->final_url : s->url));
2180
+ const char *hv_str = "1.1";
2181
+ switch (s->http_version) {
2182
+ case CURL_HTTP_VERSION_1_0: hv_str = "1.0"; break;
2183
+ case CURL_HTTP_VERSION_1_1: hv_str = "1.1"; break;
2184
+ case CURL_HTTP_VERSION_2_0: hv_str = "2"; break;
2185
+ #ifdef CURL_HTTP_VERSION_3
2186
+ case CURL_HTTP_VERSION_3: hv_str = "3"; break;
2187
+ #endif
2188
+ }
2189
+ rb_hash_aset(h, ID2SYM(rb_intern("http_version")), rb_str_new_cstr(hv_str));
2190
+ /* Store the response in cache for next-time revalidation
2191
+ * (only for cache-eligible 2xx responses with a token). */
2192
+ if (cache_dir && !s->served_from_cache && s->status >= 200 && s->status < 300 &&
2193
+ s->body.data && s->body.len > 0) {
2194
+ VALUE etag_v = rb_hash_lookup(hh, rb_str_new_cstr("etag"));
2195
+ VALUE lastmod_v = rb_hash_lookup(hh, rb_str_new_cstr("last-modified"));
2196
+ VALUE ctype_v = rb_hash_lookup(hh, rb_str_new_cstr("content-type"));
2197
+ if (!NIL_P(etag_v) || !NIL_P(lastmod_v)) {
2198
+ const char *etag_p = NIL_P(etag_v) ? "" : RSTRING_PTR(etag_v);
2199
+ size_t etag_l = NIL_P(etag_v) ? 0 : (size_t)RSTRING_LEN(etag_v);
2200
+ const char *lastmod_p = NIL_P(lastmod_v) ? "" : RSTRING_PTR(lastmod_v);
2201
+ size_t lastmod_l = NIL_P(lastmod_v) ? 0 : (size_t)RSTRING_LEN(lastmod_v);
2202
+ const char *ctype_p = NIL_P(ctype_v) ? "" : RSTRING_PTR(ctype_v);
2203
+ size_t ctype_l = NIL_P(ctype_v) ? 0 : (size_t)RSTRING_LEN(ctype_v);
2204
+ scrap_cache_store(cache_dir, s->url, s->status,
2205
+ etag_p, etag_l, lastmod_p, lastmod_l,
2206
+ ctype_p, ctype_l, s->body.data, s->body.len);
2207
+ }
2208
+ }
2209
+ }
2210
+ rb_ary_push(result, h);
2211
+
2212
+ curl_multi_remove_handle(multi, s->easy);
2213
+ curl_easy_cleanup(s->easy);
2214
+ curl_slist_free_all(s->req_headers);
2215
+ free(s->url);
2216
+ free(s->body.data);
2217
+ free(s->headers.data);
2218
+ free(s->final_url);
2219
+ scrap_cache_entry_free(&s->cached);
2220
+ }
2221
+ curl_multi_cleanup(multi);
2222
+ free(slots);
2223
+ return result;
2224
+ }
2225
+
2226
+ /* ---- streaming multi batch (yield as transfers complete) --------- *
2227
+ * Wraps a CURLM handle + slots in a typed-data object so Ruby can pull
2228
+ * completed responses one at a time via #next. Each #next advances
2229
+ * curl_multi_perform under no-GVL until at least one new transfer
2230
+ * completes, finalises that slot (decompress / transcode / optional
2231
+ * parse), and returns its Ruby hash. nil when the whole batch is done.
2232
+ *
2233
+ * Pattern: Fetcher.multi_each(urls) { |r| ... } yields each response
2234
+ * in completion order — earliest-arriving first — so the user starts
2235
+ * processing while later transfers are still on the wire.
2236
+ */
2237
+ typedef struct {
2238
+ CURLM *multi;
2239
+ mfetch_slot_t *slots;
2240
+ size_t n;
2241
+ /* Completion ring: indices of slots that finished and aren't
2242
+ * yielded yet. ready_tail bumps in the worker, ready_head bumps
2243
+ * on each #next pop. */
2244
+ size_t *ready_queue;
2245
+ size_t ready_head;
2246
+ size_t ready_tail;
2247
+ int running;
2248
+ int done;
2249
+ /* Carried opts (mirrors mfetch_ctx_t shape so we can reuse
2250
+ * mfetch_finalize_slot_nogvl). */
2251
+ int transcode_utf8;
2252
+ int parse_after;
2253
+ char *cache_dir_owned; /* strdup, may be NULL */
2254
+ /* Whole-batch shared slist for Accept-Encoding + user headers.
2255
+ * Owned; freed at cleanup. */
2256
+ struct curl_slist *shared_headers;
2257
+ /* All easy handles also live here so we can free them on GC. */
2258
+ } mbatch_t;
2259
+
2260
+ static void mbatch_free(void *p) {
2261
+ mbatch_t *b = (mbatch_t *)p;
2262
+ if (!b) return;
2263
+ if (b->slots) {
2264
+ for (size_t i = 0; i < b->n; i++) {
2265
+ mfetch_slot_t *s = &b->slots[i];
2266
+ if (s->easy) {
2267
+ if (b->multi) curl_multi_remove_handle(b->multi, s->easy);
2268
+ curl_easy_cleanup(s->easy);
2269
+ }
2270
+ curl_slist_free_all(s->req_headers);
2271
+ free(s->url);
2272
+ free(s->body.data);
2273
+ free(s->headers.data);
2274
+ free(s->final_url);
2275
+ scrap_cache_entry_free(&s->cached);
2276
+ if (s->parsed_doc) {
2277
+ /* parsed_doc may not have been yielded yet — its bytes
2278
+ * are owned by the dom_doc so just let it leak through
2279
+ * the parse-doc free path. */
2280
+ /* No direct free here; the dom_doc's own free handles it
2281
+ * once the wrap is GC'd. Without a wrap, it leaks. */
2282
+ }
2283
+ }
2284
+ free(b->slots);
2285
+ }
2286
+ if (b->multi) curl_multi_cleanup(b->multi);
2287
+ free(b->ready_queue);
2288
+ free(b->cache_dir_owned);
2289
+ free(b);
2290
+ }
2291
+
2292
+ static size_t mbatch_memsize(const void *p) {
2293
+ const mbatch_t *b = (const mbatch_t *)p;
2294
+ return sizeof(*b) + (b ? b->n * sizeof(mfetch_slot_t) : 0);
2295
+ }
2296
+
2297
+ static const rb_data_type_t mbatch_data_type = {
2298
+ "Scrapetor::Native::Http::MultiBatch",
2299
+ {NULL, mbatch_free, mbatch_memsize},
2300
+ NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY,
2301
+ };
2302
+
2303
+ static VALUE mbatch_alloc(VALUE klass) {
2304
+ mbatch_t *b = (mbatch_t *)calloc(1, sizeof(mbatch_t));
2305
+ return TypedData_Wrap_Struct(klass, &mbatch_data_type, b);
2306
+ }
2307
+
2308
+ /* No-GVL stepper: one perform call + drain any completed messages,
2309
+ * finalising each as it lands. May poll for socket activity if no
2310
+ * completion is ready yet. */
2311
+ static void *mbatch_step_nogvl(void *arg) {
2312
+ mbatch_t *b = (mbatch_t *)arg;
2313
+ curl_multi_perform(b->multi, &b->running);
2314
+ CURLMsg *msg;
2315
+ int q;
2316
+ while ((msg = curl_multi_info_read(b->multi, &q))) {
2317
+ if (msg->msg != CURLMSG_DONE) continue;
2318
+ long idx = -1;
2319
+ curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, &idx);
2320
+ if (idx < 0 || idx >= (long)b->n) continue;
2321
+ if (b->slots[idx].decoded) continue;
2322
+ mfetch_ctx_t ctx_proxy;
2323
+ memset(&ctx_proxy, 0, sizeof(ctx_proxy));
2324
+ ctx_proxy.transcode_utf8 = b->transcode_utf8;
2325
+ ctx_proxy.parse_after = b->parse_after;
2326
+ ctx_proxy.cache_dir = b->cache_dir_owned;
2327
+ mfetch_finalize_slot_nogvl(&ctx_proxy, &b->slots[idx],
2328
+ msg->easy_handle, msg->data.result);
2329
+ b->ready_queue[b->ready_tail++] = (size_t)idx;
2330
+ }
2331
+ if (b->ready_head >= b->ready_tail && b->running > 0) {
2332
+ int numfds = 0;
2333
+ curl_multi_poll(b->multi, NULL, 0, 200, &numfds);
2334
+ }
2335
+ if (b->running == 0) b->done = 1;
2336
+ return NULL;
2337
+ }
2338
+
2339
+ /* Build the Ruby Hash for a finalised slot. Same shape as
2340
+ * scrap_multi_fetch's harvest path. */
2341
+ static VALUE mbatch_build_hash(mbatch_t *b, mfetch_slot_t *s) {
2342
+ VALUE h = rb_hash_new();
2343
+ if (s->rc != CURLE_OK) {
2344
+ VALUE err = rb_hash_new();
2345
+ rb_hash_aset(err, ID2SYM(rb_intern("url")), rb_str_new_cstr(s->url));
2346
+ rb_hash_aset(err, ID2SYM(rb_intern("error")),
2347
+ rb_str_new_cstr(s->errstr[0] ? s->errstr : curl_easy_strerror(s->rc)));
2348
+ rb_hash_aset(h, ID2SYM(rb_intern("error")), err);
2349
+ return h;
2350
+ }
2351
+ rb_hash_aset(h, ID2SYM(rb_intern("status")), LONG2NUM(s->status));
2352
+ if (s->parsed_doc) {
2353
+ VALUE doc_klass = rb_path2class("Scrapetor::Native::Document");
2354
+ rb_hash_aset(h, ID2SYM(rb_intern("document")),
2355
+ scrap_dom_wrap_doc(doc_klass, s->parsed_doc));
2356
+ s->parsed_doc = NULL;
2357
+ rb_hash_aset(h, ID2SYM(rb_intern("body")), rb_enc_str_new("", 0, enc_utf8));
2358
+ } else {
2359
+ rb_hash_aset(h, ID2SYM(rb_intern("body")),
2360
+ rb_enc_str_new(s->body.data ? s->body.data : "",
2361
+ (long)s->body.len, enc_utf8));
2362
+ }
2363
+ VALUE hh = parse_headers_blob(s->headers.data ? s->headers.data : "",
2364
+ s->headers.len);
2365
+ rb_hash_delete(hh, rb_str_new_cstr("content-encoding"));
2366
+ if (s->served_from_cache && s->cached.ctype_len > 0) {
2367
+ rb_hash_aset(hh, rb_str_new_cstr("content-type"),
2368
+ rb_str_new(s->cached.ctype, (long)s->cached.ctype_len));
2369
+ rb_hash_aset(hh, rb_str_new_cstr("x-scrapetor-cache"),
2370
+ rb_str_new_cstr("hit"));
2371
+ } else if (b->cache_dir_owned && s->have_cached) {
2372
+ rb_hash_aset(hh, rb_str_new_cstr("x-scrapetor-cache"),
2373
+ rb_str_new_cstr("miss-revalidated"));
2374
+ }
2375
+ rb_hash_aset(h, ID2SYM(rb_intern("headers")), hh);
2376
+ rb_hash_aset(h, ID2SYM(rb_intern("final_url")),
2377
+ rb_str_new_cstr(s->final_url ? s->final_url : s->url));
2378
+ const char *hv_str = "1.1";
2379
+ switch (s->http_version) {
2380
+ case CURL_HTTP_VERSION_1_0: hv_str = "1.0"; break;
2381
+ case CURL_HTTP_VERSION_1_1: hv_str = "1.1"; break;
2382
+ case CURL_HTTP_VERSION_2_0: hv_str = "2"; break;
2383
+ #ifdef CURL_HTTP_VERSION_3
2384
+ case CURL_HTTP_VERSION_3: hv_str = "3"; break;
2385
+ #endif
2386
+ }
2387
+ rb_hash_aset(h, ID2SYM(rb_intern("http_version")), rb_str_new_cstr(hv_str));
2388
+ return h;
2389
+ }
2390
+
2391
+ static VALUE mbatch_initialize(int argc, VALUE *argv, VALUE self) {
2392
+ scrap_ensure_global_init();
2393
+ VALUE urls_v, opts_v;
2394
+ rb_scan_args(argc, argv, "11", &urls_v, &opts_v);
2395
+ Check_Type(urls_v, T_ARRAY);
2396
+ long n = RARRAY_LEN(urls_v);
2397
+
2398
+ mbatch_t *b;
2399
+ TypedData_Get_Struct(self, mbatch_t, &mbatch_data_type, b);
2400
+
2401
+ long timeout_ms = 30000;
2402
+ int follow = 1;
2403
+ long max_redirs = 10;
2404
+ const char *ua = "scrapetor/0.1 (libcurl)";
2405
+ int insecure = 0;
2406
+ long max_concurrent = 0;
2407
+ b->transcode_utf8 = 1;
2408
+ b->parse_after = 0;
2409
+ VALUE headers_v = Qnil;
2410
+ const char *method_opt = NULL;
2411
+ int nobody_opt = 0;
2412
+ if (!NIL_P(opts_v)) {
2413
+ Check_Type(opts_v, T_HASH);
2414
+ VALUE v;
2415
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("timeout_ms")));
2416
+ if (!NIL_P(v)) timeout_ms = NUM2LONG(v);
2417
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("follow_redirects")));
2418
+ if (!NIL_P(v)) follow = RTEST(v) ? 1 : 0;
2419
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_redirects")));
2420
+ if (!NIL_P(v)) max_redirs = NUM2LONG(v);
2421
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("user_agent")));
2422
+ if (!NIL_P(v)) { Check_Type(v, T_STRING); ua = RSTRING_PTR(v); }
2423
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("insecure")));
2424
+ if (!NIL_P(v)) insecure = RTEST(v) ? 1 : 0;
2425
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("headers")));
2426
+ if (!NIL_P(v)) { Check_Type(v, T_HASH); headers_v = v; }
2427
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_concurrent")));
2428
+ if (!NIL_P(v)) max_concurrent = NUM2LONG(v);
2429
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("transcode_utf8")));
2430
+ if (!NIL_P(v)) b->transcode_utf8 = RTEST(v) ? 1 : 0;
2431
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("parse")));
2432
+ if (!NIL_P(v)) b->parse_after = RTEST(v) ? 1 : 0;
2433
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cache_dir")));
2434
+ if (!NIL_P(v)) {
2435
+ Check_Type(v, T_STRING);
2436
+ size_t l = (size_t)RSTRING_LEN(v);
2437
+ b->cache_dir_owned = (char *)malloc(l + 1);
2438
+ memcpy(b->cache_dir_owned, RSTRING_PTR(v), l);
2439
+ b->cache_dir_owned[l] = 0;
2440
+ }
2441
+ v = rb_hash_aref(opts_v, ID2SYM(rb_intern("method")));
2442
+ if (!NIL_P(v)) {
2443
+ if (SYMBOL_P(v)) v = rb_sym2str(v);
2444
+ Check_Type(v, T_STRING);
2445
+ method_opt = RSTRING_PTR(v);
2446
+ if (strcasecmp(method_opt, "head") == 0) { nobody_opt = 1; method_opt = NULL; }
2447
+ else if (strcasecmp(method_opt, "get") == 0) method_opt = NULL;
2448
+ }
2449
+ }
2450
+
2451
+ b->n = (size_t)n;
2452
+ b->multi = curl_multi_init();
2453
+ if (max_concurrent > 0) {
2454
+ curl_multi_setopt(b->multi, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_concurrent);
2455
+ }
2456
+ #ifdef CURLPIPE_MULTIPLEX
2457
+ curl_multi_setopt(b->multi, CURLMOPT_PIPELINING, (long)CURLPIPE_MULTIPLEX);
2458
+ #endif
2459
+ b->slots = (mfetch_slot_t *)calloc(b->n, sizeof(mfetch_slot_t));
2460
+ b->ready_queue = (size_t *)calloc(b->n, sizeof(size_t));
2461
+
2462
+ for (long i = 0; i < n; i++) {
2463
+ VALUE u = rb_ary_entry(urls_v, i);
2464
+ Check_Type(u, T_STRING);
2465
+ size_t ul = (size_t)RSTRING_LEN(u);
2466
+ b->slots[i].url = (char *)malloc(ul + 1);
2467
+ memcpy(b->slots[i].url, RSTRING_PTR(u), ul);
2468
+ b->slots[i].url[ul] = 0;
2469
+
2470
+ CURL *h = curl_easy_init();
2471
+ b->slots[i].easy = h;
2472
+ curl_easy_setopt(h, CURLOPT_URL, b->slots[i].url);
2473
+ curl_easy_setopt(h, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
2474
+ #ifdef CURLOPT_PIPEWAIT
2475
+ curl_easy_setopt(h, CURLOPT_PIPEWAIT, 1L);
2476
+ #endif
2477
+ curl_easy_setopt(h, CURLOPT_USERAGENT, ua);
2478
+ curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, (long)follow);
2479
+ curl_easy_setopt(h, CURLOPT_MAXREDIRS, max_redirs);
2480
+ curl_easy_setopt(h, CURLOPT_TIMEOUT_MS, timeout_ms);
2481
+ curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L);
2482
+ curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, cb_body);
2483
+ curl_easy_setopt(h, CURLOPT_WRITEDATA, &b->slots[i].body);
2484
+ curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, cb_header);
2485
+ curl_easy_setopt(h, CURLOPT_HEADERDATA, &b->slots[i].headers);
2486
+ curl_easy_setopt(h, CURLOPT_TCP_KEEPALIVE, 1L);
2487
+ curl_easy_setopt(h, CURLOPT_ERRORBUFFER, b->slots[i].errstr);
2488
+ curl_easy_setopt(h, CURLOPT_PRIVATE, (void *)(intptr_t)i);
2489
+ if (g_share) curl_easy_setopt(h, CURLOPT_SHARE, g_share);
2490
+ if (insecure) {
2491
+ curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
2492
+ curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
2493
+ }
2494
+ if (nobody_opt) {
2495
+ curl_easy_setopt(h, CURLOPT_NOBODY, 1L);
2496
+ curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, "HEAD");
2497
+ } else if (method_opt) {
2498
+ char mbuf[24];
2499
+ size_t mi = 0;
2500
+ for (; mi < sizeof(mbuf) - 1 && method_opt[mi]; mi++) {
2501
+ char c = method_opt[mi];
2502
+ mbuf[mi] = (c >= 'a' && c <= 'z') ? (char)(c - 32) : c;
2503
+ }
2504
+ mbuf[mi] = 0;
2505
+ curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, mbuf);
2506
+ }
2507
+ if (b->cache_dir_owned && !method_opt) {
2508
+ b->slots[i].have_cached =
2509
+ scrap_cache_load(b->cache_dir_owned, b->slots[i].url, &b->slots[i].cached);
2510
+ }
2511
+ {
2512
+ char ae_line[160];
2513
+ snprintf(ae_line, sizeof(ae_line), "Accept-Encoding: %s",
2514
+ scrap_accept_encoding());
2515
+ b->slots[i].req_headers = curl_slist_append(b->slots[i].req_headers, ae_line);
2516
+ }
2517
+ if (b->slots[i].have_cached) {
2518
+ if (b->slots[i].cached.etag_len > 0) {
2519
+ char line[1024];
2520
+ snprintf(line, sizeof(line), "If-None-Match: %.*s",
2521
+ (int)b->slots[i].cached.etag_len, b->slots[i].cached.etag);
2522
+ b->slots[i].req_headers = curl_slist_append(b->slots[i].req_headers, line);
2523
+ }
2524
+ if (b->slots[i].cached.lastmod_len > 0) {
2525
+ char line[1024];
2526
+ snprintf(line, sizeof(line), "If-Modified-Since: %.*s",
2527
+ (int)b->slots[i].cached.lastmod_len, b->slots[i].cached.lastmod);
2528
+ b->slots[i].req_headers = curl_slist_append(b->slots[i].req_headers, line);
2529
+ }
2530
+ }
2531
+ if (!NIL_P(headers_v)) {
2532
+ VALUE keys = rb_funcall(headers_v, rb_intern("keys"), 0);
2533
+ long nk = RARRAY_LEN(keys);
2534
+ for (long k = 0; k < nk; k++) {
2535
+ VALUE kk = rb_ary_entry(keys, k);
2536
+ VALUE vv = rb_hash_aref(headers_v, kk);
2537
+ VALUE line = rb_str_dup(kk);
2538
+ rb_str_cat_cstr(line, ": ");
2539
+ rb_str_append(line, vv);
2540
+ b->slots[i].req_headers = curl_slist_append(b->slots[i].req_headers, RSTRING_PTR(line));
2541
+ }
2542
+ }
2543
+ curl_easy_setopt(h, CURLOPT_HTTPHEADER, b->slots[i].req_headers);
2544
+ curl_multi_add_handle(b->multi, h);
2545
+ }
2546
+ b->running = (int)b->n;
2547
+ return self;
2548
+ }
2549
+
2550
+ static VALUE mbatch_next(VALUE self) {
2551
+ mbatch_t *b;
2552
+ TypedData_Get_Struct(self, mbatch_t, &mbatch_data_type, b);
2553
+ while (b->ready_head >= b->ready_tail && !b->done) {
2554
+ rb_thread_call_without_gvl(mbatch_step_nogvl, b, NULL, NULL);
2555
+ }
2556
+ if (b->ready_head >= b->ready_tail) return Qnil;
2557
+ size_t idx = b->ready_queue[b->ready_head++];
2558
+ return mbatch_build_hash(b, &b->slots[idx]);
2559
+ }
2560
+
2561
+ void Init_scrapetor_http(VALUE mod_native) {
2562
+ /* Intentionally NOT calling curl_global_init / scrap_share_init here.
2563
+ * See scrap_ensure_global_init above — eager init at require-time
2564
+ * races macOS Cocoa initialisers against the host's fork(), which
2565
+ * crashes Puma / Spring / Foreman workers on macOS. The first fetch
2566
+ * call (in each post-fork worker) does the init lazily. */
2567
+ VALUE mod_http = rb_define_module_under(mod_native, "Http");
2568
+ rb_define_singleton_method(mod_http, "get", scrap_http_get, -1);
2569
+ rb_define_singleton_method(mod_http, "parallel_fetch", scrap_parallel_fetch, -1);
2570
+ rb_define_singleton_method(mod_http, "multi_fetch", scrap_multi_fetch, -1);
2571
+ rb_define_singleton_method(mod_http, "features", scrap_http_features, 0);
2572
+ rb_define_const(mod_http, "AVAILABLE", Qtrue);
2573
+
2574
+ /* Streaming multi-batch iterator. */
2575
+ VALUE mb = rb_define_class_under(mod_http, "MultiBatch", rb_cObject);
2576
+ rb_define_alloc_func(mb, mbatch_alloc);
2577
+ rb_define_method(mb, "initialize", mbatch_initialize, -1);
2578
+ rb_define_method(mb, "next", mbatch_next, 0);
2579
+ }
2580
+
2581
+ #else /* HAVE_LIBCURL */
2582
+
2583
+ /* Stub: HTTP layer was not built (libcurl missing at compile time).
2584
+ * Define the constant so the Ruby side can detect this and provide a
2585
+ * useful error. */
2586
+ void Init_scrapetor_http(VALUE mod_native) {
2587
+ VALUE mod_http = rb_define_module_under(mod_native, "Http");
2588
+ rb_define_const(mod_http, "AVAILABLE", Qfalse);
2589
+ }
2590
+
2591
+ #endif