scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,2591 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* scrapetor_http.c
|
|
3
|
+
*
|
|
4
|
+
* Native HTTP/2-capable fetch layer. Wraps libcurl's easy interface
|
|
5
|
+
* with sensible scraping defaults: HTTP/2 over TLS, automatic
|
|
6
|
+
* Accept-Encoding (gzip + brotli + zstd when the linked libcurl was
|
|
7
|
+
* built with them), connection reuse via a per-thread persistent
|
|
8
|
+
* handle, redirect following with a cap, and total-time timeout.
|
|
9
|
+
*
|
|
10
|
+
* Build-time conditional. If pkg-config can't find libcurl (or the
|
|
11
|
+
* caller passes --without-libcurl to extconf), this whole file
|
|
12
|
+
* collapses to a stub that raises a clear error at fetch time —
|
|
13
|
+
* Scrapetor still loads, only the HTTP surface is unavailable.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
#include <ruby.h>
|
|
17
|
+
#include <ruby/encoding.h>
|
|
18
|
+
#include <ruby/thread.h>
|
|
19
|
+
|
|
20
|
+
#ifdef HAVE_LIBCURL
|
|
21
|
+
|
|
22
|
+
#include <curl/curl.h>
|
|
23
|
+
#include <pthread.h>
|
|
24
|
+
#include <string.h>
|
|
25
|
+
#include <stdlib.h>
|
|
26
|
+
#include <time.h>
|
|
27
|
+
#include <errno.h>
|
|
28
|
+
#include <iconv.h>
|
|
29
|
+
#include <sys/stat.h>
|
|
30
|
+
#include <unistd.h>
|
|
31
|
+
|
|
32
|
+
#ifdef HAVE_ZLIB
|
|
33
|
+
#include <zlib.h>
|
|
34
|
+
#endif
|
|
35
|
+
|
|
36
|
+
#ifdef HAVE_BROTLI
|
|
37
|
+
#include <brotli/decode.h>
|
|
38
|
+
#endif
|
|
39
|
+
|
|
40
|
+
#ifdef HAVE_ZSTD
|
|
41
|
+
#include <zstd.h>
|
|
42
|
+
#endif
|
|
43
|
+
|
|
44
|
+
extern rb_encoding *enc_utf8;
|
|
45
|
+
|
|
46
|
+
/* Hooks into scrapetor_dom.c so we can run dom_parse on each
|
|
47
|
+
* response body inside the same no-GVL worker that fetched it. */
|
|
48
|
+
typedef struct dom_doc dom_doc_t;
|
|
49
|
+
extern dom_doc_t *scrap_dom_make_owned_doc(char *bytes, size_t len);
|
|
50
|
+
extern void scrap_dom_parse_eager_nocache(dom_doc_t *d);
|
|
51
|
+
extern VALUE scrap_dom_wrap_doc(VALUE klass, dom_doc_t *d);
|
|
52
|
+
|
|
53
|
+
/* ---- Accept-Encoding negotiation --------------------------------- *
|
|
54
|
+
* Returns the comma-separated list of content codings this build can
|
|
55
|
+
* decode. We own decompression end-to-end — CURLOPT_ACCEPT_ENCODING is
|
|
56
|
+
* intentionally left unset so libcurl doesn't reject responses whose
|
|
57
|
+
* encoding it wasn't compiled for. */
|
|
58
|
+
/* Compile-time pieces of the Accept-Encoding value. Each codec that is
 * compiled in contributes its token; a ", " separator is prepended only
 * when an earlier codec is present as well. */
#ifdef HAVE_ZLIB
#  define SCRAP_AE_ZLIB      "gzip, deflate"
#  define SCRAP_AE_SEP_BR    ", "
#else
#  define SCRAP_AE_ZLIB      ""
#  define SCRAP_AE_SEP_BR    ""
#endif
#ifdef HAVE_BROTLI
#  define SCRAP_AE_BROTLI    SCRAP_AE_SEP_BR "br"
#  define SCRAP_AE_SEP_ZSTD  ", "
#else
#  define SCRAP_AE_BROTLI    ""
#  define SCRAP_AE_SEP_ZSTD  SCRAP_AE_SEP_BR
#endif
#ifdef HAVE_ZSTD
#  define SCRAP_AE_ZSTD      SCRAP_AE_SEP_ZSTD "zstd"
#else
#  define SCRAP_AE_ZSTD      ""
#endif

/* Return the Accept-Encoding header value for this build.
 *
 * The result is a string literal assembled by the preprocessor, so the
 * function has no mutable state and is safe to call concurrently from
 * any number of threads. (Building the string lazily into a shared
 * static buffer would race when several threads make the first call.)
 *
 * Returns "identity" when no decoder is compiled in, so servers know
 * not to compress. The returned pointer has static storage duration and
 * must not be modified or freed. */
static const char *scrap_accept_encoding(void) {
#if defined(HAVE_ZLIB) || defined(HAVE_BROTLI) || defined(HAVE_ZSTD)
    return SCRAP_AE_ZLIB SCRAP_AE_BROTLI SCRAP_AE_ZSTD;
#else
    return "identity";
#endif
}
|
|
81
|
+
|
|
82
|
+
/* ---- in-process zlib / brotli / zstd decoders -------------------- */
|
|
83
|
+
|
|
84
|
+
#ifdef HAVE_ZLIB
|
|
85
|
+
/* `gzip` and `deflate`. window_bits selects the expected framing:
 * 31 = gzip wrapper, 15 = zlib wrapper, -15 = raw deflate, and 47
 * (32 + 15) auto-detects gzip vs. zlib from the stream header. The
 * auto-detecting value is useful because servers disagree on what
 * `Content-Encoding: deflate` means; note that it does not cover raw
 * (headerless) deflate, which still needs -15. */
|
|
89
|
+
static int scrap_zlib_decode(const char *in, size_t in_len,
|
|
90
|
+
int window_bits,
|
|
91
|
+
char **out, size_t *out_len) {
|
|
92
|
+
z_stream s; memset(&s, 0, sizeof(s));
|
|
93
|
+
if (inflateInit2(&s, window_bits) != Z_OK) return 0;
|
|
94
|
+
size_t cap = in_len * 4 + 4096;
|
|
95
|
+
char *buf = (char *)malloc(cap);
|
|
96
|
+
size_t total = 0;
|
|
97
|
+
s.next_in = (Bytef *)in;
|
|
98
|
+
s.avail_in = (uInt)in_len;
|
|
99
|
+
while (1) {
|
|
100
|
+
if (cap - total < 4096) {
|
|
101
|
+
cap *= 2;
|
|
102
|
+
buf = (char *)realloc(buf, cap);
|
|
103
|
+
}
|
|
104
|
+
s.next_out = (Bytef *)(buf + total);
|
|
105
|
+
s.avail_out = (uInt)(cap - total);
|
|
106
|
+
int r = inflate(&s, Z_NO_FLUSH);
|
|
107
|
+
total = cap - s.avail_out;
|
|
108
|
+
if (r == Z_STREAM_END) break;
|
|
109
|
+
if (r != Z_OK) { inflateEnd(&s); free(buf); return 0; }
|
|
110
|
+
if (s.avail_in == 0 && s.avail_out > 0) break;
|
|
111
|
+
}
|
|
112
|
+
inflateEnd(&s);
|
|
113
|
+
*out = buf; *out_len = total;
|
|
114
|
+
return 1;
|
|
115
|
+
}
|
|
116
|
+
#endif
|
|
117
|
+
|
|
118
|
+
/* ---- in-process brotli / zstd decoders --------------------------- */
|
|
119
|
+
|
|
120
|
+
#ifdef HAVE_BROTLI
|
|
121
|
+
static int scrap_brotli_decode(const char *in, size_t in_len,
|
|
122
|
+
char **out, size_t *out_len) {
|
|
123
|
+
BrotliDecoderState *st = BrotliDecoderCreateInstance(NULL, NULL, NULL);
|
|
124
|
+
if (!st) return 0;
|
|
125
|
+
size_t cap = in_len * 4 + 1024;
|
|
126
|
+
char *buf = (char *)malloc(cap);
|
|
127
|
+
size_t total = 0;
|
|
128
|
+
const uint8_t *next_in = (const uint8_t *)in;
|
|
129
|
+
size_t avail_in = in_len;
|
|
130
|
+
BrotliDecoderResult r;
|
|
131
|
+
do {
|
|
132
|
+
uint8_t *next_out = (uint8_t *)(buf + total);
|
|
133
|
+
size_t avail_out = cap - total;
|
|
134
|
+
r = BrotliDecoderDecompressStream(st, &avail_in, &next_in,
|
|
135
|
+
&avail_out, &next_out, NULL);
|
|
136
|
+
total = (size_t)((char *)next_out - buf);
|
|
137
|
+
if (r == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
|
|
138
|
+
cap *= 2;
|
|
139
|
+
buf = (char *)realloc(buf, cap);
|
|
140
|
+
}
|
|
141
|
+
} while (r == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT ||
|
|
142
|
+
r == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT);
|
|
143
|
+
BrotliDecoderDestroyInstance(st);
|
|
144
|
+
if (r != BROTLI_DECODER_RESULT_SUCCESS) { free(buf); return 0; }
|
|
145
|
+
*out = buf; *out_len = total;
|
|
146
|
+
return 1;
|
|
147
|
+
}
|
|
148
|
+
#endif
|
|
149
|
+
|
|
150
|
+
#ifdef HAVE_ZSTD
|
|
151
|
+
static int scrap_zstd_decode(const char *in, size_t in_len,
|
|
152
|
+
char **out, size_t *out_len) {
|
|
153
|
+
/* Use streaming zstd so we don't have to trust the frame's
|
|
154
|
+
* declared size. */
|
|
155
|
+
ZSTD_DStream *zds = ZSTD_createDStream();
|
|
156
|
+
if (!zds) return 0;
|
|
157
|
+
ZSTD_initDStream(zds);
|
|
158
|
+
size_t cap = ZSTD_DStreamOutSize();
|
|
159
|
+
if (cap < in_len * 4) cap = in_len * 4 + 4096;
|
|
160
|
+
char *buf = (char *)malloc(cap);
|
|
161
|
+
size_t total = 0;
|
|
162
|
+
ZSTD_inBuffer zin = { in, in_len, 0 };
|
|
163
|
+
while (zin.pos < zin.size) {
|
|
164
|
+
if (cap - total < ZSTD_DStreamOutSize()) {
|
|
165
|
+
cap *= 2;
|
|
166
|
+
buf = (char *)realloc(buf, cap);
|
|
167
|
+
}
|
|
168
|
+
ZSTD_outBuffer zout = { buf + total, cap - total, 0 };
|
|
169
|
+
size_t r = ZSTD_decompressStream(zds, &zout, &zin);
|
|
170
|
+
if (ZSTD_isError(r)) { ZSTD_freeDStream(zds); free(buf); return 0; }
|
|
171
|
+
total += zout.pos;
|
|
172
|
+
if (r == 0) break; /* frame complete */
|
|
173
|
+
}
|
|
174
|
+
ZSTD_freeDStream(zds);
|
|
175
|
+
*out = buf; *out_len = total;
|
|
176
|
+
return 1;
|
|
177
|
+
}
|
|
178
|
+
#endif
|
|
179
|
+
|
|
180
|
+
/* ---- per-host throttle table ------------------------------------- *
|
|
181
|
+
* Global, thread-safe map from host name to last-request timestamp.
|
|
182
|
+
* Drives polite-scraping rate limits across both single-fetch and
|
|
183
|
+
* parallel-fetch paths — a parallel batch of 32 URLs against the same
|
|
184
|
+
* host with rate_limit_ms=500 will serialise at that host through this
|
|
185
|
+
* table even though the worker threads themselves are independent.
|
|
186
|
+
*
|
|
187
|
+
* 256 slots is plenty: scrapers target dozens of hosts max in practice;
|
|
188
|
+
* past that the slot with the fewest recorded hits is evicted.
|
|
189
|
+
*/
|
|
190
|
+
/* One entry of the per-host throttle table. */
typedef struct {
    /* Host name used as the lookup key; heap-allocated copy owned by
     * the table (expected to be lowercase). */
    char *host;
    /* Monotonic-clock time in ns: the earliest moment the next request
     * to this host may start. */
    uint64_t last_ns;
    /* Number of throttle waits recorded for this slot; the slot with
     * the smallest count is the one evicted when the table is full. */
    uint32_t hits;
} throttle_slot_t;

/* Maximum number of hosts tracked at once. */
#define THROTTLE_CAP 256

/* The table itself, the number of slots in use, and the mutex that
 * guards both. */
static throttle_slot_t g_throttle[THROTTLE_CAP];
static int g_throttle_n = 0;
static pthread_mutex_t g_throttle_mu = PTHREAD_MUTEX_INITIALIZER;
|
|
200
|
+
|
|
201
|
+
static uint64_t mono_ns(void) {
|
|
202
|
+
struct timespec ts;
|
|
203
|
+
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
204
|
+
return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
/* Pull the host out of a URL: bytes between "://" and the next
|
|
208
|
+
* '/', '?', '#', or ':'. Returns 1 on success. */
|
|
209
|
+
/* Copy the lowercased host of `url` into `out` (capacity `cap`,
 * including the terminating NUL).
 *
 * The host is taken from the authority component that follows "://":
 *   - any "user:password@" prefix is skipped, so credentials are never
 *     mistaken for the host;
 *   - a bracketed IPv6 literal is returned with its brackets
 *     ("[::1]"), since the ':' inside it is not a port separator;
 *   - otherwise the host ends at the first ':' (port) or at the end of
 *     the authority ('/', '?', '#', or end of string).
 *
 * Returns 1 on success. Returns 0 when there is no "://", the host is
 * empty, an IPv6 literal is unterminated, the result does not fit in
 * `out`, or an argument is NULL. */
static int scrap_extract_host(const char *url, char *out, size_t cap) {
    if (!url || !out || cap == 0) return 0;

    const char *auth = strstr(url, "://");
    if (!auth) return 0;
    auth += 3;

    /* The authority runs up to the first path / query / fragment mark. */
    const char *auth_end = auth;
    while (*auth_end && *auth_end != '/' && *auth_end != '?' && *auth_end != '#') {
        auth_end++;
    }

    /* Skip userinfo: the host starts after the last '@' in the authority. */
    const char *host = auth;
    for (const char *q = auth; q < auth_end; q++) {
        if (*q == '@') host = q + 1;
    }

    const char *host_end = host;
    if (host < auth_end && *host == '[') {
        while (host_end < auth_end && *host_end != ']') host_end++;
        if (host_end == auth_end) return 0;  /* no closing bracket */
        host_end++;                          /* keep the ']' */
    } else {
        while (host_end < auth_end && *host_end != ':') host_end++;
    }

    size_t l = (size_t)(host_end - host);
    if (l == 0 || l + 1 > cap) return 0;
    for (size_t i = 0; i < l; i++) {
        char c = host[i];
        out[i] = (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
    }
    out[l] = 0;
    return 1;
}
|
|
224
|
+
|
|
225
|
+
/* Wait long enough since the last request to `host` to honour the
|
|
226
|
+
* min interval, then mark the new "last" time. The sleep happens
|
|
227
|
+
* outside the mutex so concurrent workers for different hosts don't
|
|
228
|
+
* block each other.
|
|
229
|
+
*
|
|
230
|
+
* Safe to call from a no-GVL worker — uses nanosleep, no Ruby state. */
|
|
231
|
+
static void scrap_throttle_wait(const char *host, uint64_t min_interval_ns) {
|
|
232
|
+
if (!host || !*host || min_interval_ns == 0) return;
|
|
233
|
+
pthread_mutex_lock(&g_throttle_mu);
|
|
234
|
+
int idx = -1;
|
|
235
|
+
for (int i = 0; i < g_throttle_n; i++) {
|
|
236
|
+
if (strcmp(g_throttle[i].host, host) == 0) { idx = i; break; }
|
|
237
|
+
}
|
|
238
|
+
if (idx < 0) {
|
|
239
|
+
if (g_throttle_n < THROTTLE_CAP) {
|
|
240
|
+
idx = g_throttle_n++;
|
|
241
|
+
} else {
|
|
242
|
+
/* Evict the least-recently-used slot. */
|
|
243
|
+
int lru = 0;
|
|
244
|
+
for (int i = 1; i < THROTTLE_CAP; i++) {
|
|
245
|
+
if (g_throttle[i].hits < g_throttle[lru].hits) lru = i;
|
|
246
|
+
}
|
|
247
|
+
idx = lru;
|
|
248
|
+
free(g_throttle[idx].host);
|
|
249
|
+
g_throttle[idx].host = NULL;
|
|
250
|
+
}
|
|
251
|
+
g_throttle[idx].host = strdup(host);
|
|
252
|
+
g_throttle[idx].last_ns = 0;
|
|
253
|
+
g_throttle[idx].hits = 0;
|
|
254
|
+
}
|
|
255
|
+
g_throttle[idx].hits++;
|
|
256
|
+
uint64_t now = mono_ns();
|
|
257
|
+
/* last_ns is the "earliest allowed start" for the next request to
|
|
258
|
+
* this host. Each worker reserves its slot by advancing
|
|
259
|
+
* last_ns = max(now, last_ns) + min_interval_ns. Concurrent workers
|
|
260
|
+
* to the same host see ever-increasing reservations and serialise
|
|
261
|
+
* cleanly; concurrent workers to *different* hosts hit different
|
|
262
|
+
* slots and don't block each other. */
|
|
263
|
+
uint64_t earliest = g_throttle[idx].last_ns;
|
|
264
|
+
uint64_t start = (earliest <= now) ? now : earliest;
|
|
265
|
+
uint64_t wait_ns = start - now;
|
|
266
|
+
g_throttle[idx].last_ns = start + min_interval_ns;
|
|
267
|
+
pthread_mutex_unlock(&g_throttle_mu);
|
|
268
|
+
|
|
269
|
+
if (wait_ns > 0) {
|
|
270
|
+
struct timespec ts;
|
|
271
|
+
ts.tv_sec = (time_t)(wait_ns / 1000000000ull);
|
|
272
|
+
ts.tv_nsec = (long)(wait_ns % 1000000000ull);
|
|
273
|
+
nanosleep(&ts, NULL);
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
/* ---- HTTP response cache (ETag / Last-Modified) ------------------ *
|
|
278
|
+
* Disk-backed cache of completed GET responses keyed by URL. Each entry
|
|
279
|
+
* stores status + ETag + Last-Modified + Content-Type + body in a
|
|
280
|
+
* tagged binary format:
|
|
281
|
+
*
|
|
282
|
+
* 8 bytes magic "SCRHV001"
|
|
283
|
+
* 4 bytes uint32_le status
|
|
284
|
+
* 4 + N etag (length-prefixed)
|
|
285
|
+
* 4 + N lastmod (length-prefixed)
|
|
286
|
+
* 4 + N ctype (length-prefixed)
|
|
287
|
+
* 8 + N body (uint64_le length-prefixed)
|
|
288
|
+
*
|
|
289
|
+
* When :cache_dir is set on a request and a cache entry exists for the
|
|
290
|
+
* URL, the fetch adds If-None-Match / If-Modified-Since automatically.
|
|
291
|
+
* A 304 response is served from the cached body with the cached
|
|
292
|
+
* Content-Type — the network round-trip stayed cheap (no body) but
|
|
293
|
+
* the caller sees a fully-formed 200-shaped response.
|
|
294
|
+
*
|
|
295
|
+
* Cache key is a 128-bit FNV1a-double (two FNV64s with different
|
|
296
|
+
* seeds) — collision probability for any realistic crawl is
|
|
297
|
+
* effectively zero, and avoiding a SHA-2 dep keeps the binary lean.
|
|
298
|
+
*/
|
|
299
|
+
|
|
300
|
+
/* Derive the cache key for `url`: two independent 64-bit multiply/xor
 * hashes over the URL bytes, printed as 32 lowercase hex digits plus a
 * terminating NUL into `out`. */
static void scrap_cache_key(const char *url, char out[33]) {
    const uint64_t first_mul = 0x100000001b3ull;
    const uint64_t second_mul = 0x9e3779b97f4a7c15ull;
    uint64_t first = 0xcbf29ce484222325ull;
    uint64_t second = 0x84222325cbf29ce4ull;

    const unsigned char *p = (const unsigned char *)url;
    while (*p != 0) {
        first = (first ^ *p) * first_mul;
        second = (second ^ *p) * second_mul;
        p++;
    }
    snprintf(out, 33, "%016llx%016llx",
             (unsigned long long)first, (unsigned long long)second);
}
|
|
311
|
+
|
|
312
|
+
/* In-memory form of one cache record. Each pointer field is paired with
 * its byte length; a NULL pointer goes with a zero-length field. The
 * buffers are heap-allocated and released by scrap_cache_entry_free. */
typedef struct {
    long status;

    char *etag;
    size_t etag_len;

    char *lastmod;
    size_t lastmod_len;

    char *ctype;
    size_t ctype_len;

    char *body;
    size_t body_len;
} scrap_cache_entry_t;
|
|
319
|
+
|
|
320
|
+
/* Release every buffer owned by `e` and reset the whole entry to zero,
 * so it can be freed again or reused safely. */
static void scrap_cache_entry_free(scrap_cache_entry_t *e) {
    char *owned[] = { e->etag, e->lastmod, e->ctype, e->body };
    for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); i++) {
        free(owned[i]);
    }
    memset(e, 0, sizeof(*e));
}
|
|
324
|
+
|
|
325
|
+
/* Build the on-disk path of the cache file for `url` under `dir`:
 *
 *     <dir>/<first two key chars>/<32-char key>.cache
 *
 * As a side effect this creates `dir` and the two-character shard
 * directory (best-effort; mkdir failures, including "already exists",
 * are ignored and surface later when the file is opened).
 *
 * Returns 1 when the full path fits in `out` (capacity `cap`). Returns
 * 0 when formatting fails or the path would be truncated, so callers
 * never operate on a cut-off file name. */
static int scrap_cache_path(const char *dir, const char *url,
                            char *out, size_t cap) {
    char key[33];
    scrap_cache_key(url, key);

    int n = snprintf(out, cap, "%s/%c%c", dir, key[0], key[1]);
    if (n <= 0 || (size_t)n >= cap) return 0;
    mkdir(dir, 0755);  /* best-effort; the leaf mkdir below is what matters */
    mkdir(out, 0755);

    n = snprintf(out, cap, "%s/%c%c/%s.cache", dir, key[0], key[1], key);
    return n > 0 && (size_t)n < cap;
}
|
|
335
|
+
|
|
336
|
+
/* Read a little-endian 32-bit unsigned integer from `f`.
 * Returns 1 and stores the value in *out, or returns 0 (leaving *out
 * untouched) when fewer than four bytes are available. */
static int read_u32_le(FILE *f, uint32_t *out) {
    uint8_t raw[4];
    size_t got = fread(raw, 1, sizeof(raw), f);
    if (got != sizeof(raw)) return 0;

    uint32_t value = 0;
    for (int i = 3; i >= 0; i--) {
        value = (value << 8) | (uint32_t)raw[i];
    }
    *out = value;
    return 1;
}
|
|
343
|
+
/* Read a little-endian 64-bit unsigned integer from `f`.
 * Returns 1 and stores the value in *out, or returns 0 (leaving *out
 * untouched) when fewer than eight bytes are available. */
static int read_u64_le(FILE *f, uint64_t *out) {
    uint8_t raw[8];
    size_t got = fread(raw, 1, sizeof(raw), f);
    if (got != sizeof(raw)) return 0;

    uint64_t value = 0;
    for (int i = 7; i >= 0; i--) {
        value = (value << 8) | (uint64_t)raw[i];
    }
    *out = value;
    return 1;
}
|
|
350
|
+
/* Read a length-prefixed string: a little-endian uint32 byte count
 * followed by that many bytes.
 *
 * On success returns 1; *out receives a malloc'd, NUL-terminated copy
 * (or NULL for an empty string) and *out_len its length. On failure —
 * short read, allocation failure, or a length whose +1 terminator byte
 * cannot be represented — returns 0 with *out == NULL and
 * *out_len == 0, so the caller never sees a length without a buffer. */
static int read_lenstr(FILE *f, char **out, size_t *out_len) {
    *out = NULL;
    *out_len = 0;

    uint32_t l;
    if (!read_u32_le(f, &l)) return 0;
    if (l == 0) return 1;
    /* l + 1 must not wrap: the count comes from a file on disk and is
     * not trusted. */
    if (l == UINT32_MAX) return 0;

    char *s = (char *)malloc((size_t)l + 1);
    if (!s) return 0;
    if (fread(s, 1, l, f) != l) { free(s); return 0; }
    s[l] = 0;

    *out = s;
    *out_len = l;
    return 1;
}
|
|
360
|
+
|
|
361
|
+
static int scrap_cache_load(const char *dir, const char *url,
|
|
362
|
+
scrap_cache_entry_t *e) {
|
|
363
|
+
memset(e, 0, sizeof(*e));
|
|
364
|
+
char path[1024];
|
|
365
|
+
if (!scrap_cache_path(dir, url, path, sizeof(path))) return 0;
|
|
366
|
+
FILE *f = fopen(path, "rb");
|
|
367
|
+
if (!f) return 0;
|
|
368
|
+
char magic[8];
|
|
369
|
+
if (fread(magic, 1, 8, f) != 8 || memcmp(magic, "SCRHV001", 8) != 0) {
|
|
370
|
+
fclose(f); return 0;
|
|
371
|
+
}
|
|
372
|
+
uint32_t status;
|
|
373
|
+
if (!read_u32_le(f, &status)) { fclose(f); return 0; }
|
|
374
|
+
e->status = (long)status;
|
|
375
|
+
if (!read_lenstr(f, &e->etag, &e->etag_len)) { fclose(f); return 0; }
|
|
376
|
+
if (!read_lenstr(f, &e->lastmod, &e->lastmod_len)) { fclose(f); return 0; }
|
|
377
|
+
if (!read_lenstr(f, &e->ctype, &e->ctype_len)) { fclose(f); return 0; }
|
|
378
|
+
uint64_t body_len;
|
|
379
|
+
if (!read_u64_le(f, &body_len)) { fclose(f); return 0; }
|
|
380
|
+
e->body_len = body_len;
|
|
381
|
+
if (body_len > 0) {
|
|
382
|
+
e->body = (char *)malloc(body_len + 1);
|
|
383
|
+
if (fread(e->body, 1, body_len, f) != body_len) {
|
|
384
|
+
fclose(f); scrap_cache_entry_free(e); return 0;
|
|
385
|
+
}
|
|
386
|
+
e->body[body_len] = 0;
|
|
387
|
+
}
|
|
388
|
+
fclose(f);
|
|
389
|
+
return 1;
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
/* Write `v` to `f` as four bytes, least-significant byte first. */
static void write_u32_le(FILE *f, uint32_t v) {
    uint8_t raw[4];
    for (int i = 0; i < 4; i++) {
        raw[i] = (uint8_t)((v >> (8 * i)) & 0xFF);
    }
    fwrite(raw, 1, sizeof(raw), f);
}
|
|
396
|
+
/* Write `v` to `f` as eight bytes, least-significant byte first. */
static void write_u64_le(FILE *f, uint64_t v) {
    uint8_t raw[8];
    uint64_t rest = v;
    for (size_t i = 0; i < sizeof(raw); i++) {
        raw[i] = (uint8_t)(rest & 0xFF);
        rest >>= 8;
    }
    fwrite(raw, 1, sizeof(raw), f);
}
|
|
401
|
+
/* Write a length-prefixed string: `l` as a little-endian uint32,
 * followed by the `l` bytes at `p` (nothing more when `l` is zero). */
static void write_lenstr(FILE *f, const char *p, size_t l) {
    write_u32_le(f, (uint32_t)l);
    if (l == 0) return;
    fwrite(p, 1, l, f);
}
|
|
405
|
+
|
|
406
|
+
/* Persist one response as the cache record for `url` under `dir`.
 *
 * The record is written to "<path>.tmp.<pid>" and renamed over the
 * final path, so readers see either the old record or the complete new
 * one. Returns 1 only when every write, the close, and the rename
 * succeeded; on any failure the temporary file is removed and 0 is
 * returned, so a short write (e.g. disk full) can never be published
 * as a valid-looking but truncated record.
 *
 * Header fields longer than UINT32_MAX bytes cannot be represented in
 * the record format and are rejected.
 *
 * NOTE(review): the temp name is unique per process only; two threads
 * of one process storing the same URL at once would share it. */
static int scrap_cache_store(const char *dir, const char *url,
                             long status, const char *etag, size_t etag_len,
                             const char *lastmod, size_t lastmod_len,
                             const char *ctype, size_t ctype_len,
                             const char *body, size_t body_len) {
    if (etag_len > UINT32_MAX || lastmod_len > UINT32_MAX || ctype_len > UINT32_MAX) {
        return 0;
    }

    char path[1024];
    if (!scrap_cache_path(dir, url, path, sizeof(path))) return 0;

    char tmp[1100];
    int n = snprintf(tmp, sizeof(tmp), "%s.tmp.%d", path, (int)getpid());
    if (n <= 0 || (size_t)n >= sizeof(tmp)) return 0;

    FILE *f = fopen(tmp, "wb");
    if (!f) return 0;

    fwrite("SCRHV001", 1, 8, f);
    write_u32_le(f, (uint32_t)status);
    write_lenstr(f, etag, etag_len);
    write_lenstr(f, lastmod, lastmod_len);
    write_lenstr(f, ctype, ctype_len);
    write_u64_le(f, (uint64_t)body_len);
    if (body_len) fwrite(body, 1, body_len, f);

    /* The write helpers return nothing, so rely on the stream's sticky
     * error flag; fclose can also fail while flushing buffered data. */
    int ok = !ferror(f);
    if (fclose(f) != 0) ok = 0;

    if (ok && rename(tmp, path) == 0) return 1;
    unlink(tmp);
    return 0;
}
|
|
427
|
+
|
|
428
|
+
/* ---- charset detection + iconv transcode to UTF-8 ----------------- *
|
|
429
|
+
* Find charset=... in a Content-Type header value (length-bounded), then
* iconv-transcode the body into a freshly allocated UTF-8 buffer that
* replaces the original one. Invalid input bytes are replaced with '?'
* so the parse layer never trips on undecodable input. UTF-8 / utf8 /
* ASCII / absent charset all skip the conversion.
|
|
433
|
+
*/
|
|
434
|
+
/* Extract the value of the first usable `charset=` parameter from a
 * Content-Type value `ct[0..ct_len)`.
 *
 * The parameter name is matched case-insensitively, blanks and tabs
 * around '=' are skipped, and the value may be wrapped in single or
 * double quotes. An unquoted value ends at ';', whitespace, CR or LF.
 *
 * Returns 1 and writes the NUL-terminated value to `out` (capacity
 * `cap`). Returns 0 when no parameter is found, the value is empty, or
 * it does not fit. */
static int scrap_extract_charset(const char *ct, size_t ct_len,
                                 char *out, size_t cap) {
    static const char key[] = "charset";
    const size_t key_len = sizeof(key) - 1;

    for (size_t start = 0; start + key_len < ct_len; start++) {
        /* Case-insensitive comparison of ct[start..] against the key. */
        size_t matched = 0;
        while (matched < key_len) {
            char c = ct[start + matched];
            if (c >= 'A' && c <= 'Z') c += 32;
            if (c != key[matched]) break;
            matched++;
        }
        if (matched != key_len) continue;

        size_t pos = start + key_len;
        while (pos < ct_len && (ct[pos] == ' ' || ct[pos] == '\t')) pos++;
        if (pos >= ct_len || ct[pos] != '=') continue;
        pos++;
        while (pos < ct_len && (ct[pos] == ' ' || ct[pos] == '\t')) pos++;

        char quote = 0;
        if (pos < ct_len && (ct[pos] == '"' || ct[pos] == '\'')) {
            quote = ct[pos];
            pos++;
        }

        size_t begin = pos;
        for (; pos < ct_len; pos++) {
            char c = ct[pos];
            if (quote) {
                if (c == quote) break;
            } else if (c == ';' || c == ' ' || c == '\t' || c == '\r' || c == '\n') {
                break;
            }
        }

        size_t value_len = pos - begin;
        if (value_len == 0 || value_len + 1 > cap) return 0;
        memcpy(out, ct + begin, value_len);
        out[value_len] = 0;
        return 1;
    }
    return 0;
}
|
|
466
|
+
|
|
467
|
+
/* Double the iconv output buffer while keeping the write cursor at the
 * same offset. Returns 1 on success; on allocation failure returns 0
 * and leaves *out (and the other values) valid and unchanged. */
static int scrap_transcode_grow(char **out, char **out_ptr,
                                size_t *out_cap, size_t *out_left) {
    size_t used = (size_t)(*out_ptr - *out);
    size_t ncap = *out_cap * 2;
    char *grown = (char *)realloc(*out, ncap);
    if (!grown) return 0;
    *out = grown;
    *out_ptr = grown + used;
    *out_cap = ncap;
    *out_left = ncap - used;
    return 1;
}

/* Convert *body from `charset` to UTF-8.
 *
 * body / body_len / body_cap - the malloc'd body buffer, its length and
 *                              capacity; updated on success.
 * charset                    - source encoding name as understood by
 *                              iconv_open.
 *
 * Returns 1 when a conversion happened: the old buffer is freed and
 * *body points at a new malloc'd buffer holding the UTF-8 bytes, NUL-
 * terminated one byte past *body_len. Bytes that cannot be decoded are
 * replaced by '?'.
 *
 * Returns 0, leaving *body untouched, when no conversion is needed
 * (NULL/empty charset, UTF-8, ASCII), when iconv does not know the
 * charset, when iconv fails unexpectedly, or when memory runs out. */
static int scrap_transcode_to_utf8(char **body, size_t *body_len, size_t *body_cap,
                                   const char *charset) {
    if (!charset || !*charset) return 0;
    if (strcasecmp(charset, "utf-8") == 0 ||
        strcasecmp(charset, "utf8") == 0 ||
        strcasecmp(charset, "us-ascii") == 0 ||
        strcasecmp(charset, "ascii") == 0) return 0;

    iconv_t cd = iconv_open("UTF-8", charset);
    if (cd == (iconv_t)-1) return 0;

    size_t in_left = *body_len;
    char *in_ptr = *body;
    size_t out_cap = (*body_len) * 2 + 16;
    char *out = (char *)malloc(out_cap);
    if (!out) { iconv_close(cd); return 0; }
    char *out_ptr = out;
    size_t out_left = out_cap;

    while (in_left > 0) {
        size_t r = iconv(cd, &in_ptr, &in_left, &out_ptr, &out_left);
        if (r != (size_t)-1) continue;

        int err = errno;  /* saved before realloc can overwrite it */
        if (err == E2BIG) {
            if (!scrap_transcode_grow(&out, &out_ptr, &out_cap, &out_left)) goto fail;
        } else if (err == EILSEQ || err == EINVAL) {
            /* Replace the offending byte with '?' and skip it. */
            if (out_left < 1 &&
                !scrap_transcode_grow(&out, &out_ptr, &out_cap, &out_left)) goto fail;
            *out_ptr++ = '?'; out_left--;
            in_ptr++; in_left--;
        } else {
            goto fail;
        }
    }

    /* Keep the result NUL-terminated, like bodies built by buf_append. */
    if (out_left < 1 &&
        !scrap_transcode_grow(&out, &out_ptr, &out_cap, &out_left)) goto fail;
    *out_ptr = 0;

    iconv_close(cd);
    free(*body);
    *body = out;
    *body_len = (size_t)(out_ptr - out);
    *body_cap = out_cap;
    return 1;

fail:
    iconv_close(cd);
    free(out);
    return 0;
}
|
|
517
|
+
|
|
518
|
+
/* Pull Content-Type out of a header blob and run transcode if its
|
|
519
|
+
* charset is non-UTF-8. No-op when there's no header, no charset, or
|
|
520
|
+
* the charset is already UTF-8 / ASCII. */
|
|
521
|
+
/* Find the Content-Type header in a raw header blob and, when it names
 * a charset, hand the body to scrap_transcode_to_utf8.
 *
 * The blob is scanned line by line (LF or CRLF terminated). When more
 * than one Content-Type line is present, the last one wins. Returns the
 * transcoder's result, or 0 when there is no Content-Type header or no
 * charset parameter in it. */
static int scrap_apply_charset(const char *headers_blob, size_t headers_len,
                               char **body, size_t *body_len, size_t *body_cap) {
    static const char wanted[] = "content-type";
    const size_t wanted_len = sizeof(wanted) - 1;

    const char *value = NULL;
    size_t value_len = 0;

    size_t pos = 0;
    while (pos < headers_len) {
        /* Delimit the current line, excluding its line terminator. */
        size_t line_start = pos;
        while (pos < headers_len && headers_blob[pos] != '\n') pos++;
        size_t line_end = pos;
        if (line_end > line_start && headers_blob[line_end - 1] == '\r') line_end--;
        if (pos < headers_len) pos++;
        if (line_end == line_start) continue;

        /* The header name is everything before the first ':'. */
        size_t name_end = line_start;
        while (name_end < line_end && headers_blob[name_end] != ':') name_end++;
        if (name_end == line_end) continue;
        if (name_end - line_start != wanted_len) continue;

        size_t k = 0;
        for (; k < wanted_len; k++) {
            char c = headers_blob[line_start + k];
            if (c >= 'A' && c <= 'Z') c += 32;
            if (c != wanted[k]) break;
        }
        if (k < wanted_len) continue;

        size_t value_start = name_end + 1;
        while (value_start < line_end &&
               (headers_blob[value_start] == ' ' || headers_blob[value_start] == '\t')) {
            value_start++;
        }
        value = headers_blob + value_start;
        value_len = line_end - value_start;
    }
    if (value == NULL) return 0;

    char charset[64];
    if (!scrap_extract_charset(value, value_len, charset, sizeof(charset))) return 0;
    return scrap_transcode_to_utf8(body, body_len, body_cap, charset);
}
|
|
555
|
+
|
|
556
|
+
/* ---- shared connection cache (CURLSH) ---------------------------- *
|
|
557
|
+
* libcurl easy handles each carry a private connection cache by
|
|
558
|
+
* default. With per-thread handles that means N pthread workers
|
|
559
|
+
* hitting one host open N independent TLS connections. CURLSH lets
|
|
560
|
+
* them all share one connection pool, one DNS cache, and one TLS
|
|
561
|
+
* session cache — so 8 workers against the same HTTP/2 origin
|
|
562
|
+
* settle on one (or a few) multiplexed connections instead of
|
|
563
|
+
* eight handshakes.
|
|
564
|
+
*
|
|
565
|
+
* libcurl requires user-provided locks for the share since the
|
|
566
|
+
* shared data structures can be touched concurrently. We use one
|
|
567
|
+
* pthread mutex per shared resource class. */
|
|
568
|
+
|
|
569
|
+
/* Process-wide libcurl share handle; NULL until scrap_share_init runs. */
static CURLSH *g_share = NULL;

/* Lock table for the share callbacks, indexed by the curl_lock_data
 * value libcurl passes in. Sixteen entries leave headroom beyond the
 * lock classes this file enables; the callbacks ignore any index
 * outside the table. */
#define SCRAP_SHARE_LOCKS 16
static pthread_mutex_t g_share_locks[SCRAP_SHARE_LOCKS];
|
|
575
|
+
|
|
576
|
+
static void scrap_share_lock(CURL *h, curl_lock_data data,
|
|
577
|
+
curl_lock_access access, void *user) {
|
|
578
|
+
(void)h; (void)access; (void)user;
|
|
579
|
+
if ((int)data >= 0 && (int)data < SCRAP_SHARE_LOCKS) {
|
|
580
|
+
pthread_mutex_lock(&g_share_locks[(int)data]);
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
/* CURLSHOPT_UNLOCKFUNC callback: release the mutex that belongs to lock
 * class `data`. Classes outside the lock table are silently ignored. */
static void scrap_share_unlock(CURL *h, curl_lock_data data, void *user) {
    (void)h;
    (void)user;

    int slot = (int)data;
    if (slot < 0 || slot >= SCRAP_SHARE_LOCKS) return;
    pthread_mutex_unlock(&g_share_locks[slot]);
}
|
|
589
|
+
static void scrap_share_init(void) {
|
|
590
|
+
if (g_share) return;
|
|
591
|
+
for (int i = 0; i < SCRAP_SHARE_LOCKS; i++) {
|
|
592
|
+
pthread_mutex_init(&g_share_locks[i], NULL);
|
|
593
|
+
}
|
|
594
|
+
g_share = curl_share_init();
|
|
595
|
+
curl_share_setopt(g_share, CURLSHOPT_LOCKFUNC, scrap_share_lock);
|
|
596
|
+
curl_share_setopt(g_share, CURLSHOPT_UNLOCKFUNC, scrap_share_unlock);
|
|
597
|
+
curl_share_setopt(g_share, CURLSHOPT_SHARE, CURL_LOCK_DATA_CONNECT);
|
|
598
|
+
curl_share_setopt(g_share, CURLSHOPT_SHARE, CURL_LOCK_DATA_DNS);
|
|
599
|
+
curl_share_setopt(g_share, CURLSHOPT_SHARE, CURL_LOCK_DATA_SSL_SESSION);
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
/* Lazy `curl_global_init` so `require "scrapetor"` doesn't kick libcurl
|
|
603
|
+
* — and through it Apple's SystemConfiguration framework — into spinning
|
|
604
|
+
* up Cocoa class +initialize methods on background threads. When the
|
|
605
|
+
* host process (Puma master, Spring preloader, Foreman + sidekiq) forks
|
|
606
|
+
* a worker before that init finishes, the child trips
|
|
607
|
+
* +[NSCharacterSet initialize] may have been in progress in another thread
|
|
608
|
+
* when fork() was called. We cannot safely call it or ignore it ... Crashing.
|
|
609
|
+
* Deferring the init until the first actual fetch lets the master fork
|
|
610
|
+
* cleanly; each post-fork worker then runs the init itself the first
|
|
611
|
+
* time it touches the network. pthread_once gives us safe one-time
|
|
612
|
+
* execution even if multiple worker threads race the first call. */
|
|
613
|
+
static pthread_once_t g_curl_init_once = PTHREAD_ONCE_INIT;
|
|
614
|
+
static void scrap_global_init_once_fn(void) {
|
|
615
|
+
curl_global_init(CURL_GLOBAL_DEFAULT);
|
|
616
|
+
scrap_share_init();
|
|
617
|
+
}
|
|
618
|
+
static inline void scrap_ensure_global_init(void) {
|
|
619
|
+
pthread_once(&g_curl_init_once, scrap_global_init_once_fn);
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
/* ---- per-thread curl handle pool ---------------------------------- *
|
|
623
|
+
* Re-creating an easy handle costs ~30 µs and discards connection
|
|
624
|
+
* cache. Holding one handle per OS thread (via pthread_specific) lets
|
|
625
|
+
* back-to-back fetches against the same host reuse the TLS/HTTP-2
|
|
626
|
+
* session. Cleared automatically on thread exit. */
|
|
627
|
+
|
|
628
|
+
/* Thread-specific-data key under which each thread stores its own curl
 * easy handle. */
static pthread_key_t g_curl_tls_key;

/* Once-control that guards creation of g_curl_tls_key. */
static pthread_once_t g_curl_tls_once = PTHREAD_ONCE_INIT;
|
|
630
|
+
|
|
631
|
+
static void curl_tls_dtor(void *p) {
|
|
632
|
+
if (p) curl_easy_cleanup((CURL *)p);
|
|
633
|
+
}
|
|
634
|
+
static void curl_tls_init(void) {
|
|
635
|
+
pthread_key_create(&g_curl_tls_key, curl_tls_dtor);
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
static CURL *get_thread_curl(void) {
|
|
639
|
+
pthread_once(&g_curl_tls_once, curl_tls_init);
|
|
640
|
+
CURL *h = (CURL *)pthread_getspecific(g_curl_tls_key);
|
|
641
|
+
if (!h) {
|
|
642
|
+
h = curl_easy_init();
|
|
643
|
+
pthread_setspecific(g_curl_tls_key, h);
|
|
644
|
+
} else {
|
|
645
|
+
curl_easy_reset(h);
|
|
646
|
+
/* curl_easy_reset does NOT clear cookie engine state. Wipe it
|
|
647
|
+
* explicitly so a previous call's cookies don't leak into the
|
|
648
|
+
* next one on the same per-thread handle. Callers who want
|
|
649
|
+
* cross-request cookie persistence opt in via :cookiejar /
|
|
650
|
+
* :cookiefile, which re-enables the engine for that request. */
|
|
651
|
+
curl_easy_setopt(h, CURLOPT_COOKIELIST, "ALL");
|
|
652
|
+
}
|
|
653
|
+
/* Attach the global share so this handle pulls connections, DNS
|
|
654
|
+
* results, and TLS sessions from the shared pool. Must be set
|
|
655
|
+
* after every reset because curl_easy_reset clears it. */
|
|
656
|
+
if (g_share) curl_easy_setopt(h, CURLOPT_SHARE, g_share);
|
|
657
|
+
return h;
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
/* ---- response buffer --------------------------------------------- */
|
|
661
|
+
|
|
662
|
+
/* Growable byte buffer used to collect response bodies and headers. */
typedef struct {
    char *data;  /* heap storage; NULL while the buffer is empty */
    size_t len;  /* bytes currently stored */
    size_t cap;  /* bytes allocated for data */
} buf_t;
|
|
667
|
+
|
|
668
|
+
static size_t buf_append(buf_t *b, const char *src, size_t n) {
|
|
669
|
+
if (b->len + n + 1 > b->cap) {
|
|
670
|
+
size_t nc = b->cap == 0 ? 16 * 1024 : b->cap * 2;
|
|
671
|
+
while (nc < b->len + n + 1) nc *= 2;
|
|
672
|
+
char *p = (char *)realloc(b->data, nc);
|
|
673
|
+
if (!p) return 0;
|
|
674
|
+
b->data = p; b->cap = nc;
|
|
675
|
+
}
|
|
676
|
+
memcpy(b->data + b->len, src, n);
|
|
677
|
+
b->len += n;
|
|
678
|
+
b->data[b->len] = 0;
|
|
679
|
+
return n;
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
static size_t cb_body(char *ptr, size_t size, size_t nmemb, void *userdata) {
|
|
683
|
+
buf_t *b = (buf_t *)userdata;
|
|
684
|
+
return buf_append(b, ptr, size * nmemb);
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
static size_t cb_header(char *ptr, size_t size, size_t nmemb, void *userdata) {
|
|
688
|
+
buf_t *b = (buf_t *)userdata;
|
|
689
|
+
return buf_append(b, ptr, size * nmemb);
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
/* ---- fetch context ----------------------------------------------- *
 * Built under GVL, then handed to the no-GVL worker which runs
 * curl_easy_perform. Carries everything the worker needs plus the
 * buffers and result code it leaves behind for the caller. */

typedef struct {
    /* Easy handle the worker performs the transfer on. */
    CURL *handle;
    /* Accumulated response body. */
    buf_t body;
    /* Accumulated raw response header bytes. */
    buf_t headers;
    /* Request header list attached to the handle. */
    struct curl_slist *req_headers; /* freed by caller */
    /* Result of curl_easy_perform, stored by do_fetch_nogvl. */
    CURLcode rc;
} fetch_ctx_t;
|
|
703
|
+
|
|
704
|
+
static void *do_fetch_nogvl(void *arg) {
|
|
705
|
+
fetch_ctx_t *fc = (fetch_ctx_t *)arg;
|
|
706
|
+
fc->rc = curl_easy_perform(fc->handle);
|
|
707
|
+
return NULL;
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
/* Parse a HTTP header blob ("HTTP/2 200\r\nHeader: value\r\n...") into
|
|
711
|
+
* a Ruby Hash. Multi-value headers get concatenated. Status lines are
|
|
712
|
+
* filtered out so the Hash only carries response headers. */
|
|
713
|
+
static VALUE parse_headers_blob(const char *data, size_t len) {
|
|
714
|
+
VALUE h = rb_hash_new();
|
|
715
|
+
size_t i = 0;
|
|
716
|
+
while (i < len) {
|
|
717
|
+
size_t line_start = i;
|
|
718
|
+
while (i < len && data[i] != '\n') i++;
|
|
719
|
+
size_t line_end = i;
|
|
720
|
+
if (line_end > line_start && data[line_end - 1] == '\r') line_end--;
|
|
721
|
+
if (i < len) i++;
|
|
722
|
+
if (line_end == line_start) continue;
|
|
723
|
+
|
|
724
|
+
/* Skip the "HTTP/x.y NNN ..." status line; curl emits one per
|
|
725
|
+
* redirect step. Real headers always contain ':'. */
|
|
726
|
+
size_t colon = (size_t)-1;
|
|
727
|
+
for (size_t k = line_start; k < line_end; k++) {
|
|
728
|
+
if (data[k] == ':') { colon = k; break; }
|
|
729
|
+
}
|
|
730
|
+
if (colon == (size_t)-1) continue;
|
|
731
|
+
|
|
732
|
+
size_t name_s = line_start;
|
|
733
|
+
size_t name_e = colon;
|
|
734
|
+
size_t val_s = colon + 1;
|
|
735
|
+
while (val_s < line_end && (data[val_s] == ' ' || data[val_s] == '\t')) val_s++;
|
|
736
|
+
size_t val_e = line_end;
|
|
737
|
+
|
|
738
|
+
VALUE name = rb_str_new(data + name_s, (long)(name_e - name_s));
|
|
739
|
+
VALUE val = rb_str_new(data + val_s, (long)(val_e - val_s));
|
|
740
|
+
rb_enc_associate(name, enc_utf8);
|
|
741
|
+
rb_enc_associate(val, enc_utf8);
|
|
742
|
+
/* Header names are ASCII case-insensitive; downcase for lookup
|
|
743
|
+
* ergonomics on the Ruby side. */
|
|
744
|
+
rb_funcall(name, rb_intern("downcase!"), 0);
|
|
745
|
+
VALUE existing = rb_hash_lookup(h, name);
|
|
746
|
+
if (NIL_P(existing)) {
|
|
747
|
+
rb_hash_aset(h, name, val);
|
|
748
|
+
} else {
|
|
749
|
+
VALUE both = rb_str_dup(existing);
|
|
750
|
+
rb_str_cat_cstr(both, ", ");
|
|
751
|
+
rb_str_append(both, val);
|
|
752
|
+
rb_hash_aset(h, name, both);
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
return h;
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
static VALUE scrap_http_get(int argc, VALUE *argv, VALUE self) {
|
|
759
|
+
(void)self;
|
|
760
|
+
scrap_ensure_global_init();
|
|
761
|
+
VALUE url_v, opts_v;
|
|
762
|
+
rb_scan_args(argc, argv, "11", &url_v, &opts_v);
|
|
763
|
+
Check_Type(url_v, T_STRING);
|
|
764
|
+
|
|
765
|
+
long timeout_ms = 30000;
|
|
766
|
+
int follow = 1;
|
|
767
|
+
long max_redirs = 10;
|
|
768
|
+
const char *ua = "scrapetor/0.1 (libcurl)";
|
|
769
|
+
VALUE headers_v = Qnil;
|
|
770
|
+
int insecure = 0;
|
|
771
|
+
const char *method = NULL; /* NULL = GET */
|
|
772
|
+
const char *body = NULL;
|
|
773
|
+
long body_len = 0;
|
|
774
|
+
int nobody = 0; /* HEAD */
|
|
775
|
+
const char *cookiejar = NULL;
|
|
776
|
+
const char *cookiefile = NULL;
|
|
777
|
+
const char *proxy = NULL;
|
|
778
|
+
const char *basic_auth = NULL;
|
|
779
|
+
const char *bearer = NULL;
|
|
780
|
+
const char *ca_path = NULL;
|
|
781
|
+
long rate_limit_ms = 0;
|
|
782
|
+
int transcode_utf8 = 1;
|
|
783
|
+
const char *cache_dir = NULL;
|
|
784
|
+
VALUE multipart_v = Qnil;
|
|
785
|
+
/* mTLS client cert + key */
|
|
786
|
+
const char *ssl_cert = NULL;
|
|
787
|
+
const char *ssl_key = NULL;
|
|
788
|
+
const char *ssl_key_pass = NULL;
|
|
789
|
+
const char *ssl_cert_type = NULL; /* "PEM" / "DER"; NULL = libcurl default */
|
|
790
|
+
/* Proxy auth + type */
|
|
791
|
+
const char *proxy_auth = NULL; /* "user:pass" */
|
|
792
|
+
const char *proxy_type = NULL; /* "http", "socks5", "socks5h", ... */
|
|
793
|
+
/* Stream body to disk instead of buffering. When set, the
|
|
794
|
+
* response :body in the returned hash is an empty String; the
|
|
795
|
+
* caller reads from the file. */
|
|
796
|
+
const char *download_to = NULL;
|
|
797
|
+
long max_recv_bps = 0; /* CURLOPT_MAX_RECV_SPEED_LARGE */
|
|
798
|
+
long max_send_bps = 0; /* CURLOPT_MAX_SEND_SPEED_LARGE */
|
|
799
|
+
/* HTTP version selection. NULL = default (HTTP/2 over TLS with
|
|
800
|
+
* 1.1 fallback). "1.0" / "1.1" / "2" / "3" force the negotiated
|
|
801
|
+
* version. "3" requires libcurl with HTTP/3 support; otherwise
|
|
802
|
+
* curl errors. */
|
|
803
|
+
const char *http_version = NULL;
|
|
804
|
+
|
|
805
|
+
if (!NIL_P(opts_v)) {
|
|
806
|
+
Check_Type(opts_v, T_HASH);
|
|
807
|
+
VALUE v;
|
|
808
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("timeout_ms")));
|
|
809
|
+
if (!NIL_P(v)) timeout_ms = NUM2LONG(v);
|
|
810
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("follow_redirects")));
|
|
811
|
+
if (!NIL_P(v)) follow = RTEST(v) ? 1 : 0;
|
|
812
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_redirects")));
|
|
813
|
+
if (!NIL_P(v)) max_redirs = NUM2LONG(v);
|
|
814
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("user_agent")));
|
|
815
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); ua = RSTRING_PTR(v); }
|
|
816
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("headers")));
|
|
817
|
+
if (!NIL_P(v)) { Check_Type(v, T_HASH); headers_v = v; }
|
|
818
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("insecure")));
|
|
819
|
+
if (!NIL_P(v)) insecure = RTEST(v) ? 1 : 0;
|
|
820
|
+
|
|
821
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("method")));
|
|
822
|
+
if (!NIL_P(v)) {
|
|
823
|
+
if (SYMBOL_P(v)) v = rb_sym2str(v);
|
|
824
|
+
Check_Type(v, T_STRING);
|
|
825
|
+
method = RSTRING_PTR(v);
|
|
826
|
+
if (strcasecmp(method, "head") == 0) { nobody = 1; method = NULL; }
|
|
827
|
+
else if (strcasecmp(method, "get") == 0) method = NULL;
|
|
828
|
+
else {
|
|
829
|
+
/* HTTP methods are case-sensitive; uppercase so
|
|
830
|
+
* picky servers (RFC 7231 strict) accept them. */
|
|
831
|
+
static char method_buf[24];
|
|
832
|
+
size_t mi = 0;
|
|
833
|
+
for (; mi < sizeof(method_buf) - 1 && method[mi]; mi++) {
|
|
834
|
+
char c = method[mi];
|
|
835
|
+
method_buf[mi] = (c >= 'a' && c <= 'z') ? (char)(c - 32) : c;
|
|
836
|
+
}
|
|
837
|
+
method_buf[mi] = 0;
|
|
838
|
+
method = method_buf;
|
|
839
|
+
}
|
|
840
|
+
}
|
|
841
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("body")));
|
|
842
|
+
if (!NIL_P(v)) {
|
|
843
|
+
Check_Type(v, T_STRING);
|
|
844
|
+
body = RSTRING_PTR(v);
|
|
845
|
+
body_len = RSTRING_LEN(v);
|
|
846
|
+
}
|
|
847
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cookiejar")));
|
|
848
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); cookiejar = RSTRING_PTR(v); }
|
|
849
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cookiefile")));
|
|
850
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); cookiefile = RSTRING_PTR(v); }
|
|
851
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("proxy")));
|
|
852
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); proxy = RSTRING_PTR(v); }
|
|
853
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("basic_auth")));
|
|
854
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); basic_auth = RSTRING_PTR(v); }
|
|
855
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("bearer_token")));
|
|
856
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); bearer = RSTRING_PTR(v); }
|
|
857
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ca_path")));
|
|
858
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); ca_path = RSTRING_PTR(v); }
|
|
859
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("rate_limit_ms")));
|
|
860
|
+
if (!NIL_P(v)) rate_limit_ms = NUM2LONG(v);
|
|
861
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("transcode_utf8")));
|
|
862
|
+
if (!NIL_P(v)) transcode_utf8 = RTEST(v) ? 1 : 0;
|
|
863
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cache_dir")));
|
|
864
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); cache_dir = RSTRING_PTR(v); }
|
|
865
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("multipart")));
|
|
866
|
+
if (!NIL_P(v)) { Check_Type(v, T_HASH); multipart_v = v; }
|
|
867
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ssl_cert")));
|
|
868
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); ssl_cert = RSTRING_PTR(v); }
|
|
869
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ssl_key")));
|
|
870
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); ssl_key = RSTRING_PTR(v); }
|
|
871
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ssl_key_password")));
|
|
872
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); ssl_key_pass = RSTRING_PTR(v); }
|
|
873
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("ssl_cert_type")));
|
|
874
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); ssl_cert_type = RSTRING_PTR(v); }
|
|
875
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("proxy_auth")));
|
|
876
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); proxy_auth = RSTRING_PTR(v); }
|
|
877
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("proxy_type")));
|
|
878
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); proxy_type = RSTRING_PTR(v); }
|
|
879
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("download_to")));
|
|
880
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); download_to = RSTRING_PTR(v); }
|
|
881
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_recv_bps")));
|
|
882
|
+
if (!NIL_P(v)) max_recv_bps = NUM2LONG(v);
|
|
883
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_send_bps")));
|
|
884
|
+
if (!NIL_P(v)) max_send_bps = NUM2LONG(v);
|
|
885
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("http_version")));
|
|
886
|
+
if (!NIL_P(v)) {
|
|
887
|
+
if (SYMBOL_P(v)) v = rb_sym2str(v);
|
|
888
|
+
Check_Type(v, T_STRING);
|
|
889
|
+
http_version = RSTRING_PTR(v);
|
|
890
|
+
}
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
CURL *h = get_thread_curl();
|
|
894
|
+
if (!h) rb_raise(rb_eRuntimeError, "curl_easy_init failed");
|
|
895
|
+
|
|
896
|
+
fetch_ctx_t fc;
|
|
897
|
+
memset(&fc, 0, sizeof(fc));
|
|
898
|
+
fc.handle = h;
|
|
899
|
+
|
|
900
|
+
curl_easy_setopt(h, CURLOPT_URL, RSTRING_PTR(url_v));
|
|
901
|
+
/* HTTP version. Default: HTTP/2 over TLS with 1.1 fallback via
|
|
902
|
+
* ALPN — non-HTTPS targets fall back to HTTP/1.1 automatically.
|
|
903
|
+
* Opt in to "3" if the linked libcurl was built with HTTP/3. */
|
|
904
|
+
{
|
|
905
|
+
long ver = (long)CURL_HTTP_VERSION_2TLS;
|
|
906
|
+
if (http_version) {
|
|
907
|
+
if (strcmp(http_version, "1.0") == 0) ver = CURL_HTTP_VERSION_1_0;
|
|
908
|
+
else if (strcmp(http_version, "1.1") == 0) ver = CURL_HTTP_VERSION_1_1;
|
|
909
|
+
else if (strcmp(http_version, "2") == 0) ver = CURL_HTTP_VERSION_2TLS;
|
|
910
|
+
#ifdef CURL_HTTP_VERSION_3
|
|
911
|
+
else if (strcmp(http_version, "3") == 0) ver = CURL_HTTP_VERSION_3;
|
|
912
|
+
#endif
|
|
913
|
+
}
|
|
914
|
+
curl_easy_setopt(h, CURLOPT_HTTP_VERSION, ver);
|
|
915
|
+
}
|
|
916
|
+
/* Tell curl to wait briefly for an existing HTTP/2 connection to
|
|
917
|
+
* the target to become available rather than opening a fresh
|
|
918
|
+
* TCP+TLS handshake. Combined with the shared CONNECT pool this
|
|
919
|
+
* lets N workers multiplex through one connection per host. */
|
|
920
|
+
#ifdef CURLOPT_PIPEWAIT
|
|
921
|
+
curl_easy_setopt(h, CURLOPT_PIPEWAIT, 1L);
|
|
922
|
+
#endif
|
|
923
|
+
/* Accept-Encoding goes through CURLOPT_HTTPHEADER below, not
|
|
924
|
+
* CURLOPT_ACCEPT_ENCODING. The latter binds decompression to
|
|
925
|
+
* libcurl's compile-time codec set and aborts the response on
|
|
926
|
+
* encodings curl wasn't built for — which would defeat our
|
|
927
|
+
* point of shipping in-process brotli/zstd. */
|
|
928
|
+
fc.req_headers = curl_slist_append(
|
|
929
|
+
fc.req_headers, "Accept-Encoding: identity");
|
|
930
|
+
/* Replaced just below if any codec is linked. */
|
|
931
|
+
if (scrap_accept_encoding()[0] && strcmp(scrap_accept_encoding(), "identity") != 0) {
|
|
932
|
+
char ae_line[160];
|
|
933
|
+
snprintf(ae_line, sizeof(ae_line), "Accept-Encoding: %s",
|
|
934
|
+
scrap_accept_encoding());
|
|
935
|
+
/* Pop the identity line and replace. curl_slist has no
|
|
936
|
+
* direct replace, so we rebuild from scratch. */
|
|
937
|
+
curl_slist_free_all(fc.req_headers);
|
|
938
|
+
fc.req_headers = NULL;
|
|
939
|
+
fc.req_headers = curl_slist_append(fc.req_headers, ae_line);
|
|
940
|
+
}
|
|
941
|
+
curl_easy_setopt(h, CURLOPT_USERAGENT, ua);
|
|
942
|
+
curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, (long)follow);
|
|
943
|
+
curl_easy_setopt(h, CURLOPT_MAXREDIRS, max_redirs);
|
|
944
|
+
curl_easy_setopt(h, CURLOPT_TIMEOUT_MS, timeout_ms);
|
|
945
|
+
curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L); /* required for use inside Ruby */
|
|
946
|
+
curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, cb_body);
|
|
947
|
+
curl_easy_setopt(h, CURLOPT_WRITEDATA, &fc.body);
|
|
948
|
+
curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, cb_header);
|
|
949
|
+
curl_easy_setopt(h, CURLOPT_HEADERDATA, &fc.headers);
|
|
950
|
+
curl_easy_setopt(h, CURLOPT_TCP_KEEPALIVE, 1L);
|
|
951
|
+
if (insecure) {
|
|
952
|
+
curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
|
|
953
|
+
curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
|
|
954
|
+
}
|
|
955
|
+
if (ca_path) {
|
|
956
|
+
curl_easy_setopt(h, CURLOPT_CAINFO, ca_path);
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
/* Method + body. CUSTOMREQUEST overrides the verb regardless of
|
|
960
|
+
* POSTFIELDS presence; libcurl auto-switches to POST when POSTFIELDS
|
|
961
|
+
* is set, so we force CUSTOMREQUEST for everything non-GET to be
|
|
962
|
+
* explicit. NOBODY for HEAD strips the response body. */
|
|
963
|
+
if (nobody) {
|
|
964
|
+
curl_easy_setopt(h, CURLOPT_NOBODY, 1L);
|
|
965
|
+
curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, "HEAD");
|
|
966
|
+
} else if (method) {
|
|
967
|
+
curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, method);
|
|
968
|
+
}
|
|
969
|
+
if (body) {
|
|
970
|
+
curl_easy_setopt(h, CURLOPT_POSTFIELDS, body);
|
|
971
|
+
curl_easy_setopt(h, CURLOPT_POSTFIELDSIZE_LARGE, (curl_off_t)body_len);
|
|
972
|
+
}
|
|
973
|
+
|
|
974
|
+
/* Multipart form upload. Each Hash entry becomes a form part:
|
|
975
|
+
* "field" => "string" - text field
|
|
976
|
+
* "field" => { path: "...", filename: ..., content_type: ... }
|
|
977
|
+
* "field" => { data: "...bytes...", filename: ..., content_type: ... }
|
|
978
|
+
* Mixed in any combination. */
|
|
979
|
+
curl_mime *mime = NULL;
|
|
980
|
+
if (!NIL_P(multipart_v)) {
|
|
981
|
+
mime = curl_mime_init(h);
|
|
982
|
+
VALUE keys = rb_funcall(multipart_v, rb_intern("keys"), 0);
|
|
983
|
+
long nk = RARRAY_LEN(keys);
|
|
984
|
+
for (long i = 0; i < nk; i++) {
|
|
985
|
+
VALUE k = rb_ary_entry(keys, i);
|
|
986
|
+
VALUE pv = rb_hash_aref(multipart_v, k);
|
|
987
|
+
VALUE k_s = rb_obj_as_string(k);
|
|
988
|
+
curl_mimepart *part = curl_mime_addpart(mime);
|
|
989
|
+
curl_mime_name(part, RSTRING_PTR(k_s));
|
|
990
|
+
if (RB_TYPE_P(pv, T_STRING)) {
|
|
991
|
+
curl_mime_data(part, RSTRING_PTR(pv), (size_t)RSTRING_LEN(pv));
|
|
992
|
+
} else if (RB_TYPE_P(pv, T_HASH)) {
|
|
993
|
+
VALUE data_v = rb_hash_aref(pv, ID2SYM(rb_intern("data")));
|
|
994
|
+
VALUE path_v = rb_hash_aref(pv, ID2SYM(rb_intern("path")));
|
|
995
|
+
VALUE filename_v = rb_hash_aref(pv, ID2SYM(rb_intern("filename")));
|
|
996
|
+
VALUE ctype_v = rb_hash_aref(pv, ID2SYM(rb_intern("content_type")));
|
|
997
|
+
if (!NIL_P(path_v)) {
|
|
998
|
+
Check_Type(path_v, T_STRING);
|
|
999
|
+
curl_mime_filedata(part, RSTRING_PTR(path_v));
|
|
1000
|
+
} else if (!NIL_P(data_v)) {
|
|
1001
|
+
Check_Type(data_v, T_STRING);
|
|
1002
|
+
curl_mime_data(part, RSTRING_PTR(data_v), (size_t)RSTRING_LEN(data_v));
|
|
1003
|
+
}
|
|
1004
|
+
if (!NIL_P(filename_v)) {
|
|
1005
|
+
Check_Type(filename_v, T_STRING);
|
|
1006
|
+
curl_mime_filename(part, RSTRING_PTR(filename_v));
|
|
1007
|
+
}
|
|
1008
|
+
if (!NIL_P(ctype_v)) {
|
|
1009
|
+
Check_Type(ctype_v, T_STRING);
|
|
1010
|
+
curl_mime_type(part, RSTRING_PTR(ctype_v));
|
|
1011
|
+
}
|
|
1012
|
+
} else {
|
|
1013
|
+
rb_raise(rb_eArgError,
|
|
1014
|
+
"multipart values must be String or Hash with :path/:data");
|
|
1015
|
+
}
|
|
1016
|
+
}
|
|
1017
|
+
curl_easy_setopt(h, CURLOPT_MIMEPOST, mime);
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
if (cookiefile) curl_easy_setopt(h, CURLOPT_COOKIEFILE, cookiefile);
|
|
1021
|
+
if (cookiejar) curl_easy_setopt(h, CURLOPT_COOKIEJAR, cookiejar);
|
|
1022
|
+
if (!cookiefile && cookiejar) {
|
|
1023
|
+
/* Tell curl to start with an empty in-memory jar (so writes
|
|
1024
|
+
* land somewhere) even when no input file is provided. */
|
|
1025
|
+
curl_easy_setopt(h, CURLOPT_COOKIEFILE, "");
|
|
1026
|
+
}
|
|
1027
|
+
if (proxy) curl_easy_setopt(h, CURLOPT_PROXY, proxy);
|
|
1028
|
+
if (proxy_auth) curl_easy_setopt(h, CURLOPT_PROXYUSERPWD, proxy_auth);
|
|
1029
|
+
if (proxy_type) {
|
|
1030
|
+
long pt = CURLPROXY_HTTP;
|
|
1031
|
+
if (strcasecmp(proxy_type, "http") == 0) pt = CURLPROXY_HTTP;
|
|
1032
|
+
else if (strcasecmp(proxy_type, "https") == 0) pt = CURLPROXY_HTTPS;
|
|
1033
|
+
else if (strcasecmp(proxy_type, "socks4") == 0) pt = CURLPROXY_SOCKS4;
|
|
1034
|
+
else if (strcasecmp(proxy_type, "socks4a") == 0) pt = CURLPROXY_SOCKS4A;
|
|
1035
|
+
else if (strcasecmp(proxy_type, "socks5") == 0) pt = CURLPROXY_SOCKS5;
|
|
1036
|
+
else if (strcasecmp(proxy_type, "socks5h") == 0) pt = CURLPROXY_SOCKS5_HOSTNAME;
|
|
1037
|
+
curl_easy_setopt(h, CURLOPT_PROXYTYPE, pt);
|
|
1038
|
+
}
|
|
1039
|
+
/* mTLS: present a client cert during TLS handshake. */
|
|
1040
|
+
if (ssl_cert) curl_easy_setopt(h, CURLOPT_SSLCERT, ssl_cert);
|
|
1041
|
+
if (ssl_cert_type) curl_easy_setopt(h, CURLOPT_SSLCERTTYPE, ssl_cert_type);
|
|
1042
|
+
if (ssl_key) curl_easy_setopt(h, CURLOPT_SSLKEY, ssl_key);
|
|
1043
|
+
if (ssl_key_pass) curl_easy_setopt(h, CURLOPT_KEYPASSWD, ssl_key_pass);
|
|
1044
|
+
/* Bandwidth caps. 0 means unlimited (libcurl default). */
|
|
1045
|
+
if (max_recv_bps > 0) {
|
|
1046
|
+
curl_easy_setopt(h, CURLOPT_MAX_RECV_SPEED_LARGE, (curl_off_t)max_recv_bps);
|
|
1047
|
+
}
|
|
1048
|
+
if (max_send_bps > 0) {
|
|
1049
|
+
curl_easy_setopt(h, CURLOPT_MAX_SEND_SPEED_LARGE, (curl_off_t)max_send_bps);
|
|
1050
|
+
}
|
|
1051
|
+
/* Streaming download to disk. Uses libcurl's default fwrite
|
|
1052
|
+
* callback, bypassing the in-memory body buffer entirely — no
|
|
1053
|
+
* RAM growth regardless of body size. Caller reads from
|
|
1054
|
+
* download_to after the request returns. */
|
|
1055
|
+
FILE *dl_fp = NULL;
|
|
1056
|
+
if (download_to) {
|
|
1057
|
+
dl_fp = fopen(download_to, "wb");
|
|
1058
|
+
if (!dl_fp) {
|
|
1059
|
+
rb_raise(rb_eIOError, "scrapetor http: cannot open download_to %s", download_to);
|
|
1060
|
+
}
|
|
1061
|
+
curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, NULL); /* libcurl's default = fwrite */
|
|
1062
|
+
curl_easy_setopt(h, CURLOPT_WRITEDATA, (void *)dl_fp);
|
|
1063
|
+
}
|
|
1064
|
+
if (basic_auth) {
|
|
1065
|
+
curl_easy_setopt(h, CURLOPT_HTTPAUTH, (long)CURLAUTH_BASIC);
|
|
1066
|
+
curl_easy_setopt(h, CURLOPT_USERPWD, basic_auth);
|
|
1067
|
+
}
|
|
1068
|
+
if (bearer) {
|
|
1069
|
+
#ifdef CURLAUTH_BEARER
|
|
1070
|
+
curl_easy_setopt(h, CURLOPT_HTTPAUTH, (long)CURLAUTH_BEARER);
|
|
1071
|
+
curl_easy_setopt(h, CURLOPT_XOAUTH2_BEARER, bearer);
|
|
1072
|
+
#else
|
|
1073
|
+
/* Older libcurl — fall back to a manual Authorization header. */
|
|
1074
|
+
char line[1024];
|
|
1075
|
+
snprintf(line, sizeof(line), "Authorization: Bearer %s", bearer);
|
|
1076
|
+
fc.req_headers = curl_slist_append(fc.req_headers, line);
|
|
1077
|
+
#endif
|
|
1078
|
+
}
|
|
1079
|
+
|
|
1080
|
+
if (!NIL_P(headers_v)) {
|
|
1081
|
+
VALUE keys = rb_funcall(headers_v, rb_intern("keys"), 0);
|
|
1082
|
+
long n = RARRAY_LEN(keys);
|
|
1083
|
+
for (long i = 0; i < n; i++) {
|
|
1084
|
+
VALUE k = rb_ary_entry(keys, i);
|
|
1085
|
+
VALUE v = rb_hash_aref(headers_v, k);
|
|
1086
|
+
VALUE line = rb_str_dup(k);
|
|
1087
|
+
rb_str_cat_cstr(line, ": ");
|
|
1088
|
+
rb_str_append(line, v);
|
|
1089
|
+
fc.req_headers = curl_slist_append(fc.req_headers, RSTRING_PTR(line));
|
|
1090
|
+
}
|
|
1091
|
+
}
|
|
1092
|
+
/* Always set the slist — at minimum it carries Accept-Encoding so
|
|
1093
|
+
* curl forwards our codec advertisement rather than its own
|
|
1094
|
+
* (which would let curl claim decompression responsibility we
|
|
1095
|
+
* mean to keep). */
|
|
1096
|
+
if (fc.req_headers) {
|
|
1097
|
+
curl_easy_setopt(h, CURLOPT_HTTPHEADER, fc.req_headers);
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
/* HTTP response cache lookup + revalidation. If a cache entry
|
|
1101
|
+
* exists for this URL, attach If-None-Match / If-Modified-Since
|
|
1102
|
+
* so the server can answer 304 (no body) when nothing changed. */
|
|
1103
|
+
scrap_cache_entry_t cached;
|
|
1104
|
+
memset(&cached, 0, sizeof(cached));
|
|
1105
|
+
int have_cached = 0;
|
|
1106
|
+
if (cache_dir && !nobody && !method && !body) {
|
|
1107
|
+
/* Cache only safe-GETs. POST/PUT/DELETE responses aren't
|
|
1108
|
+
* eligible per RFC 7234, and HEAD has no body to serve. */
|
|
1109
|
+
have_cached = scrap_cache_load(cache_dir, RSTRING_PTR(url_v), &cached);
|
|
1110
|
+
if (have_cached) {
|
|
1111
|
+
if (cached.etag_len > 0) {
|
|
1112
|
+
char line[1024];
|
|
1113
|
+
snprintf(line, sizeof(line), "If-None-Match: %.*s",
|
|
1114
|
+
(int)cached.etag_len, cached.etag);
|
|
1115
|
+
fc.req_headers = curl_slist_append(fc.req_headers, line);
|
|
1116
|
+
}
|
|
1117
|
+
if (cached.lastmod_len > 0) {
|
|
1118
|
+
char line[1024];
|
|
1119
|
+
snprintf(line, sizeof(line), "If-Modified-Since: %.*s",
|
|
1120
|
+
(int)cached.lastmod_len, cached.lastmod);
|
|
1121
|
+
fc.req_headers = curl_slist_append(fc.req_headers, line);
|
|
1122
|
+
}
|
|
1123
|
+
}
|
|
1124
|
+
}
|
|
1125
|
+
|
|
1126
|
+
/* Per-host throttle. Honours rate_limit_ms before we even open
|
|
1127
|
+
* the socket; safe to call under GVL or no-GVL since it uses
|
|
1128
|
+
* only pthread + nanosleep. */
|
|
1129
|
+
if (rate_limit_ms > 0) {
|
|
1130
|
+
char host[256];
|
|
1131
|
+
if (scrap_extract_host(RSTRING_PTR(url_v), host, sizeof(host))) {
|
|
1132
|
+
scrap_throttle_wait(host, (uint64_t)rate_limit_ms * 1000000ull);
|
|
1133
|
+
}
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1136
|
+
/* Drop the GVL while curl is on the network. Other Ruby threads
|
|
1137
|
+
* (background loaders, log writers, etc.) keep moving during the
|
|
1138
|
+
* round-trip. */
|
|
1139
|
+
rb_thread_call_without_gvl(do_fetch_nogvl, &fc, NULL, NULL);
|
|
1140
|
+
|
|
1141
|
+
if (fc.req_headers) curl_slist_free_all(fc.req_headers);
|
|
1142
|
+
if (mime) curl_mime_free(mime);
|
|
1143
|
+
if (dl_fp) { fclose(dl_fp); dl_fp = NULL; }
|
|
1144
|
+
|
|
1145
|
+
if (fc.rc != CURLE_OK) {
|
|
1146
|
+
const char *err = curl_easy_strerror(fc.rc);
|
|
1147
|
+
free(fc.body.data);
|
|
1148
|
+
free(fc.headers.data);
|
|
1149
|
+
rb_raise(rb_eIOError, "scrapetor http: %s", err);
|
|
1150
|
+
}
|
|
1151
|
+
|
|
1152
|
+
long status = 0;
|
|
1153
|
+
curl_easy_getinfo(h, CURLINFO_RESPONSE_CODE, &status);
|
|
1154
|
+
char *eff_url = NULL;
|
|
1155
|
+
curl_easy_getinfo(h, CURLINFO_EFFECTIVE_URL, &eff_url);
|
|
1156
|
+
long http_ver = 0;
|
|
1157
|
+
curl_easy_getinfo(h, CURLINFO_HTTP_VERSION, &http_ver);
|
|
1158
|
+
|
|
1159
|
+
/* Flush the cookie jar to disk now rather than at handle cleanup
|
|
1160
|
+
* (which happens on thread exit). Lets callers see Set-Cookie
|
|
1161
|
+
* values immediately after the request completes. */
|
|
1162
|
+
if (cookiejar) curl_easy_setopt(h, CURLOPT_COOKIELIST, "FLUSH");
|
|
1163
|
+
|
|
1164
|
+
/* HTTP cache revalidation: 304 -> serve from cache; 200 with
|
|
1165
|
+
* ETag/Last-Modified -> store new entry. */
|
|
1166
|
+
int served_from_cache = 0;
|
|
1167
|
+
if (cache_dir && have_cached && status == 304) {
|
|
1168
|
+
/* Replace body buffer with cached payload; bump status to 200
|
|
1169
|
+
* so the caller sees a fully-formed response. The actual 304
|
|
1170
|
+
* round-trip was cheap (no body) — this is the cache win. */
|
|
1171
|
+
free(fc.body.data);
|
|
1172
|
+
fc.body.data = (char *)malloc(cached.body_len + 1);
|
|
1173
|
+
memcpy(fc.body.data, cached.body, cached.body_len);
|
|
1174
|
+
fc.body.data[cached.body_len] = 0;
|
|
1175
|
+
fc.body.len = cached.body_len;
|
|
1176
|
+
fc.body.cap = cached.body_len;
|
|
1177
|
+
status = 200;
|
|
1178
|
+
served_from_cache = 1;
|
|
1179
|
+
}
|
|
1180
|
+
|
|
1181
|
+
VALUE headers_h = parse_headers_blob(fc.headers.data ? fc.headers.data : "",
|
|
1182
|
+
fc.headers.len);
|
|
1183
|
+
|
|
1184
|
+
/* When we served from cache, the network response was 304 with no
|
|
1185
|
+
* headers other than status/ETag. Overlay the cached
|
|
1186
|
+
* Content-Type so consumers see the right metadata for the
|
|
1187
|
+
* body they're getting. */
|
|
1188
|
+
if (served_from_cache && cached.ctype_len > 0) {
|
|
1189
|
+
rb_hash_aset(headers_h, rb_str_new_cstr("content-type"),
|
|
1190
|
+
rb_str_new(cached.ctype, (long)cached.ctype_len));
|
|
1191
|
+
rb_hash_aset(headers_h, rb_str_new_cstr("x-scrapetor-cache"),
|
|
1192
|
+
rb_str_new_cstr("hit"));
|
|
1193
|
+
} else if (cache_dir && have_cached) {
|
|
1194
|
+
rb_hash_aset(headers_h, rb_str_new_cstr("x-scrapetor-cache"),
|
|
1195
|
+
rb_str_new_cstr("miss-revalidated"));
|
|
1196
|
+
}
|
|
1197
|
+
|
|
1198
|
+
/* If a Content-Encoding header is still present, libcurl couldn't
|
|
1199
|
+
* decode it (it strips the header on successful auto-decompress).
|
|
1200
|
+
* Try our in-process decoders for brotli / zstd. On success,
|
|
1201
|
+
* remove the header so the body matches what callers see. */
|
|
1202
|
+
{
|
|
1203
|
+
VALUE ce_key = rb_str_new_cstr("content-encoding");
|
|
1204
|
+
VALUE ce_val = rb_hash_lookup(headers_h, ce_key);
|
|
1205
|
+
if (!NIL_P(ce_val)) {
|
|
1206
|
+
const char *ce = RSTRING_PTR(ce_val);
|
|
1207
|
+
long ce_len = RSTRING_LEN(ce_val);
|
|
1208
|
+
/* Trim surrounding whitespace + match the bare codec name. */
|
|
1209
|
+
while (ce_len > 0 && (*ce == ' ' || *ce == '\t')) { ce++; ce_len--; }
|
|
1210
|
+
while (ce_len > 0 && (ce[ce_len-1] == ' ' || ce[ce_len-1] == '\t' ||
|
|
1211
|
+
ce[ce_len-1] == '\r' || ce[ce_len-1] == '\n')) ce_len--;
|
|
1212
|
+
int decoded = 0;
|
|
1213
|
+
#ifdef HAVE_ZLIB
|
|
1214
|
+
if (ce_len == 4 &&
|
|
1215
|
+
(ce[0] == 'g' || ce[0] == 'G') &&
|
|
1216
|
+
(ce[1] == 'z' || ce[1] == 'Z') &&
|
|
1217
|
+
(ce[2] == 'i' || ce[2] == 'I') &&
|
|
1218
|
+
(ce[3] == 'p' || ce[3] == 'P')) {
|
|
1219
|
+
char *out = NULL; size_t out_len = 0;
|
|
1220
|
+
/* 47 = 15 + 32; +32 enables gzip+zlib auto-detect. */
|
|
1221
|
+
if (scrap_zlib_decode(fc.body.data, fc.body.len, 47,
|
|
1222
|
+
&out, &out_len)) {
|
|
1223
|
+
free(fc.body.data);
|
|
1224
|
+
fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
|
|
1225
|
+
decoded = 1;
|
|
1226
|
+
}
|
|
1227
|
+
}
|
|
1228
|
+
if (!decoded && ce_len == 7 &&
|
|
1229
|
+
(ce[0] == 'd' || ce[0] == 'D') &&
|
|
1230
|
+
(ce[1] == 'e' || ce[1] == 'E') &&
|
|
1231
|
+
(ce[2] == 'f' || ce[2] == 'F') &&
|
|
1232
|
+
(ce[3] == 'l' || ce[3] == 'L') &&
|
|
1233
|
+
(ce[4] == 'a' || ce[4] == 'A') &&
|
|
1234
|
+
(ce[5] == 't' || ce[5] == 'T') &&
|
|
1235
|
+
(ce[6] == 'e' || ce[6] == 'E')) {
|
|
1236
|
+
char *out = NULL; size_t out_len = 0;
|
|
1237
|
+
/* Try raw deflate first (-15), fall back to zlib wrapper (15).
|
|
1238
|
+
* Real-world Content-Encoding: deflate is sent both ways. */
|
|
1239
|
+
if (!scrap_zlib_decode(fc.body.data, fc.body.len, -15,
|
|
1240
|
+
&out, &out_len)) {
|
|
1241
|
+
if (scrap_zlib_decode(fc.body.data, fc.body.len, 15,
|
|
1242
|
+
&out, &out_len)) {
|
|
1243
|
+
free(fc.body.data);
|
|
1244
|
+
fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
|
|
1245
|
+
decoded = 1;
|
|
1246
|
+
}
|
|
1247
|
+
} else {
|
|
1248
|
+
free(fc.body.data);
|
|
1249
|
+
fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
|
|
1250
|
+
decoded = 1;
|
|
1251
|
+
}
|
|
1252
|
+
}
|
|
1253
|
+
#endif
|
|
1254
|
+
#ifdef HAVE_BROTLI
|
|
1255
|
+
if (!decoded && ce_len == 2 && (ce[0] == 'b' || ce[0] == 'B') &&
|
|
1256
|
+
(ce[1] == 'r' || ce[1] == 'R')) {
|
|
1257
|
+
char *out = NULL; size_t out_len = 0;
|
|
1258
|
+
if (scrap_brotli_decode(fc.body.data, fc.body.len, &out, &out_len)) {
|
|
1259
|
+
free(fc.body.data);
|
|
1260
|
+
fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
|
|
1261
|
+
decoded = 1;
|
|
1262
|
+
}
|
|
1263
|
+
}
|
|
1264
|
+
#endif
|
|
1265
|
+
#ifdef HAVE_ZSTD
|
|
1266
|
+
if (!decoded && ce_len == 4 &&
|
|
1267
|
+
(ce[0] == 'z' || ce[0] == 'Z') &&
|
|
1268
|
+
(ce[1] == 's' || ce[1] == 'S') &&
|
|
1269
|
+
(ce[2] == 't' || ce[2] == 'T') &&
|
|
1270
|
+
(ce[3] == 'd' || ce[3] == 'D')) {
|
|
1271
|
+
char *out = NULL; size_t out_len = 0;
|
|
1272
|
+
if (scrap_zstd_decode(fc.body.data, fc.body.len, &out, &out_len)) {
|
|
1273
|
+
free(fc.body.data);
|
|
1274
|
+
fc.body.data = out; fc.body.len = out_len; fc.body.cap = out_len;
|
|
1275
|
+
decoded = 1;
|
|
1276
|
+
}
|
|
1277
|
+
}
|
|
1278
|
+
#endif
|
|
1279
|
+
if (decoded) {
|
|
1280
|
+
rb_hash_delete(headers_h, ce_key);
|
|
1281
|
+
}
|
|
1282
|
+
}
|
|
1283
|
+
}
|
|
1284
|
+
|
|
1285
|
+
/* Charset transcode to UTF-8. Runs after content-encoding decode
|
|
1286
|
+
* so iconv sees the raw decoded text. */
|
|
1287
|
+
if (transcode_utf8 && fc.body.data && fc.body.len > 0) {
|
|
1288
|
+
if (scrap_apply_charset(fc.headers.data ? fc.headers.data : "", fc.headers.len,
|
|
1289
|
+
&fc.body.data, &fc.body.len, &fc.body.cap)) {
|
|
1290
|
+
/* Rewrite content-type so consumers see the new charset. */
|
|
1291
|
+
VALUE ct_key = rb_str_new_cstr("content-type");
|
|
1292
|
+
VALUE ct_val = rb_hash_lookup(headers_h, ct_key);
|
|
1293
|
+
if (!NIL_P(ct_val)) {
|
|
1294
|
+
VALUE replaced = rb_funcall(ct_val, rb_intern("sub"), 2,
|
|
1295
|
+
rb_reg_new_str(rb_str_new_cstr("charset\\s*=\\s*\"?[\\w\\-]+\"?"), 1 /* IGNORECASE */),
|
|
1296
|
+
rb_str_new_cstr("charset=utf-8"));
|
|
1297
|
+
rb_hash_aset(headers_h, ct_key, replaced);
|
|
1298
|
+
}
|
|
1299
|
+
}
|
|
1300
|
+
}
|
|
1301
|
+
|
|
1302
|
+
/* Update cache for 2xx responses with cache-relevant headers.
|
|
1303
|
+
* Skip when the body is empty (HEAD already exits earlier) or when
|
|
1304
|
+
* the response was already a cache-served 304. */
|
|
1305
|
+
if (cache_dir && !served_from_cache && status >= 200 && status < 300 &&
|
|
1306
|
+
fc.body.data && fc.body.len > 0) {
|
|
1307
|
+
VALUE etag_v = rb_hash_lookup(headers_h, rb_str_new_cstr("etag"));
|
|
1308
|
+
VALUE lastmod_v = rb_hash_lookup(headers_h, rb_str_new_cstr("last-modified"));
|
|
1309
|
+
VALUE ctype_v = rb_hash_lookup(headers_h, rb_str_new_cstr("content-type"));
|
|
1310
|
+
/* Only cache when there's *some* revalidation token. Otherwise the
|
|
1311
|
+
* entry would be useless (every fetch would always re-download). */
|
|
1312
|
+
if (!NIL_P(etag_v) || !NIL_P(lastmod_v)) {
|
|
1313
|
+
const char *etag_p = NIL_P(etag_v) ? "" : RSTRING_PTR(etag_v);
|
|
1314
|
+
size_t etag_l = NIL_P(etag_v) ? 0 : (size_t)RSTRING_LEN(etag_v);
|
|
1315
|
+
const char *lastmod_p = NIL_P(lastmod_v) ? "" : RSTRING_PTR(lastmod_v);
|
|
1316
|
+
size_t lastmod_l = NIL_P(lastmod_v) ? 0 : (size_t)RSTRING_LEN(lastmod_v);
|
|
1317
|
+
const char *ctype_p = NIL_P(ctype_v) ? "" : RSTRING_PTR(ctype_v);
|
|
1318
|
+
size_t ctype_l = NIL_P(ctype_v) ? 0 : (size_t)RSTRING_LEN(ctype_v);
|
|
1319
|
+
scrap_cache_store(cache_dir, RSTRING_PTR(url_v), status,
|
|
1320
|
+
etag_p, etag_l, lastmod_p, lastmod_l,
|
|
1321
|
+
ctype_p, ctype_l,
|
|
1322
|
+
fc.body.data, fc.body.len);
|
|
1323
|
+
}
|
|
1324
|
+
}
|
|
1325
|
+
scrap_cache_entry_free(&cached);
|
|
1326
|
+
|
|
1327
|
+
/* When download_to is set the body went straight to disk via
|
|
1328
|
+
* libcurl's default fwrite callback — fc.body is empty. Surface
|
|
1329
|
+
* an empty Ruby String + a :downloaded_to key pointing at the
|
|
1330
|
+
* file so the caller knows where to find the bytes. */
|
|
1331
|
+
VALUE body_s = download_to
|
|
1332
|
+
? rb_enc_str_new("", 0, enc_utf8)
|
|
1333
|
+
: rb_str_new(fc.body.data ? fc.body.data : "", (long)fc.body.len);
|
|
1334
|
+
rb_enc_associate(body_s, enc_utf8);
|
|
1335
|
+
|
|
1336
|
+
free(fc.body.data);
|
|
1337
|
+
free(fc.headers.data);
|
|
1338
|
+
|
|
1339
|
+
VALUE result = rb_hash_new();
|
|
1340
|
+
rb_hash_aset(result, ID2SYM(rb_intern("status")), LONG2NUM(status));
|
|
1341
|
+
rb_hash_aset(result, ID2SYM(rb_intern("headers")), headers_h);
|
|
1342
|
+
rb_hash_aset(result, ID2SYM(rb_intern("body")), body_s);
|
|
1343
|
+
rb_hash_aset(result, ID2SYM(rb_intern("final_url")),
|
|
1344
|
+
rb_str_new_cstr(eff_url ? eff_url : RSTRING_PTR(url_v)));
|
|
1345
|
+
const char *hv_str = "1.1";
|
|
1346
|
+
switch (http_ver) {
|
|
1347
|
+
case CURL_HTTP_VERSION_1_0: hv_str = "1.0"; break;
|
|
1348
|
+
case CURL_HTTP_VERSION_1_1: hv_str = "1.1"; break;
|
|
1349
|
+
case CURL_HTTP_VERSION_2_0: hv_str = "2"; break;
|
|
1350
|
+
#ifdef CURL_HTTP_VERSION_3
|
|
1351
|
+
case CURL_HTTP_VERSION_3: hv_str = "3"; break;
|
|
1352
|
+
#endif
|
|
1353
|
+
}
|
|
1354
|
+
rb_hash_aset(result, ID2SYM(rb_intern("http_version")),
|
|
1355
|
+
rb_str_new_cstr(hv_str));
|
|
1356
|
+
if (download_to) {
|
|
1357
|
+
rb_hash_aset(result, ID2SYM(rb_intern("downloaded_to")),
|
|
1358
|
+
rb_str_new_cstr(download_to));
|
|
1359
|
+
}
|
|
1360
|
+
|
|
1361
|
+
return result;
|
|
1362
|
+
}
|
|
1363
|
+
|
|
1364
|
+
/* Store `on` as a Ruby true/false under the symbol key `name` in `hash`. */
static void scrap_features_put_bool(VALUE hash, const char *name, int on) {
    rb_hash_aset(hash, ID2SYM(rb_intern(name)), on ? Qtrue : Qfalse);
}

/* Build a Hash describing what this build of the extension can do.
 *
 * Keys, in insertion order: :curl_version (String), :http2, :http3,
 * :websocket, :brotli, :brotli_inproc, :zstd, :zstd_inproc, :libz (all
 * booleans) and :accept_encoding (String from scrap_accept_encoding()).
 *
 * :brotli / :zstd are true when either libcurl reports the feature at
 * runtime or the codec is compiled into this extension (HAVE_BROTLI /
 * HAVE_ZSTD); the *_inproc keys report the compiled-in case alone. */
static VALUE scrap_http_features(VALUE self) {
    (void)self;
    VALUE feats = rb_hash_new();
    curl_version_info_data *info = curl_version_info(CURLVERSION_NOW);

    /* HTTP/3 is only reported when the libcurl headers know the flag
     * and the runtime library advertises it. */
    int http3 = 0;
#ifdef CURL_VERSION_HTTP3
    if (info->features & CURL_VERSION_HTTP3) http3 = 1;
#endif

    /* WebSocket availability is a compile-time header probe. */
    int websocket = 0;
#ifdef CURLWS_BINARY
    websocket = 1;
#endif

    /* Compiled-in codecs force the combined flag on; otherwise it is
     * whatever libcurl reports. */
    int brotli_inproc = 0;
#ifdef HAVE_BROTLI
    brotli_inproc = 1;
#endif
    int brotli_any = brotli_inproc;
#ifdef CURL_VERSION_BROTLI
    if (info->features & CURL_VERSION_BROTLI) brotli_any = 1;
#endif

    int zstd_inproc = 0;
#ifdef HAVE_ZSTD
    zstd_inproc = 1;
#endif
    int zstd_any = zstd_inproc;
#ifdef CURL_VERSION_ZSTD
    if (info->features & CURL_VERSION_ZSTD) zstd_any = 1;
#endif

    rb_hash_aset(feats, ID2SYM(rb_intern("curl_version")),
                 rb_str_new_cstr(info->version));
    scrap_features_put_bool(feats, "http2", (info->features & CURL_VERSION_HTTP2) != 0);
    scrap_features_put_bool(feats, "http3", http3);
    scrap_features_put_bool(feats, "websocket", websocket);
    scrap_features_put_bool(feats, "brotli", brotli_any);
    scrap_features_put_bool(feats, "brotli_inproc", brotli_inproc);
    scrap_features_put_bool(feats, "zstd", zstd_any);
    scrap_features_put_bool(feats, "zstd_inproc", zstd_inproc);
    scrap_features_put_bool(feats, "libz", (info->features & CURL_VERSION_LIBZ) != 0);
    rb_hash_aset(feats, ID2SYM(rb_intern("accept_encoding")),
                 rb_str_new_cstr(scrap_accept_encoding()));
    return feats;
}
|
|
1426
|
+
|
|
1427
|
+
/* ---- parallel fetch ---------------------------------------------- *
|
|
1428
|
+
* N concurrent libcurl GETs across pthread workers. Each worker uses
|
|
1429
|
+
* get_thread_curl() so its handle's connection cache persists for the
|
|
1430
|
+
* duration of the batch — back-to-back URLs against the same host on
|
|
1431
|
+
* the same worker reuse the TLS + HTTP/2 session.
|
|
1432
|
+
*
|
|
1433
|
+
* The whole batch runs under one rb_thread_call_without_gvl; Ruby's
|
|
1434
|
+
* other threads stay live for the entire pool of fetches. Header
|
|
1435
|
+
* parsing into Ruby Hashes is deferred to after-join (it needs GVL),
|
|
1436
|
+
* but the network and the in-process decompression all happen no-GVL.
|
|
1437
|
+
*/
|
|
1438
|
+
|
|
1439
|
+
typedef struct {
|
|
1440
|
+
char *url;
|
|
1441
|
+
long status;
|
|
1442
|
+
long http_version;
|
|
1443
|
+
char *body; size_t body_len;
|
|
1444
|
+
char *headers_blob; size_t headers_len;
|
|
1445
|
+
char *final_url;
|
|
1446
|
+
CURLcode rc;
|
|
1447
|
+
char errstr[CURL_ERROR_SIZE];
|
|
1448
|
+
/* Per-item Accept-Encoding header — points at a shared slist for
|
|
1449
|
+
* the whole batch. Not owned. */
|
|
1450
|
+
struct curl_slist *shared_headers;
|
|
1451
|
+
long timeout_ms;
|
|
1452
|
+
long max_redirects;
|
|
1453
|
+
int follow_redirects;
|
|
1454
|
+
int insecure;
|
|
1455
|
+
int transcode_utf8;
|
|
1456
|
+
long rate_limit_ms;
|
|
1457
|
+
const char *user_agent;
|
|
1458
|
+
/* When non-zero, the worker runs dom_parse on the body and stores
|
|
1459
|
+
* the resulting Document in `parsed_doc`. Saves the main thread a
|
|
1460
|
+
* second serial pass over the batch. */
|
|
1461
|
+
int parse_after_fetch;
|
|
1462
|
+
dom_doc_t *parsed_doc;
|
|
1463
|
+
} pfetch_item_t;
|
|
1464
|
+
|
|
1465
|
+
/* Release the heap buffers an item owns: URL, body, header blob and
 * effective-URL copy. The item struct itself and `parsed_doc` are not
 * touched. */
static void pfetch_item_free(pfetch_item_t *it) {
    char *owned[] = { it->url, it->body, it->headers_blob, it->final_url };
    for (size_t k = 0; k < sizeof(owned) / sizeof(owned[0]); k++) {
        free(owned[k]);
    }
}
|
|
1471
|
+
|
|
1472
|
+
/* ASCII case-insensitive comparison of the `len` bytes at `s` against
 * the lower-case, NUL-terminated `lower`. Returns 1 on a full match. */
static int scrap_ascii_ieq(const char *s, size_t len, const char *lower) {
    if (len != strlen(lower)) return 0;
    for (size_t k = 0; k < len; k++) {
        char c = s[k];
        if (c >= 'A' && c <= 'Z') c = (char)(c + ('a' - 'A'));
        if (c != lower[k]) return 0;
    }
    return 1;
}

/* Decode a Content-Encoding'd body in place.
 *
 * Scans a raw header blob for the Content-Encoding header, runs the
 * in-process decoder that matches it (whichever of zlib / brotli / zstd
 * were compiled in) and, on success, frees *body and replaces it with
 * the decoded buffer. Uses no Ruby API, so both the pfetch
 * (pthread+easy) and mfetch (curl_multi) paths can call it from no-GVL
 * workers.
 *
 *   headers_blob / headers_len  raw header bytes; NULL means "nothing to do"
 *   body / body_len             in/out body buffer (heap, owned by caller)
 *
 * Returns 1 when the body was decoded or no decoding applied (no header,
 * or an encoding this build has no decoder for); returns 0 when a
 * decoder was selected but failed, in which case *body is left as it was.
 *
 * Only the headers of the last response in the blob are considered: a
 * status line ("HTTP/...") resets the search, so an encoding announced by
 * a redirect hop or interim response is not applied to the final body. */
static int scrap_decode_content_encoding(const char *headers_blob, size_t headers_len,
                                         char **body, size_t *body_len) {
    if (!headers_blob) return 1;

    const char *ce_val = NULL;
    size_t ce_len = 0;
    size_t i = 0;
    while (i < headers_len) {
        /* [ls, le) is the current line without its CRLF / LF. */
        size_t ls = i;
        while (i < headers_len && headers_blob[i] != '\n') i++;
        size_t le = i;
        if (le > ls && headers_blob[le - 1] == '\r') le--;
        if (i < headers_len) i++;
        if (le == ls) continue;

        /* A status line starts a new response: forget what an earlier
         * response in the same blob announced. */
        if (le - ls >= 5 && memcmp(headers_blob + ls, "HTTP/", 5) == 0) {
            ce_val = NULL;
            ce_len = 0;
            continue;
        }

        const char *colon_p = (const char *)memchr(headers_blob + ls, ':', le - ls);
        if (!colon_p) continue;
        size_t colon = (size_t)(colon_p - headers_blob);
        if (!scrap_ascii_ieq(headers_blob + ls, colon - ls, "content-encoding")) continue;

        size_t vs = colon + 1;
        while (vs < le && (headers_blob[vs] == ' ' || headers_blob[vs] == '\t')) vs++;
        ce_val = headers_blob + vs;
        ce_len = le - vs;
    }
    if (!ce_val) return 1;

    /* Trim trailing whitespace from the header value. */
    while (ce_len > 0 && (ce_val[ce_len - 1] == ' ' || ce_val[ce_len - 1] == '\t' ||
                          ce_val[ce_len - 1] == '\r' || ce_val[ce_len - 1] == '\n')) ce_len--;

    char *out = NULL;
    size_t out_len = 0;
    int attempted = 0; /* a compiled-in decoder matched the header value */
    int decoded = 0;
#ifdef HAVE_ZLIB
    if (scrap_ascii_ieq(ce_val, ce_len, "gzip")) {
        attempted = 1;
        decoded = scrap_zlib_decode(*body, *body_len, 47, &out, &out_len);
    } else if (scrap_ascii_ieq(ce_val, ce_len, "deflate")) {
        attempted = 1;
        /* Try window bits -15 first, then fall back to 15. */
        if (!scrap_zlib_decode(*body, *body_len, -15, &out, &out_len)) {
            decoded = scrap_zlib_decode(*body, *body_len, 15, &out, &out_len);
        } else {
            decoded = 1;
        }
    }
#endif
#ifdef HAVE_BROTLI
    if (!decoded && scrap_ascii_ieq(ce_val, ce_len, "br")) {
        attempted = 1;
        decoded = scrap_brotli_decode(*body, *body_len, &out, &out_len);
    }
#endif
#ifdef HAVE_ZSTD
    if (!decoded && scrap_ascii_ieq(ce_val, ce_len, "zstd")) {
        attempted = 1;
        decoded = scrap_zstd_decode(*body, *body_len, &out, &out_len);
    }
#endif
    if (decoded) {
        free(*body);
        *body = out;
        *body_len = out_len;
        return 1;
    }
    /* Unknown / unsupported encoding is a no-op; a decoder that ran and
     * failed is reported to the caller. */
    return attempted ? 0 : 1;
}
|
|
1543
|
+
|
|
1544
|
+
/* Strip Content-Encoding from a header blob (in place) and run the
|
|
1545
|
+
* matching in-process decoder against the body buffer. Returns 1 on
|
|
1546
|
+
* success or no-op, 0 on decoder failure. Thin wrapper for the
|
|
1547
|
+
* pfetch path which carries everything in a pfetch_item_t. */
|
|
1548
|
+
static int pfetch_decode_body(pfetch_item_t *it) {
|
|
1549
|
+
return scrap_decode_content_encoding(it->headers_blob, it->headers_len,
|
|
1550
|
+
&it->body, &it->body_len);
|
|
1551
|
+
}
|
|
1552
|
+
|
|
1553
|
+
/* Fetch one item on the calling thread, without the GVL.
 *
 * Applies the optional per-host throttle, configures this thread's
 * libcurl easy handle from the item's settings, performs the transfer
 * and stores the outcome in `it`: rc / errstr always; status,
 * http_version and final_url on success; body and headers_blob in
 * either case (ownership of both buffers moves to the item). On success
 * the body is then decoded, optionally transcoded to UTF-8 and
 * optionally parsed into `it->parsed_doc`. No Ruby API is used. */
static void pfetch_do_one(pfetch_item_t *it) {
    /* Per-host throttle. Gates each worker against the global slot
     * for this item's host: N parallel workers hitting one host
     * with rate_limit_ms=500 serialise at that gate while different
     * hosts run concurrently. */
    if (it->rate_limit_ms > 0) {
        char host[256];
        if (scrap_extract_host(it->url, host, sizeof(host))) {
            scrap_throttle_wait(host, (uint64_t)it->rate_limit_ms * 1000000ull);
        }
    }

    CURL *h = get_thread_curl();
    if (!h) { it->rc = CURLE_FAILED_INIT; return; }

    buf_t bbuf; memset(&bbuf, 0, sizeof(bbuf));
    buf_t hbuf; memset(&hbuf, 0, sizeof(hbuf));

    curl_easy_setopt(h, CURLOPT_URL, it->url);
    curl_easy_setopt(h, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
    /* Tell curl to wait briefly for an existing HTTP/2 connection to
     * the target to become available rather than opening a fresh
     * TCP+TLS handshake.
     * NOTE(review): CURLOPT_* names are enum constants in libcurl's
     * headers, so this #ifdef is probably never true and the option is
     * never set. Left as-is on purpose; enabling it changes connection
     * behaviour for blocking easy handles and should be tested first. */
#ifdef CURLOPT_PIPEWAIT
    curl_easy_setopt(h, CURLOPT_PIPEWAIT, 1L);
#endif
    curl_easy_setopt(h, CURLOPT_USERAGENT, it->user_agent);
    curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, (long)it->follow_redirects);
    curl_easy_setopt(h, CURLOPT_MAXREDIRS, it->max_redirects);
    curl_easy_setopt(h, CURLOPT_TIMEOUT_MS, it->timeout_ms);
    curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L);
    curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, cb_body);
    curl_easy_setopt(h, CURLOPT_WRITEDATA, &bbuf);
    curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, cb_header);
    curl_easy_setopt(h, CURLOPT_HEADERDATA, &hbuf);
    curl_easy_setopt(h, CURLOPT_TCP_KEEPALIVE, 1L);
    curl_easy_setopt(h, CURLOPT_ERRORBUFFER, it->errstr);
    /* The handle is reused across calls, so set verification in both
     * directions: an earlier insecure request must not leave it off.
     * 1L / 2L are libcurl's defaults. */
    curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, it->insecure ? 0L : 1L);
    curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, it->insecure ? 0L : 2L);
    if (it->shared_headers) {
        curl_easy_setopt(h, CURLOPT_HTTPHEADER, it->shared_headers);
    }

    it->rc = curl_easy_perform(h);

    if (it->rc == CURLE_OK) {
        long status = 0, hv = 0;
        char *eff = NULL;
        curl_easy_getinfo(h, CURLINFO_RESPONSE_CODE, &status);
        curl_easy_getinfo(h, CURLINFO_HTTP_VERSION, &hv);
        curl_easy_getinfo(h, CURLINFO_EFFECTIVE_URL, &eff);
        it->status = status;
        it->http_version = hv;
        if (eff) {
            /* Copy: the string belongs to the handle. On allocation
             * failure final_url simply stays unset. */
            size_t l = strlen(eff);
            char *copy = (char *)malloc(l + 1);
            if (copy) {
                memcpy(copy, eff, l + 1);
                it->final_url = copy;
            }
        }
    }

    /* The error buffer and the header list belong to the batch and are
     * released by the caller afterwards; detach them so the reusable
     * handle is not left pointing at freed memory. */
    curl_easy_setopt(h, CURLOPT_ERRORBUFFER, NULL);
    if (it->shared_headers) {
        curl_easy_setopt(h, CURLOPT_HTTPHEADER, NULL);
    }

    it->body = bbuf.data; it->body_len = bbuf.len;
    it->headers_blob = hbuf.data; it->headers_len = hbuf.len;

    /* In-process decompression while we still hold no GVL. */
    if (it->rc == CURLE_OK) {
        pfetch_decode_body(it);
        if (it->transcode_utf8 && it->body && it->body_len > 0) {
            size_t cap = it->body_len; /* tracked separately just for the iconv path */
            scrap_apply_charset(it->headers_blob ? it->headers_blob : "", it->headers_len,
                                &it->body, &it->body_len, &cap);
        }
        /* Optional in-worker parse. The body buffer is handed over to a
         * dom_doc that takes ownership; we clear our pointers so the
         * post-join Ruby hash doesn't see (and free) the same memory. */
        if (it->parse_after_fetch && it->body && it->body_len > 0) {
            char *owned = it->body;
            size_t owned_len = it->body_len;
            it->body = NULL;
            it->body_len = 0;
            it->parsed_doc = scrap_dom_make_owned_doc(owned, owned_len);
            /* NOTE(review): whether scrap_dom_make_owned_doc releases
             * `owned` when it fails is not visible here; only the NULL
             * dereference is avoided. */
            if (it->parsed_doc) {
                scrap_dom_parse_eager_nocache(it->parsed_doc);
            }
        }
    }
}
|
|
1639
|
+
|
|
1640
|
+
typedef struct {
|
|
1641
|
+
pfetch_item_t *items;
|
|
1642
|
+
size_t n;
|
|
1643
|
+
int next_idx;
|
|
1644
|
+
} pfetch_ctx_t;
|
|
1645
|
+
|
|
1646
|
+
static void *pfetch_worker(void *arg) {
|
|
1647
|
+
pfetch_ctx_t *ctx = (pfetch_ctx_t *)arg;
|
|
1648
|
+
while (1) {
|
|
1649
|
+
int i = __atomic_fetch_add(&ctx->next_idx, 1, __ATOMIC_RELAXED);
|
|
1650
|
+
if (i >= (int)ctx->n) return NULL;
|
|
1651
|
+
pfetch_do_one(&ctx->items[i]);
|
|
1652
|
+
}
|
|
1653
|
+
}
|
|
1654
|
+
|
|
1655
|
+
typedef struct {
|
|
1656
|
+
pfetch_ctx_t *ctx;
|
|
1657
|
+
int n_threads;
|
|
1658
|
+
} pfetch_run_arg_t;
|
|
1659
|
+
|
|
1660
|
+
static void *pfetch_run(void *arg) {
|
|
1661
|
+
pfetch_run_arg_t *ra = (pfetch_run_arg_t *)arg;
|
|
1662
|
+
int nt = ra->n_threads;
|
|
1663
|
+
pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * (size_t)nt);
|
|
1664
|
+
int spawned = 0;
|
|
1665
|
+
for (int i = 0; i < nt; i++) {
|
|
1666
|
+
if (pthread_create(&threads[i], NULL, pfetch_worker, ra->ctx) == 0) spawned++;
|
|
1667
|
+
}
|
|
1668
|
+
if (spawned < nt) pfetch_worker(ra->ctx);
|
|
1669
|
+
for (int i = 0; i < spawned; i++) pthread_join(threads[i], NULL);
|
|
1670
|
+
free(threads);
|
|
1671
|
+
return NULL;
|
|
1672
|
+
}
|
|
1673
|
+
|
|
1674
|
+
/* parallel_fetch(urls, opts = nil) -> Array of Hash
 *
 * Fetches every URL of `urls` (Array of String) on a pool of pthread
 * workers while the GVL is released, then builds one result Hash per
 * URL, in input order.
 *
 * Recognised option keys (all Symbols): :threads, :timeout_ms,
 * :follow_redirects, :max_redirects, :user_agent, :insecure, :headers
 * (Hash), :transcode_utf8, :rate_limit_ms, :parse.
 *
 * A successful entry has :status, :body, :headers, :final_url and
 * :http_version, plus :document (with an empty :body) when the worker
 * parsed the body. A failed entry has only :error => { url:, error: }.
 *
 * Everything that can raise a Ruby exception (type checks, header
 * conversion, class lookup) runs before the first C allocation, so an
 * exception there cannot leak the batch. */
static VALUE scrap_parallel_fetch(int argc, VALUE *argv, VALUE self) {
    (void)self;
    scrap_ensure_global_init();
    VALUE urls_v, opts_v;
    rb_scan_args(argc, argv, "11", &urls_v, &opts_v);
    Check_Type(urls_v, T_ARRAY);
    long n = RARRAY_LEN(urls_v);
    if (n == 0) return rb_ary_new();

    int n_threads = 4;
    long timeout_ms = 30000;
    int follow = 1;
    long max_redirs = 10;
    const char *ua = "scrapetor/0.1 (libcurl)";
    VALUE ua_v = Qnil; /* keeps the String behind `ua` reachable */
    int insecure = 0;
    int transcode_utf8 = 1;
    long rate_limit_ms = 0;
    int parse_after = 0;
    VALUE headers_v = Qnil;
    if (!NIL_P(opts_v)) {
        Check_Type(opts_v, T_HASH);
        VALUE v;
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("threads")));
        if (!NIL_P(v)) n_threads = NUM2INT(v);
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("timeout_ms")));
        if (!NIL_P(v)) timeout_ms = NUM2LONG(v);
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("follow_redirects")));
        if (!NIL_P(v)) follow = RTEST(v) ? 1 : 0;
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_redirects")));
        if (!NIL_P(v)) max_redirs = NUM2LONG(v);
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("user_agent")));
        if (!NIL_P(v)) {
            Check_Type(v, T_STRING);
            ua_v = v;
            /* Guarantees NUL termination; raises on an embedded NUL. */
            ua = StringValueCStr(ua_v);
        }
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("insecure")));
        if (!NIL_P(v)) insecure = RTEST(v) ? 1 : 0;
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("headers")));
        if (!NIL_P(v)) { Check_Type(v, T_HASH); headers_v = v; }
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("transcode_utf8")));
        if (!NIL_P(v)) transcode_utf8 = RTEST(v) ? 1 : 0;
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("rate_limit_ms")));
        if (!NIL_P(v)) rate_limit_ms = NUM2LONG(v);
        v = rb_hash_aref(opts_v, ID2SYM(rb_intern("parse")));
        if (!NIL_P(v)) parse_after = RTEST(v) ? 1 : 0;
    }
    if (n_threads < 1) n_threads = 1;
    /* Compare in long so a huge array length is not truncated first. */
    if ((long)n_threads > n) n_threads = (int)n;

    /* ---- everything below this line that may raise comes first ---- */

    VALUE doc_klass = Qnil;
    if (parse_after) {
        doc_klass = rb_path2class("Scrapetor::Native::Document");
    }

    /* "Name: value" request header lines as Ruby Strings. Keys may be
     * Strings or anything with #to_s (e.g. Symbols); values must be
     * Strings, as before. */
    VALUE header_lines = rb_ary_new();
    if (!NIL_P(headers_v)) {
        VALUE keys = rb_funcall(headers_v, rb_intern("keys"), 0);
        long nk = RARRAY_LEN(keys);
        for (long i = 0; i < nk; i++) {
            VALUE k = rb_ary_entry(keys, i);
            VALUE vv = rb_hash_aref(headers_v, k);
            VALUE line = rb_str_dup(rb_obj_as_string(k));
            rb_str_cat_cstr(line, ": ");
            rb_str_append(line, vv);
            (void)StringValueCStr(line); /* NUL-terminate; reject embedded NUL */
            rb_ary_push(header_lines, line);
        }
    }

    /* Validate the URLs last: no Ruby code runs between this check and
     * the copy loop, so the copy loop cannot raise. */
    for (long i = 0; i < n; i++) {
        Check_Type(rb_ary_entry(urls_v, i), T_STRING);
    }

    /* ---- C allocations; failures are cleaned up explicitly ---- */

    /* One shared slist for the whole batch: Accept-Encoding + user
     * headers. All workers point at this; no mutation after build. */
    struct curl_slist *shared = NULL;
    {
        char ae_line[160];
        snprintf(ae_line, sizeof(ae_line), "Accept-Encoding: %s", scrap_accept_encoding());
        shared = curl_slist_append(shared, ae_line);
        if (!shared) rb_memerror();
    }
    {
        long nlines = RARRAY_LEN(header_lines);
        for (long i = 0; i < nlines; i++) {
            VALUE line = rb_ary_entry(header_lines, i);
            /* curl_slist_append returns NULL on failure and leaves the
             * existing list untouched, so keep the old head until the
             * append is known to have worked. */
            struct curl_slist *grown = curl_slist_append(shared, RSTRING_PTR(line));
            if (!grown) {
                curl_slist_free_all(shared);
                rb_memerror();
            }
            shared = grown;
        }
    }

    pfetch_item_t *items = (pfetch_item_t *)calloc((size_t)n, sizeof(pfetch_item_t));
    if (!items) {
        curl_slist_free_all(shared);
        rb_memerror();
    }
    for (long i = 0; i < n; i++) {
        VALUE u = rb_ary_entry(urls_v, i);
        size_t ul = (size_t)RSTRING_LEN(u);
        items[i].url = (char *)malloc(ul + 1);
        if (!items[i].url) {
            for (long j = 0; j < i; j++) pfetch_item_free(&items[j]);
            free(items);
            curl_slist_free_all(shared);
            rb_memerror();
        }
        memcpy(items[i].url, RSTRING_PTR(u), ul);
        items[i].url[ul] = 0;
        items[i].shared_headers = shared;
        items[i].timeout_ms = timeout_ms;
        items[i].follow_redirects = follow;
        items[i].max_redirects = max_redirs;
        items[i].user_agent = ua;
        items[i].insecure = insecure;
        items[i].transcode_utf8 = transcode_utf8;
        items[i].rate_limit_ms = rate_limit_ms;
        items[i].parse_after_fetch = parse_after;
    }

    pfetch_ctx_t ctx; ctx.items = items; ctx.n = (size_t)n; ctx.next_idx = 0;
    pfetch_run_arg_t ra; ra.ctx = &ctx; ra.n_threads = n_threads;
    rb_thread_call_without_gvl(pfetch_run, &ra, NULL, NULL);

    /* Re-acquired GVL: assemble Ruby Hashes from the C results. */
    VALUE result = rb_ary_new_capa(n);
    for (long i = 0; i < n; i++) {
        pfetch_item_t *it = &items[i];
        VALUE h = rb_hash_new();
        if (it->rc != CURLE_OK) {
            VALUE err = rb_hash_new();
            rb_hash_aset(err, ID2SYM(rb_intern("url")), rb_str_new_cstr(it->url));
            rb_hash_aset(err, ID2SYM(rb_intern("error")),
                         rb_str_new_cstr(it->errstr[0] ? it->errstr : curl_easy_strerror(it->rc)));
            rb_hash_aset(h, ID2SYM(rb_intern("error")), err);
            rb_ary_push(result, h);
            pfetch_item_free(it);
            continue;
        }
        rb_hash_aset(h, ID2SYM(rb_intern("status")), LONG2NUM(it->status));
        /* When the worker parsed the body, body bytes were transferred to
         * the dom_doc and the item's own body pointer is NULL. Surface the
         * Document and emit an empty body string. */
        if (it->parsed_doc) {
            rb_hash_aset(h, ID2SYM(rb_intern("document")),
                         scrap_dom_wrap_doc(doc_klass, it->parsed_doc));
            it->parsed_doc = NULL; /* ownership transferred to the wrap */
            rb_hash_aset(h, ID2SYM(rb_intern("body")), rb_enc_str_new("", 0, enc_utf8));
        } else {
            rb_hash_aset(h, ID2SYM(rb_intern("body")),
                         rb_enc_str_new(it->body ? it->body : "", (long)it->body_len, enc_utf8));
        }
        VALUE headers_h = parse_headers_blob(it->headers_blob ? it->headers_blob : "",
                                             it->headers_len);
        /* Drop CE so headers + body stay consistent. */
        rb_hash_delete(headers_h, rb_str_new_cstr("content-encoding"));
        rb_hash_aset(h, ID2SYM(rb_intern("headers")), headers_h);
        rb_hash_aset(h, ID2SYM(rb_intern("final_url")),
                     rb_str_new_cstr(it->final_url ? it->final_url : it->url));
        const char *hv_str = "1.1";
        switch (it->http_version) {
            case CURL_HTTP_VERSION_1_0: hv_str = "1.0"; break;
            case CURL_HTTP_VERSION_1_1: hv_str = "1.1"; break;
            case CURL_HTTP_VERSION_2_0: hv_str = "2"; break;
            /* Gate on the libcurl version, not on #ifdef: in releases
             * where CURL_HTTP_VERSION_3 is an enum constant the
             * preprocessor cannot see it. The constant exists since
             * libcurl 7.66.0. */
#if defined(LIBCURL_VERSION_NUM) && LIBCURL_VERSION_NUM >= 0x074200
            case CURL_HTTP_VERSION_3: hv_str = "3"; break;
#endif
            default: break;
        }
        rb_hash_aset(h, ID2SYM(rb_intern("http_version")), rb_str_new_cstr(hv_str));
        rb_ary_push(result, h);
        pfetch_item_free(it);
    }
    free(items);
    curl_slist_free_all(shared);
    /* `ua` pointed into ua_v while the GVL was released; keep both
     * Ruby objects visibly alive up to here. */
    RB_GC_GUARD(ua_v);
    RB_GC_GUARD(header_lines);
    return result;
}
|
|
1820
|
+
|
|
1821
|
+
/* ---- curl_multi bulk fetch --------------------------------------- *
|
|
1822
|
+
* Single-handle curl_multi driving N concurrent transfers. Complements
|
|
1823
|
+
* the pthread+easy parallel_fetch path:
|
|
1824
|
+
* - parallel_fetch: N pthread workers, each running its own easy
|
|
1825
|
+
* handle blocking. Best when each transfer has meaningful CPU work
|
|
1826
|
+
* (decode + parse) since the GVL is released across the full batch
|
|
1827
|
+
* and CPU work scales with cores.
|
|
1828
|
+
* - multi_fetch: one driver thread, one multi handle, N concurrent
|
|
1829
|
+
* transfers multiplexed via curl_multi_perform. Best for
|
|
1830
|
+
* I/O-dominated high-fan-out fetches (hundreds of URLs across
|
|
1831
|
+
* diverse hosts) where the cost of pthread setup outweighs the
|
|
1832
|
+
* in-flight transfer count.
|
|
1833
|
+
*
|
|
1834
|
+
* Both share the same global CURLSH, so connection pool / DNS / TLS
|
|
1835
|
+
* sessions are shared across them too.
|
|
1836
|
+
*/
|
|
1837
|
+
|
|
1838
|
+
typedef struct {
|
|
1839
|
+
char *url;
|
|
1840
|
+
CURL *easy;
|
|
1841
|
+
buf_t body;
|
|
1842
|
+
buf_t headers;
|
|
1843
|
+
long status;
|
|
1844
|
+
long http_version;
|
|
1845
|
+
char *final_url; /* strdup */
|
|
1846
|
+
CURLcode rc;
|
|
1847
|
+
char errstr[CURL_ERROR_SIZE];
|
|
1848
|
+
struct curl_slist *req_headers; /* per-easy slist, freed after harvest */
|
|
1849
|
+
/* In-loop decode/parse output. Populated by the perform thread
|
|
1850
|
+
* as each transfer completes — keeps the per-completion CPU work
|
|
1851
|
+
* (decompress + transcode + tokenise) inside the same no-GVL
|
|
1852
|
+
* window. */
|
|
1853
|
+
int decoded; /* 1 after we've drained the message for this slot */
|
|
1854
|
+
dom_doc_t *parsed_doc; /* optional, set when parse_after */
|
|
1855
|
+
/* HTTP cache: populated pre-perform from disk lookup; checked
|
|
1856
|
+
* post-perform for 304 revalidation. */
|
|
1857
|
+
scrap_cache_entry_t cached;
|
|
1858
|
+
int have_cached;
|
|
1859
|
+
int served_from_cache;
|
|
1860
|
+
} mfetch_slot_t;
|
|
1861
|
+
|
|
1862
|
+
typedef struct {
|
|
1863
|
+
CURLM *multi;
|
|
1864
|
+
mfetch_slot_t *slots;
|
|
1865
|
+
size_t n;
|
|
1866
|
+
CURLMcode multi_rc;
|
|
1867
|
+
int transcode_utf8;
|
|
1868
|
+
int parse_after;
|
|
1869
|
+
const char *cache_dir;
|
|
1870
|
+
} mfetch_ctx_t;
|
|
1871
|
+
|
|
1872
|
+
static void mfetch_finalize_slot_nogvl(mfetch_ctx_t *ctx, mfetch_slot_t *s,
|
|
1873
|
+
CURL *easy, CURLcode result) {
|
|
1874
|
+
s->rc = result;
|
|
1875
|
+
if (result != CURLE_OK) { s->decoded = 1; return; }
|
|
1876
|
+
curl_easy_getinfo(easy, CURLINFO_RESPONSE_CODE, &s->status);
|
|
1877
|
+
curl_easy_getinfo(easy, CURLINFO_HTTP_VERSION, &s->http_version);
|
|
1878
|
+
char *eff = NULL;
|
|
1879
|
+
curl_easy_getinfo(easy, CURLINFO_EFFECTIVE_URL, &eff);
|
|
1880
|
+
if (eff) {
|
|
1881
|
+
size_t l = strlen(eff);
|
|
1882
|
+
s->final_url = (char *)malloc(l + 1);
|
|
1883
|
+
memcpy(s->final_url, eff, l + 1);
|
|
1884
|
+
}
|
|
1885
|
+
/* 304 revalidation: server says cached body still valid. Swap
|
|
1886
|
+
* the body buffer for the cached payload and rewrite status to
|
|
1887
|
+
* 200 so consumers see a fully-formed response. */
|
|
1888
|
+
if (ctx->cache_dir && s->have_cached && s->status == 304) {
|
|
1889
|
+
free(s->body.data);
|
|
1890
|
+
s->body.data = (char *)malloc(s->cached.body_len + 1);
|
|
1891
|
+
memcpy(s->body.data, s->cached.body, s->cached.body_len);
|
|
1892
|
+
s->body.data[s->cached.body_len] = 0;
|
|
1893
|
+
s->body.len = s->cached.body_len;
|
|
1894
|
+
s->body.cap = s->cached.body_len;
|
|
1895
|
+
s->status = 200;
|
|
1896
|
+
s->served_from_cache = 1;
|
|
1897
|
+
}
|
|
1898
|
+
/* Decompress + transcode under no-GVL. */
|
|
1899
|
+
if (s->body.data && s->body.len > 0) {
|
|
1900
|
+
scrap_decode_content_encoding(s->headers.data ? s->headers.data : "",
|
|
1901
|
+
s->headers.len, &s->body.data, &s->body.len);
|
|
1902
|
+
}
|
|
1903
|
+
if (ctx->transcode_utf8 && s->body.data && s->body.len > 0) {
|
|
1904
|
+
size_t cap = s->body.len;
|
|
1905
|
+
scrap_apply_charset(s->headers.data ? s->headers.data : "", s->headers.len,
|
|
1906
|
+
&s->body.data, &s->body.len, &cap);
|
|
1907
|
+
s->body.cap = cap;
|
|
1908
|
+
}
|
|
1909
|
+
/* Optional in-loop parse — same trick as parallel_fetch: hand
|
|
1910
|
+
* ownership of the body buffer to a dom_doc_t and run
|
|
1911
|
+
* dom_parse_eager_nocache. */
|
|
1912
|
+
if (ctx->parse_after && s->body.data && s->body.len > 0) {
|
|
1913
|
+
char *owned = s->body.data;
|
|
1914
|
+
size_t owned_len = s->body.len;
|
|
1915
|
+
s->body.data = NULL;
|
|
1916
|
+
s->body.len = 0;
|
|
1917
|
+
s->parsed_doc = scrap_dom_make_owned_doc(owned, owned_len);
|
|
1918
|
+
scrap_dom_parse_eager_nocache(s->parsed_doc);
|
|
1919
|
+
}
|
|
1920
|
+
s->decoded = 1;
|
|
1921
|
+
}
|
|
1922
|
+
|
|
1923
|
+
static void *mfetch_run_nogvl(void *arg) {
|
|
1924
|
+
mfetch_ctx_t *ctx = (mfetch_ctx_t *)arg;
|
|
1925
|
+
int running = -1;
|
|
1926
|
+
while (1) {
|
|
1927
|
+
ctx->multi_rc = curl_multi_perform(ctx->multi, &running);
|
|
1928
|
+
if (ctx->multi_rc != CURLM_OK) break;
|
|
1929
|
+
|
|
1930
|
+
/* Drain completed messages now so decompression / transcode /
|
|
1931
|
+
* parse runs in parallel with other in-flight transfers
|
|
1932
|
+
* (still on this same driver thread, but interleaved with
|
|
1933
|
+
* curl_multi_perform). */
|
|
1934
|
+
CURLMsg *msg;
|
|
1935
|
+
int q;
|
|
1936
|
+
while ((msg = curl_multi_info_read(ctx->multi, &q))) {
|
|
1937
|
+
if (msg->msg != CURLMSG_DONE) continue;
|
|
1938
|
+
long idx = -1;
|
|
1939
|
+
curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, &idx);
|
|
1940
|
+
if (idx < 0 || idx >= (long)ctx->n) continue;
|
|
1941
|
+
mfetch_slot_t *s = &ctx->slots[idx];
|
|
1942
|
+
if (s->decoded) continue;
|
|
1943
|
+
mfetch_finalize_slot_nogvl(ctx, s, msg->easy_handle, msg->data.result);
|
|
1944
|
+
}
|
|
1945
|
+
|
|
1946
|
+
if (running == 0) break;
|
|
1947
|
+
int numfds = 0;
|
|
1948
|
+
curl_multi_poll(ctx->multi, NULL, 0, 200, &numfds);
|
|
1949
|
+
}
|
|
1950
|
+
return NULL;
|
|
1951
|
+
}
|
|
1952
|
+
|
|
1953
|
+
static VALUE scrap_multi_fetch(int argc, VALUE *argv, VALUE self) {
|
|
1954
|
+
(void)self;
|
|
1955
|
+
scrap_ensure_global_init();
|
|
1956
|
+
VALUE urls_v, opts_v;
|
|
1957
|
+
rb_scan_args(argc, argv, "11", &urls_v, &opts_v);
|
|
1958
|
+
Check_Type(urls_v, T_ARRAY);
|
|
1959
|
+
long n = RARRAY_LEN(urls_v);
|
|
1960
|
+
if (n == 0) return rb_ary_new();
|
|
1961
|
+
|
|
1962
|
+
long timeout_ms = 30000;
|
|
1963
|
+
int follow = 1;
|
|
1964
|
+
long max_redirs = 10;
|
|
1965
|
+
const char *ua = "scrapetor/0.1 (libcurl)";
|
|
1966
|
+
int insecure = 0;
|
|
1967
|
+
long max_concurrent = 0; /* 0 = no cap (let multi run as wide as needed) */
|
|
1968
|
+
int transcode_utf8 = 1;
|
|
1969
|
+
int parse_after = 0;
|
|
1970
|
+
const char *cache_dir = NULL;
|
|
1971
|
+
const char *method_opt = NULL;
|
|
1972
|
+
int nobody_opt = 0;
|
|
1973
|
+
VALUE headers_v = Qnil;
|
|
1974
|
+
if (!NIL_P(opts_v)) {
|
|
1975
|
+
Check_Type(opts_v, T_HASH);
|
|
1976
|
+
VALUE v;
|
|
1977
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("timeout_ms")));
|
|
1978
|
+
if (!NIL_P(v)) timeout_ms = NUM2LONG(v);
|
|
1979
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("follow_redirects")));
|
|
1980
|
+
if (!NIL_P(v)) follow = RTEST(v) ? 1 : 0;
|
|
1981
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_redirects")));
|
|
1982
|
+
if (!NIL_P(v)) max_redirs = NUM2LONG(v);
|
|
1983
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("user_agent")));
|
|
1984
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); ua = RSTRING_PTR(v); }
|
|
1985
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("insecure")));
|
|
1986
|
+
if (!NIL_P(v)) insecure = RTEST(v) ? 1 : 0;
|
|
1987
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("headers")));
|
|
1988
|
+
if (!NIL_P(v)) { Check_Type(v, T_HASH); headers_v = v; }
|
|
1989
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_concurrent")));
|
|
1990
|
+
if (!NIL_P(v)) max_concurrent = NUM2LONG(v);
|
|
1991
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("transcode_utf8")));
|
|
1992
|
+
if (!NIL_P(v)) transcode_utf8 = RTEST(v) ? 1 : 0;
|
|
1993
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("parse")));
|
|
1994
|
+
if (!NIL_P(v)) parse_after = RTEST(v) ? 1 : 0;
|
|
1995
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cache_dir")));
|
|
1996
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); cache_dir = RSTRING_PTR(v); }
|
|
1997
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("method")));
|
|
1998
|
+
if (!NIL_P(v)) {
|
|
1999
|
+
if (SYMBOL_P(v)) v = rb_sym2str(v);
|
|
2000
|
+
Check_Type(v, T_STRING);
|
|
2001
|
+
method_opt = RSTRING_PTR(v);
|
|
2002
|
+
if (strcasecmp(method_opt, "head") == 0) { nobody_opt = 1; method_opt = NULL; }
|
|
2003
|
+
else if (strcasecmp(method_opt, "get") == 0) method_opt = NULL;
|
|
2004
|
+
}
|
|
2005
|
+
}
|
|
2006
|
+
|
|
2007
|
+
CURLM *multi = curl_multi_init();
|
|
2008
|
+
if (max_concurrent > 0) {
|
|
2009
|
+
curl_multi_setopt(multi, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_concurrent);
|
|
2010
|
+
}
|
|
2011
|
+
#ifdef CURLPIPE_MULTIPLEX
|
|
2012
|
+
/* Let multi pile new requests onto an existing HTTP/2 connection
|
|
2013
|
+
* to the same host. With CURLOPT_PIPEWAIT also set per-handle, the
|
|
2014
|
+
* multi pool tends to settle on one connection per origin. */
|
|
2015
|
+
curl_multi_setopt(multi, CURLMOPT_PIPELINING, (long)CURLPIPE_MULTIPLEX);
|
|
2016
|
+
#endif
|
|
2017
|
+
|
|
2018
|
+
mfetch_slot_t *slots = (mfetch_slot_t *)calloc((size_t)n, sizeof(mfetch_slot_t));
|
|
2019
|
+
|
|
2020
|
+
for (long i = 0; i < n; i++) {
|
|
2021
|
+
VALUE u = rb_ary_entry(urls_v, i);
|
|
2022
|
+
Check_Type(u, T_STRING);
|
|
2023
|
+
size_t ul = (size_t)RSTRING_LEN(u);
|
|
2024
|
+
slots[i].url = (char *)malloc(ul + 1);
|
|
2025
|
+
memcpy(slots[i].url, RSTRING_PTR(u), ul);
|
|
2026
|
+
slots[i].url[ul] = 0;
|
|
2027
|
+
|
|
2028
|
+
CURL *h = curl_easy_init();
|
|
2029
|
+
slots[i].easy = h;
|
|
2030
|
+
curl_easy_setopt(h, CURLOPT_URL, slots[i].url);
|
|
2031
|
+
curl_easy_setopt(h, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
|
|
2032
|
+
#ifdef CURLOPT_PIPEWAIT
|
|
2033
|
+
curl_easy_setopt(h, CURLOPT_PIPEWAIT, 1L);
|
|
2034
|
+
#endif
|
|
2035
|
+
curl_easy_setopt(h, CURLOPT_USERAGENT, ua);
|
|
2036
|
+
curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, (long)follow);
|
|
2037
|
+
curl_easy_setopt(h, CURLOPT_MAXREDIRS, max_redirs);
|
|
2038
|
+
curl_easy_setopt(h, CURLOPT_TIMEOUT_MS, timeout_ms);
|
|
2039
|
+
curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L);
|
|
2040
|
+
curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, cb_body);
|
|
2041
|
+
curl_easy_setopt(h, CURLOPT_WRITEDATA, &slots[i].body);
|
|
2042
|
+
curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, cb_header);
|
|
2043
|
+
curl_easy_setopt(h, CURLOPT_HEADERDATA, &slots[i].headers);
|
|
2044
|
+
curl_easy_setopt(h, CURLOPT_TCP_KEEPALIVE, 1L);
|
|
2045
|
+
curl_easy_setopt(h, CURLOPT_ERRORBUFFER, slots[i].errstr);
|
|
2046
|
+
curl_easy_setopt(h, CURLOPT_PRIVATE, (void *)(intptr_t)i);
|
|
2047
|
+
if (g_share) curl_easy_setopt(h, CURLOPT_SHARE, g_share);
|
|
2048
|
+
if (insecure) {
|
|
2049
|
+
curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
|
|
2050
|
+
curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
|
|
2051
|
+
}
|
|
2052
|
+
if (nobody_opt) {
|
|
2053
|
+
curl_easy_setopt(h, CURLOPT_NOBODY, 1L);
|
|
2054
|
+
curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, "HEAD");
|
|
2055
|
+
} else if (method_opt) {
|
|
2056
|
+
/* Upcase + custom-request for non-GET. */
|
|
2057
|
+
char mbuf[24];
|
|
2058
|
+
size_t mi = 0;
|
|
2059
|
+
for (; mi < sizeof(mbuf) - 1 && method_opt[mi]; mi++) {
|
|
2060
|
+
char c = method_opt[mi];
|
|
2061
|
+
mbuf[mi] = (c >= 'a' && c <= 'z') ? (char)(c - 32) : c;
|
|
2062
|
+
}
|
|
2063
|
+
mbuf[mi] = 0;
|
|
2064
|
+
curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, mbuf);
|
|
2065
|
+
}
|
|
2066
|
+
/* HTTP cache: pre-load entry for this URL so we can attach
|
|
2067
|
+
* If-None-Match / If-Modified-Since and identify 304s in the
|
|
2068
|
+
* worker. HEAD is allowed here because the revalidate flow
|
|
2069
|
+
* uses HEAD specifically to ping the server about freshness;
|
|
2070
|
+
* non-GET methods other than HEAD (POST/PUT/DELETE/...) are
|
|
2071
|
+
* skipped per RFC 7234. */
|
|
2072
|
+
if (cache_dir && !method_opt) {
|
|
2073
|
+
slots[i].have_cached = scrap_cache_load(cache_dir, slots[i].url, &slots[i].cached);
|
|
2074
|
+
}
|
|
2075
|
+
/* Per-handle Accept-Encoding + user headers slist. */
|
|
2076
|
+
{
|
|
2077
|
+
char ae_line[160];
|
|
2078
|
+
snprintf(ae_line, sizeof(ae_line), "Accept-Encoding: %s",
|
|
2079
|
+
scrap_accept_encoding());
|
|
2080
|
+
slots[i].req_headers = curl_slist_append(slots[i].req_headers, ae_line);
|
|
2081
|
+
}
|
|
2082
|
+
if (slots[i].have_cached) {
|
|
2083
|
+
if (slots[i].cached.etag_len > 0) {
|
|
2084
|
+
char line[1024];
|
|
2085
|
+
snprintf(line, sizeof(line), "If-None-Match: %.*s",
|
|
2086
|
+
(int)slots[i].cached.etag_len, slots[i].cached.etag);
|
|
2087
|
+
slots[i].req_headers = curl_slist_append(slots[i].req_headers, line);
|
|
2088
|
+
}
|
|
2089
|
+
if (slots[i].cached.lastmod_len > 0) {
|
|
2090
|
+
char line[1024];
|
|
2091
|
+
snprintf(line, sizeof(line), "If-Modified-Since: %.*s",
|
|
2092
|
+
(int)slots[i].cached.lastmod_len, slots[i].cached.lastmod);
|
|
2093
|
+
slots[i].req_headers = curl_slist_append(slots[i].req_headers, line);
|
|
2094
|
+
}
|
|
2095
|
+
}
|
|
2096
|
+
if (!NIL_P(headers_v)) {
|
|
2097
|
+
VALUE keys = rb_funcall(headers_v, rb_intern("keys"), 0);
|
|
2098
|
+
long nk = RARRAY_LEN(keys);
|
|
2099
|
+
for (long k = 0; k < nk; k++) {
|
|
2100
|
+
VALUE kk = rb_ary_entry(keys, k);
|
|
2101
|
+
VALUE vv = rb_hash_aref(headers_v, kk);
|
|
2102
|
+
VALUE line = rb_str_dup(kk);
|
|
2103
|
+
rb_str_cat_cstr(line, ": ");
|
|
2104
|
+
rb_str_append(line, vv);
|
|
2105
|
+
slots[i].req_headers = curl_slist_append(slots[i].req_headers, RSTRING_PTR(line));
|
|
2106
|
+
}
|
|
2107
|
+
}
|
|
2108
|
+
curl_easy_setopt(h, CURLOPT_HTTPHEADER, slots[i].req_headers);
|
|
2109
|
+
curl_multi_add_handle(multi, h);
|
|
2110
|
+
}
|
|
2111
|
+
|
|
2112
|
+
mfetch_ctx_t ctx;
|
|
2113
|
+
ctx.multi = multi;
|
|
2114
|
+
ctx.slots = slots;
|
|
2115
|
+
ctx.n = (size_t)n;
|
|
2116
|
+
ctx.multi_rc = CURLM_OK;
|
|
2117
|
+
ctx.transcode_utf8 = transcode_utf8;
|
|
2118
|
+
ctx.parse_after = parse_after;
|
|
2119
|
+
ctx.cache_dir = cache_dir;
|
|
2120
|
+
rb_thread_call_without_gvl(mfetch_run_nogvl, &ctx, NULL, NULL);
|
|
2121
|
+
|
|
2122
|
+
/* Sweep any final messages the worker didn't drain (defensive —
|
|
2123
|
+
* the worker loop normally consumes them all, but if the multi
|
|
2124
|
+
* exited via error or the exit condition fired between perform
|
|
2125
|
+
* and info_read, a message could still be queued). */
|
|
2126
|
+
{
|
|
2127
|
+
CURLMsg *msg;
|
|
2128
|
+
int q;
|
|
2129
|
+
while ((msg = curl_multi_info_read(multi, &q))) {
|
|
2130
|
+
if (msg->msg != CURLMSG_DONE) continue;
|
|
2131
|
+
long idx = -1;
|
|
2132
|
+
curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, &idx);
|
|
2133
|
+
if (idx < 0 || idx >= (long)n) continue;
|
|
2134
|
+
if (!slots[idx].decoded) {
|
|
2135
|
+
mfetch_finalize_slot_nogvl(&ctx, &slots[idx], msg->easy_handle, msg->data.result);
|
|
2136
|
+
}
|
|
2137
|
+
}
|
|
2138
|
+
}
|
|
2139
|
+
|
|
2140
|
+
/* All slots have been decoded by the worker (or by the sweep
|
|
2141
|
+
* above). The harvest pass just builds the Ruby surface. */
|
|
2142
|
+
VALUE doc_klass = parse_after ? rb_path2class("Scrapetor::Native::Document") : Qnil;
|
|
2143
|
+
VALUE result = rb_ary_new_capa(n);
|
|
2144
|
+
for (long i = 0; i < n; i++) {
|
|
2145
|
+
mfetch_slot_t *s = &slots[i];
|
|
2146
|
+
VALUE h = rb_hash_new();
|
|
2147
|
+
if (s->rc != CURLE_OK) {
|
|
2148
|
+
VALUE err = rb_hash_new();
|
|
2149
|
+
rb_hash_aset(err, ID2SYM(rb_intern("url")), rb_str_new_cstr(s->url));
|
|
2150
|
+
rb_hash_aset(err, ID2SYM(rb_intern("error")),
|
|
2151
|
+
rb_str_new_cstr(s->errstr[0] ? s->errstr : curl_easy_strerror(s->rc)));
|
|
2152
|
+
rb_hash_aset(h, ID2SYM(rb_intern("error")), err);
|
|
2153
|
+
} else {
|
|
2154
|
+
rb_hash_aset(h, ID2SYM(rb_intern("status")), LONG2NUM(s->status));
|
|
2155
|
+
if (s->parsed_doc) {
|
|
2156
|
+
rb_hash_aset(h, ID2SYM(rb_intern("document")),
|
|
2157
|
+
scrap_dom_wrap_doc(doc_klass, s->parsed_doc));
|
|
2158
|
+
s->parsed_doc = NULL;
|
|
2159
|
+
rb_hash_aset(h, ID2SYM(rb_intern("body")), rb_enc_str_new("", 0, enc_utf8));
|
|
2160
|
+
} else {
|
|
2161
|
+
rb_hash_aset(h, ID2SYM(rb_intern("body")),
|
|
2162
|
+
rb_enc_str_new(s->body.data ? s->body.data : "",
|
|
2163
|
+
(long)s->body.len, enc_utf8));
|
|
2164
|
+
}
|
|
2165
|
+
VALUE hh = parse_headers_blob(s->headers.data ? s->headers.data : "",
|
|
2166
|
+
s->headers.len);
|
|
2167
|
+
rb_hash_delete(hh, rb_str_new_cstr("content-encoding"));
|
|
2168
|
+
if (s->served_from_cache && s->cached.ctype_len > 0) {
|
|
2169
|
+
rb_hash_aset(hh, rb_str_new_cstr("content-type"),
|
|
2170
|
+
rb_str_new(s->cached.ctype, (long)s->cached.ctype_len));
|
|
2171
|
+
rb_hash_aset(hh, rb_str_new_cstr("x-scrapetor-cache"),
|
|
2172
|
+
rb_str_new_cstr("hit"));
|
|
2173
|
+
} else if (cache_dir && s->have_cached) {
|
|
2174
|
+
rb_hash_aset(hh, rb_str_new_cstr("x-scrapetor-cache"),
|
|
2175
|
+
rb_str_new_cstr("miss-revalidated"));
|
|
2176
|
+
}
|
|
2177
|
+
rb_hash_aset(h, ID2SYM(rb_intern("headers")), hh);
|
|
2178
|
+
rb_hash_aset(h, ID2SYM(rb_intern("final_url")),
|
|
2179
|
+
rb_str_new_cstr(s->final_url ? s->final_url : s->url));
|
|
2180
|
+
const char *hv_str = "1.1";
|
|
2181
|
+
switch (s->http_version) {
|
|
2182
|
+
case CURL_HTTP_VERSION_1_0: hv_str = "1.0"; break;
|
|
2183
|
+
case CURL_HTTP_VERSION_1_1: hv_str = "1.1"; break;
|
|
2184
|
+
case CURL_HTTP_VERSION_2_0: hv_str = "2"; break;
|
|
2185
|
+
#ifdef CURL_HTTP_VERSION_3
|
|
2186
|
+
case CURL_HTTP_VERSION_3: hv_str = "3"; break;
|
|
2187
|
+
#endif
|
|
2188
|
+
}
|
|
2189
|
+
rb_hash_aset(h, ID2SYM(rb_intern("http_version")), rb_str_new_cstr(hv_str));
|
|
2190
|
+
/* Store the response in cache for next-time revalidation
|
|
2191
|
+
* (only for cache-eligible 2xx responses with a token). */
|
|
2192
|
+
if (cache_dir && !s->served_from_cache && s->status >= 200 && s->status < 300 &&
|
|
2193
|
+
s->body.data && s->body.len > 0) {
|
|
2194
|
+
VALUE etag_v = rb_hash_lookup(hh, rb_str_new_cstr("etag"));
|
|
2195
|
+
VALUE lastmod_v = rb_hash_lookup(hh, rb_str_new_cstr("last-modified"));
|
|
2196
|
+
VALUE ctype_v = rb_hash_lookup(hh, rb_str_new_cstr("content-type"));
|
|
2197
|
+
if (!NIL_P(etag_v) || !NIL_P(lastmod_v)) {
|
|
2198
|
+
const char *etag_p = NIL_P(etag_v) ? "" : RSTRING_PTR(etag_v);
|
|
2199
|
+
size_t etag_l = NIL_P(etag_v) ? 0 : (size_t)RSTRING_LEN(etag_v);
|
|
2200
|
+
const char *lastmod_p = NIL_P(lastmod_v) ? "" : RSTRING_PTR(lastmod_v);
|
|
2201
|
+
size_t lastmod_l = NIL_P(lastmod_v) ? 0 : (size_t)RSTRING_LEN(lastmod_v);
|
|
2202
|
+
const char *ctype_p = NIL_P(ctype_v) ? "" : RSTRING_PTR(ctype_v);
|
|
2203
|
+
size_t ctype_l = NIL_P(ctype_v) ? 0 : (size_t)RSTRING_LEN(ctype_v);
|
|
2204
|
+
scrap_cache_store(cache_dir, s->url, s->status,
|
|
2205
|
+
etag_p, etag_l, lastmod_p, lastmod_l,
|
|
2206
|
+
ctype_p, ctype_l, s->body.data, s->body.len);
|
|
2207
|
+
}
|
|
2208
|
+
}
|
|
2209
|
+
}
|
|
2210
|
+
rb_ary_push(result, h);
|
|
2211
|
+
|
|
2212
|
+
curl_multi_remove_handle(multi, s->easy);
|
|
2213
|
+
curl_easy_cleanup(s->easy);
|
|
2214
|
+
curl_slist_free_all(s->req_headers);
|
|
2215
|
+
free(s->url);
|
|
2216
|
+
free(s->body.data);
|
|
2217
|
+
free(s->headers.data);
|
|
2218
|
+
free(s->final_url);
|
|
2219
|
+
scrap_cache_entry_free(&s->cached);
|
|
2220
|
+
}
|
|
2221
|
+
curl_multi_cleanup(multi);
|
|
2222
|
+
free(slots);
|
|
2223
|
+
return result;
|
|
2224
|
+
}
|
|
2225
|
+
|
|
2226
|
+
/* ---- streaming multi batch (yield as transfers complete) --------- *
|
|
2227
|
+
* Wraps a CURLM handle + slots in a typed-data object so Ruby can pull
|
|
2228
|
+
* completed responses one at a time via #next. Each #next advances
|
|
2229
|
+
* curl_multi_perform under no-GVL until at least one new transfer
|
|
2230
|
+
* completes, finalises that slot (decompress / transcode / optional
|
|
2231
|
+
* parse), and returns its Ruby hash. nil when the whole batch is done.
|
|
2232
|
+
*
|
|
2233
|
+
* Pattern: Fetcher.multi_each(urls) { |r| ... } yields each response
|
|
2234
|
+
* in completion order — earliest-arriving first — so the user starts
|
|
2235
|
+
* processing while later transfers are still on the wire.
|
|
2236
|
+
*/
|
|
2237
|
+
typedef struct {
|
|
2238
|
+
CURLM *multi;
|
|
2239
|
+
mfetch_slot_t *slots;
|
|
2240
|
+
size_t n;
|
|
2241
|
+
/* Completion ring: indices of slots that finished and aren't
|
|
2242
|
+
* yielded yet. ready_tail bumps in the worker, ready_head bumps
|
|
2243
|
+
* on each #next pop. */
|
|
2244
|
+
size_t *ready_queue;
|
|
2245
|
+
size_t ready_head;
|
|
2246
|
+
size_t ready_tail;
|
|
2247
|
+
int running;
|
|
2248
|
+
int done;
|
|
2249
|
+
/* Carried opts (mirrors mfetch_ctx_t shape so we can reuse
|
|
2250
|
+
* mfetch_finalize_slot_nogvl). */
|
|
2251
|
+
int transcode_utf8;
|
|
2252
|
+
int parse_after;
|
|
2253
|
+
char *cache_dir_owned; /* strdup, may be NULL */
|
|
2254
|
+
/* Whole-batch shared slist for Accept-Encoding + user headers.
|
|
2255
|
+
* Owned; freed at cleanup. */
|
|
2256
|
+
struct curl_slist *shared_headers;
|
|
2257
|
+
/* All easy handles also live here so we can free them on GC. */
|
|
2258
|
+
} mbatch_t;
|
|
2259
|
+
|
|
2260
|
+
static void mbatch_free(void *p) {
|
|
2261
|
+
mbatch_t *b = (mbatch_t *)p;
|
|
2262
|
+
if (!b) return;
|
|
2263
|
+
if (b->slots) {
|
|
2264
|
+
for (size_t i = 0; i < b->n; i++) {
|
|
2265
|
+
mfetch_slot_t *s = &b->slots[i];
|
|
2266
|
+
if (s->easy) {
|
|
2267
|
+
if (b->multi) curl_multi_remove_handle(b->multi, s->easy);
|
|
2268
|
+
curl_easy_cleanup(s->easy);
|
|
2269
|
+
}
|
|
2270
|
+
curl_slist_free_all(s->req_headers);
|
|
2271
|
+
free(s->url);
|
|
2272
|
+
free(s->body.data);
|
|
2273
|
+
free(s->headers.data);
|
|
2274
|
+
free(s->final_url);
|
|
2275
|
+
scrap_cache_entry_free(&s->cached);
|
|
2276
|
+
if (s->parsed_doc) {
|
|
2277
|
+
/* parsed_doc may not have been yielded yet — its bytes
|
|
2278
|
+
* are owned by the dom_doc so just let it leak through
|
|
2279
|
+
* the parse-doc free path. */
|
|
2280
|
+
/* No direct free here; the dom_doc's own free handles it
|
|
2281
|
+
* once the wrap is GC'd. Without a wrap, it leaks. */
|
|
2282
|
+
}
|
|
2283
|
+
}
|
|
2284
|
+
free(b->slots);
|
|
2285
|
+
}
|
|
2286
|
+
if (b->multi) curl_multi_cleanup(b->multi);
|
|
2287
|
+
free(b->ready_queue);
|
|
2288
|
+
free(b->cache_dir_owned);
|
|
2289
|
+
free(b);
|
|
2290
|
+
}
|
|
2291
|
+
|
|
2292
|
+
static size_t mbatch_memsize(const void *p) {
|
|
2293
|
+
const mbatch_t *b = (const mbatch_t *)p;
|
|
2294
|
+
return sizeof(*b) + (b ? b->n * sizeof(mfetch_slot_t) : 0);
|
|
2295
|
+
}
|
|
2296
|
+
|
|
2297
|
+
static const rb_data_type_t mbatch_data_type = {
|
|
2298
|
+
"Scrapetor::Native::Http::MultiBatch",
|
|
2299
|
+
{NULL, mbatch_free, mbatch_memsize},
|
|
2300
|
+
NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY,
|
|
2301
|
+
};
|
|
2302
|
+
|
|
2303
|
+
/* Allocator for MultiBatch: wraps a zero-filled mbatch_t so that
 * #initialize and mbatch_free both start from a known-empty state.
 * Raises NoMemoryError instead of wrapping a NULL pointer that
 * #initialize would later dereference. */
static VALUE mbatch_alloc(VALUE klass) {
    mbatch_t *b = (mbatch_t *)calloc(1, sizeof(mbatch_t));
    if (!b) rb_memerror();
    return TypedData_Wrap_Struct(klass, &mbatch_data_type, b);
}
|
|
2307
|
+
|
|
2308
|
+
/* No-GVL stepper: one perform call + drain any completed messages,
|
|
2309
|
+
* finalising each as it lands. May poll for socket activity if no
|
|
2310
|
+
* completion is ready yet. */
|
|
2311
|
+
static void *mbatch_step_nogvl(void *arg) {
|
|
2312
|
+
mbatch_t *b = (mbatch_t *)arg;
|
|
2313
|
+
curl_multi_perform(b->multi, &b->running);
|
|
2314
|
+
CURLMsg *msg;
|
|
2315
|
+
int q;
|
|
2316
|
+
while ((msg = curl_multi_info_read(b->multi, &q))) {
|
|
2317
|
+
if (msg->msg != CURLMSG_DONE) continue;
|
|
2318
|
+
long idx = -1;
|
|
2319
|
+
curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, &idx);
|
|
2320
|
+
if (idx < 0 || idx >= (long)b->n) continue;
|
|
2321
|
+
if (b->slots[idx].decoded) continue;
|
|
2322
|
+
mfetch_ctx_t ctx_proxy;
|
|
2323
|
+
memset(&ctx_proxy, 0, sizeof(ctx_proxy));
|
|
2324
|
+
ctx_proxy.transcode_utf8 = b->transcode_utf8;
|
|
2325
|
+
ctx_proxy.parse_after = b->parse_after;
|
|
2326
|
+
ctx_proxy.cache_dir = b->cache_dir_owned;
|
|
2327
|
+
mfetch_finalize_slot_nogvl(&ctx_proxy, &b->slots[idx],
|
|
2328
|
+
msg->easy_handle, msg->data.result);
|
|
2329
|
+
b->ready_queue[b->ready_tail++] = (size_t)idx;
|
|
2330
|
+
}
|
|
2331
|
+
if (b->ready_head >= b->ready_tail && b->running > 0) {
|
|
2332
|
+
int numfds = 0;
|
|
2333
|
+
curl_multi_poll(b->multi, NULL, 0, 200, &numfds);
|
|
2334
|
+
}
|
|
2335
|
+
if (b->running == 0) b->done = 1;
|
|
2336
|
+
return NULL;
|
|
2337
|
+
}
|
|
2338
|
+
|
|
2339
|
+
/* Build the Ruby Hash for a finalised slot. Same shape as
|
|
2340
|
+
* scrap_multi_fetch's harvest path. */
|
|
2341
|
+
static VALUE mbatch_build_hash(mbatch_t *b, mfetch_slot_t *s) {
|
|
2342
|
+
VALUE h = rb_hash_new();
|
|
2343
|
+
if (s->rc != CURLE_OK) {
|
|
2344
|
+
VALUE err = rb_hash_new();
|
|
2345
|
+
rb_hash_aset(err, ID2SYM(rb_intern("url")), rb_str_new_cstr(s->url));
|
|
2346
|
+
rb_hash_aset(err, ID2SYM(rb_intern("error")),
|
|
2347
|
+
rb_str_new_cstr(s->errstr[0] ? s->errstr : curl_easy_strerror(s->rc)));
|
|
2348
|
+
rb_hash_aset(h, ID2SYM(rb_intern("error")), err);
|
|
2349
|
+
return h;
|
|
2350
|
+
}
|
|
2351
|
+
rb_hash_aset(h, ID2SYM(rb_intern("status")), LONG2NUM(s->status));
|
|
2352
|
+
if (s->parsed_doc) {
|
|
2353
|
+
VALUE doc_klass = rb_path2class("Scrapetor::Native::Document");
|
|
2354
|
+
rb_hash_aset(h, ID2SYM(rb_intern("document")),
|
|
2355
|
+
scrap_dom_wrap_doc(doc_klass, s->parsed_doc));
|
|
2356
|
+
s->parsed_doc = NULL;
|
|
2357
|
+
rb_hash_aset(h, ID2SYM(rb_intern("body")), rb_enc_str_new("", 0, enc_utf8));
|
|
2358
|
+
} else {
|
|
2359
|
+
rb_hash_aset(h, ID2SYM(rb_intern("body")),
|
|
2360
|
+
rb_enc_str_new(s->body.data ? s->body.data : "",
|
|
2361
|
+
(long)s->body.len, enc_utf8));
|
|
2362
|
+
}
|
|
2363
|
+
VALUE hh = parse_headers_blob(s->headers.data ? s->headers.data : "",
|
|
2364
|
+
s->headers.len);
|
|
2365
|
+
rb_hash_delete(hh, rb_str_new_cstr("content-encoding"));
|
|
2366
|
+
if (s->served_from_cache && s->cached.ctype_len > 0) {
|
|
2367
|
+
rb_hash_aset(hh, rb_str_new_cstr("content-type"),
|
|
2368
|
+
rb_str_new(s->cached.ctype, (long)s->cached.ctype_len));
|
|
2369
|
+
rb_hash_aset(hh, rb_str_new_cstr("x-scrapetor-cache"),
|
|
2370
|
+
rb_str_new_cstr("hit"));
|
|
2371
|
+
} else if (b->cache_dir_owned && s->have_cached) {
|
|
2372
|
+
rb_hash_aset(hh, rb_str_new_cstr("x-scrapetor-cache"),
|
|
2373
|
+
rb_str_new_cstr("miss-revalidated"));
|
|
2374
|
+
}
|
|
2375
|
+
rb_hash_aset(h, ID2SYM(rb_intern("headers")), hh);
|
|
2376
|
+
rb_hash_aset(h, ID2SYM(rb_intern("final_url")),
|
|
2377
|
+
rb_str_new_cstr(s->final_url ? s->final_url : s->url));
|
|
2378
|
+
const char *hv_str = "1.1";
|
|
2379
|
+
switch (s->http_version) {
|
|
2380
|
+
case CURL_HTTP_VERSION_1_0: hv_str = "1.0"; break;
|
|
2381
|
+
case CURL_HTTP_VERSION_1_1: hv_str = "1.1"; break;
|
|
2382
|
+
case CURL_HTTP_VERSION_2_0: hv_str = "2"; break;
|
|
2383
|
+
#ifdef CURL_HTTP_VERSION_3
|
|
2384
|
+
case CURL_HTTP_VERSION_3: hv_str = "3"; break;
|
|
2385
|
+
#endif
|
|
2386
|
+
}
|
|
2387
|
+
rb_hash_aset(h, ID2SYM(rb_intern("http_version")), rb_str_new_cstr(hv_str));
|
|
2388
|
+
return h;
|
|
2389
|
+
}
|
|
2390
|
+
|
|
2391
|
+
static VALUE mbatch_initialize(int argc, VALUE *argv, VALUE self) {
|
|
2392
|
+
scrap_ensure_global_init();
|
|
2393
|
+
VALUE urls_v, opts_v;
|
|
2394
|
+
rb_scan_args(argc, argv, "11", &urls_v, &opts_v);
|
|
2395
|
+
Check_Type(urls_v, T_ARRAY);
|
|
2396
|
+
long n = RARRAY_LEN(urls_v);
|
|
2397
|
+
|
|
2398
|
+
mbatch_t *b;
|
|
2399
|
+
TypedData_Get_Struct(self, mbatch_t, &mbatch_data_type, b);
|
|
2400
|
+
|
|
2401
|
+
long timeout_ms = 30000;
|
|
2402
|
+
int follow = 1;
|
|
2403
|
+
long max_redirs = 10;
|
|
2404
|
+
const char *ua = "scrapetor/0.1 (libcurl)";
|
|
2405
|
+
int insecure = 0;
|
|
2406
|
+
long max_concurrent = 0;
|
|
2407
|
+
b->transcode_utf8 = 1;
|
|
2408
|
+
b->parse_after = 0;
|
|
2409
|
+
VALUE headers_v = Qnil;
|
|
2410
|
+
const char *method_opt = NULL;
|
|
2411
|
+
int nobody_opt = 0;
|
|
2412
|
+
if (!NIL_P(opts_v)) {
|
|
2413
|
+
Check_Type(opts_v, T_HASH);
|
|
2414
|
+
VALUE v;
|
|
2415
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("timeout_ms")));
|
|
2416
|
+
if (!NIL_P(v)) timeout_ms = NUM2LONG(v);
|
|
2417
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("follow_redirects")));
|
|
2418
|
+
if (!NIL_P(v)) follow = RTEST(v) ? 1 : 0;
|
|
2419
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_redirects")));
|
|
2420
|
+
if (!NIL_P(v)) max_redirs = NUM2LONG(v);
|
|
2421
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("user_agent")));
|
|
2422
|
+
if (!NIL_P(v)) { Check_Type(v, T_STRING); ua = RSTRING_PTR(v); }
|
|
2423
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("insecure")));
|
|
2424
|
+
if (!NIL_P(v)) insecure = RTEST(v) ? 1 : 0;
|
|
2425
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("headers")));
|
|
2426
|
+
if (!NIL_P(v)) { Check_Type(v, T_HASH); headers_v = v; }
|
|
2427
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("max_concurrent")));
|
|
2428
|
+
if (!NIL_P(v)) max_concurrent = NUM2LONG(v);
|
|
2429
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("transcode_utf8")));
|
|
2430
|
+
if (!NIL_P(v)) b->transcode_utf8 = RTEST(v) ? 1 : 0;
|
|
2431
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("parse")));
|
|
2432
|
+
if (!NIL_P(v)) b->parse_after = RTEST(v) ? 1 : 0;
|
|
2433
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("cache_dir")));
|
|
2434
|
+
if (!NIL_P(v)) {
|
|
2435
|
+
Check_Type(v, T_STRING);
|
|
2436
|
+
size_t l = (size_t)RSTRING_LEN(v);
|
|
2437
|
+
b->cache_dir_owned = (char *)malloc(l + 1);
|
|
2438
|
+
memcpy(b->cache_dir_owned, RSTRING_PTR(v), l);
|
|
2439
|
+
b->cache_dir_owned[l] = 0;
|
|
2440
|
+
}
|
|
2441
|
+
v = rb_hash_aref(opts_v, ID2SYM(rb_intern("method")));
|
|
2442
|
+
if (!NIL_P(v)) {
|
|
2443
|
+
if (SYMBOL_P(v)) v = rb_sym2str(v);
|
|
2444
|
+
Check_Type(v, T_STRING);
|
|
2445
|
+
method_opt = RSTRING_PTR(v);
|
|
2446
|
+
if (strcasecmp(method_opt, "head") == 0) { nobody_opt = 1; method_opt = NULL; }
|
|
2447
|
+
else if (strcasecmp(method_opt, "get") == 0) method_opt = NULL;
|
|
2448
|
+
}
|
|
2449
|
+
}
|
|
2450
|
+
|
|
2451
|
+
b->n = (size_t)n;
|
|
2452
|
+
b->multi = curl_multi_init();
|
|
2453
|
+
if (max_concurrent > 0) {
|
|
2454
|
+
curl_multi_setopt(b->multi, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_concurrent);
|
|
2455
|
+
}
|
|
2456
|
+
#ifdef CURLPIPE_MULTIPLEX
|
|
2457
|
+
curl_multi_setopt(b->multi, CURLMOPT_PIPELINING, (long)CURLPIPE_MULTIPLEX);
|
|
2458
|
+
#endif
|
|
2459
|
+
b->slots = (mfetch_slot_t *)calloc(b->n, sizeof(mfetch_slot_t));
|
|
2460
|
+
b->ready_queue = (size_t *)calloc(b->n, sizeof(size_t));
|
|
2461
|
+
|
|
2462
|
+
for (long i = 0; i < n; i++) {
|
|
2463
|
+
VALUE u = rb_ary_entry(urls_v, i);
|
|
2464
|
+
Check_Type(u, T_STRING);
|
|
2465
|
+
size_t ul = (size_t)RSTRING_LEN(u);
|
|
2466
|
+
b->slots[i].url = (char *)malloc(ul + 1);
|
|
2467
|
+
memcpy(b->slots[i].url, RSTRING_PTR(u), ul);
|
|
2468
|
+
b->slots[i].url[ul] = 0;
|
|
2469
|
+
|
|
2470
|
+
CURL *h = curl_easy_init();
|
|
2471
|
+
b->slots[i].easy = h;
|
|
2472
|
+
curl_easy_setopt(h, CURLOPT_URL, b->slots[i].url);
|
|
2473
|
+
curl_easy_setopt(h, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
|
|
2474
|
+
#ifdef CURLOPT_PIPEWAIT
|
|
2475
|
+
curl_easy_setopt(h, CURLOPT_PIPEWAIT, 1L);
|
|
2476
|
+
#endif
|
|
2477
|
+
curl_easy_setopt(h, CURLOPT_USERAGENT, ua);
|
|
2478
|
+
curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, (long)follow);
|
|
2479
|
+
curl_easy_setopt(h, CURLOPT_MAXREDIRS, max_redirs);
|
|
2480
|
+
curl_easy_setopt(h, CURLOPT_TIMEOUT_MS, timeout_ms);
|
|
2481
|
+
curl_easy_setopt(h, CURLOPT_NOSIGNAL, 1L);
|
|
2482
|
+
curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, cb_body);
|
|
2483
|
+
curl_easy_setopt(h, CURLOPT_WRITEDATA, &b->slots[i].body);
|
|
2484
|
+
curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, cb_header);
|
|
2485
|
+
curl_easy_setopt(h, CURLOPT_HEADERDATA, &b->slots[i].headers);
|
|
2486
|
+
curl_easy_setopt(h, CURLOPT_TCP_KEEPALIVE, 1L);
|
|
2487
|
+
curl_easy_setopt(h, CURLOPT_ERRORBUFFER, b->slots[i].errstr);
|
|
2488
|
+
curl_easy_setopt(h, CURLOPT_PRIVATE, (void *)(intptr_t)i);
|
|
2489
|
+
if (g_share) curl_easy_setopt(h, CURLOPT_SHARE, g_share);
|
|
2490
|
+
if (insecure) {
|
|
2491
|
+
curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
|
|
2492
|
+
curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
|
|
2493
|
+
}
|
|
2494
|
+
if (nobody_opt) {
|
|
2495
|
+
curl_easy_setopt(h, CURLOPT_NOBODY, 1L);
|
|
2496
|
+
curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, "HEAD");
|
|
2497
|
+
} else if (method_opt) {
|
|
2498
|
+
char mbuf[24];
|
|
2499
|
+
size_t mi = 0;
|
|
2500
|
+
for (; mi < sizeof(mbuf) - 1 && method_opt[mi]; mi++) {
|
|
2501
|
+
char c = method_opt[mi];
|
|
2502
|
+
mbuf[mi] = (c >= 'a' && c <= 'z') ? (char)(c - 32) : c;
|
|
2503
|
+
}
|
|
2504
|
+
mbuf[mi] = 0;
|
|
2505
|
+
curl_easy_setopt(h, CURLOPT_CUSTOMREQUEST, mbuf);
|
|
2506
|
+
}
|
|
2507
|
+
if (b->cache_dir_owned && !method_opt) {
|
|
2508
|
+
b->slots[i].have_cached =
|
|
2509
|
+
scrap_cache_load(b->cache_dir_owned, b->slots[i].url, &b->slots[i].cached);
|
|
2510
|
+
}
|
|
2511
|
+
{
|
|
2512
|
+
char ae_line[160];
|
|
2513
|
+
snprintf(ae_line, sizeof(ae_line), "Accept-Encoding: %s",
|
|
2514
|
+
scrap_accept_encoding());
|
|
2515
|
+
b->slots[i].req_headers = curl_slist_append(b->slots[i].req_headers, ae_line);
|
|
2516
|
+
}
|
|
2517
|
+
if (b->slots[i].have_cached) {
|
|
2518
|
+
if (b->slots[i].cached.etag_len > 0) {
|
|
2519
|
+
char line[1024];
|
|
2520
|
+
snprintf(line, sizeof(line), "If-None-Match: %.*s",
|
|
2521
|
+
(int)b->slots[i].cached.etag_len, b->slots[i].cached.etag);
|
|
2522
|
+
b->slots[i].req_headers = curl_slist_append(b->slots[i].req_headers, line);
|
|
2523
|
+
}
|
|
2524
|
+
if (b->slots[i].cached.lastmod_len > 0) {
|
|
2525
|
+
char line[1024];
|
|
2526
|
+
snprintf(line, sizeof(line), "If-Modified-Since: %.*s",
|
|
2527
|
+
(int)b->slots[i].cached.lastmod_len, b->slots[i].cached.lastmod);
|
|
2528
|
+
b->slots[i].req_headers = curl_slist_append(b->slots[i].req_headers, line);
|
|
2529
|
+
}
|
|
2530
|
+
}
|
|
2531
|
+
if (!NIL_P(headers_v)) {
|
|
2532
|
+
VALUE keys = rb_funcall(headers_v, rb_intern("keys"), 0);
|
|
2533
|
+
long nk = RARRAY_LEN(keys);
|
|
2534
|
+
for (long k = 0; k < nk; k++) {
|
|
2535
|
+
VALUE kk = rb_ary_entry(keys, k);
|
|
2536
|
+
VALUE vv = rb_hash_aref(headers_v, kk);
|
|
2537
|
+
VALUE line = rb_str_dup(kk);
|
|
2538
|
+
rb_str_cat_cstr(line, ": ");
|
|
2539
|
+
rb_str_append(line, vv);
|
|
2540
|
+
b->slots[i].req_headers = curl_slist_append(b->slots[i].req_headers, RSTRING_PTR(line));
|
|
2541
|
+
}
|
|
2542
|
+
}
|
|
2543
|
+
curl_easy_setopt(h, CURLOPT_HTTPHEADER, b->slots[i].req_headers);
|
|
2544
|
+
curl_multi_add_handle(b->multi, h);
|
|
2545
|
+
}
|
|
2546
|
+
b->running = (int)b->n;
|
|
2547
|
+
return self;
|
|
2548
|
+
}
|
|
2549
|
+
|
|
2550
|
+
/* MultiBatch#next -> Hash or nil
 *
 * Hands back the next completed response in completion order. While the
 * ready queue is empty and the batch is not finished, the stepper is run
 * with the GVL released. Returns nil once the queue is empty and the
 * batch is done. */
static VALUE mbatch_next(VALUE self) {
    mbatch_t *batch;
    TypedData_Get_Struct(self, mbatch_t, &mbatch_data_type, batch);

    for (;;) {
        if (batch->ready_head < batch->ready_tail) {
            break;                      /* something is waiting to be yielded */
        }
        if (batch->done) {
            return Qnil;                /* nothing queued and nothing running */
        }
        rb_thread_call_without_gvl(mbatch_step_nogvl, batch, NULL, NULL);
    }

    size_t slot_index = batch->ready_queue[batch->ready_head];
    batch->ready_head += 1;
    return mbatch_build_hash(batch, &batch->slots[slot_index]);
}
|
|
2560
|
+
|
|
2561
|
+
/* Registers the Http module (libcurl build) under the given native module:
 * the singleton fetch entry points, the AVAILABLE = true marker, and the
 * MultiBatch streaming iterator class. */
void Init_scrapetor_http(VALUE mod_native) {
    /* curl_global_init / scrap_share_init are deliberately not called
     * from here. As explained at scrap_ensure_global_init, initialising
     * eagerly at require-time races macOS Cocoa initialisers against the
     * host's fork() and crashes Puma / Spring / Foreman workers on macOS;
     * instead the first fetch call in each post-fork worker initialises
     * lazily. */
    VALUE http_mod = rb_define_module_under(mod_native, "Http");

    rb_define_singleton_method(http_mod, "get", scrap_http_get, -1);
    rb_define_singleton_method(http_mod, "parallel_fetch", scrap_parallel_fetch, -1);
    rb_define_singleton_method(http_mod, "multi_fetch", scrap_multi_fetch, -1);
    rb_define_singleton_method(http_mod, "features", scrap_http_features, 0);
    rb_define_const(http_mod, "AVAILABLE", Qtrue);

    /* Streaming iterator over a multi batch. */
    VALUE batch_klass = rb_define_class_under(http_mod, "MultiBatch", rb_cObject);
    rb_define_alloc_func(batch_klass, mbatch_alloc);
    rb_define_method(batch_klass, "initialize", mbatch_initialize, -1);
    rb_define_method(batch_klass, "next", mbatch_next, 0);
}
|
|
2580
|
+
|
|
2581
|
+
#else /* HAVE_LIBCURL */
|
|
2582
|
+
|
|
2583
|
+
/* Stub: HTTP layer was not built (libcurl missing at compile time).
|
|
2584
|
+
* Define the constant so the Ruby side can detect this and provide a
|
|
2585
|
+
* useful error. */
|
|
2586
|
+
void Init_scrapetor_http(VALUE mod_native) {
|
|
2587
|
+
VALUE mod_http = rb_define_module_under(mod_native, "Http");
|
|
2588
|
+
rb_define_const(mod_http, "AVAILABLE", Qfalse);
|
|
2589
|
+
}
|
|
2590
|
+
|
|
2591
|
+
#endif
|