scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,1156 @@
1
+ /*
2
+ * scrapetor_native.c
3
+ *
4
+ * Streaming HTML extraction engine.
5
+ *
6
+ * Tokenizes HTML in a single forward pass without constructing a DOM,
7
+ * matches a schema of "repeated" blocks and field selectors during
8
+ * tokenization, and emits structured records as repeated-block close
9
+ * tags fire. One Ruby boundary crossing per document.
10
+ *
11
+ * Compatible with Ruby 2.0+ (uses only the stable public C API:
12
+ * rb_define_module, rb_str_new, rb_hash_aset, rb_ary_push, RSTRING_*,
13
+ * ID2SYM, NIL_P, RTEST, DBL2NUM, rb_enc_associate).
14
+ *
15
+ * Workload target: §16 wedge from plan.md — extract N product cards
16
+ * (title, price-as-money, absolute URL, image URL) from an e-commerce
17
+ * listing page. Designed to beat Nokolexbor on this workload by
18
+ * eliminating DOM construction and per-field C<->Ruby crossings.
19
+ *
20
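+ * Selector subset supported in the native fast path (each form may additionally carry an #id and [attr], [attr=val], [attr^=val], [attr$=val], [attr*=val], [attr~=val], [attr|=val] tests, and a field may have one ancestor/parent context selector):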
21
+ * tag e.g. div
22
+ * .class e.g. .product-card
23
+ * tag.class e.g. span.price
24
+ *
25
+ * Schemas that exceed this subset transparently fall back to Ruby.
26
+ */
27
+
28
+ #include <ruby.h>
29
+ #include <ruby/encoding.h>
30
+ #include <string.h>
31
+ #include <strings.h>
32
+ #include <ctype.h>
33
+ #include <stdlib.h>
34
+ #include <stdint.h>
35
+
36
+ rb_encoding *enc_utf8;
37
+ static inline VALUE u8(VALUE s) { rb_enc_associate(s, enc_utf8); return s; }
38
+
39
+ #define MAX_STACK 1024
40
+ #define MAX_FIELDS 64
41
+ #define MAX_GROUPS 16
42
+ #define MAX_ATTRS 64
43
+ #define MAX_CLASSES_PER_SEL 8
44
+ #define MAX_ATTRMATCH_PER_SEL 8
45
+ #define FIELD_BUF_INIT 64
46
+
47
+ #define FT_TEXT 0
48
+ #define FT_MONEY 1
49
+ #define FT_URL 2
50
+ #define FT_INTEGER 3
51
+ #define FT_FLOAT 4
52
+
53
+ #define AOP_EXISTS 0
54
+ #define AOP_EQ 1
55
+ #define AOP_PREFIX 2
56
+ #define AOP_SUFFIX 3
57
+ #define AOP_CONTAINS 4
58
+ #define AOP_WORD 5
59
+ #define AOP_DASH 6
60
+
61
+ #define COMB_NONE 0
62
+ #define COMB_DESCENDANT 1
63
+ #define COMB_CHILD 2
64
+
65
+ #ifndef RB_TYPE_P
66
+ # define RB_TYPE_P(v, t) (TYPE(v) == (t))
67
+ #endif
68
+
69
+ /* ---- string-view + helpers ----------------------------------------- */
70
+
71
+ typedef struct { const char *p; size_t len; } strv; /* borrowed byte range: pointer + length; every helper in this file walks it by len, never by a NUL terminator */
72
+
73
/* Fold an ASCII uppercase letter to lowercase; any other value is returned unchanged. */
static inline int ascii_lower(int c)
{
    if (c < 'A' || c > 'Z') {
        return c;
    }
    return c + ('a' - 'A');
}
76
+
77
+ static inline int strv_ieq_cstr(strv a, const char *s, size_t n) {
78
+ if (a.len != n) return 0;
79
+ for (size_t i = 0; i < n; i++) {
80
+ if (ascii_lower((unsigned char)a.p[i]) != ascii_lower((unsigned char)s[i])) return 0;
81
+ }
82
+ return 1;
83
+ }
84
+
85
+ static inline int strv_ieq(strv a, strv b) {
86
+ if (a.len != b.len) return 0;
87
+ for (size_t i = 0; i < a.len; i++) {
88
+ if (ascii_lower((unsigned char)a.p[i]) != ascii_lower((unsigned char)b.p[i])) return 0;
89
+ }
90
+ return 1;
91
+ }
92
+
93
+ static int class_present(strv classes, strv cls) {
94
+ if (cls.len == 0) return 1;
95
+ size_t i = 0;
96
+ while (i < classes.len) {
97
+ while (i < classes.len && (classes.p[i] == ' ' || classes.p[i] == '\t' || classes.p[i] == '\n' || classes.p[i] == '\r' || classes.p[i] == '\f')) i++;
98
+ size_t j = i;
99
+ while (j < classes.len && classes.p[j] != ' ' && classes.p[j] != '\t' && classes.p[j] != '\n' && classes.p[j] != '\r' && classes.p[j] != '\f') j++;
100
+ if (j - i == cls.len && memcmp(classes.p + i, cls.p, cls.len) == 0) return 1;
101
+ i = j;
102
+ }
103
+ return 0;
104
+ }
105
+
106
+ typedef struct { /* one attribute test inside a compound selector */
107
+ strv name; /* attribute name; sel_match compares it ASCII case-insensitively */
108
+ int op; /* one of the AOP_* constants */
109
+ strv val; /* operand for the operator; not consulted for AOP_EXISTS */
110
+ } attr_match;
111
+
112
+ typedef struct { /* compound selector: every populated part must match (see sel_match) */
113
+ strv tag; /* element name; len == 0 means any tag */
114
+ strv classes[MAX_CLASSES_PER_SEL]; /* class tokens that must all be present */
115
+ int n_classes; /* used entries in classes[] */
116
+ strv id; /* required id (byte-exact); len == 0 means no id constraint */
117
+ attr_match attrs[MAX_ATTRMATCH_PER_SEL]; /* attribute tests; all must pass */
118
+ int n_attrs; /* used entries in attrs[] */
119
+ } ssel;
120
+
121
+ static inline int strv_eq_bytes(strv a, strv b) {
122
+ return a.len == b.len && memcmp(a.p, b.p, a.len) == 0;
123
+ }
124
+
125
+ static int strv_contains(strv hay, strv needle) {
126
+ if (needle.len == 0) return 0;
127
+ if (hay.len < needle.len) return 0;
128
+ for (size_t i = 0; i <= hay.len - needle.len; i++) {
129
+ if (memcmp(hay.p + i, needle.p, needle.len) == 0) return 1;
130
+ }
131
+ return 0;
132
+ }
133
+
134
+ static int dash_match(strv hay, strv needle) {
135
+ if (needle.len == 0) return hay.len == 0;
136
+ if (hay.len < needle.len) return 0;
137
+ if (memcmp(hay.p, needle.p, needle.len) != 0) return 0;
138
+ if (hay.len == needle.len) return 1;
139
+ return hay.p[needle.len] == '-';
140
+ }
141
+
142
+ static int sel_match(const ssel *s, strv tag, strv cls_attr, strv id_attr,
143
+ const strv attrs[][2], int n_attrs) {
144
+ if (s->tag.len > 0 && !strv_ieq(s->tag, tag)) return 0;
145
+ for (int i = 0; i < s->n_classes; i++) {
146
+ if (!class_present(cls_attr, s->classes[i])) return 0;
147
+ }
148
+ if (s->id.len > 0) {
149
+ if (id_attr.len == 0) return 0;
150
+ if (id_attr.len != s->id.len || memcmp(id_attr.p, s->id.p, s->id.len) != 0) return 0;
151
+ }
152
+ for (int i = 0; i < s->n_attrs; i++) {
153
+ strv av = { NULL, 0 };
154
+ int found = 0;
155
+ for (int a = 0; a < n_attrs; a++) {
156
+ if (strv_ieq(attrs[a][0], s->attrs[i].name)) {
157
+ av = attrs[a][1];
158
+ found = 1;
159
+ break;
160
+ }
161
+ }
162
+ if (!found) return 0;
163
+ switch (s->attrs[i].op) {
164
+ case AOP_EXISTS:
165
+ /* presence sufficient */
166
+ break;
167
+ case AOP_EQ:
168
+ if (!strv_eq_bytes(av, s->attrs[i].val)) return 0;
169
+ break;
170
+ case AOP_PREFIX:
171
+ if (av.len < s->attrs[i].val.len ||
172
+ memcmp(av.p, s->attrs[i].val.p, s->attrs[i].val.len) != 0) return 0;
173
+ break;
174
+ case AOP_SUFFIX:
175
+ if (av.len < s->attrs[i].val.len ||
176
+ memcmp(av.p + av.len - s->attrs[i].val.len,
177
+ s->attrs[i].val.p, s->attrs[i].val.len) != 0) return 0;
178
+ break;
179
+ case AOP_CONTAINS:
180
+ if (!strv_contains(av, s->attrs[i].val)) return 0;
181
+ break;
182
+ case AOP_WORD:
183
+ if (!class_present(av, s->attrs[i].val)) return 0;
184
+ break;
185
+ case AOP_DASH:
186
+ if (!dash_match(av, s->attrs[i].val)) return 0;
187
+ break;
188
+ default:
189
+ return 0;
190
+ }
191
+ }
192
+ return 1;
193
+ }
194
+
195
+ /* ---- HTML entity decode -------------------------------------------- */
196
+
197
/*
 * Encode `code` as UTF-8 into `out` (which must have room for 4 bytes) and
 * store the number of bytes written in *outlen. The length is chosen purely
 * from the magnitude of `code`; no validity checking is done here.
 */
static void emit_utf8(char *out, int code, int *outlen)
{
    static const int lead_bits[5] = { 0, 0x00, 0xC0, 0xE0, 0xF0 };
    int nbytes;

    if (code < 0x80) {
        nbytes = 1;
    } else if (code < 0x800) {
        nbytes = 2;
    } else if (code < 0x10000) {
        nbytes = 3;
    } else {
        nbytes = 4;
    }

    if (nbytes == 1) {
        out[0] = (char)code;
    } else {
        /* Lead byte carries the top bits; each trailer carries six more. */
        int shift = 6 * (nbytes - 1);
        out[0] = (char)(lead_bits[nbytes] | (code >> shift));
        for (int k = 1; k < nbytes; k++) {
            shift -= 6;
            out[k] = (char)(0x80 | ((code >> shift) & 0x3F));
        }
    }
    *outlen = nbytes;
}
217
+
218
+ static void append_decoded(VALUE buf, const char *s, size_t len) {
219
+ size_t start = 0, i = 0;
220
+ while (i < len) {
221
+ if (s[i] == '&') {
222
+ if (i > start) rb_str_buf_cat(buf, s + start, i - start);
223
+ size_t j = i + 1;
224
+ size_t cap = (len - j < 10) ? (len - j) : 10;
225
+ while (j < i + 1 + cap && s[j] != ';' && s[j] != '&' && s[j] != ' ' && s[j] != '<') j++;
226
+ int matched = 0;
227
+ if (j < len && s[j] == ';') {
228
+ size_t elen = j - i - 1;
229
+ const char *e = s + i + 1;
230
+ char rep[4]; int rl = 0;
231
+ if (elen == 3 && memcmp(e, "amp", 3) == 0) { rep[0] = '&'; rl = 1; }
232
+ else if (elen == 2 && memcmp(e, "lt", 2) == 0) { rep[0] = '<'; rl = 1; }
233
+ else if (elen == 2 && memcmp(e, "gt", 2) == 0) { rep[0] = '>'; rl = 1; }
234
+ else if (elen == 4 && memcmp(e, "quot", 4) == 0) { rep[0] = '"'; rl = 1; }
235
+ else if (elen == 4 && memcmp(e, "apos", 4) == 0) { rep[0] = '\''; rl = 1; }
236
+ else if (elen == 4 && memcmp(e, "nbsp", 4) == 0) { rep[0] = ' '; rl = 1; }
237
+ else if (elen >= 2 && e[0] == '#') {
238
+ int code = 0;
239
+ if (e[1] == 'x' || e[1] == 'X') {
240
+ for (size_t k = 2; k < elen; k++) {
241
+ int c = e[k];
242
+ if (c >= '0' && c <= '9') code = code * 16 + (c - '0');
243
+ else if (c >= 'a' && c <= 'f') code = code * 16 + (c - 'a' + 10);
244
+ else if (c >= 'A' && c <= 'F') code = code * 16 + (c - 'A' + 10);
245
+ else { code = -1; break; }
246
+ }
247
+ } else {
248
+ for (size_t k = 1; k < elen; k++) {
249
+ int c = e[k];
250
+ if (c >= '0' && c <= '9') code = code * 10 + (c - '0');
251
+ else { code = -1; break; }
252
+ }
253
+ }
254
+ if (code > 0 && code <= 0x10FFFF) emit_utf8(rep, code, &rl);
255
+ }
256
+ if (rl > 0) {
257
+ rb_str_buf_cat(buf, rep, rl);
258
+ i = j + 1;
259
+ start = i;
260
+ matched = 1;
261
+ }
262
+ }
263
+ if (!matched) {
264
+ rb_str_buf_cat(buf, "&", 1);
265
+ i++;
266
+ start = i;
267
+ }
268
+ } else {
269
+ i++;
270
+ }
271
+ }
272
+ if (i > start) rb_str_buf_cat(buf, s + start, i - start);
273
+ }
274
+
275
+ static VALUE rb_str_decoded(const char *p, size_t len) {
276
+ VALUE s = rb_str_buf_new(len);
277
+ append_decoded(s, p, len);
278
+ return u8(s);
279
+ }
280
+
281
+ /* ---- HTML5 void elements ------------------------------------------- */
282
+
283
+ static int is_void(strv tag) {
284
+ static const struct { const char *n; size_t l; } voids[] = {
285
+ {"area",4},{"base",4},{"br",2},{"col",3},{"embed",5},
286
+ {"hr",2},{"img",3},{"input",5},{"link",4},{"meta",4},
287
+ {"source",6},{"track",5},{"wbr",3},{NULL,0}
288
+ };
289
+ for (int i = 0; voids[i].n; i++) {
290
+ if (strv_ieq_cstr(tag, voids[i].n, voids[i].l)) return 1;
291
+ }
292
+ return 0;
293
+ }
294
+
295
+ /* ---- field-text buffer (stack-first, grows to heap) ---------------- */
296
+
297
+ #define FB_STACK_BYTES 1024
298
+
299
+ typedef struct { /* growable byte buffer that starts out in its own inline storage */
300
+ char *p; /* current storage: inline_buf, or a malloc'd block once fbuf_append has grown it */
301
+ size_t len; /* bytes currently stored */
302
+ size_t cap; /* capacity of the storage p points at */
303
+ char inline_buf[FB_STACK_BYTES]; /* initial storage installed by fbuf_reset */
304
+ } fbuf_t;
305
+
306
+ static void fbuf_reset(fbuf_t *b) {
307
+ b->p = b->inline_buf;
308
+ b->cap = FB_STACK_BYTES;
309
+ b->len = 0;
310
+ }
311
+
312
+ static void fbuf_free(fbuf_t *b) {
313
+ if (b->p != b->inline_buf && b->p != NULL) {
314
+ free(b->p);
315
+ b->p = NULL;
316
+ }
317
+ }
318
+
319
+ static void fbuf_append(fbuf_t *b, const char *s, size_t n) {
320
+ if (b->len + n > b->cap) {
321
+ size_t ncap = b->cap;
322
+ while (ncap < b->len + n) ncap *= 2;
323
+ if (b->p == b->inline_buf) {
324
+ char *np = (char *)malloc(ncap);
325
+ if (!np) return;
326
+ memcpy(np, b->p, b->len);
327
+ b->p = np;
328
+ } else {
329
+ char *np = (char *)realloc(b->p, ncap);
330
+ if (!np) return;
331
+ b->p = np;
332
+ }
333
+ b->cap = ncap;
334
+ }
335
+ memcpy(b->p + b->len, s, n);
336
+ b->len += n;
337
+ }
338
+
339
+ /* ---- schema desc parsed into native form --------------------------- */
340
+
341
+ typedef struct { /* one field of a repeated block */
342
+ ID name; /* stored in the record Hash under ID2SYM(name) */
343
+ ssel sel; /* element that supplies the value */
344
+ strv attr; /* len==0 => text capture */
345
+ int type; /* one of the FT_* constants; selects the coercion */
346
+ int clean; /* nonzero => finalize_text squeezes whitespace runs */
347
+ int normalize_url; /* nonzero => FT_URL values go through url_absolute */
348
+ int multi; /* nonzero => every match is pushed onto an Array instead of stored once */
349
+ /* Optional context constraint: field matches only when the
350
+ * primary sel is reached AND `context_sel` matched an ancestor
351
+ * (or immediate parent for child combinator). */
352
+ int has_context;
353
+ int combinator; /* COMB_NONE | COMB_DESCENDANT | COMB_CHILD */
354
+ ssel context_sel;
355
+ } field_t;
356
+
357
+ typedef struct { /* one "repeated" block of the schema */
358
+ ID name; /* NOTE(review): presumably the key the results are reported under — not used in the visible code */
359
+ ssel sel; /* a start tag matching this opens a new record (handle_start_tag) */
360
+ int n_fields; /* used entries in fields[] */
361
+ field_t fields[MAX_FIELDS];
362
+ VALUE results; /* Ruby Array */
363
+ } group_t;
364
+
365
+ typedef struct { /* one open element on the tokenizer's stack */
366
+ strv tag; /* element name as written in the input; matched case-insensitively on close */
367
+ int gi_when_pushed; /* value of active_gi when this frame was pushed */
368
+ uint64_t started_fields; /* bitmask: bit fi set => field fi captures here */
369
+ uint64_t context_open; /* bitmask: bit fi set => this frame matches field fi's context_sel */
370
+ int opens_record; /* 1 => popping this frame closes the current record */
371
+ } frame_t;
372
+
373
+ typedef struct { /* all state for one scan over one document */
374
+ const char *html; /* input bytes */
375
+ size_t len; /* number of input bytes */
376
+ size_t pos; /* scan cursor into html */
377
+
378
+ frame_t stack[MAX_STACK]; /* open elements; pushes beyond MAX_STACK are dropped */
379
+ int sp; /* number of frames in use */
380
+
381
+ group_t groups[MAX_GROUPS];
382
+ int n_groups; /* used entries in groups[] */
383
+
384
+ int active_gi; /* index of the group whose record is open, -1 when none */
385
+ int record_depth; /* value of sp at the moment the record was opened */
386
+ VALUE record; /* Ruby Hash being filled; Qnil after close_record */
387
+ fbuf_t ftext[MAX_FIELDS]; /* per-field raw text, indexed by field index of the active group */
388
+ int fdone[MAX_FIELDS]; /* 1 once a non-multi field has received its value */
389
+
390
+ VALUE base_url; /* nil => url_absolute returns URLs unresolved */
391
+ const char *base_url_p; /* bytes of the base URL as read by url_absolute */
392
+ size_t base_url_len;
393
+ long base_origin_len; /* length of the base_url_p prefix prepended to "/path" URLs */
394
+ } ctx_t;
395
+
396
+ /* ---- coercion ------------------------------------------------------ */
397
+
398
+ static VALUE finalize_text(fbuf_t *b, int clean) {
399
+ if (!clean) {
400
+ VALUE out = rb_str_buf_new(b->len);
401
+ append_decoded(out, b->p, b->len);
402
+ return u8(out);
403
+ }
404
+ /* Squeeze whitespace first into a stack buffer, then decode entities. */
405
+ char stack_tmp[2048];
406
+ char *tmp = (b->len <= sizeof(stack_tmp)) ? stack_tmp : (char *)malloc(b->len);
407
+ if (!tmp) tmp = stack_tmp;
408
+ size_t tl = 0;
409
+ int in_ws = 1;
410
+ for (size_t i = 0; i < b->len; i++) {
411
+ unsigned char ch = (unsigned char)b->p[i];
412
+ if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f' || ch == '\v') {
413
+ if (!in_ws) { tmp[tl++] = ' '; in_ws = 1; }
414
+ } else {
415
+ tmp[tl++] = (char)ch;
416
+ in_ws = 0;
417
+ }
418
+ }
419
+ if (tl > 0 && tmp[tl - 1] == ' ') tl--;
420
+ VALUE out = rb_str_buf_new(tl);
421
+ append_decoded(out, tmp, tl);
422
+ if (tmp != stack_tmp) free(tmp);
423
+ return u8(out);
424
+ }
425
+
426
+ static VALUE coerce_money_raw(const char *p, size_t l) {
427
+ if (l == 0) return Qnil;
428
+ size_t i = 0;
429
+ while (i < l && !isdigit((unsigned char)p[i]) && p[i] != '-') i++;
430
+ if (i >= l) return Qnil;
431
+ size_t start = i;
432
+ if (p[i] == '-') i++;
433
+ while (i < l && (isdigit((unsigned char)p[i]) || p[i] == '.' || p[i] == ',')) i++;
434
+ size_t end = i;
435
+ if (end == start) return Qnil;
436
+
437
+ char buf[80]; size_t bl = 0;
438
+ int dots = 0, commas = 0;
439
+ for (size_t k = start; k < end && bl < sizeof(buf) - 1; k++) {
440
+ if (p[k] == '.') dots++;
441
+ else if (p[k] == ',') commas++;
442
+ buf[bl++] = p[k];
443
+ }
444
+ buf[bl] = 0;
445
+
446
+ if (dots > 0 && commas > 0) {
447
+ const char *last_dot = strrchr(buf, '.');
448
+ const char *last_comma = strrchr(buf, ',');
449
+ char tmp[80]; size_t ti = 0;
450
+ if (last_dot > last_comma) {
451
+ for (size_t k = 0; k < bl; k++) if (buf[k] != ',') tmp[ti++] = buf[k];
452
+ } else {
453
+ for (size_t k = 0; k < bl; k++) {
454
+ if (buf[k] == '.') continue;
455
+ tmp[ti++] = (buf[k] == ',') ? '.' : buf[k];
456
+ }
457
+ }
458
+ tmp[ti] = 0; memcpy(buf, tmp, ti + 1); bl = ti;
459
+ } else if (commas > 1) {
460
+ char tmp[80]; size_t ti = 0;
461
+ for (size_t k = 0; k < bl; k++) if (buf[k] != ',') tmp[ti++] = buf[k];
462
+ tmp[ti] = 0; memcpy(buf, tmp, ti + 1); bl = ti;
463
+ } else if (commas == 1) {
464
+ char *cp = strchr(buf, ',');
465
+ size_t rest = bl - (size_t)(cp - buf) - 1;
466
+ int all_d = 1;
467
+ for (size_t k = 0; k < rest; k++) if (!isdigit((unsigned char)cp[1 + k])) { all_d = 0; break; }
468
+ if (all_d && rest == 3) {
469
+ char tmp[80]; size_t ti = 0;
470
+ for (size_t k = 0; k < bl; k++) if (buf[k] != ',') tmp[ti++] = buf[k];
471
+ tmp[ti] = 0; memcpy(buf, tmp, ti + 1); bl = ti;
472
+ } else {
473
+ for (size_t k = 0; k < bl; k++) if (buf[k] == ',') buf[k] = '.';
474
+ }
475
+ } else if (dots > 1) {
476
+ const char *cp = strrchr(buf, '.');
477
+ size_t rest = bl - (size_t)(cp - buf) - 1;
478
+ if (rest == 3) {
479
+ char tmp[80]; size_t ti = 0;
480
+ for (size_t k = 0; k < bl; k++) if (buf[k] != '.') tmp[ti++] = buf[k];
481
+ tmp[ti] = 0; memcpy(buf, tmp, ti + 1); bl = ti;
482
+ }
483
+ }
484
+ double d = strtod(buf, NULL);
485
+ return DBL2NUM(d);
486
+ }
487
+
488
+ static VALUE coerce_money_fbuf(fbuf_t *b) { return coerce_money_raw(b->p, b->len); }
489
+ static VALUE coerce_money_strv(strv v) { return coerce_money_raw(v.p, v.len); }
490
+
491
+ static VALUE url_absolute(const char *p, size_t l, ctx_t *c) {
492
+ VALUE s = rb_str_buf_new(l);
493
+ append_decoded(s, p, l);
494
+ long sl = RSTRING_LEN(s);
495
+ if (sl == 0) return Qnil;
496
+ const char *sp = RSTRING_PTR(s);
497
+
498
+ if (sl >= 7 && memcmp(sp, "http://", 7) == 0) return u8(s);
499
+ if (sl >= 8 && memcmp(sp, "https://", 8) == 0) return u8(s);
500
+
501
+ if (NIL_P(c->base_url)) return u8(s);
502
+
503
+ if (sl >= 2 && sp[0] == '/' && sp[1] == '/') {
504
+ long scheme_end = 0;
505
+ while (scheme_end < (long)c->base_url_len && c->base_url_p[scheme_end] != ':') scheme_end++;
506
+ VALUE out = rb_str_buf_new(scheme_end + 1 + sl);
507
+ rb_str_buf_cat(out, c->base_url_p, scheme_end);
508
+ rb_str_buf_cat(out, ":", 1);
509
+ rb_str_buf_cat(out, sp, sl);
510
+ return u8(out);
511
+ }
512
+
513
+ if (sl >= 1 && sp[0] == '/') {
514
+ VALUE out = rb_str_buf_new(c->base_origin_len + sl);
515
+ rb_str_buf_cat(out, c->base_url_p, c->base_origin_len);
516
+ rb_str_buf_cat(out, sp, sl);
517
+ return u8(out);
518
+ }
519
+
520
+ VALUE uri_mod = rb_const_get(rb_cObject, rb_intern("URI"));
521
+ VALUE joined = rb_funcall(uri_mod, rb_intern("join"), 2, c->base_url, u8(s));
522
+ return rb_funcall(joined, rb_intern("to_s"), 0);
523
+ }
524
+
525
+ static VALUE field_array_for(ctx_t *c, ID name) {
526
+ VALUE sym = ID2SYM(name);
527
+ VALUE arr = rb_hash_lookup(c->record, sym);
528
+ if (!RB_TYPE_P(arr, T_ARRAY)) {
529
+ arr = rb_ary_new();
530
+ rb_hash_aset(c->record, sym, arr);
531
+ }
532
+ return arr;
533
+ }
534
+
535
+ static void assign_field_value(ctx_t *c, field_t *f, VALUE val) {
536
+ if (NIL_P(val)) return;
537
+ if (f->multi) {
538
+ VALUE arr = field_array_for(c, f->name);
539
+ rb_ary_push(arr, val);
540
+ } else {
541
+ rb_hash_aset(c->record, ID2SYM(f->name), val);
542
+ }
543
+ }
544
+
545
+ static void finalize_field(ctx_t *c, int gi, int fi) {
546
+ field_t *f = &c->groups[gi].fields[fi];
547
+ fbuf_t *b = &c->ftext[fi];
548
+ VALUE val = Qnil;
549
+ switch (f->type) {
550
+ case FT_TEXT:
551
+ val = finalize_text(b, f->clean);
552
+ break;
553
+ case FT_MONEY:
554
+ val = coerce_money_fbuf(b);
555
+ break;
556
+ case FT_URL:
557
+ val = f->normalize_url ? url_absolute(b->p, b->len, c) : finalize_text(b, 0);
558
+ break;
559
+ case FT_INTEGER: {
560
+ char tmp[64]; size_t tl = b->len < 63 ? b->len : 63;
561
+ size_t ti = 0;
562
+ for (size_t k = 0; k < tl; k++) {
563
+ char ch = b->p[k];
564
+ if ((ch >= '0' && ch <= '9') || ch == '-') tmp[ti++] = ch;
565
+ }
566
+ if (ti == 0) { val = Qnil; break; }
567
+ tmp[ti] = 0;
568
+ val = INT2NUM(atoi(tmp));
569
+ break;
570
+ }
571
+ case FT_FLOAT: {
572
+ char tmp[64]; size_t tl = b->len < 63 ? b->len : 63;
573
+ size_t ti = 0;
574
+ for (size_t k = 0; k < tl; k++) {
575
+ char ch = b->p[k];
576
+ if ((ch >= '0' && ch <= '9') || ch == '.' || ch == '-') tmp[ti++] = ch;
577
+ }
578
+ if (ti == 0) { val = Qnil; break; }
579
+ tmp[ti] = 0;
580
+ val = DBL2NUM(strtod(tmp, NULL));
581
+ break;
582
+ }
583
+ default:
584
+ val = finalize_text(b, f->clean);
585
+ }
586
+ assign_field_value(c, f, val);
587
+ if (f->multi) {
588
+ fbuf_reset(b); /* ready for the next match in this record */
589
+ } else {
590
+ c->fdone[fi] = 1;
591
+ }
592
+ }
593
+
594
+ /* ---- tokenizer ----------------------------------------------------- */
595
+
596
+ static void skip_comment(ctx_t *c) {
597
+ c->pos += 4;
598
+ while (c->pos + 2 < c->len) {
599
+ if (c->html[c->pos] == '-' && c->html[c->pos + 1] == '-' && c->html[c->pos + 2] == '>') {
600
+ c->pos += 3;
601
+ return;
602
+ }
603
+ c->pos++;
604
+ }
605
+ c->pos = c->len;
606
+ }
607
+
608
+ static void skip_until_gt(ctx_t *c) {
609
+ while (c->pos < c->len && c->html[c->pos] != '>') c->pos++;
610
+ if (c->pos < c->len) c->pos++;
611
+ }
612
+
613
+ static void skip_raw_until(ctx_t *c, const char *name, size_t nlen) {
614
+ while (c->pos < c->len) {
615
+ const char *next_lt = (const char *)memchr(c->html + c->pos, '<', c->len - c->pos);
616
+ if (!next_lt) { c->pos = c->len; return; }
617
+ size_t p = (size_t)(next_lt - c->html);
618
+ if (p + 1 + nlen < c->len && c->html[p + 1] == '/' && strncasecmp(c->html + p + 2, name, nlen) == 0) {
619
+ char after = (p + 2 + nlen < c->len) ? c->html[p + 2 + nlen] : '\0';
620
+ if (after == '>' || after == ' ' || after == '\t' || after == '\n' || after == '/' || after == '\r') {
621
+ c->pos = p;
622
+ skip_until_gt(c);
623
+ return;
624
+ }
625
+ }
626
+ c->pos = p + 1;
627
+ }
628
+ }
629
+
630
/* 1 when `ch` may begin a tag name: an ASCII letter or '_'. */
static inline int is_name_start(int ch)
{
    if (ch == '_') return 1;
    if (ch >= 'A' && ch <= 'Z') return 1;
    return ch >= 'a' && ch <= 'z';
}

/* 1 when `ch` may continue a tag name: ASCII letter, digit, '-', '_' or ':'. */
static inline int is_name_char(int ch)
{
    switch (ch) {
    case '-':
    case '_':
    case ':':
        return 1;
    default:
        break;
    }
    if (ch >= '0' && ch <= '9') return 1;
    if (ch >= 'A' && ch <= 'Z') return 1;
    return ch >= 'a' && ch <= 'z';
}

/* 1 for space, tab, LF, CR, form feed and vertical tab. */
static inline int is_ws(int ch)
{
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
    case '\v':
        return 1;
    default:
        return 0;
    }
}
633
+
634
+ static inline void emit_text_span(ctx_t *c, size_t start, size_t end) {
635
+ if (start >= end) return;
636
+ if (c->active_gi < 0) return;
637
+ /* Union of all "started" fields across frames inside the current record. */
638
+ uint64_t open = 0;
639
+ for (int i = c->record_depth; i < c->sp; i++) open |= c->stack[i].started_fields;
640
+ if (open == 0) return;
641
+ const char *src = c->html + start;
642
+ size_t span = end - start;
643
+ /* Iterate set bits */
644
+ while (open) {
645
+ int fi = __builtin_ctzll(open);
646
+ open &= open - 1;
647
+ field_t *f = &c->groups[c->active_gi].fields[fi];
648
+ if (f->multi || !c->fdone[fi]) {
649
+ fbuf_append(&c->ftext[fi], src, span);
650
+ }
651
+ }
652
+ }
653
+
654
+ static void open_record(ctx_t *c, int gi) {
655
+ c->active_gi = gi;
656
+ c->record_depth = c->sp;
657
+ c->record = rb_hash_new();
658
+ group_t *g = &c->groups[gi];
659
+ for (int i = 0; i < g->n_fields; i++) {
660
+ fbuf_reset(&c->ftext[i]);
661
+ c->fdone[i] = 0;
662
+ }
663
+ }
664
+
665
+ static void close_record(ctx_t *c) {
666
+ if (c->active_gi < 0) return;
667
+ group_t *g = &c->groups[c->active_gi];
668
+ for (int i = 0; i < g->n_fields; i++) {
669
+ if (!c->fdone[i] && c->ftext[i].len > 0) finalize_field(c, c->active_gi, i);
670
+ }
671
+ rb_ary_push(g->results, c->record);
672
+ c->record = Qnil;
673
+ c->active_gi = -1;
674
+ }
675
+
676
+ static void handle_start_tag(ctx_t *c) {
677
+ c->pos++;
678
+ size_t tag_start = c->pos;
679
+ while (c->pos < c->len && is_name_char((unsigned char)c->html[c->pos])) c->pos++;
680
+ strv tag = { c->html + tag_start, c->pos - tag_start };
681
+ if (tag.len == 0) return;
682
+
683
+ strv attrs[MAX_ATTRS][2];
684
+ int n_attrs = 0;
685
+ strv cls = { NULL, 0 };
686
+ strv id_attr = { NULL, 0 };
687
+ int self_closing = 0;
688
+
689
+ while (c->pos < c->len) {
690
+ while (c->pos < c->len && is_ws((unsigned char)c->html[c->pos])) c->pos++;
691
+ if (c->pos >= c->len) break;
692
+ char ch = c->html[c->pos];
693
+ if (ch == '>') { c->pos++; break; }
694
+ if (ch == '/' && c->pos + 1 < c->len && c->html[c->pos + 1] == '>') {
695
+ self_closing = 1;
696
+ c->pos += 2;
697
+ break;
698
+ }
699
+ size_t an_s = c->pos;
700
+ while (c->pos < c->len) {
701
+ unsigned char nc = (unsigned char)c->html[c->pos];
702
+ if (nc == '=' || nc == '>' || nc == '/' || is_ws(nc)) break;
703
+ c->pos++;
704
+ }
705
+ strv aname = { c->html + an_s, c->pos - an_s };
706
+ strv aval = { NULL, 0 };
707
+
708
+ while (c->pos < c->len && is_ws((unsigned char)c->html[c->pos])) c->pos++;
709
+ if (c->pos < c->len && c->html[c->pos] == '=') {
710
+ c->pos++;
711
+ while (c->pos < c->len && is_ws((unsigned char)c->html[c->pos])) c->pos++;
712
+ if (c->pos < c->len) {
713
+ char q = c->html[c->pos];
714
+ if (q == '"' || q == '\'') {
715
+ c->pos++;
716
+ size_t av_s = c->pos;
717
+ while (c->pos < c->len && c->html[c->pos] != q) c->pos++;
718
+ aval.p = c->html + av_s;
719
+ aval.len = c->pos - av_s;
720
+ if (c->pos < c->len) c->pos++;
721
+ } else {
722
+ size_t av_s = c->pos;
723
+ while (c->pos < c->len && !is_ws((unsigned char)c->html[c->pos]) && c->html[c->pos] != '>') c->pos++;
724
+ aval.p = c->html + av_s;
725
+ aval.len = c->pos - av_s;
726
+ }
727
+ }
728
+ }
729
+ if (aname.len > 0 && n_attrs < MAX_ATTRS) {
730
+ attrs[n_attrs][0] = aname;
731
+ attrs[n_attrs][1] = aval;
732
+ n_attrs++;
733
+ }
734
+ if (aname.len == 5 && strv_ieq_cstr(aname, "class", 5)) cls = aval;
735
+ else if (aname.len == 2 && strv_ieq_cstr(aname, "id", 2)) id_attr = aval;
736
+ }
737
+
738
+ int void_el = is_void(tag);
739
+ int will_push = !void_el && !self_closing;
740
+
741
+ if (c->active_gi < 0) {
742
+ for (int gi = 0; gi < c->n_groups; gi++) {
743
+ if (sel_match(&c->groups[gi].sel, tag, cls, id_attr, attrs, n_attrs)) {
744
+ if (will_push) open_record(c, gi);
745
+ break;
746
+ }
747
+ }
748
+ }
749
+
750
+ uint64_t my_fields = 0;
751
+ uint64_t my_context = 0;
752
+ if (c->active_gi >= 0) {
753
+ group_t *g = &c->groups[c->active_gi];
754
+ int nfields = g->n_fields;
755
+ if (nfields > 64) nfields = 64;
756
+
757
+ /* First pass: which fields' context selectors does this frame match? */
758
+ for (int fi = 0; fi < nfields; fi++) {
759
+ field_t *f = &g->fields[fi];
760
+ if (!f->has_context) continue;
761
+ if (sel_match(&f->context_sel, tag, cls, id_attr, attrs, n_attrs)) {
762
+ my_context |= ((uint64_t)1 << fi);
763
+ }
764
+ }
765
+
766
+ /* Second pass: primary matching, gated by context constraint. */
767
+ for (int fi = 0; fi < nfields; fi++) {
768
+ field_t *f = &g->fields[fi];
769
+ if (!f->multi && c->fdone[fi]) continue;
770
+ if (!sel_match(&f->sel, tag, cls, id_attr, attrs, n_attrs)) continue;
771
+
772
+ if (f->has_context) {
773
+ uint64_t open = 0;
774
+ if (f->combinator == COMB_DESCENDANT) {
775
+ for (int i = c->record_depth; i < c->sp; i++) {
776
+ open |= c->stack[i].context_open;
777
+ }
778
+ } else if (f->combinator == COMB_CHILD) {
779
+ if (c->sp > 0) open = c->stack[c->sp - 1].context_open;
780
+ }
781
+ if (!(open & ((uint64_t)1 << fi))) continue;
782
+ }
783
+ if (f->attr.len > 0) {
784
+ strv av = { NULL, 0 };
785
+ int found = 0;
786
+ for (int a = 0; a < n_attrs; a++) {
787
+ if (strv_ieq(attrs[a][0], f->attr)) {
788
+ av = attrs[a][1];
789
+ found = 1;
790
+ break;
791
+ }
792
+ }
793
+ if (found) {
794
+ VALUE val = Qnil;
795
+ switch (f->type) {
796
+ case FT_URL:
797
+ val = f->normalize_url ? url_absolute(av.p, av.len, c)
798
+ : rb_str_decoded(av.p, av.len);
799
+ break;
800
+ case FT_MONEY: val = coerce_money_strv(av); break;
801
+ default: val = rb_str_decoded(av.p, av.len); break;
802
+ }
803
+ assign_field_value(c, f, val);
804
+ if (!f->multi) c->fdone[fi] = 1;
805
+ }
806
+ } else {
807
+ if (will_push) {
808
+ fbuf_reset(&c->ftext[fi]);
809
+ my_fields |= ((uint64_t)1 << fi);
810
+ }
811
+ }
812
+ }
813
+ }
814
+
815
+ if (strv_ieq_cstr(tag, "script", 6)) {
816
+ skip_raw_until(c, "script", 6);
817
+ return;
818
+ }
819
+ if (strv_ieq_cstr(tag, "style", 5)) {
820
+ skip_raw_until(c, "style", 5);
821
+ return;
822
+ }
823
+
824
+ if (will_push) {
825
+ if (c->sp < MAX_STACK) {
826
+ frame_t *fr = &c->stack[c->sp];
827
+ fr->tag = tag;
828
+ fr->gi_when_pushed = c->active_gi;
829
+ fr->started_fields = my_fields;
830
+ fr->context_open = my_context;
831
+ fr->opens_record = (c->active_gi >= 0 && c->record_depth == c->sp) ? 1 : 0;
832
+ c->sp++;
833
+ }
834
+ }
835
+ }
836
+
837
+ static void handle_end_tag(ctx_t *c) {
838
+ c->pos += 2;
839
+ size_t name_s = c->pos;
840
+ while (c->pos < c->len && is_name_char((unsigned char)c->html[c->pos])) c->pos++;
841
+ strv name = { c->html + name_s, c->pos - name_s };
842
+ skip_until_gt(c);
843
+
844
+ if (c->sp == 0 || name.len == 0) return;
845
+
846
+ int found_at = -1;
847
+ for (int i = c->sp - 1; i >= 0; i--) {
848
+ if (strv_ieq(c->stack[i].tag, name)) { found_at = i; break; }
849
+ }
850
+ if (found_at < 0) return;
851
+
852
+ while (c->sp > found_at) {
853
+ c->sp--;
854
+ frame_t *fr = &c->stack[c->sp];
855
+ uint64_t open = fr->started_fields;
856
+ while (open && c->active_gi >= 0) {
857
+ int fi = __builtin_ctzll(open);
858
+ open &= open - 1;
859
+ field_t *f = &c->groups[c->active_gi].fields[fi];
860
+ if (f->multi || !c->fdone[fi]) {
861
+ finalize_field(c, c->active_gi, fi);
862
+ }
863
+ }
864
+ if (fr->opens_record) {
865
+ close_record(c);
866
+ }
867
+ }
868
+ }
869
+
870
+ static void scan(ctx_t *c) {
871
+ while (c->pos < c->len) {
872
+ size_t text_start = c->pos;
873
+ const char *next_lt = (const char *)memchr(c->html + c->pos, '<', c->len - c->pos);
874
+ size_t lt = next_lt ? (size_t)(next_lt - c->html) : c->len;
875
+ emit_text_span(c, text_start, lt);
876
+ c->pos = lt;
877
+ if (c->pos >= c->len) break;
878
+
879
+ if (c->pos + 3 < c->len &&
880
+ c->html[c->pos + 1] == '!' && c->html[c->pos + 2] == '-' && c->html[c->pos + 3] == '-') {
881
+ skip_comment(c);
882
+ continue;
883
+ }
884
+ if (c->pos + 1 < c->len && c->html[c->pos + 1] == '!') {
885
+ skip_until_gt(c);
886
+ continue;
887
+ }
888
+ if (c->pos + 1 < c->len && c->html[c->pos + 1] == '/') {
889
+ handle_end_tag(c);
890
+ continue;
891
+ }
892
+ if (c->pos + 1 < c->len && is_name_start((unsigned char)c->html[c->pos + 1])) {
893
+ handle_start_tag(c);
894
+ continue;
895
+ }
896
+ emit_text_span(c, c->pos, c->pos + 1);
897
+ c->pos++;
898
+ }
899
+ }
900
+
901
+ /* ---- descriptor parsing -------------------------------------------- */
902
+
903
+ static ID id_text, id_money, id_url, id_integer, id_float;
904
+
905
+ /* New selector format from Ruby:
906
+ * sel = [tag_or_nil, classes_array, id_or_nil, attrs_array]
907
+ * attrs_array = [[name_str, op_str_or_nil, val_str_or_nil], ...]
908
+ * op_str ∈ {"=", "*=", "^=", "$=", "~=", "|=", nil}
909
+ */
910
+ static int parse_attr_op(VALUE op_v) {
911
+ if (NIL_P(op_v)) return AOP_EXISTS;
912
+ if (!RB_TYPE_P(op_v, T_STRING)) return -1;
913
+ const char *p = RSTRING_PTR(op_v);
914
+ long l = RSTRING_LEN(op_v);
915
+ if (l == 1 && p[0] == '=') return AOP_EQ;
916
+ if (l == 2 && p[1] == '=') {
917
+ switch (p[0]) {
918
+ case '*': return AOP_CONTAINS;
919
+ case '^': return AOP_PREFIX;
920
+ case '$': return AOP_SUFFIX;
921
+ case '~': return AOP_WORD;
922
+ case '|': return AOP_DASH;
923
+ }
924
+ }
925
+ return -1;
926
+ }
927
+
928
+ static int parse_selector_value(VALUE sel_v, ssel *out) {
929
+ memset(out, 0, sizeof(*out));
930
+ if (!RB_TYPE_P(sel_v, T_ARRAY) || RARRAY_LEN(sel_v) < 4) return 0;
931
+
932
+ VALUE tag_v = rb_ary_entry(sel_v, 0);
933
+ if (NIL_P(tag_v)) { out->tag.p = NULL; out->tag.len = 0; }
934
+ else {
935
+ if (!RB_TYPE_P(tag_v, T_STRING)) return 0;
936
+ out->tag.p = RSTRING_PTR(tag_v);
937
+ out->tag.len = (size_t)RSTRING_LEN(tag_v);
938
+ }
939
+
940
+ VALUE classes_v = rb_ary_entry(sel_v, 1);
941
+ if (!RB_TYPE_P(classes_v, T_ARRAY)) return 0;
942
+ long nc = RARRAY_LEN(classes_v);
943
+ if (nc > MAX_CLASSES_PER_SEL) return 0;
944
+ out->n_classes = (int)nc;
945
+ for (int i = 0; i < out->n_classes; i++) {
946
+ VALUE c = rb_ary_entry(classes_v, i);
947
+ if (!RB_TYPE_P(c, T_STRING)) return 0;
948
+ out->classes[i].p = RSTRING_PTR(c);
949
+ out->classes[i].len = (size_t)RSTRING_LEN(c);
950
+ }
951
+
952
+ VALUE id_v = rb_ary_entry(sel_v, 2);
953
+ if (NIL_P(id_v)) { out->id.p = NULL; out->id.len = 0; }
954
+ else {
955
+ if (!RB_TYPE_P(id_v, T_STRING)) return 0;
956
+ out->id.p = RSTRING_PTR(id_v);
957
+ out->id.len = (size_t)RSTRING_LEN(id_v);
958
+ }
959
+
960
+ VALUE attrs_v = rb_ary_entry(sel_v, 3);
961
+ if (!RB_TYPE_P(attrs_v, T_ARRAY)) return 0;
962
+ long na = RARRAY_LEN(attrs_v);
963
+ if (na > MAX_ATTRMATCH_PER_SEL) return 0;
964
+ out->n_attrs = (int)na;
965
+ for (int i = 0; i < out->n_attrs; i++) {
966
+ VALUE a = rb_ary_entry(attrs_v, i);
967
+ if (!RB_TYPE_P(a, T_ARRAY) || RARRAY_LEN(a) < 3) return 0;
968
+ VALUE n = rb_ary_entry(a, 0);
969
+ VALUE o = rb_ary_entry(a, 1);
970
+ VALUE v = rb_ary_entry(a, 2);
971
+ if (!RB_TYPE_P(n, T_STRING)) return 0;
972
+ int op = parse_attr_op(o);
973
+ if (op < 0) return 0;
974
+ out->attrs[i].name.p = RSTRING_PTR(n);
975
+ out->attrs[i].name.len = (size_t)RSTRING_LEN(n);
976
+ out->attrs[i].op = op;
977
+ if (op == AOP_EXISTS) {
978
+ out->attrs[i].val.p = NULL;
979
+ out->attrs[i].val.len = 0;
980
+ } else {
981
+ if (!RB_TYPE_P(v, T_STRING)) return 0;
982
+ out->attrs[i].val.p = RSTRING_PTR(v);
983
+ out->attrs[i].val.len = (size_t)RSTRING_LEN(v);
984
+ }
985
+ }
986
+
987
+ return 1;
988
+ }
989
+
990
+ /* Descriptor format (from Ruby):
991
+ *
992
+ * desc = [group, group, ...]
993
+ * group = [name_sym, sel, fields_array]
994
+ * field = [name_sym, sel, attr_str_or_nil, type_sym, clean_bool,
995
+ * normalize_url_bool, multi_bool,
996
+ * context_sel_or_nil, combinator_or_nil]
997
+ * sel = [tag_or_nil, classes_array, id_or_nil, attrs_array]
998
+ * combinator ∈ { nil, "descendant", "child" }
999
+ */
1000
+ static int parse_descriptor(VALUE desc, ctx_t *c) {
1001
+ if (!RB_TYPE_P(desc, T_ARRAY)) return 0;
1002
+ long n = RARRAY_LEN(desc);
1003
+ if (n > MAX_GROUPS) n = MAX_GROUPS;
1004
+ c->n_groups = (int)n;
1005
+
1006
+ for (int gi = 0; gi < c->n_groups; gi++) {
1007
+ VALUE gd = rb_ary_entry(desc, gi);
1008
+ if (!RB_TYPE_P(gd, T_ARRAY) || RARRAY_LEN(gd) < 3) return 0;
1009
+
1010
+ group_t *g = &c->groups[gi];
1011
+ VALUE name_sym = rb_ary_entry(gd, 0);
1012
+ if (!SYMBOL_P(name_sym)) return 0;
1013
+ g->name = SYM2ID(name_sym);
1014
+
1015
+ if (!parse_selector_value(rb_ary_entry(gd, 1), &g->sel)) return 0;
1016
+
1017
+ VALUE fields_v = rb_ary_entry(gd, 2);
1018
+ if (!RB_TYPE_P(fields_v, T_ARRAY)) return 0;
1019
+ long nf = RARRAY_LEN(fields_v);
1020
+ if (nf > MAX_FIELDS) nf = MAX_FIELDS;
1021
+ g->n_fields = (int)nf;
1022
+
1023
+ for (int fi = 0; fi < g->n_fields; fi++) {
1024
+ VALUE fd = rb_ary_entry(fields_v, fi);
1025
+ if (!RB_TYPE_P(fd, T_ARRAY) || RARRAY_LEN(fd) < 7) return 0;
1026
+ field_t *f = &g->fields[fi];
1027
+ VALUE fname = rb_ary_entry(fd, 0);
1028
+ if (!SYMBOL_P(fname)) return 0;
1029
+ f->name = SYM2ID(fname);
1030
+ if (!parse_selector_value(rb_ary_entry(fd, 1), &f->sel)) return 0;
1031
+ VALUE fattr = rb_ary_entry(fd, 2);
1032
+ if (NIL_P(fattr)) { f->attr.p = NULL; f->attr.len = 0; }
1033
+ else {
1034
+ if (!RB_TYPE_P(fattr, T_STRING)) return 0;
1035
+ f->attr.p = RSTRING_PTR(fattr);
1036
+ f->attr.len = (size_t)RSTRING_LEN(fattr);
1037
+ }
1038
+ VALUE ftype = rb_ary_entry(fd, 3);
1039
+ if (!SYMBOL_P(ftype)) return 0;
1040
+ ID tid = SYM2ID(ftype);
1041
+ if (tid == id_money) f->type = FT_MONEY;
1042
+ else if (tid == id_url) f->type = FT_URL;
1043
+ else if (tid == id_integer) f->type = FT_INTEGER;
1044
+ else if (tid == id_float) f->type = FT_FLOAT;
1045
+ else f->type = FT_TEXT;
1046
+ f->clean = RTEST(rb_ary_entry(fd, 4)) ? 1 : 0;
1047
+ f->normalize_url = RTEST(rb_ary_entry(fd, 5)) ? 1 : 0;
1048
+ f->multi = RTEST(rb_ary_entry(fd, 6)) ? 1 : 0;
1049
+
1050
+ /* Optional context + combinator (positions 7, 8). */
1051
+ f->has_context = 0;
1052
+ f->combinator = COMB_NONE;
1053
+ memset(&f->context_sel, 0, sizeof(f->context_sel));
1054
+ if (RARRAY_LEN(fd) >= 9) {
1055
+ VALUE ctx = rb_ary_entry(fd, 7);
1056
+ VALUE comb = rb_ary_entry(fd, 8);
1057
+ if (!NIL_P(ctx) && !NIL_P(comb)) {
1058
+ if (!parse_selector_value(ctx, &f->context_sel)) return 0;
1059
+ if (!RB_TYPE_P(comb, T_STRING)) return 0;
1060
+ const char *cp = RSTRING_PTR(comb);
1061
+ long cl = RSTRING_LEN(comb);
1062
+ if (cl == 10 && memcmp(cp, "descendant", 10) == 0) f->combinator = COMB_DESCENDANT;
1063
+ else if (cl == 5 && memcmp(cp, "child", 5) == 0) f->combinator = COMB_CHILD;
1064
+ else return 0;
1065
+ f->has_context = 1;
1066
+ }
1067
+ }
1068
+ }
1069
+ g->results = rb_ary_new();
1070
+ }
1071
+ return 1;
1072
+ }
1073
+
1074
/*
 * Length of the origin ("scheme://authority") prefix of the URL p[0..l).
 *
 * The first ':' in the input must be immediately followed by "//";
 * otherwise the input is treated as having no origin and l is returned.
 * When the pattern is present, the origin extends up to (not including)
 * the first '/', '?' or '#' after the "//", or to l if none occurs. Those
 * are the three delimiters that end the authority component in RFC 3986,
 * so a base URL without a path ("https://host?x=1", "https://host#top")
 * does not drag its query or fragment into the origin.
 *
 * p need not be NUL-terminated; only bytes below index l are read.
 */
static long compute_origin_len(const char *p, long l) {
    long i = 0;

    /* Find the scheme delimiter. */
    while (i < l && p[i] != ':') {
        i++;
    }
    /* Need room for, and the presence of, "//" right after the ':'. */
    if (i + 2 >= l || p[i + 1] != '/' || p[i + 2] != '/') {
        return l;
    }

    /* Skip "://" and walk to the end of the authority. */
    i += 3;
    while (i < l && p[i] != '/' && p[i] != '?' && p[i] != '#') {
        i++;
    }
    return i;
}
1082
+
1083
+ /* ---- entrypoint ---------------------------------------------------- */
1084
+
1085
+ static VALUE scrapetor_extract(VALUE self, VALUE html_v, VALUE desc_v, VALUE base_url_v) {
1086
+ (void)self;
1087
+ Check_Type(html_v, T_STRING);
1088
+
1089
+ ctx_t ctx;
1090
+ memset(&ctx, 0, sizeof(ctx));
1091
+ ctx.html = RSTRING_PTR(html_v);
1092
+ ctx.len = (size_t)RSTRING_LEN(html_v);
1093
+ ctx.pos = 0;
1094
+ ctx.sp = 0;
1095
+ ctx.active_gi = -1;
1096
+ ctx.record = Qnil;
1097
+ ctx.base_url = NIL_P(base_url_v) ? Qnil : base_url_v;
1098
+ if (!NIL_P(ctx.base_url)) {
1099
+ Check_Type(ctx.base_url, T_STRING);
1100
+ ctx.base_url_p = RSTRING_PTR(ctx.base_url);
1101
+ ctx.base_url_len = (size_t)RSTRING_LEN(ctx.base_url);
1102
+ ctx.base_origin_len = compute_origin_len(ctx.base_url_p, (long)ctx.base_url_len);
1103
+ }
1104
+ for (int i = 0; i < MAX_FIELDS; i++) fbuf_reset(&ctx.ftext[i]);
1105
+
1106
+ if (!parse_descriptor(desc_v, &ctx)) {
1107
+ for (int i = 0; i < MAX_FIELDS; i++) fbuf_free(&ctx.ftext[i]);
1108
+ rb_raise(rb_eArgError, "scrapetor_native: invalid schema descriptor");
1109
+ }
1110
+
1111
+ scan(&ctx);
1112
+
1113
+ if (ctx.active_gi >= 0) {
1114
+ group_t *g = &ctx.groups[ctx.active_gi];
1115
+ for (int i = 0; i < g->n_fields; i++) {
1116
+ if (!ctx.fdone[i] && ctx.ftext[i].len > 0) finalize_field(&ctx, ctx.active_gi, i);
1117
+ }
1118
+ rb_ary_push(g->results, ctx.record);
1119
+ }
1120
+
1121
+ VALUE result = rb_hash_new();
1122
+ for (int gi = 0; gi < ctx.n_groups; gi++) {
1123
+ rb_hash_aset(result, ID2SYM(ctx.groups[gi].name), ctx.groups[gi].results);
1124
+ }
1125
+
1126
+ for (int i = 0; i < MAX_FIELDS; i++) fbuf_free(&ctx.ftext[i]);
1127
+
1128
+ return result;
1129
+ }
1130
+
1131
+ /* ---- module init --------------------------------------------------- */
1132
+
1133
+ #if defined(__GNUC__) || defined(__clang__)
1134
+ __attribute__((visibility("default")))
1135
+ #endif
1136
+ void Init_scrapetor_native(void) {
1137
+ id_text = rb_intern("text");
1138
+ id_money = rb_intern("money");
1139
+ id_url = rb_intern("url");
1140
+ id_integer = rb_intern("integer");
1141
+ id_float = rb_intern("float");
1142
+ enc_utf8 = rb_utf8_encoding();
1143
+
1144
+ VALUE mod_scrapetor = rb_define_module("Scrapetor");
1145
+ VALUE mod_native = rb_define_module_under(mod_scrapetor, "Native");
1146
+ rb_define_singleton_method(mod_native, "extract", scrapetor_extract, 3);
1147
+
1148
+ /* Register the native arena-DOM module too. */
1149
+ extern void Init_scrapetor_dom(VALUE);
1150
+ Init_scrapetor_dom(mod_native);
1151
+
1152
+ /* Optional libcurl-backed HTTP/2 fetch layer. Stubs out cleanly
1153
+ * when extconf couldn't find libcurl. */
1154
+ extern void Init_scrapetor_http(VALUE);
1155
+ Init_scrapetor_http(mod_native);
1156
+ }