scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,67 @@
# frozen_string_literal: true

# mkmf build script for the scrapetor native extension.
#
# Sets the compiler flags, probes for the optional libcurl / zlib /
# brotli / zstd libraries, records which of them were found as
# HAVE_* preprocessor defines, and finally writes the Makefile for
# the "scrapetor/scrapetor_native" extension.
#
# Environment switches (each is honoured only when set to exactly "1"):
#   SCRAP_NO_LIBCURL - skip the libcurl probe (and with it zlib,
#                      brotli and zstd, which are only probed when
#                      libcurl was found)
#   SCRAP_NO_BROTLI  - skip the brotli decoder probe
#   SCRAP_NO_ZSTD    - skip the zstd decoder probe

require "mkmf"

$CFLAGS << " -std=c99 -O3 -Wall -fno-strict-aliasing"
$CFLAGS << " -fvisibility=hidden"
$CFLAGS << " -DNDEBUG"

# Architecture-specific flags.
#
# The CPU is taken from RbConfig (the architecture this Ruby was built
# for) rather than from `uname -m`: shelling out raises Errno::ENOENT
# on platforms that have no `uname` binary, and it reports the build
# machine's kernel architecture, which is not necessarily the one the
# extension is being compiled for.
target_cpu = (RbConfig::CONFIG["target_cpu"] || RbConfig::CONFIG["host_cpu"]).to_s
case target_cpu
when "arm64", "aarch64"
  # NEON is implicit on aarch64; nothing to add
when "x86_64", "amd64", "x64"
  # NOTE(review): this makes the compiler free to emit SSE4.2
  # instructions; confirm that every supported x86-64 deployment
  # target provides them.
  $CFLAGS << " -msse4.2"
end

# Ruby version compatibility shim defines
$CFLAGS << " -DRUBY_VERSION_MAJOR=#{RbConfig::CONFIG["MAJOR"]}"

# Optional libcurl-backed HTTP layer. Tries pkg-config first (the
# standard libcurl install carries a .pc file); falls back to header
# + library probes. Defines HAVE_LIBCURL when both succeed. The HTTP
# module compiles to a stub otherwise so the rest of the gem still
# loads cleanly.
have_libcurl = false
unless ENV["SCRAP_NO_LIBCURL"] == "1"
  if pkg_config("libcurl")
    have_libcurl = true
  elsif have_header("curl/curl.h") && have_library("curl", "curl_easy_init")
    have_libcurl = true
  end
end
$defs << "-DHAVE_LIBCURL" if have_libcurl

# zlib for gzip/deflate decoding. Almost universally available;
# libcurl already pulls it in on most systems. We need direct access
# so we can drive decompression ourselves rather than relying on
# libcurl's CURLOPT_ACCEPT_ENCODING (which rejects responses with
# encodings libcurl wasn't compiled for, before our brotli/zstd
# fallback can run).
if have_libcurl
  if pkg_config("zlib") ||
     (have_header("zlib.h") && have_library("z", "inflateInit_"))
    $defs << "-DHAVE_ZLIB"
  end
end

# Optional brotli + zstd in-process decoders. When the linked libcurl
# wasn't built with these (e.g. macOS system libcurl as of 8.7.1),
# Scrapetor can still advertise br/zstd in Accept-Encoding and
# decode the response body itself. Each decoder is opt-in via the
# corresponding library probe; missing libraries downgrade silently.
if have_libcurl && ENV["SCRAP_NO_BROTLI"] != "1"
  if pkg_config("libbrotlidec") ||
     (have_header("brotli/decode.h") && have_library("brotlidec", "BrotliDecoderDecompress"))
    $defs << "-DHAVE_BROTLI"
  end
end
if have_libcurl && ENV["SCRAP_NO_ZSTD"] != "1"
  if pkg_config("libzstd") ||
     (have_header("zstd.h") && have_library("zstd", "ZSTD_decompress"))
    $defs << "-DHAVE_ZSTD"
  end
end

create_makefile("scrapetor/scrapetor_native")