scrapetor 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +242 -0
- data/LICENSE +21 -0
- data/README.md +440 -0
- data/bin/scrapetor +190 -0
- data/bin/scrapetor-bench +5 -0
- data/ext/scrapetor/README.md +53 -0
- data/ext/scrapetor/native/extconf.rb +67 -0
- data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
- data/ext/scrapetor/native/scrapetor_http.c +2591 -0
- data/ext/scrapetor/native/scrapetor_native.c +1156 -0
- data/lib/scrapetor/builder.rb +158 -0
- data/lib/scrapetor/cleaner.rb +10 -0
- data/lib/scrapetor/comment_node.rb +67 -0
- data/lib/scrapetor/document.rb +457 -0
- data/lib/scrapetor/dom/parser.rb +69 -0
- data/lib/scrapetor/dom/selectors.rb +208 -0
- data/lib/scrapetor/dom.rb +563 -0
- data/lib/scrapetor/encoding.rb +85 -0
- data/lib/scrapetor/entities.rb +90 -0
- data/lib/scrapetor/errors.rb +12 -0
- data/lib/scrapetor/extractor.rb +147 -0
- data/lib/scrapetor/fetcher.rb +390 -0
- data/lib/scrapetor/fingerprint.rb +29 -0
- data/lib/scrapetor/form.rb +141 -0
- data/lib/scrapetor/http.rb +114 -0
- data/lib/scrapetor/microdata.rb +132 -0
- data/lib/scrapetor/money.rb +30 -0
- data/lib/scrapetor/native.rb +291 -0
- data/lib/scrapetor/native_dom.rb +2258 -0
- data/lib/scrapetor/node.rb +539 -0
- data/lib/scrapetor/node_set.rb +301 -0
- data/lib/scrapetor/page_type.rb +95 -0
- data/lib/scrapetor/pagination.rb +109 -0
- data/lib/scrapetor/persistent_cache.rb +130 -0
- data/lib/scrapetor/robots.rb +159 -0
- data/lib/scrapetor/sax.rb +285 -0
- data/lib/scrapetor/schema.rb +144 -0
- data/lib/scrapetor/selector.rb +576 -0
- data/lib/scrapetor/session.rb +141 -0
- data/lib/scrapetor/sitemap.rb +52 -0
- data/lib/scrapetor/stream.rb +111 -0
- data/lib/scrapetor/structured_data.rb +74 -0
- data/lib/scrapetor/template_registry.rb +24 -0
- data/lib/scrapetor/text_node.rb +101 -0
- data/lib/scrapetor/url.rb +21 -0
- data/lib/scrapetor/version.rb +5 -0
- data/lib/scrapetor/xpath.rb +1603 -0
- data/lib/scrapetor.rb +167 -0
- data/scrapetor.gemspec +77 -0
- metadata +200 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "mkmf"
|
|
4
|
+
|
|
5
|
+
# Baseline compiler flags applied to every build: C99, -O3, all warnings,
# no strict-aliasing assumptions, hidden symbol visibility by default, and
# NDEBUG defined.
$CFLAGS << " -std=c99 -O3 -Wall -fno-strict-aliasing"
$CFLAGS << " -fvisibility=hidden"
$CFLAGS << " -DNDEBUG"

# Architecture-specific flags. The CPU name comes from RbConfig (the
# architecture this Ruby was configured for) rather than from running
# `uname -m`: that avoids spawning an external command, keeps working on
# platforms with no `uname` executable, and targets the Ruby being built
# against instead of whatever the host kernel reports.
case RbConfig::CONFIG["host_cpu"]
when "arm64", "aarch64"
  # NEON is implicit on aarch64; nothing to add
when "x86_64", "amd64"
  $CFLAGS << " -msse4.2"
end

# Ruby version compatibility shim defines
$CFLAGS << " -DRUBY_VERSION_MAJOR=#{RbConfig::CONFIG["MAJOR"]}"
|
|
20
|
+
|
|
21
|
+
# Optional HTTP layer backed by libcurl. Setting SCRAP_NO_LIBCURL=1 skips
# detection entirely. Otherwise pkg-config is consulted first (the standard
# libcurl install carries a .pc file); if that fails, the header and the
# library are probed directly. HAVE_LIBCURL is defined when either route
# succeeds. Without it the HTTP module compiles to a stub so the rest of
# the gem still loads cleanly.
have_libcurl =
  if ENV["SCRAP_NO_LIBCURL"] == "1"
    false
  elsif pkg_config("libcurl")
    true
  else
    have_header("curl/curl.h") && have_library("curl", "curl_easy_init")
  end
$defs << "-DHAVE_LIBCURL" if have_libcurl
|
|
35
|
+
|
|
36
|
+
# zlib provides gzip/deflate decoding and is only probed when libcurl was
# found. It is almost universally available, and libcurl already pulls it
# in on most systems, but direct access is needed so decompression can be
# driven here instead of through libcurl's CURLOPT_ACCEPT_ENCODING, which
# rejects responses with encodings libcurl wasn't compiled for before the
# brotli/zstd fallback can run.
zlib_found = have_libcurl &&
             (pkg_config("zlib") || (have_header("zlib.h") && have_library("z", "inflateInit_")))
$defs << "-DHAVE_ZLIB" if zlib_found
|
|
48
|
+
|
|
49
|
+
# Optional in-process brotli and zstd decoders. When the linked libcurl
# wasn't built with these (e.g. macOS system libcurl as of 8.7.1),
# Scrapetor can still advertise br/zstd in Accept-Encoding and decode the
# response body itself. Each row lists: opt-out environment variable,
# pkg-config package, header, library, symbol to link against, and the
# define emitted on success. A decoder whose library is missing is skipped
# silently.
optional_decoders = [
  ["SCRAP_NO_BROTLI", "libbrotlidec", "brotli/decode.h", "brotlidec",
   "BrotliDecoderDecompress", "-DHAVE_BROTLI"],
  ["SCRAP_NO_ZSTD", "libzstd", "zstd.h", "zstd",
   "ZSTD_decompress", "-DHAVE_ZSTD"]
]

if have_libcurl
  optional_decoders.each do |opt_out, pc_name, header, lib, symbol, define|
    next if ENV[opt_out] == "1"

    # pkg-config first; fall back to probing the header and library directly.
    found = pkg_config(pc_name) || (have_header(header) && have_library(lib, symbol))
    $defs << define if found
  end
end
|
|
66
|
+
|
|
67
|
+
# Write the Makefile for the extension target "scrapetor/scrapetor_native",
# using the flags and defines accumulated above.
create_makefile("scrapetor/scrapetor_native")
|