scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "tempfile"
4
+ require "uri"
5
+ require "thread"
6
+
7
+ module Scrapetor
8
+ # Stateful HTTP session. Wraps Scrapetor::Fetcher with:
9
+ # - persistent cookie jar (libcurl COOKIEJAR/COOKIEFILE)
10
+ # - default headers merged into every request
11
+ # - basic / bearer auth applied automatically
12
+ # - per-host rate limiting (polite throttle)
13
+ # - default retry/backoff
14
+ # - auto charset transcoding of HTML bodies to UTF-8
15
+ #
16
+ # session = Scrapetor::Session.new(
17
+ # cookies: true, # ephemeral tempfile jar
18
+ # user_agent: "MyBot/1.0",
19
+ # rate_limit: 0.5, # min seconds between same-host requests
20
+ # retry: 3,
21
+ # headers: { "Accept-Language" => "en-US" },
22
+ # )
23
+ # doc = session.fetch("https://example.com/login")
24
+ # session.post("https://example.com/login", form: { user: "x", pass: "y" })
25
+ # doc = session.fetch("https://example.com/dashboard")
26
+ #
27
+ # Cookies set during the login persist for the dashboard call.
28
# Stateful wrapper around Scrapetor::Fetcher: every request made through a
# Session is issued with the session's default options (user agent, headers,
# auth, proxy, retry/backoff, timeout, rate limit, charset transcoding) and,
# when cookies are enabled, with a shared cookie-jar file.
class Session
  # Headers sent with every request unless overridden per session or per call.
  DEFAULT_HEADERS = {
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language" => "en-US,en;q=0.5",
  }.freeze

  # Path of the cookie-jar file, or nil when cookies are disabled.
  attr_reader :cookie_jar_path

  # cookies:     String (path of a jar file the caller owns), true (a private
  #              temp file that lives as long as the session), or false/nil.
  # rate_limit:  minimum seconds between same-host requests; forwarded to the
  #              fetcher as :rate_limit_ms.
  # transcode_charset: forwarded to the fetcher as :transcode_utf8.
  # All remaining keywords are forwarded to the fetcher under their own name;
  # nil values are dropped so the fetcher's own defaults apply.
  #
  # Raises ArgumentError when cookies: is none of String/true/false/nil.
  def initialize(cookies: true,
                 user_agent: nil,
                 headers: {},
                 basic_auth: nil,
                 bearer_token: nil,
                 proxy: nil,
                 ca_path: nil,
                 rate_limit: nil,
                 retry: 0,
                 backoff: 0.3,
                 max_backoff: 10.0,
                 timeout_ms: 30_000,
                 follow_redirects: true,
                 insecure: false,
                 transcode_charset: true)
    Scrapetor::Fetcher.ensure_available!
    @ephemeral = false
    @jar_tempfile = nil
    @cookie_jar_path =
      case cookies
      when String then cookies
      when true then ephemeral_jar_path
      when false, nil then nil
      else raise ArgumentError, "cookies: must be String/true/false"
      end
    @defaults = {
      user_agent: user_agent || Scrapetor::Fetcher::DEFAULT_USER_AGENT,
      # headers: nil is treated like "no extra headers" instead of crashing.
      headers: DEFAULT_HEADERS.merge(headers || {}),
      basic_auth: basic_auth,
      bearer_token: bearer_token,
      proxy: proxy,
      ca_path: ca_path,
      # `retry` is a reserved word, so the keyword argument cannot be read
      # as a plain local variable.
      retry: binding.local_variable_get(:retry),
      backoff: backoff,
      max_backoff: max_backoff,
      timeout_ms: timeout_ms,
      follow_redirects: follow_redirects,
      insecure: insecure,
    }.compact
    @defaults[:transcode_utf8] = transcode_charset
    @defaults[:rate_limit_ms] = (rate_limit * 1000).to_i if rate_limit
  end

  # One instance method per HTTP verb. Each merges the per-call options over
  # the session defaults and forwards to the same-named Fetcher method,
  # returning whatever the fetcher returns.
  %w[get post put patch delete head].each do |verb|
    define_method(verb) do |url, **opts|
      merged = merge_opts(opts)
      Scrapetor::Fetcher.public_send(verb, url, **merged)
    end
  end

  # GET + parse to a Document.
  #
  # Raises Scrapetor::Fetcher::FetchError when the final status is outside
  # 200..399.
  def fetch(url, **opts)
    resp = get(url, **opts)
    status = resp[:status]
    if status < 200 || status >= 400
      raise Scrapetor::Fetcher::FetchError.new(
        "Session.fetch #{url} -> HTTP #{status}",
        status: status, response: resp
      )
    end
    Scrapetor.parse(resp[:body], base_url: resp[:final_url])
  end

  # Batch GET through Fetcher.parallel_get using the same merged options
  # (cookies, headers, auth, rate limit) as the single-request verbs.
  def parallel_get(urls, **opts)
    merged = merge_opts(opts)
    Scrapetor::Fetcher.parallel_get(urls, **merged)
  end

  # Removes the cookie jar when the session created it (cookies: true).
  # Caller-supplied jar paths are never touched. Best-effort: filesystem
  # errors are swallowed because the file may already be gone.
  def close
    return unless @ephemeral && @cookie_jar_path
    @jar_tempfile&.close!
    @jar_tempfile = nil
    # The jar may have been re-created at the same path after the Tempfile
    # was unlinked, so delete by path as well.
    File.delete(@cookie_jar_path) if File.exist?(@cookie_jar_path)
  rescue StandardError
    nil
  end

  private

  # Creates the session-private jar file and returns its path.
  def ephemeral_jar_path
    @ephemeral = true
    # The Tempfile must stay referenced for the whole session: an
    # unreferenced Tempfile is unlinked by its own finalizer at the next GC,
    # which would silently drop the cookies stored between two requests.
    @jar_tempfile = Tempfile.new(["scrapetor_jar", ".txt"])
    @jar_tempfile.close
    path = @jar_tempfile.path
    ObjectSpace.define_finalizer(self, self.class.send(:make_jar_finalizer, path))
    path
  end

  # Builds the finalizer in class scope so the proc closes over `path` only
  # and never over the session itself (which would keep it alive forever).
  # Note: `private` above does not apply to singleton methods.
  def self.make_jar_finalizer(path)
    proc do
      begin
        File.delete(path) if File.exist?(path)
      rescue StandardError
        nil
      end
    end
  end

  # Per-call options layered over the session defaults. Hash values (e.g.
  # :headers) are merged key by key; an explicit nil keeps the default.
  # The cookie jar is added unless the caller supplied its own.
  def merge_opts(opts)
    m = @defaults.merge(opts) do |_, ours, theirs|
      if ours.is_a?(Hash) && theirs.is_a?(Hash)
        ours.merge(theirs)
      else
        theirs.nil? ? ours : theirs
      end
    end
    if @cookie_jar_path
      m[:cookiejar] ||= @cookie_jar_path
      m[:cookiefile] ||= @cookie_jar_path
    end
    m
  end
end
141
+ end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "stringio"
4
+
5
+ module Scrapetor
6
+ # Sitemap.xml ingestion. Handles both <urlset> (URL listings) and
7
+ # <sitemapindex> (nested sitemap references), streaming so a huge
8
+ # sitemap doesn't have to fit in memory at once.
9
+ #
10
+ # Scrapetor::Sitemap.urls("https://example.com/sitemap.xml") do |url, meta|
11
+ # puts url, meta[:lastmod], meta[:priority]
12
+ # end
13
+ #
14
+ # Or, return an array:
15
+ #
16
+ # Scrapetor::Sitemap.urls("https://example.com/sitemap.xml").to_a
17
# Sitemap ingestion for both <urlset> documents and <sitemapindex>
# documents that point at further sitemaps.
module Sitemap
  # Strings matching this are fetched over HTTP(S); any other String is
  # treated as inline XML.
  REMOTE_SOURCE = %r{\Ahttps?://}i.freeze

  # Iterates every URL in the sitemap, recursing into <sitemapindex>
  # entries. Yields (url, meta) where meta always has the keys :lastmod,
  # :changefreq and :priority (nil when the element is absent).
  #
  # source    - an IO-like object (responds to #read), an XML String, or an
  #             http(s) URL.
  # depth     - current recursion depth (internal).
  # max_depth - deepest level of nested sitemap indexes that is followed.
  #
  # Returns an Enumerator when no block is given.
  # Raises ArgumentError once depth exceeds max_depth.
  def self.urls(source, depth: 0, max_depth: 5, &block)
    return enum_for(:urls, source, depth: depth, max_depth: max_depth) unless block
    raise ArgumentError, "sitemap recursion too deep" if depth > max_depth

    # Open the source exactly once. Re-opening would download a remote
    # sitemap twice and would hand back an already-exhausted IO for
    # caller-supplied streams.
    io = open_source(source)
    start = stream_position(io)

    Scrapetor.stream(io, outer: "url") do |entry|
      loc = child_text(entry, "loc")
      next if loc.nil? || loc.empty?
      meta = {
        lastmod: child_text(entry, "lastmod"),
        changefreq: child_text(entry, "changefreq"),
        priority: child_text(entry, "priority"),
      }
      block.call(loc, meta)
    end

    # Second pass for <sitemapindex> files: seek back to where the first
    # pass started. Sources that cannot seek (pipes, sockets) have nothing
    # left to read, so the pass is skipped.
    return unless restore_position(io, start)

    Scrapetor.stream(io, outer: "sitemap") do |entry|
      child_loc = child_text(entry, "loc")
      next if child_loc.nil? || child_loc.empty?
      urls(child_loc, depth: depth + 1, max_depth: max_depth, &block)
    end
  end

  # Normalises a source into an IO-like object: IO-likes pass through,
  # non-URL Strings are wrapped in a StringIO, anything else is fetched
  # with Scrapetor::Fetcher.get and its body wrapped in a StringIO.
  # NOTE(review): the response status is not inspected here, so an error
  # page body is parsed like a sitemap — confirm that is intended.
  def self.open_source(source)
    return source if source.respond_to?(:read)
    return StringIO.new(source) if source.is_a?(String) && !source.match?(REMOTE_SOURCE)
    resp = Scrapetor::Fetcher.get(source.to_s)
    StringIO.new(resp[:body].to_s)
  end

  # Stripped text of the first `tag` element inside `node`, or nil.
  def self.child_text(node, tag)
    node.at_css(tag)&.text&.strip
  end
  private_class_method :child_text

  # Current read offset of `io`, or nil when it has no usable position.
  def self.stream_position(io)
    io.respond_to?(:pos) ? io.pos : nil
  rescue SystemCallError, IOError
    nil
  end
  private_class_method :stream_position

  # Seeks `io` back to `position`; returns false when that is not possible.
  def self.restore_position(io, position)
    return false if position.nil? || !io.respond_to?(:seek)
    io.seek(position)
    true
  rescue SystemCallError, IOError
    false
  end
  private_class_method :restore_position
end
52
+ end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "stringio"
4
+
5
+ module Scrapetor
6
+ # Streaming parser. Reads HTML incrementally from an IO and yields one
7
+ # complete row at a time. Peak memory stays bounded to roughly
8
+ # max(read_chunk, longest_row_in_bytes) regardless of total document
9
+ # size, so multi-gigabyte fixtures, paginated dumps, and slow socket
10
+ # feeds work without buffering the whole thing.
11
+ #
12
+ # The "row" boundary is byte-scanned in C — no DOM is built for the
13
+ # outer-document context. Once a row is found, its HTML slice is
14
+ # parsed as a fragment through the standard native path so all the
15
+ # normal Document / Element / extract APIs are available.
16
+ #
17
+ # Scrapetor.stream(io, outer: "div.result") do |doc|
18
+ # puts doc.at_css(".title")&.text
19
+ # end
20
+ #
21
+ # With a schema, each row is run through the native extractor and
22
+ # yielded as a Hash:
23
+ #
24
+ # Scrapetor.stream(io, outer: "li.product", fields: {
25
+ # title: ".title::text",
26
+ # price: ".price::text",
27
+ # }) do |row|
28
+ # puts row[:title]
29
+ # end
30
+ #
31
+ # The outer pattern accepts:
32
+ # - "tag" (any element of that name)
33
+ # - "tag.class", "tag#id", "tag#id.cls1.cls2" (element with that id and/or those class tokens)
34
+ # - ".class" — not supported; provide a tag for byte scanning
35
# Incremental row reader: chunks read from an IO are fed to the native
# scanner, and every row the scanner hands back is parsed on its own and
# passed to the caller.
class Stream
  DEFAULT_CHUNK = 64 * 1024

  # io         - object answering #read(length).
  # outer      - row pattern, see .parse_outer for the accepted forms.
  # fields     - optional schema; when given, each row is yielded as the
  #              result of #extract instead of as a parsed document.
  # chunk_size - number of bytes requested from io per read.
  def initialize(io, outer:, fields: nil, chunk_size: DEFAULT_CHUNK)
    @native = Scrapetor::Native::Stream.new(*self.class.parse_outer(outer))
    @io = io
    @fields = fields
    @chunk_size = chunk_size
  end

  # Yields one materialised row at a time until the scanner reports it is
  # done or the IO is exhausted. Returns self, or an Enumerator when
  # called without a block.
  def each(&sink)
    return enum_for(:each) unless sink

    loop do
      # Hand over whatever is already complete before reading more.
      emit_buffered_rows(&sink)
      break if @native.done?

      data = @io.read(@chunk_size)
      unless data.nil? || data.empty?
        @native.feed(data)
        next
      end

      # End of input: tell the scanner, flush what it was still holding.
      @native.set_eof
      emit_buffered_rows(&sink)
      break
    end
    self
  end

  # Splits an outer pattern into [tag, id, classes]:
  #   "tag"           -> [tag, nil, []]
  #   "tag.class"     -> [tag, nil, ["class"]]
  #   "tag.cls1.cls2" -> [tag, nil, ["cls1", "cls2"]]
  #   "tag#id"        -> [tag, "id", []]
  #   "tag#id.cls1"   -> [tag, "id", ["cls1"]]
  #   "tag.cls#id"    -> [tag, "id", ["cls"]]   (any order after tag)
  # Raises ArgumentError for any other shape or for more than one #id.
  def self.parse_outer(outer)
    shape = outer.match(/\A([a-zA-Z][\w-]*)((?:[.#][\w-]+)*)\z/)
    unless shape
      raise ArgumentError,
            "Scrapetor.stream outer must be 'tag', 'tag.class', 'tag#id', " \
            "or 'tag#id.cls1.cls2' (got #{outer.inspect})"
    end

    qualifiers = shape[2].scan(/([.#])([\w-]+)/)
    id_parts, class_parts = qualifiers.partition { |sigil, _name| sigil == "#" }
    if id_parts.size > 1
      raise ArgumentError,
            "Scrapetor.stream outer: only one #id is supported (got #{outer.inspect})"
    end

    [shape[1], id_parts.first&.last, class_parts.map(&:last)]
  end

  private

  # Yields every row the scanner can currently produce.
  def emit_buffered_rows
    while (html = @native.next_row)
      yield materialise(html)
    end
  end

  # Parses one row's HTML. Without a schema the parsed document is the
  # result; with one, the schema is extracted from the first node matched
  # by "*" (or from the document when nothing matches).
  def materialise(row_html)
    parsed = Scrapetor.parse(row_html)
    @fields ? (parsed.css("*").first || parsed).extract(@fields) : parsed
  end
end
106
+
107
# Convenience entry point: wraps a non-IO source in a StringIO, builds a
# Stream with the given options and runs #each with the caller's block.
def self.stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block)
  readable = source.respond_to?(:read)
  input = readable ? source : StringIO.new(source)
  reader = Stream.new(input, outer: outer, fields: fields, chunk_size: chunk_size)
  reader.each(&block)
end
111
+ end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Scrapetor
6
+ # Extract structured-data signals every SEO/RAG pipeline needs:
7
+ # JSON-LD, OpenGraph, Twitter Cards, Schema.org microdata.
8
+ #
9
+ # These are deterministic and fast — no DOM walk beyond `doc.css(...)`
10
+ # which is delegated to the backing tokenizer.
11
# Readers for the structured-data conventions embedded in HTML pages:
# JSON-LD script blocks and OpenGraph / Twitter Card <meta> tags.
module StructuredData
  JSON_LD_SELECTOR = 'script[type="application/ld+json"]'.freeze

  # Attributes that may carry a meta key. OpenGraph uses `property`,
  # Twitter Cards use `name`, and some pages put both on one tag.
  META_KEY_ATTRIBUTES = %w[property name].freeze

  # Parses every JSON-LD script in `doc` and returns the items as a flat
  # Array: top-level arrays and "@graph" arrays are spliced in, any other
  # value is appended as is. Blank or unparseable scripts are skipped.
  def self.json_ld(doc)
    items = []
    doc.css(JSON_LD_SELECTOR).each do |script|
      body = script.text
      next if body.nil? || body.strip.empty?
      begin
        parsed = JSON.parse(body)
      rescue JSON::ParserError
        next
      end
      if parsed.is_a?(Array)
        items.concat(parsed)
      elsif parsed.is_a?(Hash) && parsed["@graph"].is_a?(Array)
        items.concat(parsed["@graph"])
      else
        items << parsed
      end
    end
    items
  end

  # OpenGraph values keyed without the "og:" prefix.
  def self.opengraph(doc)
    collect_meta(doc, prefix: "og:")
  end

  # Twitter Card values keyed without the "twitter:" prefix.
  def self.twitter_card(doc)
    collect_meta(doc, prefix: "twitter:")
  end

  # JSON-LD items, optionally narrowed to those whose "@type" equals `type`
  # (String form) or includes it (Array form). Non-Hash items never match.
  def self.schema_org(doc, type: nil)
    list = json_ld(doc)
    return list if type.nil?
    target = type.to_s
    list.select do |item|
      next false unless item.is_a?(Hash)
      declared = item["@type"]
      case declared
      when String then declared == target
      when Array then declared.include?(target)
      else false
      end
    end
  end

  # Collects <meta> content values whose key starts with `prefix`, keyed by
  # the remainder of the key. The first occurrence of a key wins.
  def self.collect_meta(doc, prefix:)
    found = {}
    doc.css("meta").each do |meta|
      # Look at `property` and `name` independently: a tag such as
      # <meta property="og:title" name="twitter:title"> must be visible
      # under both prefixes, not only under whichever attribute is set first.
      key = META_KEY_ATTRIBUTES
            .map { |attribute| meta.attr(attribute) }
            .find { |candidate| candidate&.start_with?(prefix) }
      next if key.nil?
      val = meta.attr("content")
      next if val.nil?
      short_key = key[prefix.length..]
      found[short_key] = val unless found.key?(short_key)
    end
    found
  end
end
74
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ # Phase 1: in-process registry mapping structural fingerprints to
5
+ # compiled extraction plans (Schema instances). Phase 8 (per plan.md)
6
+ # promotes this to an mmap-backed cross-process store.
7
# In-memory lookup table from a structural fingerprint to the extraction
# plan stored for it.
class TemplateRegistry
  def initialize
    @plans = {}
  end

  # Remembers `plan` under `fingerprint`, replacing any earlier entry.
  # Returns the plan.
  def store(fingerprint, plan)
    @plans.store(fingerprint, plan)
  end

  # The plan stored under `fingerprint`, or nil when there is none.
  def fetch(fingerprint)
    @plans.fetch(fingerprint, nil)
  end

  # Number of fingerprints currently registered.
  def size
    @plans.length
  end
end
24
+ end
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ # Result type for `::text` and `::attr(name)` pseudo-element queries.
5
+ #
6
+ # Scrapy / Parsel-style code expects strings directly from these
7
+ # selectors (`doc.css("h3::text").get`), but Nokogiri-style scrapers
8
+ # routinely chain a `.text` / `.content` accessor onto each result
9
+ # (`doc.css("h3::text").first.text` or `node.at("a::attr(href)").text`).
10
+ # Returning a bare String breaks the Nokogiri-style call path with
11
+ # NoMethodError, even though the String already _is_ the text we
12
+ # would have returned.
13
+ #
14
+ # TextNode is a thin String subclass that closes the gap: it equals,
15
+ # compares, splits, and concatenates exactly like a String, and adds
16
+ # the Node-shaped accessors (`text`, `content`, `inner_text`, `name`,
17
+ # `element?`, `text?`) plus the Parsel-shaped `get` / `getall`. The
18
+ # underlying byte string is the actual text content; the extra methods
19
+ # all return self (or trivial derivatives), so chaining stays cheap.
20
# String subclass handed back for `::text` / `::attr(name)` queries. It is
# the text itself, plus enough of the node interface (readers, predicates,
# navigation, no-op mutators) that code written against elements does not
# raise when it receives one.
class TextNode < String
  # --- text readers: each returns a fresh plain String copy ---------------

  def text
    String.new(self)
  end
  alias inner_text text
  alias content text

  # Parsel-style single-value reader.
  def get
    String.new(self)
  end

  # Parsel-style list reader: a one-element Array.
  def getall
    [String.new(self)]
  end

  # --- node-shape answers for duck-typing checks --------------------------

  def name
    "#text"
  end

  { element?: false, text?: true, comment?: false, document?: false, cdata?: false }.each do |predicate, answer|
    define_method(predicate) { answer }
  end

  def to_html
    to_s
  end
  alias outer_html to_html
  alias inner_html to_html

  # --- mutation API: accepted and deliberately ignored ---------------------
  # A caller that treats every result as an element may write to it; the
  # write is dropped so that a later read still works.

  def inner_html=(_v)
    _v
  end
  alias content= inner_html=

  def []=(*_args)
    nil
  end

  def add_class(_k)
    self
  end

  def remove_class(*_)
    self
  end

  %i[remove unlink].each do |noop|
    define_method(noop) { self }
  end

  # --- navigation -----------------------------------------------------------

  # Element this text/attribute value was read from; nil unless a caller
  # assigned it.
  attr_accessor :parent_node
  alias parent parent_node

  %i[next_sibling previous_sibling next_element_sibling previous_element_sibling].each do |reader|
    define_method(reader) { nil }
  end

  %i[children element_children attribute_nodes keys values classes].each do |reader|
    define_method(reader) { [] }
  end

  def attributes
    {}
  end

  def attribute(_name)
    nil
  end

  def has_class?(_klass)
    false
  end

  # A lone String or Symbol argument is treated as an attribute lookup and
  # answers nil; every other call shape is ordinary String subscripting.
  def [](*args)
    attribute_style = args.size == 1 && [String, Symbol].any? { |kind| args.first.is_a?(kind) }
    return nil if attribute_style
    super
  end

  # --- queries: a text node has nothing beneath it --------------------------

  %i[css search].each do |query|
    define_method(query) { |_selector| [] }
  end

  %i[at_css at].each do |query|
    define_method(query) { |_selector| nil }
  end

  def xpath(*_args)
    []
  end

  def at_xpath(*_args)
    nil
  end

  def inspect
    "#<Scrapetor::TextNode #{super}>"
  end
end
101
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
+ module Scrapetor
6
# URL helpers.
module URL
  # Matches a reference that already carries a scheme and authority marker
  # ("scheme://").
  ABSOLUTE = %r{\A[a-zA-Z][\w+.\-]*://}.freeze

  # Resolves `href` against `base`.
  #
  # href - link target; nil yields nil, anything else is converted with to_s.
  # base - URL the href is relative to; optional.
  #
  # Returns the href unchanged when it is already absolute, when no base
  # is given, or when it cannot be resolved; otherwise the joined URL as a
  # String. Never raises for unparseable input.
  def self.absolute(href, base = nil)
    return nil if href.nil?
    target = href.to_s
    return target if base.nil? || target.match?(ABSOLUTE)
    begin
      URI.join(base.to_s, target).to_s
    rescue URI::Error, ArgumentError
      # URI::Error covers InvalidURIError (unparseable text) and also
      # BadURIError, which URI.join raises when the base itself is
      # relative or empty.
      target
    end
  end
end
21
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Scrapetor
  # Version string of the gem.
  VERSION = "0.2.0".freeze
end