scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,141 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
+ module Scrapetor
6
# HTML form helper. Pulls fields + default values out of a `<form>`
# element, lets the caller override or add values, and submits via
# the form's method/action.
#
#   doc  = Scrapetor::Fetcher.fetch("https://example.com/login")
#   form = Scrapetor::Form.new(doc.at_css("form#login"),
#                              base_url: "https://example.com/login")
#   form["username"] = "alice"
#   form["password"] = "secret"
#   resp = form.submit   # uses Scrapetor::Fetcher unless http: was given
#
# Captures the default value of every named, enabled control
# (<input>, <select>, <textarea>), so pre-loaded fields such as hidden
# CSRF tokens carry forward automatically. Buttons and file inputs are
# NOT included unless explicitly set — the caller decides which submit
# button "fired".
#
# Limitations visible in this class:
# * Fields live in a flat Hash keyed by control name, so when several
#   controls share a name the last one captured wins.
# * Inputs are captured first, then selects, then textareas — not in
#   document order.
# * `attr_reader :method` shadows `Object#method` on Form instances.
class Form
  URLENCODED = "application/x-www-form-urlencoded"
  # Input types that never contribute a default value.
  SKIPPED_INPUT_TYPES = %w[submit button image reset file].freeze
  # Non-standard form methods that #submit forwards with an encoded body.
  BODY_VERBS = %w[PUT PATCH DELETE].freeze
  private_constant :URLENCODED, :SKIPPED_INPUT_TYPES, :BODY_VERBS

  attr_reader :action, :method, :enctype, :fields

  # @param form_node [#[], #css] the <form> element; must answer `[]` for
  #   attributes and `css` for descendant lookup
  # @param base_url [String, nil] URL the form was found on; used to
  #   resolve a relative `action` and as the action when none is given
  # @param http [#get, #post, nil] client used by #submit instead of
  #   Scrapetor::Fetcher
  # @raise [ArgumentError] when form_node is nil
  def initialize(form_node, base_url: nil, http: nil)
    raise ArgumentError, "form_node is required" if form_node.nil?

    @form = form_node
    @base = base_url
    @http = http
    @method = (node_attr(form_node, "method") || "GET").upcase
    @enctype = (node_attr(form_node, "enctype") || URLENCODED).downcase
    # Surrounding whitespace in the attribute would make URI.join fail,
    # so it is trimmed before resolution.
    @action = resolve_action((node_attr(form_node, "action") || "").to_s.strip, base_url)
    @fields = capture_defaults(form_node)
  end

  # Current value of the field, or nil when it is not set.
  def [](name)
    @fields[name.to_s]
  end

  # Sets a field; both name and value are stored as Strings.
  def []=(name, value)
    @fields[name.to_s] = value.to_s
  end

  # Removes a field and returns its previous value (nil if absent).
  def delete(name)
    @fields.delete(name.to_s)
  end

  # Sets every pair of +hash+ through #[]= and returns self.
  def merge!(hash)
    hash.each { |k, v| self[k] = v }
    self
  end

  # Returns a copy of the params Hash that would be submitted: captured
  # defaults plus user overrides. Useful for inspection before #submit.
  def to_h
    @fields.dup
  end

  # Sends the form through the client.
  #
  # @param extra [Hash] additional params for this submission only; keys
  #   are stringified and override fields of the same name
  # @param fetcher_opts [Hash] passed through to the client call
  # @return whatever the client's get/post/put/patch/delete returns
  # @raise [ArgumentError] when the form's method is not GET, POST, PUT,
  #   PATCH or DELETE
  def submit(extra: {}, **fetcher_opts)
    params = @fields.merge(extra.transform_keys(&:to_s))
    client = @http || Scrapetor::Fetcher
    case @method
    when "GET"
      client.get(append_query(@action, params), **fetcher_opts)
    when "POST"
      if @enctype.include?("multipart")
        client.post(@action, multipart: params, **fetcher_opts)
      else
        client.post(@action, form: params, **fetcher_opts)
      end
    when *BODY_VERBS
      # PUT/PATCH/DELETE via form are non-standard but supported.
      headers = (fetcher_opts[:headers] || {}).merge("Content-Type" => URLENCODED)
      client.public_send(@method.downcase.to_sym, @action,
                         body: URI.encode_www_form(params),
                         **fetcher_opts.merge(headers: headers))
    else
      # The method attribute comes from the fetched page. Dispatching an
      # arbitrary value through `send` would let a page pick any method
      # (including private Kernel ones) on the client, so refuse instead.
      raise ArgumentError, "unsupported form method: #{@method.inspect}"
    end
  end

  private

  # Reads an attribute by String key, falling back to the Symbol key.
  def node_attr(node, name)
    node[name] || node[name.to_sym]
  end

  # Works out the submission URL: the base URL when the form has no
  # action, the joined URL when both are present, otherwise the raw
  # action. Any URI error (invalid action, relative base) falls back to
  # the raw action rather than failing construction.
  def resolve_action(raw_action, base_url)
    return base_url if raw_action.empty?
    return raw_action unless base_url

    URI.join(base_url, raw_action).to_s
  rescue URI::Error
    raw_action
  end

  # Builds the name => default-value Hash from the form's controls.
  def capture_defaults(form)
    out = {}
    capture_inputs(form, out)
    capture_selects(form, out)
    capture_textareas(form, out)
    out
  end

  # Name of a control that takes part in submission, or nil when the
  # control is unnamed or disabled (disabled controls are not submitted).
  def control_name(node)
    return nil if node_attr(node, "disabled")

    name = node_attr(node, "name")&.to_s
    name unless name.nil? || name.empty?
  end

  # <input>: buttons and file inputs are skipped; checkboxes and radios
  # contribute only when checked (value defaults to "on"); everything
  # else contributes its value attribute or "".
  def capture_inputs(form, out)
    form.css("input").each do |inp|
      name = control_name(inp)
      next unless name

      type = (node_attr(inp, "type") || "text").to_s.downcase
      next if SKIPPED_INPUT_TYPES.include?(type)

      if %w[checkbox radio].include?(type)
        out[name] = (node_attr(inp, "value") || "on").to_s if node_attr(inp, "checked")
      else
        out[name] = (node_attr(inp, "value") || "").to_s
      end
    end
  end

  # <select>: the first option marked selected, else the first option;
  # its value attribute, else its text. "" when there are no options.
  def capture_selects(form, out)
    form.css("select").each do |sel|
      name = control_name(sel)
      next unless name

      chosen = sel.css("option").find { |o| node_attr(o, "selected") }
      chosen ||= sel.at_css("option")
      out[name] = chosen ? (node_attr(chosen, "value") || chosen.text).to_s : ""
    end
  end

  # <textarea>: the element's text.
  def capture_textareas(form, out)
    form.css("textarea").each do |area|
      name = control_name(area)
      next unless name

      out[name] = area.text.to_s
    end
  end

  # Merges +params+ into the query string of +url+. Existing query pairs
  # whose name is also in +params+ are dropped so the form value wins.
  def append_query(url, params)
    return url if params.empty?

    uri = URI(url)
    kept = uri.query ? URI.decode_www_form(uri.query).reject { |k, _| params.key?(k) } : []
    uri.query = URI.encode_www_form(kept + params.to_a)
    uri.to_s
  end
end
139
+ end
140
+
141
+ require "set"
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "uri"
5
+
6
+ module Scrapetor
7
# Convenience HTTP fetcher built on `Net::HTTP` (Ruby stdlib — no
# external runtime dep).
#
#   doc = Scrapetor.fetch("https://example.com/products")
#   doc.css(".product").map { |p| p.at(".title").text }
#
# Follows 3xx redirects (up to a hop limit), sends a default set of
# request headers, and hands the URL that produced the response to
# `Scrapetor.parse` as `base_url`.
#
# For production scraping you'll usually want a real HTTP client
# (HTTPX, Typhoeus, Faraday) with connection pooling, retries, and
# cookie storage. `Scrapetor.fetch` is intentionally minimal — it's
# here so simple scripts and the CLI don't need extra deps.
module HTTP
  DEFAULT_HEADERS = {
    "User-Agent" => "Scrapetor/#{Scrapetor::VERSION} (+https://scrapetor.org)",
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language" => "en-US,en;q=0.5",
    "Accept-Encoding" => "identity"
  }.freeze

  MAX_REDIRECTS = 5

  class FetchError < Scrapetor::Error; end
  class TooManyRedirects < FetchError; end

  # Performs a GET request, optionally following redirects.
  #
  # @param url [#to_s] absolute http(s) URL
  # @param headers [Hash] extra request headers; they override
  #   DEFAULT_HEADERS of the same name
  # @param follow_redirects [Boolean] when false, a 3xx response is
  #   returned as-is together with the URL that produced it
  # @param max_redirects [Integer] maximum number of redirects followed
  # @param open_timeout [Numeric] seconds, passed to Net::HTTP
  # @param read_timeout [Numeric] seconds, passed to Net::HTTP
  # @return [Response]
  # @raise [FetchError] unsupported scheme (initial URL or redirect
  #   target), redirect without Location, or a non-2xx/3xx status
  # @raise [TooManyRedirects] when more than max_redirects are needed
  def self.get(url, headers: {}, follow_redirects: true, max_redirects: MAX_REDIRECTS, open_timeout: 10, read_timeout: 30)
    uri = URI(url.to_s)
    hops = 0
    loop do
      # Checked on every hop: a redirect may point at ftp:, file:,
      # mailto: … for which no HTTP request can be built.
      ensure_supported_scheme!(uri)
      resp = perform_get(uri, headers, open_timeout, read_timeout)

      case resp
      when Net::HTTPSuccess
        return Response.new(resp, uri)
      when Net::HTTPRedirection
        # Not following: hand back the redirect itself. The target is
        # available to the caller through resp["location"].
        return Response.new(resp, uri) unless follow_redirects
        raise TooManyRedirects, "exceeded #{max_redirects} redirects" if hops >= max_redirects
        raise FetchError, "redirect with no Location header" unless resp["location"]

        # NOTE(review): caller-supplied headers are re-sent on every hop,
        # including hops to a different host — confirm that is intended
        # for credentials such as Authorization or Cookie.
        uri = URI.join(uri.to_s, resp["location"])
        hops += 1
      else
        raise FetchError, "HTTP #{resp.code} #{resp.message} for #{uri}"
      end
    end
  end

  # Fetch + parse + return the result of `Scrapetor.parse`, with
  # `base_url` set to the final URL after redirects.
  def self.fetch(url, **opts)
    resp = get(url, **opts)
    Scrapetor.parse(resp.body, base_url: resp.final_url.to_s)
  end

  # Fetch + parse + extract with the given schema.
  def self.fetch_extract(url, schema, **opts)
    fetch(url, **opts).extract(schema)
  end

  # Raises FetchError unless the URI uses http or https.
  def self.ensure_supported_scheme!(uri)
    return if %w[http https].include?(uri.scheme)

    raise FetchError, "unsupported scheme: #{uri.scheme.inspect}"
  end
  private_class_method :ensure_supported_scheme!

  # Issues one GET over a fresh connection and returns the raw
  # Net::HTTP response.
  def self.perform_get(uri, headers, open_timeout, read_timeout)
    req = Net::HTTP::Get.new(uri.request_uri)
    DEFAULT_HEADERS.each { |k, v| req[k] = v }
    headers.each { |k, v| req[k.to_s] = v.to_s }

    net = Net::HTTP.new(uri.host, uri.port)
    net.use_ssl = (uri.scheme == "https")
    net.open_timeout = open_timeout
    net.read_timeout = read_timeout
    net.start { |h| h.request(req) }
  end
  private_class_method :perform_get

  # Thin wrapper pairing a Net::HTTP response with the URL it came from.
  class Response
    attr_reader :net_response, :final_url

    # @param net_response [Net::HTTPResponse]
    # @param final_url [URI] URL of the request that produced the response
    def initialize(net_response, final_url)
      @net_response = net_response
      @final_url = final_url
    end

    # Response body as returned by Net::HTTP.
    def body
      @net_response.body
    end

    # Status code as an Integer.
    def status
      @net_response.code.to_i
    end

    # All response headers as a Hash (Net::HTTPHeader#to_hash).
    def headers
      @net_response.to_hash
    end

    # Value of a single response header.
    def [](header_name)
      @net_response[header_name]
    end
  end
end
105
+
106
# Module-level shortcut — most users only want this. Forwards the URL and
# every keyword option unchanged to HTTP.fetch and returns its result.
def self.fetch(url, **opts)
  Scrapetor::HTTP.fetch(url, **opts)
end
110
+
111
# Module-level shortcut for fetch + extract. Forwards the URL, the schema
# and every keyword option unchanged to HTTP.fetch_extract.
def self.fetch_extract(url, schema, **opts)
  Scrapetor::HTTP.fetch_extract(url, schema, **opts)
end
114
+ end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
# Microdata extractor (HTML5 itemscope / itemprop / itemtype).
#
# Walks the DOM looking for itemscope elements and emits a nested
# hash structure of items + properties:
#
#   {
#     "type" => "https://schema.org/Product",   # from itemtype
#     "id" => "...",                            # from itemid
#     "properties" => {
#       "name" => "Widget",
#       "price" => "19.99",
#       "offer" => { "type" => "https://schema.org/Offer", ... }
#     }
#   }
#
# "type" and "id" are only present when the element has the attribute.
# A property name that occurs more than once yields an Array of values.
# `itemref` is not handled.
module Microdata
  # Returns the Array of top-level items found under +doc+.
  #
  # An itemscope element nested inside another item is skipped only when
  # it also carries an itemprop: then it is emitted as that property's
  # value. A nested itemscope WITHOUT itemprop is not reachable through
  # any property, so it is reported as an item of its own instead of
  # being lost.
  #
  # @param doc [#css] document or node answering `css`
  # @return [Array<Hash>]
  def self.extract(doc)
    items = []
    doc.css("[itemscope]").each do |node|
      next if has_itemscope_ancestor?(node) && !property_names(node).empty?

      items << build_item(node)
    end
    items
  end

  # True when any ancestor of +node+ has an itemscope attribute.
  def self.has_itemscope_ancestor?(node)
    ancestor = node.parent
    while ancestor
      return true if ancestor.respond_to?(:[]) && ancestor["itemscope"]

      ancestor = ancestor.respond_to?(:parent) ? ancestor.parent : nil
    end
    false
  end

  # Builds the Hash for one itemscope element.
  def self.build_item(node)
    item = {}
    item["type"] = node["itemtype"] if node["itemtype"]
    item["id"] = node["itemid"] if node["itemid"]
    props = {}
    gather_props(node, props)
    item["properties"] = props
    item
  end

  # Fills +props+ with the properties that belong directly to +scope+.
  def self.gather_props(scope, props)
    scope.css("[itemprop]").each do |el|
      # An itemprop below a nested itemscope belongs to that nested
      # item, not to this one.
      next if descendant_of_nested_itemscope?(el, scope)

      names = property_names(el)
      next if names.empty?

      value = property_value(el)
      names.each do |n|
        if props.key?(n)
          # Second occurrence of a name: promote the value to an Array.
          props[n] = [props[n]] unless props[n].is_a?(Array)
          props[n] << value
        else
          props[n] = value
        end
      end
    end
  end

  # True when an itemscope element sits strictly between +el+ and
  # +scope+ in the ancestor chain.
  def self.descendant_of_nested_itemscope?(el, scope)
    cur = el.parent
    while cur && cur != scope
      return true if cur.respond_to?(:[]) && cur["itemscope"]

      cur = cur.respond_to?(:parent) ? cur.parent : nil
    end
    false
  end

  # Value of a property element: a nested item Hash when the element is
  # itself an itemscope, otherwise the attribute that carries the value
  # for its tag, otherwise its whitespace-collapsed text.
  def self.property_value(el)
    return build_item(el) if el["itemscope"]

    tag = el.respond_to?(:name) ? el.name.to_s.downcase : ""
    case tag
    when "meta" then el["content"]
    when "audio", "embed", "iframe", "img", "source", "track", "video"
      el["src"]
    when "a", "area", "link" then el["href"]
    when "object" then el["data"]
    when "data", "meter" then el["value"] || el.text
    when "time" then el["datetime"] || el.text
    else
      el.text.to_s.gsub(/\s+/, " ").strip
    end
  end

  # Whitespace-separated names listed in the element's itemprop
  # attribute ([] when the attribute is missing or blank).
  def self.property_names(el)
    (el["itemprop"] || "").split(/\s+/).reject(&:empty?)
  end
  private_class_method :property_names
end
98
+
99
# RDFa extractor — minimal implementation covering the typical
# subset used on the web (typeof, about/resource, property and the
# attributes that carry a property's value).
#
# Every element with a `typeof` attribute becomes one item:
#
#   { "type" => ..., "about" => ..., "properties" => { name => value } }
#
# Properties are gathered from everything `css("[property]")` returns
# for the item's element, with no filtering — so, assuming `css` yields
# all descendants, properties of a nested `typeof` element are reported
# on the enclosing item as well as on the nested one.
module RDFa
  # Attributes that carry a property's value, in lookup order: an
  # explicit literal (content, datetime) first, then a resource
  # reference (resource, href, src). The element's text is the fallback.
  VALUE_ATTRIBUTES = %w[content datetime resource href src].freeze

  # @param doc [#css] document or node answering `css`
  # @return [Array<Hash>] one Hash per element with a typeof attribute
  def self.extract(doc)
    out = []
    doc.css("[typeof]").each do |node|
      out << {
        "type" => node["typeof"],
        "about" => node["about"] || node["resource"],
        "properties" => collect_props(node)
      }
    end
    out
  end

  # Collects name => value for the elements `scope.css("[property]")`
  # returns. A name seen more than once yields an Array of values.
  def self.collect_props(scope)
    props = {}
    scope.css("[property]").each do |el|
      names = (el["property"] || "").split(/\s+/).reject(&:empty?)
      next if names.empty?

      value = property_value(el)
      names.each do |n|
        if props.key?(n)
          props[n] = [props[n]] unless props[n].is_a?(Array)
          props[n] << value
        else
          props[n] = value
        end
      end
    end
    props
  end

  # Value of one property element. Elements such as
  # <link property href>, <img property src> or <time property datetime>
  # have no useful text; their value lives in an attribute.
  def self.property_value(el)
    VALUE_ATTRIBUTES.each do |attr|
      value = el[attr]
      return value if value
    end
    el.text.to_s.strip
  end
  private_class_method :property_value
end
132
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
# Best-effort parser that pulls the first number out of a price-like
# string and returns it as a Float, guessing whether "." and "," are
# decimal or grouping separators.
module Money
  NUMERIC = /-?\d[\d.,]*/.freeze
  THOUSAND_GROUPED_COMMAS = /\A-?\d{1,3}(?:,\d{3})+\z/.freeze
  THOUSAND_GROUPED_DOTS = /\A-?\d{1,3}(?:\.\d{3})+\z/.freeze
  # Punctuation that NUMERIC picks up after the last digit, e.g. the
  # comma in "1,234, plus tax".
  TRAILING_SEPARATORS = /[.,]+\z/.freeze

  # Parses the first numeric run in +s+.
  #
  #   Money.parse("$1,234.56")   # => 1234.56
  #   Money.parse("1.234,56 €")  # => 1234.56
  #   Money.parse("19,99")       # => 19.99
  #   Money.parse("1,234")       # => 1234.0
  #
  # @param s [#to_s, nil]
  # @return [Float, nil] nil when +s+ is nil or contains no digit
  def self.parse(s)
    return nil if s.nil?

    m = s.to_s.match(NUMERIC)
    return nil unless m

    # Sentence punctuation after the number must not take part in the
    # separator analysis below (the grouping patterns are anchored).
    normalize(m[0].sub(TRAILING_SEPARATORS, "")).to_f
  end

  # Rewrites +num+ so that "." is the only separator left and it is the
  # decimal point.
  def self.normalize(num)
    dots = num.count(".")
    commas = num.count(",")
    if dots.positive? && commas.positive?
      # Both present: whichever comes last is the decimal separator.
      num.rindex(".") > num.rindex(",") ? num.delete(",") : num.delete(".").tr(",", ".")
    elsif commas > 1
      # A number has at most one decimal separator, so repeated commas
      # are grouping ("1,234,567", "1,23,456").
      num.delete(",")
    elsif commas == 1
      THOUSAND_GROUPED_COMMAS.match?(num) ? num.delete(",") : num.tr(",", ".")
    elsif dots > 1 && THOUSAND_GROUPED_DOTS.match?(num)
      num.delete(".")
    else
      # Single dot, no separator, or repeated dots that are not clean
      # thousands groups: left as-is (to_f reads up to the second dot).
      num
    end
  end
  private_class_method :normalize
end
30
+ end