scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +242 -0
  3. data/LICENSE +21 -0
  4. data/README.md +440 -0
  5. data/bin/scrapetor +190 -0
  6. data/bin/scrapetor-bench +5 -0
  7. data/ext/scrapetor/README.md +53 -0
  8. data/ext/scrapetor/native/extconf.rb +67 -0
  9. data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
  10. data/ext/scrapetor/native/scrapetor_http.c +2591 -0
  11. data/ext/scrapetor/native/scrapetor_native.c +1156 -0
  12. data/lib/scrapetor/builder.rb +158 -0
  13. data/lib/scrapetor/cleaner.rb +10 -0
  14. data/lib/scrapetor/comment_node.rb +67 -0
  15. data/lib/scrapetor/document.rb +457 -0
  16. data/lib/scrapetor/dom/parser.rb +69 -0
  17. data/lib/scrapetor/dom/selectors.rb +208 -0
  18. data/lib/scrapetor/dom.rb +563 -0
  19. data/lib/scrapetor/encoding.rb +85 -0
  20. data/lib/scrapetor/entities.rb +90 -0
  21. data/lib/scrapetor/errors.rb +12 -0
  22. data/lib/scrapetor/extractor.rb +147 -0
  23. data/lib/scrapetor/fetcher.rb +390 -0
  24. data/lib/scrapetor/fingerprint.rb +29 -0
  25. data/lib/scrapetor/form.rb +141 -0
  26. data/lib/scrapetor/http.rb +114 -0
  27. data/lib/scrapetor/microdata.rb +132 -0
  28. data/lib/scrapetor/money.rb +30 -0
  29. data/lib/scrapetor/native.rb +291 -0
  30. data/lib/scrapetor/native_dom.rb +2258 -0
  31. data/lib/scrapetor/node.rb +539 -0
  32. data/lib/scrapetor/node_set.rb +301 -0
  33. data/lib/scrapetor/page_type.rb +95 -0
  34. data/lib/scrapetor/pagination.rb +109 -0
  35. data/lib/scrapetor/persistent_cache.rb +130 -0
  36. data/lib/scrapetor/robots.rb +159 -0
  37. data/lib/scrapetor/sax.rb +285 -0
  38. data/lib/scrapetor/schema.rb +144 -0
  39. data/lib/scrapetor/selector.rb +576 -0
  40. data/lib/scrapetor/session.rb +141 -0
  41. data/lib/scrapetor/sitemap.rb +52 -0
  42. data/lib/scrapetor/stream.rb +111 -0
  43. data/lib/scrapetor/structured_data.rb +74 -0
  44. data/lib/scrapetor/template_registry.rb +24 -0
  45. data/lib/scrapetor/text_node.rb +101 -0
  46. data/lib/scrapetor/url.rb +21 -0
  47. data/lib/scrapetor/version.rb +5 -0
  48. data/lib/scrapetor/xpath.rb +1603 -0
  49. data/lib/scrapetor.rb +167 -0
  50. data/scrapetor.gemspec +77 -0
  51. metadata +200 -0
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+
5
+ module Scrapetor
6
# robots.txt parser + path-match decider.
#
#   r = Scrapetor::Robots.fetch_for("https://example.com")
#   r.allowed?("https://example.com/private")
#   r.crawl_delay
#   r.sitemaps
#
# Implements the de-facto Google / RFC 9309 longest-match semantics:
# the most-specific (longest pattern) Allow/Disallow rule wins, and when
# an Allow and a Disallow pattern of equal length both match, Allow wins.
# User-agent matching is case-insensitive prefix; '*' is the fallback.
class Robots
  Rule = Struct.new(:type, :pattern) # type: :allow or :disallow

  # @return [Array<String>] values of every non-empty `Sitemap:` line
  attr_reader :sitemaps

  # @param body [#to_s] raw robots.txt content (nil is treated as empty)
  # @param user_agent [String] agent name the decisions are made for
  def initialize(body, user_agent: "*")
    @ua = user_agent
    @groups = {}   # ua_pattern (lowercased) => Array<Rule>
    @delays = {}   # ua_pattern => Float
    @sitemaps = []
    parse!(body.to_s)
  end

  # Decide whether the configured user agent may fetch +url+.
  #
  # @param url [#to_s] an absolute URL, or a path starting with "/"
  # @return [Boolean] true when no rule matches or the winning rule is Allow
  # @raise [URI::InvalidURIError] when +url+ is not a path and cannot be parsed
  def allowed?(url)
    path = request_path(url)
    rules = applicable_rules
    return true if rules.empty?

    # Longest matching pattern wins; on equal length an Allow rule beats
    # a Disallow rule (RFC 9309 "least restrictive" tie-break), regardless
    # of the order the rules appear in the file.
    winner = rules
             .select { |rule| path_matches?(path, rule.pattern) }
             .max_by { |rule| [rule.pattern.length, rule.type == :allow ? 1 : 0] }
    winner.nil? || winner.type == :allow
  end

  # @return [Boolean] the negation of {#allowed?}
  def disallowed?(url)
    !allowed?(url)
  end

  # @return [Float, nil] Crawl-delay of the matching group, else of the
  #   '*' group, else nil
  def crawl_delay
    @delays[ua_for(@ua)] || @delays["*"]
  end

  # Fetch "<origin>/robots.txt" through Scrapetor::Fetcher and parse it.
  # Any status other than 200 is treated as an empty file (allow all).
  #
  # @param origin [#to_s] any URL on the target site
  # @param user_agent [String] agent name used for rule matching
  # @param opts [Hash] forwarded to Scrapetor::Fetcher.get
  # @return [Robots]
  def self.fetch_for(origin, user_agent: "*", **opts)
    uri = URI(origin.to_s)
    port = uri.port == uri.default_port ? "" : ":#{uri.port}"
    url = "#{uri.scheme}://#{uri.host}#{port}/robots.txt"
    resp = Scrapetor::Fetcher.get(url, raise_for_status: false, **opts)
    body = resp[:status] == 200 ? resp[:body] : ""
    new(body, user_agent: user_agent)
  end

  private

  # Reduce +url+ to the "path?query" form that rules are matched against.
  def request_path(url)
    s = url.to_s
    return s if s.start_with?("/")

    uri = URI(s)
    path = uri.path.to_s # nil for opaque URIs such as mailto:
    path = "/" if path.empty?
    uri.query ? "#{path}?#{uri.query}" : path
  end

  def applicable_rules
    @groups[ua_for(@ua)] || @groups["*"] || []
  end

  # Pick the most-specific UA group whose name is a case-insensitive
  # prefix of +ua+, or '*' as fallback.
  def ua_for(ua)
    ua_lc = ua.to_s.downcase
    best = @groups.each_key
                  .select { |key| key != "*" && ua_lc.start_with?(key) }
                  .max_by(&:length)
    best || "*"
  end

  # robots.txt allows '*' wildcards and a trailing '$' end-anchor inside
  # patterns. Each pattern is compiled to a Regexp once and memoized.
  def path_matches?(path, pattern)
    pattern_cache(pattern).match?(path)
  end

  def pattern_cache(pattern)
    @pattern_cache ||= {}
    @pattern_cache[pattern] ||= compile_pattern(pattern)
  end

  # Translate a robots.txt path pattern into a Regexp anchored at the
  # start of the path. '$' is only special as the final character.
  def compile_pattern(pattern)
    last = pattern.length - 1
    buf = +"\\A"
    pattern.each_char.with_index do |ch, i|
      buf << if ch == "*"
               ".*"
             elsif ch == "$" && i == last
               "\\z"
             else
               Regexp.escape(ch)
             end
    end
    Regexp.new(buf)
  end

  # Populate @groups, @delays and @sitemaps from the file body.
  #
  # Grouping: a run of consecutive User-agent lines opens ONE group
  # shared by all named agents; the first Allow/Disallow/Crawl-delay
  # line closes the run, so the next User-agent line starts a new group.
  # Every named agent is registered even when its group carries no
  # rules, so "User-agent: x / Disallow:" means "x may fetch anything"
  # instead of silently falling back to the '*' group.
  def parse!(body)
    current_uas = []
    extending = true # still inside a run of User-agent lines?
    body.each_line do |raw|
      # Drop the line terminator first so the \z-anchored comment strip
      # sees the real end of the line.
      line = raw.chomp.sub(/#.*\z/, "").strip
      next if line.empty?

      key, val = line.split(":", 2)
      next unless val

      key = key.strip.downcase
      val = val.strip
      case key
      when "user-agent"
        unless extending
          current_uas = []
          extending = true
        end
        next if val.empty?

        ua = val.downcase
        current_uas << ua unless current_uas.include?(ua)
        @groups[ua] ||= []
      when "allow", "disallow"
        extending = false
        # An empty value carries no pattern ("Disallow:" == allow all).
        next if val.empty?

        rule = Rule.new(key.to_sym, val)
        current_uas.each { |u| @groups[u] << rule }
      when "crawl-delay"
        extending = false
        delay = val.to_f
        current_uas.each { |u| @delays[u] = delay } if delay > 0
      when "sitemap"
        @sitemaps << val unless val.empty?
      end
    end
  end
end
159
+ end
@@ -0,0 +1,285 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
+ # Pure-Ruby SAX-style streaming HTML parser.
5
+ #
6
+ # The hot path for production extraction is the C streaming engine
7
+ # behind `doc.extract`. This module exists for the cases where you
8
+ # genuinely want token-by-token control — debugging, custom incremental
9
+ # processors, conversion to other formats.
10
+ #
11
+ # Usage:
12
+ #
13
+ # class MyHandler < Scrapetor::SAX::Document
14
+ # def start_element(name, attrs); puts "<#{name}>"; end
15
+ # def end_element(name); puts "</#{name}>"; end
16
+ # def characters(text); puts text; end
17
+ # def comment(text); puts "<!--#{text}-->"; end
18
+ # def doctype(name); puts "<!DOCTYPE #{name}>"; end
19
+ # end
20
+ #
21
+ # Scrapetor::SAX::Parser.new(MyHandler.new).parse(html)
22
+ module SAX
23
# Base handler for SAX callbacks. Every callback is a no-op returning
# nil; subclass and override only the events you care about.
class Document
  # Called once before any other event.
  def start_document
  end

  # Called once after the last event.
  def end_document
  end

  # @param name [String] tag name
  # @param attrs [Hash] attribute name => value
  def start_element(name, attrs)
  end

  # @param name [String] tag name
  def end_element(name)
  end

  # @param text [String] text content
  def characters(text)
  end

  # @param text [String] comment body
  def comment(text)
  end

  # @param name [String] doctype name
  def doctype(name)
  end

  # @param text [String] CDATA section content
  def cdata_block(text)
  end

  # @param msg [String] error description
  def error(msg)
  end

  # @param msg [String] warning description
  def warning(msg)
  end
end
36
+
37
# Drives a handler object from the Tokenizer event stream.
class Parser
  # @param handler [Object] receiver of the SAX callbacks
  def initialize(handler)
    @handler = handler
  end

  # Tokenize +html+ and forward each event to the matching handler
  # callback. Event types without a callback are ignored.
  #
  # @param html [String] markup to tokenize
  # @return [Parser] self
  def parse(html)
    sink = @handler
    Tokenizer.new(html).each_event do |kind, first, second|
      case kind
      when :doc_start
        sink.start_document
      when :doc_end
        sink.end_document
      when :start
        sink.start_element(first, second)
      when :end
        sink.end_element(first)
      when :text
        sink.characters(first)
      when :comment
        sink.comment(first)
      when :doctype
        sink.doctype(first)
      when :cdata
        sink.cdata_block(first)
      end
    end
    self
  end

  # Read the file at +path+ and parse its content.
  def parse_file(path)
    content = File.read(path)
    parse(content)
  end

  # Read everything from +io+ and parse it.
  def parse_io(io)
    content = io.read
    parse(content)
  end
end
67
+
68
# Standalone tokenizer — yields events without going through a handler.
# Useful when you just want an enumerator:
#
#   Scrapetor::SAX::Tokenizer.new(html).each_event do |type, *args|
#     # ...
#   end
#
# Events are Arrays: [:doc_start], [:doc_end], [:start, name, attrs],
# [:end, name], [:text, text], [:comment, text], [:doctype, name] and
# [:cdata, text]. Tag and attribute names are downcased.
#
# All positions are BYTE offsets. Delimiter searches therefore run on a
# binary view of the document (where character index == byte index),
# while emitted strings are sliced from the original so they keep its
# encoding.
class Tokenizer
  VOID = %w[
    area base br col embed hr img input link meta source track wbr
  ].freeze
  RAW_TEXT = %w[script style].freeze

  # Case-insensitive "</name" matchers for raw-text elements, built once
  # instead of downcasing the whole document per element.
  RAW_TEXT_CLOSE = RAW_TEXT.map { |tag| [tag, /<\/#{tag}/i] }.to_h.freeze
  DOCTYPE_RE = /\A\s*DOCTYPE\b\s*([^\s>]+)?/i
  COMMENT_OPEN = "<!--"
  CDATA_OPEN = "<![CDATA["
  private_constant :RAW_TEXT_CLOSE, :DOCTYPE_RE, :COMMENT_OPEN, :CDATA_OPEN

  # @param html [String] markup; passed through Scrapetor::Encoding.to_utf8
  def initialize(html)
    @html = Scrapetor::Encoding.to_utf8(html)
    @bytes = @html.b # binary view used only for byte-offset searches
    @pos = 0
    @len = @html.bytesize
  end

  # Yield every event of the document in order. Without a block an
  # Enumerator is returned. Each call restarts from the beginning.
  #
  # @yieldparam event [Array] see the class comment for the shapes
  # @return [Tokenizer, Enumerator]
  def each_event(&block)
    return enum_for(:each_event) unless block_given?

    @pos = 0
    block.call([:doc_start])
    while @pos < @len
      if byte(@pos) == 0x3C # '<'
        handle_open(&block)
      else
        handle_text(&block)
      end
    end
    block.call([:doc_end])
    self
  end

  private

  def byte(i)
    @html.getbyte(i)
  end

  # Bytes s...e of the document as a String ("" when out of range).
  def slice(s, e)
    @html.byteslice(s, e - s) || ""
  end

  # True when the document continues with +literal+ at the cursor.
  def lookahead?(literal)
    @bytes.byteslice(@pos, literal.bytesize) == literal
  end

  # Move the cursor just past the next '>' (or to EOF when there is none).
  def skip_past_gt
    gt = @bytes.index(">", @pos)
    @pos = gt ? gt + 1 : @len
  end

  # Consume a run of tag-name bytes and return it downcased.
  def read_name
    start = @pos
    @pos += 1 while @pos < @len && name_char?(byte(@pos))
    slice(start, @pos).downcase
  end

  def handle_text(&block)
    start = @pos
    lt = @bytes.index("<", @pos)
    @pos = lt || @len
    text = slice(start, @pos)
    block.call([:text, text]) unless text.empty?
  end

  # Dispatch on the byte after '<'. Every branch advances the cursor,
  # including a '<' that is the very last byte of the document.
  def handle_open(&block)
    nxt = byte(@pos + 1) # nil when '<' is the final byte
    if nxt == 0x21 # '!'
      handle_declaration(&block)
    elsif nxt == 0x2F # '/'
      handle_end_tag(&block)
    elsif nxt && name_start?(nxt)
      handle_start_tag(&block)
    else
      # Literal '<' followed by non-name (or EOF) — emit as text
      block.call([:text, "<"])
      @pos += 1
    end
  end

  # "<!" constructs: comment, CDATA section, doctype or bogus declaration.
  def handle_declaration(&block)
    if lookahead?(COMMENT_OPEN)
      emit_section(:comment, COMMENT_OPEN.bytesize, "-->", &block)
    elsif lookahead?(CDATA_OPEN)
      emit_section(:cdata, CDATA_OPEN.bytesize, "]]>", &block)
    else
      gt = @bytes.index(">", @pos)
      if gt.nil?
        @pos = @len
        return
      end
      m = DOCTYPE_RE.match(slice(@pos + 2, gt))
      block.call([:doctype, (m[1] || "").downcase]) if m
      @pos = gt + 1
    end
  end

  # Emit the content between an opener of +open_len+ bytes at the cursor
  # and the next +closer+. An unterminated section discards the rest of
  # the input without emitting an event.
  def emit_section(type, open_len, closer, &block)
    start = @pos + open_len
    close = @bytes.index(closer, start)
    if close.nil?
      @pos = @len
      return
    end
    block.call([type, slice(start, close)])
    @pos = close + closer.bytesize
  end

  def handle_end_tag(&block)
    @pos += 2
    name = read_name
    skip_past_gt
    block.call([:end, name]) unless name.empty?
  end

  def handle_start_tag(&block)
    @pos += 1
    name = read_name
    attrs = parse_attrs
    self_closing = consume_close
    block.call([:start, name, attrs])
    if self_closing || VOID.include?(name)
      block.call([:end, name])
    elsif RAW_TEXT.include?(name)
      emit_raw_text(name, &block)
    end
  end

  # Raw text content until the matching </name (case-insensitive). When
  # the element is never closed the rest of the input is its text and no
  # :end event is emitted.
  def emit_raw_text(name, &block)
    text_start = @pos
    close = @bytes.index(RAW_TEXT_CLOSE.fetch(name), @pos) || @len
    block.call([:text, slice(text_start, close)]) if close > text_start
    @pos = close
    return if @pos >= @len

    skip_past_gt # consume </name ... >
    block.call([:end, name])
  end

  # Parse attributes up to (not including) the closing '>' or '/>'.
  # A later duplicate attribute overwrites an earlier one.
  def parse_attrs
    attrs = {}
    while @pos < @len
      skip_ws
      break if @pos >= @len

      ch = byte(@pos)
      break if ch == 0x3E # '>'

      if ch == 0x2F # '/'
        nxt = byte(@pos + 1)
        break if nxt.nil? || nxt == 0x3E # real self-closing marker

        @pos += 1 # stray solidus inside the tag: ignore it
        next
      end

      name_start = @pos
      # A stray '=' where a name is expected becomes the first byte of
      # the name, so the cursor always advances.
      @pos += 1 if ch == 0x3D
      @pos += 1 while @pos < @len && !attr_name_end?(byte(@pos))
      aname = slice(name_start, @pos).downcase
      skip_ws
      attrs[aname] = read_attr_value
    end
    attrs
  end

  def attr_name_end?(b)
    b == 0x3D || b == 0x3E || b == 0x2F || ws?(b)
  end

  # Read "= value" at the cursor; returns "" when there is no value.
  def read_attr_value
    return "" unless @pos < @len && byte(@pos) == 0x3D

    @pos += 1
    skip_ws
    return "" if @pos >= @len

    quote = byte(@pos)
    if quote == 0x22 || quote == 0x27
      @pos += 1
      start = @pos
      @pos += 1 while @pos < @len && byte(@pos) != quote
      value = slice(start, @pos)
      @pos += 1 if @pos < @len # closing quote
      value
    else
      start = @pos
      @pos += 1 while @pos < @len && !ws?(byte(@pos)) && byte(@pos) != 0x3E
      slice(start, @pos)
    end
  end

  # Consume the end of a start tag; true when it ended with '/'.
  def consume_close
    self_closing = false
    if @pos < @len && byte(@pos) == 0x2F
      self_closing = true
      @pos += 1
    end
    skip_past_gt
    self_closing
  end

  def skip_ws
    @pos += 1 while @pos < @len && ws?(byte(@pos))
  end

  def name_start?(b)
    (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A) || b == 0x5F
  end

  def name_char?(b)
    (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A) ||
      (b >= 0x30 && b <= 0x39) || b == 0x2D || b == 0x5F || b == 0x3A
  end

  def ws?(b)
    b == 0x20 || b == 0x09 || b == 0x0A || b == 0x0D || b == 0x0C || b == 0x0B
  end
end
284
+ end
285
+ end
@@ -0,0 +1,144 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Scrapetor
4
# Declarative extraction schema: a list of fields plus nested repeated
# groups, built through a small DSL (see .build, #field, #repeated).
class Schema
  Field = Struct.new(
    :name, :selector, :attr, :attr_str, :type, :clean, :multi,
    :normalize_url, :default, :required, :transform, :delimiter
  )
  Group = Struct.new(:name, :selector, :fields, :groups)

  # @return [Array<Field>], [Array<Group>]
  attr_reader :fields, :groups

  def initialize
    @fields = []
    @groups = []
  end

  # Build a schema by evaluating +block+ against a fresh instance, so
  # `field` / `repeated` can be called without a receiver.
  #
  # @return [Schema]
  def self.build(&block)
    schema = new
    schema.instance_eval(&block) if block
    schema
  end

  # field :name, from: SELECTOR, attr: SYM, type: SYM,
  #       clean: BOOL, multi: BOOL, normalize_url: BOOL,
  #       default: VALUE, required: BOOL,
  #       transform: PROC, delimiter: STRING_OR_REGEX
  #
  # from: may be a String selector or an Array of selectors (tried in
  # order until one matches).
  #
  # Types: :text :integer :float :money :url :date :json :html :list
  #        :boolean :array (alias for multi:true)
  def field(name,
            from:,
            attr: nil,
            type: :text,
            clean: false,
            multi: false,
            normalize_url: false,
            default: nil,
            required: false,
            transform: nil,
            delimiter: /\s*,\s*/)
    # :array is sugar for a multi-valued :text field.
    if type == :array
      multi = true
      type = :text
    end
    @fields << Field.new(
      name, from, attr, attr && attr.to_s, type, clean, multi,
      normalize_url, default, required, transform, delimiter
    )
  end

  # Declare a repeated group: +block+ is evaluated as a nested schema
  # whose fields and groups are stored under +as+ for +selector+.
  def repeated(selector, as:, &block)
    sub = self.class.build(&block)
    @groups << Group.new(as, selector, sub.fields, sub.groups)
  end

  # ----- Cross-process plan cache -----
  #
  # Serialize a schema to a binary blob (Marshal) so a worker can
  # restore the compiled descriptor without re-parsing the Ruby DSL.
  # Schemas using `transform:` (procs) can't be dumped — those plans
  # must be rebuilt from source.

  # @return [String] Marshal blob of {#to_h}
  # @raise [SchemaError] when any field carries a transform
  def dump
    Marshal.dump(self.class.dumpable(self))
  end

  # Restore a schema from a blob produced by {#dump}.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects. Only pass
  # blobs this process (or a trusted peer) wrote; never load a blob that
  # came from user input or an untrusted cache.
  #
  # @raise [SchemaError] when the blob does not decode to a schema Hash
  def self.load(blob)
    new_from_h(Marshal.load(blob)) # rubocop:disable Security/MarshalLoad
  end

  # Write +schema+'s dump to +path+ in binary mode.
  # @return [String] path
  def self.dump_to_file(schema, path)
    File.binwrite(path, schema.dump)
    path
  end

  # Read a dump from +path+. The same trust requirement as .load applies
  # to the file's content.
  def self.load_file(path)
    load(File.binread(path))
  end

  # Convert a schema to a portable Hash (no procs).
  def self.dumpable(schema)
    {
      fields: schema.fields.map { |f| field_to_h(f) },
      groups: schema.groups.map { |g| group_to_h(g) }
    }
  end

  # @raise [SchemaError] when the field has a transform
  def self.field_to_h(f)
    raise SchemaError, "transform: blocks can't be serialized" if f.transform

    {
      name: f.name,
      selector: f.selector,
      attr: f.attr,
      attr_str: f.attr_str,
      type: f.type,
      clean: f.clean,
      multi: f.multi,
      normalize_url: f.normalize_url,
      default: f.default,
      required: f.required,
      delimiter: f.delimiter
    }
  end

  def self.group_to_h(g)
    {
      name: g.name,
      selector: g.selector,
      fields: g.fields.map { |f| field_to_h(f) },
      groups: g.groups.map { |sub| group_to_h(sub) }
    }
  end

  # Rebuild a schema from the Hash shape produced by .dumpable.
  # A missing :groups key is treated as "no groups".
  #
  # @raise [SchemaError] when +h+ is not a Hash with a :fields Array
  def self.new_from_h(h)
    unless h.is_a?(Hash) && h[:fields].is_a?(Array)
      raise SchemaError, "malformed schema payload (expected Hash with :fields)"
    end

    schema = new
    h[:fields].each { |fh| schema.fields << field_from_h(fh) }
    Array(h[:groups]).each { |gh| schema.groups << group_from_h(gh) }
    schema
  end

  # Transforms are never serialized, so the restored field has none.
  def self.field_from_h(h)
    Field.new(
      h[:name], h[:selector], h[:attr], h[:attr_str], h[:type],
      h[:clean], h[:multi], h[:normalize_url], h[:default],
      h[:required], nil, h[:delimiter]
    )
  end

  def self.group_from_h(h)
    Group.new(
      h[:name],
      h[:selector],
      Array(h[:fields]).map { |fh| field_from_h(fh) },
      Array(h[:groups]).map { |gh| group_from_h(gh) }
    )
  end

  # @return [Hash] same shape as .dumpable
  def to_h
    self.class.dumpable(self)
  end
end
144
+ end