scrapi 1.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,22 @@
1
+ Version 1.1.2 (August 13, 2006)
2
+
3
+ * Changed: Allows multiple :not pseudo classes to be used with the same
4
+ element (meaning, select where none of the negators match).
5
+ * Fixed: first-of-type, last-of-type.
6
+
7
+ Version 1.1.1 (August 8, 2006)
8
+
9
+ * Added: select() method to each element, that selects from that element.
10
+ * Fixed: Inheritance bug resulting in infinite loop. Credit: Andrew Turner
11
+
12
+ Version 1.1.0 (July 26, 2006)
13
+
14
+ * Added: CSS 3 pseudo classes. nth-child, first-child, not, empty, etc.
15
+ * Added: Quoted attribute values.
16
+ * Added: Gem.
17
+ * Fixed: Group selectors not parsing correctly.
18
+ * Fixed: Case sensitive (shouldn't be).
19
+
20
+ Version 1.0.0 (July 11, 2006)
21
+
22
+ * First release.
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2006 Assaf Arkin
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,88 @@
1
+ == ScrAPI toolkit for Ruby
2
+
3
+ A framework for writing scrapers using CSS selectors and simple
4
+ select => extract => store processing rules.
5
+
6
+ Here's an example that scrapes auctions from eBay:
7
+
8
+ ebay_auction = Scraper.define do
9
+ process "h3.ens>a", :description=>:text,
10
+ :url=>"@href"
11
+ process "td.ebcPr>span", :price=>:text
12
+ process "div.ebPicture >a>img", :image=>"@src"
13
+
14
+ result :description, :url, :price, :image
15
+ end
16
+
17
+ ebay = Scraper.define do
18
+ array :auctions
19
+
20
+ process "table.ebItemlist tr.single",
21
+ :auctions => ebay_auction
22
+
23
+ result :auctions
24
+ end
25
+
26
+ And using the scraper:
27
+
28
+ auctions = ebay.scrape(html)
29
+
30
+ # No. of auctions found
31
+ puts auctions.size
32
+
33
+ # First auction:
34
+ auction = auctions[0]
35
+ puts auction.description
36
+ puts auction.url
37
+
38
+
39
+ To get the latest source code with regular updates:
40
+
41
+ svn co http://labnotes.org/svn/public/ruby/scrapi
42
+
43
+
44
+ == Using TIDY
45
+
46
+ By default scrAPI uses Tidy to cleanup the HTML.
47
+
48
+ You need to install the Tidy Gem for Ruby:
49
+ gem install tidy
50
+
51
+ And the Tidy binary libraries, available here:
52
+
53
+ http://tidy.sourceforge.net/
54
+
55
+ By default scrAPI looks for the Tidy DLL (Windows) or shared library (Linux) in the directory lib/tidy. That's one place to put the Tidy library.
56
+
57
+ Alternatively, just point Tidy to the library with:
58
+
59
+ Tidy.path = "...."
60
+
61
+ On Linux this would probably be:
62
+
63
+ Tidy.path = "/usr/local/lib/libtidy.so"
64
+
65
+ On OS/X this would probably be:
66
+
67
+ Tidy.path = "/usr/lib/libtidy.dylib"
68
+
69
+ For testing purposes, you can also use the built-in HTML parser. It's useful for testing and getting to grips with scrAPI, but it doesn't deal well with broken HTML. So for testing only:
70
+
71
+ Scraper::Base.parser :html_parser
72
+
73
+
74
+ == License
75
+
76
+ Copyright (c) 2006 Assaf Arkin, under Creative Commons Attribution and/or MIT License
77
+
78
+ Developed for http://co.mments.com
79
+
80
+ Code and documentation: http://labnotes.org
81
+
82
+ HTML cleanup and good hygiene by Tidy, Copyright (c) 1998-2003 World Wide Web Consortium.
83
+ License at http://tidy.sourceforge.net/license.html
84
+
85
+ HTML DOM extracted from Rails, Copyright (c) 2004 David Heinemeier Hansson. Under MIT license.
86
+
87
+ HTML parser by Takahiro Maebashi and Katsuyuki Komatsu, Ruby license.
88
+ http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html
@@ -0,0 +1,67 @@
1
+ require "benchmark"
2
+ require "rubygems"
3
+ Gem::manage_gems
4
+ require "rake"
5
+ require "rake/testtask"
6
+ require "rake/rdoctask"
7
+ require "rake/gempackagetask"
8
+
9
+
10
+
11
+ desc "Generate documentation"
12
+ Rake::RDocTask.new(:rdoc) do |rdoc|
13
+ rdoc.rdoc_dir = "rdoc"
14
+ rdoc.title = "Scraper"
15
+ rdoc.options << "--line-numbers"
16
+ rdoc.options << "--inline-source"
17
+ rdoc.rdoc_files.include("README")
18
+ rdoc.rdoc_files.include("lib/**/*.rb")
19
+ end
20
+
21
+
22
+ desc "Run all tests"
23
+ Rake::TestTask.new(:test) do |test|
24
+ test.libs << "lib"
25
+ test.pattern = "test/**/*_test.rb"
26
+ test.verbose = true
27
+ end
28
+
29
+
30
+ desc "Package as a Gem"
31
+ gem_spec = Gem::Specification.new do |spec|
32
+
33
+ version = nil
34
+ File.readlines("CHANGELOG").each do |line|
35
+ if line =~ /Version (\d+\.\d+\.\d+)/
36
+ version = $1
37
+ break
38
+ end
39
+ end
40
+ raise RuntimeError, "Can't find version number in changelog" unless version
41
+
42
+ spec.name = "scrapi"
43
+ spec.version = version
44
+ spec.summary = "scrAPI toolkit for Ruby"
45
+ spec.description = <<-EOF
46
+ A framework for writing scrapers using CSS selectors and simple
47
+ select => extract => store processing rules.
48
+ EOF
49
+ spec.author = "Assaf Arkin"
50
+ spec.email = "assaf.arkin@gmail.com"
51
+ spec.homepage = "http://labnotes.org/"
52
+
53
+ spec.files = FileList["{test,lib}/**/*", "README", "CHANGELOG", "Rakefile", "MIT-LICENSE"].to_a
54
+ spec.require_path = "lib"
55
+ spec.autorequire = "scrapi.rb"
56
+ spec.requirements << "Tidy"
57
+ spec.add_dependency "tidy", ">=1.1.0"
58
+ spec.has_rdoc = true
59
+ spec.rdoc_options << "--main" << "README" << "--title" << "scrAPI toolkit for Ruby" << "--line-numbers"
60
+ spec.extra_rdoc_files = ["README"]
61
+ spec.rubyforge_project = "scrapi"
62
+ end
63
+
64
+ gem = Rake::GemPackageTask.new(gem_spec) do |pkg|
65
+ pkg.need_tar = true
66
+ pkg.need_zip = true
67
+ end
@@ -0,0 +1,64 @@
1
+ require File.dirname(__FILE__) + '/tokenizer'
2
+ require File.dirname(__FILE__) + '/node'
3
+
4
+ module HTML #:nodoc:
5
+
6
+ # A top-level HTML document. You give it a body of text, and it will parse that
7
+ # text into a tree of nodes.
8
+ class Document #:nodoc:
9
+
10
+ # The root of the parsed document.
11
+ attr_reader :root
12
+
13
+ # Create a new Document from the given text.
14
+ def initialize(text, strict=false, xml=false)
15
+ tokenizer = Tokenizer.new(text)
16
+ @root = Node.new(nil)
17
+ node_stack = [ @root ]
18
+ while token = tokenizer.next
19
+ node = Node.parse(node_stack.last, tokenizer.line, tokenizer.position, token, strict)
20
+
21
+ node_stack.last.children << node unless node.tag? && node.closing == :close
22
+ if node.tag?
23
+ if node_stack.length > 1 && node.closing == :close
24
+ if node_stack.last.name == node.name
25
+ node_stack.pop
26
+ else
27
+ open_start = node_stack.last.position - 20
28
+ open_start = 0 if open_start < 0
29
+ close_start = node.position - 20
30
+ close_start = 0 if close_start < 0
31
+ msg = <<EOF.strip
32
+ ignoring attempt to close #{node_stack.last.name} with #{node.name}
33
+ opened at byte #{node_stack.last.position}, line #{node_stack.last.line}
34
+ closed at byte #{node.position}, line #{node.line}
35
+ attributes at open: #{node_stack.last.attributes.inspect}
36
+ text around open: #{text[open_start,40].inspect}
37
+ text around close: #{text[close_start,40].inspect}
38
+ EOF
39
+ strict ? raise(msg) : warn(msg)
40
+ end
41
+ elsif !node.childless?(xml) && node.closing != :close
42
+ node_stack.push node
43
+ end
44
+ end
45
+ end
46
+ end
47
+
48
+ # Search the tree for (and return) the first node that matches the given
49
+ # conditions. The conditions are interpreted differently for different node
50
+ # types, see HTML::Text#find and HTML::Tag#find.
51
+ def find(conditions)
52
+ @root.find(conditions)
53
+ end
54
+
55
+ # Search the tree for (and return) all nodes that match the given
56
+ # conditions. The conditions are interpreted differently for different node
57
+ # types, see HTML::Text#find and HTML::Tag#find.
58
+ def find_all(conditions)
59
+ @root.find_all(conditions)
60
+ end
61
+
62
+ end
63
+
64
+ end
@@ -0,0 +1,407 @@
1
+ module HTML #:nodoc:
2
+
3
+ # A parser for SGML, using the derived class as static DTD.
4
+
5
+ class SGMLParser
6
+
7
+ # Regular expressions used for parsing:
8
+ Interesting = /[&<]/
9
+ Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
10
+ '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
11
+ '![^<>]*)?')
12
+
13
+ Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
14
+ Charref = /&#([0-9]+)[^0-9]/
15
+
16
+ Starttagopen = /<[>a-zA-Z]/
17
+ Endtagopen = /<\/[<>a-zA-Z]/
18
+ # Assaf: fixed to allow tag to close itself (XHTML)
19
+ Endbracket = /<|>|\/>/
20
+ Special = /<![^<>]*>/
21
+ Commentopen = /<!--/
22
+ Commentclose = /--[ \t\n]*>/
23
+ Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
24
+ # Assaf: / is no longer part of allowed attribute value
25
+ Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
26
+ '(\s*=\s*' +
27
+ "('[^']*'" +
28
+ '|"[^"]*"' +
29
+ '|[-~a-zA-Z0-9,.:+*%?!()_#=]*))?')
30
+
31
+ Entitydefs =
32
+ {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}
33
+
34
+ def initialize(verbose=false)
35
+ @verbose = verbose
36
+ reset
37
+ end
38
+
39
+ def reset
40
+ @rawdata = ''
41
+ @stack = []
42
+ @lasttag = '???'
43
+ @nomoretags = false
44
+ @literal = false
45
+ end
46
+
47
+ def has_context(gi)
48
+ @stack.include? gi
49
+ end
50
+
51
+ def setnomoretags
52
+ @nomoretags = true
53
+ @literal = true
54
+ end
55
+
56
+ def setliteral(*args)
57
+ @literal = true
58
+ end
59
+
60
+ def feed(data)
61
+ @rawdata << data
62
+ goahead(false)
63
+ end
64
+
65
+ def close
66
+ goahead(true)
67
+ end
68
+
69
+ def goahead(_end)
70
+ rawdata = @rawdata
71
+ i = 0
72
+ n = rawdata.length
73
+ while i < n
74
+ if @nomoretags
75
+ handle_data(rawdata[i..(n-1)])
76
+ i = n
77
+ break
78
+ end
79
+ j = rawdata.index(Interesting, i)
80
+ j = n unless j
81
+ if i < j
82
+ handle_data(rawdata[i..(j-1)])
83
+ end
84
+ i = j
85
+ break if (i == n)
86
+ if rawdata[i] == ?< #
87
+ if rawdata.index(Starttagopen, i) == i
88
+ if @literal
89
+ handle_data(rawdata[i, 1])
90
+ i += 1
91
+ next
92
+ end
93
+ k = parse_starttag(i)
94
+ break unless k
95
+ i = k
96
+ next
97
+ end
98
+ if rawdata.index(Endtagopen, i) == i
99
+ k = parse_endtag(i)
100
+ break unless k
101
+ i = k
102
+ @literal = false
103
+ next
104
+ end
105
+ if rawdata.index(Commentopen, i) == i
106
+ if @literal
107
+ handle_data(rawdata[i,1])
108
+ i += 1
109
+ next
110
+ end
111
+ k = parse_comment(i)
112
+ break unless k
113
+ i += k
114
+ next
115
+ end
116
+ if rawdata.index(Special, i) == i
117
+ if @literal
118
+ handle_data(rawdata[i, 1])
119
+ i += 1
120
+ next
121
+ end
122
+ k = parse_special(i)
123
+ break unless k
124
+ i += k
125
+ next
126
+ end
127
+ elsif rawdata[i] == ?& #
128
+ if rawdata.index(Charref, i) == i
129
+ i += $&.length
130
+ handle_charref($1)
131
+ i -= 1 unless rawdata[i-1] == ?;
132
+ next
133
+ end
134
+ if rawdata.index(Entityref, i) == i
135
+ i += $&.length
136
+ handle_entityref($1)
137
+ i -= 1 unless rawdata[i-1] == ?;
138
+ next
139
+ end
140
+ else
141
+ raise RuntimeError, 'neither < nor & ??'
142
+ end
143
+ # We get here only if incomplete matches but
144
+ # nothing else
145
+ match = rawdata.index(Incomplete, i)
146
+ unless match == i
147
+ handle_data(rawdata[i, 1])
148
+ i += 1
149
+ next
150
+ end
151
+ j = match + $&.length
152
+ break if j == n # Really incomplete
153
+ handle_data(rawdata[i..(j-1)])
154
+ i = j
155
+ end
156
+ # end while
157
+ if _end and i < n
158
+ handle_data(@rawdata[i..(n-1)])
159
+ i = n
160
+ end
161
+ @rawdata = rawdata[i..-1]
162
+ end
163
+
164
+ def parse_comment(i)
165
+ rawdata = @rawdata
166
+ if rawdata[i, 4] != '<!--'
167
+ raise RuntimeError, 'unexpected call to handle_comment'
168
+ end
169
+ match = rawdata.index(Commentclose, i)
170
+ return nil unless match
171
+ matched_length = $&.length
172
+ j = match
173
+ handle_comment(rawdata[i+4..(j-1)])
174
+ j = match + matched_length
175
+ return j-i
176
+ end
177
+
178
+ def parse_starttag(i)
179
+ rawdata = @rawdata
180
+ j = rawdata.index(Endbracket, i + 1)
181
+ return nil unless j
182
+ attrs = []
183
+ if rawdata[i+1] == ?> #
184
+ # SGML shorthand: <> == <last open tag seen>
185
+ k = j
186
+ tag = @lasttag
187
+ else
188
+ match = rawdata.index(Tagfind, i + 1)
189
+ unless match
190
+ raise RuntimeError, 'unexpected call to parse_starttag'
191
+ end
192
+ k = i + 1 + ($&.length)
193
+ tag = $&.downcase
194
+ @lasttag = tag
195
+ end
196
+ while k < j
197
+ # Assaf: fixed to allow tag to close itself (XHTML)
198
+ break unless idx = rawdata.index(Attrfind, k) and idx < j
199
+ matched_length = $&.length
200
+ attrname, rest, attrvalue = $1, $2, $3
201
+ if not rest
202
+ attrvalue = '' # was: = attrname
203
+ # Assaf: fixed to handle double quoted attribute values properly
204
+ elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') or
205
+ (attrvalue[0] == ?" && attrvalue[-1] == ?")
206
+ attrvalue = attrvalue[1..-2]
207
+ end
208
+ attrs << [attrname.downcase, attrvalue]
209
+ k += matched_length
210
+ end
211
+ # Assaf: fixed to allow tag to close itself (XHTML)
212
+ if rawdata[j,2] == '/>'
213
+ j += 2
214
+ finish_starttag(tag, attrs)
215
+ finish_endtag(tag)
216
+ else
217
+ if rawdata[j] == ?> #
218
+ j += 1
219
+ end
220
+ finish_starttag(tag, attrs)
221
+ end
222
+ return j
223
+ end
224
+
225
+ def parse_endtag(i)
226
+ rawdata = @rawdata
227
+ j = rawdata.index(Endbracket, i + 1)
228
+ return nil unless j
229
+ tag = (rawdata[i+2..j-1].strip).downcase
230
+ if rawdata[j] == ?> #
231
+ j += 1
232
+ end
233
+ finish_endtag(tag)
234
+ return j
235
+ end
236
+
237
+ def finish_starttag(tag, attrs)
238
+ method = 'start_' + tag
239
+ if self.respond_to?(method)
240
+ @stack << tag
241
+ handle_starttag(tag, method, attrs)
242
+ return 1
243
+ else
244
+ method = 'do_' + tag
245
+ if self.respond_to?(method)
246
+ handle_starttag(tag, method, attrs)
247
+ return 0
248
+ else
249
+ unknown_starttag(tag, attrs)
250
+ return -1
251
+ end
252
+ end
253
+ end
254
+
255
+ def finish_endtag(tag)
256
+ if tag == ''
257
+ found = @stack.length - 1
258
+ if found < 0
259
+ unknown_endtag(tag)
260
+ return
261
+ end
262
+ else
263
+ unless @stack.include? tag
264
+ method = 'end_' + tag
265
+ unless self.respond_to?(method)
266
+ unknown_endtag(tag)
267
+ end
268
+ return
269
+ end
270
+ found = @stack.index(tag) #or @stack.length
271
+ end
272
+ while @stack.length > found
273
+ tag = @stack[-1]
274
+ method = 'end_' + tag
275
+ if respond_to?(method)
276
+ handle_endtag(tag, method)
277
+ else
278
+ unknown_endtag(tag)
279
+ end
280
+ @stack.pop
281
+ end
282
+ end
283
+
284
+ def parse_special(i)
285
+ rawdata = @rawdata
286
+ match = rawdata.index(Endbracket, i+1)
287
+ return nil unless match
288
+ matched_length = $&.length
289
+ handle_special(rawdata[i+1..(match-1)])
290
+ return match - i + matched_length
291
+ end
292
+
293
+ def handle_starttag(tag, method, attrs)
294
+ self.send(method, attrs)
295
+ end
296
+
297
+ def handle_endtag(tag, method)
298
+ self.send(method)
299
+ end
300
+
301
+ def report_unbalanced(tag)
302
+ if @verbose
303
+ print '*** Unbalanced </' + tag + '>', "\n"
304
+ print '*** Stack:', self.stack, "\n"
305
+ end
306
+ end
307
+
308
+ def handle_charref(name)
309
+ n = Integer(name) rescue -1
310
+ if !(0 <= n && n <= 255)
311
+ unknown_charref(name)
312
+ return
313
+ end
314
+ handle_data(n.chr)
315
+ end
316
+
317
+ def handle_entityref(name)
318
+ table = Entitydefs
319
+ if table.include?(name)
320
+ handle_data(table[name])
321
+ else
322
+ unknown_entityref(name)
323
+ return
324
+ end
325
+ end
326
+
327
+ def handle_data(data)
328
+ end
329
+
330
+ def handle_comment(data)
331
+ end
332
+
333
+ def handle_special(data)
334
+ end
335
+
336
+ def unknown_starttag(tag, attrs)
337
+ end
338
+ def unknown_endtag(tag)
339
+ end
340
+ def unknown_charref(ref)
341
+ end
342
+ def unknown_entityref(ref)
343
+ end
344
+
345
+ end
346
+
347
+
348
+ # (X)HTML parser.
349
+ #
350
+ # Parses a String and returns an REXML::Document with the (X)HTML content.
351
+ #
352
+ # For example:
353
+ # html = "<p>paragraph</p>"
354
+ # parser = HTMLParser.new(html)
355
+ # puts parser.document
356
+ #
357
+ # Requires a patched version of SGMLParser.
358
+ class HTMLParser < SGMLParser
359
+
360
+ attr :document
361
+
362
+ def self.parse(html)
363
+ parser = HTMLParser.new
364
+ parser.feed(html)
365
+ parser.document
366
+ end
367
+
368
+ def initialize()
369
+ super
370
+ @document = HTML::Document.new("")
371
+ @current = @document.root
372
+ end
373
+
374
+ def handle_data(data)
375
+ @current.children << HTML::Text.new(@current, 0, 0, data)
376
+ end
377
+
378
+ def handle_comment(data)
379
+ end
380
+
381
+ def handle_special(data)
382
+ end
383
+
384
+ def unknown_starttag(tag, attrs)
385
+ attrs = attrs.inject({}) do |hash, attr|
386
+ hash[attr[0].downcase] = attr[1]
387
+ hash
388
+ end
389
+ element = HTML::Tag.new(@current || @document, 0, 0, tag.downcase, attrs, true)
390
+ @current.children << element
391
+ @current = element
392
+ end
393
+
394
+ def unknown_endtag(tag)
395
+ @current = @current.parent if @current.parent
396
+ end
397
+
398
+ def unknown_charref(ref)
399
+ end
400
+
401
+ def unknown_entityref(ref)
402
+ @current.children << HTML::Text.new(@current, 0, 0, "&amp;#{ref}&lt;")
403
+ end
404
+
405
+ end
406
+
407
+ end