scrapes 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +22 -0
- data/README +123 -0
- data/demo/demo.rb +33 -0
- data/demo/pages/about.rb +32 -0
- data/demo/pages/main.rb +32 -0
- data/lib/scrapes.rb +41 -0
- data/lib/scrapes/cache.rb +110 -0
- data/lib/scrapes/cookbook.rb +53 -0
- data/lib/scrapes/cookies.rb +45 -0
- data/lib/scrapes/crawler.rb +97 -0
- data/lib/scrapes/hpricot.rb +110 -0
- data/lib/scrapes/initializer.rb +86 -0
- data/lib/scrapes/page.rb +319 -0
- data/lib/scrapes/rule_parser.rb +327 -0
- data/lib/scrapes/session.rb +155 -0
- data/lib/scrapes/to_proxy.rb +50 -0
- data/test/cache.rb +75 -0
- data/test/cookies.rb +34 -0
- data/test/crawler.rb +69 -0
- data/test/hpricot.rb +55 -0
- data/test/initializer.rb +54 -0
- data/test/lib/server.rb +63 -0
- data/test/page.rb +77 -0
- data/test/pages/foils.rb +61 -0
- data/test/pages/foils2.rb +38 -0
- data/test/pages/redhanded_entries.rb +36 -0
- data/test/pages/redhanded_main.rb +58 -0
- data/test/pages/rule_parser.rb +81 -0
- data/test/pages/simple.rb +21 -0
- data/test/public/foil72.html +10 -0
- data/test/public/foil73.html +9 -0
- data/test/public/foil74.html +11 -0
- data/test/public/foo.txt +1 -0
- data/test/public/index.html +20 -0
- data/test/public/redhanded.html +1208 -0
- data/test/public/rule_parser.html +21 -0
- data/test/public/simple.html +8 -0
- data/test/rule_parser.rb +151 -0
- data/test/session.rb +45 -0
- data/test/textcontent.rb +71 -0
- metadata +123 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
module Scrapes
  ################################################################################
  # A thin Hash subclass that tracks HTTP cookies by name.
  class Cookies < Hash
    ################################################################################
    # Render every stored cookie as a single Cookie request-header value,
    # e.g. "sid=abc;user=bob".
    def to_header
      pairs = []
      each {|name, value| pairs << "#{name}=#{value}"}
      pairs.join(';')
    end

    ################################################################################
    # Record the first name=value pair found in a Set-Cookie header,
    # discarding any attributes after the first semicolon (path, expires, ...).
    def from_header (header)
      first_pair = header.sub(/;.*$/, '')
      name, value = first_pair.split(/\s*=\s*/, 2)
      self[name] = value
    end

  end
  ################################################################################
end
################################################################################
@@ -0,0 +1,97 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'net/http'
require 'pathname'
require 'scrapes/cache'
################################################################################
module Scrapes
  ################################################################################
  # Try to suck down a URI.  Performs the actual HTTP traffic for a session:
  # resolves relative URIs, consults the cache for GETs, sends session
  # cookies, follows redirects and records Set-Cookie responses.
  class Crawler
    ################################################################################
    # The cache object that this crawler is using
    attr_accessor :cache

    ################################################################################
    # The optional log object that this crawler is using
    attr_accessor :log

    ################################################################################
    # Create a new crawler for the given session
    def initialize (session)
      @session = session
      @log = nil
      @verbose = 0
      # politeness delay (seconds) slept before every network fetch
      @delay = 0.5
      @cache = Cache.new
    end

    ################################################################################
    # Fetch a URI, using HTTP GET unless you supply <tt>post</tt>.
    def fetch (uri, post={}, headers={})
      @session.refresh
      # resolve a possibly-relative URI against the session's base URI
      uri = URI.parse(@session.absolute_uri(uri))

      # only GET requests (empty post hash) are eligible for the cache
      post.empty? and cached = @cache.check(uri)
      # log prefix: 'C ' = cache hit, 'N ' = network fetch
      @log.info((cached ? 'C ' : 'N ') + uri.to_s) if @log

      # NOTE(review): on a cache hit this appears to return the cached body
      # (the cache is updated with res.body below), not a Net::HTTP response
      # object — callers must handle both shapes; hence the original FIXME.
      return cached if cached # FIXME
      sleep(@delay) if @delay != 0

      # Net::HTTP request classes want path + query, not a full URI
      path = uri.path.dup
      path << "/" if path.empty?
      path << "?" + uri.query if uri.query

      req = post.empty? ? Net::HTTP::Get.new(path) : Net::HTTP::Post.new(path)
      req.set_form_data(post) unless post.empty?

      # send the session's accumulated cookies; caller headers may override
      req['Cookie'] = @session.cookies.to_header
      headers.each {|k,v| req[k] = v}

      # NOTE(review): plain HTTP only — uri.scheme is never consulted and
      # use_ssl is never set, so https URIs will not work; confirm callers
      # only pass http URLs.
      res = Net::HTTP.new(uri.host, uri.port).start {|http| http.request(req)}

      # debugging aid: dump response class and headers to stderr
      if @verbose >= 2
        STDERR.puts "-----------------------------------------------"
        STDERR.puts res.class
        res.each_header {|k,v| STDERR.puts "#{k}: #{v}"}
      end

      # FIXME, what to do about more than one cookie
      @session.cookies.from_header(res['set-cookie']) if res.key?('set-cookie')

      # follow redirects by recursing with a GET to the Location header.
      # NOTE(review): there is no redirect-depth limit — a redirect loop
      # would recurse until stack exhaustion.
      case res
      when Net::HTTPRedirection
        @session.base_uris[-1] = @session.absolute_uri(res['location'])
        res = fetch(res['location'], {}, headers)
      end

      # store the (possibly post-redirect) body in the cache for GETs only
      post.empty? and @cache.update(uri, res.body)
      res
    end

  end
  ################################################################################
end
################################################################################
@@ -0,0 +1,110 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'cgi'
|
26
|
+
require 'hpricot'
|
27
|
+
################################################################################
|
28
|
+
module Scrapes
|
29
|
+
################################################################################
|
30
|
+
module Hpricot # :nodoc:
|
31
|
+
################################################################################
|
32
|
+
module Extractors
|
33
|
+
################################################################################
|
34
|
+
# Returns the text of any child text nodes recursively concatenated.
|
35
|
+
def text(node)
|
36
|
+
text_process(node,String) do |e| text(e) end
|
37
|
+
end
|
38
|
+
|
39
|
+
################################################################################
|
40
|
+
# Returns the text of any child text nodes recursively as nested Array.
|
41
|
+
def texts(node)
|
42
|
+
text_process(node,Array) do |e| texts(e) end
|
43
|
+
end
|
44
|
+
|
45
|
+
################################################################################
|
46
|
+
# Returns the text of any child text nodes concatenated.
|
47
|
+
def content(node)
|
48
|
+
text_process(node,String) do |e| e.content end
|
49
|
+
end
|
50
|
+
|
51
|
+
################################################################################
|
52
|
+
# Returns the text of any child text nodes as an Array.
|
53
|
+
def contents(node)
|
54
|
+
text_process(node,Array) do |e| e.content end
|
55
|
+
end
|
56
|
+
|
57
|
+
################################################################################
|
58
|
+
# The result of text() with whitespace reduceded to single spaces and striped.
|
59
|
+
def word(node)
|
60
|
+
text_process(node,String) do |e| word(e).gsub(/\s+/,' ').strip end
|
61
|
+
end
|
62
|
+
|
63
|
+
################################################################################
|
64
|
+
# The result of texts() striped, flattened, whitespace reduced to single spaces, and
|
65
|
+
# with all blank?s rejected.
|
66
|
+
def words(node)
|
67
|
+
texts(node).flatten.compact.map{|e|e.gsub(/\s+/,' ').strip}.reject{|e| e.blank?}
|
68
|
+
end
|
69
|
+
|
70
|
+
################################################################################
|
71
|
+
# Just reuturn the yielded node.
|
72
|
+
def xml(node)
|
73
|
+
node
|
74
|
+
end
|
75
|
+
|
76
|
+
protected
|
77
|
+
################################################################################
|
78
|
+
def unescape
|
79
|
+
case result = yield
|
80
|
+
when String then CGI::unescapeHTML(result).gsub(' ', ' ')
|
81
|
+
when Array then result.map{|e| Extractors::unescape{e}}
|
82
|
+
when NilClass then nil
|
83
|
+
else raise "should be Array or String, was: #{result.class}"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
################################################################################
|
87
|
+
def text_process(node, klass, &block)
|
88
|
+
Extractors::unescape do
|
89
|
+
case node
|
90
|
+
when Array, ::Hpricot::Elements
|
91
|
+
node.map do |elem|
|
92
|
+
text_process(elem,klass,&block)
|
93
|
+
end
|
94
|
+
when ::Hpricot::Elem, ::Hpricot::Doc
|
95
|
+
node.children.inject(klass.new) do |value,child|
|
96
|
+
(value << block.call(child)) rescue nil
|
97
|
+
value
|
98
|
+
end
|
99
|
+
when ::Hpricot::Text then node.content
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
module_function :word, :words, :text, :texts, :content, :contents, :text_process
|
105
|
+
end
|
106
|
+
################################################################################
|
107
|
+
end
|
108
|
+
################################################################################
|
109
|
+
end
|
110
|
+
################################################################################
|
@@ -0,0 +1,86 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
module Scrapes
  ################################################################################
  # Bootstraps the Scrapes library by locating and loading page classes.
  class Initializer
    ################################################################################
    # The directory name where the pages classes are kept
    attr_accessor :pages_dir

    ################################################################################
    # The parent directory where the pages_dir can be found
    attr_accessor :pages_parent

    ################################################################################
    # Build an Initializer, hand it to the optional configuration block,
    # and return it.
    def self.run (&block)
      instance = new
      block.call(instance) if block
      instance
    end

    ################################################################################
    # Establish all the defaults
    def initialize
      @pages_dir    = 'pages'
      @pages_parent = File.dirname($0)
    end

    ################################################################################
    # Run every initialization step (currently just page loading).
    def process
      load_pages
    end

    ################################################################################
    private

    ################################################################################
    # Load every Ruby file found in the configured pages directory,
    # in sorted order.
    def load_pages
      sources = Dir.glob("#{@pages_parent}/#{@pages_dir}/*.rb").sort
      reloader(sources)
    end

    ################################################################################
    # Load each file; files raising NameError (usually a not-yet-loaded
    # dependency on a sibling file) are collected and retried until the
    # retry budget is exhausted, at which point the error propagates.
    def reloader (files, limit=4)
      deferred = files.reject do |file|
        begin
          load File.expand_path(file)
          true
        rescue NameError
          raise if limit <= 0
          false
        end
      end

      reloader(deferred, limit - 1) unless deferred.empty?
    end

  end
end
################################################################################
data/lib/scrapes/page.rb
ADDED
@@ -0,0 +1,319 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'scrapes/rule_parser'
require 'hpricot'
require 'rextra'
################################################################################
module Scrapes
  ################################################################################
  # The page class is used as a base class for scraping data out of one web
  # page. To use it, you inherit from it and setup some rules. You can also
  # use validators to ensure that the page was scraped correctly.
  #
  # == Setup
  #
  #  class MyPageScraper < Scrapes::Page
  #    rule :rule_name, blah
  #  end
  #
  # Scrapes::RuleParser explains the use of rules.
  #
  # == Auto Loading
  #
  # Scrapes will automatically 'require' ruby files placed in a special 'pages' directory.
  # The idea is to place one Scrapes::Page derived class per file in the pages directory,
  # and have it required for you.
  #
  # == Validations
  #
  # There are a few class methods that you can use to validate the contents you scraped
  # from a given web page.
  class Page
    include Scrapes::Hpricot::Extractors

    # External command used to preprocess HTML when with_xslt is enabled.
    XSLTPROC = 'xsltproc' # :nodoc

    ################################################################################
    # RuleParser is used to extract data from web pages using CSS selectors
    # and raw element access by using procs.
    include RuleParser

    ################################################################################
    # Access the URI where this page's data came from
    attr_accessor :uri

    ################################################################################
    # Access the session object that was used to fetch this page's data
    attr_accessor :session

    ################################################################################
    # Access the Hpricot object that the selectors are passed
    attr_accessor :hpricot

    ################################################################################
    # If the page that you are parsing is paginated (one page in many of similar data)
    # you can use this class method to automatically fetch all pages. In order for this
    # to work, you need to provide a few special methods:
    #
    # === Next Page
    #
    # If you know the URL to the next page, then provide a instance method called
    # <tt>next_page</tt>. It should return the URL for the next page, or nil when
    # the current page is the last page.
    #
    #  class NextPageExample < Scrapes::Page
    #    rule(:next_page, 'a[href~=next]', '@href', 1)
    #  end
    #
    # === Link for Page
    #
    # Alternatively, you can provide a instance method <tt>link_for_page</tt> and
    # another one called <tt>pages</tt>. The <tt>pages</tt> method should return the
    # number of pages in this paginated set. The <tt>link_for_page</tt> method should
    # take a page number, and return a URL to fetch that page.
    #
    #  class LinkForPageExample < Scrapes::Page
    #    rule_1(:page) {|e| m = e.text.match(/Page\s+\d+\s+of\s+(\d+)/) and m[1].to_i}
    #
    #    def link_for_page (page)
    #      uri.sub(/page=\d+/, "page=#{page}")
    #    end
    #  end
    #
    # === Append to Page
    #
    # Finally, you must provide a <tt>append_page</tt> method. It takes an instance
    # of your Scrapes::Page derived class as an argument. Its job is to add the data
    # found on the current page to its instance variables. This is because when you use
    # paginated, it only returns one instance of your class.
    #
    # NOTE(review): meta_eval is not core Ruby — presumably supplied by the
    # rextra dependency; it appears to evaluate the block against the class's
    # singleton, storing per-subclass state in class-instance variables.
    def self.paginated
      meta_eval { @paginated = true }
    end

    ################################################################################
    # Make Page.extract return an array by calling the given method. This can be
    # very useful for when your class does nothing more than collect a set of links
    # for some other page to process. It causes Session#page to call the given block
    # once for each object returned from method_to_call.
    def self.acts_as_array (method_to_call)
      meta_eval { @as_array = method_to_call }
    end

    ################################################################################
    # Preprocess the HTML by sending it through an XSLT stylesheet. The stylesheet
    # should return a document that can be then processed using your rules. Using
    # this feature requires that you have the xsltproc utility in your PATH.
    # You can get xsltproc from libxslt: http://xmlsoft.org/XSLT/
    def self.with_xslt (filename)
      # fail fast at class-definition time if xsltproc is missing
      raise "#{XSLTPROC} could not be found" unless `#{XSLTPROC} --version 2>&1`.match(/libxslt/)
      meta_eval { @with_xslt = filename }
    end

    ################################################################################
    # Ensure that the given attributes have been set by matching rules
    def self.validates_presence_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'rule never matched',
      })

      validates_from(attrs, options, lambda {|a| !a.nil?})
    end

    ################################################################################
    # Ensure that the given attributes are not #blank?
    def self.validates_not_blank (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'rule never matched',
      })

      validates_from(attrs, options, lambda {|a| !a.blank?})
    end

    ################################################################################
    # Ensure that the given attributes have the correct format
    # (option :with is the regular expression, default matches anything).
    def self.validates_format_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'did not match regular expression',
        :with => /.*/,
      })

      validates_from(attrs, options, lambda {|a| a.to_s.match(options[:with])})
    end

    ################################################################################
    # Ensure that the given attributes have values in the given list
    # (option :in is the list of accepted values).
    def self.validates_inclusion_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'is not in the list of accepted values',
        :in => [],
      })

      validates_from(attrs, options, lambda {|a| options[:in].include?(a)})
    end

    ################################################################################
    # Ensure that the given attribute is a number
    def self.validates_numericality_of (*attrs)
      attrs, options = attrs_options(attrs, {
        :message => 'is not a number',
      })

      # Kernel.Float accepts anything Float() can parse (so "1e3" passes);
      # nil raises TypeError and is therefore rejected too
      closure = lambda do |a|
        begin
          Kernel.Float(a.to_s)
        rescue ArgumentError, TypeError
          false
        else
          true
        end
      end

      validates_from(attrs, options, closure)
    end

    ################################################################################
    # If using acts_as_array that returns links, send them to another class
    def self.to (other_class)
      ToProxy.new(self, other_class)
    end

    ################################################################################
    # Called by the crawler to process a web page.  Parses +data+ into an
    # instance, follows pagination if enabled, optionally converts the result
    # to an array (acts_as_array), and yields each result to the block.
    def self.extract (data, uri, session, &block)
      obj = process_page(data, uri, session)

      if meta_eval {@paginated}
        if obj.respond_to?(:next_page)
          sister = obj

          # follow next_page links until one returns nil; each sister page is
          # merged into obj via the subclass's append_page (see extract_sister)
          while sister_uri = sister.next_page
            sister = extract_sister(session, obj, sister_uri)
          end
        elsif obj.respond_to?(:link_for_page)
          # page 1 is obj itself; fetch the remaining pages explicitly
          (2 .. obj.pages).each do |page|
            sister_uri = obj.link_for_page(page)
            extract_sister(session, obj, sister_uri)
          end
        end
      end

      as_array = meta_eval {@as_array}
      obj = obj.send(as_array) if as_array

      return obj unless block
      obj.respond_to?(:each) ? obj.each {|o| yield(o)} : yield(obj)
    end

    ################################################################################
    # Have a chance to do something after parsing, but before validation
    def after_parse
    end

    ################################################################################
    # Called by the extract method to validate scraped data. If you override this
    # method, you should call super. This method will probably be changed in the
    # future so that you don't have to call super.
    def validate
      validations = self.class.meta_eval { @validations }

      # each validation is {:name, :options, :proc}; raise on the first
      # attribute whose proc rejects its current value
      validations.each do |v|
        raise "#{self.class}.#{v[:name]} #{v[:options][:message]}" unless
          v[:proc].call(send(v[:name]))
      end

      self
    end

    ################################################################################
    protected

    ################################################################################
    # Called by extract to process a page object: optionally pipe the HTML
    # through xsltproc, parse it with Hpricot/RuleParser, then run the
    # after_parse hook and validations.
    def self.process_page (data, uri, session)
      if file = meta_eval { @with_xslt }
        # NOTE(review): the stylesheet filename is interpolated into a shell
        # command — ensure with_xslt is only ever given trusted paths
        options = "--html '#{file}' -"

        # feed the raw HTML to xsltproc on stdin and read back the transform
        open("|#{XSLTPROC} #{options} 2> /dev/null", 'w+') do |xsltproc|
          xsltproc << data
          xsltproc.close_write
          data = xsltproc.read
        end
      end

      # parse() is provided by RuleParser; it builds the instance from rules
      obj = parse(Hpricot(data))
      obj.uri = uri
      obj.session = session
      obj.after_parse
      obj.validate
      obj
    end

    ################################################################################
    # Called by extract to process paginated objects: fetch one sister page,
    # parse it, and merge it into obj via the subclass's append_page.
    def self.extract_sister (session, obj, sister_uri)
      res = session.crawler.fetch(sister_uri)
      sister = process_page(res.body, sister_uri, session)
      obj.append_page(sister)
      sister
    end

    ################################################################################
    private

    ################################################################################
    # Add some things to sub-classes: fresh per-subclass validation list and
    # pagination/array flags stored as class-instance variables.
    def self.inherited (klass)
      klass.meta_eval do
        @validations = []
        @paginated = false
        @as_array = false
      end
    end

    ################################################################################
    # generic way to add validation: record one {:name, :options, :proc}
    # entry per attribute for validate() to run later
    def self.validates_from (attrs, options, closure)
      meta_eval do
        attrs.each do |a|
          @validations << {
            :name => a,
            :options => options,
            :proc => closure,
          }
        end
      end
    end

    ################################################################################
    # helper to correctly parse the validate calls: a trailing Hash in the
    # attribute list overrides the supplied defaults
    def self.attrs_options (attrs, options)
      ops = attrs.pop if attrs.last.is_a?(Hash)
      options.update(ops) if ops
      [attrs, options]
    end

  end
  ################################################################################
end
################################################################################