medusa-crawler 1.0.0.pre.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,229 @@
1
+ require 'rubygems'
2
+ require 'cgi'
3
+ require 'nokogiri'
4
+ require 'ostruct'
5
+ require 'webrick/cookie'
6
+
7
module Medusa
  #
  # A single page seen during a crawl: its URL, the HTTP response data that
  # was passed in for it, and helpers for extracting same-host links from
  # its HTML body.
  #
  class Page

    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page
    #
    # *url* is expected to be a URI object (it is merged against and asked
    # for its host elsewhere in this class). *params* may carry :code,
    # :headers, :aka, :referer, :depth, :redirect_to, :response_time, :body
    # and :error.
    #
    def initialize(url, params = {})
      @url = url
      @data = OpenStruct.new

      @links = nil
      @visited = false
      @body = nil
      @doc = nil
      @base = nil

      @code = params[:code]
      # Work on a copy so that defaulting content-type below never writes
      # into the hash owned by the caller.
      @headers = (params[:headers] || {}).dup
      @headers['content-type'] ||= ''
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      # Resolved while @body is still nil, so no document (and therefore no
      # <base> element) is consulted: the redirect target is resolved
      # against @url.
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]

      # A page counts as fetched as soon as a response code was supplied.
      @fetched = !params[:code].nil?
    end

    #
    # Array of distinct A tag HREFs from the page, as absolute URIs on the
    # same host as the page. Anchors whose data-method is something other
    # than 'get' are skipped, as are HREFs that cannot be made absolute.
    # The result is memoized.
    #
    def links
      return @links unless @links.nil?
      @links = []
      return @links unless doc

      doc.search("//a[@href]").each do |a|
        next if a['data-method'] && a['data-method'] != 'get'
        href = a['href']
        next if href.nil? || href.empty?
        abs = begin
          to_absolute(href)
        rescue StandardError
          next # unparsable link: skip it
        end
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    #
    # Nokogiri document for the HTML body, or +nil+ when there is no body,
    # the page is not HTML, or building the document raised an error.
    #
    def doc
      return @doc if @doc
      begin
        @doc = Nokogiri::HTML(@body) if @body && html?
      rescue StandardError
        nil
      end
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if a response code was supplied when the page was created,
    # +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # Array of cookies received with this page as WEBrick::Cookie objects.
    # Returns an empty Array when the set-cookie header cannot be parsed.
    #
    def cookies
      WEBrick::Cookie.parse_set_cookies(@headers['set-cookie'])
    rescue StandardError
      []
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type']
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # The '+' must be escaped: unescaped it is a quantifier and the real
      # "application/xhtml+xml" media type would never match.
      !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect (status 300 through
    # 308, which includes 308 Permanent Redirect), returns +false+
    # otherwise.
    #
    def redirect?
      (300..308).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    # Returns +nil+ when there is no document or the base href is empty.
    #
    def base
      unless @base
        @base = if doc
          href = doc.search('//head/base/@href')
          begin
            URI(href.to_s) unless href.nil?
          rescue StandardError
            nil
          end
        end
      end

      return nil if @base && @base.to_s.empty?
      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page (or on the document's base URI when it has one).
    # Returns +nil+ for a +nil+ link. Raises whatever URI() raises for a
    # link that cannot be parsed.
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      link = link.to_s.gsub(/#.*$/, '')
      if Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
        link = URI.encode(URI.decode(link))
      end

      relative = URI(link)
      absolute = base ? base.merge(relative) : @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    #
    # Marshal support. The parsed document, the base URI, the aliases and
    # the error object are not part of the dumped state.
    #
    def marshal_dump
      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end

    # Restores the state written by #marshal_dump (same field order).
    def marshal_load(ary)
      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end

    #
    # Flattens the page into a Hash with String keys. URIs are stored as
    # Strings; headers and user data are stored as Marshal dumps.
    #
    def to_hash
      {'url' => @url.to_s,
       'headers' => Marshal.dump(@headers),
       'data' => Marshal.dump(@data),
       'body' => @body,
       'links' => links.map(&:to_s),
       'code' => @code,
       'visited' => @visited,
       'depth' => @depth,
       'referer' => @referer.to_s,
       'redirect_to' => @redirect_to.to_s,
       'response_time' => @response_time,
       'fetched' => @fetched}
    end

    #
    # Rebuilds a Page from a Hash produced by #to_hash.
    #
    # NOTE(review): 'headers' and 'data' are passed to Marshal.load, which
    # must only ever see trusted input; confirm that every storage backend
    # feeding this method is trusted.
    #
    def self.from_hash(hash)
      page = new(URI(hash['url']))
      redirect = hash['redirect_to']
      {'@headers' => Marshal.load(hash['headers']),
       '@data' => Marshal.load(hash['data']),
       '@body' => hash['body'],
       '@links' => hash['links'].map { |link| URI(link) },
       '@code' => hash['code'].to_i,
       '@visited' => hash['visited'],
       '@depth' => hash['depth'].to_i,
       '@referer' => hash['referer'],
       '@redirect_to' => (!!redirect && !redirect.empty?) ? URI(redirect) : nil,
       '@response_time' => hash['response_time'].to_i,
       '@fetched' => hash['fetched']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end
  end
end
@@ -0,0 +1,160 @@
1
+ require 'forwardable'
2
+
3
module Medusa
  #
  # Hash-like collection of pages. Keys are always converted to Strings
  # before they reach the underlying storage object, so a URI and its
  # String form address the same entry.
  #
  class PageStore
    extend Forwardable

    # #values is implemented explicitly below (on top of #each), so it is
    # not part of the delegated set.
    def_delegators :@storage, :keys, :size, :each

    # *storage* is any object offering the Hash-style methods used here
    # ([], []=, delete, has_key?, each, keys, size, merge!).
    def initialize(storage = {})
      @storage = storage
    end

    # We typically index the hash with a URI,
    # but convert it to a String for easier retrieval
    def [](index)
      @storage[index.to_s]
    end

    def []=(index, other)
      @storage[index.to_s] = other
    end

    def delete(key)
      @storage.delete key.to_s
    end

    def has_key?(key)
      @storage.has_key? key.to_s
    end

    # Yields every stored page (without its key).
    def each_value
      each { |_key, value| yield value }
    end

    # Array of every stored page, collected through #each.
    def values
      result = []
      each { |_key, value| result << value }
      result
    end

    # Stores a fresh, unfetched Page for *key*.
    def touch_key(key)
      self[key] = Page.new(key)
    end

    # Stores a fresh, unfetched Page for each of *keys* in one merge.
    def touch_keys(keys)
      @storage.merge!(keys.each_with_object({}) { |k, h| h[k.to_s] = Page.new(k) })
    end

    # Does this PageStore contain the specified URL?
    # HTTP and HTTPS versions of a URL are considered to be the same page.
    def has_page?(url)
      schemes = %w(http https)
      if schemes.include? url.scheme
        u = url.dup # never change the scheme of the caller's URI
        return schemes.any? { |s| u.scheme = s; has_key?(u) }
      end

      has_key? url
    end

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageStore
    #
    # Raises RuntimeError ("Root node not found") when *root* is not stored.
    # Returns self.
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" unless has_key?(root)

      q = Queue.new

      q.enq root
      root_page = self[root]
      root_page.depth = 0
      root_page.visited = true
      self[root] = root_page # write back: storage may hand out copies
      until q.empty?
        page = self[q.deq]
        page.links.each do |u|
          begin
            link = self[u]
            next if link.nil? || !link.fetched? || link.visited

            # Redirect pages are not expanded themselves; their target is
            # processed right away at the same depth via the redo below.
            q << u unless link.redirect?
            link.visited = true
            link.depth = page.depth + 1
            self[u] = link

            if link.redirect?
              u = link.redirect_to
              redo
            end
          end
        end
      end

      self
    end

    #
    # Removes all Pages from storage where redirect? is true
    #
    def uniq!
      # Collect first, delete afterwards, so the storage is never modified
      # while it is being iterated.
      values.select(&:redirect?).each { |page| delete page.url }
      self
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    # URLs that cannot be parsed are ignored: they get no entry in the
    # Hash, and a single unparsable URL yields an empty Array. The
    # argument itself is never modified.
    #
    def pages_linking_to(urls)
      single = !urls.is_a?(Array)
      targets = linking_targets(single ? [urls] : urls)

      links = targets.each_with_object({}) { |url, found| found[url] = [] }
      values.each do |page|
        page_links = page.links
        targets.each { |url| links[url] << page if page_links.include?(url) }
      end

      single ? links.fetch(targets.first, []) : links
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      found = pages_linking_to(urls)
      return found.map(&:url) unless urls.is_a?(Array)

      found.each_with_object({}) { |(url, pages), out| out[url] = pages.map(&:url) }
    end

    private

    # Converts each entry to a URI, dropping the ones that cannot be parsed.
    def linking_targets(urls)
      urls.map do |url|
        begin
          url.is_a?(URI) ? url : URI(url)
        rescue StandardError
          nil
        end
      end.compact
    end

  end
end
@@ -0,0 +1,8 @@
1
module Medusa
  module Storage
    class << self
      #
      # Factory for the Moneta-backed storage. The implementation file is
      # required lazily on first use, and *args are handed unchanged to
      # Storage::Moneta.new.
      #
      def Moneta(*args)
        require 'medusa/storage/moneta'
        self::Moneta.new(*args)
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'medusa/storage/exceptions'
2
+
3
module Medusa
  module Storage
    #
    # Thin wrapper around a storage adapter. Every call is forwarded to the
    # adapter, and any StandardError raised along the way is replaced by
    # one of the storage error classes named in the rescue clauses below.
    #
    class Base

      #
      # Wraps *adapter*. Raises RuntimeError when the adapter does not
      # respond to one of the methods listed by public_methods(false).
      #
      def initialize(adapter)
        @adap = adapter

        # verify adapter conforms to this class's methods
        public_methods(false).each do |method|
          unless @adap.respond_to?(method.to_sym)
            raise "Storage adapter must support method #{method}"
          end
        end
      end

      # Reads the value stored under *key*; raises RetrievalError on failure.
      def [](key)
        @adap[key]
      rescue
        # Nothing is printed here: a library must not write to stdout on
        # failure, the raised error is the signal.
        raise RetrievalError
      end

      # Stores *value* under *key*; raises InsertionError on failure.
      def []=(key, value)
        @adap[key] = value
      rescue
        raise InsertionError
      end

      # Removes *key*; raises DeletionError on failure.
      def delete(key)
        @adap.delete(key)
      rescue
        raise DeletionError
      end

      # Empties the adapter; raises GenericError on failure.
      def clear
        @adap.clear
      rescue
        raise GenericError
      end

      # Yields each key/value pair. Any StandardError raised while
      # iterating, including one raised by the given block, surfaces as
      # GenericError.
      def each
        @adap.each { |k, v| yield k, v }
      rescue
        raise GenericError
      end

      # Merges *hash* into the adapter; raises GenericError on failure.
      def merge!(hash)
        @adap.merge!(hash)
      rescue
        raise GenericError
      end

      # Closes the adapter; raises CloseError on failure.
      def close
        @adap.close
      rescue
        raise CloseError
      end

      # Number of stored entries; raises GenericError on failure.
      def size
        @adap.size
      rescue
        raise GenericError
      end

      # All stored keys; raises GenericError on failure.
      def keys
        @adap.keys
      rescue
        raise GenericError
      end

      # Whether *key* is stored; raises GenericError on failure.
      def has_key?(key)
        @adap.has_key?(key)
      rescue
        raise GenericError
      end

    end
  end
end