medusa-crawler 1.0.0.pre.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
+ require 'rubygems'
+ require 'cgi'
+ require 'nokogiri'
+ require 'ostruct'
+ require 'webrick/cookie'
+
+ module Medusa
+   class Page
+
+     # The URL of the page
+     attr_reader :url
+     # The raw HTTP response body of the page
+     attr_reader :body
+     # Headers of the HTTP response
+     attr_reader :headers
+     # URL of the page this one redirected to, if any
+     attr_reader :redirect_to
+     # Exception object, if one was raised during HTTP#fetch_page
+     attr_reader :error
+
+     # OpenStruct for user-stored data
+     attr_accessor :data
+     # Integer response code of the page
+     attr_accessor :code
+     # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
+     attr_accessor :visited
+     # Depth of this page from the root of the crawl. This is not necessarily the
+     # shortest path; use PageStore#shortest_paths! to find that value.
+     attr_accessor :depth
+     # URL of the page that brought us to this page
+     attr_accessor :referer
+     # Response time of the request for this page in milliseconds
+     attr_accessor :response_time
+
+     #
+     # Create a new page
+     #
+     def initialize(url, params = {})
+       @url = url
+       @data = OpenStruct.new
+
+       @links = nil
+       @visited = false
+       @body = nil
+       @doc = nil
+       @base = nil
+
+       @code = params[:code]
+       @headers = params[:headers] || {}
+       @headers['content-type'] ||= ''
+       @aliases = Array(params[:aka]).compact
+       @referer = params[:referer]
+       @depth = params[:depth] || 0
+       @redirect_to = to_absolute(params[:redirect_to])
+       @response_time = params[:response_time]
+       @body = params[:body]
+       @error = params[:error]
+
+       @fetched = !params[:code].nil?
+     end
+
+     #
+     # Array of distinct A tag HREFs from the page
+     #
+     def links
+       return @links unless @links.nil?
+       @links = []
+       return @links if !doc
+
+       doc.search("//a[@href]").each do |a|
+         next if a['data-method'] && a['data-method'] != 'get'
+         u = a['href']
+         next if u.nil? or u.empty?
+         abs = to_absolute(u) rescue next
+         @links << abs if in_domain?(abs)
+       end
+       @links.uniq!
+       @links
+     end
+
+     #
+     # Nokogiri document for the HTML body
+     #
+     def doc
+       return @doc if @doc
+       @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
+     end
+
+     #
+     # Delete the Nokogiri document and response body to conserve memory
+     #
+     def discard_doc!
+       links # force parsing of page links before we trash the document
+       @doc = @body = nil
+     end
+
+     #
+     # Was the page successfully fetched?
+     # +true+ if the page was fetched with no error, +false+ otherwise.
+     #
+     def fetched?
+       @fetched
+     end
+
+     #
+     # Array of cookies received with this page as WEBrick::Cookie objects.
+     #
+     def cookies
+       WEBrick::Cookie.parse_set_cookies(@headers['set-cookie']) rescue []
+     end
+
+     #
+     # The content-type returned by the HTTP request for this page
+     #
+     def content_type
+       headers['content-type']
+     end
+
+     #
+     # Returns +true+ if the page is a HTML document, returns +false+
+     # otherwise.
+     #
+     def html?
+       !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
+     end
+
+     #
+     # Returns +true+ if the page is a HTTP redirect, returns +false+
+     # otherwise.
+     #
+     def redirect?
+       (300..307).include?(@code)
+     end
+
+     #
+     # Returns +true+ if the page was not found (returned 404 code),
+     # returns +false+ otherwise.
+     #
+     def not_found?
+       404 == @code
+     end
+
+     #
+     # Base URI from the HTML doc head element
+     # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
+     #
+     def base
+       @base = if doc
+         href = doc.search('//head/base/@href')
+         URI(href.to_s) unless href.nil? rescue nil
+       end unless @base
+
+       return nil if @base && @base.to_s.empty?
+       @base
+     end
+
+
+     #
+     # Converts relative URL *link* into an absolute URL based on the
+     # location of the page
+     #
+     def to_absolute(link)
+       return nil if link.nil?
+
+       # remove anchor
+       link = link.to_s.gsub(/#.*$/, '')
+       if Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
+         link = URI.encode(URI.decode(link))
+       end
+
+       relative = URI(link)
+       absolute = base ? base.merge(relative) : @url.merge(relative)
+
+       absolute.path = '/' if absolute.path.empty?
+
+       return absolute
+     end
+
+     #
+     # Returns +true+ if *uri* is in the same domain as the page, returns
+     # +false+ otherwise
+     #
+     def in_domain?(uri)
+       uri.host == @url.host
+     end
+
+     def marshal_dump
+       [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
+     end
+
+     def marshal_load(ary)
+       @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
+     end
+
+     def to_hash
+       {'url' => @url.to_s,
+        'headers' => Marshal.dump(@headers),
+        'data' => Marshal.dump(@data),
+        'body' => @body,
+        'links' => links.map(&:to_s),
+        'code' => @code,
+        'visited' => @visited,
+        'depth' => @depth,
+        'referer' => @referer.to_s,
+        'redirect_to' => @redirect_to.to_s,
+        'response_time' => @response_time,
+        'fetched' => @fetched}
+     end
+
+     def self.from_hash(hash)
+       page = self.new(URI(hash['url']))
+       {'@headers' => Marshal.load(hash['headers']),
+        '@data' => Marshal.load(hash['data']),
+        '@body' => hash['body'],
+        '@links' => hash['links'].map { |link| URI(link) },
+        '@code' => hash['code'].to_i,
+        '@visited' => hash['visited'],
+        '@depth' => hash['depth'].to_i,
+        '@referer' => hash['referer'],
+        '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+        '@response_time' => hash['response_time'].to_i,
+        '@fetched' => hash['fetched']
+       }.each do |var, value|
+         page.instance_variable_set(var, value)
+       end
+       page
+     end
+   end
+ end
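
For orientation before the next file: Page bundles one fetched URL with its HTTP response, extracts same-host anchor HREFs lazily via Nokogiri, and resolves relative links against the page URL (or a <base> tag, when present). Below is a rough usage sketch, not part of the package diff; it assumes the gem's entry point is require 'medusa', and the URL and markup are invented:

    require 'uri'
    require 'medusa'

    html = <<~HTML
      <html><body>
        <a href="/about">About us</a>
        <a href="https://elsewhere.example/">External</a>
      </body></html>
    HTML

    # The URL must be a URI object: to_absolute merges against it internally.
    page = Medusa::Page.new(
      URI('https://example.com/'),
      code: 200,
      body: html,
      headers: { 'content-type' => 'text/html' },
      response_time: 120
    )

    page.fetched?           # => true  (a :code was supplied)
    page.html?              # => true  (content-type matches text/html)
    page.links.map(&:to_s)  # => ["https://example.com/about"]
                            #    the off-host link is rejected by in_domain?
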
@@ -0,0 +1,160 @@
+ require 'forwardable'
+
+ module Medusa
+   class PageStore
+     extend Forwardable
+
+     def_delegators :@storage, :keys, :size, :each
+
+     def initialize(storage = {})
+       @storage = storage
+     end
+
+     # We typically index the hash with a URI,
+     # but convert it to a String for easier retrieval
+     def [](index)
+       @storage[index.to_s]
+     end
+
+     def []=(index, other)
+       @storage[index.to_s] = other
+     end
+
+     def delete(key)
+       @storage.delete key.to_s
+     end
+
+     def has_key?(key)
+       @storage.has_key? key.to_s
+     end
+
+     def each_value
+       each { |key, value| yield value }
+     end
+
+     def values
+       result = []
+       each { |key, value| result << value }
+       result
+     end
+
+     def touch_key(key)
+       self[key] = Page.new(key)
+     end
+
+     def touch_keys(keys)
+       @storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
+     end
+
+     # Does this PageStore contain the specified URL?
+     # HTTP and HTTPS versions of a URL are considered to be the same page.
+     def has_page?(url)
+       schemes = %w(http https)
+       if schemes.include? url.scheme
+         u = url.dup
+         return schemes.any? { |s| u.scheme = s; has_key?(u) }
+       end
+
+       has_key? url
+     end
+
+     #
+     # Use a breadth-first search to calculate the single-source
+     # shortest paths from *root* to all pages in the PageStore
+     #
+     def shortest_paths!(root)
+       root = URI(root) if root.is_a?(String)
+       raise "Root node not found" if !has_key?(root)
+
+       q = Queue.new
+
+       q.enq root
+       root_page = self[root]
+       root_page.depth = 0
+       root_page.visited = true
+       self[root] = root_page
+       while !q.empty?
+         page = self[q.deq]
+         page.links.each do |u|
+           begin
+             link = self[u]
+             next if link.nil? || !link.fetched? || link.visited
+
+             q << u unless link.redirect?
+             link.visited = true
+             link.depth = page.depth + 1
+             self[u] = link
+
+             if link.redirect?
+               u = link.redirect_to
+               redo
+             end
+           end
+         end
+       end
+
+       self
+     end
+
+     #
+     # Removes all Pages from storage where redirect? is true
+     #
+     def uniq!
+       each_value { |page| delete page.url if page.redirect? }
+       self
+     end
+
+     #
+     # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL.
+     # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs.
+     #
+     def pages_linking_to(urls)
+       unless urls.is_a?(Array)
+         urls = [urls]
+         single = true
+       end
+
+       urls.map! do |url|
+         if url.is_a?(URI)
+           url
+         else
+           URI(url) rescue nil
+         end
+       end
+       urls.compact!
+
+       links = {}
+       urls.each { |url| links[url] = [] }
+       values.each do |page|
+         urls.each { |url| links[url] << page if page.links.include?(url) }
+       end
+
+       if single and !links.empty?
+         return links[urls.first]
+       else
+         return links
+       end
+     end
+
+     #
+     # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL.
+     # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs.
+     #
+     def urls_linking_to(urls)
+       unless urls.is_a?(Array)
+         urls = [urls]
+         single = true
+       end
+
+       links = pages_linking_to(urls)
+       links.each { |url, pages| links[url] = pages.map(&:url) }
+
+       if single and !links.empty?
+         return links[urls.first]
+       else
+         return links
+       end
+     end
+
+   end
+ end
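
PageStore, for its part, is a thin wrapper over a Hash-like store: keys are stringified URLs, values are Page objects, and helpers such as has_page? and pages_linking_to are built on top. A small illustrative sketch (again not from the package; URLs and markup are made up) using the default in-memory backend:

    require 'uri'
    require 'medusa'

    store = Medusa::PageStore.new   # backed by a plain Hash by default

    root = URI('https://example.com/')
    store[root] = Medusa::Page.new(
      root,
      code: 200,
      body: '<a href="/about">About</a>',
      headers: { 'content-type' => 'text/html' }
    )
    store.touch_key(URI('https://example.com/about'))   # unfetched placeholder

    store.has_page?(URI('http://example.com/'))
    # => true -- http and https variants count as the same page

    store.pages_linking_to('https://example.com/about').map { |p| p.url.to_s }
    # => ["https://example.com/"]
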
@@ -0,0 +1,8 @@
+ module Medusa
+   module Storage
+     def self.Moneta(*args)
+       require 'medusa/storage/moneta'
+       self::Moneta.new(*args)
+     end
+   end
+ end
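
This Storage.Moneta shim is a lazy factory: the Moneta-backed adapter is only required when first requested, which keeps the dependency optional. Usage would presumably look like the line below, though the adapter argument here is illustrative; arguments are forwarded verbatim to Medusa::Storage::Moneta.new, whose file is not part of this diff:

    # Loads 'medusa/storage/moneta' on first use, then builds the adapter.
    storage = Medusa::Storage.Moneta(:Memory)
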
@@ -0,0 +1,80 @@
+ require 'medusa/storage/exceptions'
+
+ module Medusa
+   module Storage
+     class Base
+
+       def initialize(adapter)
+         @adap = adapter
+
+         # verify adapter conforms to this class's methods
+         public_methods(false).each do |method|
+           if !@adap.respond_to?(method.to_sym)
+             raise "Storage adapter must support method #{method}"
+           end
+         end
+       end
+
+       def [](key)
+         @adap[key]
+       rescue
+         raise RetrievalError
+       end
+
+       def []=(key, value)
+         @adap[key] = value
+       rescue
+         raise InsertionError
+       end
+
+       def delete(key)
+         @adap.delete(key)
+       rescue
+         raise DeletionError
+       end
+
+       def clear
+         @adap.clear
+       rescue
+         raise GenericError
+       end
+
+       def each
+         @adap.each { |k, v| yield k, v }
+       rescue
+         raise GenericError
+       end
+
+       def merge!(hash)
+         @adap.merge!(hash)
+       rescue
+         raise GenericError
+       end
+
+       def close
+         @adap.close
+       rescue
+         raise CloseError
+       end
+
+       def size
+         @adap.size
+       rescue
+         raise GenericError
+       end
+
+       def keys
+         @adap.keys
+       rescue
+         raise GenericError
+       end
+
+       def has_key?(key)
+         @adap.has_key?(key)
+       rescue
+         raise GenericError
+       end
+
+     end
+   end
+ end
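
Storage::Base is a guard layer rather than a store of its own: at construction it verifies, via public_methods(false), that the wrapped adapter answers every method Base exposes, and it converts adapter failures into the typed errors from medusa/storage/exceptions. A minimal sketch of a conforming adapter follows; the MemoryAdapter class is invented for illustration, and the require path is assumed to mirror the medusa/storage/exceptions path used above:

    require 'forwardable'
    require 'medusa/storage/base'

    # Hypothetical in-memory adapter. It must respond to every public
    # method of Storage::Base, including #close, or Base#initialize raises.
    class MemoryAdapter
      extend Forwardable
      def_delegators :@hash, :[], :[]=, :delete, :clear, :each,
                     :merge!, :size, :keys, :has_key?

      def initialize
        @hash = {}
      end

      def close; end   # nothing to release for an in-memory store
    end

    store = Medusa::Storage::Base.new(MemoryAdapter.new)
    store['https://example.com/'] = 'placeholder page'
    store.has_key?('https://example.com/')   # => true
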