medusa-crawler 1.0.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/CHANGELOG.md +20 -0
- data/CONTRIBUTORS.md +22 -0
- data/LICENSE.txt +20 -0
- data/README.md +48 -0
- data/Rakefile +24 -0
- data/VERSION +1 -0
- data/bin/medusa +4 -0
- data/lib/medusa.rb +2 -0
- data/lib/medusa/cli.rb +24 -0
- data/lib/medusa/cli/count.rb +22 -0
- data/lib/medusa/cli/cron.rb +90 -0
- data/lib/medusa/cli/pagedepth.rb +32 -0
- data/lib/medusa/cli/serialize.rb +35 -0
- data/lib/medusa/cli/url_list.rb +41 -0
- data/lib/medusa/cookie_store.rb +35 -0
- data/lib/medusa/core.rb +305 -0
- data/lib/medusa/exceptions.rb +5 -0
- data/lib/medusa/http.rb +202 -0
- data/lib/medusa/page.rb +229 -0
- data/lib/medusa/page_store.rb +160 -0
- data/lib/medusa/storage.rb +8 -0
- data/lib/medusa/storage/base.rb +81 -0
- data/lib/medusa/storage/exceptions.rb +15 -0
- data/lib/medusa/storage/moneta.rb +42 -0
- data/lib/medusa/tentacle.rb +39 -0
- data/lib/medusa/version.rb +3 -0
- data/spec/fakeweb_helper.rb +85 -0
- data/spec/medusa_helper.rb +5 -0
- data/spec/medusa_spec.rb +14 -0
- data/spec/spec_helper.rb +104 -0
- metadata +187 -0
- metadata.gz.sig +0 -0
data/lib/medusa/page.rb
ADDED
@@ -0,0 +1,229 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'cgi'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'ostruct'
|
5
|
+
require 'webrick/cookie'
|
6
|
+
|
7
|
+
module Medusa
  #
  # A single URL handled by the crawler together with the response data
  # (status code, headers, body) recorded for it and the links extracted
  # from its HTML body.
  #
  class Page

    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page
    #
    # *url* is expected to be a URI object (it is used with #merge and #host).
    # *params* may contain :code, :headers, :aka, :referer, :depth,
    # :redirect_to, :response_time, :body and :error.
    #
    def initialize(url, params = {})
      @url = url
      @data = OpenStruct.new

      @links = nil
      @visited = false
      @body = nil
      @doc = nil
      @base = nil

      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ''
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      # Resolved while @body is still nil, so #base finds no document and the
      # redirect target is made absolute against @url. Keep this line above
      # the @body assignment.
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]

      @fetched = !params[:code].nil?
    end

    #
    # Array of distinct A tag HREFs from the page, as absolute URIs on the
    # same host as the page. Anchors whose data-method is set to something
    # other than 'get' are skipped, as are HREFs that cannot be made absolute.
    # The result is memoized.
    #
    def links
      return @links unless @links.nil?

      @links = []
      return @links unless doc

      doc.search("//a[@href]").each do |a|
        next if a['data-method'] && a['data-method'] != 'get'

        u = a['href']
        next if u.nil? || u.empty?

        abs = begin
          to_absolute(u)
        rescue StandardError
          next
        end
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    #
    # Nokogiri document for the HTML body, or +nil+ when there is no body,
    # the page is not HTML, or parsing raised an error.
    #
    def doc
      return @doc if @doc

      @doc = Nokogiri::HTML(@body) if @body && html?
    rescue StandardError
      nil
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if a response code was supplied when the page was created,
    # +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # Array of cookies received with this page as WEBrick::Cookie objects.
    # Returns an empty Array if the header cannot be parsed.
    #
    def cookies
      WEBrick::Cookie.parse_set_cookies(@headers['set-cookie'])
    rescue StandardError
      []
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type']
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # The '+' must be escaped: unescaped it is a regexp quantifier and the
      # pattern would never match the real "application/xhtml+xml" type.
      !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      # 308 (Permanent Redirect) is included alongside the 300..307 codes.
      (300..308).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    # Returns +nil+ when the document has no usable base href.
    #
    def base
      unless @base
        @base = if doc
                  href = doc.search('//head/base/@href')
                  begin
                    URI(href.to_s) unless href.nil?
                  rescue StandardError
                    nil
                  end
                end
      end

      # An empty href parses to a URI whose string form is empty; treat it
      # as "no base".
      return nil if @base && @base.to_s.empty?

      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page (or on the document's base href, when present).
    # Returns +nil+ when *link* is +nil+. The fragment is dropped and an
    # empty path is replaced by '/'.
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      link = link.to_s.gsub(/#.*$/, '')
      if Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
        link = URI.encode(URI.decode(link))
      end

      relative = URI(link)
      page_base = base
      absolute = page_base ? page_base.merge(relative) : @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    #
    # State written by Marshal.dump. The parsed document, base URI, aliases
    # and error are not included.
    #
    def marshal_dump
      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end

    #
    # Restores the state produced by #marshal_dump (same element order).
    #
    def marshal_load(ary)
      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end

    #
    # Hash representation of the page with String keys. URIs are converted
    # to Strings; headers and data are stored as Marshal dumps.
    #
    def to_hash
      {
        'url' => @url.to_s,
        'headers' => Marshal.dump(@headers),
        'data' => Marshal.dump(@data),
        'body' => @body,
        'links' => links.map(&:to_s),
        'code' => @code,
        'visited' => @visited,
        'depth' => @depth,
        'referer' => @referer.to_s,
        'redirect_to' => @redirect_to.to_s,
        'response_time' => @response_time,
        'fetched' => @fetched
      }
    end

    #
    # Builds a Page from a Hash in the format produced by #to_hash.
    #
    # NOTE(review): 'headers' and 'data' are read with Marshal.load, which can
    # instantiate arbitrary objects; only pass hashes that come from a
    # trusted store.
    #
    def self.from_hash(hash)
      page = new(URI(hash['url']))
      redirect = hash['redirect_to']

      {
        '@headers' => Marshal.load(hash['headers']),
        '@data' => Marshal.load(hash['data']),
        '@body' => hash['body'],
        '@links' => hash['links'].map { |link| URI(link) },
        '@code' => hash['code'].to_i,
        '@visited' => hash['visited'],
        '@depth' => hash['depth'].to_i,
        '@referer' => hash['referer'],
        '@redirect_to' => (redirect && !redirect.empty?) ? URI(redirect) : nil,
        '@response_time' => hash['response_time'].to_i,
        '@fetched' => hash['fetched']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
module Medusa
  #
  # Collection of Page objects keyed by the String form of their URL,
  # backed by any Hash-like storage object.
  #
  class PageStore
    extend Forwardable

    # #values is defined explicitly below in terms of #each, so it is not
    # delegated here.
    def_delegators :@storage, :keys, :size, :each

    # *storage* is a Hash or an object offering the Hash methods used here.
    def initialize(storage = {})
      @storage = storage
    end

    # We typically index the hash with a URI,
    # but convert it to a String for easier retrieval
    def [](index)
      @storage[index.to_s]
    end

    def []=(index, other)
      @storage[index.to_s] = other
    end

    def delete(key)
      @storage.delete key.to_s
    end

    def has_key?(key)
      @storage.has_key? key.to_s
    end

    # Yields each stored value. Returns an Enumerator when no block is given.
    def each_value
      return enum_for(:each_value) unless block_given?

      each { |_key, value| yield value }
    end

    # Array of all stored values, collected through #each.
    def values
      result = []
      each { |_key, value| result << value }
      result
    end

    # Stores a fresh, unfetched Page for *key*.
    def touch_key(key)
      self[key] = Page.new(key)
    end

    # Stores a fresh, unfetched Page for every key in *keys* with a single
    # merge! call on the storage.
    def touch_keys(keys)
      @storage.merge!(keys.each_with_object({}) { |k, h| h[k.to_s] = Page.new(k) })
    end

    # Does this PageStore contain the specified URL?
    # HTTP and HTTPS versions of a URL are considered to be the same page.
    def has_page?(url)
      schemes = %w(http https)
      if schemes.include? url.scheme
        u = url.dup # never modify the caller's URI
        return schemes.any? { |s| u.scheme = s; has_key?(u) }
      end

      has_key? url
    end

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageStore
    #
    # Raises a RuntimeError if *root* is not in the store. Returns self.
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" unless has_key?(root)

      q = Queue.new

      q.enq root
      root_page = self[root]
      root_page.depth = 0
      root_page.visited = true
      self[root] = root_page # write back for storages that return copies
      until q.empty?
        page = self[q.deq]
        page.links.each do |u|
          # Follow a chain of redirects starting at u. Every hop gets the
          # same depth; only the final, non-redirect page is queued.
          target = u
          loop do
            link = self[target]
            break if link.nil? || !link.fetched? || link.visited

            q << target unless link.redirect?
            link.visited = true
            link.depth = page.depth + 1
            self[target] = link

            break unless link.redirect?

            target = link.redirect_to
          end
        end
      end

      self
    end

    #
    # Removes all Pages from storage where redirect? is true
    #
    def uniq!
      # Collect first so storage is not modified while it is being iterated.
      values.select(&:redirect?).each { |page| delete page.url }
      self
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    # URLs that cannot be parsed are ignored. The argument is not modified.
    #
    def pages_linking_to(urls)
      single = !urls.is_a?(Array)
      targets = (single ? [urls] : urls).map { |url| coerce_uri(url) }.compact

      links = targets.each_with_object({}) { |url, h| h[url] = [] }
      values.each do |page|
        page_links = page.links
        targets.each { |url| links[url] << page if page_links.include?(url) }
      end

      single ? links.fetch(targets.first, []) : links
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      return pages_linking_to(urls).map(&:url) unless urls.is_a?(Array)

      pages_linking_to(urls).each_with_object({}) do |(url, pages), h|
        h[url] = pages.map(&:url)
      end
    end

    private

    # Returns *url* as a URI, or +nil+ when it cannot be converted.
    def coerce_uri(url)
      return url if url.is_a?(URI)

      URI(url)
    rescue StandardError
      nil
    end

  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'medusa/storage/exceptions'
|
2
|
+
|
3
|
+
module Medusa
  module Storage
    #
    # Wrapper around a storage adapter. Every public method forwards to the
    # method of the same name on the adapter and re-raises any StandardError
    # as one of the storage error classes.
    #
    class Base

      #
      # Wraps *adapter*. Raises a RuntimeError if the adapter does not
      # respond to one of this object's public methods.
      #
      def initialize(adapter)
        @adap = adapter

        # verify adapter conforms to this class's methods
        public_methods(false).each do |method|
          unless @adap.respond_to?(method.to_sym)
            raise "Storage adapter must support method #{method}"
          end
        end
      end

      # Returns the value stored under *key*.
      # Raises RetrievalError if the adapter fails.
      def [](key)
        @adap[key]
      rescue StandardError
        # The failing key is reported in the exception message rather than
        # printed to stdout.
        raise RetrievalError, "could not retrieve key #{key.inspect}"
      end

      # Stores *value* under *key*.
      # Raises InsertionError if the adapter fails.
      def []=(key, value)
        @adap[key] = value
      rescue StandardError
        raise InsertionError
      end

      # Deletes *key* from the adapter.
      # Raises DeletionError if the adapter fails.
      def delete(key)
        @adap.delete(key)
      rescue StandardError
        raise DeletionError
      end

      # Clears the adapter.
      # Raises GenericError if the adapter fails.
      def clear
        @adap.clear
      rescue StandardError
        raise GenericError
      end

      # Yields every key/value pair of the adapter.
      # Raises GenericError if the adapter fails; note that the rescue also
      # covers errors raised inside the caller's block.
      def each
        @adap.each { |k, v| yield k, v }
      rescue StandardError
        raise GenericError
      end

      # Merges *hash* into the adapter.
      # Raises GenericError if the adapter fails.
      def merge!(hash)
        @adap.merge!(hash)
      rescue StandardError
        raise GenericError
      end

      # Closes the adapter.
      # Raises CloseError if the adapter fails.
      def close
        @adap.close
      rescue StandardError
        raise CloseError
      end

      # Size reported by the adapter.
      # Raises GenericError if the adapter fails.
      def size
        @adap.size
      rescue StandardError
        raise GenericError
      end

      # Keys reported by the adapter.
      # Raises GenericError if the adapter fails.
      def keys
        @adap.keys
      rescue StandardError
        raise GenericError
      end

      # Whether the adapter has *key*.
      # Raises GenericError if the adapter fails.
      def has_key?(key)
        @adap.has_key?(key)
      rescue StandardError
        raise GenericError
      end

    end
  end
end
|