spk-anemone 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +34 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +24 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/core.rb +280 -0
- data/lib/anemone/http.rb +126 -0
- data/lib/anemone/page.rb +158 -0
- data/lib/anemone/page_hash.rb +142 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +15 -0
- data/spec/core_spec.rb +203 -0
- data/spec/fakeweb_helper.rb +57 -0
- data/spec/page_spec.rb +52 -0
- data/spec/spec_helper.rb +7 -0
- metadata +97 -0
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'net/https'
|
2
|
+
require 'anemone/page'
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
class HTTP
  # Maximum number of redirects to follow on each get_response
  REDIRECT_LIMIT = 5

  # Maximum number of times a request is retried after the server
  # closes the connection unexpectedly (EOFError)
  EOF_RETRY_LIMIT = 3

  #
  # Create a new HTTP client.
  #
  # *opts* is a Hash; the keys read by this class are :redirect_limit,
  # :user_agent, :authorization and :verbose.
  #
  def initialize(opts = {})
    @connections = {}
    @opts = opts
  end

  #
  # Create a new Page from the response of an HTTP request to *url*
  #
  # *url* may be a String or a URI. When *from_page* is given, its url is
  # sent as the Referer and the new page's depth is from_page.depth + 1.
  #
  # Any StandardError raised while fetching is swallowed (and printed when
  # the :verbose option is set); in that case a Page holding only *url*
  # is returned.
  #
  def fetch_page(url, from_page = nil)
    begin
      url = URI(url) unless url.is_a?(URI)

      if from_page
        referer = from_page.url
        depth = from_page.depth + 1
      end

      response, code, location, response_time = get(url, referer)

      # If we were redirected, remember the final location as an alias
      aka = url.eql?(location) ? nil : location

      return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
    rescue => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end
      return Page.new(url)
    end
  end

  private

  #
  # Retrieve an HTTP response for *url*, following redirects.
  #
  # Returns the final response object, the response code of the *first*
  # request (so a redirect is still reported as 3xx), the final URI
  # location, and the response time of the last request in milliseconds.
  #
  def get(url, referer = nil)
    response, response_time = get_response(url, referer)
    code = Integer(response.code)
    loc = url

    limit = redirect_limit
    while response.is_a?(Net::HTTPRedirection) && limit > 0
      target = URI(response['location'])
      # A relative Location must be resolved against the URL that issued
      # the redirect, which is not the original URL after the first hop.
      target = loc.merge(target) if target.relative?
      loc = target
      response, response_time = get_response(loc, referer)
      limit -= 1
    end

    return response, code, loc, response_time
  end

  #
  # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
  #
  # Returns the response and the elapsed time of the request in
  # milliseconds. The request is retried on EOFError with a fresh
  # connection; once the retries are exhausted the EOFError is re-raised.
  #
  def get_response(url, referer = nil)
    # An empty request path is invalid; request the root instead
    path = url.path.to_s.empty? ? '/' : url.path
    full_path = url.query.nil? ? path : "#{path}?#{url.query}"

    opts = {}
    opts['User-Agent'] = user_agent if user_agent
    opts['Referer'] = referer.to_s if referer
    opts['Authorization'] = authorization if authorization

    retries = 0
    begin
      start = Time.now
      response = connection(url).get(full_path, opts)
      finish = Time.now
      response_time = ((finish - start) * 1000).round
      return response, response_time
    rescue EOFError
      refresh_connection(url)
      retries += 1
      retry unless retries > EOF_RETRY_LIMIT
      # Out of retries: surface the real error to the caller
      raise
    end
  end

  #
  # Return the cached connection for the host and port of *url*,
  # opening a new one if none is cached yet.
  #
  def connection(url)
    @connections[url.host] ||= {}

    cached = @connections[url.host][url.port]
    return cached if cached

    refresh_connection(url)
  end

  #
  # Open a new connection for the host and port of *url*, store it in the
  # connection cache and return it.
  #
  def refresh_connection(url)
    http = Net::HTTP.new(url.host, url.port)
    if url.scheme == 'https'
      http.use_ssl = true
      # NOTE(review): certificate verification is disabled here, so HTTPS
      # peers are not authenticated. Left as-is; confirm this is intended.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    (@connections[url.host] ||= {})[url.port] = http.start
  end

  # Redirect limit from the options, falling back to REDIRECT_LIMIT
  def redirect_limit
    @opts[:redirect_limit] || REDIRECT_LIMIT
  end

  # User-Agent string from the options (may be nil)
  def user_agent
    @opts[:user_agent]
  end

  # Whether fetch errors should be printed
  def verbose?
    @opts[:verbose]
  end

  # Value for the Authorization header from the options (may be nil)
  def authorization
    @opts[:authorization]
  end

end
|
126
|
+
end
|
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'ostruct'
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
class Page

  # The URL of the page
  attr_reader :url
  # Headers of the HTTP response
  attr_reader :headers

  # OpenStruct for user-stored data
  attr_accessor :data
  # HTML body
  attr_accessor :body
  # Nokogiri document for the HTML body
  attr_accessor :doc
  # Integer response code of the page
  attr_accessor :code
  # Array of redirect-aliases for the page
  attr_accessor :aliases
  # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
  attr_accessor :visited
  # Depth of this page from the root of the crawl. This is not necessarily the
  # shortest path; use PageHash#shortest_paths! to find that value.
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
  # Response time of the request for this page in milliseconds
  attr_accessor :response_time

  #
  # Create a new page
  #
  # Only *url* is required. *headers* defaults to an empty Hash and always
  # gets a 'content-type' entry; *aka* (a single alias or an Array) becomes
  # the aliases Array; a nil *depth* is stored as 0. The body is parsed
  # into a document only when it is present and the content type is HTML;
  # any error while parsing leaves the document nil.
  #
  def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
    @url = url
    @code = code
    @headers = headers || {}
    @headers['content-type'] ||= ['']
    @aliases = Array(aka)
    @data = OpenStruct.new
    @referer = referer
    @depth = depth || 0
    @response_time = response_time
    @body = body
    @doc = Nokogiri::HTML(body) if body && html? rescue nil
  end

  #
  # Array of distinct A tag HREFs from the page
  #
  # Only links on the same host as the page are returned, as absolute
  # URIs with the anchor removed. The result is computed once and cached.
  # HREFs that cannot be parsed as a URI are skipped.
  #
  def links
    return @links unless @links.nil?
    @links = []
    return @links if !doc

    doc.css('a').each do |a|
      u = a.attributes['href'].content rescue nil
      next if u.nil? || u.empty?
      abs = to_absolute(URI(u)) rescue next
      @links << abs if in_domain?(abs)
    end
    @links.uniq!
    @links
  end

  #
  # Drop the parsed document, keeping the extracted links
  #
  def discard_doc!
    links # force parsing of page links before we trash the document
    @doc = nil
  end

  #
  # Return a clone of this page with a 200 response code
  #
  # The clone is shallow, so it shares its aliases Array (and other
  # attribute objects) with this page.
  #
  # NOTE(review): the *url* argument is not used, and @aka is never
  # assigned anywhere in this class, so the add_alias! call below looks
  # like dead code -- confirm the intent against the callers.
  #
  def alias_clone(url)
    p = clone
    p.add_alias!(@aka) if !@aka.nil?
    p.code = 200
    p
  end

  #
  # Add a redirect-alias *aka* to the list of the page's aliases,
  # unless it is already present
  #
  # Returns *self*
  #
  def add_alias!(aka)
    @aliases << aka if !@aliases.include?(aka)
    self
  end

  #
  # Returns an Array of all links from this page, each followed by the
  # redirect-aliases of the page it points to.
  #
  # *page_hash* is a PageHash object with the results of the current crawl.
  # Links that have no (or a nil) entry in *page_hash* contribute only
  # themselves.
  #
  def links_and_their_aliases(page_hash)
    links.inject([]) do |results, link|
      results << link
      target = page_hash[link]
      results.concat(target.aliases) if target
      results
    end
  end

  #
  # The content-type returned by the HTTP request for this page
  #
  def content_type
    headers['content-type'].first
  end

  #
  # Returns +true+ if the page is a HTML document, returns +false+
  # otherwise.
  #
  def html?
    # The '+' must be escaped, otherwise it is a regex quantifier and
    # 'application/xhtml+xml' never matches.
    !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
  end

  #
  # Returns +true+ if the page is a HTTP redirect, returns +false+
  # otherwise.
  #
  def redirect?
    (300..399).include?(@code)
  end

  #
  # Returns +true+ if the page was not found (returned 404 code),
  # returns +false+ otherwise.
  #
  def not_found?
    404 == @code
  end

  #
  # Converts relative URL *link* into an absolute URL based on the
  # location of the page
  #
  # A trailing anchor is removed and an empty path becomes '/'.
  #
  def to_absolute(link)
    # remove anchor
    link = escape_uri(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

    relative = URI(link)
    absolute = @url.merge(relative)

    absolute.path = '/' if absolute.path.empty?

    return absolute
  end

  #
  # Returns +true+ if *uri* is in the same domain as the page, returns
  # +false+ otherwise
  #
  def in_domain?(uri)
    uri.host == @url.host
  end

  private

  #
  # Percent-escape unsafe characters in *str*. URI.encode no longer exists
  # on Ruby 3.0 and later, so fall back to the default parser's escape there.
  #
  def escape_uri(str)
    if URI.respond_to?(:encode)
      URI.encode(str)
    else
      URI::DEFAULT_PARSER.escape(str)
    end
  end
end
|
158
|
+
end
|
data/lib/anemone/page_hash.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
module Anemone
|
2
|
+
class PageHash < Hash

  # We typically index the hash with a URI,
  # but convert it to a String for easier retrieval
  def [](index)
    super(index.to_s)
  end

  # Store *other* under the String form of *index*
  def []=(index, other)
    super(index.to_s, other)
  end

  # Is the String form of *key* present in the hash?
  def has_key?(key)
    super(key.to_s)
  end

  # Does this PageHash contain the specified URL?
  # HTTP and HTTPS versions of a URL are considered to be the same page.
  def has_page?(url)
    schemes = %w(http https)
    if schemes.include? url.scheme
      # Probe with a copy so the caller's URI keeps its scheme
      u = url.dup
      return schemes.any? { |s| u.scheme = s; has_key?(u) }
    end

    has_key?(url)
  end

  #
  # Use a breadth-first search to calculate the single-source
  # shortest paths from *root* to all pages in the PageHash
  #
  # *root* may be a String or a URI. Raises a RuntimeError if it is not
  # a key of this hash. Each reached page, and each of its aliases that
  # has an entry, gets its depth lowered to the shortest distance found.
  #
  # Returns *self*
  #
  def shortest_paths!(root)
    root = URI(root) if root.is_a?(String)
    raise "Root node not found" if !has_key?(root)

    each_value { |p| p.visited = false if p }

    # A plain Array is enough for a single-threaded FIFO queue
    queue = [root]
    self[root].depth = 0
    self[root].visited = true

    until queue.empty?
      url = queue.shift
      next if !has_key?(url)

      page = self[url]

      page.links.each do |u|
        next if !has_key?(u) || self[u].nil?
        link = self[u]

        # Aliases without an entry in the hash have no Page to update
        nodes = [link].concat(link.aliases.map { |a| self[a] }).compact

        nodes.each do |node|
          if node.depth.nil? || page.depth + 1 < node.depth
            node.depth = page.depth + 1
          end
        end

        queue.push(link.url) if !link.visited
        link.visited = true
      end
    end

    self
  end

  #
  # Returns a new PageHash by removing redirect-aliases for each
  # non-redirect Page
  #
  # Entries whose value is nil are skipped.
  #
  def uniq
    results = PageHash.new
    each do |url, page|
      next if page.nil?

      # if none of the aliases of this page have been added, and this isn't a redirect page, add this page
      page_added = page.aliases.any? { |a| results.has_key?(a) }
      if !page.redirect? && !page_added
        results[url] = page.clone
        results[url].aliases = []
      end
    end

    results
  end

  #
  # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
  # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
  #
  # Strings that cannot be parsed as a URI are ignored; for a single
  # unparsable URL an empty Array is returned. The argument is not modified.
  #
  def pages_linking_to(urls)
    single = !urls.is_a?(Array)
    targets = normalize_urls(single ? [urls] : urls)

    links = {}
    targets.each { |url| links[url] = [] }
    values.each do |page|
      next if page.nil?
      targets.each { |url| links[url] << page if page.links.include?(url) }
    end

    single ? (links.values.first || []) : links
  end

  #
  # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
  # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
  #
  def urls_linking_to(urls)
    single = !urls.is_a?(Array)

    links = {}
    pages_linking_to(single ? [urls] : urls).each do |url, pages|
      links[url] = pages.map { |p| p.url }
    end

    single ? (links.values.first || []) : links
  end

  private

  #
  # Returns a new Array with every String in *urls* converted to a URI;
  # Strings that fail to parse, and nils, are dropped.
  #
  def normalize_urls(urls)
    urls.map { |url| url.is_a?(String) ? (URI(url) rescue nil) : url }.compact
  end

end
|
142
|
+
end
|
data/lib/anemone/tentacle.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'anemone/http'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
class Tentacle

  #
  # Build a Tentacle that takes work from *link_queue* and pushes
  # results onto *page_queue*. *opts* is handed to Anemone::HTTP and is
  # also consulted for the optional :delay between requests.
  #
  def initialize(link_queue, page_queue, opts = {})
    @opts = opts
    @http = Anemone::HTTP.new(opts)
    @page_queue = page_queue
    @link_queue = link_queue
  end

  #
  # Worker loop: dequeue a link (with the page it was found on), fetch
  # it, and enqueue the resulting Page. Stops as soon as the :END
  # marker is dequeued.
  #
  def run
    loop do
      job = @link_queue.deq
      link, from_page = job

      break if link == :END

      fetched = @http.fetch_page(link, from_page)
      @page_queue << fetched

      delay
    end
  end

  private

  # Sleep for the configured :delay, if one was given
  def delay
    pause = @opts[:delay]
    sleep pause if pause
  end

end
|
39
|
+
end
|