shingara-anemone 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +27 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +24 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/core.rb +256 -0
- data/lib/anemone/http.rb +123 -0
- data/lib/anemone/page.rb +155 -0
- data/lib/anemone/page_hash.rb +142 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +15 -0
- data/spec/core_spec.rb +203 -0
- data/spec/fakeweb_helper.rb +57 -0
- data/spec/page_spec.rb +52 -0
- data/spec/spec_helper.rb +7 -0
- metadata +96 -0
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,123 @@
+require 'net/https'
+require 'anemone/page'
+
+module Anemone
+  class HTTP
+    # Maximum number of redirects to follow on each get_response
+    REDIRECT_LIMIT = 5
+
+    def initialize(opts = {})
+      @connections = {}
+      @opts = opts
+    end
+
+    #
+    # Create a new Page from the response of an HTTP request to *url*
+    #
+    def fetch_page(url, from_page = nil)
+      begin
+        url = URI(url) unless url.is_a?(URI)
+
+        if from_page
+          referer = from_page.url
+          depth = from_page.depth + 1
+        end
+
+        response, code, location, response_time = get(url, referer)
+
+        aka = nil
+        if !url.eql?(location)
+          aka = location
+        end
+
+        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+      rescue => e
+        if verbose?
+          puts e.inspect
+          puts e.backtrace
+        end
+        return Page.new(url)
+      end
+    end
+
+    private
+
+    #
+    # Retrieve an HTTP response for *url*, following redirects.
+    # Returns the response object, response code, and final URI location.
+    #
+    def get(url, referer = nil)
+      response, response_time = get_response(url, referer)
+      code = Integer(response.code)
+      loc = url
+
+      limit = redirect_limit
+      while response.is_a?(Net::HTTPRedirection) and limit > 0
+        loc = URI(response['location'])
+        loc = url.merge(loc) if loc.relative?
+        response, response_time = get_response(loc, referer)
+        limit -= 1
+      end
+
+      return response, code, loc, response_time
+    end
+
+    #
+    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+    #
+    def get_response(url, referer = nil)
+      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+      opts = {}
+      opts['User-Agent'] = user_agent if user_agent
+      opts['Referer'] = referer.to_s if referer
+
+      retries = 0
+      begin
+        start = Time.now()
+        req = Net::HTTP::Get.new(full_path, opts)
+        req.basic_auth url.user, url.password if url.user
+        response = connection(url).request(req)
+        finish = Time.now()
+        response_time = ((finish - start) * 1000).round
+        return response, response_time
+      rescue EOFError
+        refresh_connection(url)
+        retries += 1
+        retry unless retries > 3
+      end
+    end
+
+    def connection(url)
+      @connections[url.host] ||= {}
+
+      if conn = @connections[url.host][url.port]
+        return conn
+      end
+
+      refresh_connection(url)
+    end
+
+    def refresh_connection(url)
+      http = Net::HTTP.new(url.host, url.port)
+      if url.scheme == 'https'
+        http.use_ssl = true
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      end
+      @connections[url.host][url.port] = http.start
+    end
+
+    def redirect_limit
+      @opts[:redirect_limit] || REDIRECT_LIMIT
+    end
+
+    def user_agent
+      @opts[:user_agent]
+    end
+
+    def verbose?
+      @opts[:verbose]
+    end
+
+  end
+end
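Illustrative usage (not part of the package): a minimal sketch of driving Anemone::HTTP on its own, based on the options the class reads above (:user_agent, :redirect_limit, :verbose). The example URL is made up.

    require 'anemone/http'

    # Build a fetcher with a custom User-Agent and a tighter redirect limit.
    http = Anemone::HTTP.new(:user_agent => 'MyCrawler/1.0 (example)',
                             :redirect_limit => 3,
                             :verbose => true)

    # fetch_page follows redirects via get/get_response and always returns an
    # Anemone::Page; on any error it returns a Page with no response code.
    page = http.fetch_page('http://example.com/')
    puts page.code            # Integer response code, or nil if the fetch failed
    puts page.response_time   # request time in milliseconds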
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,155 @@
+require 'nokogiri'
+require 'ostruct'
+
+module Anemone
+  class Page
+
+    # The URL of the page
+    attr_reader :url
+    # Headers of the HTTP response
+    attr_reader :headers
+
+    # OpenStruct for user-stored data
+    attr_accessor :data
+    # Nokogiri document for the HTML body
+    attr_accessor :doc
+    # Integer response code of the page
+    attr_accessor :code
+    # Array of redirect-aliases for the page
+    attr_accessor :aliases
+    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+    attr_accessor :visited
+    # Depth of this page from the root of the crawl. This is not necessarily the
+    # shortest path; use PageHash#shortest_paths! to find that value.
+    attr_accessor :depth
+    # URL of the page that brought us to this page
+    attr_accessor :referer
+    # Response time of the request for this page in milliseconds
+    attr_accessor :response_time
+
+    #
+    # Create a new page
+    #
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
+      @url = url
+      @code = code
+      @headers = headers || {}
+      @headers['content-type'] ||= ['']
+      @aliases = Array(aka)
+      @data = OpenStruct.new
+      @referer = referer
+      @depth = depth || 0
+      @response_time = response_time
+      @doc = Nokogiri::HTML(body) if body && html? rescue nil
+    end
+
+    # Array of distinct A tag HREFs from the page
+    def links
+      return @links unless @links.nil?
+      @links = []
+      return @links if !doc
+
+      doc.css('a').each do |a|
+        u = a.attributes['href'].content rescue nil
+        next if u.nil? or u.empty?
+        abs = to_absolute(URI(u)) rescue next
+        @links << abs if in_domain?(abs)
+      end
+      @links.uniq!
+      @links
+    end
+
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = nil
+    end
+
+    #
+    # Return a new page with the same *response* and *url*, but
+    # with a 200 response code
+    #
+    def alias_clone(url)
+      p = clone
+      p.add_alias!(@aka) if !@aka.nil?
+      p.code = 200
+      p
+    end
+
+    #
+    # Add a redirect-alias String *aka* to the list of the page's aliases
+    #
+    # Returns *self*
+    #
+    def add_alias!(aka)
+      @aliases << aka if !@aliases.include?(aka)
+      self
+    end
+
+    #
+    # Returns an Array of all links from this page, and all the
+    # redirect-aliases of those pages, as String objects.
+    #
+    # *page_hash* is a PageHash object with the results of the current crawl.
+    #
+    def links_and_their_aliases(page_hash)
+      links.inject([]) do |results, link|
+        results.concat([link].concat(page_hash[link].aliases))
+      end
+    end
+
+    #
+    # The content-type returned by the HTTP request for this page
+    #
+    def content_type
+      headers['content-type'].first
+    end
+
+    #
+    # Returns +true+ if the page is a HTML document, returns +false+
+    # otherwise.
+    #
+    def html?
+      !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
+    end
+
+    #
+    # Returns +true+ if the page is a HTTP redirect, returns +false+
+    # otherwise.
+    #
+    def redirect?
+      (300..399).include?(@code)
+    end
+
+    #
+    # Returns +true+ if the page was not found (returned 404 code),
+    # returns +false+ otherwise.
+    #
+    def not_found?
+      404 == @code
+    end
+
+    #
+    # Converts relative URL *link* into an absolute URL based on the
+    # location of the page
+    #
+    def to_absolute(link)
+      # remove anchor
+      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+
+      relative = URI(link)
+      absolute = @url.merge(relative)
+
+      absolute.path = '/' if absolute.path.empty?
+
+      return absolute
+    end
+
+    #
+    # Returns +true+ if *uri* is in the same domain as the page, returns
+    # +false+ otherwise
+    #
+    def in_domain?(uri)
+      uri.host == @url.host
+    end
+  end
+end
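Illustrative usage (not part of the package): a hand-built Page exercising the parsing and predicate methods above. The URL and HTML body are made up; the headers hash mirrors the Net::HTTP#to_hash shape the crawler passes in (header values as arrays). Note that to_absolute relies on URI.encode, so this code runs on the Ruby versions contemporary with this release.

    require 'anemone/page'

    url     = URI('http://example.com/index.html')
    body    = '<html><body><a href="/about">About</a>' \
              '<a href="http://other.example.org/">Elsewhere</a></body></html>'
    headers = { 'content-type' => ['text/html'] }

    page = Anemone::Page.new(url, body, 200, headers)

    page.html?       # => true, content-type matches text/html
    page.redirect?   # => false, 200 is not in 300..399
    page.links       # => [URI('http://example.com/about')] -- the relative HREF is
                     #    resolved against the page URL and the off-domain link is dropped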
data/lib/anemone/page_hash.rb
ADDED
@@ -0,0 +1,142 @@
+module Anemone
+  class PageHash < Hash
+
+    # We typically index the hash with a URI,
+    # but convert it to a String for easier retrieval
+    def [](index)
+      super(index.to_s)
+    end
+
+    def []=(index, other)
+      super(index.to_s, other)
+    end
+
+    def has_key?(key)
+      super(key.to_s)
+    end
+
+    # Does this PageHash contain the specified URL?
+    # HTTP and HTTPS versions of a URL are considered to be the same page.
+    def has_page?(url)
+      schemes = %w(http https)
+      if schemes.include? url.scheme
+        u = url.dup
+        return schemes.any? { |s| u.scheme = s; has_key?(u) }
+      end
+
+      has_key?(url)
+    end
+
+    #
+    # Use a breadth-first search to calculate the single-source
+    # shortest paths from *root* to all pages in the PageHash
+    #
+    def shortest_paths!(root)
+      root = URI(root) if root.is_a?(String)
+      raise "Root node not found" if !has_key?(root)
+
+      each_value {|p| p.visited = false if p}
+
+      q = Queue.new
+
+      q.enq(root)
+      self[root].depth = 0
+      self[root].visited = true
+      while(!q.empty?)
+        url = q.deq
+
+        next if !has_key?(url)
+
+        page = self[url]
+
+        page.links.each do |u|
+          next if !has_key?(u) or self[u].nil?
+          link = self[u]
+          aliases = [link].concat(link.aliases.map {|a| self[a] })
+
+          aliases.each do |node|
+            if node.depth.nil? or page.depth + 1 < node.depth
+              node.depth = page.depth + 1
+            end
+          end
+
+          q.enq(self[u].url) if !self[u].visited
+          self[u].visited = true
+        end
+      end
+
+      self
+    end
+
+    #
+    # Returns a new PageHash by removing redirect-aliases for each
+    # non-redirect Page
+    #
+    def uniq
+      results = PageHash.new
+      each do |url, page|
+        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
+        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
+        if !page.redirect? and !page_added
+          results[url] = page.clone
+          results[url].aliases = []
+        end
+      end
+
+      results
+    end
+
+    #
+    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
+    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
+    #
+    def pages_linking_to(urls)
+      unless urls.is_a?(Array)
+        urls = [urls] unless urls.is_a?(Array)
+        single = true
+      end
+
+      urls.map! do |url|
+        if url.is_a?(String)
+          URI(url) rescue nil
+        else
+          url
+        end
+      end
+      urls.compact
+
+      links = {}
+      urls.each { |url| links[url] = [] }
+      values.each do |page|
+        urls.each { |url| links[url] << page if page.links.include?(url) }
+      end
+
+      if single and !links.empty?
+        return links.first
+      else
+        return links
+      end
+    end
+
+    #
+    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
+    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
+    #
+    def urls_linking_to(urls)
+      unless urls.is_a?(Array)
+        urls = [urls] unless urls.is_a?(Array)
+        single = true
+      end
+
+      links = pages_linking_to(urls)
+      links.each { |url, pages| links[url] = pages.map{|p| p.url} }
+
+      if single and !links.empty?
+        return links.first
+      else
+        return links
+      end
+    end
+
+  end
+end
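Illustrative usage (not part of the package): a small sketch of the key normalization above. Keys are stored in String form whether you index with a URI or a String, and has_page? treats the http and https variants of a URL as the same entry. The URL and the hand-built Page are made up.

    require 'anemone/page'
    require 'anemone/page_hash'

    pages = Anemone::PageHash.new
    url   = URI('http://example.com/')

    pages[url] = Anemone::Page.new(url, nil, 200, { 'content-type' => ['text/html'] })

    pages['http://example.com/'].code              # => 200, URI and String keys are equivalent
    pages.has_page?(URI('https://example.com/'))   # => true, scheme variants are collapsed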
data/lib/anemone/tentacle.rb
ADDED
@@ -0,0 +1,39 @@
+require 'anemone/http'
+
+module Anemone
+  class Tentacle
+
+    #
+    # Create a new Tentacle
+    #
+    def initialize(link_queue, page_queue, opts = {})
+      @link_queue = link_queue
+      @page_queue = page_queue
+      @http = Anemone::HTTP.new(opts)
+      @opts = opts
+    end
+
+    #
+    # Gets links from @link_queue, and returns the fetched
+    # Page objects into @page_queue
+    #
+    def run
+      loop do
+        link, from_page = @link_queue.deq
+
+        break if link == :END
+
+        @page_queue << @http.fetch_page(link, from_page)
+
+        delay
+      end
+    end
+
+    private
+
+    def delay
+      sleep @opts[:delay] if @opts[:delay]
+    end
+
+  end
+end
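Illustrative usage (not part of the package): a minimal sketch of the queue protocol the run loop above expects. A Tentacle pulls [link, from_page] pairs off the link queue, pushes the fetched Pages onto the page queue, and exits when it dequeues :END; in the gem itself this wiring is done by Anemone::Core. The URL is made up.

    require 'thread'
    require 'anemone/tentacle'

    link_queue = Queue.new
    page_queue = Queue.new

    worker = Thread.new do
      Anemone::Tentacle.new(link_queue, page_queue, :delay => 1).run
    end

    link_queue.enq([URI('http://example.com/'), nil])   # a [link, from_page] pair
    link_queue.enq(:END)                                # tells the run loop to stop

    worker.join
    page = page_queue.deq   # the Anemone::Page fetched for http://example.com/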