spk-anemone 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,126 @@ (new file: defines Anemone::HTTP)
1
+ require 'net/https'
2
+ require 'anemone/page'
3
+
4
module Anemone
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5
    # Number of retries after the server drops a keep-alive
    # connection (EOFError) before the error is re-raised.
    RETRY_LIMIT = 3

    def initialize(opts = {})
      # Persistent Net::HTTP connections, keyed by host then port.
      @connections = {}
      @opts = opts
    end

    #
    # Create a new Page from the response of an HTTP request to *url*.
    # On any error, returns a bare Page for *url* (no body/code) so the
    # crawl can continue past failures.
    #
    def fetch_page(url, from_page = nil)
      begin
        url = URI(url) unless url.is_a?(URI)

        if from_page
          referer = from_page.url
          depth = from_page.depth + 1
        end

        response, code, location, response_time = get(url, referer)

        # Record the final URI as a redirect-alias when it differs
        # from the requested URL.
        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
      rescue => e
        if verbose?
          puts e.inspect
          puts e.backtrace
        end
        return Page.new(url)
      end
    end

    private

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the final response object, the *first* response's integer
    # code (so a redirect is still recorded on the resulting Page), the
    # final URI location, and the last response time in milliseconds.
    #
    def get(url, referer = nil)
      response, response_time = get_response(url, referer)
      code = Integer(response.code)
      loc = url

      limit = redirect_limit
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
        response, response_time = get_response(loc, referer)
        limit -= 1
      end

      return response, code, loc, response_time
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent,
    # Referer and Authorization headers when configured.
    # Returns the response and the request duration in milliseconds.
    #
    # Retries up to RETRY_LIMIT times on EOFError (stale keep-alive
    # connection), refreshing the cached connection each time; once
    # retries are exhausted the error is re-raised so fetch_page can
    # record the failure (previously this fell through and returned nil,
    # making callers crash on nil-unpacking instead).
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Authorization'] = authorization if authorization

      retries = 0
      begin
        start = Time.now()
        response = connection(url).get(full_path, opts)
        finish = Time.now()
        response_time = ((finish - start) * 1000).round
        return response, response_time
      rescue EOFError
        refresh_connection(url)
        retries += 1
        retry unless retries > RETRY_LIMIT
        raise # exhausted retries; let fetch_page handle the error
      end
    end

    # Return the cached keep-alive connection for url's host/port,
    # opening and caching a new one if none exists yet.
    def connection(url)
      @connections[url.host] ||= {}

      if conn = @connections[url.host][url.port]
        return conn
      end

      refresh_connection(url)
    end

    # (Re)open and cache a connection to url's host/port.
    def refresh_connection(url)
      http = Net::HTTP.new(url.host, url.port)
      if url.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): certificate verification is disabled; acceptable
        # for a best-effort crawler, but do not reuse this client for
        # requests where authenticity matters.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      @connections[url.host][url.port] = http.start
    end

    # Redirect limit from options, falling back to REDIRECT_LIMIT.
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    # User-Agent header value from options, if any.
    def user_agent
      @opts[:user_agent]
    end

    # Whether to print exceptions raised during fetch_page.
    def verbose?
      @opts[:verbose]
    end

    # Authorization header value from options, if any.
    def authorization
      @opts[:authorization]
    end

  end
end
@@ -0,0 +1,158 @@ (new file: defines Anemone::Page)
1
+ require 'nokogiri'
2
+ require 'ostruct'
3
+
4
module Anemone
  class Page

    # The URL of the page
    attr_reader :url
    # Headers of the HTTP response
    attr_reader :headers

    # OpenStruct for user-stored data
    attr_accessor :data
    # HTML body
    attr_accessor :body
    # Nokogiri document for the HTML body
    attr_accessor :doc
    # Integer response code of the page
    attr_accessor :code
    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageHash#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page
    #
    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
      @url = url
      @code = code
      @headers = headers || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(aka)
      @data = OpenStruct.new
      @referer = referer
      @depth = depth || 0
      @response_time = response_time
      @body = body
      # Parsing may fail on malformed markup; a page without a doc
      # simply yields no links.
      @doc = Nokogiri::HTML(body) if body && html? rescue nil
    end

    # Array of distinct A tag HREFs from the page
    def links
      return @links unless @links.nil?
      @links = []
      return @links if !doc

      doc.css('a').each do |a|
        u = a.attributes['href'].content rescue nil
        next if u.nil? or u.empty?
        abs = to_absolute(URI(u)) rescue next
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    # Drop the Nokogiri document to free memory, keeping the
    # already-extracted links.
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = nil
    end

    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      p = clone
      # NOTE(review): @aka is never assigned anywhere in this class
      # (initialize stores aka into @aliases), so this branch is dead;
      # the *url* parameter is likewise unused. Kept for compatibility.
      p.add_alias!(@aka) if !@aka.nil?
      p.code = 200
      p
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # '+' must be escaped: the original /xhtml+xml/ treated '+' as a
      # repetition of 'l' and could never match "application/xhtml+xml".
      !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor
      stripped = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
      # URI.encode was removed in Ruby 3.0; the default parser's escape
      # performs the same percent-encoding of unsafe characters.
      encoded = URI::DEFAULT_PARSER.escape(stripped)

      relative = URI(encoded)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
@@ -0,0 +1,142 @@ (new file: defines Anemone::PageHash)
1
module Anemone
  class PageHash < Hash

    # We typically index the hash with a URI,
    # but convert it to a String for easier retrieval
    def [](index)
      super(index.to_s)
    end

    def []=(index, other)
      super(index.to_s, other)
    end

    def has_key?(key)
      super(key.to_s)
    end

    # Does this PageHash contain the specified URL?
    # HTTP and HTTPS versions of a URL are considered to be the same page.
    def has_page?(url)
      schemes = %w(http https)
      if schemes.include? url.scheme
        u = url.dup
        return schemes.any? { |s| u.scheme = s; has_key?(u) }
      end

      has_key?(url)
    end

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          # Relax the depth of the page and all of its redirect-aliases.
          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    def pages_linking_to(urls)
      single = false
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      # Convert Strings to URIs, dropping entries that fail to parse.
      # (The original called the non-destructive +compact+ and discarded
      # the result, leaving nils in the list; it also mutated the
      # caller's array via map!.)
      urls = urls.map do |url|
        if url.is_a?(String)
          URI(url) rescue nil
        else
          url
        end
      end.compact

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      single = false
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      links = pages_linking_to(urls)
      links.each { |url, pages| links[url] = pages.map{|p| p.url} }

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

  end
end
@@ -0,0 +1,39 @@ (new file: defines Anemone::Tentacle)
1
+ require 'anemone/http'
2
+
3
module Anemone
  class Tentacle

    #
    # Create a new Tentacle.
    #
    # *link_queue* supplies [link, from_page] pairs to crawl;
    # fetched Page objects are pushed onto *page_queue*.
    #
    def initialize(link_queue, page_queue, opts = {})
      @link_queue = link_queue
      @page_queue = page_queue
      @http = Anemone::HTTP.new(opts)
      @opts = opts
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue. Terminates when the
    # :END marker is dequeued.
    #
    def run
      loop do
        link, from_page = @link_queue.deq

        return if link == :END

        @page_queue << @http.fetch_page(link, from_page)

        delay
      end
    end

    private

    # Pause between requests when the :delay option is set.
    def delay
      pause = @opts[:delay]
      sleep(pause) if pause
    end

  end
end