spk-anemone 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ require 'net/https'
2
+ require 'anemone/page'
3
+
4
module Anemone
  #
  # Fetches URLs for the crawler over Net::HTTP, following redirects and
  # re-using one started connection per host/port pair.
  #
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    # Maximum number of times a request is re-sent after an EOFError
    RETRY_LIMIT = 3

    #
    # *opts* is a Hash of options. The keys read by this class are
    # :redirect_limit, :user_agent, :verbose and :authorization.
    #
    def initialize(opts = {})
      @connections = {}
      @opts = opts
    end

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    # *url* may be a URI or anything URI() accepts. When *from_page* is
    # given, its url is sent as the Referer and the new page's depth is
    # one more than *from_page*'s depth.
    #
    # Never raises a StandardError: on any failure an empty Page for *url*
    # is returned (and the error is printed when the :verbose option is set).
    #
    def fetch_page(url, from_page = nil)
      url = URI(url) unless url.is_a?(URI)

      if from_page
        referer = from_page.url
        depth = from_page.depth + 1
      end

      response, code, location, response_time = get(url, referer)

      # Record the final location as an alias only when a redirect moved us
      aka = url.eql?(location) ? nil : location

      # Some responses carry no body at all; do not call dup on nil
      body = response.body
      body = body.dup if body

      Page.new(url, body, code, response.to_hash, aka, referer, depth, response_time)
    rescue => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end
      Page.new(url)
    end

    private

    #
    # Retrieve an HTTP response for *url*, following redirects.
    #
    # Returns the final response object, the Integer code of the *first*
    # response (so a redirected page is still reported with its 3xx code),
    # the final URI location, and the response time in milliseconds of the
    # last request made.
    #
    def get(url, referer = nil)
      response, response_time = get_response(url, referer)
      code = Integer(response.code)
      loc = url

      limit = redirect_limit
      while response.is_a?(Net::HTTPRedirection) && limit > 0
        target = response['location']
        # A 3xx response without a Location header (e.g. 304 Not Modified)
        # gives us nowhere to go; hand back what we have.
        break if target.nil?

        target = URI(target)
        # A relative Location is relative to the URL that produced it, which
        # after the first hop is no longer the URL we started from.
        loc = target.relative? ? loc.merge(target) : target
        response, response_time = get_response(loc, referer)
        limit -= 1
      end

      return response, code, loc, response_time
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    # Returns the response and the elapsed time of the request in
    # milliseconds. The request is re-sent on a fresh connection up to
    # RETRY_LIMIT times when the server closes the connection (EOFError);
    # after that the EOFError is re-raised.
    #
    def get_response(url, referer = nil)
      # Net::HTTP rejects an empty request path, so "http://host" means "/"
      path = url.path.to_s.empty? ? '/' : url.path
      full_path = url.query.nil? ? path : "#{path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Authorization'] = authorization if authorization

      retries = 0
      begin
        start = Time.now
        response = connection(url).get(full_path, opts)
        finish = Time.now
        response_time = ((finish - start) * 1000).round
        return response, response_time
      rescue EOFError
        refresh_connection(url)
        retries += 1
        retry if retries <= RETRY_LIMIT
        # Out of retries: surface the real error instead of returning nil
        raise
      end
    end

    #
    # Return the cached connection for *url*'s host and port, opening a new
    # one when none is cached yet.
    #
    def connection(url)
      @connections[url.host] ||= {}
      @connections[url.host][url.port] || refresh_connection(url)
    end

    #
    # Open a new started connection for *url*'s host and port, cache it in
    # place of any previous one, and return it.
    #
    def refresh_connection(url)
      pool = (@connections[url.host] ||= {})

      # Close the connection we are about to replace so its socket is released
      stale = pool[url.port]
      begin
        stale.finish if stale && stale.started?
      rescue IOError
        # the stale connection was already closed; nothing left to release
      end

      http = Net::HTTP.new(url.host, url.port)
      if url.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): certificate verification is disabled, so HTTPS
        # responses are not authenticated. Kept as-is for compatibility.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      pool[url.port] = http.start
    end

    # Redirect limit from the options, falling back to REDIRECT_LIMIT
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    # Value for the User-Agent header, or nil to send none
    def user_agent
      @opts[:user_agent]
    end

    # Whether fetch errors should be printed
    def verbose?
      @opts[:verbose]
    end

    # Value for the Authorization header, or nil to send none
    def authorization
      @opts[:authorization]
    end

  end
end
@@ -0,0 +1,158 @@
1
+ require 'nokogiri'
2
+ require 'ostruct'
3
+
4
module Anemone
  #
  # A single fetched URL: its response data, parsed document, outgoing
  # links and crawl bookkeeping (depth, referer, redirect-aliases).
  #
  class Page

    # The URL of the page
    attr_reader :url
    # Headers of the HTTP response
    attr_reader :headers

    # OpenStruct for user-stored data
    attr_accessor :data
    # HTML body
    attr_accessor :body
    # Nokogiri document for the HTML body
    attr_accessor :doc
    # Integer response code of the page
    attr_accessor :code
    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageHash#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page
    #
    # Only *url* is required. *headers* is a Hash of header name to Array of
    # values; a missing content-type is stored as ['']. *aka* is a single
    # alias or an Array of aliases. The body is parsed into a document only
    # when it is present and the content-type says HTML; a parse failure
    # leaves *doc* nil.
    #
    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
      @url = url
      @code = code
      @headers = headers || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(aka)
      @data = OpenStruct.new
      @referer = referer
      @depth = depth || 0
      @response_time = response_time
      @body = body
      begin
        @doc = Nokogiri::HTML(body) if body && html?
      rescue StandardError
        # an unparseable body simply leaves the page without a document
        @doc = nil
      end
    end

    # Array of distinct A tag HREFs from the page
    #
    # Links are absolute URIs restricted to this page's host. The result is
    # computed once and cached.
    def links
      return @links unless @links.nil?
      @links = []
      return @links unless doc

      doc.css('a').each do |a|
        href = begin
          a.attributes['href'].content
        rescue StandardError
          nil # anchor without an href attribute
        end
        next if href.nil? || href.empty?

        abs = begin
          to_absolute(URI(href))
        rescue StandardError
          next # skip hrefs that are not parseable URIs
        end
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    # Drop the parsed document, keeping the already-extracted links
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = nil
    end

    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    # The copy gets its own aliases Array, so adding an alias to the copy
    # does not also add it to this page. *url* is accepted for interface
    # compatibility and is not used.
    #
    def alias_clone(url)
      copy = clone
      copy.aliases = @aliases.dup
      copy.code = 200
      copy
    end

    #
    # Add a redirect-alias *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka unless @aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    # Links whose page is not (yet) present in *page_hash* contribute only
    # themselves.
    #
    def links_and_their_aliases(page_hash)
      links.inject([]) do |results, link|
        results << link
        linked_page = page_hash[link]
        results.concat(linked_page.aliases) if linked_page
        results
      end
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # The "+" must be escaped, otherwise it is a regexp quantifier and the
      # real "application/xhtml+xml" type never matches.
      !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    # A trailing #fragment is removed and unsafe characters are
    # percent-escaped first. Returns a URI whose path is at least "/".
    #
    def to_absolute(link)
      # remove anchor
      link = escape_uri(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, ''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    private

    #
    # Percent-escape unsafe characters in *str*. URI.encode was removed in
    # Ruby 3.0, so fall back to the RFC 2396 parser's escape when it is gone.
    #
    def escape_uri(str)
      return URI.encode(str) if URI.respond_to?(:encode)

      parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
      parser.escape(str)
    end
  end
end
@@ -0,0 +1,142 @@
1
module Anemone
  #
  # Hash of crawled pages keyed by URL string, with helpers for querying
  # the link graph between them. Values may be nil for URLs that have an
  # entry but no page.
  #
  class PageHash < Hash

    # We typically index the hash with a URI,
    # but convert it to a String for easier retrieval
    def [](index)
      super(index.to_s)
    end

    def []=(index, other)
      super(index.to_s, other)
    end

    def has_key?(key)
      super(key.to_s)
    end

    # Does this PageHash contain the specified URL?
    # HTTP and HTTPS versions of a URL are considered to be the same page.
    #
    # *url* must respond to +scheme+ (i.e. be a URI, not a String).
    def has_page?(url)
      schemes = %w(http https)
      if schemes.include?(url.scheme)
        # work on a copy so the caller's URI keeps its own scheme
        candidate = url.dup
        return schemes.any? do |scheme|
          candidate.scheme = scheme
          has_key?(candidate)
        end
      end

      has_key?(url)
    end

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    # Each reachable page's depth is lowered to its distance from *root*
    # when that is smaller than the depth it already has. Returns *self*.
    # Raises a RuntimeError when *root* has no page in the hash.
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root) || self[root].nil?

      each_value { |p| p.visited = false if p }

      # A plain Array is enough for a single-threaded BFS queue
      pending = [root]
      self[root].depth = 0
      self[root].visited = true
      until pending.empty?
        url = pending.shift

        next unless has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) || self[u].nil?
          link = self[u]
          # An alias may not have a page of its own in the hash; skip those
          nodes = [link].concat(link.aliases.map { |a| self[a] }).compact

          nodes.each do |node|
            if node.depth.nil? || page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          pending << link.url unless link.visited
          link.visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    # Pages in the result are clones with an empty aliases list. Entries
    # without a page (nil values) are left out.
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        next if page.nil?

        # if none of the aliases of this page have been added, and this
        # isn't a redirect page, add this page
        already_added = page.aliases.any? { |a| results.has_key?(a) }
        next if page.redirect? || already_added

        results[url] = page.clone
        results[url].aliases = []
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    # Strings that cannot be parsed as URIs are ignored. The Array passed
    # in is not modified.
    #
    def pages_linking_to(urls)
      single = !urls.is_a?(Array)
      targets = normalize_urls(single ? [urls] : urls)

      links = {}
      targets.each { |url| links[url] = [] }
      each_value do |page|
        next if page.nil?
        targets.each { |url| links[url] << page if page.links.include?(url) }
      end

      single ? (links.values.first || []) : links
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      single = !urls.is_a?(Array)

      by_target = pages_linking_to(single ? [urls] : urls)
      links = {}
      by_target.each { |url, pages| links[url] = pages.map { |p| p.url } }

      single ? (links.values.first || []) : links
    end

    private

    # Convert every String in *urls* to a URI, dropping unparseable ones
    # and nils. Returns a new Array.
    def normalize_urls(urls)
      urls.map { |url| to_uri(url) }.compact
    end

    # Parse *url* when it is a String; returns nil when it is not a valid URI
    def to_uri(url)
      return url unless url.is_a?(String)
      URI(url)
    rescue StandardError
      nil
    end

  end
end
@@ -0,0 +1,39 @@
1
+ require 'anemone/http'
2
+
3
module Anemone
  #
  # Worker that pulls links off one queue, fetches them, and pushes the
  # resulting Page objects onto another queue.
  #
  class Tentacle

    #
    # Build a Tentacle reading from *link_queue* and writing to
    # *page_queue*. *opts* is handed to Anemone::HTTP and is also where the
    # optional :delay is read from.
    #
    def initialize(link_queue, page_queue, opts = {})
      @opts = opts
      @http = Anemone::HTTP.new(opts)
      @link_queue = link_queue
      @page_queue = page_queue
    end

    #
    # Keep taking (link, from_page) jobs from @link_queue and pushing the
    # fetched Page for each into @page_queue, pausing between jobs when a
    # delay is configured. Stops when the link taken is :END.
    #
    def run
      loop do
        link, from_page = @link_queue.deq

        break if link == :END

        fetched = @http.fetch_page(link, from_page)
        @page_queue << fetched

        delay
      end
    end

    private

    # Sleep for the configured :delay, if one was given
    def delay
      sleep @opts[:delay] if @opts[:delay]
    end

  end
end