shingara-anemone 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,123 @@
1
+ require 'net/https'
2
+ require 'anemone/page'
3
+
module Anemone
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    #
    # Create an HTTP fetcher.
    #
    # Recognized opts:
    #   :redirect_limit - Integer maximum redirects to follow (default REDIRECT_LIMIT)
    #   :user_agent     - String sent as the User-Agent header
    #   :verbose        - when truthy, fetch errors are printed to stdout
    #
    def initialize(opts = {})
      # Keep-alive connections, keyed by host and then by port.
      @connections = {}
      @opts = opts
    end

    #
    # Create a new Page from the response of an HTTP request to *url*.
    # Never raises: on any error an empty Page.new(url) is returned
    # (the exception is printed first when :verbose is set).
    #
    def fetch_page(url, from_page = nil)
      begin
        url = URI(url) unless url.is_a?(URI)

        # Inherit referer and depth from the page that linked here.
        if from_page
          referer = from_page.url
          depth = from_page.depth + 1
        end

        response, code, location, response_time = get(url, referer)

        # If we were redirected, record the final location as an alias.
        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
      rescue => e
        if verbose?
          puts e.inspect
          puts e.backtrace
        end
        return Page.new(url)
      end
    end

    private

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the final response object, the *first* response's code
    # (so Page#redirect? can still see the 3xx status), the final URI
    # location, and the response time (ms) of the last request made.
    #
    def get(url, referer = nil)
      response, response_time = get_response(url, referer)
      code = Integer(response.code)
      loc = url

      limit = redirect_limit
      while response.is_a?(Net::HTTPRedirection) && limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
        response, response_time = get_response(loc, referer)
        limit -= 1
      end

      return response, code, loc, response_time
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent
    # (and Referer) headers. Returns [response, response_time_in_ms].
    # Retries up to 3 times on EOFError (dropped keep-alive connection),
    # re-raising once retries are exhausted.
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer

      retries = 0
      begin
        start = Time.now
        req = Net::HTTP::Get.new(full_path, opts)
        req.basic_auth url.user, url.password if url.user
        response = connection(url).request(req)
        finish = Time.now
        response_time = ((finish - start) * 1000).round
        return response, response_time
      rescue EOFError
        # The persistent connection was closed by the server; rebuild it
        # and try again a bounded number of times.
        refresh_connection(url)
        retries += 1
        retry unless retries > 3
        # BUGFIX: previously this fell through and returned nil, which
        # surfaced as a confusing NoMethodError in get; re-raise instead
        # (fetch_page still rescues it into an error Page).
        raise
      end
    end

    # Return (or lazily open) the cached connection for url's host:port.
    def connection(url)
      @connections[url.host] ||= {}

      if conn = @connections[url.host][url.port]
        return conn
      end

      refresh_connection(url)
    end

    # (Re)open the connection for url's host:port and cache it.
    def refresh_connection(url)
      http = Net::HTTP.new(url.host, url.port)
      if url.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): certificate verification is disabled for crawling;
        # confirm this trade-off is acceptable for your use case.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      @connections[url.host][url.port] = http.start
    end

    # Maximum number of redirects to follow (overridable via :redirect_limit).
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    # User-Agent string to send with each request, or nil.
    def user_agent
      @opts[:user_agent]
    end

    # Whether fetch errors should be printed to stdout.
    def verbose?
      @opts[:verbose]
    end

  end
end
@@ -0,0 +1,155 @@
1
+ require 'nokogiri'
2
+ require 'ostruct'
3
+
module Anemone
  class Page

    # The URL of the page
    attr_reader :url
    # Headers of the HTTP response
    attr_reader :headers

    # OpenStruct for user-stored data
    attr_accessor :data
    # Nokogiri document for the HTML body
    attr_accessor :doc
    # Integer response code of the page
    attr_accessor :code
    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageHash#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page
    #
    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
      @url = url
      @code = code
      @headers = headers || {}
      # Guarantee content_type always has something to return.
      @headers['content-type'] ||= ['']
      @aliases = Array(aka)
      @data = OpenStruct.new
      @referer = referer
      @depth = depth || 0
      @response_time = response_time
      # Parse the body only for HTML pages; swallow parse errors.
      @doc = Nokogiri::HTML(body) if body && html? rescue nil
    end

    # Array of distinct A tag HREFs from the page
    def links
      return @links unless @links.nil?
      @links = []
      return @links if !doc

      doc.css('a').each do |a|
        u = a.attributes['href'].content rescue nil
        next if u.nil? or u.empty?
        abs = to_absolute(URI(u)) rescue next
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    # Drop the parsed document to free memory, keeping the extracted links.
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = nil
    end

    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      p = clone
      # NOTE(review): @aka is never assigned anywhere in this class, so this
      # guard is always false here — confirm whether the redirect alias was
      # meant to be recorded on the clone.
      p.add_alias!(@aka) if !@aka.nil?
      p.code = 200
      p
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # BUGFIX: the '+' must be escaped — previously "xhtml+xml" parsed as
      # "xhtm", "l+" (one or more l's), "xml", so application/xhtml+xml
      # content types were never recognized as HTML.
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor
      link = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')
      # BUGFIX: URI.encode was removed in Ruby 3.0; fall back to the
      # equivalent URI::DEFAULT_PARSER.escape on modern rubies (previously
      # this raised, and links() silently dropped every link via its
      # `rescue next`).
      link = URI.respond_to?(:encode) ? URI.encode(link) : URI::DEFAULT_PARSER.escape(link)

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
@@ -0,0 +1,142 @@
module Anemone
  class PageHash < Hash

    # We typically index the hash with a URI,
    # but convert it to a String for easier retrieval
    def [](index)
      super(index.to_s)
    end

    def []=(index, other)
      super(index.to_s, other)
    end

    def has_key?(key)
      super(key.to_s)
    end

    # Does this PageHash contain the specified URL?
    # HTTP and HTTPS versions of a URL are considered to be the same page.
    def has_page?(url)
      schemes = %w(http https)
      if schemes.include? url.scheme
        u = url.dup
        return schemes.any? { |s| u.scheme = s; has_key?(u) }
      end

      has_key?(url)
    end

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      # Reset visit markers from any previous traversal.
      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          # A page and its redirect-aliases share the same depth.
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    def pages_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      # Normalize every entry to a URI; unparseable strings become nil.
      urls.map! do |url|
        if url.is_a?(String)
          URI(url) rescue nil
        else
          url
        end
      end
      # BUGFIX: compact returns a new array and the original discarded it,
      # so nil entries (from unparseable URLs) leaked into the result hash.
      urls.compact!

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        # NOTE(review): this returns the [url, pages] pair, not just the
        # pages Array the comment above describes — confirm which shape
        # callers expect before changing it.
        return links.first
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      links = pages_linking_to(urls)
      links.each { |url, pages| links[url] = pages.map{|p| p.url} }

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

  end
end
@@ -0,0 +1,39 @@
1
+ require 'anemone/http'
2
+
module Anemone
  class Tentacle

    #
    # Build a Tentacle worker that pulls links from *link_queue* and pushes
    # fetched Page objects onto *page_queue*.
    #
    def initialize(link_queue, page_queue, opts = {})
      @opts = opts
      @link_queue = link_queue
      @page_queue = page_queue
      @http = Anemone::HTTP.new(opts)
    end

    #
    # Work loop: dequeue [link, from_page] pairs, fetch each link, and
    # enqueue the resulting Page. Stops when the :END sentinel arrives.
    #
    def run
      loop do
        link, from_page = @link_queue.deq

        return if link == :END

        page = @http.fetch_page(link, from_page)
        @page_queue << page

        delay
      end
    end

    private

    # Pause between requests when a :delay (seconds) option was given.
    def delay
      sleep @opts[:delay] if @opts[:delay]
    end

  end
end