shingara-anemone 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
+++ lib/anemone/http.rb
@@ -0,0 +1,123 @@
+ require 'net/https'
+ require 'anemone/page'
+
+ module Anemone
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+
+     def initialize(opts = {})
+       @connections = {}
+       @opts = opts
+     end
+
+     #
+     # Create a new Page from the response of an HTTP request to *url*
+     #
+     def fetch_page(url, from_page = nil)
+       begin
+         url = URI(url) unless url.is_a?(URI)
+
+         if from_page
+           referer = from_page.url
+           depth = from_page.depth + 1
+         end
+
+         response, code, location, response_time = get(url, referer)
+
+         aka = nil
+         if !url.eql?(location)
+           aka = location
+         end
+
+         return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+       rescue => e
+         if verbose?
+           puts e.inspect
+           puts e.backtrace
+         end
+         return Page.new(url)
+       end
+     end
+
+     private
+
+     #
+     # Retrieve an HTTP response for *url*, following redirects.
+     # Returns the final response object, the status code of the *first*
+     # response, and the URI of the final location.
+     #
+     def get(url, referer = nil)
+       response, response_time = get_response(url, referer)
+       code = Integer(response.code)
+       loc = url
+
+       limit = redirect_limit
+       while response.is_a?(Net::HTTPRedirection) and limit > 0
+         loc = URI(response['location'])
+         loc = url.merge(loc) if loc.relative?
+         response, response_time = get_response(loc, referer)
+         limit -= 1
+       end
+
+       return response, code, loc, response_time
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+
+       retries = 0
+       begin
+         start = Time.now()
+         req = Net::HTTP::Get.new(full_path, opts)
+         req.basic_auth url.user, url.password if url.user
+         response = connection(url).request(req)
+         finish = Time.now()
+         response_time = ((finish - start) * 1000).round
+         return response, response_time
+       rescue EOFError
+         # stale keep-alive connection; rebuild it and retry, re-raising
+         # once the retries are exhausted
+         refresh_connection(url)
+         retries += 1
+         retries > 3 ? raise : retry
+       end
+     end
+
+     def connection(url)
+       @connections[url.host] ||= {}
+
+       if conn = @connections[url.host][url.port]
+         return conn
+       end
+
+       refresh_connection(url)
+     end
+
+     def refresh_connection(url)
+       http = Net::HTTP.new(url.host, url.port)
+       if url.scheme == 'https'
+         http.use_ssl = true
+         # no certificate verification: tolerant of self-signed certs,
+         # but vulnerable to man-in-the-middle attacks
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       end
+       @connections[url.host][url.port] = http.start
+     end
+
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     def user_agent
+       @opts[:user_agent]
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+   end
+ end
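Taken on its own, the class above can be exercised directly. The sketch below is illustrative only: the option keys and accessors come from the code in this diff, but the URL and User-Agent string are placeholders, and nothing here is part of the package.

```ruby
require 'anemone/http'

# Fetch one page; on any error fetch_page returns an "empty" Page
# whose code and body are nil.
http = Anemone::HTTP.new(:user_agent => 'AnemoneDemo/0.1', :verbose => true)
page = http.fetch_page('http://example.com/')

puts page.code            # 200, or a 3xx code if the URL redirected
puts page.response_time   # milliseconds spent on the final request
puts page.aliases.inspect # the final location, when a redirect occurred
```

Note that connections are cached per host and port, so repeated fetches against the same server reuse one keep-alive connection.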
+++ lib/anemone/page.rb
@@ -0,0 +1,155 @@
+ require 'nokogiri'
+ require 'ostruct'
+
+ module Anemone
+   class Page
+
+     # The URL of the page
+     attr_reader :url
+     # Headers of the HTTP response
+     attr_reader :headers
+
+     # OpenStruct for user-stored data
+     attr_accessor :data
+     # Nokogiri document for the HTML body
+     attr_accessor :doc
+     # Integer response code of the page
+     attr_accessor :code
+     # Array of redirect-aliases for the page
+     attr_accessor :aliases
+     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+     attr_accessor :visited
+     # Depth of this page from the root of the crawl. This is not necessarily the
+     # shortest path; use PageHash#shortest_paths! to find that value.
+     attr_accessor :depth
+     # URL of the page that brought us to this page
+     attr_accessor :referer
+     # Response time of the request for this page in milliseconds
+     attr_accessor :response_time
+
+     #
+     # Create a new page
+     #
+     def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
+       @url = url
+       @code = code
+       @headers = headers || {}
+       @headers['content-type'] ||= ['']
+       @aliases = Array(aka)
+       @data = OpenStruct.new
+       @referer = referer
+       @depth = depth || 0
+       @response_time = response_time
+       @doc = Nokogiri::HTML(body) if body && html? rescue nil
+     end
+
+     # Array of distinct A tag HREFs from the page
+     def links
+       return @links unless @links.nil?
+       @links = []
+       return @links if !doc
+
+       doc.css('a').each do |a|
+         u = a.attributes['href'].content rescue nil
+         next if u.nil? or u.empty?
+         abs = to_absolute(URI(u)) rescue next
+         @links << abs if in_domain?(abs)
+       end
+       @links.uniq!
+       @links
+     end
+
+     def discard_doc!
+       links # force parsing of page links before we trash the document
+       @doc = nil
+     end
+
+     #
+     # Return a new page with the same *response* and *url*, but
+     # with a 200 response code
+     #
+     def alias_clone(url)
+       # (*url* is unused here but kept for call-site compatibility)
+       p = clone
+       p.code = 200
+       p
+     end
+
+     #
+     # Add a redirect-alias String *aka* to the list of the page's aliases
+     #
+     # Returns *self*
+     #
+     def add_alias!(aka)
+       @aliases << aka if !@aliases.include?(aka)
+       self
+     end
+
+     #
+     # Returns an Array of all links from this page, and all the
+     # redirect-aliases of those pages, as String objects.
+     #
+     # *page_hash* is a PageHash object with the results of the current crawl.
+     #
+     def links_and_their_aliases(page_hash)
+       links.inject([]) do |results, link|
+         results.concat([link].concat(page_hash[link].aliases))
+       end
+     end
+
+     #
+     # The content-type returned by the HTTP request for this page
+     #
+     def content_type
+       headers['content-type'].first
+     end
+
+     #
+     # Returns +true+ if the page is an HTML document, returns +false+
+     # otherwise.
+     #
+     def html?
+       !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
+     end
+
+     #
+     # Returns +true+ if the page is an HTTP redirect, returns +false+
+     # otherwise.
+     #
+     def redirect?
+       (300..399).include?(@code)
+     end
+
+     #
+     # Returns +true+ if the page was not found (returned 404 code),
+     # returns +false+ otherwise.
+     #
+     def not_found?
+       404 == @code
+     end
+
+     #
+     # Converts relative URL *link* into an absolute URL based on the
+     # location of the page
+     #
+     def to_absolute(link)
+       # remove anchor
+       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+
+       relative = URI(link)
+       absolute = @url.merge(relative)
+
+       absolute.path = '/' if absolute.path.empty?
+
+       return absolute
+     end
+
+     #
+     # Returns +true+ if *uri* is in the same domain as the page, returns
+     # +false+ otherwise
+     #
+     def in_domain?(uri)
+       uri.host == @url.host
+     end
+   end
+ end
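Page can also be built from a canned response, which makes the parsing helpers easy to see in isolation. A hedged sketch follows; the HTML and URLs are invented, and note that URI.encode (used by to_absolute above) only exists on the Ruby versions this gem targeted.

```ruby
require 'anemone/page'

url  = URI('http://example.com/')
html = '<html><body><a href="/about">About</a> ' \
       '<a href="http://other.example/">Off-site</a></body></html>'

# Construct a Page as HTTP#fetch_page would, but from a fixed body.
page = Anemone::Page.new(url, html, 200, { 'content-type' => ['text/html'] })

page.html?  #=> true
page.links  #=> [URI('http://example.com/about')] -- off-site link dropped by in_domain?
page.to_absolute(URI('../css/site.css')) #=> URI('http://example.com/css/site.css')
```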
+++ lib/anemone/page_hash.rb
@@ -0,0 +1,142 @@
+ require 'thread' # provides Queue, used by shortest_paths!
+
+ module Anemone
+   class PageHash < Hash
+
+     # We typically index the hash with a URI,
+     # but convert it to a String for easier retrieval
+     def [](index)
+       super(index.to_s)
+     end
+
+     def []=(index, other)
+       super(index.to_s, other)
+     end
+
+     def has_key?(key)
+       super(key.to_s)
+     end
+
+     # Does this PageHash contain the specified URL?
+     # HTTP and HTTPS versions of a URL are considered to be the same page.
+     def has_page?(url)
+       schemes = %w(http https)
+       if schemes.include? url.scheme
+         u = url.dup
+         return schemes.any? { |s| u.scheme = s; has_key?(u) }
+       end
+
+       has_key?(url)
+     end
+
+     #
+     # Use a breadth-first search to calculate the single-source
+     # shortest paths from *root* to all pages in the PageHash
+     #
+     def shortest_paths!(root)
+       root = URI(root) if root.is_a?(String)
+       raise "Root node not found" if !has_key?(root)
+
+       each_value {|p| p.visited = false if p}
+
+       q = Queue.new
+
+       q.enq(root)
+       self[root].depth = 0
+       self[root].visited = true
+       while(!q.empty?)
+         url = q.deq
+
+         next if !has_key?(url)
+
+         page = self[url]
+
+         page.links.each do |u|
+           next if !has_key?(u) or self[u].nil?
+           link = self[u]
+           aliases = [link].concat(link.aliases.map {|a| self[a] })
+
+           aliases.each do |node|
+             if node.depth.nil? or page.depth + 1 < node.depth
+               node.depth = page.depth + 1
+             end
+           end
+
+           q.enq(self[u].url) if !self[u].visited
+           self[u].visited = true
+         end
+       end
+
+       self
+     end
+
+     #
+     # Returns a new PageHash by removing redirect-aliases for each
+     # non-redirect Page
+     #
+     def uniq
+       results = PageHash.new
+       each do |url, page|
+         # if none of the aliases of this page have been added, and this isn't a redirect page, add this page
+         page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a }
+         if !page.redirect? and !page_added
+           results[url] = page.clone
+           results[url].aliases = []
+         end
+       end
+
+       results
+     end
+
+     #
+     # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
+     # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
+     #
+     def pages_linking_to(urls)
+       unless urls.is_a?(Array)
+         urls = [urls]
+         single = true
+       end
+
+       urls.map! do |url|
+         if url.is_a?(String)
+           URI(url) rescue nil
+         else
+           url
+         end
+       end
+       urls.compact!
+
+       links = {}
+       urls.each { |url| links[url] = [] }
+       values.each do |page|
+         urls.each { |url| links[url] << page if page.links.include?(url) }
+       end
+
+       if single and !links.empty?
+         return links.values.first
+       else
+         return links
+       end
+     end
+
+     #
+     # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
+     # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
+     #
+     def urls_linking_to(urls)
+       unless urls.is_a?(Array)
+         urls = [urls]
+         single = true
+       end
+
+       links = pages_linking_to(urls)
+       links.each { |url, pages| links[url] = pages.map {|p| p.url} }
+
+       if single and !links.empty?
+         return links.values.first
+       else
+         return links
+       end
+     end
+
+   end
+ end
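To make the query helpers concrete, here is a small, hypothetical session. In real use the crawler core (not shown in this diff) fills the hash, so the pages and depths below are stubbed by hand; all URLs are placeholders.

```ruby
require 'anemone/page'
require 'anemone/page_hash'

pages = Anemone::PageHash.new
root  = URI('http://example.com/')

# Stub two pages; the second is recorded at an artificially deep level.
pages[root] = Anemone::Page.new(root, '<a href="/a">a</a>', 200,
                                { 'content-type' => ['text/html'] })
pages['http://example.com/a'] =
  Anemone::Page.new(URI('http://example.com/a'), nil, nil, nil, nil, nil, 5)

pages.has_page?(URI('https://example.com/'))   #=> true (scheme-insensitive)
pages.shortest_paths!(root)
pages['http://example.com/a'].depth            #=> 1 (was 5)
pages.urls_linking_to('http://example.com/a')  #=> [URI('http://example.com/')]
```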
+++ lib/anemone/tentacle.rb
@@ -0,0 +1,39 @@
+ require 'anemone/http'
+
+ module Anemone
+   class Tentacle
+
+     #
+     # Create a new Tentacle
+     #
+     def initialize(link_queue, page_queue, opts = {})
+       @link_queue = link_queue
+       @page_queue = page_queue
+       @http = Anemone::HTTP.new(opts)
+       @opts = opts
+     end
+
+     #
+     # Gets links from @link_queue, and pushes the fetched
+     # Page objects onto @page_queue
+     #
+     def run
+       loop do
+         link, from_page = @link_queue.deq
+
+         break if link == :END
+
+         @page_queue << @http.fetch_page(link, from_page)
+
+         delay
+       end
+     end
+
+     private
+
+     def delay
+       sleep @opts[:delay] if @opts[:delay]
+     end
+
+   end
+ end
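Tentacle is the worker half of a producer/consumer pair. A minimal, hypothetical harness follows; the queue contents and URL are invented, since the crawler core that normally does this wiring is not part of this diff.

```ruby
require 'thread'
require 'anemone/tentacle'

link_queue = Queue.new
page_queue = Queue.new

# One worker; the crawler core would normally start several in parallel.
worker = Thread.new do
  Anemone::Tentacle.new(link_queue, page_queue, :delay => 0.1).run
end

link_queue << ['http://example.com/', nil] # [link, from_page]
link_queue << :END                         # tells run to break out of its loop

worker.join
puts page_queue.deq.code
```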