jeremyf-anemone 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/anemone.rb ADDED
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/anemone'
data/lib/anemone/anemone.rb ADDED
@@ -0,0 +1,37 @@
+ require 'ostruct'
+ require 'anemone/core'
+
+ module Anemone
+   # Version number
+   VERSION = '0.1.1'
+
+   # User-Agent string used for HTTP requests
+   USER_AGENT = "Anemone/#{self::VERSION}"
+
+   # module-wide options
+   def Anemone.options=(options)
+     @options = options
+   end
+
+   def Anemone.options
+     @options
+   end
+
+   #
+   # Convenience method to start a crawl using Core
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Anemone.options = OpenStruct.new(options)
+
+     # by default, run 4 Tentacle threads to fetch pages
+     Anemone.options.threads ||= 4
+
+     # disable verbose output by default
+     Anemone.options.verbose ||= false
+
+     # by default, don't throw away the page response body after scanning it for links
+     Anemone.options.discard_page_bodies ||= false
+
+     Core.crawl(urls, &block)
+   end
+ end
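For context, the module above is the gem's entry point, driven through Anemone.crawl. A minimal sketch of an invocation, assuming the full gem is installed; the URL is purely illustrative, and the option names come straight from the ||= defaults above:

    require 'anemone'

    # :threads and :verbose override the defaults applied in Anemone.crawl
    # (4 threads, quiet output); any option becomes a field on the OpenStruct.
    Anemone.crawl("http://www.example.com/", :threads => 2, :verbose => true)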
data/lib/anemone/core.rb ADDED
@@ -0,0 +1,211 @@
+ require 'net/http'
+ require 'thread'
+ require 'anemone/tentacle'
+ require 'anemone/page_hash'
+
+ module Anemone
+   class Core
+     # PageHash storing all Page objects encountered during the crawl
+     attr_reader :pages
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, &block)
+       @urls = [urls].flatten.map { |url| url.is_a?(String) ? URI(url) : url }
+       @urls.each { |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @pages = PageHash.new
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+
+       block.call(self) if block
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(root, &block)
+       self.new(root) do |core|
+         block.call(core) if block
+         core.run
+         return core
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageHash after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regexp patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       if patterns
+         patterns.each do |pattern|
+           @skip_link_patterns << pattern
+         end
+       end
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as it is encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       Anemone.options.threads.times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+       end
+
+       @urls.each { |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+
+         @pages[page.url] = page
+
+         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+
+         # perform the on_every_page blocks for this page
+         do_page_blocks(page)
+
+         page.doc = nil if Anemone.options.discard_page_bodies
+
+         links_to_follow(page).each do |link|
+           # give the link a from_url accessor pointing back at this page
+           eval(%(def link.from_url; '#{page.url}'; end))
+           link_queue.enq(link)
+           @pages[link] = nil
+         end
+
+         # create an entry in the page hash for each alias of this page,
+         # i.e. all the pages that redirected to this page
+         page.aliases.each do |aka|
+           if !@pages.has_key?(aka) or @pages[aka].nil?
+             @pages[aka] = page.alias_clone(aka)
+           end
+           @pages[aka].add_alias!(page.url)
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+
+           if page_queue.empty?
+             @tentacles.size.times { link_queue.enq(:END) }
+             break
+           end
+         end
+
+       end
+
+       @tentacles.each { |t| t.join }
+
+       do_after_crawl_blocks()
+
+       self
+     end
+
+     private
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each { |b| b.call(@pages) }
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |blk|
+         blk.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blks|
+         if page.url.to_s =~ pattern
+           blks.each { |blk| blk.call(page) }
+         end
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.find_all { |link| visit_link?(link) }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern. Returns
+     # +false+ otherwise.
+     #
+     def visit_link?(link)
+       !@pages.has_key?(link) and !skip_link?(link)
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.each { |p| return true if link.path =~ p }
+       return false
+     end
+
+   end
+ end
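Because every registration method on Core returns self and the constructor yields the new instance, a crawl is configured inside the block before run is called. A hedged sketch of the callback API; the URL and patterns are illustrative, and it assumes PageHash responds to size like a Hash (PageHash itself is not part of this diff):

    Anemone.crawl("http://www.example.com/") do |anemone|
      # never follow links whose path matches one of these patterns
      anemone.skip_links_like(/\.pdf$/, /\/private\//)

      # runs for every page pulled off the page queue
      anemone.on_every_page do |page|
        puts "#{page.code} #{page.url}"
      end

      # runs only for pages whose URL matches the pattern
      anemone.on_pages_like(/\/articles\//) do |page|
        puts "article: #{page.url}"
      end

      # runs on the full PageHash once both queues have drained
      anemone.after_crawl do |pages|
        puts "crawled #{pages.size} pages"
      end
    end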
data/lib/anemone/http.rb ADDED
@@ -0,0 +1,38 @@
+ require 'net/http'
+
+ module Anemone
+   class HTTP < Net::HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECTION_LIMIT = 5
+
+     #
+     # Retrieve an HTTP response for *url*, following redirects.
+     # Returns the response object, response code, and final URI location.
+     #
+     def self.get(url)
+       response = get_response(url)
+       code = Integer(response.code)
+       loc = url
+
+       limit = REDIRECTION_LIMIT
+       while response.is_a?(Net::HTTPRedirection) and limit > 0
+         loc = URI(response['location'])
+         loc = url.merge(loc) if loc.relative?
+         response = get_response(loc)
+         limit -= 1
+       end
+
+       return response, code, loc
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def self.get_response(url)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+       Net::HTTP.start(url.host, url.port) do |http|
+         return http.get(full_path, { 'User-Agent' => Anemone::USER_AGENT })
+       end
+     end
+   end
+ end
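Note that HTTP.get computes code before the redirect loop, so it reports the status of the first response while response and loc reflect the final hop; Page stores that first status, which is how redirect? can still report a 3xx after the destination body has been fetched. A small sketch, assuming the rest of the gem is loaded (get_response references Anemone::USER_AGENT) and using an illustrative URL:

    require 'uri'

    response, code, location = Anemone::HTTP.get(URI("http://www.example.com/moved"))

    # if the URL redirected, code is 3xx while response.body holds the
    # final destination's body and location holds its URI
    puts "status #{code}, landed on #{location}"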
data/lib/anemone/page.rb ADDED
@@ -0,0 +1,180 @@
+ require 'anemone/http'
+ require 'nokogiri'
+ require 'ostruct'
+
+ module Anemone
+   class Page
+
+     # The URL of the page
+     attr_reader :url
+     # The URL of the page this page was linked from
+     attr_reader :from_url
+
+     # Array of distinct A tag HREFs from the page
+     attr_reader :links
+     # Headers of the HTTP response
+     attr_reader :headers
+
+     # OpenStruct for user-stored data
+     attr_accessor :data
+     # Nokogiri document for the HTML body
+     attr_accessor :doc
+     # Integer response code of the page
+     attr_accessor :code
+     # Array of redirect-aliases for the page
+     attr_accessor :aliases
+     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+     attr_accessor :visited
+     # Used by PageHash#shortest_paths! to store depth of the page
+     attr_accessor :depth
+
+     #
+     # Create a new Page from the response of an HTTP request to *url*
+     #
+     def self.fetch(url)
+       begin
+         url = URI(url) if url.is_a?(String)
+
+         response, code, location = Anemone::HTTP.get(url)
+
+         aka = nil
+         if !url.eql?(location)
+           aka = location
+         end
+
+         return Page.new(url, response.body, code, response.to_hash, aka, url.respond_to?(:from_url) ? url.from_url : nil)
+       rescue
+         return Page.new(url)
+       end
+     end
+
+     #
+     # Create a new page
+     #
+     def initialize(url, body = nil, code = nil, headers = nil, aka = nil, from_url = nil)
+       @from_url = from_url
+       @url = url
+       @code = code
+       @headers = headers
+       @links = []
+       @aliases = []
+       @data = OpenStruct.new
+
+       @aliases << aka if !aka.nil?
+
+       if body
+         begin
+           @doc = Nokogiri::HTML(body)
+         rescue
+           return
+         end
+
+         return if @doc.nil?
+
+         # get a list of distinct links on the page, in absolute url form
+         @doc.css('a').each do |a|
+           u = a.attributes['href'].content if a.attributes['href']
+           next if u.nil?
+
+           begin
+             abs = to_absolute(URI(u))
+           rescue
+             next
+           end
+
+           @links << abs if in_domain?(abs)
+         end
+
+         @links.uniq!
+       end
+     end
+
+     #
+     # Return a new page with the same *response* and *url*, but
+     # with a 200 response code
+     #
+     def alias_clone(url)
+       p = clone
+       p.add_alias!(@aka) if !@aka.nil?
+       p.code = 200
+       p
+     end
+
+     #
+     # Add a redirect-alias String *aka* to the list of the page's aliases
+     #
+     # Returns *self*
+     #
+     def add_alias!(aka)
+       @aliases << aka if !@aliases.include?(aka)
+       self
+     end
+
+     #
+     # Returns an Array of all links from this page, and all the
+     # redirect-aliases of those pages, as String objects.
+     #
+     # *page_hash* is a PageHash object with the results of the current crawl.
+     #
+     def links_and_their_aliases(page_hash)
+       @links.inject([]) do |results, link|
+         results.concat([link].concat(page_hash[link].aliases))
+       end
+     end
+
+     #
+     # The content-type returned by the HTTP request for this page
+     #
+     def content_type
+       @headers['content-type'][0] rescue nil
+     end
+
+     #
+     # Returns +true+ if the page is an HTML document, returns +false+
+     # otherwise.
+     #
+     def html?
+       (content_type =~ /text\/html/) == 0
+     end
+
+     #
+     # Returns +true+ if the page is an HTTP redirect, returns +false+
+     # otherwise.
+     #
+     def redirect?
+       (300..399).include?(@code)
+     end
+
+     #
+     # Returns +true+ if the page was not found (returned 404 code),
+     # returns +false+ otherwise.
+     #
+     def not_found?
+       404 == @code
+     end
+
+     #
+     # Converts relative URL *link* into an absolute URL based on the
+     # location of the page
+     #
+     def to_absolute(link)
+       # remove anchor
+       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+
+       relative = URI(link)
+       absolute = @url.merge(relative)
+
+       absolute.path = '/' if absolute.path.empty?
+
+       return absolute
+     end
+
+     #
+     # Returns +true+ if *uri* is in the same domain as the page, returns
+     # +false+ otherwise
+     #
+     def in_domain?(uri)
+       uri.host == @url.host
+     end
+   end
+ end
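Page.fetch ties the pieces together: it performs the HTTP request, records any redirect destination as an alias, and parses the body for distinct same-domain links. A sketch of using it standalone, with an illustrative URL; note that fetch rescues all errors and returns a bare Page whose code is nil:

    page = Anemone::Page.fetch("http://www.example.com/")

    unless page.code.nil?
      puts "content-type: #{page.content_type}"
      puts "html? #{page.html?}"
      # links are absolute URIs, already filtered by in_domain?
      page.links.each { |link| puts link }
    end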