parolkar-anemone 0.1.2

data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2009 Vertive, Inc.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = Anemone
2
+
3
+ == DESCRIPTION
4
+ Anemone is a web spider framework that can spider a domain and collect useful
5
+ information about the pages it visits. It is versatile, allowing you to
6
+ write your own specialized spider tasks quickly and easily.
7
+
8
+ == FEATURES
9
+ * Multi-threaded design for high performance
10
+ * Tracks 301 HTTP redirects to understand a page's aliases
11
+ * Built-in BFS algorithm for determining page depth
12
+ * Allows exclusion of URLs based on regular expressions
13
+ * Can optionally obey robots.txt while crawling
14
+
15
+ == REQUIREMENTS
16
+ * nokogiri
17
+
18
+ == EXAMPLES
19
+ See the +bin+ directory for several examples of useful Anemone tasks.
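A minimal crawl, sketched from the Core API in lib/anemone below; the start URL is a placeholder and the <title> lookup just illustrates using the Nokogiri document attached to each Page.

    require 'anemone'

    # Print each page's URL and <title> as the spider visits it.
    Anemone.crawl("http://www.example.com/") do |anemone|
      anemone.on_every_page do |page|
        title = page.doc.at('title').content rescue nil
        puts "#{page.url} #{title}"
      end
    end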
data/bin/anemone_count.rb ADDED
@@ -0,0 +1,36 @@
1
+ #! /usr/bin/env ruby
2
+ # == Synopsis
3
+ # Crawls a site starting at the given URL, and outputs the total number
4
+ # of unique pages on the site.
5
+ #
6
+ # == Usage
7
+ # anemone_count.rb url
8
+ #
9
+ # == Author
10
+ # Chris Kite
11
+
12
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
13
+
14
+ require 'anemone'
15
+
16
+ def usage
17
+ puts <<END
18
+ Usage: anemone_count.rb url
19
+ END
20
+ end
21
+
22
+ # make sure that the first option is a URL we can crawl
23
+ begin
24
+ URI(ARGV[0])
25
+ rescue
26
+ usage
27
+ Process.exit
28
+ end
29
+
30
+ Anemone.crawl(ARGV[0]) do |anemone|
31
+ anemone.after_crawl do |pages|
32
+ puts pages.uniq.size
33
+ end
34
+ end
35
+
36
+
data/bin/anemone_cron.rb ADDED
@@ -0,0 +1,106 @@
1
+ #! /usr/bin/env ruby
2
+ # == Synopsis
3
+ # Performs pagedepth, url list, and count functionality
4
+ # Meant to be run daily as a cron job
5
+ #
6
+ # == Usage
7
+ # anemone_cron.rb [options] url
8
+ #
9
+ # == Options
10
+ # -r, --relative Output relative URLs (rather than absolute)
11
+ # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
12
+ #
13
+ # == Author
14
+ # Chris Kite
15
+
16
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
17
+
18
+ require 'anemone'
19
+ require 'optparse'
20
+ require 'ostruct'
21
+
22
+ def usage
23
+ puts <<END
24
+ Usage: anemone_cron.rb [options] url
25
+
26
+ Options:
27
+ -r, --relative Output relative URLs (rather than absolute)
28
+ -o, --output filename Filename to save URL list to. Defaults to urls.txt.
29
+ END
30
+ end
31
+
32
+ options = OpenStruct.new
33
+ options.relative = false
34
+ options.output_file = 'urls.txt'
35
+
36
+ # make sure that the last option is a URL we can crawl
37
+ begin
38
+ URI(ARGV.last)
39
+ rescue
40
+ usage
41
+ Process.exit
42
+ end
43
+
44
+ # parse command-line options
45
+ opts = OptionParser.new
46
+ opts.on('-r', '--relative') { options.relative = true }
47
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
48
+ opts.parse!(ARGV)
49
+
50
+ root = ARGV.last
51
+
52
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
53
+
54
+ anemone.after_crawl do |pages|
55
+ puts "Crawl results for #{root}\n"
56
+
57
+ # print a list of 404's
58
+ not_found = []
59
+ pages.each_value do |page|
60
+ url = page.url.to_s
61
+ not_found << url if page.not_found?
62
+ end
63
+ unless not_found.empty?
64
+ puts "\n404's:"
65
+
66
+ missing_links = pages.urls_linking_to(not_found)
67
+ missing_links.each do |url, links|
68
+ if options.relative
69
+ puts URI(url).path.to_s
70
+ else
71
+ puts url
72
+ end
73
+ links.slice(0..10).each do |u|
74
+ u = u.path if options.relative
75
+ puts " linked from #{u}"
76
+ end
77
+
78
+ puts " ..." if links.size > 10
79
+ end
80
+
81
+ print "\n"
82
+ end
83
+
84
+ # remove redirect aliases, and calculate pagedepths
85
+ pages = pages.shortest_paths!(root).uniq
86
+ depths = pages.values.inject({}) do |depths, page|
87
+ depths[page.depth] ||= 0
88
+ depths[page.depth] += 1
89
+ depths
90
+ end
91
+
92
+ # print the page count
93
+ puts "Total pages: #{pages.size}\n"
94
+
95
+ # print a list of depths
96
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
97
+
98
+ # output a list of urls to file
99
+ file = open(options.output_file, 'w')
100
+ pages.each_key do |url|
101
+ url = options.relative ? url.path.to_s : url.to_s
102
+ file.puts url
103
+ end
104
+
105
+ end
106
+ end
data/bin/anemone_pagedepth.rb ADDED
@@ -0,0 +1,44 @@
1
+ #! /usr/bin/env ruby
2
+ # == Synopsis
3
+ # Crawls a site starting at the given URL, and outputs a count of
4
+ # the number of Pages at each depth in the site.
5
+ #
6
+ # == Usage
7
+ # anemone_pagedepth.rb url
8
+ #
9
+ # == Author
10
+ # Chris Kite
11
+
12
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
13
+
14
+ require 'anemone'
15
+
16
+ def usage
17
+ puts <<END
18
+ Usage: anemone_pagedepth.rb url
19
+ END
20
+ end
21
+
22
+ # make sure that the first option is a URL we can crawl
23
+ begin
24
+ URI(ARGV[0])
25
+ rescue
26
+ usage
27
+ Process.exit
28
+ end
29
+
30
+ root = ARGV[0]
31
+ Anemone.crawl(root) do |anemone|
32
+ anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
33
+
34
+ anemone.after_crawl do |pages|
35
+ pages = pages.shortest_paths!(root).uniq
36
+ depths = pages.values.inject({}) do |depths, page|
37
+ depths[page.depth] ||= 0
38
+ depths[page.depth] += 1
39
+ depths
40
+ end
41
+
42
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
43
+ end
44
+ end
data/bin/anemone_serialize.rb ADDED
@@ -0,0 +1,51 @@
1
+ #! /usr/bin/env ruby
2
+ # == Synopsis
3
+ # Crawls a site starting at the given URL, and saves the resulting
4
+ # PageHash object to a file using Marshal serialization.
5
+ #
6
+ # == Usage
7
+ # anemone_serialize.rb [options] url
8
+ #
9
+ # == Options
10
+ # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
11
+ #
12
+ # == Author
13
+ # Chris Kite
14
+
15
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
16
+
17
+ require 'anemone'
18
+ require 'optparse'
19
+ require 'ostruct'
20
+
21
+ def usage
22
+ puts <<END
23
+ Usage: anemone_serialize.rb [options] url
24
+
25
+ Options:
26
+ -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
27
+ END
28
+ end
29
+
30
+ # make sure that the first option is a URL we can crawl
31
+ begin
32
+ URI(ARGV[0])
33
+ rescue
34
+ usage
35
+ Process.exit
36
+ end
37
+
38
+ options = OpenStruct.new
39
+ options.output_file = "crawl.#{Time.now.to_i}"
40
+
41
+ # parse command-line options
42
+ opts = OptionParser.new
43
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
44
+ opts.parse!(ARGV)
45
+
46
+ root = ARGV[0]
47
+ Anemone.crawl(root) do |anemone|
48
+ anemone.after_crawl do |pages|
49
+ open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
50
+ end
51
+ end
data/bin/anemone_url_list.rb ADDED
@@ -0,0 +1,54 @@
1
+ #! /usr/bin/env ruby
2
+ # == Synopsis
3
+ # Crawls a site starting at the given URL, and outputs the URL of each page
4
+ # in the domain as they are encountered.
5
+ #
6
+ # == Usage
7
+ # anemone_url_list.rb [options] url
8
+ #
9
+ # == Options
10
+ # -r, --relative Output relative URLs (rather than absolute)
11
+ #
12
+ # == Author
13
+ # Chris Kite
14
+
15
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
16
+
17
+ require 'anemone'
18
+ require 'optparse'
19
+ require 'ostruct'
20
+
21
+ def usage
22
+ puts <<END
23
+ Usage: anemone_url_list.rb [options] url
24
+
25
+ Options:
26
+ -r, --relative Output relative URLs (rather than absolute)
27
+ END
28
+ end
29
+
30
+ options = OpenStruct.new
31
+ options.relative = false
32
+
33
+ # make sure that the last option is a URL we can crawl
34
+ begin
35
+ URI(ARGV.last)
36
+ rescue
37
+ usage
38
+ Process.exit
39
+ end
40
+
41
+ # parse command-line options
42
+ opts = OptionParser.new
43
+ opts.on('-r', '--relative') { options.relative = true }
44
+ opts.parse!(ARGV)
45
+
46
+ Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
47
+ anemone.on_every_page do |page|
48
+ if options.relative
49
+ puts page.url.path
50
+ else
51
+ puts page.url
52
+ end
53
+ end
54
+ end
data/lib/anemone.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'rubygems'
2
+ require 'anemone/anemone'
data/lib/anemone/anemone.rb ADDED
@@ -0,0 +1,56 @@
1
+ require 'ostruct'
2
+ require 'anemone/core'
3
+
4
+ module Anemone
5
+ # Version number
6
+ VERSION = '0.1.2'
7
+
8
+ #module-wide options
9
+ def Anemone.options=(options)
10
+ @options = options
11
+ end
12
+
13
+ def Anemone.options
14
+ @options
15
+ end
16
+
17
+ #
18
+ # Convenience method to start a crawl using Core
19
+ #
20
+ def Anemone.crawl(urls, options = {}, &block)
21
+ Anemone.options = OpenStruct.new(options)
22
+
23
+ #by default, run 4 Tentacle threads to fetch pages
24
+ Anemone.options.threads ||= 4
25
+
26
+ #disable verbose output by default
27
+ Anemone.options.verbose ||= false
28
+
29
+ #by default, don't throw away the page response body after scanning it for links
30
+ Anemone.options.discard_page_bodies ||= false
31
+
32
+ #by default, identify self as Anemone/VERSION
33
+ Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
34
+
35
+ #Obey Robots.txt
36
+ Anemone.options.obey_robots_dot_txt ||= false
37
+ if Anemone.options.obey_robots_dot_txt == true
38
+ begin
39
+ require 'obey_robots_dot_txt'
40
+ rescue LoadError
41
+ warn "You need the 'obey_robots_dot_txt' gem installed, (you may run sudo gem install parolkar-obey_robots_dot_txt --source http://gems.github.com )"
42
+ exit
43
+ end
44
+ end
45
+
46
+ #no delay between requests by default
47
+ Anemone.options.delay ||= 0
48
+
49
+ #use a single thread if a delay was requested
50
+ if(Anemone.options.delay != 0)
51
+ Anemone.options.threads = 1
52
+ end
53
+
54
+ Core.crawl(urls, &block)
55
+ end
56
+ end
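All of the defaults set above can be overridden per crawl by passing an options hash. A sketch (the URL is a placeholder; :obey_robots_dot_txt assumes the parolkar-obey_robots_dot_txt gem is installed):

    require 'anemone'

    # A slower, politer crawl: obey robots.txt and pause between requests.
    # Note that a non-zero :delay forces the crawl down to a single thread.
    Anemone.crawl("http://www.example.com/",
                  :verbose             => true,
                  :discard_page_bodies => true,
                  :obey_robots_dot_txt => true,
                  :delay               => 1) do |anemone|
      anemone.after_crawl do |pages|
        puts "Crawled #{pages.size} pages"
      end
    end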
data/lib/anemone/core.rb ADDED
@@ -0,0 +1,209 @@
1
+ require 'net/http'
2
+ require 'thread'
3
+ require 'anemone/tentacle'
4
+ require 'anemone/page_hash'
5
+
6
+ module Anemone
7
+ class Core
8
+ # PageHash storing all Page objects encountered during the crawl
9
+ attr_reader :pages
10
+
11
+ #
12
+ # Initialize the crawl with starting *urls* (single URL or Array of URLs)
13
+ # and optional *block*
14
+ #
15
+ def initialize(urls, &block)
16
+ @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
17
+ @urls.each{ |url| url.path = '/' if url.path.empty? }
18
+
19
+ @tentacles = []
20
+ @pages = PageHash.new
21
+ @on_every_page_blocks = []
22
+ @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
23
+ @skip_link_patterns = []
24
+ @after_crawl_blocks = []
25
+
26
+ block.call(self) if block
27
+ end
28
+
29
+ #
30
+ # Convenience method to start a new crawl
31
+ #
32
+ def self.crawl(root, &block)
33
+ self.new(root) do |core|
34
+ block.call(core) if block
35
+ core.run
36
+ return core
37
+ end
38
+ end
39
+
40
+ #
41
+ # Add a block to be executed on the PageHash after the crawl
42
+ # is finished
43
+ #
44
+ def after_crawl(&block)
45
+ @after_crawl_blocks << block
46
+ self
47
+ end
48
+
49
+ #
50
+ # Add one ore more Regex patterns for URLs which should not be
51
+ # followed
52
+ #
53
+ def skip_links_like(*patterns)
54
+ if patterns
55
+ patterns.each do |pattern|
56
+ @skip_link_patterns << pattern
57
+ end
58
+ end
59
+ self
60
+ end
61
+
62
+ #
63
+ # Add a block to be executed on every Page as they are encountered
64
+ # during the crawl
65
+ #
66
+ def on_every_page(&block)
67
+ @on_every_page_blocks << block
68
+ self
69
+ end
70
+
71
+ #
72
+ # Add a block to be executed on Page objects with a URL matching
73
+ # one or more patterns
74
+ #
75
+ def on_pages_like(*patterns, &block)
76
+ if patterns
77
+ patterns.each do |pattern|
78
+ @on_pages_like_blocks[pattern] << block
79
+ end
80
+ end
81
+ self
82
+ end
83
+
84
+ #
85
+ # Specify a block which will select which links to follow on each page.
86
+ # The block should return an Array of URI objects.
87
+ #
88
+ def focus_crawl(&block)
89
+ @focus_crawl_block = block
90
+ self
91
+ end
92
+
93
+ #
94
+ # Perform the crawl
95
+ #
96
+ def run
97
+ @urls.delete_if { |url| !visit_link?(url) }
98
+ return if @urls.empty?
99
+
100
+ link_queue = Queue.new
101
+ page_queue = Queue.new
102
+
103
+ Anemone.options.threads.times do |id|
104
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
105
+ end
106
+
107
+ @urls.each{ |url| link_queue.enq(url) }
108
+
109
+ loop do
110
+ page = page_queue.deq
111
+
112
+ @pages[page.url] = page
113
+
114
+ puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
115
+
116
+ #perform the on_every_page blocks for this page
117
+ do_page_blocks(page)
118
+
119
+ page.doc = nil if Anemone.options.discard_page_bodies
120
+
121
+ links_to_follow(page).each do |link|
122
+ link_queue.enq(link)
123
+ @pages[link] = nil
124
+ end
125
+
126
+ #create an entry in the page hash for each alias of this page,
127
+ #i.e. all the pages that redirected to this page
128
+ page.aliases.each do |aka|
129
+ if !@pages.has_key?(aka) or @pages[aka].nil?
130
+ @pages[aka] = page.alias_clone(aka)
131
+ end
132
+ @pages[aka].add_alias!(page.url)
133
+ end
134
+
135
+ # if we are done with the crawl, tell the threads to end
136
+ if link_queue.empty? and page_queue.empty?
137
+ until link_queue.num_waiting == @tentacles.size
138
+ Thread.pass
139
+ end
140
+
141
+ if page_queue.empty?
142
+ @tentacles.size.times { |i| link_queue.enq(:END)}
143
+ break
144
+ end
145
+ end
146
+
147
+ end
148
+
149
+ @tentacles.each { |t| t.join }
150
+
151
+ do_after_crawl_blocks()
152
+
153
+ self
154
+ end
155
+
156
+ private
157
+
158
+ #
159
+ # Execute the after_crawl blocks
160
+ #
161
+ def do_after_crawl_blocks
162
+ @after_crawl_blocks.each {|b| b.call(@pages)}
163
+ end
164
+
165
+ #
166
+ # Execute the on_every_page blocks for *page*
167
+ #
168
+ def do_page_blocks(page)
169
+ @on_every_page_blocks.each do |blk|
170
+ blk.call(page)
171
+ end
172
+
173
+ @on_pages_like_blocks.each do |pattern, blks|
174
+ if page.url.to_s =~ pattern
175
+ blks.each { |blk| blk.call(page) }
176
+ end
177
+ end
178
+ end
179
+
180
+ #
181
+ # Return an Array of links to follow from the given page.
182
+ # Based on whether or not the link has already been crawled,
183
+ # and the block given to focus_crawl()
184
+ #
185
+ def links_to_follow(page)
186
+ links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
187
+ links.find_all { |link| visit_link?(link) }
188
+ end
189
+
190
+ #
191
+ # Returns +true+ if *link* has not been visited already,
192
+ # and is not excluded by a skip_link pattern. Returns
193
+ # +false+ otherwise.
194
+ #
195
+ def visit_link?(link)
196
+ !@pages.has_key?(link) and !skip_link?(link)
197
+ end
198
+
199
+ #
200
+ # Returns +true+ if *link* should not be visited because
201
+ # its URL matches a skip_link pattern.
202
+ #
203
+ def skip_link?(link)
204
+ @skip_link_patterns.each { |p| return true if link.path =~ p}
205
+ return false
206
+ end
207
+
208
+ end
209
+ end
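The Core hooks above compose within a single crawl block. A sketch combining skip_links_like, on_pages_like, and focus_crawl (the URL and the /articles/ patterns are invented for illustration):

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # never follow links whose path matches one of these patterns
      anemone.skip_links_like %r{/login}, %r{/logout}

      # run this block only on pages whose URL looks like an article
      anemone.on_pages_like(%r{/articles/\d+}) do |page|
        puts page.url
      end

      # focus_crawl must return an Array of URIs; keep only links under /articles/
      anemone.focus_crawl do |page|
        page.links.select { |uri| uri.path =~ %r{^/articles/} }
      end
    end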
data/lib/anemone/http.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'net/http'
2
+
3
+ module Anemone
4
+ class HTTP < Net::HTTP
5
+ # Maximum number of redirects to follow on each get_response
6
+ REDIRECTION_LIMIT = 5
7
+
8
+ #
9
+ # Retrieve an HTTP response for *url*, following redirects.
10
+ # Returns the response object, response code, and final URI location.
11
+ #
12
+ def self.get(url)
13
+ response = get_response(url)
14
+ code = Integer(response.code)
15
+ loc = url
16
+
17
+ limit = REDIRECTION_LIMIT
18
+ while response.is_a?(Net::HTTPRedirection) and limit > 0
19
+ loc = URI(response['location'])
20
+ loc = url.merge(loc) if loc.relative?
21
+ response = (Anemone.options.obey_robots_dot_txt ? (Net::HTTP.get_obeying_robots(loc)) : get_response(loc) )
22
+ limit -= 1
23
+ end
24
+
25
+ return response, code, loc
26
+ end
27
+
28
+ #
29
+ # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
30
+ #
31
+ def self.get_response(url)
32
+ full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
33
+ Net::HTTP.start(url.host, url.port) do |http|
34
+ return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
35
+ end
36
+ end
37
+ end
38
+ end
data/lib/anemone/page.rb ADDED
@@ -0,0 +1,177 @@
1
+ require 'anemone/http'
2
+ require 'nokogiri'
3
+ require 'ostruct'
4
+
5
+ module Anemone
6
+ class Page
7
+
8
+ # The URL of the page
9
+ attr_reader :url
10
+ # Array of distinct A tag HREFs from the page
11
+ attr_reader :links
12
+ # Headers of the HTTP response
13
+ attr_reader :headers
14
+
15
+ # OpenStruct for user-stored data
16
+ attr_accessor :data
17
+ # Nokogiri document for the HTML body
18
+ attr_accessor :doc
19
+ # Integer response code of the page
20
+ attr_accessor :code
21
+ # Array of redirect-aliases for the page
22
+ attr_accessor :aliases
23
+ # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
24
+ attr_accessor :visited
25
+ # Used by PageHash#shortest_paths! to store depth of the page
26
+ attr_accessor :depth
27
+
28
+ #
29
+ # Create a new Page from the response of an HTTP request to *url*
30
+ #
31
+ def self.fetch(url)
32
+ begin
33
+ url = URI(url) if url.is_a?(String)
34
+
35
+ response, code, location = Anemone::HTTP.get(url)
36
+
37
+ aka = nil
38
+ if !url.eql?(location)
39
+ aka = location
40
+ end
41
+
42
+ return Page.new(url, response.body, code, response.to_hash, aka)
43
+ rescue
44
+ return Page.new(url)
45
+ end
46
+ end
47
+
48
+ #
49
+ # Create a new page
50
+ #
51
+ def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
52
+ @url = url
53
+ @code = code
54
+ @headers = headers
55
+ @links = []
56
+ @aliases = []
57
+ @data = OpenStruct.new
58
+
59
+ @aliases << aka if !aka.nil?
60
+
61
+ if body
62
+ begin
63
+ @doc = Nokogiri::HTML(body)
64
+ rescue
65
+ return
66
+ end
67
+
68
+ return if @doc.nil?
69
+
70
+ #get a list of distinct links on the page, in absolute url form
71
+ @doc.css('a').each do |a|
72
+ u = a.attributes['href'].content if a.attributes['href']
73
+ next if u.nil?
74
+
75
+ begin
76
+ abs = to_absolute(URI(u))
77
+ rescue
78
+ next
79
+ end
80
+
81
+ @links << abs if in_domain?(abs)
82
+ end
83
+
84
+ @links.uniq!
85
+ end
86
+ end
87
+
88
+
89
+ #
90
+ # Return a new page with the same *response* and *url*, but
91
+ # with a 200 response code
92
+ #
93
+ def alias_clone(url)
94
+ p = clone
95
+ p.add_alias!(@aka) if !@aka.nil?
96
+ p.code = 200
97
+ p
98
+ end
99
+
100
+ #
101
+ # Add a redirect-alias String *aka* to the list of the page's aliases
102
+ #
103
+ # Returns *self*
104
+ #
105
+ def add_alias!(aka)
106
+ @aliases << aka if !@aliases.include?(aka)
107
+ self
108
+ end
109
+
110
+ #
111
+ # Returns an Array of all links from this page, and all the
112
+ # redirect-aliases of those pages, as String objects.
113
+ #
114
+ # *page_hash* is a PageHash object with the results of the current crawl.
115
+ #
116
+ def links_and_their_aliases(page_hash)
117
+ @links.inject([]) do |results, link|
118
+ results.concat([link].concat(page_hash[link].aliases))
119
+ end
120
+ end
121
+
122
+ #
123
+ # The content-type returned by the HTTP request for this page
124
+ #
125
+ def content_type
126
+ @headers['content-type'][0] rescue nil
127
+ end
128
+
129
+ #
130
+ # Returns +true+ if the page is a HTML document, returns +false+
131
+ # otherwise.
132
+ #
133
+ def html?
134
+ (@content_type =~ /text\/html/) == 0
135
+ end
136
+
137
+ #
138
+ # Returns +true+ if the page is a HTTP redirect, returns +false+
139
+ # otherwise.
140
+ #
141
+ def redirect?
142
+ (300..399).include?(@code)
143
+ end
144
+
145
+ #
146
+ # Returns +true+ if the page was not found (returned 404 code),
147
+ # returns +false+ otherwise.
148
+ #
149
+ def not_found?
150
+ 404 == @code
151
+ end
152
+
153
+ #
154
+ # Converts relative URL *link* into an absolute URL based on the
155
+ # location of the page
156
+ #
157
+ def to_absolute(link)
158
+ # remove anchor
159
+ link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
160
+
161
+ relative = URI(link)
162
+ absolute = @url.merge(relative)
163
+
164
+ absolute.path = '/' if absolute.path.empty?
165
+
166
+ return absolute
167
+ end
168
+
169
+ #
170
+ # Returns +true+ if *uri* is in the same domain as the page, returns
171
+ # +false+ otherwise
172
+ #
173
+ def in_domain?(uri)
174
+ uri.host == @url.host
175
+ end
176
+ end
177
+ end
data/lib/anemone/page_hash.rb ADDED
@@ -0,0 +1,116 @@
1
+ module Anemone
2
+ class PageHash < Hash
3
+
4
+ #
5
+ # Use a breadth-first search to calculate the single-source
6
+ # shortest paths from *root* to all pages in the PageHash
7
+ #
8
+ def shortest_paths!(root)
9
+ root = URI(root) if root.is_a?(String)
10
+ raise "Root node not found" if !has_key?(root)
11
+
12
+ each_value {|p| p.visited = false if p}
13
+
14
+ q = Queue.new
15
+
16
+ q.enq(root)
17
+ self[root].depth = 0
18
+ self[root].visited = true
19
+ while(!q.empty?)
20
+ url = q.deq
21
+
22
+ next if !has_key?(url)
23
+
24
+ page = self[url]
25
+
26
+ page.links.each do |u|
27
+ next if !has_key?(u) or self[u].nil?
28
+ link = self[u]
29
+ aliases = [link].concat(link.aliases.map {|a| self[a] })
30
+
31
+ aliases.each do |node|
32
+ if node.depth.nil? or page.depth + 1 < node.depth
33
+ node.depth = page.depth + 1
34
+ end
35
+ end
36
+
37
+ q.enq(self[u].url) if !self[u].visited
38
+ self[u].visited = true
39
+ end
40
+ end
41
+
42
+ self
43
+ end
44
+
45
+ #
46
+ # Returns a new PageHash by removing redirect-aliases for each
47
+ # non-redirect Page
48
+ #
49
+ def uniq
50
+ results = PageHash.new
51
+ each do |url, page|
52
+ #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
53
+ page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
54
+ if !page.redirect? and !page_added
55
+ results[url] = page.clone
56
+ results[url].aliases = []
57
+ end
58
+ end
59
+
60
+ results
61
+ end
62
+
63
+ #
64
+ # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
65
+ # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
66
+ #
67
+ def pages_linking_to(urls)
68
+ unless urls.is_a?(Array)
69
+ urls = [urls]
70
+ single = true
71
+ end
72
+
73
+ urls.map! do |url|
74
+ if url.is_a?(String)
75
+ URI(url) rescue nil
76
+ else
77
+ url
78
+ end
79
+ end
80
+ urls.compact!
81
+
82
+ links = {}
83
+ urls.each { |url| links[url] = [] }
84
+ values.each do |page|
85
+ urls.each { |url| links[url] << page if page.links.include?(url) }
86
+ end
87
+
88
+ if single and !links.empty?
89
+ return links.first
90
+ else
91
+ return links
92
+ end
93
+ end
94
+
95
+ #
96
+ # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
97
+ # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
98
+ #
99
+ def urls_linking_to(urls)
100
+ unless urls.is_a?(Array)
101
+ urls = [urls]
102
+ single = true
103
+ end
104
+
105
+ links = pages_linking_to(urls)
106
+ links.each { |url, pages| links[url] = pages.map{|p| p.url} }
107
+
108
+ if single and !links.empty?
109
+ return links.first
110
+ else
111
+ return links
112
+ end
113
+ end
114
+
115
+ end
116
+ end
data/lib/anemone/tentacle.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'anemone/page'
2
+
3
+ module Anemone
4
+ class Tentacle
5
+
6
+ #
7
+ # Create a new Tentacle
8
+ #
9
+ def initialize(link_queue, page_queue)
10
+ @link_queue = link_queue
11
+ @page_queue = page_queue
12
+ end
13
+
14
+ #
15
+ # Gets links from @link_queue, and returns the fetched
16
+ # Page objects into @page_queue
17
+ #
18
+ def run
19
+ while true do
20
+ link = @link_queue.deq
21
+
22
+ break if link == :END
23
+
24
+ page = Page.fetch(link)
25
+
26
+ @page_queue.enq(page)
27
+
28
+ sleep Anemone.options.delay
29
+ end
30
+ end
31
+
32
+ end
33
+ end
data/spec/anemone_spec.rb ADDED
@@ -0,0 +1,41 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ describe Anemone do
4
+
5
+ it "should have a version" do
6
+ Anemone.const_defined?('VERSION').should == true
7
+ end
8
+
9
+ it "should have options" do
10
+ Anemone.should respond_to(:options)
11
+ end
12
+
13
+ it "should accept options for the crawl" do
14
+ Anemone.crawl(SPEC_DOMAIN, :verbose => false,
15
+ :threads => 2,
16
+ :discard_page_bodies => true,
17
+ :user_agent => 'test')
18
+ Anemone.options.verbose.should == false
19
+ Anemone.options.threads.should == 2
20
+ Anemone.options.discard_page_bodies.should == true
21
+ Anemone.options.delay.should == 0
22
+ Anemone.options.user_agent.should == 'test'
23
+ end
24
+
25
+ it "should accept options of obeying Robots.txt for the crawl" do
26
+ Anemone.crawl(SPEC_DOMAIN, :obey_robots_dot_txt => true)
27
+ Anemone.options.obey_robots_dot_txt.should == true
28
+ end
29
+
30
+ it "should use 1 thread if a delay is requested" do
31
+ Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
32
+ Anemone.options.threads.should == 1
33
+ end
34
+
35
+ it "should return a Anemone::Core from the crawl, which has a PageHash" do
36
+ result = Anemone.crawl(SPEC_DOMAIN)
37
+ result.should be_an_instance_of(Anemone::Core)
38
+ result.pages.should be_an_instance_of(Anemone::PageHash)
39
+ end
40
+
41
+ end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,128 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe Core do
5
+
6
+ before(:each) do
7
+ FakeWeb.clean_registry
8
+ end
9
+
10
+ it "should crawl all the html pages in a domain by following <a> href's" do
11
+ pages = []
12
+ pages << FakePage.new('0', :links => ['1', '2'])
13
+ pages << FakePage.new('1', :links => ['3'])
14
+ pages << FakePage.new('2')
15
+ pages << FakePage.new('3')
16
+
17
+ Anemone.crawl(pages[0].url).should have(4).pages
18
+ end
19
+
20
+ it "should not leave the original domain" do
21
+ pages = []
22
+ pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
23
+ pages << FakePage.new('1')
24
+
25
+ core = Anemone.crawl(pages[0].url)
26
+
27
+ core.should have(2).pages
28
+ core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
29
+ end
30
+
31
+ it "should follow http redirects" do
32
+ pages = []
33
+ pages << FakePage.new('0', :links => ['1'])
34
+ pages << FakePage.new('1', :redirect => '2')
35
+ pages << FakePage.new('2')
36
+
37
+ Anemone.crawl(pages[0].url).should have(3).pages
38
+ end
39
+
40
+ it "should accept multiple starting URLs" do
41
+ pages = []
42
+ pages << FakePage.new('0', :links => ['1'])
43
+ pages << FakePage.new('1')
44
+ pages << FakePage.new('2', :links => ['3'])
45
+ pages << FakePage.new('3')
46
+
47
+ Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
48
+ end
49
+
50
+ it "should include the query string when following links" do
51
+ pages = []
52
+ pages << FakePage.new('0', :links => ['1?foo=1'])
53
+ pages << FakePage.new('1?foo=1')
54
+ pages << FakePage.new('1')
55
+
56
+ core = Anemone.crawl(pages[0].url)
57
+
58
+ core.should have(2).pages
59
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
60
+ end
61
+
62
+ it "should be able to skip links based on a RegEx" do
63
+ pages = []
64
+ pages << FakePage.new('0', :links => ['1', '2'])
65
+ pages << FakePage.new('1')
66
+ pages << FakePage.new('2')
67
+
68
+ core = Anemone.crawl(pages[0].url) do |a|
69
+ a.skip_links_like /1/
70
+ end
71
+
72
+ core.should have(2).pages
73
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
74
+ end
75
+
76
+ it "should be able to call a block on every page" do
77
+ pages = []
78
+ pages << FakePage.new('0', :links => ['1', '2'])
79
+ pages << FakePage.new('1')
80
+ pages << FakePage.new('2')
81
+
82
+ count = 0
83
+ Anemone.crawl(pages[0].url) do |a|
84
+ a.on_every_page { count += 1 }
85
+ end
86
+
87
+ count.should == 3
88
+ end
89
+
90
+ it "should not discard page bodies by default" do
91
+ Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
92
+ end
93
+
94
+ it "should optionally discard page bodies to conserve memory" do
95
+ core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
96
+ core.pages.values.first.doc.should be_nil
97
+ end
98
+
99
+ it "should provide a focus_crawl method to select the links on each page to follow" do
100
+ pages = []
101
+ pages << FakePage.new('0', :links => ['1', '2'])
102
+ pages << FakePage.new('1')
103
+ pages << FakePage.new('2')
104
+
105
+ core = Anemone.crawl(pages[0].url) do |a|
106
+ a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
107
+ end
108
+
109
+ core.should have(2).pages
110
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
111
+ end
112
+
113
+ it "should optionally delay between page requests" do
114
+ delay = 0.25
115
+
116
+ pages = []
117
+ pages << FakePage.new('0', :links => '1')
118
+ pages << FakePage.new('1')
119
+
120
+ start = Time.now
121
+ Anemone.crawl(pages[0].url, :delay => delay)
122
+ finish = Time.now
123
+
124
+ (finish - start).should satisfy {|t| t > delay * 2}
125
+ end
126
+
127
+ end
128
+ end
data/spec/fakeweb_helper.rb ADDED
@@ -0,0 +1,55 @@
1
+ begin
2
+ require 'fakeweb'
3
+ rescue LoadError
4
+ warn "You need the 'fakeweb' gem installed to test Anemone"
5
+ exit
6
+ end
7
+
8
+ FakeWeb.allow_net_connect = false
9
+
10
+ module Anemone
11
+ SPEC_DOMAIN = "http://www.example.com/"
12
+
13
+ class FakePage
14
+ attr_accessor :links
15
+ attr_accessor :hrefs
16
+
17
+ def initialize(name = '', options = {})
18
+ @name = name
19
+ @links = [options[:links]].flatten if options.has_key?(:links)
20
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
21
+ @redirect = options[:redirect] if options.has_key?(:redirect)
22
+
23
+ create_body
24
+ add_to_fakeweb
25
+ end
26
+
27
+ def url
28
+ SPEC_DOMAIN + @name
29
+ end
30
+
31
+ private
32
+
33
+ def create_body
34
+ @body = "<html><body>"
35
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
36
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
37
+ @body += "</body></html>"
38
+ end
39
+
40
+ def add_to_fakeweb
41
+ options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
42
+
43
+ if @redirect
44
+ options[:status] = [301, "Permanently Moved"]
45
+ options[:location] = SPEC_DOMAIN + @redirect
46
+ end
47
+
48
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
49
+ end
50
+ end
51
+ end
52
+
53
+ #default root
54
+ Anemone::FakePage.new
55
+
data/spec/page_spec.rb ADDED
@@ -0,0 +1,49 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe Page do
5
+
6
+ before(:each) do
7
+ @page = Page.fetch(FakePage.new('home').url)
8
+ end
9
+
10
+ it "should be able to fetch a page" do
11
+ @page.should_not be_nil
12
+ @page.url.to_s.should include('home')
13
+ end
14
+
15
+ it "should store the response headers when fetching a page" do
16
+ @page.headers.should_not be_nil
17
+ @page.headers.should have_key('content-type')
18
+ end
19
+
20
+ it "should have an OpenStruct attribute for the developer to store data in" do
21
+ @page.data.should_not be_nil
22
+ @page.data.should be_an_instance_of(OpenStruct)
23
+
24
+ @page.data.test = 'test'
25
+ @page.data.test.should == 'test'
26
+ end
27
+
28
+ it "should have a Nokogori::HTML::Document attribute for the page body" do
29
+ @page.doc.should_not be_nil
30
+ @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
31
+ end
32
+
33
+ it "should indicate whether it was fetched after an HTTP redirect" do
34
+ @page.should respond_to(:redirect?)
35
+
36
+ @page.redirect?.should == false
37
+
38
+ Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
39
+ end
40
+
41
+ it "should have a method to tell if a URI is in the same domain as the page" do
42
+ @page.should respond_to(:in_domain?)
43
+
44
+ @page.in_domain?(URI(FakePage.new('test').url)).should == true
45
+ @page.in_domain?(URI('http://www.other.com/')).should == false
46
+ end
47
+
48
+ end
49
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,7 @@
1
+ require File.dirname(__FILE__) + '/fakeweb_helper'
2
+ require 'rubygems'
3
+
4
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
5
+ require 'anemone'
6
+
7
+ SPEC_DOMAIN = 'http://www.example.com/'
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parolkar-anemone
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.2
5
+ platform: ruby
6
+ authors:
7
+ - Chris Kite
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-05-16 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: nokogiri
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.3.0
24
+ version:
25
+ description:
26
+ email:
27
+ executables:
28
+ - anemone_count.rb
29
+ - anemone_cron.rb
30
+ - anemone_pagedepth.rb
31
+ - anemone_serialize.rb
32
+ - anemone_url_list.rb
33
+ extensions: []
34
+
35
+ extra_rdoc_files:
36
+ - README.rdoc
37
+ files:
38
+ - LICENSE.txt
39
+ - README.rdoc
40
+ - bin/anemone_count.rb
41
+ - bin/anemone_cron.rb
42
+ - bin/anemone_pagedepth.rb
43
+ - bin/anemone_serialize.rb
44
+ - bin/anemone_url_list.rb
45
+ - lib/anemone.rb
46
+ - lib/anemone/anemone.rb
47
+ - lib/anemone/core.rb
48
+ - lib/anemone/http.rb
49
+ - lib/anemone/page.rb
50
+ - lib/anemone/page_hash.rb
51
+ - lib/anemone/tentacle.rb
52
+ has_rdoc: true
53
+ homepage: http://anemone.rubyforge.org
54
+ post_install_message:
55
+ rdoc_options:
56
+ - -m
57
+ - README.rdoc
58
+ - -t
59
+ - Anemone
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ version:
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: "0"
73
+ version:
74
+ requirements: []
75
+
76
+ rubyforge_project: anemone
77
+ rubygems_version: 1.2.0
78
+ signing_key:
79
+ specification_version: 2
80
+ summary: Anemone web-spider framework
81
+ test_files:
82
+ - spec/anemone_spec.rb
83
+ - spec/core_spec.rb
84
+ - spec/page_spec.rb
85
+ - spec/fakeweb_helper.rb
86
+ - spec/spec_helper.rb