parolkar-anemone 0.1.2

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2009 Vertive, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
+ = Anemone
+
+ == DESCRIPTION
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ == FEATURES
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+ * Can crawl obeying robots.txt
+
+ == REQUIREMENTS
+ * nokogiri
+
+ == EXAMPLES
+ See the +bin+ directory for several examples of useful Anemone tasks.
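As a quick orientation before the file listings that follow, a minimal crawl with the API shipped in this version might look like the sketch below. The start URL is a placeholder, and the block methods (on_every_page, after_crawl) are the ones defined in lib/anemone/core.rb later in this diff.

    require 'anemone'

    # Minimal sketch: crawl a placeholder site, print each page as it is fetched,
    # then report how many unique pages were found.
    Anemone.crawl("http://www.example.com/") do |anemone|
      anemone.on_every_page do |page|
        puts "#{page.code} #{page.url}"
      end

      anemone.after_crawl do |pages|
        puts "#{pages.uniq.size} unique pages"
      end
    end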
data/bin/anemone_count.rb ADDED
@@ -0,0 +1,36 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the total number
+ # of unique pages on the site.
+ #
+ # == Usage
+ # anemone_count.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+ puts <<END
+ Usage: anemone_count.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+ URI(ARGV[0])
+ rescue
+ usage
+ Process.exit
+ end
+
+ Anemone.crawl(ARGV[0]) do |anemone|
+ anemone.after_crawl do |pages|
+ puts pages.uniq.size
+ end
+ end
+
+
data/bin/anemone_cron.rb ADDED
@@ -0,0 +1,106 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Performs pagedepth, url list, and count functionality
+ # Meant to be run daily as a cron job
+ #
+ # == Usage
+ # anemone_url_list.rb [options] url
+ #
+ # == Options
+ # -r, --relative Output relative URLs (rather than absolute)
+ # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+ puts <<END
+ Usage: anemone_url_list.rb [options] url
+
+ Options:
+ -r, --relative Output relative URLs (rather than absolute)
+ -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ # make sure that the last option is a URL we can crawl
+ begin
+ URI(ARGV.last)
+ rescue
+ usage
+ Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV.last
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+ anemone.after_crawl do |pages|
+ puts "Crawl results for #{root}\n"
+
+ # print a list of 404's
+ not_found = []
+ pages.each_value do |page|
+ url = page.url.to_s
+ not_found << url if page.not_found?
+ end
+ unless not_found.empty?
+ puts "\n404's:"
+
+ missing_links = pages.urls_linking_to(not_found)
+ missing_links.each do |url, links|
+ if options.relative
+ puts URI(url).path.to_s
+ else
+ puts url
+ end
+ links.slice(0..10).each do |u|
+ u = u.path if options.relative
+ puts " linked from #{u}"
+ end
+
+ puts " ..." if links.size > 10
+ end
+
+ print "\n"
+ end
+
+ # remove redirect aliases, and calculate pagedepths
+ pages = pages.shortest_paths!(root).uniq
+ depths = pages.values.inject({}) do |depths, page|
+ depths[page.depth] ||= 0
+ depths[page.depth] += 1
+ depths
+ end
+
+ # print the page count
+ puts "Total pages: #{pages.size}\n"
+
+ # print a list of depths
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+ # output a list of urls to file
+ file = open(options.output_file, 'w')
+ pages.each_key do |url|
+ url = options.relative ? url.path.to_s : url.to_s
+ file.puts url
+ end
+
+ end
+ end
data/bin/anemone_pagedepth.rb ADDED
@@ -0,0 +1,44 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs a count of
+ # the number of Pages at each depth in the site.
+ #
+ # == Usage
+ # anemone_pagedepth.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+ puts <<END
+ Usage: anemone_pagedepth.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+ URI(ARGV[0])
+ rescue
+ usage
+ Process.exit
+ end
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+ anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+ anemone.after_crawl do |pages|
+ pages = pages.shortest_paths!(root).uniq
+ depths = pages.values.inject({}) do |depths, page|
+ depths[page.depth] ||= 0
+ depths[page.depth] += 1
+ depths
+ end
+
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+ end
+ end
data/bin/anemone_serialize.rb ADDED
@@ -0,0 +1,51 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and saves the resulting
+ # PageHash object to a file using Marshal serialization.
+ #
+ # == Usage
+ # anemone_serialize.rb [options] url
+ #
+ # == Options
+ # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+ puts <<END
+ Usage: anemone_serialize.rb [options] url
+
+ Options:
+ -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+ URI(ARGV[0])
+ rescue
+ usage
+ Process.exit
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+ anemone.after_crawl do |pages|
+ open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+ end
+ end
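For completeness, a PageHash saved by the script above can be loaded back with Marshal. A minimal sketch, with an illustrative filename:

    require 'anemone'

    # Load a PageHash previously written by anemone_serialize.rb (filename is a placeholder).
    pages = open("crawl.1242456789", "rb") { |f| Marshal.load(f) }
    puts "Loaded #{pages.size} pages"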
data/bin/anemone_url_list.rb ADDED
@@ -0,0 +1,54 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the URL of each page
+ # in the domain as they are encountered.
+ #
+ # == Usage
+ # anemone_url_list.rb [options] url
+ #
+ # == Options
+ # -r, --relative Output relative URLs (rather than absolute)
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+ puts <<END
+ Usage: anemone_url_list.rb [options] url
+
+ Options:
+ -r, --relative Output relative URLs (rather than absolute)
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+
+ # make sure that the last option is a URL we can crawl
+ begin
+ URI(ARGV.last)
+ rescue
+ usage
+ Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
+ anemone.on_every_page do |page|
+ if options.relative
+ puts page.url.path
+ else
+ puts page.url
+ end
+ end
+ end
data/lib/anemone.rb ADDED
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/anemone'
data/lib/anemone/anemone.rb ADDED
@@ -0,0 +1,56 @@
+ require 'ostruct'
+ require 'anemone/core'
+
+ module Anemone
+ # Version number
+ VERSION = '0.1.2'
+
+ #module-wide options
+ def Anemone.options=(options)
+ @options = options
+ end
+
+ def Anemone.options
+ @options
+ end
+
+ #
+ # Convenience method to start a crawl using Core
+ #
+ def Anemone.crawl(urls, options = {}, &block)
+ Anemone.options = OpenStruct.new(options)
+
+ #by default, run 4 Tentacle threads to fetch pages
+ Anemone.options.threads ||= 4
+
+ #disable verbose output by default
+ Anemone.options.verbose ||= false
+
+ #by default, don't throw away the page response body after scanning it for links
+ Anemone.options.discard_page_bodies ||= false
+
+ #by default, identify self as Anemone/VERSION
+ Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
+
+ #Obey Robots.txt
+ Anemone.options.obey_robots_dot_txt ||= false
+ if Anemone.options.obey_robots_dot_txt == true
+ begin
+ require 'obey_robots_dot_txt'
+ rescue LoadError
+ warn "You need the 'obey_robots_dot_txt' gem installed (you may run: sudo gem install parolkar-obey_robots_dot_txt --source http://gems.github.com)"
+ exit
+ end
+ end
+
+ #no delay between requests by default
+ Anemone.options.delay ||= 0
+
+ #use a single thread if a delay was requested
+ if(Anemone.options.delay != 0)
+ Anemone.options.threads = 1
+ end
+
+ Core.crawl(urls, &block)
+ end
+ end
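The defaults handled above translate into crawl options such as the following sketch. The URL is a placeholder, and :obey_robots_dot_txt additionally requires the parolkar-obey_robots_dot_txt gem, as the warning in the code notes.

    require 'anemone'

    Anemone.crawl("http://www.example.com/",
                  :threads    => 2,            # default is 4 Tentacle threads
                  :delay      => 1,            # seconds between requests; forces threads to 1
                  :user_agent => "MyBot/1.0",  # default is "Anemone/#{Anemone::VERSION}"
                  :verbose    => true)         # default is false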
data/lib/anemone/core.rb ADDED
@@ -0,0 +1,209 @@
+ require 'net/http'
+ require 'thread'
+ require 'anemone/tentacle'
+ require 'anemone/page_hash'
+
+ module Anemone
+ class Core
+ # PageHash storing all Page objects encountered during the crawl
+ attr_reader :pages
+
+ #
+ # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+ # and optional *block*
+ #
+ def initialize(urls, &block)
+ @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
+ @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+ @tentacles = []
+ @pages = PageHash.new
+ @on_every_page_blocks = []
+ @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+ @skip_link_patterns = []
+ @after_crawl_blocks = []
+
+ block.call(self) if block
+ end
+
+ #
+ # Convenience method to start a new crawl
+ #
+ def self.crawl(root, &block)
+ self.new(root) do |core|
+ block.call(core) if block
+ core.run
+ return core
+ end
+ end
+
+ #
+ # Add a block to be executed on the PageHash after the crawl
+ # is finished
+ #
+ def after_crawl(&block)
+ @after_crawl_blocks << block
+ self
+ end
+
+ #
+ # Add one or more Regex patterns for URLs which should not be
+ # followed
+ #
+ def skip_links_like(*patterns)
+ if patterns
+ patterns.each do |pattern|
+ @skip_link_patterns << pattern
+ end
+ end
+ self
+ end
+
+ #
+ # Add a block to be executed on every Page as they are encountered
+ # during the crawl
+ #
+ def on_every_page(&block)
+ @on_every_page_blocks << block
+ self
+ end
+
+ #
+ # Add a block to be executed on Page objects with a URL matching
+ # one or more patterns
+ #
+ def on_pages_like(*patterns, &block)
+ if patterns
+ patterns.each do |pattern|
+ @on_pages_like_blocks[pattern] << block
+ end
+ end
+ self
+ end
+
+ #
+ # Specify a block which will select which links to follow on each page.
+ # The block should return an Array of URI objects.
+ #
+ def focus_crawl(&block)
+ @focus_crawl_block = block
+ self
+ end
+
+ #
+ # Perform the crawl
+ #
+ def run
+ @urls.delete_if { |url| !visit_link?(url) }
+ return if @urls.empty?
+
+ link_queue = Queue.new
+ page_queue = Queue.new
+
+ Anemone.options.threads.times do |id|
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+ end
+
+ @urls.each{ |url| link_queue.enq(url) }
+
+ loop do
+ page = page_queue.deq
+
+ @pages[page.url] = page
+
+ puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+
+ #perform the on_every_page blocks for this page
+ do_page_blocks(page)
+
+ page.doc = nil if Anemone.options.discard_page_bodies
+
+ links_to_follow(page).each do |link|
+ link_queue.enq(link)
+ @pages[link] = nil
+ end
+
+ #create an entry in the page hash for each alias of this page,
+ #i.e. all the pages that redirected to this page
+ page.aliases.each do |aka|
+ if !@pages.has_key?(aka) or @pages[aka].nil?
+ @pages[aka] = page.alias_clone(aka)
+ end
+ @pages[aka].add_alias!(page.url)
+ end
+
+ # if we are done with the crawl, tell the threads to end
+ if link_queue.empty? and page_queue.empty?
+ until link_queue.num_waiting == @tentacles.size
+ Thread.pass
+ end
+
+ if page_queue.empty?
+ @tentacles.size.times { |i| link_queue.enq(:END)}
+ break
+ end
+ end
+
+ end
+
+ @tentacles.each { |t| t.join }
+
+ do_after_crawl_blocks()
+
+ self
+ end
+
+ private
+
+ #
+ # Execute the after_crawl blocks
+ #
+ def do_after_crawl_blocks
+ @after_crawl_blocks.each {|b| b.call(@pages)}
+ end
+
+ #
+ # Execute the on_every_page blocks for *page*
+ #
+ def do_page_blocks(page)
+ @on_every_page_blocks.each do |blk|
+ blk.call(page)
+ end
+
+ @on_pages_like_blocks.each do |pattern, blks|
+ if page.url.to_s =~ pattern
+ blks.each { |blk| blk.call(page) }
+ end
+ end
+ end
+
+ #
+ # Return an Array of links to follow from the given page.
+ # Based on whether or not the link has already been crawled,
+ # and the block given to focus_crawl()
+ #
+ def links_to_follow(page)
+ links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+ links.find_all { |link| visit_link?(link) }
+ end
+
+ #
+ # Returns +true+ if *link* has not been visited already,
+ # and is not excluded by a skip_link pattern. Returns
+ # +false+ otherwise.
+ #
+ def visit_link?(link)
+ !@pages.has_key?(link) and !skip_link?(link)
+ end
+
+ #
+ # Returns +true+ if *link* should not be visited because
+ # its URL matches a skip_link pattern.
+ #
+ def skip_link?(link)
+ @skip_link_patterns.each { |p| return true if link.path =~ p}
+ return false
+ end
+
+ end
+ end
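A sketch of how the Core callbacks above combine in practice; the URL and patterns are illustrative:

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # never follow login or logout links
      anemone.skip_links_like %r{/login}, %r{/logout}

      # of the remaining links, only follow those the focus block keeps
      anemone.focus_crawl { |page| page.links.reject { |link| link.to_s =~ /\?sort=/ } }

      # run a block only on pages whose URL matches a pattern
      anemone.on_pages_like(%r{/articles/\d+}) do |page|
        puts page.doc.css('title').text if page.doc
      end
    end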
data/lib/anemone/http.rb ADDED
@@ -0,0 +1,38 @@
+ require 'net/http'
+
+ module Anemone
+ class HTTP < Net::HTTP
+ # Maximum number of redirects to follow on each get_response
+ REDIRECTION_LIMIT = 5
+
+ #
+ # Retrieve an HTTP response for *url*, following redirects.
+ # Returns the response object, response code, and final URI location.
+ #
+ def self.get(url)
+ response = get_response(url)
+ code = Integer(response.code)
+ loc = url
+
+ limit = REDIRECTION_LIMIT
+ while response.is_a?(Net::HTTPRedirection) and limit > 0
+ loc = URI(response['location'])
+ loc = url.merge(loc) if loc.relative?
+ response = (Anemone.options.obey_robots_dot_txt ? (Net::HTTP.get_obeying_robots(loc)) : get_response(loc) )
+ limit -= 1
+ end
+
+ return response, code, loc
+ end
+
+ #
+ # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+ #
+ def self.get_response(url)
+ full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+ Net::HTTP.start(url.host, url.port) do |http|
+ return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
+ end
+ end
+ end
+ end
data/lib/anemone/page.rb ADDED
@@ -0,0 +1,177 @@
+ require 'anemone/http'
+ require 'nokogiri'
+ require 'ostruct'
+
+ module Anemone
+ class Page
+
+ # The URL of the page
+ attr_reader :url
+ # Array of distinct A tag HREFs from the page
+ attr_reader :links
+ # Headers of the HTTP response
+ attr_reader :headers
+
+ # OpenStruct for user-stored data
+ attr_accessor :data
+ # Nokogiri document for the HTML body
+ attr_accessor :doc
+ # Integer response code of the page
+ attr_accessor :code
+ # Array of redirect-aliases for the page
+ attr_accessor :aliases
+ # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+ attr_accessor :visited
+ # Used by PageHash#shortest_paths! to store depth of the page
+ attr_accessor :depth
+
+ #
+ # Create a new Page from the response of an HTTP request to *url*
+ #
+ def self.fetch(url)
+ begin
+ url = URI(url) if url.is_a?(String)
+
+ response, code, location = Anemone::HTTP.get(url)
+
+ aka = nil
+ if !url.eql?(location)
+ aka = location
+ end
+
+ return Page.new(url, response.body, code, response.to_hash, aka)
+ rescue
+ return Page.new(url)
+ end
+ end
+
+ #
+ # Create a new page
+ #
+ def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
+ @url = url
+ @code = code
+ @headers = headers
+ @links = []
+ @aliases = []
+ @data = OpenStruct.new
+
+ @aliases << aka if !aka.nil?
+
+ if body
+ begin
+ @doc = Nokogiri::HTML(body)
+ rescue
+ return
+ end
+
+ return if @doc.nil?
+
+ #get a list of distinct links on the page, in absolute url form
+ @doc.css('a').each do |a|
+ u = a.attributes['href'].content if a.attributes['href']
+ next if u.nil?
+
+ begin
+ abs = to_absolute(URI(u))
+ rescue
+ next
+ end
+
+ @links << abs if in_domain?(abs)
+ end
+
+ @links.uniq!
+ end
+ end
+
+
+ #
+ # Return a new page with the same *response* and *url*, but
+ # with a 200 response code
+ #
+ def alias_clone(url)
+ p = clone
+ p.add_alias!(@aka) if !@aka.nil?
+ p.code = 200
+ p
+ end
+
+ #
+ # Add a redirect-alias String *aka* to the list of the page's aliases
+ #
+ # Returns *self*
+ #
+ def add_alias!(aka)
+ @aliases << aka if !@aliases.include?(aka)
+ self
+ end
+
+ #
+ # Returns an Array of all links from this page, and all the
+ # redirect-aliases of those pages, as String objects.
+ #
+ # *page_hash* is a PageHash object with the results of the current crawl.
+ #
+ def links_and_their_aliases(page_hash)
+ @links.inject([]) do |results, link|
+ results.concat([link].concat(page_hash[link].aliases))
+ end
+ end
+
+ #
+ # The content-type returned by the HTTP request for this page
+ #
+ def content_type
+ @headers['content-type'][0] rescue nil
+ end
+
+ #
+ # Returns +true+ if the page is an HTML document, returns +false+
+ # otherwise.
+ #
+ def html?
+ (@content_type =~ /text\/html/) == 0
+ end
+
+ #
+ # Returns +true+ if the page is an HTTP redirect, returns +false+
+ # otherwise.
+ #
+ def redirect?
+ (300..399).include?(@code)
+ end
+
+ #
+ # Returns +true+ if the page was not found (returned 404 code),
+ # returns +false+ otherwise.
+ #
+ def not_found?
+ 404 == @code
+ end
+
+ #
+ # Converts relative URL *link* into an absolute URL based on the
+ # location of the page
+ #
+ def to_absolute(link)
+ # remove anchor
+ link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+
+ relative = URI(link)
+ absolute = @url.merge(relative)
+
+ absolute.path = '/' if absolute.path.empty?
+
+ return absolute
+ end
+
+ #
+ # Returns +true+ if *uri* is in the same domain as the page, returns
+ # +false+ otherwise
+ #
+ def in_domain?(uri)
+ uri.host == @url.host
+ end
+ end
+ end
data/lib/anemone/page_hash.rb ADDED
@@ -0,0 +1,116 @@
+ module Anemone
+ class PageHash < Hash
+
+ #
+ # Use a breadth-first search to calculate the single-source
+ # shortest paths from *root* to all pages in the PageHash
+ #
+ def shortest_paths!(root)
+ root = URI(root) if root.is_a?(String)
+ raise "Root node not found" if !has_key?(root)
+
+ each_value {|p| p.visited = false if p}
+
+ q = Queue.new
+
+ q.enq(root)
+ self[root].depth = 0
+ self[root].visited = true
+ while(!q.empty?)
+ url = q.deq
+
+ next if !has_key?(url)
+
+ page = self[url]
+
+ page.links.each do |u|
+ next if !has_key?(u) or self[u].nil?
+ link = self[u]
+ aliases = [link].concat(link.aliases.map {|a| self[a] })
+
+ aliases.each do |node|
+ if node.depth.nil? or page.depth + 1 < node.depth
+ node.depth = page.depth + 1
+ end
+ end
+
+ q.enq(self[u].url) if !self[u].visited
+ self[u].visited = true
+ end
+ end
+
+ self
+ end
+
+ #
+ # Returns a new PageHash by removing redirect-aliases for each
+ # non-redirect Page
+ #
+ def uniq
+ results = PageHash.new
+ each do |url, page|
+ #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
+ page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
+ if !page.redirect? and !page_added
+ results[url] = page.clone
+ results[url].aliases = []
+ end
+ end
+
+ results
+ end
+
+ #
+ # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
+ # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
+ #
+ def pages_linking_to(urls)
+ unless urls.is_a?(Array)
+ urls = [urls] unless urls.is_a?(Array)
+ single = true
+ end
+
+ urls.map! do |url|
+ if url.is_a?(String)
+ URI(url) rescue nil
+ else
+ url
+ end
+ end
+ urls.compact
+
+ links = {}
+ urls.each { |url| links[url] = [] }
+ values.each do |page|
+ urls.each { |url| links[url] << page if page.links.include?(url) }
+ end
+
+ if single and !links.empty?
+ return links.first
+ else
+ return links
+ end
+ end
+
+ #
+ # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
+ # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
+ #
+ def urls_linking_to(urls)
+ unless urls.is_a?(Array)
+ urls = [urls] unless urls.is_a?(Array)
+ single = true
+ end
+
+ links = pages_linking_to(urls)
+ links.each { |url, pages| links[url] = pages.map{|p| p.url} }
+
+ if single and !links.empty?
+ return links.first
+ else
+ return links
+ end
+ end
+
+ end
+ end
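A usage sketch of the PageHash queries above, run from an after_crawl block; the root URL and the /about path are placeholders:

    require 'anemone'

    root = "http://www.example.com/"   # placeholder start URL
    Anemone.crawl(root) do |anemone|
      anemone.after_crawl do |pages|
        # depth of every non-redirect page from the root
        tree = pages.shortest_paths!(root).uniq
        tree.each_value { |page| puts "#{page.depth} #{page.url}" }

        # URLs that link to a given page (single-URL form returns one [url, urls] pair)
        url, referrers = pages.urls_linking_to(root + "about")
        puts "#{referrers.size} pages link to #{url}"
      end
    end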
data/lib/anemone/tentacle.rb ADDED
@@ -0,0 +1,33 @@
+ require 'anemone/page'
+
+ module Anemone
+ class Tentacle
+
+ #
+ # Create a new Tentacle
+ #
+ def initialize(link_queue, page_queue)
+ @link_queue = link_queue
+ @page_queue = page_queue
+ end
+
+ #
+ # Gets links from @link_queue, and puts the fetched
+ # Page objects into @page_queue
+ #
+ def run
+ while true do
+ link = @link_queue.deq
+
+ break if link == :END
+
+ page = Page.fetch(link)
+
+ @page_queue.enq(page)
+
+ sleep Anemone.options.delay
+ end
+ end
+
+ end
+ end
data/spec/anemone_spec.rb ADDED
@@ -0,0 +1,41 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ describe Anemone do
+
+ it "should have a version" do
+ Anemone.const_defined?('VERSION').should == true
+ end
+
+ it "should have options" do
+ Anemone.should respond_to(:options)
+ end
+
+ it "should accept options for the crawl" do
+ Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+ :threads => 2,
+ :discard_page_bodies => true,
+ :user_agent => 'test')
+ Anemone.options.verbose.should == false
+ Anemone.options.threads.should == 2
+ Anemone.options.discard_page_bodies.should == true
+ Anemone.options.delay.should == 0
+ Anemone.options.user_agent.should == 'test'
+ end
+
+ it "should accept options of obeying Robots.txt for the crawl" do
+ Anemone.crawl(SPEC_DOMAIN, :obey_robots_dot_txt => true)
+ Anemone.options.obey_robots_dot_txt.should == true
+ end
+
+ it "should use 1 thread if a delay is requested" do
+ Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
+ Anemone.options.threads.should == 1
+ end
+
+ it "should return an Anemone::Core from the crawl, which has a PageHash" do
+ result = Anemone.crawl(SPEC_DOMAIN)
+ result.should be_an_instance_of(Anemone::Core)
+ result.pages.should be_an_instance_of(Anemone::PageHash)
+ end
+
+ end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,128 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ module Anemone
+ describe Core do
+
+ before(:each) do
+ FakeWeb.clean_registry
+ end
+
+ it "should crawl all the html pages in a domain by following <a> href's" do
+ pages = []
+ pages << FakePage.new('0', :links => ['1', '2'])
+ pages << FakePage.new('1', :links => ['3'])
+ pages << FakePage.new('2')
+ pages << FakePage.new('3')
+
+ Anemone.crawl(pages[0].url).should have(4).pages
+ end
+
+ it "should not leave the original domain" do
+ pages = []
+ pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+ pages << FakePage.new('1')
+
+ core = Anemone.crawl(pages[0].url)
+
+ core.should have(2).pages
+ core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
+ end
+
+ it "should follow http redirects" do
+ pages = []
+ pages << FakePage.new('0', :links => ['1'])
+ pages << FakePage.new('1', :redirect => '2')
+ pages << FakePage.new('2')
+
+ Anemone.crawl(pages[0].url).should have(3).pages
+ end
+
+ it "should accept multiple starting URLs" do
+ pages = []
+ pages << FakePage.new('0', :links => ['1'])
+ pages << FakePage.new('1')
+ pages << FakePage.new('2', :links => ['3'])
+ pages << FakePage.new('3')
+
+ Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
+ end
+
+ it "should include the query string when following links" do
+ pages = []
+ pages << FakePage.new('0', :links => ['1?foo=1'])
+ pages << FakePage.new('1?foo=1')
+ pages << FakePage.new('1')
+
+ core = Anemone.crawl(pages[0].url)
+
+ core.should have(2).pages
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
+ end
+
+ it "should be able to skip links based on a RegEx" do
+ pages = []
+ pages << FakePage.new('0', :links => ['1', '2'])
+ pages << FakePage.new('1')
+ pages << FakePage.new('2')
+
+ core = Anemone.crawl(pages[0].url) do |a|
+ a.skip_links_like /1/
+ end
+
+ core.should have(2).pages
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
+ end
+
+ it "should be able to call a block on every page" do
+ pages = []
+ pages << FakePage.new('0', :links => ['1', '2'])
+ pages << FakePage.new('1')
+ pages << FakePage.new('2')
+
+ count = 0
+ Anemone.crawl(pages[0].url) do |a|
+ a.on_every_page { count += 1 }
+ end
+
+ count.should == 3
+ end
+
+ it "should not discard page bodies by default" do
+ Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
+ end
+
+ it "should optionally discard page bodies to conserve memory" do
+ core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
+ core.pages.values.first.doc.should be_nil
+ end
+
+ it "should provide a focus_crawl method to select the links on each page to follow" do
+ pages = []
+ pages << FakePage.new('0', :links => ['1', '2'])
+ pages << FakePage.new('1')
+ pages << FakePage.new('2')
+
+ core = Anemone.crawl(pages[0].url) do |a|
+ a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+ end
+
+ core.should have(2).pages
+ core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
+ end
+
+ it "should optionally delay between page requests" do
+ delay = 0.25
+
+ pages = []
+ pages << FakePage.new('0', :links => '1')
+ pages << FakePage.new('1')
+
+ start = Time.now
+ Anemone.crawl(pages[0].url, :delay => delay)
+ finish = Time.now
+
+ (finish - start).should satisfy {|t| t > delay * 2}
+ end
+
+ end
+ end
data/spec/fakeweb_helper.rb ADDED
@@ -0,0 +1,55 @@
+ begin
+ require 'fakeweb'
+ rescue LoadError
+ warn "You need the 'fakeweb' gem installed to test Anemone"
+ exit
+ end
+
+ FakeWeb.allow_net_connect = false
+
+ module Anemone
+ SPEC_DOMAIN = "http://www.example.com/"
+
+ class FakePage
+ attr_accessor :links
+ attr_accessor :hrefs
+
+ def initialize(name = '', options = {})
+ @name = name
+ @links = [options[:links]].flatten if options.has_key?(:links)
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+ @redirect = options[:redirect] if options.has_key?(:redirect)
+
+ create_body
+ add_to_fakeweb
+ end
+
+ def url
+ SPEC_DOMAIN + @name
+ end
+
+ private
+
+ def create_body
+ @body = "<html><body>"
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
+ @body += "</body></html>"
+ end
+
+ def add_to_fakeweb
+ options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
+
+ if @redirect
+ options[:status] = [301, "Permanently Moved"]
+ options[:location] = SPEC_DOMAIN + @redirect
+ end
+
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+ end
+ end
+ end
+
+ #default root
+ Anemone::FakePage.new
+
data/spec/page_spec.rb ADDED
@@ -0,0 +1,49 @@
+ require File.dirname(__FILE__) + '/spec_helper'
+
+ module Anemone
+ describe Page do
+
+ before(:each) do
+ @page = Page.fetch(FakePage.new('home').url)
+ end
+
+ it "should be able to fetch a page" do
+ @page.should_not be_nil
+ @page.url.to_s.should include('home')
+ end
+
+ it "should store the response headers when fetching a page" do
+ @page.headers.should_not be_nil
+ @page.headers.should have_key('content-type')
+ end
+
+ it "should have an OpenStruct attribute for the developer to store data in" do
+ @page.data.should_not be_nil
+ @page.data.should be_an_instance_of(OpenStruct)
+
+ @page.data.test = 'test'
+ @page.data.test.should == 'test'
+ end
+
+ it "should have a Nokogiri::HTML::Document attribute for the page body" do
+ @page.doc.should_not be_nil
+ @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
+ end
+
+ it "should indicate whether it was fetched after an HTTP redirect" do
+ @page.should respond_to(:redirect?)
+
+ @page.redirect?.should == false
+
+ Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
+ end
+
+ it "should have a method to tell if a URI is in the same domain as the page" do
+ @page.should respond_to(:in_domain?)
+
+ @page.in_domain?(URI(FakePage.new('test').url)).should == true
+ @page.in_domain?(URI('http://www.other.com/')).should == false
+ end
+
+ end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,7 @@
+ require File.dirname(__FILE__) + '/fakeweb_helper'
+ require 'rubygems'
+
+ $:.unshift(File.dirname(__FILE__) + '/../lib/')
+ require 'anemone'
+
+ SPEC_DOMAIN = 'http://www.example.com/'
metadata ADDED
@@ -0,0 +1,86 @@
+ --- !ruby/object:Gem::Specification
+ name: parolkar-anemone
+ version: !ruby/object:Gem::Version
+ version: 0.1.2
+ platform: ruby
+ authors:
+ - Chris Kite
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-05-16 00:00:00 -07:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: nokogiri
+ type: :runtime
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 1.3.0
+ version:
+ description:
+ email:
+ executables:
+ - anemone_count.rb
+ - anemone_cron.rb
+ - anemone_pagedepth.rb
+ - anemone_serialize.rb
+ - anemone_url_list.rb
+ extensions: []
+
+ extra_rdoc_files:
+ - README.rdoc
+ files:
+ - LICENSE.txt
+ - README.rdoc
+ - bin/anemone_count.rb
+ - bin/anemone_cron.rb
+ - bin/anemone_pagedepth.rb
+ - bin/anemone_serialize.rb
+ - bin/anemone_url_list.rb
+ - lib/anemone.rb
+ - lib/anemone/anemone.rb
+ - lib/anemone/core.rb
+ - lib/anemone/http.rb
+ - lib/anemone/page.rb
+ - lib/anemone/page_hash.rb
+ - lib/anemone/tentacle.rb
+ has_rdoc: true
+ homepage: http://anemone.rubyforge.org
+ post_install_message:
+ rdoc_options:
+ - -m
+ - README.rdoc
+ - -t
+ - Anemone
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0"
+ version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0"
+ version:
+ requirements: []
+
+ rubyforge_project: anemone
+ rubygems_version: 1.2.0
+ signing_key:
+ specification_version: 2
+ summary: Anemone web-spider framework
+ test_files:
+ - spec/anemone_spec.rb
+ - spec/core_spec.rb
+ - spec/page_spec.rb
+ - spec/fakeweb_helper.rb
+ - spec/spec_helper.rb