anemone 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.txt ADDED
@@ -0,0 +1,18 @@
+ = Anemone
+
+ == DESCRIPTION
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ == FEATURES
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+
+ == REQUIREMENTS
+ * hpricot
+
+ == EXAMPLES
+ See the +bin+ directory for several examples of useful Anemone tasks.
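As an orientation for the rest of the diff, the smallest possible Anemone task looks like the sketch below. It is distilled from lib/anemone/anemone.rb and bin/anemone_count.rb later in this diff; the start URL is a placeholder.

  require 'anemone'

  # Print the URL of every in-domain page as it is crawled (placeholder URL).
  Anemone.crawl("http://www.example.com") do |anemone|
    anemone.on_every_page do |page|
      puts page.url
    end
  end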
data/bin/anemone_count.rb ADDED
@@ -0,0 +1,31 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the total number
+ # of unique pages on the site.
+ #
+ # == Usage
+ # anemone_count.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'rdoc/usage'
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ Anemone.crawl(ARGV[0]) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
+
+
data/bin/anemone_cron.rb ADDED
@@ -0,0 +1,99 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Performs pagedepth, url list, and count functionality
+ # Meant to be run daily as a cron job
+ #
+ # == Usage
+ # anemone_cron.rb [options] url
+ #
+ # == Options
+ # -r, --relative Output relative URLs (rather than absolute)
+ # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'rdoc/usage'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV.last
+
+ Anemone.crawl(root) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     if !not_found.empty?
+       puts "\n404's:"
+       not_found.each do |url|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         num_linked_from = 0
+         pages.urls_linking_to(url).each do |u|
+           u = u.path if options.relative
+           num_linked_from += 1
+           puts " linked from #{u}"
+           if num_linked_from > 10
+             puts " ..."
+             break
+           end
+         end
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+
+   end
+ end
data/bin/anemone_pagedepth.rb ADDED
@@ -0,0 +1,39 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs a count of
+ # the number of Pages at each depth in the site.
+ #
+ # == Usage
+ # anemone_pagedepth.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'rdoc/usage'
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
data/bin/anemone_serialize.rb ADDED
@@ -0,0 +1,43 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and saves the resulting
+ # PageHash object to a file using Marshal serialization.
+ #
+ # == Usage
+ # anemone_serialize.rb [options] url
+ #
+ # == Options
+ # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'rdoc/usage'
+ require 'ostruct'
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
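To complete the round trip this script starts, the serialized PageHash can be read back in a later process with Marshal. This is a hedged sketch, not part of the gem; the filename shown is a hypothetical output from a previous run (whatever was passed to -o, or the crawl.{timestamp} default), and it assumes the earlier dump succeeded.

  require 'anemone'

  # Hypothetical filename from an earlier anemone_serialize.rb run.
  pages = open("crawl.1239700000") {|f| Marshal.load(f) }
  puts "Restored #{pages.size} pages"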
data/bin/anemone_url_list.rb ADDED
@@ -0,0 +1,46 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the URL of each page
+ # in the domain as they are encountered.
+ #
+ # == Usage
+ # anemone_url_list.rb [options] url
+ #
+ # == Options
+ # -r, --relative Output relative URLs (rather than absolute)
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'rdoc/usage'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(ARGV.last) do |anemone|
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+ end
data/lib/anemone.rb ADDED
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/anemone'
data/lib/anemone/anemone.rb ADDED
@@ -0,0 +1,16 @@
+ require 'anemone/core'
+
+ module Anemone
+   # Version number
+   VERSION = '0.0.1'
+
+   # User-Agent string used for HTTP requests
+   USER_AGENT = "Anemone/#{self::VERSION}"
+
+   #
+   # Convenience method to start a crawl using Core
+   #
+   def Anemone.crawl(url, options = {}, &block)
+     Core.crawl(url, options, &block)
+   end
+ end
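Anemone.crawl passes its options hash straight through to Core (next file), which currently reads :threads (default 4) and :verbose (default false). A hedged usage sketch with a placeholder URL:

  require 'anemone'

  # :threads and :verbose are the two options read by Core#initialize below.
  Anemone.crawl("http://www.example.com", :threads => 2, :verbose => true) do |anemone|
    anemone.after_crawl do |pages|
      puts "Crawled #{pages.size} pages"
    end
  end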
data/lib/anemone/core.rb ADDED
@@ -0,0 +1,183 @@
+ require 'net/http'
+ require 'thread'
+ require 'anemone/tentacle'
+ require 'anemone/page_hash'
+
+ module Anemone
+   class Core
+     # PageHash storing all Page objects encountered during the crawl
+     attr_reader :pages
+
+     #
+     # Initialize the crawl with a starting *url*, *options*, and optional *block*
+     #
+     def initialize(url, options={}, &block)
+       url = URI(url) if url.is_a?(String)
+       @url = url
+       @options = options
+       @tentacles = []
+       @pages = PageHash.new
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+
+       @options[:threads] ||= 4
+       @options[:verbose] ||= false
+
+       block.call(self) if block
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(root, options={}, &block)
+       self.new(root, options) do |core|
+         block.call(core) if block
+         core.run
+         core.do_after_crawl_blocks
+         return core
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageHash after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       if patterns
+         patterns.each do |pattern|
+           @skip_link_patterns << pattern
+         end
+       end
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as they are encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @options[:threads].times do |id|
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+       end
+
+       return if !visit_link?(@url)
+
+       link_queue.enq(@url)
+
+       while true do
+         page = page_queue.deq
+
+         @pages[page.url] = page
+
+         puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]
+
+         do_page_blocks(page)
+
+         page.links.each do |link|
+           if visit_link?(link)
+             link_queue.enq(link)
+             @pages[link] = nil
+           end
+         end
+
+         page.aliases.each do |aka|
+           if !@pages.has_key?(aka) or @pages[aka].nil?
+             @pages[aka] = page.alias_clone(aka)
+           end
+           @pages[aka].add_alias!(page.url)
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+
+           if page_queue.empty?
+             @tentacles.size.times { |i| link_queue.enq(:END)}
+             break
+           end
+         end
+
+       end
+
+       @tentacles.each { |t| t.join }
+
+       self
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each {|b| b.call(@pages)}
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |blk|
+         blk.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blk|
+         blk.call(page) if page.url.to_s =~ pattern
+       end
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern. Returns
+     # +false+ otherwise.
+     #
+     def visit_link?(link)
+       !@pages.has_key?(link) and !skip_link?(link)
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.each { |p| return true if link.path =~ p}
+       return false
+     end
+
+   end
+ end
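The crawler methods above (after_crawl, skip_links_like, on_every_page, on_pages_like) each return self, so calls can be chained on the crawler object. A hedged sketch with a placeholder URL and pattern; skip_links_like patterns are matched against each link's path:

  require 'anemone'

  Anemone.crawl("http://www.example.com") do |anemone|
    # Chained because both methods return the crawler itself.
    anemone.skip_links_like(%r{^/archive/}).on_every_page {|page| puts "#{page.code} #{page.url}" }
  end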
data/lib/anemone/http.rb ADDED
@@ -0,0 +1,37 @@
+ require 'net/http'
+
+ module Anemone
+   class HTTP < Net::HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECTION_LIMIT = 5
+
+     #
+     # Retrieve an HTTP response for *url*, following redirects.
+     # Returns the response object, response code, and final URI location.
+     #
+     def self.get(url)
+       response = get_response(url)
+       code = Integer(response.code)
+       loc = url
+
+       limit = REDIRECTION_LIMIT
+       while response.is_a?(Net::HTTPRedirection) and limit > 0
+         loc = URI(response['location'])
+         loc = url.merge(loc) if loc.relative?
+         response = get_response(loc)
+         limit -= 1
+       end
+
+       return response, code, loc
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def self.get_response(url)
+       Net::HTTP.start(url.host, url.port) do |http|
+         return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
+       end
+     end
+   end
+ end
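Anemone::HTTP.get is the low-level fetch used by Page below: it returns the final response, the first response's status code, and the URI it ended up at after following up to REDIRECTION_LIMIT redirects. A hedged sketch with a placeholder URL:

  require 'anemone'

  response, code, loc = Anemone::HTTP.get(URI("http://www.example.com/"))
  puts "#{code} #{loc}"   # loc differs from the request URL when redirects were followed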
data/lib/anemone/page.rb ADDED
@@ -0,0 +1,165 @@
+ require 'anemone/http'
+ require 'hpricot'
+
+ module Anemone
+   class Page
+     # The URL of the page
+     attr_reader :url
+     # Array of distinct A tag HREFs from the page
+     attr_reader :links
+     # Integer response code of the page
+     attr_reader :code
+
+     # Array of redirect-aliases for the page
+     attr_accessor :aliases
+     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+     attr_accessor :visited
+     # Used by PageHash#shortest_paths! to store depth of the page
+     attr_accessor :depth
+
+     #
+     # Create a new Page from the response of an HTTP request to *url*
+     #
+     def self.fetch(url)
+       begin
+         url = URI(url) if url.is_a?(String)
+
+         response, code, location = Anemone::HTTP.get(url)
+
+         aka = nil
+         if !url.eql?(location)
+           aka = location
+         end
+
+         return Page.new(url, response, code, aka)
+       rescue
+         return Page.new(url)
+       end
+     end
+
+     #
+     # Create a new page
+     #
+     def initialize(url, response = nil, code = nil, aka = nil)
+       @url = url
+       @response = response
+       @code = code
+       @links = []
+       @aliases = []
+
+       @aliases << aka if !aka.nil?
+
+       #get a list of distinct links on the page, in absolute url form
+       if @response and @response.body
+         Hpricot(@response.body).search('a').each do |a|
+           u = a['href']
+           next if u.nil?
+
+           begin
+             u = URI(u)
+           rescue
+             next
+           end
+
+           abs = to_absolute(u)
+           @links << abs if in_domain?(abs)
+         end
+
+         @links.uniq!
+       end
+     end
+
+
+     #
+     # Return a new page with the same *response* and *url*, but
+     # with a 200 response code
+     #
+     def alias_clone(url)
+       Page.new(url, @response, 200, @url)
+     end
+
+     #
+     # Add a redirect-alias String *aka* to the list of the page's aliases
+     #
+     # Returns *self*
+     #
+     def add_alias!(aka)
+       @aliases << aka if !@aliases.include?(aka)
+       self
+     end
+
+     #
+     # Returns an Array of all links from this page, and all the
+     # redirect-aliases of those pages, as String objects.
+     #
+     # *page_hash* is a PageHash object with the results of the current crawl.
+     #
+     def links_and_their_aliases(page_hash)
+       @links.inject([]) do |results, link|
+         results.concat([link].concat(page_hash[link].aliases))
+       end
+     end
+
+     #
+     # Returns the response body for the page
+     #
+     def body
+       @response.body
+     end
+
+     #
+     # Returns the +Content-Type+ header for the page
+     #
+     def content_type
+       @response['Content-Type']
+     end
+
+     #
+     # Returns +true+ if the page is an HTML document, returns +false+
+     # otherwise.
+     #
+     def html?
+       (content_type =~ /text\/html/) == 0
+     end
+
+     #
+     # Returns +true+ if the page is an HTTP redirect, returns +false+
+     # otherwise.
+     #
+     def redirect?
+       (300..399).include?(@code)
+     end
+
+     #
+     # Returns +true+ if the page was not found (returned 404 code),
+     # returns +false+ otherwise.
+     #
+     def not_found?
+       404 == @code
+     end
+
+     #
+     # Converts relative URL *link* into an absolute URL based on the
+     # location of the page
+     #
+     def to_absolute(link)
+       # remove anchor
+       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+
+       relative = URI(link)
+       absolute = @url.merge(relative)
+
+       absolute.path = '/' if absolute.path.empty?
+
+       return absolute
+     end
+
+     #
+     # Returns +true+ if *uri* is in the same domain as the page, returns
+     # +false+ otherwise
+     #
+     def in_domain?(uri)
+       uri.host == @url.host
+     end
+   end
+ end
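Page.fetch wraps the whole request/parse cycle: it fetches the URL via Anemone::HTTP, records a redirect alias when the final location differs from the request URL, and extracts the distinct same-domain links with Hpricot. A hedged sketch with a placeholder URL; a failed request returns a Page with no response data rather than raising.

  require 'anemone'

  page = Anemone::Page.fetch("http://www.example.com/")
  puts page.code                   # Integer response code
  page.links.each {|u| puts u }    # absolute, same-domain links found on the page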
data/lib/anemone/page_hash.rb ADDED
@@ -0,0 +1,83 @@
+ module Anemone
+   class PageHash < Hash
+
+     #
+     # Use a breadth-first search to calculate the single-source
+     # shortest paths from *root* to all pages in the PageHash
+     #
+     def shortest_paths!(root)
+       root = URI(root) if root.is_a?(String)
+       raise "Root node not found" if !has_key?(root)
+
+       each_value {|p| p.visited = false if p}
+
+       q = Queue.new
+
+       q.enq(root)
+       self[root].depth = 0
+       self[root].visited = true
+       while(!q.empty?)
+         url = q.deq
+
+         next if !has_key?(url)
+
+         page = self[url]
+
+         page.links.each do |u|
+           next if !has_key?(u) or self[u].nil?
+           link = self[u]
+           aliases = [link].concat(link.aliases.map {|a| self[a] })
+
+           aliases.each do |node|
+             if node.depth.nil? or page.depth + 1 < node.depth
+               node.depth = page.depth + 1
+             end
+           end
+
+           q.enq(self[u].url) if !self[u].visited
+           self[u].visited = true
+         end
+       end
+
+       self
+     end
+
+     #
+     # Returns a new PageHash by removing redirect-aliases for each
+     # non-redirect Page
+     #
+     def uniq
+       results = PageHash.new
+       each do |url, page|
+         #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
+         page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
+         if !page.redirect? and !page_added
+           results[url] = page.clone
+           results[url].aliases = []
+         end
+       end
+
+       results
+     end
+
+     #
+     # Return an Array of Page objects which link to the given url
+     #
+     def pages_linking_to url
+       begin
+         url = URI(url) if url.is_a?(String)
+       rescue
+         return []
+       end
+
+       values.delete_if { |p| !p.links.include?(url) }
+     end
+
+     #
+     # Return an Array of URI objects of Pages linking to the given url
+     def urls_linking_to url
+       pages_linking_to(url).map{|p| p.url}
+     end
+
+   end
+ end
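The pages object handed to after_crawl blocks is this PageHash. The bin scripts earlier in the diff derive their depth reports by running shortest_paths! from the crawl root and then uniq to drop redirect aliases; the same pattern is sketched below with a placeholder URL.

  require 'anemone'

  root = "http://www.example.com"
  Anemone.crawl(root) do |anemone|
    anemone.after_crawl do |pages|
      # Compute BFS depths from the root, then drop redirect aliases.
      pages = pages.shortest_paths!(root).uniq
      pages.each_value {|page| puts "#{page.depth}\t#{page.url}" }
    end
  end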
data/lib/anemone/tentacle.rb ADDED
@@ -0,0 +1,31 @@
+ require 'anemone/page'
+
+ module Anemone
+   class Tentacle
+
+     #
+     # Create a new Tentacle
+     #
+     def initialize(link_queue, page_queue)
+       @link_queue = link_queue
+       @page_queue = page_queue
+     end
+
+     #
+     # Gets links from @link_queue, and returns the fetched
+     # Page objects into @page_queue
+     #
+     def run
+       while true do
+         link = @link_queue.deq
+
+         break if link == :END
+
+         page = Page.fetch(link)
+
+         @page_queue.enq(page)
+       end
+     end
+
+   end
+ end
metadata ADDED
@@ -0,0 +1,82 @@
+ --- !ruby/object:Gem::Specification
+ name: anemone
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Chris Kite
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-04-14 00:00:00 -05:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.7.0
+     version:
+ description:
+ email:
+ executables:
+ - anemone_count.rb
+ - anemone_cron.rb
+ - anemone_pagedepth.rb
+ - anemone_serialize.rb
+ - anemone_url_list.rb
+ extensions: []
+
+ extra_rdoc_files:
+ - README.txt
+ files:
+ - bin/anemone_count.rb
+ - bin/anemone_cron.rb
+ - bin/anemone_pagedepth.rb
+ - bin/anemone_serialize.rb
+ - bin/anemone_url_list.rb
+ - lib/anemone
+ - lib/anemone/anemone.rb
+ - lib/anemone/core.rb
+ - lib/anemone/http.rb
+ - lib/anemone/page.rb
+ - lib/anemone/page_hash.rb
+ - lib/anemone/tentacle.rb
+ - lib/anemone.rb
+ - README.txt
+ has_rdoc: true
+ homepage: http://anemone.rubyforge.org
+ post_install_message:
+ rdoc_options:
+ - -m
+ - README.txt
+ - -t
+ - Anemone
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project: anemone
+ rubygems_version: 1.3.1
+ signing_key:
+ specification_version: 2
+ summary: Anemone web-spider framework
+ test_files: []
+