anemone 0.0.1

data/README.txt ADDED
@@ -0,0 +1,18 @@
+ = Anemone
+
+ == DESCRIPTION
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ == FEATURES
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+
+ == REQUIREMENTS
+ * hpricot
+
+ == EXAMPLES
+ See the +bin+ directory for several examples of useful Anemone tasks.
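
For orientation, the simplest possible task looks like the sketch below; it uses only the Anemone.crawl entry point and the on_every_page hook defined in lib/anemone/core.rb, and http://example.com stands in for the site to crawl.

  require 'anemone'

  # Print the URL of every page found in the crawled domain.
  # "http://example.com" is a placeholder; substitute the site to crawl.
  Anemone.crawl("http://example.com") do |anemone|
    anemone.on_every_page do |page|
      puts page.url
    end
  end
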
data/bin/anemone_count.rb ADDED
@@ -0,0 +1,31 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the total number
+ # of unique pages on the site.
+ #
+ # == Usage
+ # anemone_count.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'rdoc/usage'
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ Anemone.crawl(ARGV[0]) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
+
+
data/bin/anemone_cron.rb ADDED
@@ -0,0 +1,99 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Performs pagedepth, url list, and count functionality.
+ # Meant to be run daily as a cron job.
+ #
+ # == Usage
+ # anemone_cron.rb [options] url
+ #
+ # == Options
+ #   -r, --relative           Output relative URLs (rather than absolute)
+ #   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'rdoc/usage'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV.last
+
+ Anemone.crawl(root) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     if !not_found.empty?
+       puts "\n404's:"
+       not_found.each do |url|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         num_linked_from = 0
+         pages.urls_linking_to(url).each do |u|
+           u = u.path if options.relative
+           num_linked_from += 1
+           puts "  linked from #{u}"
+           if num_linked_from > 10
+             puts "  ..."
+             break
+           end
+         end
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+
+   end
+ end
data/bin/anemone_pagedepth.rb ADDED
@@ -0,0 +1,39 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs a count of
+ # the number of Pages at each depth in the site.
+ #
+ # == Usage
+ # anemone_pagedepth.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'rdoc/usage'
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
data/bin/anemone_serialize.rb ADDED
@@ -0,0 +1,43 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and saves the resulting
+ # PageHash object to a file using Marshal serialization.
+ #
+ # == Usage
+ # anemone_serialize.rb [options] url
+ #
+ # == Options
+ #   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'rdoc/usage'
+ require 'ostruct'
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
data/bin/anemone_url_list.rb ADDED
@@ -0,0 +1,46 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the URL of each page
+ # in the domain as they are encountered.
+ #
+ # == Usage
+ # anemone_url_list.rb [options] url
+ #
+ # == Options
+ #   -r, --relative    Output relative URLs (rather than absolute)
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'rdoc/usage'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   RDoc::usage()
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(ARGV.last) do |anemone|
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+ end
data/lib/anemone.rb ADDED
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/anemone'
data/lib/anemone/anemone.rb ADDED
@@ -0,0 +1,16 @@
+ require 'anemone/core'
+
+ module Anemone
+   # Version number
+   VERSION = '0.0.1'
+
+   # User-Agent string used for HTTP requests
+   USER_AGENT = "Anemone/#{self::VERSION}"
+
+   #
+   # Convenience method to start a crawl using Core
+   #
+   def Anemone.crawl(url, options = {}, &block)
+     Core.crawl(url, options, &block)
+   end
+ end
data/lib/anemone/core.rb ADDED
@@ -0,0 +1,183 @@
+ require 'net/http'
+ require 'thread'
+ require 'anemone/tentacle'
+ require 'anemone/page_hash'
+
+ module Anemone
+   class Core
+     # PageHash storing all Page objects encountered during the crawl
+     attr_reader :pages
+
+     #
+     # Initialize the crawl with a starting *url*, *options*, and optional *block*
+     #
+     def initialize(url, options={}, &block)
+       url = URI(url) if url.is_a?(String)
+       @url = url
+       @options = options
+       @tentacles = []
+       @pages = PageHash.new
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+
+       @options[:threads] ||= 4
+       @options[:verbose] ||= false
+
+       block.call(self) if block
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(root, options={}, &block)
+       self.new(root, options) do |core|
+         block.call(core) if block
+         core.run
+         core.do_after_crawl_blocks
+         return core
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageHash after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       if patterns
+         patterns.each do |pattern|
+           @skip_link_patterns << pattern
+         end
+       end
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as it is encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @options[:threads].times do |id|
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+       end
+
+       return if !visit_link?(@url)
+
+       link_queue.enq(@url)
+
+       while true do
+         page = page_queue.deq
+
+         @pages[page.url] = page
+
+         puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]
+
+         do_page_blocks(page)
+
+         page.links.each do |link|
+           if visit_link?(link)
+             link_queue.enq(link)
+             @pages[link] = nil
+           end
+         end
+
+         page.aliases.each do |aka|
+           if !@pages.has_key?(aka) or @pages[aka].nil?
+             @pages[aka] = page.alias_clone(aka)
+           end
+           @pages[aka].add_alias!(page.url)
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+
+           if page_queue.empty?
+             @tentacles.size.times { |i| link_queue.enq(:END) }
+             break
+           end
+         end
+
+       end
+
+       @tentacles.each { |t| t.join }
+
+       self
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each { |b| b.call(@pages) }
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |blk|
+         blk.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blocks|
+         blocks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
+       end
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern. Returns
+     # +false+ otherwise.
+     #
+     def visit_link?(link)
+       !@pages.has_key?(link) and !skip_link?(link)
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.each { |p| return true if link.path =~ p }
+       return false
+     end
+
+   end
+ end
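
Taken together, the options and hooks above can be combined as in the following sketch; the starting URL, regex patterns, and thread count are placeholder values, not defaults.

  require 'anemone'

  # :threads and :verbose are the two options read by Core#initialize.
  Anemone.crawl("http://example.com", :threads => 8, :verbose => true) do |anemone|
    # never follow URLs whose path matches these placeholder patterns
    anemone.skip_links_like %r{/login}, %r{\.pdf$}

    # run only on pages whose URL matches a pattern
    anemone.on_pages_like(%r{/articles/}) do |page|
      puts "article: #{page.url}"
    end

    # run on every page as it comes off the page queue
    anemone.on_every_page do |page|
      puts "#{page.code} #{page.url}"
    end

    # run once on the full PageHash after the crawl finishes
    anemone.after_crawl do |pages|
      puts "crawled #{pages.size} pages"
    end
  end
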
data/lib/anemone/http.rb ADDED
@@ -0,0 +1,37 @@
+ require 'net/http'
+
+ module Anemone
+   class HTTP < Net::HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECTION_LIMIT = 5
+
+     #
+     # Retrieve an HTTP response for *url*, following redirects.
+     # Returns the response object, response code, and final URI location.
+     #
+     def self.get(url)
+       response = get_response(url)
+       code = Integer(response.code)
+       loc = url
+
+       limit = REDIRECTION_LIMIT
+       while response.is_a?(Net::HTTPRedirection) and limit > 0
+         loc = URI(response['location'])
+         loc = url.merge(loc) if loc.relative?
+         response = get_response(loc)
+         limit -= 1
+       end
+
+       return response, code, loc
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def self.get_response(url)
+       Net::HTTP.start(url.host, url.port) do |http|
+         return http.get(url.path, { 'User-Agent' => Anemone::USER_AGENT })
+       end
+     end
+   end
+ end
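
Used directly, HTTP.get returns three values: the final response after following up to REDIRECTION_LIMIT redirects, the code of the first response, and the URI that was ultimately fetched. A minimal sketch with a placeholder URL:

  require 'anemone'

  # HTTP.get expects a URI with host, port, and path set.
  response, code, location = Anemone::HTTP.get(URI("http://example.com/"))

  puts code                        # code of the first response (e.g. 301 if it redirected)
  puts location                    # URI actually fetched after following redirects
  puts response['Content-Type']    # headers of the final response
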
data/lib/anemone/page.rb ADDED
@@ -0,0 +1,165 @@
+ require 'anemone/http'
+ require 'hpricot'
+
+ module Anemone
+   class Page
+     # The URL of the page
+     attr_reader :url
+     # Array of distinct A tag HREFs from the page
+     attr_reader :links
+     # Integer response code of the page
+     attr_reader :code
+
+     # Array of redirect-aliases for the page
+     attr_accessor :aliases
+     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+     attr_accessor :visited
+     # Used by PageHash#shortest_paths! to store depth of the page
+     attr_accessor :depth
+
+     #
+     # Create a new Page from the response of an HTTP request to *url*
+     #
+     def self.fetch(url)
+       begin
+         url = URI(url) if url.is_a?(String)
+
+         response, code, location = Anemone::HTTP.get(url)
+
+         aka = nil
+         if !url.eql?(location)
+           aka = location
+         end
+
+         return Page.new(url, response, code, aka)
+       rescue
+         return Page.new(url)
+       end
+     end
+
+     #
+     # Create a new page
+     #
+     def initialize(url, response = nil, code = nil, aka = nil)
+       @url = url
+       @response = response
+       @code = code
+       @links = []
+       @aliases = []
+
+       @aliases << aka if !aka.nil?
+
+       # get a list of distinct links on the page, in absolute url form
+       if @response and @response.body
+         Hpricot(@response.body).search('a').each do |a|
+           u = a['href']
+           next if u.nil?
+
+           begin
+             u = URI(u)
+           rescue
+             next
+           end
+
+           abs = to_absolute(u)
+           @links << abs if in_domain?(abs)
+         end
+
+         @links.uniq!
+       end
+     end
+
+
+     #
+     # Return a new page with the same *response* and *url*, but
+     # with a 200 response code
+     #
+     def alias_clone(url)
+       Page.new(url, @response, 200, @url)
+     end
+
+     #
+     # Add a redirect-alias String *aka* to the list of the page's aliases
+     #
+     # Returns *self*
+     #
+     def add_alias!(aka)
+       @aliases << aka if !@aliases.include?(aka)
+       self
+     end
+
+     #
+     # Returns an Array of all links from this page, and all the
+     # redirect-aliases of those pages, as String objects.
+     #
+     # *page_hash* is a PageHash object with the results of the current crawl.
+     #
+     def links_and_their_aliases(page_hash)
+       @links.inject([]) do |results, link|
+         results.concat([link].concat(page_hash[link].aliases))
+       end
+     end
+
+     #
+     # Returns the response body for the page
+     #
+     def body
+       @response.body
+     end
+
+     #
+     # Returns the +Content-Type+ header for the page
+     #
+     def content_type
+       @response['Content-Type']
+     end
+
+     #
+     # Returns +true+ if the page is an HTML document, returns +false+
+     # otherwise.
+     #
+     def html?
+       (content_type =~ /text\/html/) == 0
+     end
+
+     #
+     # Returns +true+ if the page is an HTTP redirect, returns +false+
+     # otherwise.
+     #
+     def redirect?
+       (300..399).include?(@code)
+     end
+
+     #
+     # Returns +true+ if the page was not found (returned 404 code),
+     # returns +false+ otherwise.
+     #
+     def not_found?
+       404 == @code
+     end
+
+     #
+     # Converts relative URL *link* into an absolute URL based on the
+     # location of the page
+     #
+     def to_absolute(link)
+       # remove anchor
+       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, ''))
+
+       relative = URI(link)
+       absolute = @url.merge(relative)
+
+       absolute.path = '/' if absolute.path.empty?
+
+       return absolute
+     end
+
+     #
+     # Returns +true+ if *uri* is in the same domain as the page, returns
+     # +false+ otherwise
+     #
+     def in_domain?(uri)
+       uri.host == @url.host
+     end
+   end
+ end
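
Page.fetch can also be used on its own to examine a single URL, as in this sketch; the address is a placeholder, and a failed request yields a Page whose code is nil and whose links array is empty.

  require 'anemone'

  page = Anemone::Page.fetch("http://example.com/")
  if page.code                              # nil when the HTTP request raised
    puts "#{page.code} #{page.url}"
    puts "redirected" if page.redirect?     # 3xx response code
    page.links.each { |link| puts link }    # absolute, same-domain URIs
  end
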
data/lib/anemone/page_hash.rb ADDED
@@ -0,0 +1,83 @@
+ module Anemone
+   class PageHash < Hash
+
+     #
+     # Use a breadth-first search to calculate the single-source
+     # shortest paths from *root* to all pages in the PageHash
+     #
+     def shortest_paths!(root)
+       root = URI(root) if root.is_a?(String)
+       raise "Root node not found" if !has_key?(root)
+
+       each_value { |p| p.visited = false if p }
+
+       q = Queue.new
+
+       q.enq(root)
+       self[root].depth = 0
+       self[root].visited = true
+       while(!q.empty?)
+         url = q.deq
+
+         next if !has_key?(url)
+
+         page = self[url]
+
+         page.links.each do |u|
+           next if !has_key?(u) or self[u].nil?
+           link = self[u]
+           aliases = [link].concat(link.aliases.map { |a| self[a] })
+
+           aliases.each do |node|
+             if node.depth.nil? or page.depth + 1 < node.depth
+               node.depth = page.depth + 1
+             end
+           end
+
+           q.enq(self[u].url) if !self[u].visited
+           self[u].visited = true
+         end
+       end
+
+       self
+     end
+
+     #
+     # Returns a new PageHash by removing redirect-aliases for each
+     # non-redirect Page
+     #
+     def uniq
+       results = PageHash.new
+       each do |url, page|
+         # if none of the aliases of this page have been added, and this isn't a redirect page, add this page
+         page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a }
+         if !page.redirect? and !page_added
+           results[url] = page.clone
+           results[url].aliases = []
+         end
+       end
+
+       results
+     end
+
+     #
+     # Return an Array of Page objects which link to the given url
+     #
+     def pages_linking_to url
+       begin
+         url = URI(url) if url.is_a?(String)
+       rescue
+         return []
+       end
+
+       values.delete_if { |p| !p.links.include?(url) }
+     end
+
+     #
+     # Return an Array of URI objects of Pages linking to the given url
+     def urls_linking_to url
+       pages_linking_to(url).map { |p| p.url }
+     end
+
+   end
+ end
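
These are the methods the bin/ scripts call after a crawl; the sketch below assumes a PageHash saved by anemone_serialize.rb, with a placeholder filename and root URL.

  require 'anemone'

  # Load a PageHash that was Marshal-dumped by anemone_serialize.rb.
  # Both the filename and the root URL are placeholders.
  pages = open("crawl.1239700000") { |f| Marshal.load(f) }

  # Compute depths from the crawl root, then drop redirect aliases.
  pages = pages.shortest_paths!("http://example.com/").uniq

  pages.each_value { |page| puts "depth #{page.depth}: #{page.url}" }

  # URIs of pages linking to a given URL
  pages.urls_linking_to("http://example.com/about").each { |u| puts u }
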
data/lib/anemone/tentacle.rb ADDED
@@ -0,0 +1,31 @@
+ require 'anemone/page'
+
+ module Anemone
+   class Tentacle
+
+     #
+     # Create a new Tentacle
+     #
+     def initialize(link_queue, page_queue)
+       @link_queue = link_queue
+       @page_queue = page_queue
+     end
+
+     #
+     # Gets links from @link_queue, and puts the fetched
+     # Page objects into @page_queue
+     #
+     def run
+       while true do
+         link = @link_queue.deq
+
+         break if link == :END
+
+         page = Page.fetch(link)
+
+         @page_queue.enq(page)
+       end
+     end
+
+   end
+ end
metadata ADDED
@@ -0,0 +1,82 @@
+ --- !ruby/object:Gem::Specification
+ name: anemone
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Chris Kite
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-04-14 00:00:00 -05:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.7.0
+     version:
+ description:
+ email:
+ executables:
+ - anemone_count.rb
+ - anemone_cron.rb
+ - anemone_pagedepth.rb
+ - anemone_serialize.rb
+ - anemone_url_list.rb
+ extensions: []
+
+ extra_rdoc_files:
+ - README.txt
+ files:
+ - bin/anemone_count.rb
+ - bin/anemone_cron.rb
+ - bin/anemone_pagedepth.rb
+ - bin/anemone_serialize.rb
+ - bin/anemone_url_list.rb
+ - lib/anemone
+ - lib/anemone/anemone.rb
+ - lib/anemone/core.rb
+ - lib/anemone/http.rb
+ - lib/anemone/page.rb
+ - lib/anemone/page_hash.rb
+ - lib/anemone/tentacle.rb
+ - lib/anemone.rb
+ - README.txt
+ has_rdoc: true
+ homepage: http://anemone.rubyforge.org
+ post_install_message:
+ rdoc_options:
+ - -m
+ - README.txt
+ - -t
+ - Anemone
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project: anemone
+ rubygems_version: 1.3.1
+ signing_key:
+ specification_version: 2
+ summary: Anemone web-spider framework
+ test_files: []
+