anemone 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,46 +1,54 @@
1
- #! /usr/bin/env ruby
2
- # == Synopsis
3
- # Crawls a site starting at the given URL, and outputs the URL of each page
4
- # in the domain as they are encountered.
5
- #
6
- # == Usage
7
- # anemone_url_list.rb [options] url
8
- #
9
- # == Options
10
- # -r, --relative Output relative URLs (rather than absolute)
11
- #
12
- # == Author
13
- # Chris Kite
14
-
15
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
16
-
17
- require 'anemone'
18
- require 'optparse'
19
- require 'rdoc/usage'
20
- require 'ostruct'
21
-
22
- options = OpenStruct.new
23
- options.relative = false
24
-
25
- # make sure that the last option is a URL we can crawl
26
- begin
27
- URI(ARGV.last)
28
- rescue
29
- RDoc::usage()
30
- Process.exit
31
- end
32
-
33
- # parse command-line options
34
- opts = OptionParser.new
35
- opts.on('-r', '--relative') { options.relative = true }
36
- opts.parse!(ARGV)
37
-
38
- Anemone.crawl(ARGV.last) do |anemone|
39
- anemone.on_every_page do |page|
40
- if options.relative
41
- puts page.url.path
42
- else
43
- puts page.url
44
- end
45
- end
46
- end
1
+ #! /usr/bin/env ruby
2
+ # == Synopsis
3
+ # Crawls a site starting at the given URL, and outputs the URL of each page
4
+ # in the domain as they are encountered.
5
+ #
6
+ # == Usage
7
+ # anemone_url_list.rb [options] url
8
+ #
9
+ # == Options
10
+ # -r, --relative Output relative URLs (rather than absolute)
11
+ #
12
+ # == Author
13
+ # Chris Kite
14
+
15
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
16
+
17
+ require 'anemone'
18
+ require 'optparse'
19
+ require 'ostruct'
20
+
21
+ def usage
22
+ puts <<END
23
+ Usage: anemone_url_list.rb [options] url
24
+
25
+ Options:
26
+ -r, --relative Output relative URLs (rather than absolute)
27
+ END
28
+ end
29
+
30
+ options = OpenStruct.new
31
+ options.relative = false
32
+
33
+ # make sure that the last option is a URL we can crawl
34
+ begin
35
+ URI(ARGV.last)
36
+ rescue
37
+ usage
38
+ Process.exit
39
+ end
40
+
41
+ # parse command-line options
42
+ opts = OptionParser.new
43
+ opts.on('-r', '--relative') { options.relative = true }
44
+ opts.parse!(ARGV)
45
+
46
+ Anemone.crawl(ARGV.last) do |anemone|
47
+ anemone.on_every_page do |page|
48
+ if options.relative
49
+ puts page.url.path
50
+ else
51
+ puts page.url
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,58 @@
1
+ #! /usr/bin/env ruby
2
+ # == Synopsis
3
+ # Crawls a site starting at the given URL, and outputs the URL of each page
4
+ # in the domain as they are encountered.
5
+ #
6
+ # == Usage
7
+ # anemone_url_list.rb [options] url
8
+ #
9
+ # == Options
10
+ # -r, --relative Output relative URLs (rather than absolute)
11
+ #
12
+ # == Author
13
+ # Chris Kite
14
+
15
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
16
+
17
+ require 'anemone'
18
+ require 'optparse'
19
+ require 'ostruct'
20
+
21
+ def usage
22
+ puts <<END
23
+ Usage: anemone_url_list.rb [options] url
24
+
25
+ Options:
26
+ -r, --relative Output relative URLs (rather than absolute)
27
+ END
28
+ end
29
+
30
+ options = OpenStruct.new
31
+ options.relative = false
32
+
33
+ # make sure that the last option is a URL we can crawl
34
+ begin
35
+ URI(ARGV.last)
36
+ rescue
37
+ usage
38
+ Process.exit
39
+ end
40
+
41
+ # parse command-line options
42
+ opts = OptionParser.new
43
+ opts.on('-r', '--relative') { options.relative = true }
44
+ opts.parse!(ARGV)
45
+
46
+ Anemone.crawl(ARGV.last) do |anemone|
47
+ anemone.on_pages_like(/\/about\//, /\/experience\//) do |page|
48
+ puts "WOOZLE #{page.url}"
49
+ end
50
+
51
+ anemone.on_every_page do |page|
52
+ if options.relative
53
+ puts page.url.path
54
+ else
55
+ puts page.url
56
+ end
57
+ end
58
+ end
data/lib/anemone.rb CHANGED
@@ -1,2 +1,2 @@
1
- require 'rubygems'
1
+ require 'rubygems'
2
2
  require 'anemone/anemone'
@@ -1,37 +1,37 @@
1
- require 'ostruct'
2
- require 'anemone/core'
3
-
4
- module Anemone
5
- # Version number
6
- VERSION = '0.0.2'
7
-
8
- # User-Agent string used for HTTP requests
9
- USER_AGENT = "Anemone/#{self::VERSION}"
10
-
11
- #module-wide options
12
- def Anemone.options=(options)
13
- @options = options
14
- end
15
-
16
- def Anemone.options
17
- @options
18
- end
19
-
20
- #
21
- # Convenience method to start a crawl using Core
22
- #
23
- def Anemone.crawl(url, options = {}, &block)
24
- Anemone.options = OpenStruct.new(options)
25
-
26
- #by default, run 4 Tentacle threads to fetch pages
27
- Anemone.options.threads ||= 4
28
-
29
- #disable verbose output by default
30
- Anemone.options.verbose ||= false
31
-
32
- #by default, throw away the page response body after scanning it for links, to save memory
33
- Anemone.options.discard_page_bodies ||= true
34
-
35
- Core.crawl(url, &block)
36
- end
1
+ require 'ostruct'
2
+ require 'anemone/core'
3
+
4
+ module Anemone
5
+ # Version number
6
+ VERSION = '0.0.2'
7
+
8
+ # User-Agent string used for HTTP requests
9
+ USER_AGENT = "Anemone/#{self::VERSION}"
10
+
11
+ #module-wide options
12
+ def Anemone.options=(options)
13
+ @options = options
14
+ end
15
+
16
+ def Anemone.options
17
+ @options
18
+ end
19
+
20
+ #
21
+ # Convenience method to start a crawl using Core
22
+ #
23
+ def Anemone.crawl(url, options = {}, &block)
24
+ Anemone.options = OpenStruct.new(options)
25
+
26
+ #by default, run 4 Tentacle threads to fetch pages
27
+ Anemone.options.threads ||= 4
28
+
29
+ #disable verbose output by default
30
+ Anemone.options.verbose ||= false
31
+
32
+ #by default, don't throw away the page response body after scanning it for links
33
+ Anemone.options.discard_page_bodies ||= false
34
+
35
+ Core.crawl(url, &block)
36
+ end
37
37
  end
data/lib/anemone/core.rb CHANGED
@@ -1,179 +1,181 @@
1
- require 'net/http'
2
- require 'thread'
3
- require 'anemone/tentacle'
4
- require 'anemone/page_hash'
5
-
6
- module Anemone
7
- class Core
8
- # PageHash storing all Page objects encountered during the crawl
9
- attr_reader :pages
10
-
11
- #
12
- # Initialize the crawl with a starting *url*, *options*, and optional *block*
13
- #
14
- def initialize(url, &block)
15
- url = URI(url) if url.is_a?(String)
16
- @url = url
17
- @tentacles = []
18
- @pages = PageHash.new
19
- @on_every_page_blocks = []
20
- @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
21
- @skip_link_patterns = []
22
- @after_crawl_blocks = []
23
-
24
- block.call(self) if block
25
- end
26
-
27
- #
28
- # Convenience method to start a new crawl
29
- #
30
- def self.crawl(root, &block)
31
- self.new(root) do |core|
32
- block.call(core) if block
33
- core.run
34
- core.do_after_crawl_blocks
35
- return core
36
- end
37
- end
38
-
39
- #
40
- # Add a block to be executed on the PageHash after the crawl
41
- # is finished
42
- #
43
- def after_crawl(&block)
44
- @after_crawl_blocks << block
45
- self
46
- end
47
-
48
- #
49
- # Add one ore more Regex patterns for URLs which should not be
50
- # followed
51
- #
52
- def skip_links_like(*patterns)
53
- if patterns
54
- patterns.each do |pattern|
55
- @skip_link_patterns << pattern
56
- end
57
- end
58
- self
59
- end
60
-
61
- #
62
- # Add a block to be executed on every Page as they are encountered
63
- # during the crawl
64
- #
65
- def on_every_page(&block)
66
- @on_every_page_blocks << block
67
- self
68
- end
69
-
70
- #
71
- # Add a block to be executed on Page objects with a URL matching
72
- # one or more patterns
73
- #
74
- def on_pages_like(*patterns, &block)
75
- if patterns
76
- patterns.each do |pattern|
77
- @on_pages_like_blocks[pattern] << block
78
- end
79
- end
80
- self
81
- end
82
-
83
- #
84
- # Perform the crawl
85
- #
86
- def run
87
- link_queue = Queue.new
88
- page_queue = Queue.new
89
-
90
- Anemone.options.threads.times do |id|
91
- @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
92
- end
93
-
94
- return if !visit_link?(@url)
95
-
96
- link_queue.enq(@url)
97
-
98
- while true do
99
- page = page_queue.deq
100
-
101
- @pages[page.url] = page
102
-
103
- puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
104
-
105
- do_page_blocks(page)
106
-
107
- page.links.each do |link|
108
- if visit_link?(link)
109
- link_queue.enq(link)
110
- @pages[link] = nil
111
- end
112
- end
113
-
114
- page.aliases.each do |aka|
115
- if !@pages.has_key?(aka) or @pages[aka].nil?
116
- @pages[aka] = page.alias_clone(aka)
117
- end
118
- @pages[aka].add_alias!(page.url)
119
- end
120
-
121
- # if we are done with the crawl, tell the threads to end
122
- if link_queue.empty? and page_queue.empty?
123
- until link_queue.num_waiting == @tentacles.size
124
- Thread.pass
125
- end
126
-
127
- if page_queue.empty?
128
- @tentacles.size.times { |i| link_queue.enq(:END)}
129
- break
130
- end
131
- end
132
-
133
- end
134
-
135
- @tentacles.each { |t| t.join }
136
-
137
- self
138
- end
139
-
140
- #
141
- # Execute the after_crawl blocks
142
- #
143
- def do_after_crawl_blocks
144
- @after_crawl_blocks.each {|b| b.call(@pages)}
145
- end
146
-
147
- #
148
- # Execute the on_every_page blocks for *page*
149
- #
150
- def do_page_blocks(page)
151
- @on_every_page_blocks.each do |blk|
152
- blk.call(page)
153
- end
154
-
155
- @on_pages_like_blocks.each do |pattern, blk|
156
- blk.call(page) if page.url.to_s =~ pattern
157
- end
158
- end
159
-
160
- #
161
- # Returns +true+ if *link* has not been visited already,
162
- # and is not excluded by a skip_link pattern. Returns
163
- # +false+ otherwise.
164
- #
165
- def visit_link?(link)
166
- !@pages.has_key?(link) and !skip_link?(link)
167
- end
168
-
169
- #
170
- # Returns +true+ if *link* should not be visited because
171
- # its URL matches a skip_link pattern.
172
- #
173
- def skip_link?(link)
174
- @skip_link_patterns.each { |p| return true if link.path =~ p}
175
- return false
176
- end
177
-
178
- end
179
- end
1
+ require 'net/http'
2
+ require 'thread'
3
+ require 'anemone/tentacle'
4
+ require 'anemone/page_hash'
5
+
6
+ module Anemone
7
+ class Core
8
+ # PageHash storing all Page objects encountered during the crawl
9
+ attr_reader :pages
10
+
11
+ #
12
+ # Initialize the crawl with a starting *url* and an optional *block*
13
+ #
14
+ def initialize(url, &block)
15
+ url = URI(url) if url.is_a?(String)
16
+ @url = url
17
+ @tentacles = []
18
+ @pages = PageHash.new
19
+ @on_every_page_blocks = []
20
+ @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
21
+ @skip_link_patterns = []
22
+ @after_crawl_blocks = []
23
+
24
+ block.call(self) if block
25
+ end
26
+
27
+ #
28
+ # Convenience method to start a new crawl
29
+ #
30
+ def self.crawl(root, &block)
31
+ self.new(root) do |core|
32
+ block.call(core) if block
33
+ core.run
34
+ core.do_after_crawl_blocks
35
+ return core
36
+ end
37
+ end
38
+
39
+ #
40
+ # Add a block to be executed on the PageHash after the crawl
41
+ # is finished
42
+ #
43
+ def after_crawl(&block)
44
+ @after_crawl_blocks << block
45
+ self
46
+ end
47
+
48
+ #
49
+ # Add one or more Regex patterns for URLs which should not be
50
+ # followed
51
+ #
52
+ def skip_links_like(*patterns)
53
+ if patterns
54
+ patterns.each do |pattern|
55
+ @skip_link_patterns << pattern
56
+ end
57
+ end
58
+ self
59
+ end
60
+
61
+ #
62
+ # Add a block to be executed on every Page as they are encountered
63
+ # during the crawl
64
+ #
65
+ def on_every_page(&block)
66
+ @on_every_page_blocks << block
67
+ self
68
+ end
69
+
70
+ #
71
+ # Add a block to be executed on Page objects with a URL matching
72
+ # one or more patterns
73
+ #
74
+ def on_pages_like(*patterns, &block)
75
+ if patterns
76
+ patterns.each do |pattern|
77
+ @on_pages_like_blocks[pattern] << block
78
+ end
79
+ end
80
+ self
81
+ end
82
+
83
+ #
84
+ # Perform the crawl
85
+ #
86
+ def run
87
+ link_queue = Queue.new
88
+ page_queue = Queue.new
89
+
90
+ Anemone.options.threads.times do |id|
91
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
92
+ end
93
+
94
+ return if !visit_link?(@url)
95
+
96
+ link_queue.enq(@url)
97
+
98
+ while true do
99
+ page = page_queue.deq
100
+
101
+ @pages[page.url] = page
102
+
103
+ puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
104
+
105
+ do_page_blocks(page)
106
+
107
+ page.links.each do |link|
108
+ if visit_link?(link)
109
+ link_queue.enq(link)
110
+ @pages[link] = nil
111
+ end
112
+ end
113
+
114
+ page.aliases.each do |aka|
115
+ if !@pages.has_key?(aka) or @pages[aka].nil?
116
+ @pages[aka] = page.alias_clone(aka)
117
+ end
118
+ @pages[aka].add_alias!(page.url)
119
+ end
120
+
121
+ # if we are done with the crawl, tell the threads to end
122
+ if link_queue.empty? and page_queue.empty?
123
+ until link_queue.num_waiting == @tentacles.size
124
+ Thread.pass
125
+ end
126
+
127
+ if page_queue.empty?
128
+ @tentacles.size.times { |i| link_queue.enq(:END)}
129
+ break
130
+ end
131
+ end
132
+
133
+ end
134
+
135
+ @tentacles.each { |t| t.join }
136
+
137
+ self
138
+ end
139
+
140
+ #
141
+ # Execute the after_crawl blocks
142
+ #
143
+ def do_after_crawl_blocks
144
+ @after_crawl_blocks.each {|b| b.call(@pages)}
145
+ end
146
+
147
+ #
148
+ # Execute the on_every_page blocks, and any matching on_pages_like blocks, for *page*
149
+ #
150
+ def do_page_blocks(page)
151
+ @on_every_page_blocks.each do |blk|
152
+ blk.call(page)
153
+ end
154
+
155
+ @on_pages_like_blocks.each do |pattern, blks|
156
+ if page.url.to_s =~ pattern
157
+ blks.each { |blk| blk.call(page) }
158
+ end
159
+ end
160
+ end
161
+
162
+ #
163
+ # Returns +true+ if *link* has not been visited already,
164
+ # and is not excluded by a skip_link pattern. Returns
165
+ # +false+ otherwise.
166
+ #
167
+ def visit_link?(link)
168
+ !@pages.has_key?(link) and !skip_link?(link)
169
+ end
170
+
171
+ #
172
+ # Returns +true+ if *link* should not be visited because
173
+ # its URL matches a skip_link pattern.
174
+ #
175
+ def skip_link?(link)
176
+ @skip_link_patterns.each { |p| return true if link.path =~ p}
177
+ return false
178
+ end
179
+
180
+ end
181
+ end