anemone 0.0.2 → 0.0.3
- data/README.txt +17 -17
- data/bin/anemone_count.rb +36 -31
- data/bin/anemone_cron.rb +107 -98
- data/bin/anemone_pagedepth.rb +43 -38
- data/bin/anemone_serialize.rb +50 -42
- data/bin/anemone_url_list.rb +54 -46
- data/bin/anemone_url_list.rb~ +58 -0
- data/lib/anemone.rb +1 -1
- data/lib/anemone/anemone.rb +36 -36
- data/lib/anemone/core.rb +181 -179
- data/lib/anemone/http.rb +36 -36
- data/lib/anemone/page.rb +184 -159
- data/lib/anemone/page_hash.rb +82 -82
- data/lib/anemone/tentacle.rb +30 -30
- metadata +10 -9
data/bin/anemone_url_list.rb
CHANGED
@@ -1,46 +1,54 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs the URL of each page
-# in the domain as they are encountered.
-#
-# == Usage
-# anemone_url_list.rb [options] url
-#
-# == Options
-# -r, --relative Output relative URLs (rather than absolute)
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require '
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the URL of each page
+# in the domain as they are encountered.
+#
+# == Usage
+# anemone_url_list.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+def usage
+  puts <<END
+Usage: anemone_url_list.rb [options] url
+
+Options:
+  -r, --relative    Output relative URLs (rather than absolute)
+END
+end
+
+options = OpenStruct.new
+options.relative = false
+
+# make sure that the last option is a URL we can crawl
+begin
+  URI(ARGV.last)
+rescue
+  usage
+  Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Anemone.crawl(ARGV.last) do |anemone|
+  anemone.on_every_page do |page|
+    if options.relative
+      puts page.url.path
+    else
+      puts page.url
+    end
+  end
+end
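
The 0.0.3 version of this script validates its argument up front: if the last argument does not parse as a URI, it prints a usage message and exits. A minimal sketch of the same pattern the script uses, with example.com standing in for a real site:

  require 'anemone'

  # Print the path of every in-domain page as it is encountered.
  # The URL is a placeholder; pass any crawlable site.
  Anemone.crawl("http://example.com/") do |anemone|
    anemone.on_every_page do |page|
      puts page.url.path   # page.url alone gives the absolute URL
    end
  end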
data/bin/anemone_url_list.rb~
ADDED
@@ -0,0 +1,58 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the URL of each page
+# in the domain as they are encountered.
+#
+# == Usage
+# anemone_url_list.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+def usage
+  puts <<END
+Usage: anemone_url_list.rb [options] url
+
+Options:
+  -r, --relative    Output relative URLs (rather than absolute)
+END
+end
+
+options = OpenStruct.new
+options.relative = false
+
+# make sure that the last option is a URL we can crawl
+begin
+  URI(ARGV.last)
+rescue
+  usage
+  Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Anemone.crawl(ARGV.last) do |anemone|
+  anemone.on_pages_like(/\/about\//, /\/experience\//) do |page|
+    puts "WOOZLE #{page.url}"
+  end
+
+  anemone.on_every_page do |page|
+    if options.relative
+      puts page.url.path
+    else
+      puts page.url
+    end
+  end
+end
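
The trailing-tilde file above is an editor backup that shipped in the gem by accident; it matches the real script except for an extra block exercising the new on_pages_like callback (the WOOZLE output looks like leftover debugging). A sketch of that callback, with an illustrative pattern and placeholder URL:

  require 'anemone'

  # Blocks registered with on_pages_like run only for pages whose URL
  # matches one of the given patterns; /\/about\// here is illustrative.
  Anemone.crawl("http://example.com/") do |anemone|
    anemone.on_pages_like(/\/about\//) do |page|
      puts "matched: #{page.url}"
    end
  end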
data/lib/anemone.rb
CHANGED
@@ -1,2 +1,2 @@
-require 'rubygems'
+require 'rubygems'
 require 'anemone/anemone'
data/lib/anemone/anemone.rb
CHANGED
@@ -1,37 +1,37 @@
-require 'ostruct'
-require 'anemone/core'
-
-module Anemone
-  # Version number
-  VERSION = '0.0.2'
-
-  # User-Agent string used for HTTP requests
-  USER_AGENT = "Anemone/#{self::VERSION}"
-
-  #module-wide options
-  def Anemone.options=(options)
-    @options = options
-  end
-
-  def Anemone.options
-    @options
-  end
-
-  #
-  # Convenience method to start a crawl using Core
-  #
-  def Anemone.crawl(url, options = {}, &block)
-    Anemone.options = OpenStruct.new(options)
-
-    #by default, run 4 Tentacle threads to fetch pages
-    Anemone.options.threads ||= 4
-
-    #disable verbose output by default
-    Anemone.options.verbose ||= false
-
-    #by default, throw away the page response body after scanning it for links
-    Anemone.options.discard_page_bodies ||=
-
-    Core.crawl(url, &block)
-  end
+require 'ostruct'
+require 'anemone/core'
+
+module Anemone
+  # Version number
+  VERSION = '0.0.2'
+
+  # User-Agent string used for HTTP requests
+  USER_AGENT = "Anemone/#{self::VERSION}"
+
+  #module-wide options
+  def Anemone.options=(options)
+    @options = options
+  end
+
+  def Anemone.options
+    @options
+  end
+
+  #
+  # Convenience method to start a crawl using Core
+  #
+  def Anemone.crawl(url, options = {}, &block)
+    Anemone.options = OpenStruct.new(options)
+
+    #by default, run 4 Tentacle threads to fetch pages
+    Anemone.options.threads ||= 4
+
+    #disable verbose output by default
+    Anemone.options.verbose ||= false
+
+    #by default, don't throw away the page response body after scanning it for links
+    Anemone.options.discard_page_bodies ||= false
+
+    Core.crawl(url, &block)
+  end
 end
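
The substantive change in this file is that the default for discard_page_bodies is now explicitly false, with its comment corrected to match. Options arrive as a hash and are wrapped in an OpenStruct, so overriding the defaults looks roughly like this (the URL and option values are illustrative, not defaults):

  require 'anemone'

  # threads, verbose, and discard_page_bodies are the three options
  # Anemone.crawl reads in this version.
  Anemone.crawl("http://example.com/",
                :threads => 2,
                :verbose => true,
                :discard_page_bodies => true) do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end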
data/lib/anemone/core.rb
CHANGED
@@ -1,179 +1,181 @@
-require 'net/http'
-require 'thread'
-require 'anemone/tentacle'
-require 'anemone/page_hash'
-
-module Anemone
-  class Core
-    # PageHash storing all Page objects encountered during the crawl
-    attr_reader :pages
-
-    #
-    # Initialize the crawl with a starting *url*, *options*, and optional *block*
-    #
-    def initialize(url, &block)
-      url = URI(url) if url.is_a?(String)
-      @url = url
-      @tentacles = []
-      @pages = PageHash.new
-      @on_every_page_blocks = []
-      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
-      @skip_link_patterns = []
-      @after_crawl_blocks = []
-
-      block.call(self) if block
-    end
-
-    #
-    # Convenience method to start a new crawl
-    #
-    def self.crawl(root, &block)
-      self.new(root) do |core|
-        block.call(core) if block
-        core.run
-        core.do_after_crawl_blocks
-        return core
-      end
-    end
-
-    #
-    # Add a block to be executed on the PageHash after the crawl
-    # is finished
-    #
-    def after_crawl(&block)
-      @after_crawl_blocks << block
-      self
-    end
-
-    #
-    # Add one ore more Regex patterns for URLs which should not be
-    # followed
-    #
-    def skip_links_like(*patterns)
-      if patterns
-        patterns.each do |pattern|
-          @skip_link_patterns << pattern
-        end
-      end
-      self
-    end
-
-    #
-    # Add a block to be executed on every Page as they are encountered
-    # during the crawl
-    #
-    def on_every_page(&block)
-      @on_every_page_blocks << block
-      self
-    end
-
-    #
-    # Add a block to be executed on Page objects with a URL matching
-    # one or more patterns
-    #
-    def on_pages_like(*patterns, &block)
-      if patterns
-        patterns.each do |pattern|
-          @on_pages_like_blocks[pattern] << block
-        end
-      end
-      self
-    end
-
-    #
-    # Perform the crawl
-    #
-    def run
-      link_queue = Queue.new
-      page_queue = Queue.new
-
-      Anemone.options.threads.times do |id|
-        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
-      end
-
-      return if !visit_link?(@url)
-
-      link_queue.enq(@url)
-
-      while true do
-        page = page_queue.deq
-
-        @pages[page.url] = page
-
-        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
-
-        do_page_blocks(page)
-
-        page.links.each do |link|
-          if visit_link?(link)
-            link_queue.enq(link)
-            @pages[link] = nil
-          end
-        end
-
-        page.aliases.each do |aka|
-          if !@pages.has_key?(aka) or @pages[aka].nil?
-            @pages[aka] = page.alias_clone(aka)
-          end
-          @pages[aka].add_alias!(page.url)
-        end
-
-        # if we are done with the crawl, tell the threads to end
-        if link_queue.empty? and page_queue.empty?
-          until link_queue.num_waiting == @tentacles.size
-            Thread.pass
-          end
-
-          if page_queue.empty?
-            @tentacles.size.times { |i| link_queue.enq(:END)}
-            break
-          end
-        end
-
-      end
-
-      @tentacles.each { |t| t.join }
-
-      self
-    end
-
-    #
-    # Execute the after_crawl blocks
-    #
-    def do_after_crawl_blocks
-      @after_crawl_blocks.each {|b| b.call(@pages)}
-    end
-
-    #
-    # Execute the on_every_page blocks for *page*
-    #
-    def do_page_blocks(page)
-      @on_every_page_blocks.each do |blk|
-        blk.call(page)
-      end
-
-      @on_pages_like_blocks.each do |pattern,
-
-
-
-
-
-
-    #
-    # +
-    #
-
-
-
-
-
-
-    #
-    #
-
-
-
-
-
-
-
+require 'net/http'
+require 'thread'
+require 'anemone/tentacle'
+require 'anemone/page_hash'
+
+module Anemone
+  class Core
+    # PageHash storing all Page objects encountered during the crawl
+    attr_reader :pages
+
+    #
+    # Initialize the crawl with a starting *url*, *options*, and optional *block*
+    #
+    def initialize(url, &block)
+      url = URI(url) if url.is_a?(String)
+      @url = url
+      @tentacles = []
+      @pages = PageHash.new
+      @on_every_page_blocks = []
+      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+      @skip_link_patterns = []
+      @after_crawl_blocks = []
+
+      block.call(self) if block
+    end
+
+    #
+    # Convenience method to start a new crawl
+    #
+    def self.crawl(root, &block)
+      self.new(root) do |core|
+        block.call(core) if block
+        core.run
+        core.do_after_crawl_blocks
+        return core
+      end
+    end
+
+    #
+    # Add a block to be executed on the PageHash after the crawl
+    # is finished
+    #
+    def after_crawl(&block)
+      @after_crawl_blocks << block
+      self
+    end
+
+    #
+    # Add one ore more Regex patterns for URLs which should not be
+    # followed
+    #
+    def skip_links_like(*patterns)
+      if patterns
+        patterns.each do |pattern|
+          @skip_link_patterns << pattern
+        end
+      end
+      self
+    end
+
+    #
+    # Add a block to be executed on every Page as they are encountered
+    # during the crawl
+    #
+    def on_every_page(&block)
+      @on_every_page_blocks << block
+      self
+    end
+
+    #
+    # Add a block to be executed on Page objects with a URL matching
+    # one or more patterns
+    #
+    def on_pages_like(*patterns, &block)
+      if patterns
+        patterns.each do |pattern|
+          @on_pages_like_blocks[pattern] << block
+        end
+      end
+      self
+    end
+
+    #
+    # Perform the crawl
+    #
+    def run
+      link_queue = Queue.new
+      page_queue = Queue.new
+
+      Anemone.options.threads.times do |id|
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+      end
+
+      return if !visit_link?(@url)
+
+      link_queue.enq(@url)
+
+      while true do
+        page = page_queue.deq
+
+        @pages[page.url] = page
+
+        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+
+        do_page_blocks(page)
+
+        page.links.each do |link|
+          if visit_link?(link)
+            link_queue.enq(link)
+            @pages[link] = nil
+          end
+        end
+
+        page.aliases.each do |aka|
+          if !@pages.has_key?(aka) or @pages[aka].nil?
+            @pages[aka] = page.alias_clone(aka)
+          end
+          @pages[aka].add_alias!(page.url)
+        end
+
+        # if we are done with the crawl, tell the threads to end
+        if link_queue.empty? and page_queue.empty?
+          until link_queue.num_waiting == @tentacles.size
+            Thread.pass
+          end
+
+          if page_queue.empty?
+            @tentacles.size.times { |i| link_queue.enq(:END)}
+            break
+          end
+        end
+
+      end
+
+      @tentacles.each { |t| t.join }
+
+      self
+    end
+
+    #
+    # Execute the after_crawl blocks
+    #
+    def do_after_crawl_blocks
+      @after_crawl_blocks.each {|b| b.call(@pages)}
+    end
+
+    #
+    # Execute the on_every_page blocks for *page*
+    #
+    def do_page_blocks(page)
+      @on_every_page_blocks.each do |blk|
+        blk.call(page)
+      end
+
+      @on_pages_like_blocks.each do |pattern, blks|
+        if page.url.to_s =~ pattern
+          blks.each { |blk| blk.call(page) }
+        end
+      end
+    end
+
+    #
+    # Returns +true+ if *link* has not been visited already,
+    # and is not excluded by a skip_link pattern. Returns
+    # +false+ otherwise.
+    #
+    def visit_link?(link)
+      !@pages.has_key?(link) and !skip_link?(link)
+    end
+
+    #
+    # Returns +true+ if *link* should not be visited because
+    # its URL matches a skip_link pattern.
+    #
+    def skip_link?(link)
+      @skip_link_patterns.each { |p| return true if link.path =~ p}
+      return false
+    end
+
+  end
+end
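
The 0.0.3 side makes the gating logic visible: a link is enqueued only when visit_link? confirms it has not been recorded in the PageHash and skip_link? finds no pattern matching its path, while do_page_blocks fires on_pages_like blocks whose pattern matches page.url.to_s. Because skip patterns are tested against link.path, they should target path components; a sketch with illustrative patterns and a placeholder URL:

  require 'anemone'

  # skip_links_like registers patterns tested against each link's path.
  # The /logout/ and /\.pdf$/ patterns are illustrative.
  Anemone.crawl("http://example.com/") do |anemone|
    anemone.skip_links_like(/logout/, /\.pdf$/)
    anemone.on_every_page { |page| puts page.url }
  end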