jeremyf-anemone 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +19 -0
- data/README.rdoc +18 -0
- data/Rakefile +48 -0
- data/VERSION.yml +4 -0
- data/anemone.gemspec +62 -0
- data/bin/anemone_count.rb +36 -0
- data/bin/anemone_cron.rb +106 -0
- data/bin/anemone_pagedepth.rb +44 -0
- data/bin/anemone_serialize.rb +51 -0
- data/bin/anemone_url_list.rb +51 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +37 -0
- data/lib/anemone/core.rb +211 -0
- data/lib/anemone/http.rb +38 -0
- data/lib/anemone/page.rb +180 -0
- data/lib/anemone/page_hash.rb +116 -0
- data/lib/anemone/tentacle.rb +31 -0
- data/spec/anemone_spec.rb +27 -0
- data/spec/core_spec.rb +114 -0
- data/spec/fakeweb_helper.rb +55 -0
- data/spec/page_spec.rb +49 -0
- data/spec/spec_helper.rb +5 -0
- metadata +85 -0
data/lib/anemone/anemone.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
require 'anemone/core'
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
# Version number
|
6
|
+
VERSION = '0.1.1'
|
7
|
+
|
8
|
+
# User-Agent string used for HTTP requests
|
9
|
+
USER_AGENT = "Anemone/#{self::VERSION}"
|
10
|
+
|
11
|
+
#module-wide options
|
12
|
+
def Anemone.options=(options)
|
13
|
+
@options = options
|
14
|
+
end
|
15
|
+
|
16
|
+
def Anemone.options
|
17
|
+
@options
|
18
|
+
end
|
19
|
+
|
20
|
+
#
|
21
|
+
# Convenience method to start a crawl using Core
|
22
|
+
#
|
23
|
+
def Anemone.crawl(urls, options = {}, &block)
|
24
|
+
Anemone.options = OpenStruct.new(options)
|
25
|
+
|
26
|
+
#by default, run 4 Tentacle threads to fetch pages
|
27
|
+
Anemone.options.threads ||= 4
|
28
|
+
|
29
|
+
#disable verbose output by default
|
30
|
+
Anemone.options.verbose ||= false
|
31
|
+
|
32
|
+
#by default, don't throw away the page response body after scanning it for links
|
33
|
+
Anemone.options.discard_page_bodies ||= false
|
34
|
+
|
35
|
+
Core.crawl(urls, &block)
|
36
|
+
end
|
37
|
+
end
|
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,211 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'thread'
|
3
|
+
require 'anemone/tentacle'
|
4
|
+
require 'anemone/page_hash'
|
5
|
+
|
6
|
+
module Anemone
|
7
|
+
class Core
|
8
|
+
# PageHash storing all Page objects encountered during the crawl
|
9
|
+
attr_reader :pages
|
10
|
+
|
11
|
+
#
|
12
|
+
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
|
13
|
+
# and optional *block*
|
14
|
+
#
|
15
|
+
def initialize(urls, &block)
|
16
|
+
@urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
|
17
|
+
@urls.each{ |url| url.path = '/' if url.path.empty? }
|
18
|
+
|
19
|
+
@tentacles = []
|
20
|
+
@pages = PageHash.new
|
21
|
+
@on_every_page_blocks = []
|
22
|
+
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
23
|
+
@skip_link_patterns = []
|
24
|
+
@after_crawl_blocks = []
|
25
|
+
|
26
|
+
block.call(self) if block
|
27
|
+
end
|
28
|
+
|
29
|
+
#
|
30
|
+
# Convenience method to start a new crawl
|
31
|
+
#
|
32
|
+
def self.crawl(root, &block)
|
33
|
+
self.new(root) do |core|
|
34
|
+
block.call(core) if block
|
35
|
+
core.run
|
36
|
+
return core
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
#
|
41
|
+
# Add a block to be executed on the PageHash after the crawl
|
42
|
+
# is finished
|
43
|
+
#
|
44
|
+
def after_crawl(&block)
|
45
|
+
@after_crawl_blocks << block
|
46
|
+
self
|
47
|
+
end
|
48
|
+
|
49
|
+
#
|
50
|
+
# Add one ore more Regex patterns for URLs which should not be
|
51
|
+
# followed
|
52
|
+
#
|
53
|
+
def skip_links_like(*patterns)
|
54
|
+
if patterns
|
55
|
+
patterns.each do |pattern|
|
56
|
+
@skip_link_patterns << pattern
|
57
|
+
end
|
58
|
+
end
|
59
|
+
self
|
60
|
+
end
|
61
|
+
|
62
|
+
#
|
63
|
+
# Add a block to be executed on every Page as they are encountered
|
64
|
+
# during the crawl
|
65
|
+
#
|
66
|
+
def on_every_page(&block)
|
67
|
+
@on_every_page_blocks << block
|
68
|
+
self
|
69
|
+
end
|
70
|
+
|
71
|
+
#
|
72
|
+
# Add a block to be executed on Page objects with a URL matching
|
73
|
+
# one or more patterns
|
74
|
+
#
|
75
|
+
def on_pages_like(*patterns, &block)
|
76
|
+
if patterns
|
77
|
+
patterns.each do |pattern|
|
78
|
+
@on_pages_like_blocks[pattern] << block
|
79
|
+
end
|
80
|
+
end
|
81
|
+
self
|
82
|
+
end
|
83
|
+
|
84
|
+
#
|
85
|
+
# Specify a block which will select which links to follow on each page.
|
86
|
+
# The block should return an Array of URI objects.
|
87
|
+
#
|
88
|
+
def focus_crawl(&block)
|
89
|
+
@focus_crawl_block = block
|
90
|
+
self
|
91
|
+
end
|
92
|
+
|
93
|
+
#
|
94
|
+
# Perform the crawl
|
95
|
+
#
|
96
|
+
def run
|
97
|
+
@urls.delete_if { |url| !visit_link?(url) }
|
98
|
+
return if @urls.empty?
|
99
|
+
|
100
|
+
link_queue = Queue.new
|
101
|
+
page_queue = Queue.new
|
102
|
+
|
103
|
+
Anemone.options.threads.times do |id|
|
104
|
+
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
|
105
|
+
end
|
106
|
+
|
107
|
+
@urls.each{ |url| link_queue.enq(url) }
|
108
|
+
|
109
|
+
loop do
|
110
|
+
page = page_queue.deq
|
111
|
+
|
112
|
+
@pages[page.url] = page
|
113
|
+
|
114
|
+
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
115
|
+
|
116
|
+
#perform the on_every_page blocks for this page
|
117
|
+
do_page_blocks(page)
|
118
|
+
|
119
|
+
page.doc = nil if Anemone.options.discard_page_bodies
|
120
|
+
|
121
|
+
links_to_follow(page).each do |link|
|
122
|
+
|
123
|
+
eval(%(def link.from_url; '#{page.url}'; end))
|
124
|
+
link_queue.enq(link)
|
125
|
+
@pages[link] = nil
|
126
|
+
end
|
127
|
+
|
128
|
+
#create an entry in the page hash for each alias of this page,
|
129
|
+
#i.e. all the pages that redirected to this page
|
130
|
+
page.aliases.each do |aka|
|
131
|
+
if !@pages.has_key?(aka) or @pages[aka].nil?
|
132
|
+
@pages[aka] = page.alias_clone(aka)
|
133
|
+
end
|
134
|
+
@pages[aka].add_alias!(page.url)
|
135
|
+
end
|
136
|
+
|
137
|
+
# if we are done with the crawl, tell the threads to end
|
138
|
+
if link_queue.empty? and page_queue.empty?
|
139
|
+
until link_queue.num_waiting == @tentacles.size
|
140
|
+
Thread.pass
|
141
|
+
end
|
142
|
+
|
143
|
+
if page_queue.empty?
|
144
|
+
@tentacles.size.times { |i| link_queue.enq(:END)}
|
145
|
+
break
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
end
|
150
|
+
|
151
|
+
@tentacles.each { |t| t.join }
|
152
|
+
|
153
|
+
do_after_crawl_blocks()
|
154
|
+
|
155
|
+
self
|
156
|
+
end
|
157
|
+
|
158
|
+
private
|
159
|
+
|
160
|
+
#
|
161
|
+
# Execute the after_crawl blocks
|
162
|
+
#
|
163
|
+
def do_after_crawl_blocks
|
164
|
+
@after_crawl_blocks.each {|b| b.call(@pages)}
|
165
|
+
end
|
166
|
+
|
167
|
+
#
|
168
|
+
# Execute the on_every_page blocks for *page*
|
169
|
+
#
|
170
|
+
def do_page_blocks(page)
|
171
|
+
@on_every_page_blocks.each do |blk|
|
172
|
+
blk.call(page)
|
173
|
+
end
|
174
|
+
|
175
|
+
@on_pages_like_blocks.each do |pattern, blks|
|
176
|
+
if page.url.to_s =~ pattern
|
177
|
+
blks.each { |blk| blk.call(page) }
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
#
|
183
|
+
# Return an Array of links to follow from the given page.
|
184
|
+
# Based on whether or not the link has already been crawled,
|
185
|
+
# and the block given to focus_crawl()
|
186
|
+
#
|
187
|
+
def links_to_follow(page)
|
188
|
+
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
|
189
|
+
links.find_all { |link| visit_link?(link) }
|
190
|
+
end
|
191
|
+
|
192
|
+
#
|
193
|
+
# Returns +true+ if *link* has not been visited already,
|
194
|
+
# and is not excluded by a skip_link pattern. Returns
|
195
|
+
# +false+ otherwise.
|
196
|
+
#
|
197
|
+
def visit_link?(link)
|
198
|
+
!@pages.has_key?(link) and !skip_link?(link)
|
199
|
+
end
|
200
|
+
|
201
|
+
#
|
202
|
+
# Returns +true+ if *link* should not be visited because
|
203
|
+
# its URL matches a skip_link pattern.
|
204
|
+
#
|
205
|
+
def skip_link?(link)
|
206
|
+
@skip_link_patterns.each { |p| return true if link.path =~ p}
|
207
|
+
return false
|
208
|
+
end
|
209
|
+
|
210
|
+
end
|
211
|
+
end
|
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
class HTTP < Net::HTTP
|
5
|
+
# Maximum number of redirects to follow on each get_response
|
6
|
+
REDIRECTION_LIMIT = 5
|
7
|
+
|
8
|
+
#
|
9
|
+
# Retrieve an HTTP response for *url*, following redirects.
|
10
|
+
# Returns the response object, response code, and final URI location.
|
11
|
+
#
|
12
|
+
def self.get(url)
|
13
|
+
response = get_response(url)
|
14
|
+
code = Integer(response.code)
|
15
|
+
loc = url
|
16
|
+
|
17
|
+
limit = REDIRECTION_LIMIT
|
18
|
+
while response.is_a?(Net::HTTPRedirection) and limit > 0
|
19
|
+
loc = URI(response['location'])
|
20
|
+
loc = url.merge(loc) if loc.relative?
|
21
|
+
response = get_response(loc)
|
22
|
+
limit -= 1
|
23
|
+
end
|
24
|
+
|
25
|
+
return response, code, loc
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
|
30
|
+
#
|
31
|
+
def self.get_response(url)
|
32
|
+
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
|
33
|
+
Net::HTTP.start(url.host, url.port) do |http|
|
34
|
+
return http.get(full_path, {'User-Agent' => Anemone::USER_AGENT })
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
require 'anemone/http'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'ostruct'
|
4
|
+
|
5
|
+
module Anemone
|
6
|
+
class Page
|
7
|
+
|
8
|
+
# The URL of the page
|
9
|
+
attr_reader :url
|
10
|
+
attr_reader :from_url
|
11
|
+
|
12
|
+
# Array of distinct A tag HREFs from the page
|
13
|
+
attr_reader :links
|
14
|
+
# Headers of the HTTP response
|
15
|
+
attr_reader :headers
|
16
|
+
|
17
|
+
# OpenStruct for user-stored data
|
18
|
+
attr_accessor :data
|
19
|
+
# Nokogiri document for the HTML body
|
20
|
+
attr_accessor :doc
|
21
|
+
# Integer response code of the page
|
22
|
+
attr_accessor :code
|
23
|
+
# Array of redirect-aliases for the page
|
24
|
+
attr_accessor :aliases
|
25
|
+
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
|
26
|
+
attr_accessor :visited
|
27
|
+
# Used by PageHash#shortest_paths! to store depth of the page
|
28
|
+
attr_accessor :depth
|
29
|
+
|
30
|
+
#
|
31
|
+
# Create a new Page from the response of an HTTP request to *url*
|
32
|
+
#
|
33
|
+
def self.fetch(url)
|
34
|
+
begin
|
35
|
+
url = URI(url) if url.is_a?(String)
|
36
|
+
|
37
|
+
response, code, location = Anemone::HTTP.get(url)
|
38
|
+
|
39
|
+
aka = nil
|
40
|
+
if !url.eql?(location)
|
41
|
+
aka = location
|
42
|
+
end
|
43
|
+
|
44
|
+
return Page.new(url, response.body, code, response.to_hash, aka, url.respond_to?(:from_url) ? url.from_url : nil)
|
45
|
+
rescue
|
46
|
+
return Page.new(url)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
#
|
51
|
+
# Create a new page
|
52
|
+
#
|
53
|
+
def initialize(url, body = nil, code = nil, headers = nil, aka = nil, from_url = nil)
|
54
|
+
@from_url = from_url
|
55
|
+
@url = url
|
56
|
+
@code = code
|
57
|
+
@headers = headers
|
58
|
+
@links = []
|
59
|
+
@aliases = []
|
60
|
+
@data = OpenStruct.new
|
61
|
+
|
62
|
+
@aliases << aka if !aka.nil?
|
63
|
+
|
64
|
+
if body
|
65
|
+
begin
|
66
|
+
@doc = Nokogiri::HTML(body)
|
67
|
+
rescue
|
68
|
+
return
|
69
|
+
end
|
70
|
+
|
71
|
+
return if @doc.nil?
|
72
|
+
|
73
|
+
#get a list of distinct links on the page, in absolute url form
|
74
|
+
@doc.css('a').each do |a|
|
75
|
+
u = a.attributes['href'].content if a.attributes['href']
|
76
|
+
next if u.nil?
|
77
|
+
|
78
|
+
begin
|
79
|
+
abs = to_absolute(URI(u))
|
80
|
+
rescue
|
81
|
+
next
|
82
|
+
end
|
83
|
+
|
84
|
+
@links << abs if in_domain?(abs)
|
85
|
+
end
|
86
|
+
|
87
|
+
@links.uniq!
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
|
92
|
+
#
|
93
|
+
# Return a new page with the same *response* and *url*, but
|
94
|
+
# with a 200 response code
|
95
|
+
#
|
96
|
+
def alias_clone(url)
|
97
|
+
p = clone
|
98
|
+
p.add_alias!(@aka) if !@aka.nil?
|
99
|
+
p.code = 200
|
100
|
+
p
|
101
|
+
end
|
102
|
+
|
103
|
+
#
|
104
|
+
# Add a redirect-alias String *aka* to the list of the page's aliases
|
105
|
+
#
|
106
|
+
# Returns *self*
|
107
|
+
#
|
108
|
+
def add_alias!(aka)
|
109
|
+
@aliases << aka if !@aliases.include?(aka)
|
110
|
+
self
|
111
|
+
end
|
112
|
+
|
113
|
+
#
|
114
|
+
# Returns an Array of all links from this page, and all the
|
115
|
+
# redirect-aliases of those pages, as String objects.
|
116
|
+
#
|
117
|
+
# *page_hash* is a PageHash object with the results of the current crawl.
|
118
|
+
#
|
119
|
+
def links_and_their_aliases(page_hash)
|
120
|
+
@links.inject([]) do |results, link|
|
121
|
+
results.concat([link].concat(page_hash[link].aliases))
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
#
|
126
|
+
# The content-type returned by the HTTP request for this page
|
127
|
+
#
|
128
|
+
def content_type
|
129
|
+
@headers['content-type'][0] rescue nil
|
130
|
+
end
|
131
|
+
|
132
|
+
#
|
133
|
+
# Returns +true+ if the page is a HTML document, returns +false+
|
134
|
+
# otherwise.
|
135
|
+
#
|
136
|
+
def html?
|
137
|
+
(@content_type =~ /text\/html/) == 0
|
138
|
+
end
|
139
|
+
|
140
|
+
#
|
141
|
+
# Returns +true+ if the page is a HTTP redirect, returns +false+
|
142
|
+
# otherwise.
|
143
|
+
#
|
144
|
+
def redirect?
|
145
|
+
(300..399).include?(@code)
|
146
|
+
end
|
147
|
+
|
148
|
+
#
|
149
|
+
# Returns +true+ if the page was not found (returned 404 code),
|
150
|
+
# returns +false+ otherwise.
|
151
|
+
#
|
152
|
+
def not_found?
|
153
|
+
404 == @code
|
154
|
+
end
|
155
|
+
|
156
|
+
#
|
157
|
+
# Converts relative URL *link* into an absolute URL based on the
|
158
|
+
# location of the page
|
159
|
+
#
|
160
|
+
def to_absolute(link)
|
161
|
+
# remove anchor
|
162
|
+
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
163
|
+
|
164
|
+
relative = URI(link)
|
165
|
+
absolute = @url.merge(relative)
|
166
|
+
|
167
|
+
absolute.path = '/' if absolute.path.empty?
|
168
|
+
|
169
|
+
return absolute
|
170
|
+
end
|
171
|
+
|
172
|
+
#
|
173
|
+
# Returns +true+ if *uri* is in the same domain as the page, returns
|
174
|
+
# +false+ otherwise
|
175
|
+
#
|
176
|
+
def in_domain?(uri)
|
177
|
+
uri.host == @url.host
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|