anemone 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +11 -1
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +58 -66
- data/lib/anemone/http.rb +39 -28
- data/lib/anemone/page.rb +53 -59
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +9 -3
data/lib/anemone/page.rb
CHANGED
@@ -8,39 +8,44 @@ module Anemone
|
|
8
8
|
attr_reader :url
|
9
9
|
# Headers of the HTTP response
|
10
10
|
attr_reader :headers
|
11
|
-
|
11
|
+
# URL of the page this one redirected to, if any
|
12
|
+
attr_reader :redirect_to
|
13
|
+
# Exception object, if one was raised during HTTP#fetch_page
|
14
|
+
attr_reader :error
|
15
|
+
|
12
16
|
# OpenStruct for user-stored data
|
13
17
|
attr_accessor :data
|
14
|
-
# Nokogiri document for the HTML body
|
15
|
-
attr_accessor :doc
|
16
18
|
# Integer response code of the page
|
17
|
-
attr_accessor :code
|
18
|
-
#
|
19
|
-
attr_accessor :aliases
|
20
|
-
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
|
19
|
+
attr_accessor :code
|
20
|
+
# Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
|
21
21
|
attr_accessor :visited
|
22
22
|
# Depth of this page from the root of the crawl. This is not necessarily the
|
23
|
-
# shortest path; use
|
23
|
+
# shortest path; use PageStore#shortest_paths! to find that value.
|
24
24
|
attr_accessor :depth
|
25
25
|
# URL of the page that brought us to this page
|
26
26
|
attr_accessor :referer
|
27
27
|
# Response time of the request for this page in milliseconds
|
28
28
|
attr_accessor :response_time
|
29
|
-
|
29
|
+
|
30
30
|
#
|
31
31
|
# Create a new page
|
32
32
|
#
|
33
|
-
def initialize(url,
|
33
|
+
def initialize(url, params = {})
|
34
34
|
@url = url
|
35
|
-
@code = code
|
36
|
-
@headers = headers || {}
|
37
|
-
@headers['content-type'] ||= ['']
|
38
|
-
@aliases = Array(aka)
|
39
35
|
@data = OpenStruct.new
|
40
|
-
|
41
|
-
@
|
42
|
-
@
|
43
|
-
@
|
36
|
+
|
37
|
+
@code = params[:code]
|
38
|
+
@headers = params[:headers] || {}
|
39
|
+
@headers['content-type'] ||= ['']
|
40
|
+
@aliases = Array(params[:aka]).compact
|
41
|
+
@referer = params[:referer]
|
42
|
+
@depth = params[:depth] || 0
|
43
|
+
@redirect_to = to_absolute(params[:redirect_to])
|
44
|
+
@response_time = params[:response_time]
|
45
|
+
@body = params[:body]
|
46
|
+
@error = params[:error]
|
47
|
+
|
48
|
+
@fetched = !params[:code].nil?
|
44
49
|
end
|
45
50
|
|
46
51
|
# Array of distinct A tag HREFs from the page
|
@@ -48,7 +53,7 @@ module Anemone
|
|
48
53
|
return @links unless @links.nil?
|
49
54
|
@links = []
|
50
55
|
return @links if !doc
|
51
|
-
|
56
|
+
|
52
57
|
doc.css('a').each do |a|
|
53
58
|
u = a.attributes['href'].content rescue nil
|
54
59
|
next if u.nil? or u.empty?
|
@@ -58,52 +63,30 @@ module Anemone
|
|
58
63
|
@links.uniq!
|
59
64
|
@links
|
60
65
|
end
|
61
|
-
|
66
|
+
|
67
|
+
# Nokogiri document for the HTML body, parsed on first access and cached.
# Evaluates to nil when there is no body, when the page is not HTML, or
# when checking/parsing raises a StandardError.
def doc
  return @doc if @doc

  begin
    @doc = Nokogiri::HTML(@body) if @body && html?
  rescue StandardError
    # Best effort: an unparseable body simply means "no document".
    nil
  end
end
|
72
|
+
|
73
|
+
# Delete the Nokogiri document and response body to conserve memory
|
62
74
|
def discard_doc!
|
63
75
|
links # force parsing of page links before we trash the document
|
64
|
-
@doc = nil
|
65
|
-
end
|
66
|
-
|
67
|
-
#
|
68
|
-
# Return a new page with the same *response* and *url*, but
|
69
|
-
# with a 200 response code
|
70
|
-
#
|
71
|
-
def alias_clone(url)
|
72
|
-
p = clone
|
73
|
-
p.add_alias!(@aka) if !@aka.nil?
|
74
|
-
p.code = 200
|
75
|
-
p
|
76
|
+
@doc = @body = nil
|
76
77
|
end
|
77
78
|
|
78
|
-
|
79
|
-
|
80
|
-
#
|
81
|
-
# Returns *self*
|
82
|
-
#
|
83
|
-
def add_alias!(aka)
|
84
|
-
@aliases << aka if !@aliases.include?(aka)
|
85
|
-
self
|
79
|
+
def fetched?
|
80
|
+
@fetched
|
86
81
|
end
|
87
|
-
|
88
|
-
#
|
89
|
-
# Returns an Array of all links from this page, and all the
|
90
|
-
# redirect-aliases of those pages, as String objects.
|
91
|
-
#
|
92
|
-
# *page_hash* is a PageHash object with the results of the current crawl.
|
93
|
-
#
|
94
|
-
def links_and_their_aliases(page_hash)
|
95
|
-
links.inject([]) do |results, link|
|
96
|
-
results.concat([link].concat(page_hash[link].aliases))
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
82
|
+
|
100
83
|
#
|
101
84
|
# The content-type returned by the HTTP request for this page
|
102
85
|
#
|
103
86
|
def content_type
|
104
87
|
headers['content-type'].first
|
105
88
|
end
|
106
|
-
|
89
|
+
|
107
90
|
#
|
108
91
|
# Returns +true+ if the page is a HTML document, returns +false+
|
109
92
|
# otherwise.
|
@@ -111,15 +94,15 @@ module Anemone
|
|
111
94
|
# Tells whether the response content-type marks the page as HTML or XHTML.
# Returns true for "text/html" and "application/xhtml+xml" (optionally
# followed by parameters such as "; charset=utf-8"), false otherwise.
def html?
  # The "+" of "xhtml+xml" is escaped so it matches a literal plus sign
  # rather than acting as a quantifier on the preceding "l"; \A anchors at
  # the very start of the string instead of at any line start.
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
end
|
114
|
-
|
97
|
+
|
115
98
|
#
|
116
99
|
# Returns +true+ if the page is a HTTP redirect, returns +false+
|
117
100
|
# otherwise.
|
118
|
-
#
|
101
|
+
#
|
119
102
|
def redirect?
|
120
103
|
(300..399).include?(@code)
|
121
104
|
end
|
122
|
-
|
105
|
+
|
123
106
|
#
|
124
107
|
# Returns +true+ if the page was not found (returned 404 code),
|
125
108
|
# returns +false+ otherwise.
|
@@ -127,12 +110,14 @@ module Anemone
|
|
127
110
|
def not_found?
|
128
111
|
404 == @code
|
129
112
|
end
|
130
|
-
|
113
|
+
|
131
114
|
#
|
132
115
|
# Converts relative URL *link* into an absolute URL based on the
|
133
116
|
# location of the page
|
134
117
|
#
|
135
118
|
def to_absolute(link)
|
119
|
+
return nil if link.nil?
|
120
|
+
|
136
121
|
# remove anchor
|
137
122
|
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
138
123
|
|
@@ -143,7 +128,7 @@ module Anemone
|
|
143
128
|
|
144
129
|
return absolute
|
145
130
|
end
|
146
|
-
|
131
|
+
|
147
132
|
#
|
148
133
|
# Returns +true+ if *uri* is in the same domain as the page, returns
|
149
134
|
# +false+ otherwise
|
@@ -151,5 +136,14 @@ module Anemone
|
|
151
136
|
def in_domain?(uri)
|
152
137
|
uri.host == @url.host
|
153
138
|
end
|
139
|
+
|
140
|
+
def marshal_dump
|
141
|
+
[@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
|
142
|
+
end
|
143
|
+
|
144
|
+
def marshal_load(ary)
|
145
|
+
@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
|
146
|
+
end
|
147
|
+
|
154
148
|
end
|
155
149
|
end
|
@@ -1,21 +1,52 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
1
3
|
module Anemone
|
2
|
-
class
|
3
|
-
|
4
|
+
class PageStore
|
5
|
+
extend Forwardable
|
6
|
+
|
7
|
+
def_delegators :@storage, :keys, :values, :size, :each
|
8
|
+
|
9
|
+
def initialize(storage = {})
|
10
|
+
@storage = storage
|
11
|
+
end
|
12
|
+
|
4
13
|
# We typically index the hash with a URI,
|
5
14
|
# but convert it to a String for easier retrieval
|
6
15
|
def [](index)
|
7
|
-
|
16
|
+
@storage[index.to_s]
|
8
17
|
end
|
9
|
-
|
18
|
+
|
10
19
|
def []=(index, other)
|
11
|
-
|
20
|
+
@storage[index.to_s] = other
|
21
|
+
end
|
22
|
+
|
23
|
+
def delete(key)
|
24
|
+
@storage.delete key.to_s
|
12
25
|
end
|
13
|
-
|
26
|
+
|
14
27
|
def has_key?(key)
|
15
|
-
|
28
|
+
@storage.has_key? key.to_s
|
29
|
+
end
|
30
|
+
|
31
|
+
def each_value
|
32
|
+
each { |key, value| yield value }
|
33
|
+
end
|
34
|
+
|
35
|
+
# Gathers every stored value into a fresh Array by walking #each, so the
# result is built the same way regardless of the storage behind it.
def values
  [].tap do |pages|
    each { |_url, page| pages.push(page) }
  end
end
|
40
|
+
|
41
|
+
def touch_key(key)
|
42
|
+
self[key] = Page.new(key)
|
16
43
|
end
|
17
44
|
|
18
|
-
|
45
|
+
# Registers a new Page built from each of +keys+ in one bulk merge!,
# stored under the key's String form. Returns whatever the storage's
# merge! returns.
def touch_keys(keys)
  placeholders = keys.each_with_object({}) do |key, pages|
    pages[key.to_s] = Page.new(key)
  end
  @storage.merge!(placeholders)
end
|
48
|
+
|
49
|
+
# Does this PageStore contain the specified URL?
|
19
50
|
# HTTP and HTTPS versions of a URL are considered to be the same page.
|
20
51
|
def has_page?(url)
|
21
52
|
schemes = %w(http https)
|
@@ -24,80 +55,67 @@ module Anemone
|
|
24
55
|
return schemes.any? { |s| u.scheme = s; has_key?(u) }
|
25
56
|
end
|
26
57
|
|
27
|
-
has_key?
|
58
|
+
has_key? url
|
28
59
|
end
|
29
|
-
|
60
|
+
|
30
61
|
#
|
31
62
|
# Use a breadth-first search to calculate the single-source
|
32
|
-
# shortest paths from *root* to all pages in the
|
63
|
+
# shortest paths from *root* to all pages in the PageStore
|
33
64
|
#
|
34
65
|
def shortest_paths!(root)
|
35
66
|
root = URI(root) if root.is_a?(String)
|
36
67
|
raise "Root node not found" if !has_key?(root)
|
37
|
-
|
38
|
-
each_value {|p| p.visited = false if p}
|
39
|
-
|
68
|
+
|
40
69
|
q = Queue.new
|
41
|
-
|
42
|
-
q.enq
|
43
|
-
self[root]
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
page = self[url]
|
51
|
-
|
70
|
+
|
71
|
+
q.enq root
|
72
|
+
root_page = self[root]
|
73
|
+
root_page.depth = 0
|
74
|
+
root_page.visited = true
|
75
|
+
self[root] = root_page
|
76
|
+
while !q.empty?
|
77
|
+
page = self[q.deq]
|
52
78
|
page.links.each do |u|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
79
|
+
begin
|
80
|
+
link = self[u]
|
81
|
+
next if link.nil? || !link.fetched? || link.visited
|
82
|
+
|
83
|
+
q << u unless link.redirect?
|
84
|
+
link.visited = true
|
85
|
+
link.depth = page.depth + 1
|
86
|
+
self[u] = link
|
87
|
+
|
88
|
+
if link.redirect?
|
89
|
+
u = link.redirect_to
|
90
|
+
redo
|
60
91
|
end
|
61
92
|
end
|
62
|
-
|
63
|
-
q.enq(self[u].url) if !self[u].visited
|
64
|
-
self[u].visited = true
|
65
93
|
end
|
66
94
|
end
|
67
|
-
|
95
|
+
|
68
96
|
self
|
69
97
|
end
|
70
|
-
|
98
|
+
|
71
99
|
#
|
72
|
-
#
|
73
|
-
# non-redirect Page
|
100
|
+
# Removes all Pages from storage where redirect? is true
|
74
101
|
#
|
75
|
-
def uniq
|
76
|
-
|
77
|
-
|
78
|
-
#if none of the aliases of this page have been added, and this isn't a redirect page, add this page
|
79
|
-
page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
|
80
|
-
if !page.redirect? and !page_added
|
81
|
-
results[url] = page.clone
|
82
|
-
results[url].aliases = []
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
results
|
102
|
+
# Deletes every Page whose redirect? is true, then returns self.
def uniq!
  # Gather the URLs first and delete afterwards, so the storage is never
  # modified while it is being iterated (which may not be safe for every
  # storage back-end).
  redirect_urls = []
  each_value { |page| redirect_urls << page.url if page.redirect? }
  redirect_urls.each { |url| delete url }
  self
end
|
88
|
-
|
106
|
+
|
89
107
|
#
|
90
108
|
# If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
|
91
109
|
# If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
|
92
110
|
#
|
93
111
|
def pages_linking_to(urls)
|
94
112
|
unless urls.is_a?(Array)
|
95
|
-
urls = [urls]
|
113
|
+
urls = [urls]
|
96
114
|
single = true
|
97
115
|
end
|
98
116
|
|
99
117
|
urls.map! do |url|
|
100
|
-
|
118
|
+
unless url.is_a?(URI)
|
101
119
|
URI(url) rescue nil
|
102
120
|
else
|
103
121
|
url
|
@@ -112,7 +130,7 @@ module Anemone
|
|
112
130
|
end
|
113
131
|
|
114
132
|
if single and !links.empty?
|
115
|
-
return links.first
|
133
|
+
return links[urls.first]
|
116
134
|
else
|
117
135
|
return links
|
118
136
|
end
|
@@ -132,11 +150,11 @@ module Anemone
|
|
132
150
|
links.each { |url, pages| links[url] = pages.map{|p| p.url} }
|
133
151
|
|
134
152
|
if single and !links.empty?
|
135
|
-
return links.first
|
153
|
+
return links[urls.first]
|
136
154
|
else
|
137
155
|
return links
|
138
|
-
end
|
156
|
+
end
|
139
157
|
end
|
140
158
|
|
141
159
|
end
|
142
|
-
end
|
160
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Anemone
  # Factory methods for the storage back-ends. Each factory is named after
  # the back-end it builds; the adapter files are required lazily, only
  # when the matching factory is called.
  module Storage
    class << self
      # Builds a plain in-memory Hash; +args+ are handed to Hash.new.
      def Hash(*args)
        ::Hash.new(*args)
      end

      # Requires the PStore adapter, then builds one from +args+.
      def PStore(*args)
        require 'anemone/storage/pstore'
        self::PStore.new(*args)
      end

      # Requires the TokyoCabinet adapter, then builds one for +file+.
      def TokyoCabinet(file)
        require 'anemone/storage/tokyo_cabinet'
        self::TokyoCabinet.new(file)
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'pstore'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module Anemone
  module Storage
    # Disk-backed storage adapter built on Ruby's ::PStore.
    #
    # Values live in the PStore file; the set of keys is mirrored in an
    # in-memory Hash (@keys) so key queries do not have to open a
    # transaction.
    class PStore
      extend Forwardable

      # Key bookkeeping is answered from the in-memory key index.
      def_delegators :@keys, :has_key?, :keys, :size

      # Starts from an empty store: an existing +file+ is deleted first.
      def initialize(file)
        # File.exists? was removed in Ruby 3.2; File.exist? is the
        # supported spelling.
        File.delete(file) if File.exist?(file)
        @store = ::PStore.new(file)
        @keys = {}
      end

      # Reads the value stored under +key+ (nil when absent).
      def [](key)
        @store.transaction { |s| s[key] }
      end

      # Records +key+ in the index and writes +value+ to disk.
      # Returns +value+.
      def []=(key, value)
        @keys[key] = nil
        @store.transaction { |s| s[key] = value }
      end

      # Drops +key+ from the index and from disk; returns the removed value.
      def delete(key)
        @keys.delete(key)
        @store.transaction { |s| s.delete key }
      end

      # Yields each key with its stored value. Every value is read in its
      # own transaction, which is closed before the block runs, so the
      # block may itself read from or write to this store.
      def each
        @keys.each_key do |key|
          yield key, self[key]
        end
      end

      # Writes every pair of +hash+ inside a single transaction and
      # records the keys in the index. Returns self.
      def merge!(hash)
        @store.transaction do |s|
          hash.each do |key, value|
            s[key] = value
            @keys[key] = nil
          end
        end
        self
      end
    end
  end
end
|