anemone 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +17 -17
- data/bin/anemone_count.rb +36 -31
- data/bin/anemone_cron.rb +107 -98
- data/bin/anemone_pagedepth.rb +43 -38
- data/bin/anemone_serialize.rb +50 -42
- data/bin/anemone_url_list.rb +54 -46
- data/bin/anemone_url_list.rb~ +58 -0
- data/lib/anemone.rb +1 -1
- data/lib/anemone/anemone.rb +36 -36
- data/lib/anemone/core.rb +181 -179
- data/lib/anemone/http.rb +36 -36
- data/lib/anemone/page.rb +184 -159
- data/lib/anemone/page_hash.rb +82 -82
- data/lib/anemone/tentacle.rb +30 -30
- metadata +10 -9
data/lib/anemone/http.rb
CHANGED
@@ -1,37 +1,37 @@
|
|
1
|
-
require 'net/http'
|
2
|
-
|
3
|
-
module Anemone
|
4
|
-
class HTTP < Net::HTTP
|
5
|
-
# Maximum number of redirects to follow on each get_response
|
6
|
-
REDIRECTION_LIMIT = 5
|
7
|
-
|
8
|
-
#
|
9
|
-
# Retrieve an HTTP response for *url*, following redirects.
|
10
|
-
# Returns the response object, response code, and final URI location.
|
11
|
-
#
|
12
|
-
def self.get(url)
|
13
|
-
response = get_response(url)
|
14
|
-
code = Integer(response.code)
|
15
|
-
loc = url
|
16
|
-
|
17
|
-
limit = REDIRECTION_LIMIT
|
18
|
-
while response.is_a?(Net::HTTPRedirection) and limit > 0
|
19
|
-
loc = URI(response['location'])
|
20
|
-
loc = url.merge(loc) if loc.relative?
|
21
|
-
response = get_response(loc)
|
22
|
-
limit -= 1
|
23
|
-
end
|
24
|
-
|
25
|
-
return response, code, loc
|
26
|
-
end
|
27
|
-
|
28
|
-
#
|
29
|
-
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
|
30
|
-
#
|
31
|
-
def self.get_response(url)
|
32
|
-
Net::HTTP.start(url.host, url.port) do |http|
|
33
|
-
return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
1
|
+
require 'net/http'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
class HTTP < Net::HTTP
|
5
|
+
# Maximum number of redirects to follow on each get_response
|
6
|
+
REDIRECTION_LIMIT = 5
|
7
|
+
|
8
|
+
#
|
9
|
+
# Retrieve an HTTP response for *url*, following redirects.
|
10
|
+
# Returns the response object, response code, and final URI location.
|
11
|
+
#
|
12
|
+
def self.get(url)
|
13
|
+
response = get_response(url)
|
14
|
+
code = Integer(response.code)
|
15
|
+
loc = url
|
16
|
+
|
17
|
+
limit = REDIRECTION_LIMIT
|
18
|
+
while response.is_a?(Net::HTTPRedirection) and limit > 0
|
19
|
+
loc = URI(response['location'])
|
20
|
+
loc = url.merge(loc) if loc.relative?
|
21
|
+
response = get_response(loc)
|
22
|
+
limit -= 1
|
23
|
+
end
|
24
|
+
|
25
|
+
return response, code, loc
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
|
30
|
+
#
|
31
|
+
def self.get_response(url)
|
32
|
+
Net::HTTP.start(url.host, url.port) do |http|
|
33
|
+
return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
37
|
end
|
data/lib/anemone/page.rb
CHANGED
@@ -1,159 +1,184 @@
|
|
1
|
-
require 'anemone/http'
|
2
|
-
require 'hpricot'
|
3
|
-
|
4
|
-
module Anemone
|
5
|
-
class Page
|
6
|
-
# The URL of the page
|
7
|
-
attr_reader :url
|
8
|
-
# Array of distinct A tag HREFs from the page
|
9
|
-
attr_reader :links
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
#
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
#
|
104
|
-
#
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
#
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
#
|
128
|
-
#
|
129
|
-
#
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
#
|
137
|
-
#
|
138
|
-
#
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
#
|
153
|
-
# +
|
154
|
-
#
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
1
|
+
require 'anemone/http'
|
2
|
+
require 'hpricot'
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
class Page
|
6
|
+
# The URL of the page
|
7
|
+
attr_reader :url
|
8
|
+
# Array of distinct A tag HREFs from the page
|
9
|
+
attr_reader :links
|
10
|
+
#Body of the HTTP response
|
11
|
+
attr_reader :body
|
12
|
+
#Content-type of the HTTP response
|
13
|
+
attr_reader :content_type
|
14
|
+
#title of the page if it is an HTML document
|
15
|
+
attr_reader :title
|
16
|
+
#first h1 on the page, if present
|
17
|
+
attr_reader :h1
|
18
|
+
#first h2 on the page, if present
|
19
|
+
attr_reader :h2
|
20
|
+
#meta-description of the page, if present
|
21
|
+
attr_reader :description
|
22
|
+
|
23
|
+
# Integer response code of the page
|
24
|
+
attr_accessor :code
|
25
|
+
# Array of redirect-aliases for the page
|
26
|
+
attr_accessor :aliases
|
27
|
+
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
|
28
|
+
attr_accessor :visited
|
29
|
+
# Used by PageHash#shortest_paths! to store depth of the page
|
30
|
+
attr_accessor :depth
|
31
|
+
|
32
|
+
#
|
33
|
+
# Create a new Page from the response of an HTTP request to *url*
|
34
|
+
#
|
35
|
+
def self.fetch(url)
|
36
|
+
begin
|
37
|
+
url = URI(url) if url.is_a?(String)
|
38
|
+
|
39
|
+
response, code, location = Anemone::HTTP.get(url)
|
40
|
+
|
41
|
+
aka = nil
|
42
|
+
if !url.eql?(location)
|
43
|
+
aka = location
|
44
|
+
end
|
45
|
+
|
46
|
+
return Page.new(url, response.body, code, response['Content-Type'], aka)
|
47
|
+
rescue
|
48
|
+
return Page.new(url)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
#
|
53
|
+
# Create a new page
|
54
|
+
#
|
55
|
+
def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
|
56
|
+
@url = url
|
57
|
+
@body = body unless Anemone.options.discard_page_bodies
|
58
|
+
@code = code
|
59
|
+
@content_type = content_type
|
60
|
+
@links = []
|
61
|
+
@aliases = []
|
62
|
+
|
63
|
+
@aliases << aka if !aka.nil?
|
64
|
+
|
65
|
+
if body
|
66
|
+
h = Hpricot(body)
|
67
|
+
|
68
|
+
#save page title
|
69
|
+
title_elem = h.at('title')
|
70
|
+
@title = title_elem.inner_html if !title_elem.nil?
|
71
|
+
|
72
|
+
#save page h1
|
73
|
+
h1_elem = h.at('h1')
|
74
|
+
@h1 = h1_elem.inner_html if !h1_elem.nil?
|
75
|
+
|
76
|
+
#save page h2
|
77
|
+
h2_elem = h.at('h2')
|
78
|
+
@h2 = h2_elem.inner_html if !h2_elem.nil?
|
79
|
+
|
80
|
+
#save page meta-description
|
81
|
+
description_elem = h.at('meta[@name=description]')
|
82
|
+
@description = description_elem['content'] if !description_elem.nil?
|
83
|
+
|
84
|
+
#get a list of distinct links on the page, in absolute url form
|
85
|
+
h.search('a').each do |a|
|
86
|
+
u = a['href']
|
87
|
+
next if u.nil?
|
88
|
+
|
89
|
+
begin
|
90
|
+
abs = to_absolute(URI(u))
|
91
|
+
rescue
|
92
|
+
next
|
93
|
+
end
|
94
|
+
|
95
|
+
@links << abs if in_domain?(abs)
|
96
|
+
end
|
97
|
+
|
98
|
+
@links.uniq!
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
|
103
|
+
#
|
104
|
+
# Return a new page with the same *response* and *url*, but
|
105
|
+
# with a 200 response code
|
106
|
+
#
|
107
|
+
def alias_clone(url)
|
108
|
+
p = clone
|
109
|
+
p.add_alias!(@aka) if !@aka.nil?
|
110
|
+
p.code = 200
|
111
|
+
p
|
112
|
+
end
|
113
|
+
|
114
|
+
#
|
115
|
+
# Add a redirect-alias String *aka* to the list of the page's aliases
|
116
|
+
#
|
117
|
+
# Returns *self*
|
118
|
+
#
|
119
|
+
def add_alias!(aka)
|
120
|
+
@aliases << aka if !@aliases.include?(aka)
|
121
|
+
self
|
122
|
+
end
|
123
|
+
|
124
|
+
#
|
125
|
+
# Returns an Array of all links from this page, and all the
|
126
|
+
# redirect-aliases of those pages, as String objects.
|
127
|
+
#
|
128
|
+
# *page_hash* is a PageHash object with the results of the current crawl.
|
129
|
+
#
|
130
|
+
def links_and_their_aliases(page_hash)
|
131
|
+
@links.inject([]) do |results, link|
|
132
|
+
results.concat([link].concat(page_hash[link].aliases))
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
#
|
137
|
+
# Returns +true+ if the page is a HTML document, returns +false+
|
138
|
+
# otherwise.
|
139
|
+
#
|
140
|
+
def html?
|
141
|
+
(@content_type =~ /text\/html/) == 0
|
142
|
+
end
|
143
|
+
|
144
|
+
#
|
145
|
+
# Returns +true+ if the page is a HTTP redirect, returns +false+
|
146
|
+
# otherwise.
|
147
|
+
#
|
148
|
+
def redirect?
|
149
|
+
(300..399).include?(@code)
|
150
|
+
end
|
151
|
+
|
152
|
+
#
|
153
|
+
# Returns +true+ if the page was not found (returned 404 code),
|
154
|
+
# returns +false+ otherwise.
|
155
|
+
#
|
156
|
+
def not_found?
|
157
|
+
404 == @code
|
158
|
+
end
|
159
|
+
|
160
|
+
#
|
161
|
+
# Converts relative URL *link* into an absolute URL based on the
|
162
|
+
# location of the page
|
163
|
+
#
|
164
|
+
def to_absolute(link)
|
165
|
+
# remove anchor
|
166
|
+
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
167
|
+
|
168
|
+
relative = URI(link)
|
169
|
+
absolute = @url.merge(relative)
|
170
|
+
|
171
|
+
absolute.path = '/' if absolute.path.empty?
|
172
|
+
|
173
|
+
return absolute
|
174
|
+
end
|
175
|
+
|
176
|
+
#
|
177
|
+
# Returns +true+ if *uri* is in the same domain as the page, returns
|
178
|
+
# +false+ otherwise
|
179
|
+
#
|
180
|
+
def in_domain?(uri)
|
181
|
+
uri.host == @url.host
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
data/lib/anemone/page_hash.rb
CHANGED
@@ -1,83 +1,83 @@
|
|
1
|
-
module Anemone
|
2
|
-
class PageHash < Hash
|
3
|
-
|
4
|
-
#
|
5
|
-
# Use a breadth-first search to calculate the single-source
|
6
|
-
# shortest paths from *root* to all pages in the PageHash
|
7
|
-
#
|
8
|
-
def shortest_paths!(root)
|
9
|
-
root = URI(root) if root.is_a?(String)
|
10
|
-
raise "Root node not found" if !has_key?(root)
|
11
|
-
|
12
|
-
each_value {|p| p.visited = false if p}
|
13
|
-
|
14
|
-
q = Queue.new
|
15
|
-
|
16
|
-
q.enq(root)
|
17
|
-
self[root].depth = 0
|
18
|
-
self[root].visited = true
|
19
|
-
while(!q.empty?)
|
20
|
-
url = q.deq
|
21
|
-
|
22
|
-
next if !has_key?(url)
|
23
|
-
|
24
|
-
page = self[url]
|
25
|
-
|
26
|
-
page.links.each do |u|
|
27
|
-
next if !has_key?(u) or self[u].nil?
|
28
|
-
link = self[u]
|
29
|
-
aliases = [link].concat(link.aliases.map {|a| self[a] })
|
30
|
-
|
31
|
-
aliases.each do |node|
|
32
|
-
if node.depth.nil? or page.depth + 1 < node.depth
|
33
|
-
node.depth = page.depth + 1
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
q.enq(self[u].url) if !self[u].visited
|
38
|
-
self[u].visited = true
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
self
|
43
|
-
end
|
44
|
-
|
45
|
-
#
|
46
|
-
# Returns a new PageHash by removing redirect-aliases for each
|
47
|
-
# non-redirect Page
|
48
|
-
#
|
49
|
-
def uniq
|
50
|
-
results = PageHash.new
|
51
|
-
each do |url, page|
|
52
|
-
#if none of the aliases of this page have been added, and this isn't a redirect page, add this page
|
53
|
-
page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
|
54
|
-
if !page.redirect? and !page_added
|
55
|
-
results[url] = page.clone
|
56
|
-
results[url].aliases = []
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
results
|
61
|
-
end
|
62
|
-
|
63
|
-
#
|
64
|
-
# Return an Array of Page objects which link to the given url
|
65
|
-
#
|
66
|
-
def pages_linking_to url
|
67
|
-
begin
|
68
|
-
url = URI(url) if url.is_a?(String)
|
69
|
-
rescue
|
70
|
-
return []
|
71
|
-
end
|
72
|
-
|
73
|
-
values.delete_if { |p| !p.links.include?(url) }
|
74
|
-
end
|
75
|
-
|
76
|
-
#
|
77
|
-
# Return an Array of URI objects of Pages linking to the given url
|
78
|
-
def urls_linking_to url
|
79
|
-
pages_linking_to(url).map{|p| p.url}
|
80
|
-
end
|
81
|
-
|
82
|
-
end
|
1
|
+
module Anemone
|
2
|
+
class PageHash < Hash
|
3
|
+
|
4
|
+
#
|
5
|
+
# Use a breadth-first search to calculate the single-source
|
6
|
+
# shortest paths from *root* to all pages in the PageHash
|
7
|
+
#
|
8
|
+
def shortest_paths!(root)
|
9
|
+
root = URI(root) if root.is_a?(String)
|
10
|
+
raise "Root node not found" if !has_key?(root)
|
11
|
+
|
12
|
+
each_value {|p| p.visited = false if p}
|
13
|
+
|
14
|
+
q = Queue.new
|
15
|
+
|
16
|
+
q.enq(root)
|
17
|
+
self[root].depth = 0
|
18
|
+
self[root].visited = true
|
19
|
+
while(!q.empty?)
|
20
|
+
url = q.deq
|
21
|
+
|
22
|
+
next if !has_key?(url)
|
23
|
+
|
24
|
+
page = self[url]
|
25
|
+
|
26
|
+
page.links.each do |u|
|
27
|
+
next if !has_key?(u) or self[u].nil?
|
28
|
+
link = self[u]
|
29
|
+
aliases = [link].concat(link.aliases.map {|a| self[a] })
|
30
|
+
|
31
|
+
aliases.each do |node|
|
32
|
+
if node.depth.nil? or page.depth + 1 < node.depth
|
33
|
+
node.depth = page.depth + 1
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
q.enq(self[u].url) if !self[u].visited
|
38
|
+
self[u].visited = true
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
self
|
43
|
+
end
|
44
|
+
|
45
|
+
#
|
46
|
+
# Returns a new PageHash by removing redirect-aliases for each
|
47
|
+
# non-redirect Page
|
48
|
+
#
|
49
|
+
def uniq
|
50
|
+
results = PageHash.new
|
51
|
+
each do |url, page|
|
52
|
+
#if none of the aliases of this page have been added, and this isn't a redirect page, add this page
|
53
|
+
page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
|
54
|
+
if !page.redirect? and !page_added
|
55
|
+
results[url] = page.clone
|
56
|
+
results[url].aliases = []
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
results
|
61
|
+
end
|
62
|
+
|
63
|
+
#
|
64
|
+
# Return an Array of Page objects which link to the given url
|
65
|
+
#
|
66
|
+
def pages_linking_to url
|
67
|
+
begin
|
68
|
+
url = URI(url) if url.is_a?(String)
|
69
|
+
rescue
|
70
|
+
return []
|
71
|
+
end
|
72
|
+
|
73
|
+
values.delete_if { |p| !p.links.include?(url) }
|
74
|
+
end
|
75
|
+
|
76
|
+
#
|
77
|
+
# Return an Array of URI objects of Pages linking to the given url
|
78
|
+
def urls_linking_to url
|
79
|
+
pages_linking_to(url).map{|p| p.url}
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
83
83
|
end
|