anemone 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +17 -17
- data/bin/anemone_count.rb +36 -31
- data/bin/anemone_cron.rb +107 -98
- data/bin/anemone_pagedepth.rb +43 -38
- data/bin/anemone_serialize.rb +50 -42
- data/bin/anemone_url_list.rb +54 -46
- data/bin/anemone_url_list.rb~ +58 -0
- data/lib/anemone.rb +1 -1
- data/lib/anemone/anemone.rb +36 -36
- data/lib/anemone/core.rb +181 -179
- data/lib/anemone/http.rb +36 -36
- data/lib/anemone/page.rb +184 -159
- data/lib/anemone/page_hash.rb +82 -82
- data/lib/anemone/tentacle.rb +30 -30
- metadata +10 -9
data/lib/anemone/http.rb
CHANGED
@@ -1,37 +1,37 @@
|
|
1
|
-
require 'net/http'
|
2
|
-
|
3
|
-
module Anemone
|
4
|
-
class HTTP < Net::HTTP
|
5
|
-
# Maximum number of redirects to follow on each get_response
|
6
|
-
REDIRECTION_LIMIT = 5
|
7
|
-
|
8
|
-
#
|
9
|
-
# Retrieve an HTTP response for *url*, following redirects.
|
10
|
-
# Returns the response object, response code, and final URI location.
|
11
|
-
#
|
12
|
-
def self.get(url)
|
13
|
-
response = get_response(url)
|
14
|
-
code = Integer(response.code)
|
15
|
-
loc = url
|
16
|
-
|
17
|
-
limit = REDIRECTION_LIMIT
|
18
|
-
while response.is_a?(Net::HTTPRedirection) and limit > 0
|
19
|
-
loc = URI(response['location'])
|
20
|
-
loc = url.merge(loc) if loc.relative?
|
21
|
-
response = get_response(loc)
|
22
|
-
limit -= 1
|
23
|
-
end
|
24
|
-
|
25
|
-
return response, code, loc
|
26
|
-
end
|
27
|
-
|
28
|
-
#
|
29
|
-
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
|
30
|
-
#
|
31
|
-
def self.get_response(url)
|
32
|
-
Net::HTTP.start(url.host, url.port) do |http|
|
33
|
-
return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
1
|
+
require 'net/http'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
class HTTP < Net::HTTP
|
5
|
+
# Maximum number of redirects to follow on each get_response
|
6
|
+
REDIRECTION_LIMIT = 5
|
7
|
+
|
8
|
+
#
|
9
|
+
# Retrieve an HTTP response for *url*, following redirects.
|
10
|
+
# Returns the response object, response code, and final URI location.
|
11
|
+
#
|
12
|
+
def self.get(url)
|
13
|
+
response = get_response(url)
|
14
|
+
code = Integer(response.code)
|
15
|
+
loc = url
|
16
|
+
|
17
|
+
limit = REDIRECTION_LIMIT
|
18
|
+
while response.is_a?(Net::HTTPRedirection) and limit > 0
|
19
|
+
loc = URI(response['location'])
|
20
|
+
loc = url.merge(loc) if loc.relative?
|
21
|
+
response = get_response(loc)
|
22
|
+
limit -= 1
|
23
|
+
end
|
24
|
+
|
25
|
+
return response, code, loc
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
|
30
|
+
#
|
31
|
+
def self.get_response(url)
|
32
|
+
Net::HTTP.start(url.host, url.port) do |http|
|
33
|
+
return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
37
|
end
|
data/lib/anemone/page.rb
CHANGED
@@ -1,159 +1,184 @@
|
|
1
|
-
require 'anemone/http'
|
2
|
-
require 'hpricot'
|
3
|
-
|
4
|
-
module Anemone
|
5
|
-
class Page
|
6
|
-
# The URL of the page
|
7
|
-
attr_reader :url
|
8
|
-
# Array of distinct A tag HREFs from the page
|
9
|
-
attr_reader :links
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
#
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
#
|
104
|
-
#
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
#
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
#
|
128
|
-
#
|
129
|
-
#
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
#
|
137
|
-
#
|
138
|
-
#
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
#
|
153
|
-
# +
|
154
|
-
#
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
1
|
+
require 'anemone/http'
|
2
|
+
require 'hpricot'
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
class Page
|
6
|
+
# The URL of the page
|
7
|
+
attr_reader :url
|
8
|
+
# Array of distinct A tag HREFs from the page
|
9
|
+
attr_reader :links
|
10
|
+
#Body of the HTTP response
|
11
|
+
attr_reader :body
|
12
|
+
#Content-type of the HTTP response
|
13
|
+
attr_reader :content_type
|
14
|
+
#title of the page if it is an HTML document
|
15
|
+
attr_reader :title
|
16
|
+
#first h1 on the page, if present
|
17
|
+
attr_reader :h1
|
18
|
+
#first h2 on the page, if present
|
19
|
+
attr_reader :h2
|
20
|
+
#meta-description of the page, if present
|
21
|
+
attr_reader :description
|
22
|
+
|
23
|
+
# Integer response code of the page
|
24
|
+
attr_accessor :code
|
25
|
+
# Array of redirect-aliases for the page
|
26
|
+
attr_accessor :aliases
|
27
|
+
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
|
28
|
+
attr_accessor :visited
|
29
|
+
# Used by PageHash#shortest_paths! to store depth of the page
|
30
|
+
attr_accessor :depth
|
31
|
+
|
32
|
+
#
|
33
|
+
# Create a new Page from the response of an HTTP request to *url*
|
34
|
+
#
|
35
|
+
def self.fetch(url)
|
36
|
+
begin
|
37
|
+
url = URI(url) if url.is_a?(String)
|
38
|
+
|
39
|
+
response, code, location = Anemone::HTTP.get(url)
|
40
|
+
|
41
|
+
aka = nil
|
42
|
+
if !url.eql?(location)
|
43
|
+
aka = location
|
44
|
+
end
|
45
|
+
|
46
|
+
return Page.new(url, response.body, code, response['Content-Type'], aka)
|
47
|
+
rescue
|
48
|
+
return Page.new(url)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
#
|
53
|
+
# Create a new page
|
54
|
+
#
|
55
|
+
def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
|
56
|
+
@url = url
|
57
|
+
@body = body unless Anemone.options.discard_page_bodies
|
58
|
+
@code = code
|
59
|
+
@content_type = content_type
|
60
|
+
@links = []
|
61
|
+
@aliases = []
|
62
|
+
|
63
|
+
@aliases << aka if !aka.nil?
|
64
|
+
|
65
|
+
if body
|
66
|
+
h = Hpricot(body)
|
67
|
+
|
68
|
+
#save page title
|
69
|
+
title_elem = h.at('title')
|
70
|
+
@title = title_elem.inner_html if !title_elem.nil?
|
71
|
+
|
72
|
+
#save page h1
|
73
|
+
h1_elem = h.at('h1')
|
74
|
+
@h1 = h1_elem.inner_html if !h1_elem.nil?
|
75
|
+
|
76
|
+
#save page h2
|
77
|
+
h2_elem = h.at('h2')
|
78
|
+
@h2 = h2_elem.inner_html if !h2_elem.nil?
|
79
|
+
|
80
|
+
#save page meta-description
|
81
|
+
description_elem = h.at('meta[@name=description]')
|
82
|
+
@description = description_elem['content'] if !description_elem.nil?
|
83
|
+
|
84
|
+
#get a list of distinct links on the page, in absolute url form
|
85
|
+
h.search('a').each do |a|
|
86
|
+
u = a['href']
|
87
|
+
next if u.nil?
|
88
|
+
|
89
|
+
begin
|
90
|
+
abs = to_absolute(URI(u))
|
91
|
+
rescue
|
92
|
+
next
|
93
|
+
end
|
94
|
+
|
95
|
+
@links << abs if in_domain?(abs)
|
96
|
+
end
|
97
|
+
|
98
|
+
@links.uniq!
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
|
103
|
+
#
|
104
|
+
# Return a new page with the same *response* and *url*, but
|
105
|
+
# with a 200 response code
|
106
|
+
#
|
107
|
+
def alias_clone(url)
|
108
|
+
p = clone
|
109
|
+
p.add_alias!(@aka) if !@aka.nil?
|
110
|
+
p.code = 200
|
111
|
+
p
|
112
|
+
end
|
113
|
+
|
114
|
+
#
|
115
|
+
# Add a redirect-alias String *aka* to the list of the page's aliases
|
116
|
+
#
|
117
|
+
# Returns *self*
|
118
|
+
#
|
119
|
+
def add_alias!(aka)
|
120
|
+
@aliases << aka if !@aliases.include?(aka)
|
121
|
+
self
|
122
|
+
end
|
123
|
+
|
124
|
+
#
|
125
|
+
# Returns an Array of all links from this page, and all the
|
126
|
+
# redirect-aliases of those pages, as String objects.
|
127
|
+
#
|
128
|
+
# *page_hash* is a PageHash object with the results of the current crawl.
|
129
|
+
#
|
130
|
+
def links_and_their_aliases(page_hash)
|
131
|
+
@links.inject([]) do |results, link|
|
132
|
+
results.concat([link].concat(page_hash[link].aliases))
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
#
|
137
|
+
# Returns +true+ if the page is a HTML document, returns +false+
|
138
|
+
# otherwise.
|
139
|
+
#
|
140
|
+
def html?
|
141
|
+
(@content_type =~ /text\/html/) == 0
|
142
|
+
end
|
143
|
+
|
144
|
+
#
|
145
|
+
# Returns +true+ if the page is a HTTP redirect, returns +false+
|
146
|
+
# otherwise.
|
147
|
+
#
|
148
|
+
def redirect?
|
149
|
+
(300..399).include?(@code)
|
150
|
+
end
|
151
|
+
|
152
|
+
#
|
153
|
+
# Returns +true+ if the page was not found (returned 404 code),
|
154
|
+
# returns +false+ otherwise.
|
155
|
+
#
|
156
|
+
def not_found?
|
157
|
+
404 == @code
|
158
|
+
end
|
159
|
+
|
160
|
+
#
|
161
|
+
# Converts relative URL *link* into an absolute URL based on the
|
162
|
+
# location of the page
|
163
|
+
#
|
164
|
+
def to_absolute(link)
|
165
|
+
# remove anchor
|
166
|
+
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
167
|
+
|
168
|
+
relative = URI(link)
|
169
|
+
absolute = @url.merge(relative)
|
170
|
+
|
171
|
+
absolute.path = '/' if absolute.path.empty?
|
172
|
+
|
173
|
+
return absolute
|
174
|
+
end
|
175
|
+
|
176
|
+
#
|
177
|
+
# Returns +true+ if *uri* is in the same domain as the page, returns
|
178
|
+
# +false+ otherwise
|
179
|
+
#
|
180
|
+
def in_domain?(uri)
|
181
|
+
uri.host == @url.host
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
data/lib/anemone/page_hash.rb
CHANGED
@@ -1,83 +1,83 @@
|
|
1
|
-
module Anemone
|
2
|
-
class PageHash < Hash
|
3
|
-
|
4
|
-
#
|
5
|
-
# Use a breadth-first search to calculate the single-source
|
6
|
-
# shortest paths from *root* to all pages in the PageHash
|
7
|
-
#
|
8
|
-
def shortest_paths!(root)
|
9
|
-
root = URI(root) if root.is_a?(String)
|
10
|
-
raise "Root node not found" if !has_key?(root)
|
11
|
-
|
12
|
-
each_value {|p| p.visited = false if p}
|
13
|
-
|
14
|
-
q = Queue.new
|
15
|
-
|
16
|
-
q.enq(root)
|
17
|
-
self[root].depth = 0
|
18
|
-
self[root].visited = true
|
19
|
-
while(!q.empty?)
|
20
|
-
url = q.deq
|
21
|
-
|
22
|
-
next if !has_key?(url)
|
23
|
-
|
24
|
-
page = self[url]
|
25
|
-
|
26
|
-
page.links.each do |u|
|
27
|
-
next if !has_key?(u) or self[u].nil?
|
28
|
-
link = self[u]
|
29
|
-
aliases = [link].concat(link.aliases.map {|a| self[a] })
|
30
|
-
|
31
|
-
aliases.each do |node|
|
32
|
-
if node.depth.nil? or page.depth + 1 < node.depth
|
33
|
-
node.depth = page.depth + 1
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
q.enq(self[u].url) if !self[u].visited
|
38
|
-
self[u].visited = true
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
self
|
43
|
-
end
|
44
|
-
|
45
|
-
#
|
46
|
-
# Returns a new PageHash by removing redirect-aliases for each
|
47
|
-
# non-redirect Page
|
48
|
-
#
|
49
|
-
def uniq
|
50
|
-
results = PageHash.new
|
51
|
-
each do |url, page|
|
52
|
-
#if none of the aliases of this page have been added, and this isn't a redirect page, add this page
|
53
|
-
page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
|
54
|
-
if !page.redirect? and !page_added
|
55
|
-
results[url] = page.clone
|
56
|
-
results[url].aliases = []
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
results
|
61
|
-
end
|
62
|
-
|
63
|
-
#
|
64
|
-
# Return an Array of Page objects which link to the given url
|
65
|
-
#
|
66
|
-
def pages_linking_to url
|
67
|
-
begin
|
68
|
-
url = URI(url) if url.is_a?(String)
|
69
|
-
rescue
|
70
|
-
return []
|
71
|
-
end
|
72
|
-
|
73
|
-
values.delete_if { |p| !p.links.include?(url) }
|
74
|
-
end
|
75
|
-
|
76
|
-
#
|
77
|
-
# Return an Array of URI objects of Pages linking to the given url
|
78
|
-
def urls_linking_to url
|
79
|
-
pages_linking_to(url).map{|p| p.url}
|
80
|
-
end
|
81
|
-
|
82
|
-
end
|
1
|
+
module Anemone
|
2
|
+
class PageHash < Hash
|
3
|
+
|
4
|
+
#
|
5
|
+
# Use a breadth-first search to calculate the single-source
|
6
|
+
# shortest paths from *root* to all pages in the PageHash
|
7
|
+
#
|
8
|
+
def shortest_paths!(root)
|
9
|
+
root = URI(root) if root.is_a?(String)
|
10
|
+
raise "Root node not found" if !has_key?(root)
|
11
|
+
|
12
|
+
each_value {|p| p.visited = false if p}
|
13
|
+
|
14
|
+
q = Queue.new
|
15
|
+
|
16
|
+
q.enq(root)
|
17
|
+
self[root].depth = 0
|
18
|
+
self[root].visited = true
|
19
|
+
while(!q.empty?)
|
20
|
+
url = q.deq
|
21
|
+
|
22
|
+
next if !has_key?(url)
|
23
|
+
|
24
|
+
page = self[url]
|
25
|
+
|
26
|
+
page.links.each do |u|
|
27
|
+
next if !has_key?(u) or self[u].nil?
|
28
|
+
link = self[u]
|
29
|
+
aliases = [link].concat(link.aliases.map {|a| self[a] })
|
30
|
+
|
31
|
+
aliases.each do |node|
|
32
|
+
if node.depth.nil? or page.depth + 1 < node.depth
|
33
|
+
node.depth = page.depth + 1
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
q.enq(self[u].url) if !self[u].visited
|
38
|
+
self[u].visited = true
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
self
|
43
|
+
end
|
44
|
+
|
45
|
+
#
|
46
|
+
# Returns a new PageHash by removing redirect-aliases for each
|
47
|
+
# non-redirect Page
|
48
|
+
#
|
49
|
+
def uniq
|
50
|
+
results = PageHash.new
|
51
|
+
each do |url, page|
|
52
|
+
#if none of the aliases of this page have been added, and this isn't a redirect page, add this page
|
53
|
+
page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
|
54
|
+
if !page.redirect? and !page_added
|
55
|
+
results[url] = page.clone
|
56
|
+
results[url].aliases = []
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
results
|
61
|
+
end
|
62
|
+
|
63
|
+
#
|
64
|
+
# Return an Array of Page objects which link to the given url
|
65
|
+
#
|
66
|
+
def pages_linking_to url
|
67
|
+
begin
|
68
|
+
url = URI(url) if url.is_a?(String)
|
69
|
+
rescue
|
70
|
+
return []
|
71
|
+
end
|
72
|
+
|
73
|
+
values.delete_if { |p| !p.links.include?(url) }
|
74
|
+
end
|
75
|
+
|
76
|
+
#
|
77
|
+
# Return an Array of URI objects of Pages linking to the given url
|
78
|
+
def urls_linking_to url
|
79
|
+
pages_linking_to(url).map{|p| p.url}
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
83
83
|
end
|