anemone 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/lib/anemone/http.rb CHANGED
@@ -1,37 +1,37 @@
1
- require 'net/http'
2
-
3
- module Anemone
4
- class HTTP < Net::HTTP
5
- # Maximum number of redirects to follow on each get_response
6
- REDIRECTION_LIMIT = 5
7
-
8
- #
9
- # Retrieve an HTTP response for *url*, following redirects.
10
- # Returns the response object, response code, and final URI location.
11
- #
12
- def self.get(url)
13
- response = get_response(url)
14
- code = Integer(response.code)
15
- loc = url
16
-
17
- limit = REDIRECTION_LIMIT
18
- while response.is_a?(Net::HTTPRedirection) and limit > 0
19
- loc = URI(response['location'])
20
- loc = url.merge(loc) if loc.relative?
21
- response = get_response(loc)
22
- limit -= 1
23
- end
24
-
25
- return response, code, loc
26
- end
27
-
28
- #
29
- # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
30
- #
31
- def self.get_response(url)
32
- Net::HTTP.start(url.host, url.port) do |http|
33
- return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
34
- end
35
- end
36
- end
1
+ require 'net/http'
2
+
3
+ module Anemone
4
+ class HTTP < Net::HTTP
5
+ # Maximum number of redirects to follow on each call to HTTP.get
6
+ REDIRECTION_LIMIT = 5
7
+
8
+ #
9
+ # Retrieve an HTTP response for *url* (a URI), following at most REDIRECTION_LIMIT redirects.
10
+ # Returns the last response object, the Integer code of the FIRST response, and the last URI requested.
11
+ #
12
+ def self.get(url)
13
+ response = get_response(url)
14
+ code = Integer(response.code)  # taken before any redirect is followed; never updated in the loop
15
+ loc = url
16
+
17
+ limit = REDIRECTION_LIMIT
18
+ while response.is_a?(Net::HTTPRedirection) and limit > 0
19
+ loc = URI(response['location'])
20
+ loc = url.merge(loc) if loc.relative?  # NOTE(review): resolved against the original url, not the previous hop - confirm intended
21
+ response = get_response(loc)
22
+ limit -= 1
23
+ end
24
+
25
+ return response, code, loc
26
+ end
27
+
28
+ #
29
+ # Issue a GET for *url* (a URI) via Net::HTTP.start, sending Anemone::USER_AGENT as the User-Agent header; returns the result of http.get
30
+ #
31
+ def self.get_response(url)
32
+ Net::HTTP.start(url.host, url.port) do |http|
33
+ return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })  # NOTE(review): only url.path is sent, so any query string on url is dropped - confirm intended
34
+ end
35
+ end
36
+ end
37
37
  end
data/lib/anemone/page.rb CHANGED
@@ -1,159 +1,184 @@
1
- require 'anemone/http'
2
- require 'hpricot'
3
-
4
- module Anemone
5
- class Page
6
- # The URL of the page
7
- attr_reader :url
8
- # Array of distinct A tag HREFs from the page
9
- attr_reader :links
10
- #Body of the HTTP response
11
- attr_reader :body
12
- #Content-type of the HTTP response
13
- attr_reader :content_type
14
-
15
- # Integer response code of the page
16
- attr_accessor :code
17
- # Array of redirect-aliases for the page
18
- attr_accessor :aliases
19
- # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
20
- attr_accessor :visited
21
- # Used by PageHash#shortest_paths! to store depth of the page
22
- attr_accessor :depth
23
-
24
- #
25
- # Create a new Page from the response of an HTTP request to *url*
26
- #
27
- def self.fetch(url)
28
- begin
29
- url = URI(url) if url.is_a?(String)
30
-
31
- response, code, location = Anemone::HTTP.get(url)
32
-
33
- aka = nil
34
- if !url.eql?(location)
35
- aka = location
36
- end
37
-
38
- return Page.new(url, response.body, code, response['Content-Type'], aka)
39
- rescue
40
- return Page.new(url)
41
- end
42
- end
43
-
44
- #
45
- # Create a new page
46
- #
47
- def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
48
- @url = url
49
- @body = body unless Anemone.options.discard_page_bodies
50
- @code = code
51
- @content_type = content_type
52
- @links = []
53
- @aliases = []
54
-
55
- @aliases << aka if !aka.nil?
56
-
57
- #get a list of distinct links on the page, in absolute url form
58
- if body
59
- Hpricot(body).search('a').each do |a|
60
- u = a['href']
61
- next if u.nil?
62
-
63
- begin
64
- u = URI(u)
65
- rescue
66
- next
67
- end
68
-
69
- abs = to_absolute(u)
70
- @links << abs if in_domain?(abs)
71
- end
72
-
73
- @links.uniq!
74
- end
75
- end
76
-
77
-
78
- #
79
- # Return a new page with the same *response* and *url*, but
80
- # with a 200 response code
81
- #
82
- def alias_clone(url)
83
- p = clone
84
- p.add_alias!(@aka) if !@aka.nil?
85
- p.code = 200
86
- p
87
- end
88
-
89
- #
90
- # Add a redirect-alias String *aka* to the list of the page's aliases
91
- #
92
- # Returns *self*
93
- #
94
- def add_alias!(aka)
95
- @aliases << aka if !@aliases.include?(aka)
96
- self
97
- end
98
-
99
- #
100
- # Returns an Array of all links from this page, and all the
101
- # redirect-aliases of those pages, as String objects.
102
- #
103
- # *page_hash* is a PageHash object with the results of the current crawl.
104
- #
105
- def links_and_their_aliases(page_hash)
106
- @links.inject([]) do |results, link|
107
- results.concat([link].concat(page_hash[link].aliases))
108
- end
109
- end
110
-
111
- #
112
- # Returns +true+ if the page is a HTML document, returns +false+
113
- # otherwise.
114
- #
115
- def html?
116
- (@content_type =~ /text\/html/) == 0
117
- end
118
-
119
- #
120
- # Returns +true+ if the page is a HTTP redirect, returns +false+
121
- # otherwise.
122
- #
123
- def redirect?
124
- (300..399).include?(@code)
125
- end
126
-
127
- #
128
- # Returns +true+ if the page was not found (returned 404 code),
129
- # returns +false+ otherwise.
130
- #
131
- def not_found?
132
- 404 == @code
133
- end
134
-
135
- #
136
- # Converts relative URL *link* into an absolute URL based on the
137
- # location of the page
138
- #
139
- def to_absolute(link)
140
- # remove anchor
141
- link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
142
-
143
- relative = URI(link)
144
- absolute = @url.merge(relative)
145
-
146
- absolute.path = '/' if absolute.path.empty?
147
-
148
- return absolute
149
- end
150
-
151
- #
152
- # Returns +true+ if *uri* is in the same domain as the page, returns
153
- # +false+ otherwise
154
- #
155
- def in_domain?(uri)
156
- uri.host == @url.host
157
- end
158
- end
159
- end
1
+ require 'anemone/http'
2
+ require 'hpricot'
3
+
4
+ module Anemone
5
+ class Page
6
+ # The URL of the page
7
+ attr_reader :url
8
+ # Array of distinct A tag HREFs from the page
9
+ attr_reader :links
10
+ #Body of the HTTP response
11
+ attr_reader :body
12
+ #Content-type of the HTTP response
13
+ attr_reader :content_type
14
+ #title of the page if it is an HTML document
15
+ attr_reader :title
16
+ #first h1 on the page, if present
17
+ attr_reader :h1
18
+ #first h2 on the page, if present
19
+ attr_reader :h2
20
+ #meta-description of the page, if present
21
+ attr_reader :description
22
+
23
+ # Integer response code of the page
24
+ attr_accessor :code
25
+ # Array of redirect-aliases for the page
26
+ attr_accessor :aliases
27
+ # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
28
+ attr_accessor :visited
29
+ # Used by PageHash#shortest_paths! to store depth of the page
30
+ attr_accessor :depth
31
+
32
+ #
33
+ # Create a new Page from the response of an HTTP request to *url* (a String or URI). If anything in the body raises a StandardError, a Page built from *url* alone is returned.
34
+ #
35
+ def self.fetch(url)
36
+ begin
37
+ url = URI(url) if url.is_a?(String)
38
+
39
+ response, code, location = Anemone::HTTP.get(url)
40
+
41
+ aka = nil
42
+ if !url.eql?(location)  # the request ended at a different URI, so pass it on as the page's alias
43
+ aka = location
44
+ end
45
+
46
+ return Page.new(url, response.body, code, response['Content-Type'], aka)
47
+ rescue  # bare rescue: catches any StandardError raised above
48
+ return Page.new(url)  # no body, code or content type; url is still a String here if URI(url) was what failed
49
+ end
50
+ end
51
+
52
+ #
53
+ # Create a new page for *url*. When *body* is given it is parsed with Hpricot to record the title, first h1, first h2, meta-description and the distinct in-domain links; *aka*, if given, becomes the first alias.
54
+ #
55
+ def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
56
+ @url = url
57
+ @body = body unless Anemone.options.discard_page_bodies  # only storage is skipped; the local body is still parsed below
58
+ @code = code
59
+ @content_type = content_type
60
+ @links = []
61
+ @aliases = []
62
+
63
+ @aliases << aka if !aka.nil?
64
+
65
+ if body
66
+ h = Hpricot(body)
67
+
68
+ # save the inner HTML of the first title element, if any
69
+ title_elem = h.at('title')
70
+ @title = title_elem.inner_html if !title_elem.nil?
71
+
72
+ # save the inner HTML of the first h1 element, if any
73
+ h1_elem = h.at('h1')
74
+ @h1 = h1_elem.inner_html if !h1_elem.nil?
75
+
76
+ # save the inner HTML of the first h2 element, if any
77
+ h2_elem = h.at('h2')
78
+ @h2 = h2_elem.inner_html if !h2_elem.nil?
79
+
80
+ # save the content attribute of the meta description element, if any
81
+ description_elem = h.at('meta[@name=description]')
82
+ @description = description_elem['content'] if !description_elem.nil?
83
+
84
+ # get a list of distinct links on the page, in absolute url form
85
+ h.search('a').each do |a|
86
+ u = a['href']
87
+ next if u.nil?  # anchor without an href
88
+
89
+ begin
90
+ abs = to_absolute(URI(u))
91
+ rescue  # skip any href that cannot be parsed or resolved
92
+ next
93
+ end
94
+
95
+ @links << abs if in_domain?(abs)  # keep only links on the same host as this page
96
+ end
97
+
98
+ @links.uniq!
99
+ end
100
+ end
101
+
102
+
103
+ #
104
+ # Return a clone of this page with its response code set to 200.
105
+ # NOTE(review): the *url* argument is not used anywhere in the body - confirm whether the clone should take it.
106
+ #
107
+ def alias_clone(url)
108
+ p = clone
109
+ p.add_alias!(@aka) if !@aka.nil?  # NOTE(review): @aka is never assigned in this class as shown, so this looks like a no-op - confirm
110
+ p.code = 200
111
+ p
112
+ end
113
+
114
+ #
115
+ # Add redirect-alias *aka* to the list of the page's aliases unless it is already present
116
+ #
117
+ # Returns *self*
118
+ #
119
+ def add_alias!(aka)
120
+ @aliases << aka if !@aliases.include?(aka)  # include? keeps the list free of duplicates
121
+ self
122
+ end
123
+
124
+ #
125
+ # Returns a flat Array of every link from this page, each followed by
126
+ # the redirect-aliases of the page stored for that link in *page_hash*.
127
+ #
128
+ # *page_hash* is a PageHash object with the results of the current crawl.
129
+ #
130
+ def links_and_their_aliases(page_hash)
131
+ @links.inject([]) do |results, link|
132
+ results.concat([link].concat(page_hash[link].aliases))  # assumes every link has a non-nil entry in page_hash - TODO confirm
133
+ end
134
+ end
135
+
136
+ #
137
+ # Returns +true+ if the page is an HTML document, returns +false+
138
+ # otherwise (including when no content type was recorded).
139
+ #
140
+ def html?
141
+ (@content_type =~ /text\/html/) == 0  # true only when the match starts at index 0 of the content type
142
+ end
143
+
144
+ #
145
+ # Returns +true+ if the page is an HTTP redirect, returns +false+
146
+ # otherwise.
147
+ #
148
+ def redirect?
149
+ (300..399).include?(@code)  # any 3xx response code counts as a redirect
150
+ end
151
+
152
+ #
153
+ # Returns +true+ if the page was not found (its response code is 404),
154
+ # returns +false+ otherwise.
155
+ #
156
+ def not_found?
157
+ 404 == @code  # false when @code is nil
158
+ end
159
+
160
+ #
161
+ # Converts URL *link* into an absolute URI by merging it onto the
162
+ # location of the page (@url); an empty resulting path is replaced with '/'
163
+ #
164
+ def to_absolute(link)
165
+ # remove a trailing anchor, then URI-encode the rest. NOTE(review): URI.encode is obsolete and was removed in Ruby 3.0 - confirm target Ruby version
166
+ link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
167
+
168
+ relative = URI(link)
169
+ absolute = @url.merge(relative)
170
+
171
+ absolute.path = '/' if absolute.path.empty?
172
+
173
+ return absolute
174
+ end
175
+
176
+ #
177
+ # Returns +true+ if *uri* has exactly the same host as the page, returns
178
+ # +false+ otherwise
179
+ #
180
+ def in_domain?(uri)
181
+ uri.host == @url.host  # plain equality, so a different subdomain is treated as out of domain
182
+ end
183
+ end
184
+ end
@@ -1,83 +1,83 @@
1
- module Anemone
2
- class PageHash < Hash
3
-
4
- #
5
- # Use a breadth-first search to calculate the single-source
6
- # shortest paths from *root* to all pages in the PageHash
7
- #
8
- def shortest_paths!(root)
9
- root = URI(root) if root.is_a?(String)
10
- raise "Root node not found" if !has_key?(root)
11
-
12
- each_value {|p| p.visited = false if p}
13
-
14
- q = Queue.new
15
-
16
- q.enq(root)
17
- self[root].depth = 0
18
- self[root].visited = true
19
- while(!q.empty?)
20
- url = q.deq
21
-
22
- next if !has_key?(url)
23
-
24
- page = self[url]
25
-
26
- page.links.each do |u|
27
- next if !has_key?(u) or self[u].nil?
28
- link = self[u]
29
- aliases = [link].concat(link.aliases.map {|a| self[a] })
30
-
31
- aliases.each do |node|
32
- if node.depth.nil? or page.depth + 1 < node.depth
33
- node.depth = page.depth + 1
34
- end
35
- end
36
-
37
- q.enq(self[u].url) if !self[u].visited
38
- self[u].visited = true
39
- end
40
- end
41
-
42
- self
43
- end
44
-
45
- #
46
- # Returns a new PageHash by removing redirect-aliases for each
47
- # non-redirect Page
48
- #
49
- def uniq
50
- results = PageHash.new
51
- each do |url, page|
52
- #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
53
- page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
54
- if !page.redirect? and !page_added
55
- results[url] = page.clone
56
- results[url].aliases = []
57
- end
58
- end
59
-
60
- results
61
- end
62
-
63
- #
64
- # Return an Array of Page objects which link to the given url
65
- #
66
- def pages_linking_to url
67
- begin
68
- url = URI(url) if url.is_a?(String)
69
- rescue
70
- return []
71
- end
72
-
73
- values.delete_if { |p| !p.links.include?(url) }
74
- end
75
-
76
- #
77
- # Return an Array of URI objects of Pages linking to the given url
78
- def urls_linking_to url
79
- pages_linking_to(url).map{|p| p.url}
80
- end
81
-
82
- end
1
+ module Anemone
2
+ class PageHash < Hash
3
+
4
+ #
5
+ # Use a breadth-first search to calculate the single-source
6
+ # shortest paths from *root* (a String or URI) to all pages in the PageHash, storing each distance in Page#depth. Raises if *root* is not a key; returns *self*.
7
+ #
8
+ def shortest_paths!(root)
9
+ root = URI(root) if root.is_a?(String)
10
+ raise "Root node not found" if !has_key?(root)
11
+
12
+ each_value {|p| p.visited = false if p}  # resets visited only; depth values are not cleared here
13
+
14
+ q = Queue.new
15
+
16
+ q.enq(root)
17
+ self[root].depth = 0
18
+ self[root].visited = true
19
+ while(!q.empty?)
20
+ url = q.deq
21
+
22
+ next if !has_key?(url)
23
+
24
+ page = self[url]
25
+
26
+ page.links.each do |u|
27
+ next if !has_key?(u) or self[u].nil?  # skip links with no stored page
28
+ link = self[u]
29
+ aliases = [link].concat(link.aliases.map {|a| self[a] })  # NOTE(review): self[a] is nil for an alias that is not a key, and node.depth below would then raise - confirm
30
+
31
+ aliases.each do |node|
32
+ if node.depth.nil? or page.depth + 1 < node.depth  # keep the smallest depth seen so far
33
+ node.depth = page.depth + 1
34
+ end
35
+ end
36
+
37
+ q.enq(self[u].url) if !self[u].visited  # enqueue each page at most once
38
+ self[u].visited = true
39
+ end
40
+ end
41
+
42
+ self
43
+ end
44
+
45
+ #
46
+ # Returns a new PageHash holding a clone of each non-redirect Page whose
47
+ # aliases are not already keys in the result; each clone gets an empty aliases list
48
+ #
49
+ def uniq
50
+ results = PageHash.new
51
+ each do |url, page|
52
+ # if none of the aliases of this page have been added, and this isn't a redirect page, add this page
53
+ page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}  # true if any alias is already a key in results
54
+ if !page.redirect? and !page_added
55
+ results[url] = page.clone
56
+ results[url].aliases = []  # assigns a new array on the clone only
57
+ end
58
+ end
59
+
60
+ results
61
+ end
62
+
63
+ #
64
+ # Return an Array of Page objects which link to the given url (a String or URI); returns [] if converting a String url raises
65
+ #
66
+ def pages_linking_to url
67
+ begin
68
+ url = URI(url) if url.is_a?(String)
69
+ rescue
70
+ return []
71
+ end
72
+
73
+ values.delete_if { |p| !p.links.include?(url) }  # values is a new Array, so the hash is untouched; assumes no nil values - TODO confirm
74
+ end
75
+
76
+ #
77
+ # Return an Array holding the url of each Page that links to the given url (found via pages_linking_to)
78
+ def urls_linking_to url
79
+ pages_linking_to(url).map{|p| p.url}
80
+ end
81
+
82
+ end
83
83
  end