anemone 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/anemone/http.rb CHANGED
@@ -1,37 +1,37 @@
1
- require 'net/http'
2
-
3
- module Anemone
4
- class HTTP < Net::HTTP
5
- # Maximum number of redirects to follow on each get_response
6
- REDIRECTION_LIMIT = 5
7
-
8
- #
9
- # Retrieve an HTTP response for *url*, following redirects.
10
- # Returns the response object, response code, and final URI location.
11
- #
12
- def self.get(url)
13
- response = get_response(url)
14
- code = Integer(response.code)
15
- loc = url
16
-
17
- limit = REDIRECTION_LIMIT
18
- while response.is_a?(Net::HTTPRedirection) and limit > 0
19
- loc = URI(response['location'])
20
- loc = url.merge(loc) if loc.relative?
21
- response = get_response(loc)
22
- limit -= 1
23
- end
24
-
25
- return response, code, loc
26
- end
27
-
28
- #
29
- # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
30
- #
31
- def self.get_response(url)
32
- Net::HTTP.start(url.host, url.port) do |http|
33
- return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
34
- end
35
- end
36
- end
1
+ require 'net/http'
2
+
3
+ module Anemone
4
+ class HTTP < Net::HTTP
5
+ # Maximum number of redirects to follow on each get_response
6
+ REDIRECTION_LIMIT = 5
7
+
8
+ #
9
+ # Retrieve an HTTP response for *url*, following redirects.
10
+ # Returns the response object, response code, and final URI location.
11
+ #
12
+ def self.get(url)
13
+ response = get_response(url)
14
+ code = Integer(response.code)
15
+ loc = url
16
+
17
+ limit = REDIRECTION_LIMIT
18
+ while response.is_a?(Net::HTTPRedirection) and limit > 0
19
+ loc = URI(response['location'])
20
+ loc = url.merge(loc) if loc.relative?
21
+ response = get_response(loc)
22
+ limit -= 1
23
+ end
24
+
25
+ return response, code, loc
26
+ end
27
+
28
+ #
29
+ # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
30
+ #
31
+ def self.get_response(url)
32
+ Net::HTTP.start(url.host, url.port) do |http|
33
+ return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
34
+ end
35
+ end
36
+ end
37
37
  end
data/lib/anemone/page.rb CHANGED
@@ -1,159 +1,184 @@
1
- require 'anemone/http'
2
- require 'hpricot'
3
-
4
- module Anemone
5
- class Page
6
- # The URL of the page
7
- attr_reader :url
8
- # Array of distinct A tag HREFs from the page
9
- attr_reader :links
10
- #Body of the HTTP response
11
- attr_reader :body
12
- #Content-type of the HTTP response
13
- attr_reader :content_type
14
-
15
- # Integer response code of the page
16
- attr_accessor :code
17
- # Array of redirect-aliases for the page
18
- attr_accessor :aliases
19
- # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
20
- attr_accessor :visited
21
- # Used by PageHash#shortest_paths! to store depth of the page
22
- attr_accessor :depth
23
-
24
- #
25
- # Create a new Page from the response of an HTTP request to *url*
26
- #
27
- def self.fetch(url)
28
- begin
29
- url = URI(url) if url.is_a?(String)
30
-
31
- response, code, location = Anemone::HTTP.get(url)
32
-
33
- aka = nil
34
- if !url.eql?(location)
35
- aka = location
36
- end
37
-
38
- return Page.new(url, response.body, code, response['Content-Type'], aka)
39
- rescue
40
- return Page.new(url)
41
- end
42
- end
43
-
44
- #
45
- # Create a new page
46
- #
47
- def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
48
- @url = url
49
- @body = body unless Anemone.options.discard_page_bodies
50
- @code = code
51
- @content_type = content_type
52
- @links = []
53
- @aliases = []
54
-
55
- @aliases << aka if !aka.nil?
56
-
57
- #get a list of distinct links on the page, in absolute url form
58
- if body
59
- Hpricot(body).search('a').each do |a|
60
- u = a['href']
61
- next if u.nil?
62
-
63
- begin
64
- u = URI(u)
65
- rescue
66
- next
67
- end
68
-
69
- abs = to_absolute(u)
70
- @links << abs if in_domain?(abs)
71
- end
72
-
73
- @links.uniq!
74
- end
75
- end
76
-
77
-
78
- #
79
- # Return a new page with the same *response* and *url*, but
80
- # with a 200 response code
81
- #
82
- def alias_clone(url)
83
- p = clone
84
- p.add_alias!(@aka) if !@aka.nil?
85
- p.code = 200
86
- p
87
- end
88
-
89
- #
90
- # Add a redirect-alias String *aka* to the list of the page's aliases
91
- #
92
- # Returns *self*
93
- #
94
- def add_alias!(aka)
95
- @aliases << aka if !@aliases.include?(aka)
96
- self
97
- end
98
-
99
- #
100
- # Returns an Array of all links from this page, and all the
101
- # redirect-aliases of those pages, as String objects.
102
- #
103
- # *page_hash* is a PageHash object with the results of the current crawl.
104
- #
105
- def links_and_their_aliases(page_hash)
106
- @links.inject([]) do |results, link|
107
- results.concat([link].concat(page_hash[link].aliases))
108
- end
109
- end
110
-
111
- #
112
- # Returns +true+ if the page is a HTML document, returns +false+
113
- # otherwise.
114
- #
115
- def html?
116
- (@content_type =~ /text\/html/) == 0
117
- end
118
-
119
- #
120
- # Returns +true+ if the page is a HTTP redirect, returns +false+
121
- # otherwise.
122
- #
123
- def redirect?
124
- (300..399).include?(@code)
125
- end
126
-
127
- #
128
- # Returns +true+ if the page was not found (returned 404 code),
129
- # returns +false+ otherwise.
130
- #
131
- def not_found?
132
- 404 == @code
133
- end
134
-
135
- #
136
- # Converts relative URL *link* into an absolute URL based on the
137
- # location of the page
138
- #
139
- def to_absolute(link)
140
- # remove anchor
141
- link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
142
-
143
- relative = URI(link)
144
- absolute = @url.merge(relative)
145
-
146
- absolute.path = '/' if absolute.path.empty?
147
-
148
- return absolute
149
- end
150
-
151
- #
152
- # Returns +true+ if *uri* is in the same domain as the page, returns
153
- # +false+ otherwise
154
- #
155
- def in_domain?(uri)
156
- uri.host == @url.host
157
- end
158
- end
159
- end
1
+ require 'anemone/http'
2
+ require 'hpricot'
3
+
4
+ module Anemone
5
+ class Page
6
+ # The URL of the page
7
+ attr_reader :url
8
+ # Array of distinct A tag HREFs from the page
9
+ attr_reader :links
10
+ #Body of the HTTP response
11
+ attr_reader :body
12
+ #Content-type of the HTTP response
13
+ attr_reader :content_type
14
+ #title of the page if it is an HTML document
15
+ attr_reader :title
16
+ #first h1 on the page, if present
17
+ attr_reader :h1
18
+ #first h2 on the page, if present
19
+ attr_reader :h2
20
+ #meta-description of the page, if present
21
+ attr_reader :description
22
+
23
+ # Integer response code of the page
24
+ attr_accessor :code
25
+ # Array of redirect-aliases for the page
26
+ attr_accessor :aliases
27
+ # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
28
+ attr_accessor :visited
29
+ # Used by PageHash#shortest_paths! to store depth of the page
30
+ attr_accessor :depth
31
+
32
+ #
33
+ # Create a new Page from the response of an HTTP request to *url*
34
+ #
35
+ def self.fetch(url)
36
+ begin
37
+ url = URI(url) if url.is_a?(String)
38
+
39
+ response, code, location = Anemone::HTTP.get(url)
40
+
41
+ aka = nil
42
+ if !url.eql?(location)
43
+ aka = location
44
+ end
45
+
46
+ return Page.new(url, response.body, code, response['Content-Type'], aka)
47
+ rescue
48
+ return Page.new(url)
49
+ end
50
+ end
51
+
52
+ #
53
+ # Create a new page
54
+ #
55
+ def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
56
+ @url = url
57
+ @body = body unless Anemone.options.discard_page_bodies
58
+ @code = code
59
+ @content_type = content_type
60
+ @links = []
61
+ @aliases = []
62
+
63
+ @aliases << aka if !aka.nil?
64
+
65
+ if body
66
+ h = Hpricot(body)
67
+
68
+ #save page title
69
+ title_elem = h.at('title')
70
+ @title = title_elem.inner_html if !title_elem.nil?
71
+
72
+ #save page h1
73
+ h1_elem = h.at('h1')
74
+ @h1 = h1_elem.inner_html if !h1_elem.nil?
75
+
76
+ #save page h2
77
+ h2_elem = h.at('h2')
78
+ @h2 = h2_elem.inner_html if !h2_elem.nil?
79
+
80
+ #save page meta-description
81
+ description_elem = h.at('meta[@name=description]')
82
+ @description = description_elem['content'] if !description_elem.nil?
83
+
84
+ #get a list of distinct links on the page, in absolute url form
85
+ h.search('a').each do |a|
86
+ u = a['href']
87
+ next if u.nil?
88
+
89
+ begin
90
+ abs = to_absolute(URI(u))
91
+ rescue
92
+ next
93
+ end
94
+
95
+ @links << abs if in_domain?(abs)
96
+ end
97
+
98
+ @links.uniq!
99
+ end
100
+ end
101
+
102
+
103
+ #
104
+ # Return a new page with the same *response* and *url*, but
105
+ # with a 200 response code
106
+ #
107
+ def alias_clone(url)
108
+ p = clone
109
+ p.add_alias!(@aka) if !@aka.nil?
110
+ p.code = 200
111
+ p
112
+ end
113
+
114
+ #
115
+ # Add a redirect-alias String *aka* to the list of the page's aliases
116
+ #
117
+ # Returns *self*
118
+ #
119
+ def add_alias!(aka)
120
+ @aliases << aka if !@aliases.include?(aka)
121
+ self
122
+ end
123
+
124
+ #
125
+ # Returns an Array of all links from this page, and all the
126
+ # redirect-aliases of those pages, as String objects.
127
+ #
128
+ # *page_hash* is a PageHash object with the results of the current crawl.
129
+ #
130
+ def links_and_their_aliases(page_hash)
131
+ @links.inject([]) do |results, link|
132
+ results.concat([link].concat(page_hash[link].aliases))
133
+ end
134
+ end
135
+
136
+ #
137
+ # Returns +true+ if the page is a HTML document, returns +false+
138
+ # otherwise.
139
+ #
140
+ def html?
141
+ (@content_type =~ /text\/html/) == 0
142
+ end
143
+
144
+ #
145
+ # Returns +true+ if the page is a HTTP redirect, returns +false+
146
+ # otherwise.
147
+ #
148
+ def redirect?
149
+ (300..399).include?(@code)
150
+ end
151
+
152
+ #
153
+ # Returns +true+ if the page was not found (returned 404 code),
154
+ # returns +false+ otherwise.
155
+ #
156
+ def not_found?
157
+ 404 == @code
158
+ end
159
+
160
+ #
161
+ # Converts relative URL *link* into an absolute URL based on the
162
+ # location of the page
163
+ #
164
+ def to_absolute(link)
165
+ # remove anchor
166
+ link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
167
+
168
+ relative = URI(link)
169
+ absolute = @url.merge(relative)
170
+
171
+ absolute.path = '/' if absolute.path.empty?
172
+
173
+ return absolute
174
+ end
175
+
176
+ #
177
+ # Returns +true+ if *uri* is in the same domain as the page, returns
178
+ # +false+ otherwise
179
+ #
180
+ def in_domain?(uri)
181
+ uri.host == @url.host
182
+ end
183
+ end
184
+ end
data/lib/anemone/page_hash.rb CHANGED
@@ -1,83 +1,83 @@
1
- module Anemone
2
- class PageHash < Hash
3
-
4
- #
5
- # Use a breadth-first search to calculate the single-source
6
- # shortest paths from *root* to all pages in the PageHash
7
- #
8
- def shortest_paths!(root)
9
- root = URI(root) if root.is_a?(String)
10
- raise "Root node not found" if !has_key?(root)
11
-
12
- each_value {|p| p.visited = false if p}
13
-
14
- q = Queue.new
15
-
16
- q.enq(root)
17
- self[root].depth = 0
18
- self[root].visited = true
19
- while(!q.empty?)
20
- url = q.deq
21
-
22
- next if !has_key?(url)
23
-
24
- page = self[url]
25
-
26
- page.links.each do |u|
27
- next if !has_key?(u) or self[u].nil?
28
- link = self[u]
29
- aliases = [link].concat(link.aliases.map {|a| self[a] })
30
-
31
- aliases.each do |node|
32
- if node.depth.nil? or page.depth + 1 < node.depth
33
- node.depth = page.depth + 1
34
- end
35
- end
36
-
37
- q.enq(self[u].url) if !self[u].visited
38
- self[u].visited = true
39
- end
40
- end
41
-
42
- self
43
- end
44
-
45
- #
46
- # Returns a new PageHash by removing redirect-aliases for each
47
- # non-redirect Page
48
- #
49
- def uniq
50
- results = PageHash.new
51
- each do |url, page|
52
- #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
53
- page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
54
- if !page.redirect? and !page_added
55
- results[url] = page.clone
56
- results[url].aliases = []
57
- end
58
- end
59
-
60
- results
61
- end
62
-
63
- #
64
- # Return an Array of Page objects which link to the given url
65
- #
66
- def pages_linking_to url
67
- begin
68
- url = URI(url) if url.is_a?(String)
69
- rescue
70
- return []
71
- end
72
-
73
- values.delete_if { |p| !p.links.include?(url) }
74
- end
75
-
76
- #
77
- # Return an Array of URI objects of Pages linking to the given url
78
- def urls_linking_to url
79
- pages_linking_to(url).map{|p| p.url}
80
- end
81
-
82
- end
1
+ module Anemone
2
+ class PageHash < Hash
3
+
4
+ #
5
+ # Use a breadth-first search to calculate the single-source
6
+ # shortest paths from *root* to all pages in the PageHash
7
+ #
8
+ def shortest_paths!(root)
9
+ root = URI(root) if root.is_a?(String)
10
+ raise "Root node not found" if !has_key?(root)
11
+
12
+ each_value {|p| p.visited = false if p}
13
+
14
+ q = Queue.new
15
+
16
+ q.enq(root)
17
+ self[root].depth = 0
18
+ self[root].visited = true
19
+ while(!q.empty?)
20
+ url = q.deq
21
+
22
+ next if !has_key?(url)
23
+
24
+ page = self[url]
25
+
26
+ page.links.each do |u|
27
+ next if !has_key?(u) or self[u].nil?
28
+ link = self[u]
29
+ aliases = [link].concat(link.aliases.map {|a| self[a] })
30
+
31
+ aliases.each do |node|
32
+ if node.depth.nil? or page.depth + 1 < node.depth
33
+ node.depth = page.depth + 1
34
+ end
35
+ end
36
+
37
+ q.enq(self[u].url) if !self[u].visited
38
+ self[u].visited = true
39
+ end
40
+ end
41
+
42
+ self
43
+ end
44
+
45
+ #
46
+ # Returns a new PageHash by removing redirect-aliases for each
47
+ # non-redirect Page
48
+ #
49
+ def uniq
50
+ results = PageHash.new
51
+ each do |url, page|
52
+ #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
53
+ page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
54
+ if !page.redirect? and !page_added
55
+ results[url] = page.clone
56
+ results[url].aliases = []
57
+ end
58
+ end
59
+
60
+ results
61
+ end
62
+
63
+ #
64
+ # Return an Array of Page objects which link to the given url
65
+ #
66
+ def pages_linking_to url
67
+ begin
68
+ url = URI(url) if url.is_a?(String)
69
+ rescue
70
+ return []
71
+ end
72
+
73
+ values.delete_if { |p| !p.links.include?(url) }
74
+ end
75
+
76
+ #
77
+ # Return an Array of URI objects of Pages linking to the given url
78
+ def urls_linking_to url
79
+ pages_linking_to(url).map{|p| p.url}
80
+ end
81
+
82
+ end
83
83
  end