webpage 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/webpage.rb +157 -126
  2. metadata +1 -1
data/webpage.rb CHANGED
@@ -3,9 +3,11 @@ require 'pp'
3
3
  require 'mechanize'
4
4
  require 'uri'
5
5
  class Webpage
6
- attr_reader:links,:successful,:related_uris
6
+ attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
7
+ attr_accessor :ignored_exts
7
8
  def initialize(uri)
8
- @uri = URI.parse(uri_encode(uri))
9
+ @links = Array.new
10
+ @relative_paths = Array.new
9
11
  @outbound_links = Array.new
10
12
  @internal_outbound_links = Array.new
11
13
  @external_outbound_links = Array.new
@@ -15,54 +17,56 @@ class Webpage
15
17
  @internal_inbound_links = Array.new
16
18
  @external_inbound_links = Array.new
17
19
  @internal_links = Array.new
18
- @links = Array.new
19
- @uri_dirname = File.dirname(@uri.path)
20
- @uri_domain = host_to_domain @uri.host
20
+ @invalid_links = Array.new
21
21
  @accessed_uri = Array.new
22
- @page = ''
23
22
  @related_uris = Array.new
24
23
  @successful = false
25
24
  begin
25
+ @uri = URI.parse(uri)
26
+ raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
27
+ @domain = Webpage.host_to_domain @uri.host
26
28
  agent = Mechanize.new
27
- agent.open_timeout = 5
29
+ agent.open_timeout = 3
28
30
  @page = agent.get @uri.to_s
31
+ raise 'not webpage' unless @page.class == Mechanize::Page
29
32
  @page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
30
- @page.links.each do |link| #1
31
- next if link.href.nil?
32
- uri = uri_encode(link.href.strip)
33
- begin
34
- @links << @uri.merge(uri).to_s
35
- rescue URI::InvalidURIError,URI::InvalidComponentError
36
- warn "ignore\nparsed: #{uri} \noriginal: #{link.href}"
37
- end
38
- end
39
33
  @successful = true
40
34
  rescue Exception => e
41
- warn "#{e}:#{@uri}"
35
+ warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
42
36
  end
43
- @links.uniq!
44
- scan_links
45
37
  end
46
38
 
47
39
  def encoding
48
40
  return @page.encoding
49
41
  end
42
+
50
43
  def keywords
51
- return @page.search("//meta[@name='keywords']").first.attributes["content"].value.split(',')
44
+ meta = @page.search("//meta[@name='keywords']").first
45
+ return meta.attributes["content"].value.split(',') unless meta.nil?
46
+ end
47
+
48
+ def description
49
+ meta = @page.search("//meta[@name='description']").first
50
+ if meta.nil?
51
+ return false
52
+ end
53
+ return meta.atrributes['content'].value
52
54
  end
53
55
 
54
56
  def body
55
- return @page.body unless @page.body.include?'<html>'
56
- return String.new
57
+ return @page.body
58
+ #(return @page.body unless @page.body.include?'<html>') if @successful
59
+ #return String.new
57
60
  end
58
61
 
59
62
  def text
60
- return body.gsub(/<\/?[^>]*>/, "")
63
+ return Nokogiri::HTML(body).xpath("//text()").text
64
+ #return body.gsub(/<\/?[^>]*>/, "")
61
65
  end
62
66
 
63
67
  def title
64
68
  return @page.title unless @page.title.nil?
65
- return String.new
69
+ return false
66
70
  end
67
71
 
68
72
 
@@ -70,6 +74,7 @@ class Webpage
70
74
  #1.$all = get all <a>
71
75
  #2.$href = get all href from $all
72
76
  #3.make all $href to be absolute path and put to @links
77
+ =begin
73
78
  def links
74
79
  return @links unless @links.empty?
75
80
  begin
@@ -78,7 +83,7 @@ class Webpage
78
83
  agent.get @uri do |page|
79
84
  page.links.each do |link| #1
80
85
  next if link.href.nil?
81
- uri = uri_encode(link.href.strip)
86
+ uri = Webpage.uri_normalize(link.href)
82
87
  begin
83
88
  @links << @uri.merge(uri).to_s
84
89
  rescue URI::InvalidURIError,URI::InvalidComponentError
@@ -101,29 +106,42 @@ class Webpage
101
106
  #@links = @links.uniq - @accessed_uri
102
107
  #@accessed_uri += @links
103
108
  @links.uniq!
104
- puts @links
105
109
  scan_links
106
110
  return @links
107
111
  end
112
+ =end
108
113
 
109
- def internal_links
110
- return @internal_links unless @internal_links.empty?
114
+ def report
111
115
  scan_links
112
- return @internal_links
116
+ scan_outbound_links
117
+ scan_inbound_links
118
+ report = {
119
+ :internal_links => @internal_links,
120
+ :internal_outbound_links => @internal_outbound_links,
121
+ :outbound_links => @outbound_links,
122
+ :broken_outbound_links => @broken_outbound_links,
123
+ :external_inbound_links => @external_inbound_links,
124
+ :internal_inbound_links => @internal_inbound_links,
125
+ :external_outbound_links => @external_outbound_links,
126
+ :related_uris => @related_uris,
127
+ :invalid_links => @invalid_links
128
+ }
113
129
  end
130
+
131
+
132
+ =begin
114
133
  def external_outbound_links
115
134
  return @external_outbound_links unless @external_outbound_links.empty?
116
135
  links
117
136
  return @external_outbound_links
118
137
  end
138
+
119
139
  def internal_outbound_links
120
140
  return @internal_outbound_links unless @internal_outbound_links.empty?
121
141
  links
122
142
  return @internal_outbound_links
123
143
  end
124
- def outbound_links
125
- return external_outbound_links + internal_outbound_links
126
- end
144
+
127
145
  def back_links#inbound links among all the outbound links
128
146
  return @back_links unless @back_links.empty?
129
147
  scan_outbound_links
@@ -147,23 +165,27 @@ class Webpage
147
165
  scan_inbound_links
148
166
  return @internal_inbound_links
149
167
  end
150
-
168
+ =end
151
169
  def pagerank
152
- return @pagerank unless @pagerank
153
- require 'PageRankr'
154
- @pagerank = PageRankr.ranks(@uri.to_s, :google)
170
+ return @pagerank unless @pagerank.nil?
171
+ require 'page_rankr'
172
+ @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
155
173
  return @pagerank
156
174
  end
157
175
 
158
176
  def ppl#pagerank per link
159
- return (@pagerank / links.count)
177
+ pagerank
178
+ return false if @pagerank.nil?
179
+ scan_links
180
+ return (@pagerank / @links.size)
160
181
  end
161
-
162
- def related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,related=Array.new)
182
+ def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
183
+ scan_links
163
184
  raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
164
- related_keywords.concat(keywords).uniq!
165
- seed_uris.concat(external_outbound_links).uniq!
166
- while seed_uris.size>0
185
+ seed_uris.concat(@external_outbound_links - checked_uris)
186
+ related_keywords.concat(keywords)
187
+ result = Array.new
188
+ while seed_uris.size > 0 and result.size < max
167
189
  uri = seed_uris.first
168
190
  checked_uris << uri unless checked_uris.include?uri
169
191
  seed_uris.delete(uri)
@@ -172,63 +194,84 @@ class Webpage
172
194
  text = w.body + w.title
173
195
  related_keywords.each do |word|
174
196
  if text.include?word
175
- related << uri
176
- seed_uris.concat(w.external_outbound_links).uniq!
197
+ #result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
198
+ domain = Webpage.host_to_domain(URI.parse(uri).host)
199
+ result << domain unless result.include? domain
200
+ seed_uris.concat(w.external_outbound_links - checked_uris)
177
201
  break
178
202
  end
179
203
  end
180
204
  end
181
- return related
205
+ return result
182
206
  end
183
-
184
- private
185
- def scan_links
186
- @links.each do |a|
187
- begin
188
- uri = URI.parse(uri_encode(a))
189
- rescue URI::InvalidURIError =>e
190
- puts "#{e}:#{uri}"
191
- next
207
+
208
+ def link_to(target_uri)
209
+ scan_links
210
+ target_uri = Webpage.uri_normalize(target_uri)
211
+ target_host = URI.parse(target_uri).host
212
+ target_domain = Webpage.host_to_domain(target_host)
213
+ type = 0 #not link to
214
+ @links.each do |link|
215
+ candidate_host = URI.parse(link).host
216
+ if link == target_uri
217
+ type = 3 #definitely link to
218
+ break
219
+ elsif URI.parse(link).host == target_host
220
+ type = 2 if type < 2 #link to the host
221
+ elsif Webpage.host_to_domain(candidate_host) == target_domain
222
+ type = 1 if type < 1 #link to the root domain
192
223
  end
193
- next if uri.host.nil?
194
- if uri.host.end_with?@uri_domain
195
- @internal_links << a
196
- elsif uri.scheme.start_with?'http'
197
- if host_to_domain(uri.host) == @uri_domain
198
- @internal_outbound_links << uri.to_s
199
- else
200
- @external_outbound_links << uri.to_s
201
- end
202
- #@outbound_links << a
224
+ end
225
+ return type
226
+ end
227
+
228
+ def self.uri_normalize(uri)
229
+ uri = URI.parse(uri).normalize
230
+ fragment = uri.fragment
231
+ return uri.to_s.delete("##{fragment}")
232
+ #uri = uri.to_s.strip.sub(/\#.*$/,'')
233
+ #uri.path = '/' if uri.path.nil?
234
+ end
235
+
236
+ def self.host_to_domain(host)
237
+ domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
238
+ return domain[1] unless domain.nil?
239
+ return false
240
+ end
241
+
242
+ def self.uri_encode(str)
243
+ return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
244
+ end
245
+
246
+ def scan_inbound_links
247
+ scan_links
248
+ @back_links.each do |inlink|
249
+ inlink = URI.parse inlink
250
+ if @domain == Webpage.host_to_domain(inlink.host)
251
+ @internal_inbound_links << inlink.to_s
252
+ else
253
+ @external_inbound_links << inlink.to_s
203
254
  end
204
255
  end
205
- @back_links.uniq!
206
- #@outbound_links.uniq!
207
- @internal_outbound_links.uniq!
208
- @external_outbound_links.uniq!
256
+ @internal_inbound_links.uniq!
257
+ @external_inbound_links.uniq!
209
258
  end
210
259
 
211
260
  def scan_outbound_links
212
- outbound_links.each do |outlink|
213
- begin
214
- w = Webpage.new(outlink)
215
- rescue URI::InvalidURIError
216
- warn "bad uri:#{outlink}"
261
+ @outbound_links.each do |outlink|
262
+ w = Webpage.new(outlink)
263
+ unless w.successful
264
+ @invalid_links << outlink
217
265
  next
218
266
  end
219
267
  next if w.links.nil?
220
268
  w.links.each do |uri|
221
- next unless uri.start_with?'http'
222
- begin
223
- uri = URI.parse(uri_encode(uri))
224
- next if uri.host.nil?
225
- if uri.host.end_with?@uri_domain
226
- @back_links << uri.to_s
227
- else
228
- @broken_outbound_links << uri.to_s
229
- end
230
- rescue URI::InvalidURIError
231
- warn "bad uri:#{uri}"
269
+ #uri = URI.parse(uri)
270
+ #next if uri.host.nil?
271
+ if Webpage.host_to_domain(uri) == @domain
272
+ @back_links << uri.to_s
273
+ else
274
+ @broken_outbound_links << uri.to_s
232
275
  end
233
276
  end
234
277
  end
@@ -236,50 +279,38 @@ class Webpage
236
279
  @broken_outbound_links.uniq!
237
280
  end
238
281
 
239
- def scan_inbound_links
240
- back_links.each do |inlink|
241
- inlink = URI.parse inlink
242
- if @uri_domain == host_to_domain(inlink.host)
243
- @internal_inbound_links << inlink.to_s
282
+ def scan_links
283
+ return unless @links.empty?
284
+ exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
285
+ @page.links.each do |link|
286
+ #初步解析
287
+ begin
288
+ uri = URI.parse(link.href)
289
+ href = uri.to_s
290
+ rescue URI::InvalidURIError => e
291
+ @invalid_links << href
292
+ next
293
+ end
294
+ #忽略非网页文件
295
+ if exts_to_ignored.include?href[-4,4]
296
+ @ignored_uris << href
297
+ next
298
+ end
299
+ #处理相对路径
300
+ if uri.relative?
301
+ @relative_paths << href
302
+ href = @uri.merge(href).to_s
303
+ @internal_outbound_links << href
244
304
  else
245
- @external_inbound_links << inlink.to_s
305
+ href = Webpage.uri_normalize(href)
306
+ if Webpage.host_to_domain(uri.host) == @domain
307
+ @internal_outbound_links << href
308
+ else
309
+ @external_outbound_links << href
310
+ end
246
311
  end
312
+ @links << href
247
313
  end
248
- @internal_inbound_links.uniq!
249
- @external_inbound_links.uniq!
314
+ @outbound_links = @internal_outbound_links.uniq! + @external_outbound_links.uniq!
250
315
  end
251
-
252
- def uri_encode(str)
253
- return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
254
- end
255
- def host_to_domain(host)
256
- domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
257
- return domain[1] unless domain.nil?
258
- return false
259
- end
260
- end
261
-
262
- w = Webpage.new('http://cidian.youdao.com')
263
- #puts w.external_outbound_links
264
- related_keywords = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
265
- puts w.related_uris(related_keywords)
266
- exit
267
- require 'yaml'
268
- filename = './cidian.yaml'
269
- if File.exists?(filename)
270
- cached_cidian = YAML.load(File.read(filename))
271
- else
272
- cached_cidian = Hash.new
273
- cached_cidian[:seed_uris] = Array.new
274
- cached_cidian[:checked_uris] = Array.new
275
- cached_cidian[:related_keywords] = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
276
- cached_cidian[:related_uris] = Array.new
277
- end
278
- at_exit do
279
- File.open(filename,'w'){|f|f.puts(cached_cidian.to_yaml)}
280
316
  end
281
- #puts w.related_uris(cached_cidian[:seed_uris],cached_cidian[:related_keywords],cached_cidian[:checked_uris],cached_cidian[:related_uris])
282
- puts w.external_inbound_links
283
- puts w.internal_inbound_links
284
- puts w.pagerank
285
- puts w.external_inbound_links
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: