webpage 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/webpage.rb +157 -126
  2. metadata +1 -1
data/webpage.rb CHANGED
@@ -3,9 +3,11 @@ require 'pp'
3
3
  require 'mechanize'
4
4
  require 'uri'
5
5
  class Webpage
6
- attr_reader:links,:successful,:related_uris
6
+ attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
7
+ attr_accessor :ignored_exts
7
8
  def initialize(uri)
8
- @uri = URI.parse(uri_encode(uri))
9
+ @links = Array.new
10
+ @relative_paths = Array.new
9
11
  @outbound_links = Array.new
10
12
  @internal_outbound_links = Array.new
11
13
  @external_outbound_links = Array.new
@@ -15,54 +17,56 @@ class Webpage
15
17
  @internal_inbound_links = Array.new
16
18
  @external_inbound_links = Array.new
17
19
  @internal_links = Array.new
18
- @links = Array.new
19
- @uri_dirname = File.dirname(@uri.path)
20
- @uri_domain = host_to_domain @uri.host
20
+ @invalid_links = Array.new
21
21
  @accessed_uri = Array.new
22
- @page = ''
23
22
  @related_uris = Array.new
24
23
  @successful = false
25
24
  begin
25
+ @uri = URI.parse(uri)
26
+ raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
27
+ @domain = Webpage.host_to_domain @uri.host
26
28
  agent = Mechanize.new
27
- agent.open_timeout = 5
29
+ agent.open_timeout = 3
28
30
  @page = agent.get @uri.to_s
31
+ raise 'not webpage' unless @page.class == Mechanize::Page
29
32
  @page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
30
- @page.links.each do |link| #1
31
- next if link.href.nil?
32
- uri = uri_encode(link.href.strip)
33
- begin
34
- @links << @uri.merge(uri).to_s
35
- rescue URI::InvalidURIError,URI::InvalidComponentError
36
- warn "ignore\nparsed: #{uri} \noriginal: #{link.href}"
37
- end
38
- end
39
33
  @successful = true
40
34
  rescue Exception => e
41
- warn "#{e}:#{@uri}"
35
+ warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
42
36
  end
43
- @links.uniq!
44
- scan_links
45
37
  end
46
38
 
47
39
  def encoding
48
40
  return @page.encoding
49
41
  end
42
+
50
43
  def keywords
51
- return @page.search("//meta[@name='keywords']").first.attributes["content"].value.split(',')
44
+ meta = @page.search("//meta[@name='keywords']").first
45
+ return meta.attributes["content"].value.split(',') unless meta.nil?
46
+ end
47
+
48
+ def description
49
+ meta = @page.search("//meta[@name='description']").first
50
+ if meta.nil?
51
+ return false
52
+ end
53
+ return meta.atrributes['content'].value
52
54
  end
53
55
 
54
56
  def body
55
- return @page.body unless @page.body.include?'<html>'
56
- return String.new
57
+ return @page.body
58
+ #(return @page.body unless @page.body.include?'<html>') if @successful
59
+ #return String.new
57
60
  end
58
61
 
59
62
  def text
60
- return body.gsub(/<\/?[^>]*>/, "")
63
+ return Nokogiri::HTML(body).xpath("//text()").text
64
+ #return body.gsub(/<\/?[^>]*>/, "")
61
65
  end
62
66
 
63
67
  def title
64
68
  return @page.title unless @page.title.nil?
65
- return String.new
69
+ return false
66
70
  end
67
71
 
68
72
 
@@ -70,6 +74,7 @@ class Webpage
70
74
  #1.$all = get all <a>
71
75
  #2.$href = get all href from $all
72
76
  #3.make all $href to be absolute path and put to @links
77
+ =begin
73
78
  def links
74
79
  return @links unless @links.empty?
75
80
  begin
@@ -78,7 +83,7 @@ class Webpage
78
83
  agent.get @uri do |page|
79
84
  page.links.each do |link| #1
80
85
  next if link.href.nil?
81
- uri = uri_encode(link.href.strip)
86
+ uri = Webpage.uri_normalize(link.href)
82
87
  begin
83
88
  @links << @uri.merge(uri).to_s
84
89
  rescue URI::InvalidURIError,URI::InvalidComponentError
@@ -101,29 +106,42 @@ class Webpage
101
106
  #@links = @links.uniq - @accessed_uri
102
107
  #@accessed_uri += @links
103
108
  @links.uniq!
104
- puts @links
105
109
  scan_links
106
110
  return @links
107
111
  end
112
+ =end
108
113
 
109
- def internal_links
110
- return @internal_links unless @internal_links.empty?
114
+ def report
111
115
  scan_links
112
- return @internal_links
116
+ scan_outbound_links
117
+ scan_inbound_links
118
+ report = {
119
+ :internal_links => @internal_links,
120
+ :internal_outbound_links => @internal_outbound_links,
121
+ :outbound_links => @outbound_links,
122
+ :broken_outbound_links => @broken_outbound_links,
123
+ :external_inbound_links => @external_inbound_links,
124
+ :internal_inbound_links => @internal_inbound_links,
125
+ :external_outbound_links => @external_outbound_links,
126
+ :related_uris => @related_uris,
127
+ :invalid_links => @invalid_links
128
+ }
113
129
  end
130
+
131
+
132
+ =begin
114
133
  def external_outbound_links
115
134
  return @external_outbound_links unless @external_outbound_links.empty?
116
135
  links
117
136
  return @external_outbound_links
118
137
  end
138
+
119
139
  def internal_outbound_links
120
140
  return @internal_outbound_links unless @internal_outbound_links.empty?
121
141
  links
122
142
  return @internal_outbound_links
123
143
  end
124
- def outbound_links
125
- return external_outbound_links + internal_outbound_links
126
- end
144
+
127
145
  def back_links#inbound links among all the outbound links
128
146
  return @back_links unless @back_links.empty?
129
147
  scan_outbound_links
@@ -147,23 +165,27 @@ class Webpage
147
165
  scan_inbound_links
148
166
  return @internal_inbound_links
149
167
  end
150
-
168
+ =end
151
169
  def pagerank
152
- return @pagerank unless @pagerank
153
- require 'PageRankr'
154
- @pagerank = PageRankr.ranks(@uri.to_s, :google)
170
+ return @pagerank unless @pagerank.nil?
171
+ require 'page_rankr'
172
+ @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
155
173
  return @pagerank
156
174
  end
157
175
 
158
176
  def ppl#pagerank per link
159
- return (@pagerank / links.count)
177
+ pagerank
178
+ return false if @pagerank.nil?
179
+ scan_links
180
+ return (@pagerank / @links.size)
160
181
  end
161
-
162
- def related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,related=Array.new)
182
+ def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
183
+ scan_links
163
184
  raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
164
- related_keywords.concat(keywords).uniq!
165
- seed_uris.concat(external_outbound_links).uniq!
166
- while seed_uris.size>0
185
+ seed_uris.concat(@external_outbound_links - checked_uris)
186
+ related_keywords.concat(keywords)
187
+ result = Array.new
188
+ while seed_uris.size > 0 and result.size < max
167
189
  uri = seed_uris.first
168
190
  checked_uris << uri unless checked_uris.include?uri
169
191
  seed_uris.delete(uri)
@@ -172,63 +194,84 @@ class Webpage
172
194
  text = w.body + w.title
173
195
  related_keywords.each do |word|
174
196
  if text.include?word
175
- related << uri
176
- seed_uris.concat(w.external_outbound_links).uniq!
197
+ #result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
198
+ domain = Webpage.host_to_domain(URI.parse(uri).host)
199
+ result << domain unless result.include? domain
200
+ seed_uris.concat(w.external_outbound_links - checked_uris)
177
201
  break
178
202
  end
179
203
  end
180
204
  end
181
- return related
205
+ return result
182
206
  end
183
-
184
- private
185
- def scan_links
186
- @links.each do |a|
187
- begin
188
- uri = URI.parse(uri_encode(a))
189
- rescue URI::InvalidURIError =>e
190
- puts "#{e}:#{uri}"
191
- next
207
+
208
+ def link_to(target_uri)
209
+ scan_links
210
+ target_uri = Webpage.uri_normalize(target_uri)
211
+ target_host = URI.parse(target_uri).host
212
+ target_domain = Webpage.host_to_domain(target_host)
213
+ type = 0 #not link to
214
+ @links.each do |link|
215
+ candidate_host = URI.parse(link).host
216
+ if link == target_uri
217
+ type = 3 #definitely link to
218
+ break
219
+ elsif URI.parse(link).host == target_host
220
+ type = 2 if type < 2 #link to the host
221
+ elsif Webpage.host_to_domain(candidate_host) == target_domain
222
+ type = 1 if type < 1 #link to the root domain
192
223
  end
193
- next if uri.host.nil?
194
- if uri.host.end_with?@uri_domain
195
- @internal_links << a
196
- elsif uri.scheme.start_with?'http'
197
- if host_to_domain(uri.host) == @uri_domain
198
- @internal_outbound_links << uri.to_s
199
- else
200
- @external_outbound_links << uri.to_s
201
- end
202
- #@outbound_links << a
224
+ end
225
+ return type
226
+ end
227
+
228
+ def self.uri_normalize(uri)
229
+ uri = URI.parse(uri).normalize
230
+ fragment = uri.fragment
231
+ return uri.to_s.delete("##{fragment}")
232
+ #uri = uri.to_s.strip.sub(/\#.*$/,'')
233
+ #uri.path = '/' if uri.path.nil?
234
+ end
235
+
236
+ def self.host_to_domain(host)
237
+ domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
238
+ return domain[1] unless domain.nil?
239
+ return false
240
+ end
241
+
242
+ def self.uri_encode(str)
243
+ return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
244
+ end
245
+
246
+ def scan_inbound_links
247
+ scan_links
248
+ @back_links.each do |inlink|
249
+ inlink = URI.parse inlink
250
+ if @domain == Webpage.host_to_domain(inlink.host)
251
+ @internal_inbound_links << inlink.to_s
252
+ else
253
+ @external_inbound_links << inlink.to_s
203
254
  end
204
255
  end
205
- @back_links.uniq!
206
- #@outbound_links.uniq!
207
- @internal_outbound_links.uniq!
208
- @external_outbound_links.uniq!
256
+ @internal_inbound_links.uniq!
257
+ @external_inbound_links.uniq!
209
258
  end
210
259
 
211
260
  def scan_outbound_links
212
- outbound_links.each do |outlink|
213
- begin
214
- w = Webpage.new(outlink)
215
- rescue URI::InvalidURIError
216
- warn "bad uri:#{outlink}"
261
+ @outbound_links.each do |outlink|
262
+ w = Webpage.new(outlink)
263
+ unless w.successful
264
+ @invalid_links << outlink
217
265
  next
218
266
  end
219
267
  next if w.links.nil?
220
268
  w.links.each do |uri|
221
- next unless uri.start_with?'http'
222
- begin
223
- uri = URI.parse(uri_encode(uri))
224
- next if uri.host.nil?
225
- if uri.host.end_with?@uri_domain
226
- @back_links << uri.to_s
227
- else
228
- @broken_outbound_links << uri.to_s
229
- end
230
- rescue URI::InvalidURIError
231
- warn "bad uri:#{uri}"
269
+ #uri = URI.parse(uri)
270
+ #next if uri.host.nil?
271
+ if Webpage.host_to_domain(uri) == @domain
272
+ @back_links << uri.to_s
273
+ else
274
+ @broken_outbound_links << uri.to_s
232
275
  end
233
276
  end
234
277
  end
@@ -236,50 +279,38 @@ class Webpage
236
279
  @broken_outbound_links.uniq!
237
280
  end
238
281
 
239
- def scan_inbound_links
240
- back_links.each do |inlink|
241
- inlink = URI.parse inlink
242
- if @uri_domain == host_to_domain(inlink.host)
243
- @internal_inbound_links << inlink.to_s
282
+ def scan_links
283
+ return unless @links.empty?
284
+ exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
285
+ @page.links.each do |link|
286
+ #初步解析
287
+ begin
288
+ uri = URI.parse(link.href)
289
+ href = uri.to_s
290
+ rescue URI::InvalidURIError => e
291
+ @invalid_links << href
292
+ next
293
+ end
294
+ #忽略非网页文件
295
+ if exts_to_ignored.include?href[-4,4]
296
+ @ignored_uris << href
297
+ next
298
+ end
299
+ #处理相对路径
300
+ if uri.relative?
301
+ @relative_paths << href
302
+ href = @uri.merge(href).to_s
303
+ @internal_outbound_links << href
244
304
  else
245
- @external_inbound_links << inlink.to_s
305
+ href = Webpage.uri_normalize(href)
306
+ if Webpage.host_to_domain(uri.host) == @domain
307
+ @internal_outbound_links << href
308
+ else
309
+ @external_outbound_links << href
310
+ end
246
311
  end
312
+ @links << href
247
313
  end
248
- @internal_inbound_links.uniq!
249
- @external_inbound_links.uniq!
314
+ @outbound_links = @internal_outbound_links.uniq! + @external_outbound_links.uniq!
250
315
  end
251
-
252
- def uri_encode(str)
253
- return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
254
- end
255
- def host_to_domain(host)
256
- domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
257
- return domain[1] unless domain.nil?
258
- return false
259
- end
260
- end
261
-
262
- w = Webpage.new('http://cidian.youdao.com')
263
- #puts w.external_outbound_links
264
- related_keywords = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
265
- puts w.related_uris(related_keywords)
266
- exit
267
- require 'yaml'
268
- filename = './cidian.yaml'
269
- if File.exists?(filename)
270
- cached_cidian = YAML.load(File.read(filename))
271
- else
272
- cached_cidian = Hash.new
273
- cached_cidian[:seed_uris] = Array.new
274
- cached_cidian[:checked_uris] = Array.new
275
- cached_cidian[:related_keywords] = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
276
- cached_cidian[:related_uris] = Array.new
277
- end
278
- at_exit do
279
- File.open(filename,'w'){|f|f.puts(cached_cidian.to_yaml)}
280
316
  end
281
- #puts w.related_uris(cached_cidian[:seed_uris],cached_cidian[:related_keywords],cached_cidian[:checked_uris],cached_cidian[:related_uris])
282
- puts w.external_inbound_links
283
- puts w.internal_inbound_links
284
- puts w.pagerank
285
- puts w.external_inbound_links
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: