webpage 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/webpage.rb +206 -49
  2. metadata +1 -1
data/webpage.rb CHANGED
@@ -3,25 +3,69 @@ require 'pp'
3
3
  require 'mechanize'
4
4
  require 'uri'
5
5
  class Webpage
6
- attr_reader:links
6
+ attr_reader:links,:successful,:related_uris
7
7
  def initialize(uri)
8
- @uri = URI.parse(encode(uri))
8
+ @uri = URI.parse(uri_encode(uri))
9
9
  @outbound_links = Array.new
10
- @outter_inbound_links = Array.new
11
- @inbound_links = Array.new
10
+ @internal_outbound_links = Array.new
11
+ @external_outbound_links = Array.new
12
+ @broken_outbound_links = Array.new
13
+ @external_inbound_links = Array.new
14
+ @back_links = Array.new
15
+ @internal_inbound_links = Array.new
16
+ @external_inbound_links = Array.new
12
17
  @internal_links = Array.new
13
18
  @links = Array.new
14
19
  @uri_dirname = File.dirname(@uri.path)
15
20
  @uri_domain = host_to_domain @uri.host
16
21
  @accessed_uri = Array.new
22
+ @page = ''
23
+ @related_uris = Array.new
24
+ @successful = false
25
+ begin
26
+ agent = Mechanize.new
27
+ agent.open_timeout = 5
28
+ @page = agent.get @uri.to_s
29
+ @page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
30
+ @page.links.each do |link| #1
31
+ next if link.href.nil?
32
+ uri = uri_encode(link.href.strip)
33
+ begin
34
+ @links << @uri.merge(uri).to_s
35
+ rescue URI::InvalidURIError,URI::InvalidComponentError
36
+ warn "ignore\nparsed: #{uri} \noriginal: #{link.href}"
37
+ end
38
+ end
39
+ @successful = true
40
+ rescue Exception => e
41
+ warn "#{e}:#{@uri}"
42
+ end
43
+ @links.uniq!
44
+ scan_links
17
45
  end
18
- def encode(str)
19
- return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
46
+
47
+ def encoding
48
+ return @page.encoding
20
49
  end
21
- def host_to_domain(host)
22
- return (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)[1]
50
+ def keywords
51
+ return @page.search("//meta[@name='keywords']").first.attributes["content"].value.split(',')
52
+ end
53
+
54
+ def body
55
+ return @page.body unless @page.body.include?'<html>'
56
+ return String.new
57
+ end
58
+
59
+ def text
60
+ return body.gsub(/<\/?[^>]*>/, "")
61
+ end
62
+
63
+ def title
64
+ return @page.title unless @page.title.nil?
65
+ return String.new
23
66
  end
24
67
 
68
+
25
69
  #get all links from html content
26
70
  #1.$all = get all <a>
27
71
  #2.$href = get all href from $all
@@ -34,7 +78,7 @@ class Webpage
34
78
  agent.get @uri do |page|
35
79
  page.links.each do |link| #1
36
80
  next if link.href.nil?
37
- uri = encode(link.href.strip)
81
+ uri = uri_encode(link.href.strip)
38
82
  begin
39
83
  @links << @uri.merge(uri).to_s
40
84
  rescue URI::InvalidURIError,URI::InvalidComponentError
@@ -54,32 +98,117 @@ class Webpage
54
98
  warn "#{e}.#{@uri}"
55
99
  end
56
100
  return Array.new if @links.empty?
57
- @links = @links.uniq - @accessed_uri
58
- @accessed_uri += @links
59
- @links.each do |a|
60
- uri = URI.parse(encode(a))
61
- next if uri.host.nil?
62
- if uri.host.end_with?@uri_domain
63
- @internal_links << a
64
- else
65
- @outbound_links << a
66
- end
67
- end
101
+ #@links = @links.uniq - @accessed_uri
102
+ #@accessed_uri += @links
103
+ @links.uniq!
104
+ puts @links
105
+ scan_links
68
106
  return @links
69
107
  end
70
-
108
+
71
109
  def internal_links
72
- return @internal_links if links
73
- return false
110
+ return @internal_links unless @internal_links.empty?
111
+ scan_links
112
+ return @internal_links
113
+ end
114
+ def external_outbound_links
115
+ return @external_outbound_links unless @external_outbound_links.empty?
116
+ links
117
+ return @external_outbound_links
118
+ end
119
+ def internal_outbound_links
120
+ return @internal_outbound_links unless @internal_outbound_links.empty?
121
+ links
122
+ return @internal_outbound_links
74
123
  end
75
-
76
124
  def outbound_links
77
- return @outbound_links if links
78
- return false
125
+ return external_outbound_links + internal_outbound_links
126
+ end
127
+ def back_links#inbound links among all the outbound links
128
+ return @back_links unless @back_links.empty?
129
+ scan_outbound_links
130
+ return @back_links
131
+ end
132
+
133
+ def broken_outbound_links
134
+ return @broken_outbound_links unless @broken_outbound_links.empty?
135
+ scan_outbound_links
136
+ return @broken_outbound_links
137
+ end
138
+
139
+ def external_inbound_links#outter inbound links
140
+ return @external_inbound_links unless @external_inbound_links.empty?
141
+ scan_inbound_links
142
+ return @external_inbound_links
143
+ end
144
+
145
+ def internal_inbound_links
146
+ return @internal_inbound_links unless @internal_inbound_links.empty?
147
+ scan_inbound_links
148
+ return @internal_inbound_links
79
149
  end
80
150
 
81
- def inbound_links
82
- return @inbound_links unless @inbound_links.empty?
151
+ def pagerank
152
+ return @pagerank unless @pagerank
153
+ require 'PageRankr'
154
+ @pagerank = PageRankr.ranks(@uri.to_s, :google)
155
+ return @pagerank
156
+ end
157
+
158
+ def ppl#pagerank per link
159
+ return (@pagerank / links.count)
160
+ end
161
+
162
+ def related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,related=Array.new)
163
+ raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
164
+ related_keywords.concat(keywords).uniq!
165
+ seed_uris.concat(external_outbound_links).uniq!
166
+ while seed_uris.size>0
167
+ uri = seed_uris.first
168
+ checked_uris << uri unless checked_uris.include?uri
169
+ seed_uris.delete(uri)
170
+ w = Webpage.new uri
171
+ next unless w.successful
172
+ text = w.body + w.title
173
+ related_keywords.each do |word|
174
+ if text.include?word
175
+ related << uri
176
+ seed_uris.concat(w.external_outbound_links).uniq!
177
+ break
178
+ end
179
+ end
180
+ end
181
+ return related
182
+ end
183
+
184
+ private
185
+ def scan_links
186
+ @links.each do |a|
187
+ begin
188
+ uri = URI.parse(uri_encode(a))
189
+ rescue URI::InvalidURIError =>e
190
+ puts "#{e}:#{uri}"
191
+ next
192
+ end
193
+ next if uri.host.nil?
194
+ if uri.host.end_with?@uri_domain
195
+ @internal_links << a
196
+ elsif uri.scheme.start_with?'http'
197
+ if host_to_domain(uri.host) == @uri_domain
198
+ @internal_outbound_links << uri.to_s
199
+ else
200
+ @external_outbound_links << uri.to_s
201
+ end
202
+ #@outbound_links << a
203
+ end
204
+ end
205
+ @back_links.uniq!
206
+ #@outbound_links.uniq!
207
+ @internal_outbound_links.uniq!
208
+ @external_outbound_links.uniq!
209
+ end
210
+
211
+ def scan_outbound_links
83
212
  outbound_links.each do |outlink|
84
213
  begin
85
214
  w = Webpage.new(outlink)
@@ -87,42 +216,70 @@ class Webpage
87
216
  warn "bad uri:#{outlink}"
88
217
  next
89
218
  end
219
+ next if w.links.nil?
90
220
  w.links.each do |uri|
91
221
  next unless uri.start_with?'http'
92
222
  begin
93
- uri = URI.parse(encode(uri))
223
+ uri = URI.parse(uri_encode(uri))
94
224
  next if uri.host.nil?
95
- @inbound_links << uri.to_s if uri.host.end_with?@uri_domain
225
+ if uri.host.end_with?@uri_domain
226
+ @back_links << uri.to_s
227
+ else
228
+ @broken_outbound_links << uri.to_s
229
+ end
96
230
  rescue URI::InvalidURIError
97
231
  warn "bad uri:#{uri}"
98
232
  end
99
233
  end
100
234
  end
101
- return @inbound_links.uniq
235
+ @back_links.uniq!
236
+ @broken_outbound_links.uniq!
102
237
  end
103
238
 
104
- def outter_inbound_links
105
- return @outter_inbound_links unless @outter_inbound_links.empty?
106
- inbound_links.each do |inlink|
239
+ def scan_inbound_links
240
+ back_links.each do |inlink|
107
241
  inlink = URI.parse inlink
108
- @outter_inbound_links << inlink.to_s unless @uri_domain == host_to_domain(inlink.host)
242
+ if @uri_domain == host_to_domain(inlink.host)
243
+ @internal_inbound_links << inlink.to_s
244
+ else
245
+ @external_inbound_links << inlink.to_s
246
+ end
109
247
  end
110
- return @outter_inbound_links
248
+ @internal_inbound_links.uniq!
249
+ @external_inbound_links.uniq!
111
250
  end
112
-
113
- def friend_links#inbound && outbound
251
+
252
+ def uri_encode(str)
253
+ return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
114
254
  end
115
-
116
- def pagerank
117
- return @pagerank unless @pagerank
118
- require 'PageRankr'
119
- @pagerank = PageRankr.ranks(@uri.to_s, :google)
120
- return @pagerank
255
+ def host_to_domain(host)
256
+ domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
257
+ return domain[1] unless domain.nil?
258
+ return false
121
259
  end
260
+ end
122
261
 
123
- def ppl#pagerank_per_link
124
- return (@pagerank / links.count)
125
- end
262
+ w = Webpage.new('http://cidian.youdao.com')
263
+ #puts w.external_outbound_links
264
+ related_keywords = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
265
+ puts w.related_uris(related_keywords)
266
+ exit
267
+ require 'yaml'
268
+ filename = './cidian.yaml'
269
+ if File.exists?(filename)
270
+ cached_cidian = YAML.load(File.read(filename))
271
+ else
272
+ cached_cidian = Hash.new
273
+ cached_cidian[:seed_uris] = Array.new
274
+ cached_cidian[:checked_uris] = Array.new
275
+ cached_cidian[:related_keywords] = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
276
+ cached_cidian[:related_uris] = Array.new
277
+ end
278
+ at_exit do
279
+ File.open(filename,'w'){|f|f.puts(cached_cidian.to_yaml)}
126
280
  end
127
- w = Webpage.new('http://auto.163.com')
128
- puts w.outter_inbound_links
281
+ #puts w.related_uris(cached_cidian[:seed_uris],cached_cidian[:related_keywords],cached_cidian[:checked_uris],cached_cidian[:related_uris])
282
+ puts w.external_inbound_links
283
+ puts w.internal_inbound_links
284
+ puts w.pagerank
285
+ puts w.external_inbound_links
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: