webpage 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/webpage.rb +206 -49
- metadata +1 -1
data/webpage.rb
CHANGED
@@ -3,25 +3,69 @@ require 'pp'
|
|
3
3
|
require 'mechanize'
|
4
4
|
require 'uri'
|
5
5
|
class Webpage
|
6
|
-
attr_reader:links
|
6
|
+
attr_reader:links,:successful,:related_uris
|
7
7
|
# Fetches +uri+ with Mechanize and collects every link found on the page.
#
# Absolute forms of the page's hrefs are stored in @links; scan_links then
# sorts them into the internal/outbound buckets. A failed fetch is reported
# with +warn+ and leaves @successful false instead of raising.
#
# uri - String address of the page to load.
def initialize(uri)
  @uri = URI.parse(uri_encode(uri))
  @outbound_links = []
  @internal_outbound_links = []
  @external_outbound_links = []
  @broken_outbound_links = []
  @external_inbound_links = []
  @back_links = []
  @internal_inbound_links = []
  @internal_links = []
  @links = []
  @uri_dirname = File.dirname(@uri.path)
  @uri_domain = host_to_domain @uri.host
  @accessed_uri = []
  @page = ''
  @related_uris = []
  @successful = false
  begin
    agent = Mechanize.new
    agent.open_timeout = 5
    @page = agent.get(@uri.to_s)
    # Normalise the body to UTF-8 so later String#include? checks cannot
    # fail on incompatible encodings.
    @page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
    @page.links.each do |link|
      next if link.href.nil?
      href = uri_encode(link.href.strip)
      begin
        @links << @uri.merge(href).to_s
      rescue URI::InvalidURIError, URI::InvalidComponentError
        warn "ignore\nparsed: #{href} \noriginal: #{link.href}"
      end
    end
    @successful = true
  rescue StandardError => e
    # StandardError rather than Exception: Ctrl-C (Interrupt) and SystemExit
    # must still be able to stop a long crawl.
    warn "#{e}:#{@uri}"
  end
  @links.uniq!
  scan_links
end
|
18
|
-
|
19
|
-
|
46
|
+
|
47
|
+
# Returns whatever the fetched page object reports as its encoding.
def encoding
  @page.encoding
end
|
21
|
-
def
|
22
|
-
return
|
50
|
+
# Returns the comma-separated entries of the page's
# <meta name="keywords" content="..."> tag as an Array of Strings.
#
# Returns an empty Array when the page was never fetched (@page is still the
# placeholder String), when the tag is absent, or when it has no content
# attribute; each of those used to raise NoMethodError.
def keywords
  return [] unless @page.respond_to?(:search)
  meta = @page.search("//meta[@name='keywords']").first
  content = meta && meta.attributes["content"]
  return [] if content.nil?
  content.value.split(',')
end
|
53
|
+
|
54
|
+
# Returns the page body, or an empty String when the body contains the
# literal text '<html>' or when the page was never fetched.
#
# NOTE(review): returning nothing for bodies that contain '<html>' looks
# inverted, but it is what the released code does, so it is preserved here.
# Confirm the intent before changing it.
def body
  # A failed fetch leaves @page as the placeholder String, which has no #body.
  return String.new unless @page.respond_to?(:body)
  return String.new if @page.body.include?('<html>')
  @page.body
end
|
58
|
+
|
59
|
+
# Returns #body with everything that looks like an opening or closing
# tag (<...> or </...>) removed.
def text
  body.gsub(%r{</?[^>]*>}, "")
end
|
62
|
+
|
63
|
+
# Returns the page title, or an empty String when the page has no title or
# was never fetched (@page is still the placeholder String).
def title
  return String.new unless @page.respond_to?(:title)
  @page.title || String.new
end
|
24
67
|
|
68
|
+
|
25
69
|
#get all links from html content
|
26
70
|
#1.$all = get all <a>
|
27
71
|
#2.$href = get all href from $all
|
@@ -34,7 +78,7 @@ class Webpage
|
|
34
78
|
agent.get @uri do |page|
|
35
79
|
page.links.each do |link| #1
|
36
80
|
next if link.href.nil?
|
37
|
-
uri =
|
81
|
+
uri = uri_encode(link.href.strip)
|
38
82
|
begin
|
39
83
|
@links << @uri.merge(uri).to_s
|
40
84
|
rescue URI::InvalidURIError,URI::InvalidComponentError
|
@@ -54,32 +98,117 @@ class Webpage
|
|
54
98
|
warn "#{e}.#{@uri}"
|
55
99
|
end
|
56
100
|
return Array.new if @links.empty?
|
57
|
-
|
58
|
-
|
59
|
-
@links.
|
60
|
-
|
61
|
-
|
62
|
-
if uri.host.end_with?@uri_domain
|
63
|
-
@internal_links << a
|
64
|
-
else
|
65
|
-
@outbound_links << a
|
66
|
-
end
|
67
|
-
end
|
101
|
+
#@links = @links.uniq - @accessed_uri
|
102
|
+
#@accessed_uri += @links
|
103
|
+
@links.uniq!
|
104
|
+
puts @links
|
105
|
+
scan_links
|
68
106
|
return @links
|
69
107
|
end
|
70
|
-
|
108
|
+
|
71
109
|
# Returns the links classified as internal by scan_links.
# While the list is still empty, scan_links is run once more before returning.
def internal_links
  scan_links if @internal_links.empty?
  @internal_links
end
|
114
|
+
# Returns the collected external outbound links.
# While the list is still empty, #links is invoked first.
# NOTE(review): #links is defined outside the part of the file visible here;
# presumably it refreshes the link buckets. Verify.
def external_outbound_links
  links if @external_outbound_links.empty?
  @external_outbound_links
end
|
119
|
+
# Returns the collected internal outbound links.
# While the list is still empty, #links is invoked first.
# NOTE(review): #links is defined outside the part of the file visible here;
# presumably it refreshes the link buckets. Verify.
def internal_outbound_links
  links if @internal_outbound_links.empty?
  @internal_outbound_links
end
|
75
|
-
|
76
124
|
# Returns a new Array holding the external outbound links followed by the
# internal outbound links.
def outbound_links
  external_outbound_links + internal_outbound_links
end
|
127
|
+
# Inbound links found among the pages this page links out to.
# While the list is still empty, scan_outbound_links is run first.
def back_links
  scan_outbound_links if @back_links.empty?
  @back_links
end
|
132
|
+
|
133
|
+
# Returns the list filled in by scan_outbound_links.
# While the list is still empty, scan_outbound_links is run first.
# NOTE(review): scan_outbound_links appears to put every link found on an
# outbound page that does not point back to this domain in here, which is
# broader than "broken". Confirm the intended meaning.
def broken_outbound_links
  scan_outbound_links if @broken_outbound_links.empty?
  @broken_outbound_links
end
|
138
|
+
|
139
|
+
# Inbound links from outside this page's domain.
# While the list is still empty, scan_inbound_links is run first.
def external_inbound_links
  scan_inbound_links if @external_inbound_links.empty?
  @external_inbound_links
end
|
144
|
+
|
145
|
+
# Inbound links from within this page's domain.
# While the list is still empty, scan_inbound_links is run first.
def internal_inbound_links
  scan_inbound_links if @internal_inbound_links.empty?
  @internal_inbound_links
end
|
80
150
|
|
81
|
-
def
|
82
|
-
return @
|
151
|
+
# Returns the result of PageRankr.ranks for this page's URI (Google
# tracker), looked up on first use and memoized in @pagerank.
#
# The previous guard (`return @pagerank unless @pagerank`) was inverted: it
# returned nil on the first call and never performed the lookup.
def pagerank
  return @pagerank if @pagerank
  # Loaded lazily so the dependency is only needed when ranks are requested.
  # NOTE(review): on case-sensitive filesystems the require name may need to
  # match the gem's actual file name. Verify.
  require 'PageRankr' unless defined?(PageRankr)
  @pagerank = PageRankr.ranks(@uri.to_s, :google)
end
|
157
|
+
|
158
|
+
# PageRank per link: #pagerank divided by the number of links.
#
# Goes through the #pagerank reader rather than @pagerank, which is nil
# until #pagerank has been called at least once.
# NOTE(review): assumes #pagerank returns a number; no guard is added for a
# page with zero links. Confirm both against the PageRankr result type.
def ppl
  pagerank / links.count
end
|
161
|
+
|
162
|
+
# Crawls outward from this page's external outbound links and returns the
# URIs whose body or title contains at least one related keyword.
#
# related_keywords - Array of Strings to look for; this page's own
#                    #keywords are appended to it.
# seed_uris        - Array used as the crawl queue; this page's external
#                    outbound links are appended to it.
# checked_uris     - Array of URIs already visited; they are never fetched
#                    again.
# related          - Array that receives the matching URIs.
#
# All four arrays are mutated in place, so a caller can keep them and resume
# a crawl later. Returns +related+.
#
# Raises RuntimeError when one of the first three arguments is not an Array.
def related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,related=Array.new)
  unless [related_keywords, seed_uris, checked_uris].all? { |arg| arg.is_a?(Array) }
    raise "related_keywords is not array,but a #{related_keywords.class}"
  end
  related_keywords.concat(keywords).uniq!
  seed_uris.concat(external_outbound_links).uniq!
  until seed_uris.empty?
    uri = seed_uris.first
    seed_uris.delete(uri)
    # Skipping visited URIs is what makes the crawl terminate: without it,
    # two related pages that link to each other re-queue one another forever.
    next if checked_uris.include?(uri)
    checked_uris << uri
    page = Webpage.new(uri)
    next unless page.successful
    text = page.body + page.title
    next unless related_keywords.any? { |word| text.include?(word) }
    related << uri unless related.include?(uri)
    seed_uris.concat(page.external_outbound_links - checked_uris).uniq!
  end
  related
end
|
183
|
+
|
184
|
+
private
|
185
|
+
# Sorts every entry of @links into the link buckets:
#
# * host ends with @uri_domain            -> @internal_links
# * otherwise, scheme starts with 'http'  -> @internal_outbound_links when
#   the host's domain equals @uri_domain, else @external_outbound_links
#
# Entries that cannot be parsed or have no host are skipped.
def scan_links
  @links.each do |link|
    begin
      uri = URI.parse(uri_encode(link))
    rescue URI::InvalidURIError => e
      # Report the offending link; `uri` is still unassigned at this point.
      warn "#{e}:#{link}"
      next
    end
    next if uri.host.nil?
    # @uri_domain is false when the page's own host has no recognisable
    # domain; in that case nothing can be internal.
    if @uri_domain && uri.host.end_with?(@uri_domain)
      @internal_links << link
    elsif uri.scheme.to_s.start_with?('http')
      # NOTE(review): a host whose domain equals @uri_domain also ends with
      # it, so the first branch below looks unreachable. Confirm how
      # "internal outbound" is meant to differ from "internal".
      if host_to_domain(uri.host) == @uri_domain
        @internal_outbound_links << uri.to_s
      else
        @external_outbound_links << uri.to_s
      end
    end
  end
  # Repeated scans append the same entries again, so de-duplicate all buckets.
  @internal_links.uniq!
  @back_links.uniq!
  @internal_outbound_links.uniq!
  @external_outbound_links.uniq!
end
|
210
|
+
|
211
|
+
def scan_outbound_links
|
83
212
|
outbound_links.each do |outlink|
|
84
213
|
begin
|
85
214
|
w = Webpage.new(outlink)
|
@@ -87,42 +216,70 @@ class Webpage
|
|
87
216
|
warn "bad uri:#{outlink}"
|
88
217
|
next
|
89
218
|
end
|
219
|
+
next if w.links.nil?
|
90
220
|
w.links.each do |uri|
|
91
221
|
next unless uri.start_with?'http'
|
92
222
|
begin
|
93
|
-
uri = URI.parse(
|
223
|
+
uri = URI.parse(uri_encode(uri))
|
94
224
|
next if uri.host.nil?
|
95
|
-
|
225
|
+
if uri.host.end_with?@uri_domain
|
226
|
+
@back_links << uri.to_s
|
227
|
+
else
|
228
|
+
@broken_outbound_links << uri.to_s
|
229
|
+
end
|
96
230
|
rescue URI::InvalidURIError
|
97
231
|
warn "bad uri:#{uri}"
|
98
232
|
end
|
99
233
|
end
|
100
234
|
end
|
101
|
-
|
235
|
+
@back_links.uniq!
|
236
|
+
@broken_outbound_links.uniq!
|
102
237
|
end
|
103
238
|
|
104
|
-
def
|
105
|
-
|
106
|
-
inbound_links.each do |inlink|
|
239
|
+
# Splits #back_links into @internal_inbound_links (host's domain equals
# @uri_domain) and @external_inbound_links (everything else).
#
# Entries that cannot be parsed or have no host are skipped with a warning
# instead of aborting the whole scan.
def scan_inbound_links
  back_links.each do |inlink|
    begin
      parsed = URI.parse(inlink)
    rescue URI::InvalidURIError
      warn "bad uri:#{inlink}"
      next
    end
    next if parsed.host.nil?
    if @uri_domain == host_to_domain(parsed.host)
      @internal_inbound_links << parsed.to_s
    else
      @external_inbound_links << parsed.to_s
    end
  end
  @internal_inbound_links.uniq!
  @external_inbound_links.uniq!
end
|
112
|
-
|
113
|
-
def
|
251
|
+
|
252
|
+
# Percent-encodes every byte of +str+ that is not an RFC 2396 unreserved
# character (- _ . ! ~ * ' ( ) letters digits) or one of # : / ? % & =.
#
# Replaces the former URI.encode call, which was removed in Ruby 3.0. Like
# that method, it encodes each offending character byte by byte as %XX
# (upper-case hex) and returns a US-ASCII String. Existing %XX escapes are
# left alone because '%' is in the safe set.
def uri_encode(str)
  encoded = str.gsub(/[^\-_.!~*'()a-zA-Z\d#:\/?%&=]/) do |unsafe|
    unsafe.bytes.map { |byte| format('%%%02X', byte) }.join
  end
  encoded.force_encoding(Encoding::US_ASCII)
end
|
115
|
-
|
116
|
-
|
117
|
-
return
|
118
|
-
|
119
|
-
@pagerank = PageRankr.ranks(@uri.to_s, :google)
|
120
|
-
return @pagerank
|
255
|
+
# Returns the last two dot-separated labels of +host+
# ("www.example.com" -> "example.com").
#
# Returns false when +host+ is nil or does not end in two such labels
# (e.g. "localhost"). Multi-part public suffixes are not recognised:
# "www.example.co.uk" yields "co.uk".
def host_to_domain(host)
  return false if host.nil?
  matched = host.match(/\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)\z/)
  matched ? matched[1] : false
end
|
260
|
+
end
|
122
261
|
|
123
|
-
|
124
|
-
|
125
|
-
|
262
|
+
# Ad-hoc crawl used while developing the class.
#
# Guarded so it only runs when this file is executed directly; previously a
# plain `require` of the library started a network crawl and then called
# `exit` on the requiring process.
if __FILE__ == $PROGRAM_NAME
  w = Webpage.new('http://cidian.youdao.com')
  #puts w.external_outbound_links
  related_keywords = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
  puts w.related_uris(related_keywords)
  exit

  # NOTE(review): everything below is unreachable because of the `exit`
  # above; it is kept as the work-in-progress resumable-crawl cache.
  require 'yaml'
  filename = './cidian.yaml'
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  if File.exist?(filename)
    cached_cidian = YAML.load(File.read(filename))
  else
    cached_cidian = Hash.new
    cached_cidian[:seed_uris] = Array.new
    cached_cidian[:checked_uris] = Array.new
    cached_cidian[:related_keywords] = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
    cached_cidian[:related_uris] = Array.new
  end
  # Persist the crawl state on the way out.
  at_exit do
    File.open(filename,'w'){|f|f.puts(cached_cidian.to_yaml)}
  end
  #puts w.related_uris(cached_cidian[:seed_uris],cached_cidian[:related_keywords],cached_cidian[:checked_uris],cached_cidian[:related_uris])
  puts w.external_inbound_links
  puts w.internal_inbound_links
  puts w.pagerank
  puts w.external_inbound_links
end
|