webpage 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/webpage.rb +157 -126
- metadata +1 -1
data/webpage.rb
CHANGED
@@ -3,9 +3,11 @@ require 'pp'
|
|
3
3
|
require 'mechanize'
|
4
4
|
require 'uri'
|
5
5
|
class Webpage
|
6
|
-
attr_reader:links,:successful,:related_uris
|
6
|
+
attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
|
7
|
+
attr_accessor :ignored_exts
|
7
8
|
def initialize(uri)
|
8
|
-
@
|
9
|
+
@links = Array.new
|
10
|
+
@relative_paths = Array.new
|
9
11
|
@outbound_links = Array.new
|
10
12
|
@internal_outbound_links = Array.new
|
11
13
|
@external_outbound_links = Array.new
|
@@ -15,54 +17,56 @@ class Webpage
|
|
15
17
|
@internal_inbound_links = Array.new
|
16
18
|
@external_inbound_links = Array.new
|
17
19
|
@internal_links = Array.new
|
18
|
-
@
|
19
|
-
@uri_dirname = File.dirname(@uri.path)
|
20
|
-
@uri_domain = host_to_domain @uri.host
|
20
|
+
@invalid_links = Array.new
|
21
21
|
@accessed_uri = Array.new
|
22
|
-
@page = ''
|
23
22
|
@related_uris = Array.new
|
24
23
|
@successful = false
|
25
24
|
begin
|
25
|
+
@uri = URI.parse(uri)
|
26
|
+
raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
|
27
|
+
@domain = Webpage.host_to_domain @uri.host
|
26
28
|
agent = Mechanize.new
|
27
|
-
agent.open_timeout =
|
29
|
+
agent.open_timeout = 3
|
28
30
|
@page = agent.get @uri.to_s
|
31
|
+
raise 'not webpage' unless @page.class == Mechanize::Page
|
29
32
|
@page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
|
30
|
-
@page.links.each do |link| #1
|
31
|
-
next if link.href.nil?
|
32
|
-
uri = uri_encode(link.href.strip)
|
33
|
-
begin
|
34
|
-
@links << @uri.merge(uri).to_s
|
35
|
-
rescue URI::InvalidURIError,URI::InvalidComponentError
|
36
|
-
warn "ignore\nparsed: #{uri} \noriginal: #{link.href}"
|
37
|
-
end
|
38
|
-
end
|
39
33
|
@successful = true
|
40
34
|
rescue Exception => e
|
41
|
-
warn "#{e}
|
35
|
+
warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
|
42
36
|
end
|
43
|
-
@links.uniq!
|
44
|
-
scan_links
|
45
37
|
end
|
46
38
|
|
47
39
|
def encoding
|
48
40
|
return @page.encoding
|
49
41
|
end
|
42
|
+
|
50
43
|
def keywords
|
51
|
-
|
44
|
+
meta = @page.search("//meta[@name='keywords']").first
|
45
|
+
return meta.attributes["content"].value.split(',') unless meta.nil?
|
46
|
+
end
|
47
|
+
|
48
|
+
def description
|
49
|
+
meta = @page.search("//meta[@name='description']").first
|
50
|
+
if meta.nil?
|
51
|
+
return false
|
52
|
+
end
|
53
|
+
return meta.atrributes['content'].value
|
52
54
|
end
|
53
55
|
|
54
56
|
def body
|
55
|
-
return @page.body
|
56
|
-
return
|
57
|
+
return @page.body
|
58
|
+
#(return @page.body unless @page.body.include?'<html>') if @successful
|
59
|
+
#return String.new
|
57
60
|
end
|
58
61
|
|
59
62
|
def text
|
60
|
-
return body.
|
63
|
+
return Nokogiri::HTML(body).xpath("//text()").text
|
64
|
+
#return body.gsub(/<\/?[^>]*>/, "")
|
61
65
|
end
|
62
66
|
|
63
67
|
def title
|
64
68
|
return @page.title unless @page.title.nil?
|
65
|
-
return
|
69
|
+
return false
|
66
70
|
end
|
67
71
|
|
68
72
|
|
@@ -70,6 +74,7 @@ class Webpage
|
|
70
74
|
#1.$all = get all <a>
|
71
75
|
#2.$href = get all href from $all
|
72
76
|
#3.make all $href to be absolute path and put to @links
|
77
|
+
=begin
|
73
78
|
def links
|
74
79
|
return @links unless @links.empty?
|
75
80
|
begin
|
@@ -78,7 +83,7 @@ class Webpage
|
|
78
83
|
agent.get @uri do |page|
|
79
84
|
page.links.each do |link| #1
|
80
85
|
next if link.href.nil?
|
81
|
-
uri =
|
86
|
+
uri = Webpage.uri_normalize(link.href)
|
82
87
|
begin
|
83
88
|
@links << @uri.merge(uri).to_s
|
84
89
|
rescue URI::InvalidURIError,URI::InvalidComponentError
|
@@ -101,29 +106,42 @@ class Webpage
|
|
101
106
|
#@links = @links.uniq - @accessed_uri
|
102
107
|
#@accessed_uri += @links
|
103
108
|
@links.uniq!
|
104
|
-
puts @links
|
105
109
|
scan_links
|
106
110
|
return @links
|
107
111
|
end
|
112
|
+
=end
|
108
113
|
|
109
|
-
def
|
110
|
-
return @internal_links unless @internal_links.empty?
|
114
|
+
def report
|
111
115
|
scan_links
|
112
|
-
|
116
|
+
scan_outbound_links
|
117
|
+
scan_inbound_links
|
118
|
+
report = {
|
119
|
+
:internal_links => @internal_links,
|
120
|
+
:internal_outbound_links => @internal_outbound_links,
|
121
|
+
:outbound_links => @outbound_links,
|
122
|
+
:broken_outbound_links => @broken_outbound_links,
|
123
|
+
:external_inbound_links => @external_inbound_links,
|
124
|
+
:internal_inbound_links => @internal_inbound_links,
|
125
|
+
:external_outbound_links => @external_outbound_links,
|
126
|
+
:related_uris => @related_uris,
|
127
|
+
:invalid_links => @invalid_links
|
128
|
+
}
|
113
129
|
end
|
130
|
+
|
131
|
+
|
132
|
+
=begin
|
114
133
|
def external_outbound_links
|
115
134
|
return @external_outbound_links unless @external_outbound_links.empty?
|
116
135
|
links
|
117
136
|
return @external_outbound_links
|
118
137
|
end
|
138
|
+
|
119
139
|
def internal_outbound_links
|
120
140
|
return @internal_outbound_links unless @internal_outbound_links.empty?
|
121
141
|
links
|
122
142
|
return @internal_outbound_links
|
123
143
|
end
|
124
|
-
|
125
|
-
return external_outbound_links + internal_outbound_links
|
126
|
-
end
|
144
|
+
|
127
145
|
def back_links#inbound links among all the outbound links
|
128
146
|
return @back_links unless @back_links.empty?
|
129
147
|
scan_outbound_links
|
@@ -147,23 +165,27 @@ class Webpage
|
|
147
165
|
scan_inbound_links
|
148
166
|
return @internal_inbound_links
|
149
167
|
end
|
150
|
-
|
168
|
+
=end
|
151
169
|
def pagerank
|
152
|
-
return @pagerank unless @pagerank
|
153
|
-
require '
|
154
|
-
@pagerank = PageRankr.ranks(@uri.to_s, :google)
|
170
|
+
return @pagerank unless @pagerank.nil?
|
171
|
+
require 'page_rankr'
|
172
|
+
@pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
|
155
173
|
return @pagerank
|
156
174
|
end
|
157
175
|
|
158
176
|
def ppl#pagerank per link
|
159
|
-
|
177
|
+
pagerank
|
178
|
+
return false if @pagerank.nil?
|
179
|
+
scan_links
|
180
|
+
return (@pagerank / @links.size)
|
160
181
|
end
|
161
|
-
|
162
|
-
|
182
|
+
def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
|
183
|
+
scan_links
|
163
184
|
raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
|
164
|
-
|
165
|
-
|
166
|
-
|
185
|
+
seed_uris.concat(@external_outbound_links - checked_uris)
|
186
|
+
related_keywords.concat(keywords)
|
187
|
+
result = Array.new
|
188
|
+
while seed_uris.size > 0 and result.size < max
|
167
189
|
uri = seed_uris.first
|
168
190
|
checked_uris << uri unless checked_uris.include?uri
|
169
191
|
seed_uris.delete(uri)
|
@@ -172,63 +194,84 @@ class Webpage
|
|
172
194
|
text = w.body + w.title
|
173
195
|
related_keywords.each do |word|
|
174
196
|
if text.include?word
|
175
|
-
|
176
|
-
|
197
|
+
#result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
|
198
|
+
domain = Webpage.host_to_domain(URI.parse(uri).host)
|
199
|
+
result << domain unless result.include? domain
|
200
|
+
seed_uris.concat(w.external_outbound_links - checked_uris)
|
177
201
|
break
|
178
202
|
end
|
179
203
|
end
|
180
204
|
end
|
181
|
-
return
|
205
|
+
return result
|
182
206
|
end
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
207
|
+
|
208
|
+
def link_to(target_uri)
|
209
|
+
scan_links
|
210
|
+
target_uri = Webpage.uri_normalize(target_uri)
|
211
|
+
target_host = URI.parse(target_uri).host
|
212
|
+
target_domain = Webpage.host_to_domain(target_host)
|
213
|
+
type = 0 #not link to
|
214
|
+
@links.each do |link|
|
215
|
+
candidate_host = URI.parse(link).host
|
216
|
+
if link == target_uri
|
217
|
+
type = 3 #definitely link to
|
218
|
+
break
|
219
|
+
elsif URI.parse(link).host == target_host
|
220
|
+
type = 2 if type < 2 #link to the host
|
221
|
+
elsif Webpage.host_to_domain(candidate_host) == target_domain
|
222
|
+
type = 1 if type < 1 #link to the root domain
|
192
223
|
end
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
224
|
+
end
|
225
|
+
return type
|
226
|
+
end
|
227
|
+
|
228
|
+
def self.uri_normalize(uri)
|
229
|
+
uri = URI.parse(uri).normalize
|
230
|
+
fragment = uri.fragment
|
231
|
+
return uri.to_s.delete("##{fragment}")
|
232
|
+
#uri = uri.to_s.strip.sub(/\#.*$/,'')
|
233
|
+
#uri.path = '/' if uri.path.nil?
|
234
|
+
end
|
235
|
+
|
236
|
+
def self.host_to_domain(host)
|
237
|
+
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
238
|
+
return domain[1] unless domain.nil?
|
239
|
+
return false
|
240
|
+
end
|
241
|
+
|
242
|
+
def self.uri_encode(str)
|
243
|
+
return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
|
244
|
+
end
|
245
|
+
|
246
|
+
def scan_inbound_links
|
247
|
+
scan_links
|
248
|
+
@back_links.each do |inlink|
|
249
|
+
inlink = URI.parse inlink
|
250
|
+
if @domain == Webpage.host_to_domain(inlink.host)
|
251
|
+
@internal_inbound_links << inlink.to_s
|
252
|
+
else
|
253
|
+
@external_inbound_links << inlink.to_s
|
203
254
|
end
|
204
255
|
end
|
205
|
-
@
|
206
|
-
|
207
|
-
@internal_outbound_links.uniq!
|
208
|
-
@external_outbound_links.uniq!
|
256
|
+
@internal_inbound_links.uniq!
|
257
|
+
@external_inbound_links.uniq!
|
209
258
|
end
|
210
259
|
|
211
260
|
def scan_outbound_links
|
212
|
-
outbound_links.each do |outlink|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
warn "bad uri:#{outlink}"
|
261
|
+
@outbound_links.each do |outlink|
|
262
|
+
w = Webpage.new(outlink)
|
263
|
+
unless w.successful
|
264
|
+
@invalid_links << outlink
|
217
265
|
next
|
218
266
|
end
|
219
267
|
next if w.links.nil?
|
220
268
|
w.links.each do |uri|
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
else
|
228
|
-
@broken_outbound_links << uri.to_s
|
229
|
-
end
|
230
|
-
rescue URI::InvalidURIError
|
231
|
-
warn "bad uri:#{uri}"
|
269
|
+
#uri = URI.parse(uri)
|
270
|
+
#next if uri.host.nil?
|
271
|
+
if Webpage.host_to_domain(uri) == @domain
|
272
|
+
@back_links << uri.to_s
|
273
|
+
else
|
274
|
+
@broken_outbound_links << uri.to_s
|
232
275
|
end
|
233
276
|
end
|
234
277
|
end
|
@@ -236,50 +279,38 @@ class Webpage
|
|
236
279
|
@broken_outbound_links.uniq!
|
237
280
|
end
|
238
281
|
|
239
|
-
def
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
282
|
+
def scan_links
|
283
|
+
return unless @links.empty?
|
284
|
+
exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
|
285
|
+
@page.links.each do |link|
|
286
|
+
#初步解析
|
287
|
+
begin
|
288
|
+
uri = URI.parse(link.href)
|
289
|
+
href = uri.to_s
|
290
|
+
rescue URI::InvalidURIError => e
|
291
|
+
@invalid_links << href
|
292
|
+
next
|
293
|
+
end
|
294
|
+
#忽略非网页文件
|
295
|
+
if exts_to_ignored.include?href[-4,4]
|
296
|
+
@ignored_uris << href
|
297
|
+
next
|
298
|
+
end
|
299
|
+
#处理相对路径
|
300
|
+
if uri.relative?
|
301
|
+
@relative_paths << href
|
302
|
+
href = @uri.merge(href).to_s
|
303
|
+
@internal_outbound_links << href
|
244
304
|
else
|
245
|
-
|
305
|
+
href = Webpage.uri_normalize(href)
|
306
|
+
if Webpage.host_to_domain(uri.host) == @domain
|
307
|
+
@internal_outbound_links << href
|
308
|
+
else
|
309
|
+
@external_outbound_links << href
|
310
|
+
end
|
246
311
|
end
|
312
|
+
@links << href
|
247
313
|
end
|
248
|
-
@
|
249
|
-
@external_inbound_links.uniq!
|
314
|
+
@outbound_links = @internal_outbound_links.uniq! + @external_outbound_links.uniq!
|
250
315
|
end
|
251
|
-
|
252
|
-
def uri_encode(str)
|
253
|
-
return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
|
254
|
-
end
|
255
|
-
def host_to_domain(host)
|
256
|
-
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
257
|
-
return domain[1] unless domain.nil?
|
258
|
-
return false
|
259
|
-
end
|
260
|
-
end
|
261
|
-
|
262
|
-
w = Webpage.new('http://cidian.youdao.com')
|
263
|
-
#puts w.external_outbound_links
|
264
|
-
related_keywords = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
|
265
|
-
puts w.related_uris(related_keywords)
|
266
|
-
exit
|
267
|
-
require 'yaml'
|
268
|
-
filename = './cidian.yaml'
|
269
|
-
if File.exists?(filename)
|
270
|
-
cached_cidian = YAML.load(File.read(filename))
|
271
|
-
else
|
272
|
-
cached_cidian = Hash.new
|
273
|
-
cached_cidian[:seed_uris] = Array.new
|
274
|
-
cached_cidian[:checked_uris] = Array.new
|
275
|
-
cached_cidian[:related_keywords] = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
|
276
|
-
cached_cidian[:related_uris] = Array.new
|
277
|
-
end
|
278
|
-
at_exit do
|
279
|
-
File.open(filename,'w'){|f|f.puts(cached_cidian.to_yaml)}
|
280
316
|
end
|
281
|
-
#puts w.related_uris(cached_cidian[:seed_uris],cached_cidian[:related_keywords],cached_cidian[:checked_uris],cached_cidian[:related_uris])
|
282
|
-
puts w.external_inbound_links
|
283
|
-
puts w.internal_inbound_links
|
284
|
-
puts w.pagerank
|
285
|
-
puts w.external_inbound_links
|