webpage 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/webpage.rb +157 -126
- metadata +1 -1
data/webpage.rb
CHANGED
@@ -3,9 +3,11 @@ require 'pp'
|
|
3
3
|
require 'mechanize'
|
4
4
|
require 'uri'
|
5
5
|
class Webpage
|
6
|
-
attr_reader:links,:successful,:related_uris
|
6
|
+
attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
|
7
|
+
attr_accessor :ignored_exts
|
7
8
|
def initialize(uri)
|
8
|
-
@
|
9
|
+
@links = Array.new
|
10
|
+
@relative_paths = Array.new
|
9
11
|
@outbound_links = Array.new
|
10
12
|
@internal_outbound_links = Array.new
|
11
13
|
@external_outbound_links = Array.new
|
@@ -15,54 +17,56 @@ class Webpage
|
|
15
17
|
@internal_inbound_links = Array.new
|
16
18
|
@external_inbound_links = Array.new
|
17
19
|
@internal_links = Array.new
|
18
|
-
@
|
19
|
-
@uri_dirname = File.dirname(@uri.path)
|
20
|
-
@uri_domain = host_to_domain @uri.host
|
20
|
+
@invalid_links = Array.new
|
21
21
|
@accessed_uri = Array.new
|
22
|
-
@page = ''
|
23
22
|
@related_uris = Array.new
|
24
23
|
@successful = false
|
25
24
|
begin
|
25
|
+
@uri = URI.parse(uri)
|
26
|
+
raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
|
27
|
+
@domain = Webpage.host_to_domain @uri.host
|
26
28
|
agent = Mechanize.new
|
27
|
-
agent.open_timeout =
|
29
|
+
agent.open_timeout = 3
|
28
30
|
@page = agent.get @uri.to_s
|
31
|
+
raise 'not webpage' unless @page.class == Mechanize::Page
|
29
32
|
@page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
|
30
|
-
@page.links.each do |link| #1
|
31
|
-
next if link.href.nil?
|
32
|
-
uri = uri_encode(link.href.strip)
|
33
|
-
begin
|
34
|
-
@links << @uri.merge(uri).to_s
|
35
|
-
rescue URI::InvalidURIError,URI::InvalidComponentError
|
36
|
-
warn "ignore\nparsed: #{uri} \noriginal: #{link.href}"
|
37
|
-
end
|
38
|
-
end
|
39
33
|
@successful = true
|
40
34
|
rescue Exception => e
|
41
|
-
warn "#{e}
|
35
|
+
warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
|
42
36
|
end
|
43
|
-
@links.uniq!
|
44
|
-
scan_links
|
45
37
|
end
|
46
38
|
|
47
39
|
def encoding
|
48
40
|
return @page.encoding
|
49
41
|
end
|
42
|
+
|
50
43
|
# Returns the page's meta keywords as an Array of Strings.
#
# Reads the "content" attribute of the first <meta name="keywords"> element
# found in the fetched page and splits it on commas (surrounding whitespace
# of each keyword is kept as-is).
#
# Returns an empty Array when the page declares no keywords. Previously this
# fell through and returned nil, which made callers such as
# scan_related_uris fail on `related_keywords.concat(keywords)`.
def keywords
  meta = @page.search("//meta[@name='keywords']").first
  return [] if meta.nil?

  content = meta.attributes["content"]
  # A <meta name="keywords"> tag without a content attribute carries no keywords.
  return [] if content.nil?

  content.value.split(',')
end
|
47
|
+
|
48
|
+
# Returns the page's meta description as a String.
#
# Reads the "content" attribute of the first <meta name="description">
# element found in the fetched page. Returns false when the page has no such
# element or the element has no content attribute.
def description
  meta = @page.search("//meta[@name='description']").first
  return false if meta.nil?

  # Fixed: this used to call the misspelled `atrributes`, so every page that
  # actually had a description raised NoMethodError.
  content = meta.attributes['content']
  return false if content.nil?

  content.value
end
|
53
55
|
|
54
56
|
def body
|
55
|
-
return @page.body
|
56
|
-
return
|
57
|
+
return @page.body
|
58
|
+
#(return @page.body unless @page.body.include?'<html>') if @successful
|
59
|
+
#return String.new
|
57
60
|
end
|
58
61
|
|
59
62
|
def text
|
60
|
-
return body.
|
63
|
+
return Nokogiri::HTML(body).xpath("//text()").text
|
64
|
+
#return body.gsub(/<\/?[^>]*>/, "")
|
61
65
|
end
|
62
66
|
|
63
67
|
def title
|
64
68
|
return @page.title unless @page.title.nil?
|
65
|
-
return
|
69
|
+
return false
|
66
70
|
end
|
67
71
|
|
68
72
|
|
@@ -70,6 +74,7 @@ class Webpage
|
|
70
74
|
#1.$all = get all <a>
|
71
75
|
#2.$href = get all href from $all
|
72
76
|
#3.make all $href to be absolute path and put to @links
|
77
|
+
=begin
|
73
78
|
def links
|
74
79
|
return @links unless @links.empty?
|
75
80
|
begin
|
@@ -78,7 +83,7 @@ class Webpage
|
|
78
83
|
agent.get @uri do |page|
|
79
84
|
page.links.each do |link| #1
|
80
85
|
next if link.href.nil?
|
81
|
-
uri =
|
86
|
+
uri = Webpage.uri_normalize(link.href)
|
82
87
|
begin
|
83
88
|
@links << @uri.merge(uri).to_s
|
84
89
|
rescue URI::InvalidURIError,URI::InvalidComponentError
|
@@ -101,29 +106,42 @@ class Webpage
|
|
101
106
|
#@links = @links.uniq - @accessed_uri
|
102
107
|
#@accessed_uri += @links
|
103
108
|
@links.uniq!
|
104
|
-
puts @links
|
105
109
|
scan_links
|
106
110
|
return @links
|
107
111
|
end
|
112
|
+
=end
|
108
113
|
|
109
|
-
def
|
110
|
-
return @internal_links unless @internal_links.empty?
|
114
|
+
def report
|
111
115
|
scan_links
|
112
|
-
|
116
|
+
scan_outbound_links
|
117
|
+
scan_inbound_links
|
118
|
+
report = {
|
119
|
+
:internal_links => @internal_links,
|
120
|
+
:internal_outbound_links => @internal_outbound_links,
|
121
|
+
:outbound_links => @outbound_links,
|
122
|
+
:broken_outbound_links => @broken_outbound_links,
|
123
|
+
:external_inbound_links => @external_inbound_links,
|
124
|
+
:internal_inbound_links => @internal_inbound_links,
|
125
|
+
:external_outbound_links => @external_outbound_links,
|
126
|
+
:related_uris => @related_uris,
|
127
|
+
:invalid_links => @invalid_links
|
128
|
+
}
|
113
129
|
end
|
130
|
+
|
131
|
+
|
132
|
+
=begin
|
114
133
|
def external_outbound_links
|
115
134
|
return @external_outbound_links unless @external_outbound_links.empty?
|
116
135
|
links
|
117
136
|
return @external_outbound_links
|
118
137
|
end
|
138
|
+
|
119
139
|
def internal_outbound_links
|
120
140
|
return @internal_outbound_links unless @internal_outbound_links.empty?
|
121
141
|
links
|
122
142
|
return @internal_outbound_links
|
123
143
|
end
|
124
|
-
|
125
|
-
return external_outbound_links + internal_outbound_links
|
126
|
-
end
|
144
|
+
|
127
145
|
def back_links#inbound links among all the outbound links
|
128
146
|
return @back_links unless @back_links.empty?
|
129
147
|
scan_outbound_links
|
@@ -147,23 +165,27 @@ class Webpage
|
|
147
165
|
scan_inbound_links
|
148
166
|
return @internal_inbound_links
|
149
167
|
end
|
150
|
-
|
168
|
+
=end
|
151
169
|
def pagerank
|
152
|
-
return @pagerank unless @pagerank
|
153
|
-
require '
|
154
|
-
@pagerank = PageRankr.ranks(@uri.to_s, :google)
|
170
|
+
return @pagerank unless @pagerank.nil?
|
171
|
+
require 'page_rankr'
|
172
|
+
@pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
|
155
173
|
return @pagerank
|
156
174
|
end
|
157
175
|
|
158
176
|
# PageRank per link: the page's rank divided by the number of links found on it.
#
# Returns a Float, or false when either the rank is unavailable or the page
# has no links.
def ppl#pagerank per link
  pagerank
  return false if @pagerank.nil?
  scan_links
  # No links means there is nothing to share the rank between; this used to
  # raise ZeroDivisionError.
  return false if @links.empty?
  # Divide as floats: with an Integer rank, Integer#/ truncated the result to
  # 0 whenever the page had more links than rank points.
  return @pagerank.to_f / @links.size
end
|
161
|
-
|
162
|
-
|
182
|
+
def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
|
183
|
+
scan_links
|
163
184
|
raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
|
164
|
-
|
165
|
-
|
166
|
-
|
185
|
+
seed_uris.concat(@external_outbound_links - checked_uris)
|
186
|
+
related_keywords.concat(keywords)
|
187
|
+
result = Array.new
|
188
|
+
while seed_uris.size > 0 and result.size < max
|
167
189
|
uri = seed_uris.first
|
168
190
|
checked_uris << uri unless checked_uris.include?uri
|
169
191
|
seed_uris.delete(uri)
|
@@ -172,63 +194,84 @@ class Webpage
|
|
172
194
|
text = w.body + w.title
|
173
195
|
related_keywords.each do |word|
|
174
196
|
if text.include?word
|
175
|
-
|
176
|
-
|
197
|
+
#result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
|
198
|
+
domain = Webpage.host_to_domain(URI.parse(uri).host)
|
199
|
+
result << domain unless result.include? domain
|
200
|
+
seed_uris.concat(w.external_outbound_links - checked_uris)
|
177
201
|
break
|
178
202
|
end
|
179
203
|
end
|
180
204
|
end
|
181
|
-
return
|
205
|
+
return result
|
182
206
|
end
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
207
|
+
|
208
|
+
def link_to(target_uri)
|
209
|
+
scan_links
|
210
|
+
target_uri = Webpage.uri_normalize(target_uri)
|
211
|
+
target_host = URI.parse(target_uri).host
|
212
|
+
target_domain = Webpage.host_to_domain(target_host)
|
213
|
+
type = 0 #not link to
|
214
|
+
@links.each do |link|
|
215
|
+
candidate_host = URI.parse(link).host
|
216
|
+
if link == target_uri
|
217
|
+
type = 3 #definitely link to
|
218
|
+
break
|
219
|
+
elsif URI.parse(link).host == target_host
|
220
|
+
type = 2 if type < 2 #link to the host
|
221
|
+
elsif Webpage.host_to_domain(candidate_host) == target_domain
|
222
|
+
type = 1 if type < 1 #link to the root domain
|
192
223
|
end
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
224
|
+
end
|
225
|
+
return type
|
226
|
+
end
|
227
|
+
|
228
|
+
# Returns +uri+ as a normalized String with its fragment ("#...") removed.
#
# uri - a URI string accepted by URI.parse.
#
# Normalization is delegated to URI#normalize. Raises URI::InvalidURIError
# when +uri+ cannot be parsed.
def self.uri_normalize(uri)
  normalized = URI.parse(uri).normalize
  # Drop the fragment on the URI object itself. The previous
  # `to_s.delete("#frag")` treated the argument as a character set and
  # stripped every '#', 'f', 'r', 'a', 'g' from the whole URL.
  normalized.fragment = nil
  return normalized.to_s
end
|
235
|
+
|
236
|
+
# Extracts the last two dot-separated labels of +host+
# (e.g. "www.example.com" -> "example.com").
#
# host - a host name String, or nil (URI#host is nil for links such as
#        "mailto:" or "javascript:").
#
# Returns the two-label domain String, or false when +host+ is nil or has
# fewer than two labels.
#
# NOTE: this is a purely textual heuristic; multi-part public suffixes such
# as "co.uk" are not recognised.
def self.host_to_domain(host)
  # A nil host used to raise NoMethodError on `nil.match`.
  return false if host.nil?

  # \z anchors at the true end of the string ($ would also match before a newline).
  domain = host.match(/\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)\z/)
  return domain[1] unless domain.nil?
  return false
end
|
241
|
+
|
242
|
+
# Percent-encodes every character of +str+ that is neither an RFC 2396
# "unreserved" character nor one of the URL delimiters # : / ? % & =.
#
# str - the String to escape.
#
# Returns a new US-ASCII String; +str+ itself is not modified. Existing
# percent escapes are left alone because '%' is in the allowed set.
#
# URI.encode, which this used to call, was removed in Ruby 3.0, so the same
# byte-wise escaping is done here directly.
def self.uri_encode(str)
  # Same character class the old call built from URI::PATTERN::UNRESERVED
  # plus '#:/?%&='.
  unsafe = /[^\-_.!~*'()a-zA-Z\d#:\/?%&=]/
  escaped = str.gsub(unsafe) do |chunk|
    # A multibyte character becomes one %XX group per byte.
    chunk.each_byte.map { |byte| format('%%%02X', byte) }.join
  end
  return escaped.force_encoding(Encoding::US_ASCII)
end
|
245
|
+
|
246
|
+
def scan_inbound_links
|
247
|
+
scan_links
|
248
|
+
@back_links.each do |inlink|
|
249
|
+
inlink = URI.parse inlink
|
250
|
+
if @domain == Webpage.host_to_domain(inlink.host)
|
251
|
+
@internal_inbound_links << inlink.to_s
|
252
|
+
else
|
253
|
+
@external_inbound_links << inlink.to_s
|
203
254
|
end
|
204
255
|
end
|
205
|
-
@
|
206
|
-
|
207
|
-
@internal_outbound_links.uniq!
|
208
|
-
@external_outbound_links.uniq!
|
256
|
+
@internal_inbound_links.uniq!
|
257
|
+
@external_inbound_links.uniq!
|
209
258
|
end
|
210
259
|
|
211
260
|
def scan_outbound_links
|
212
|
-
outbound_links.each do |outlink|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
warn "bad uri:#{outlink}"
|
261
|
+
@outbound_links.each do |outlink|
|
262
|
+
w = Webpage.new(outlink)
|
263
|
+
unless w.successful
|
264
|
+
@invalid_links << outlink
|
217
265
|
next
|
218
266
|
end
|
219
267
|
next if w.links.nil?
|
220
268
|
w.links.each do |uri|
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
else
|
228
|
-
@broken_outbound_links << uri.to_s
|
229
|
-
end
|
230
|
-
rescue URI::InvalidURIError
|
231
|
-
warn "bad uri:#{uri}"
|
269
|
+
#uri = URI.parse(uri)
|
270
|
+
#next if uri.host.nil?
|
271
|
+
if Webpage.host_to_domain(uri) == @domain
|
272
|
+
@back_links << uri.to_s
|
273
|
+
else
|
274
|
+
@broken_outbound_links << uri.to_s
|
232
275
|
end
|
233
276
|
end
|
234
277
|
end
|
@@ -236,50 +279,38 @@ class Webpage
|
|
236
279
|
@broken_outbound_links.uniq!
|
237
280
|
end
|
238
281
|
|
239
|
-
def
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
282
|
+
# Classifies every <a href> of the fetched page.
#
# Fills, in place:
#   @links                   - every accepted link as an absolute URI string
#   @relative_paths          - hrefs that were relative, as written in the page
#   @internal_outbound_links - links whose domain equals @domain (deduplicated)
#   @external_outbound_links - links to any other domain (deduplicated)
#   @outbound_links          - internal followed by external outbound links
#   @invalid_links           - hrefs URI.parse rejected
#   @ignored_uris            - hrefs skipped without being classified
#
# Does nothing when @links is already populated, or when @page has no links
# to offer (for example after a failed fetch).
#
# Returns @outbound_links, or nil when nothing was scanned.
def scan_links
  return unless @links.empty?
  # @page is nil (or not an HTML page) when the fetch in initialize failed.
  return unless @page.respond_to?(:links)

  exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
  # No visible line of initialize creates this list, so make sure it exists
  # before appending to it.
  @ignored_uris ||= []

  @page.links.each do |link|
    raw_href = link.href
    # Anchors without an href have nothing to classify.
    next if raw_href.nil?

    # Preliminary parse.
    begin
      uri = URI.parse(raw_href)
    rescue URI::InvalidURIError
      # Record the offending href itself; the old code appended a variable
      # that was still nil at this point.
      @invalid_links << raw_href
      next
    end
    href = uri.to_s

    # Ignore files that are not web pages.
    if exts_to_ignored.include?(href[-4, 4])
      @ignored_uris << href
      next
    end

    if uri.relative?
      # Resolve relative paths against the page's own URI.
      @relative_paths << href
      href = @uri.merge(href).to_s
      @internal_outbound_links << href
    elsif uri.host.nil?
      # Absolute but host-less links (mailto:, javascript:, ...) cannot be
      # assigned to a domain, so they are only recorded as ignored.
      @ignored_uris << href
      next
    else
      href = Webpage.uri_normalize(href)
      if Webpage.host_to_domain(uri.host) == @domain
        @internal_outbound_links << href
      else
        @external_outbound_links << href
      end
    end
    @links << href
  end

  # uniq! returns nil when the array had no duplicates, so its result must
  # not be used as an operand; deduplicate first, then combine.
  @internal_outbound_links.uniq!
  @external_outbound_links.uniq!
  @outbound_links = @internal_outbound_links + @external_outbound_links
end
|
251
|
-
|
252
|
-
def uri_encode(str)
|
253
|
-
return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
|
254
|
-
end
|
255
|
-
def host_to_domain(host)
|
256
|
-
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
257
|
-
return domain[1] unless domain.nil?
|
258
|
-
return false
|
259
|
-
end
|
260
|
-
end
|
261
|
-
|
262
|
-
w = Webpage.new('http://cidian.youdao.com')
|
263
|
-
#puts w.external_outbound_links
|
264
|
-
related_keywords = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
|
265
|
-
puts w.related_uris(related_keywords)
|
266
|
-
exit
|
267
|
-
require 'yaml'
|
268
|
-
filename = './cidian.yaml'
|
269
|
-
if File.exists?(filename)
|
270
|
-
cached_cidian = YAML.load(File.read(filename))
|
271
|
-
else
|
272
|
-
cached_cidian = Hash.new
|
273
|
-
cached_cidian[:seed_uris] = Array.new
|
274
|
-
cached_cidian[:checked_uris] = Array.new
|
275
|
-
cached_cidian[:related_keywords] = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
|
276
|
-
cached_cidian[:related_uris] = Array.new
|
277
|
-
end
|
278
|
-
at_exit do
|
279
|
-
File.open(filename,'w'){|f|f.puts(cached_cidian.to_yaml)}
|
280
316
|
end
|
281
|
-
#puts w.related_uris(cached_cidian[:seed_uris],cached_cidian[:related_keywords],cached_cidian[:checked_uris],cached_cidian[:related_uris])
|
282
|
-
puts w.external_inbound_links
|
283
|
-
puts w.internal_inbound_links
|
284
|
-
puts w.pagerank
|
285
|
-
puts w.external_inbound_links
|