webpage 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/webpage.rb +121 -276
- metadata +6 -4
data/webpage.rb
CHANGED
@@ -2,315 +2,160 @@
|
|
2
2
|
require 'pp'
|
3
3
|
require 'mechanize'
|
4
4
|
require 'uri'
|
5
|
-
class Webpage
|
6
|
-
attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
|
7
|
-
attr_accessor :ignored_exts
|
8
|
-
def initialize(uri)
|
9
|
-
@links = Array.new
|
10
|
-
@relative_paths = Array.new
|
11
|
-
@outbound_links = Array.new
|
12
|
-
@internal_outbound_links = Array.new
|
13
|
-
@external_outbound_links = Array.new
|
14
|
-
@broken_outbound_links = Array.new
|
15
|
-
@external_inbound_links = Array.new
|
16
|
-
@back_links = Array.new
|
17
|
-
@internal_inbound_links = Array.new
|
18
|
-
@external_inbound_links = Array.new
|
19
|
-
@internal_links = Array.new
|
20
|
-
@invalid_links = Array.new
|
21
|
-
@accessed_uri = Array.new
|
22
|
-
@related_uris = Array.new
|
23
|
-
@successful = false
|
24
|
-
begin
|
25
|
-
@uri = URI.parse(uri)
|
26
|
-
raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
|
27
|
-
@domain = Webpage.host_to_domain @uri.host
|
28
|
-
agent = Mechanize.new
|
29
|
-
agent.open_timeout = 3
|
30
|
-
@page = agent.get @uri.to_s
|
31
|
-
raise 'not webpage' unless @page.class == Mechanize::Page
|
32
|
-
@page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
|
33
|
-
@successful = true
|
34
|
-
rescue Exception => e
|
35
|
-
warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def encoding
|
40
|
-
return @page.encoding
|
41
|
-
end
|
42
|
-
|
43
|
-
def keywords
|
44
|
-
meta = @page.search("//meta[@name='keywords']").first
|
45
|
-
return meta.attributes["content"].value.split(',') unless meta.nil?
|
46
|
-
end
|
47
|
-
|
48
|
-
def description
|
49
|
-
meta = @page.search("//meta[@name='description']").first
|
50
|
-
if meta.nil?
|
51
|
-
return false
|
52
|
-
end
|
53
|
-
return meta.atrributes['content'].value
|
54
|
-
end
|
55
|
-
|
56
|
-
def body
|
57
|
-
return @page.body
|
58
|
-
#(return @page.body unless @page.body.include?'<html>') if @successful
|
59
|
-
#return String.new
|
60
|
-
end
|
61
|
-
|
62
|
-
def text
|
63
|
-
return Nokogiri::HTML(body).xpath("//text()").text
|
64
|
-
#return body.gsub(/<\/?[^>]*>/, "")
|
65
|
-
end
|
66
|
-
|
67
|
-
def title
|
68
|
-
return @page.title unless @page.title.nil?
|
69
|
-
return false
|
70
|
-
end
|
71
|
-
|
72
|
-
|
73
|
-
#get all links from html content
|
74
|
-
#1.$all = get all <a>
|
75
|
-
#2.$href = get all href from $all
|
76
|
-
#3.make all $href to be absolute path and put to @links
|
77
|
-
=begin
|
78
|
-
def links
|
79
|
-
return @links unless @links.empty?
|
80
|
-
begin
|
81
|
-
agent = Mechanize.new
|
82
|
-
agent.open_timeout = 5
|
83
|
-
agent.get @uri do |page|
|
84
|
-
page.links.each do |link| #1
|
85
|
-
next if link.href.nil?
|
86
|
-
uri = Webpage.uri_normalize(link.href)
|
87
|
-
begin
|
88
|
-
@links << @uri.merge(uri).to_s
|
89
|
-
rescue URI::InvalidURIError,URI::InvalidComponentError
|
90
|
-
warn "ignore\n #{uri} \n #{link.href}"
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
rescue Errno::ETIMEDOUT,Timeout::Error
|
95
|
-
warn "timeout:#{@uri}"
|
96
|
-
rescue NoMethodError => e
|
97
|
-
warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
|
98
|
-
rescue Zlib::GzipFile::Error,Mechanize::Error => e
|
99
|
-
warn "gzip error:#{@uri}.#{e}"
|
100
|
-
rescue Net::HTTP::Persistent::Error
|
101
|
-
warn "network reset:#{@uri}"
|
102
|
-
rescue SocketError =>e
|
103
|
-
warn "#{e}.#{@uri}"
|
104
|
-
end
|
105
|
-
return Array.new if @links.empty?
|
106
|
-
#@links = @links.uniq - @accessed_uri
|
107
|
-
#@accessed_uri += @links
|
108
|
-
@links.uniq!
|
109
|
-
scan_links
|
110
|
-
return @links
|
111
|
-
end
|
112
|
-
=end
|
113
|
-
|
114
|
-
def report
|
115
|
-
scan_links
|
116
|
-
scan_outbound_links
|
117
|
-
scan_inbound_links
|
118
|
-
report = {
|
119
|
-
:internal_links => @internal_links,
|
120
|
-
:internal_outbound_links => @internal_outbound_links,
|
121
|
-
:outbound_links => @outbound_links,
|
122
|
-
:broken_outbound_links => @broken_outbound_links,
|
123
|
-
:external_inbound_links => @external_inbound_links,
|
124
|
-
:internal_inbound_links => @internal_inbound_links,
|
125
|
-
:external_outbound_links => @external_outbound_links,
|
126
|
-
:related_uris => @related_uris,
|
127
|
-
:invalid_links => @invalid_links
|
128
|
-
}
|
129
|
-
end
|
130
|
-
|
131
|
-
|
132
|
-
=begin
|
133
|
-
def external_outbound_links
|
134
|
-
return @external_outbound_links unless @external_outbound_links.empty?
|
135
|
-
links
|
136
|
-
return @external_outbound_links
|
137
|
-
end
|
138
|
-
|
139
|
-
def internal_outbound_links
|
140
|
-
return @internal_outbound_links unless @internal_outbound_links.empty?
|
141
|
-
links
|
142
|
-
return @internal_outbound_links
|
143
|
-
end
|
144
|
-
|
145
|
-
def back_links#inbound links among all the outbound links
|
146
|
-
return @back_links unless @back_links.empty?
|
147
|
-
scan_outbound_links
|
148
|
-
return @back_links
|
149
|
-
end
|
150
|
-
|
151
|
-
def broken_outbound_links
|
152
|
-
return @broken_outbound_links unless @broken_outbound_links.empty?
|
153
|
-
scan_outbound_links
|
154
|
-
return @broken_outbound_links
|
155
|
-
end
|
156
|
-
|
157
|
-
def external_inbound_links#outter inbound links
|
158
|
-
return @external_inbound_links unless @external_inbound_links.empty?
|
159
|
-
scan_inbound_links
|
160
|
-
return @external_inbound_links
|
161
|
-
end
|
162
|
-
|
163
|
-
def internal_inbound_links
|
164
|
-
return @internal_inbound_links unless @internal_inbound_links.empty?
|
165
|
-
scan_inbound_links
|
166
|
-
return @internal_inbound_links
|
167
|
-
end
|
168
|
-
=end
|
169
|
-
def pagerank
|
170
|
-
return @pagerank unless @pagerank.nil?
|
171
|
-
require 'page_rankr'
|
172
|
-
@pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
|
173
|
-
return @pagerank
|
174
|
-
end
|
175
|
-
|
176
|
-
def ppl#pagerank per link
|
177
|
-
pagerank
|
178
|
-
return false if @pagerank.nil?
|
179
|
-
scan_links
|
180
|
-
return (@pagerank / @links.size)
|
181
|
-
end
|
182
|
-
def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
|
183
|
-
scan_links
|
184
|
-
raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
|
185
|
-
seed_uris.concat(@external_outbound_links - checked_uris)
|
186
|
-
related_keywords.concat(keywords)
|
187
|
-
result = Array.new
|
188
|
-
while seed_uris.size > 0 and result.size < max
|
189
|
-
uri = seed_uris.first
|
190
|
-
checked_uris << uri unless checked_uris.include?uri
|
191
|
-
seed_uris.delete(uri)
|
192
|
-
w = Webpage.new uri
|
193
|
-
next unless w.successful
|
194
|
-
text = w.body + w.title
|
195
|
-
related_keywords.each do |word|
|
196
|
-
if text.include?word
|
197
|
-
#result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
|
198
|
-
domain = Webpage.host_to_domain(URI.parse(uri).host)
|
199
|
-
result << domain unless result.include? domain
|
200
|
-
seed_uris.concat(w.external_outbound_links - checked_uris)
|
201
|
-
break
|
202
|
-
end
|
203
|
-
end
|
204
|
-
end
|
205
|
-
return result
|
206
|
-
end
|
207
|
-
|
208
|
-
def link_to(target_uri)
|
209
|
-
scan_links
|
210
|
-
target_uri = Webpage.uri_normalize(target_uri)
|
211
|
-
target_host = URI.parse(target_uri).host
|
212
|
-
target_domain = Webpage.host_to_domain(target_host)
|
213
|
-
type = 0 #not link to
|
214
|
-
@links.each do |link|
|
215
|
-
candidate_host = URI.parse(link).host
|
216
|
-
if link == target_uri
|
217
|
-
type = 3 #definitely link to
|
218
|
-
break
|
219
|
-
elsif URI.parse(link).host == target_host
|
220
|
-
type = 2 if type < 2 #link to the host
|
221
|
-
elsif Webpage.host_to_domain(candidate_host) == target_domain
|
222
|
-
type = 1 if type < 1 #link to the root domain
|
223
|
-
end
|
224
|
-
end
|
225
|
-
return type
|
226
|
-
end
|
227
5
|
|
6
|
+
class WebHelper
|
228
7
|
def self.uri_normalize(uri)
|
229
8
|
uri = URI.parse(uri).normalize
|
230
9
|
fragment = uri.fragment
|
231
|
-
|
10
|
+
uri = uri.to_s
|
11
|
+
uri.sub!(/##{fragment}$/,'') unless fragment.nil?
|
12
|
+
return uri
|
232
13
|
#uri = uri.to_s.strip.sub(/\#.*$/,'')
|
233
14
|
#uri.path = '/' if uri.path.nil?
|
234
15
|
end
|
235
|
-
|
236
16
|
def self.host_to_domain(host)
|
237
17
|
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
238
18
|
return domain[1] unless domain.nil?
|
239
19
|
return false
|
240
20
|
end
|
241
|
-
|
242
21
|
def self.uri_encode(str)
|
243
22
|
return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
|
244
23
|
end
|
24
|
+
end
|
245
25
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
26
|
+
class Mechanize::Page
|
27
|
+
#@invalid_links = Hash.new
|
28
|
+
attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
|
29
|
+
public
|
30
|
+
def text
|
31
|
+
return Nokogiri::HTML(body).xpath("//text()").text
|
32
|
+
#return body.gsub(/<\/?[^>]*>/, "")
|
33
|
+
end
|
34
|
+
def keywords
|
35
|
+
meta = search("//meta[@name='keywords']").first
|
36
|
+
return meta.attributes["content"].value.split(',') unless meta.nil?
|
37
|
+
end
|
38
|
+
|
39
|
+
def description
|
40
|
+
meta = search("//meta[@name='description']").first
|
41
|
+
if meta.nil?
|
42
|
+
return false
|
255
43
|
end
|
256
|
-
|
257
|
-
@external_inbound_links.uniq!
|
44
|
+
return meta.attributes['content'].value
|
258
45
|
end
|
259
46
|
|
260
|
-
def
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
@invalid_links << outlink
|
265
|
-
next
|
266
|
-
end
|
267
|
-
next if w.links.nil?
|
268
|
-
w.links.each do |uri|
|
269
|
-
#uri = URI.parse(uri)
|
270
|
-
#next if uri.host.nil?
|
271
|
-
if Webpage.host_to_domain(uri) == @domain
|
272
|
-
@back_links << uri.to_s
|
273
|
-
else
|
274
|
-
@broken_outbound_links << uri.to_s
|
275
|
-
end
|
276
|
-
end
|
277
|
-
end
|
278
|
-
@back_links.uniq!
|
279
|
-
@broken_outbound_links.uniq!
|
47
|
+
def pagerank
|
48
|
+
require 'page_rankr'
|
49
|
+
@pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
|
50
|
+
return @pagerank
|
280
51
|
end
|
281
52
|
|
282
53
|
def scan_links
|
283
|
-
|
54
|
+
@external_outbound_links = Array.new
|
55
|
+
@internal_outbound_links = Array.new
|
56
|
+
@valid_links = Array.new
|
57
|
+
@invalid_links = Array.new
|
58
|
+
@nofollowed_links = Array.new
|
284
59
|
exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
|
285
|
-
|
60
|
+
links.each do |link|
|
286
61
|
#初步解析
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
62
|
+
=begin
|
63
|
+
uri = URI.parse(link.uri).normalize
|
64
|
+
href = uri.to_s
|
65
|
+
rescue URI::InvalidURIError => e
|
66
|
+
pp link
|
67
|
+
puts e
|
68
|
+
@invalid_links << link
|
69
|
+
next
|
70
|
+
=end
|
71
|
+
#忽略非http请求
|
72
|
+
if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
|
73
|
+
@invalid_links << link#todo 不同链接key重复,无法体现
|
74
|
+
next
|
75
|
+
end
|
76
|
+
#忽略非网页文件,忽略js按钮忽略邮件
|
77
|
+
if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
|
78
|
+
@invalid_links << link
|
292
79
|
next
|
293
80
|
end
|
294
|
-
|
295
|
-
if
|
296
|
-
@
|
81
|
+
#nofollow links
|
82
|
+
if link.rel.include?'nofollow'
|
83
|
+
@nofollowed_links << link
|
297
84
|
next
|
298
85
|
end
|
86
|
+
if link.respond_to?'fragment' and link.fragment.empty?
|
87
|
+
@invalid_links << link
|
88
|
+
next
|
89
|
+
end
|
90
|
+
pp link
|
299
91
|
#处理相对路径
|
300
|
-
if uri.relative?
|
301
|
-
@
|
302
|
-
|
303
|
-
|
92
|
+
if !link.uri.nil? and link.uri.relative?
|
93
|
+
@invalid_links << link
|
94
|
+
#puts @uri.merge(link)
|
95
|
+
#link.uri = @uri.merge(link.uri)
|
96
|
+
@internal_outbound_links << link unless link.uri == @uri
|
97
|
+
elsif link.uri.nil?
|
98
|
+
warn "warning: host nil #{link.uri}"
|
99
|
+
next
|
304
100
|
else
|
305
|
-
|
306
|
-
|
307
|
-
@internal_outbound_links << href
|
101
|
+
if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
|
102
|
+
@internal_outbound_links << link
|
308
103
|
else
|
309
|
-
@external_outbound_links <<
|
104
|
+
@external_outbound_links << link
|
310
105
|
end
|
311
106
|
end
|
312
|
-
@
|
107
|
+
@valid_links << link
|
108
|
+
end
|
109
|
+
@outbound_links = @internal_outbound_links + @external_outbound_links
|
110
|
+
@scanned = true
|
111
|
+
end
|
112
|
+
end
|
113
|
+
class URI::Generic
|
114
|
+
def absolute?()
|
115
|
+
if @scheme or path.start_with?'/'
|
116
|
+
true
|
117
|
+
else
|
118
|
+
false
|
119
|
+
end
|
120
|
+
end
|
121
|
+
def domain
|
122
|
+
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
123
|
+
return domain[1] unless domain.nil?
|
124
|
+
return nil
|
125
|
+
end
|
126
|
+
=begin
|
127
|
+
def normalize!
|
128
|
+
if path && path == ''
|
129
|
+
set_path('/')
|
130
|
+
end
|
131
|
+
if scheme && scheme != scheme.downcase
|
132
|
+
set_scheme(self.scheme.downcase)
|
313
133
|
end
|
314
|
-
|
134
|
+
if host && host != host.downcase
|
135
|
+
set_host(self.host.downcase)
|
136
|
+
end
|
137
|
+
set_fragment(nil) unless fragment.nil?
|
315
138
|
end
|
139
|
+
=end
|
316
140
|
end
|
141
|
+
=begin
|
142
|
+
class URI::Parser
|
143
|
+
def parse(uri)
|
144
|
+
scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
|
145
|
+
|
146
|
+
if scheme && URI.scheme_list.include?(scheme.upcase)
|
147
|
+
URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
|
148
|
+
else
|
149
|
+
URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
a = Mechanize.new
|
154
|
+
w = a.get('http://dict.youdao.com/w/abc/')
|
155
|
+
w.scan_links
|
156
|
+
pp w.internal_outbound_links
|
157
|
+
exit
|
158
|
+
w.links.each do |link|
|
159
|
+
puts link.rel
|
160
|
+
end
|
161
|
+
=end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,14 +11,15 @@ bindir: bin
|
|
11
11
|
cert_chain: []
|
12
12
|
date: 2012-04-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
|
-
description: to show seo oriented reports of the webpage,newbie's
|
14
|
+
description: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
|
15
|
+
work, careful
|
15
16
|
email: seoaqua@qq.com
|
16
17
|
executables: []
|
17
18
|
extensions: []
|
18
19
|
extra_rdoc_files: []
|
19
20
|
files:
|
20
21
|
- webpage.rb
|
21
|
-
homepage: http://
|
22
|
+
homepage: http://github.com/seoaqua/ruby-webpage
|
22
23
|
licenses: []
|
23
24
|
post_install_message:
|
24
25
|
rdoc_options: []
|
@@ -41,5 +42,6 @@ rubyforge_project:
|
|
41
42
|
rubygems_version: 1.8.21
|
42
43
|
signing_key:
|
43
44
|
specification_version: 3
|
44
|
-
summary: to show seo oriented reports of the webpage,newbie's
|
45
|
+
summary: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
|
46
|
+
work, careful
|
45
47
|
test_files: []
|