webpage 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/webpage.rb +121 -276
- metadata +6 -4
data/webpage.rb
CHANGED
@@ -2,315 +2,160 @@
|
|
2
2
|
require 'pp'
|
3
3
|
require 'mechanize'
|
4
4
|
require 'uri'
|
5
|
-
class Webpage
|
6
|
-
attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
|
7
|
-
attr_accessor :ignored_exts
|
8
|
-
def initialize(uri)
|
9
|
-
@links = Array.new
|
10
|
-
@relative_paths = Array.new
|
11
|
-
@outbound_links = Array.new
|
12
|
-
@internal_outbound_links = Array.new
|
13
|
-
@external_outbound_links = Array.new
|
14
|
-
@broken_outbound_links = Array.new
|
15
|
-
@external_inbound_links = Array.new
|
16
|
-
@back_links = Array.new
|
17
|
-
@internal_inbound_links = Array.new
|
18
|
-
@external_inbound_links = Array.new
|
19
|
-
@internal_links = Array.new
|
20
|
-
@invalid_links = Array.new
|
21
|
-
@accessed_uri = Array.new
|
22
|
-
@related_uris = Array.new
|
23
|
-
@successful = false
|
24
|
-
begin
|
25
|
-
@uri = URI.parse(uri)
|
26
|
-
raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
|
27
|
-
@domain = Webpage.host_to_domain @uri.host
|
28
|
-
agent = Mechanize.new
|
29
|
-
agent.open_timeout = 3
|
30
|
-
@page = agent.get @uri.to_s
|
31
|
-
raise 'not webpage' unless @page.class == Mechanize::Page
|
32
|
-
@page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
|
33
|
-
@successful = true
|
34
|
-
rescue Exception => e
|
35
|
-
warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def encoding
|
40
|
-
return @page.encoding
|
41
|
-
end
|
42
|
-
|
43
|
-
def keywords
|
44
|
-
meta = @page.search("//meta[@name='keywords']").first
|
45
|
-
return meta.attributes["content"].value.split(',') unless meta.nil?
|
46
|
-
end
|
47
|
-
|
48
|
-
def description
|
49
|
-
meta = @page.search("//meta[@name='description']").first
|
50
|
-
if meta.nil?
|
51
|
-
return false
|
52
|
-
end
|
53
|
-
return meta.atrributes['content'].value
|
54
|
-
end
|
55
|
-
|
56
|
-
def body
|
57
|
-
return @page.body
|
58
|
-
#(return @page.body unless @page.body.include?'<html>') if @successful
|
59
|
-
#return String.new
|
60
|
-
end
|
61
|
-
|
62
|
-
def text
|
63
|
-
return Nokogiri::HTML(body).xpath("//text()").text
|
64
|
-
#return body.gsub(/<\/?[^>]*>/, "")
|
65
|
-
end
|
66
|
-
|
67
|
-
def title
|
68
|
-
return @page.title unless @page.title.nil?
|
69
|
-
return false
|
70
|
-
end
|
71
|
-
|
72
|
-
|
73
|
-
#get all links from html content
|
74
|
-
#1.$all = get all <a>
|
75
|
-
#2.$href = get all href from $all
|
76
|
-
#3.make all $href to be absolute path and put to @links
|
77
|
-
=begin
|
78
|
-
def links
|
79
|
-
return @links unless @links.empty?
|
80
|
-
begin
|
81
|
-
agent = Mechanize.new
|
82
|
-
agent.open_timeout = 5
|
83
|
-
agent.get @uri do |page|
|
84
|
-
page.links.each do |link| #1
|
85
|
-
next if link.href.nil?
|
86
|
-
uri = Webpage.uri_normalize(link.href)
|
87
|
-
begin
|
88
|
-
@links << @uri.merge(uri).to_s
|
89
|
-
rescue URI::InvalidURIError,URI::InvalidComponentError
|
90
|
-
warn "ignore\n #{uri} \n #{link.href}"
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
rescue Errno::ETIMEDOUT,Timeout::Error
|
95
|
-
warn "timeout:#{@uri}"
|
96
|
-
rescue NoMethodError => e
|
97
|
-
warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
|
98
|
-
rescue Zlib::GzipFile::Error,Mechanize::Error => e
|
99
|
-
warn "gzip error:#{@uri}.#{e}"
|
100
|
-
rescue Net::HTTP::Persistent::Error
|
101
|
-
warn "network reset:#{@uri}"
|
102
|
-
rescue SocketError =>e
|
103
|
-
warn "#{e}.#{@uri}"
|
104
|
-
end
|
105
|
-
return Array.new if @links.empty?
|
106
|
-
#@links = @links.uniq - @accessed_uri
|
107
|
-
#@accessed_uri += @links
|
108
|
-
@links.uniq!
|
109
|
-
scan_links
|
110
|
-
return @links
|
111
|
-
end
|
112
|
-
=end
|
113
|
-
|
114
|
-
def report
|
115
|
-
scan_links
|
116
|
-
scan_outbound_links
|
117
|
-
scan_inbound_links
|
118
|
-
report = {
|
119
|
-
:internal_links => @internal_links,
|
120
|
-
:internal_outbound_links => @internal_outbound_links,
|
121
|
-
:outbound_links => @outbound_links,
|
122
|
-
:broken_outbound_links => @broken_outbound_links,
|
123
|
-
:external_inbound_links => @external_inbound_links,
|
124
|
-
:internal_inbound_links => @internal_inbound_links,
|
125
|
-
:external_outbound_links => @external_outbound_links,
|
126
|
-
:related_uris => @related_uris,
|
127
|
-
:invalid_links => @invalid_links
|
128
|
-
}
|
129
|
-
end
|
130
|
-
|
131
|
-
|
132
|
-
=begin
|
133
|
-
def external_outbound_links
|
134
|
-
return @external_outbound_links unless @external_outbound_links.empty?
|
135
|
-
links
|
136
|
-
return @external_outbound_links
|
137
|
-
end
|
138
|
-
|
139
|
-
def internal_outbound_links
|
140
|
-
return @internal_outbound_links unless @internal_outbound_links.empty?
|
141
|
-
links
|
142
|
-
return @internal_outbound_links
|
143
|
-
end
|
144
|
-
|
145
|
-
def back_links#inbound links among all the outbound links
|
146
|
-
return @back_links unless @back_links.empty?
|
147
|
-
scan_outbound_links
|
148
|
-
return @back_links
|
149
|
-
end
|
150
|
-
|
151
|
-
def broken_outbound_links
|
152
|
-
return @broken_outbound_links unless @broken_outbound_links.empty?
|
153
|
-
scan_outbound_links
|
154
|
-
return @broken_outbound_links
|
155
|
-
end
|
156
|
-
|
157
|
-
def external_inbound_links#outter inbound links
|
158
|
-
return @external_inbound_links unless @external_inbound_links.empty?
|
159
|
-
scan_inbound_links
|
160
|
-
return @external_inbound_links
|
161
|
-
end
|
162
|
-
|
163
|
-
def internal_inbound_links
|
164
|
-
return @internal_inbound_links unless @internal_inbound_links.empty?
|
165
|
-
scan_inbound_links
|
166
|
-
return @internal_inbound_links
|
167
|
-
end
|
168
|
-
=end
|
169
|
-
def pagerank
|
170
|
-
return @pagerank unless @pagerank.nil?
|
171
|
-
require 'page_rankr'
|
172
|
-
@pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
|
173
|
-
return @pagerank
|
174
|
-
end
|
175
|
-
|
176
|
-
def ppl#pagerank per link
|
177
|
-
pagerank
|
178
|
-
return false if @pagerank.nil?
|
179
|
-
scan_links
|
180
|
-
return (@pagerank / @links.size)
|
181
|
-
end
|
182
|
-
def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
|
183
|
-
scan_links
|
184
|
-
raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
|
185
|
-
seed_uris.concat(@external_outbound_links - checked_uris)
|
186
|
-
related_keywords.concat(keywords)
|
187
|
-
result = Array.new
|
188
|
-
while seed_uris.size > 0 and result.size < max
|
189
|
-
uri = seed_uris.first
|
190
|
-
checked_uris << uri unless checked_uris.include?uri
|
191
|
-
seed_uris.delete(uri)
|
192
|
-
w = Webpage.new uri
|
193
|
-
next unless w.successful
|
194
|
-
text = w.body + w.title
|
195
|
-
related_keywords.each do |word|
|
196
|
-
if text.include?word
|
197
|
-
#result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
|
198
|
-
domain = Webpage.host_to_domain(URI.parse(uri).host)
|
199
|
-
result << domain unless result.include? domain
|
200
|
-
seed_uris.concat(w.external_outbound_links - checked_uris)
|
201
|
-
break
|
202
|
-
end
|
203
|
-
end
|
204
|
-
end
|
205
|
-
return result
|
206
|
-
end
|
207
|
-
|
208
|
-
def link_to(target_uri)
|
209
|
-
scan_links
|
210
|
-
target_uri = Webpage.uri_normalize(target_uri)
|
211
|
-
target_host = URI.parse(target_uri).host
|
212
|
-
target_domain = Webpage.host_to_domain(target_host)
|
213
|
-
type = 0 #not link to
|
214
|
-
@links.each do |link|
|
215
|
-
candidate_host = URI.parse(link).host
|
216
|
-
if link == target_uri
|
217
|
-
type = 3 #definitely link to
|
218
|
-
break
|
219
|
-
elsif URI.parse(link).host == target_host
|
220
|
-
type = 2 if type < 2 #link to the host
|
221
|
-
elsif Webpage.host_to_domain(candidate_host) == target_domain
|
222
|
-
type = 1 if type < 1 #link to the root domain
|
223
|
-
end
|
224
|
-
end
|
225
|
-
return type
|
226
|
-
end
|
227
5
|
|
6
|
+
class WebHelper
|
228
7
|
def self.uri_normalize(uri)
|
229
8
|
uri = URI.parse(uri).normalize
|
230
9
|
fragment = uri.fragment
|
231
|
-
|
10
|
+
uri = uri.to_s
|
11
|
+
uri.sub!(/##{fragment}$/,'') unless fragment.nil?
|
12
|
+
return uri
|
232
13
|
#uri = uri.to_s.strip.sub(/\#.*$/,'')
|
233
14
|
#uri.path = '/' if uri.path.nil?
|
234
15
|
end
|
235
|
-
|
236
16
|
def self.host_to_domain(host)
|
237
17
|
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
238
18
|
return domain[1] unless domain.nil?
|
239
19
|
return false
|
240
20
|
end
|
241
|
-
|
242
21
|
def self.uri_encode(str)
|
243
22
|
return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
|
244
23
|
end
|
24
|
+
end
|
245
25
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
26
|
+
class Mechanize::Page
|
27
|
+
#@invalid_links = Hash.new
|
28
|
+
attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
|
29
|
+
public
|
30
|
+
def text
|
31
|
+
return Nokogiri::HTML(body).xpath("//text()").text
|
32
|
+
#return body.gsub(/<\/?[^>]*>/, "")
|
33
|
+
end
|
34
|
+
def keywords
|
35
|
+
meta = search("//meta[@name='keywords']").first
|
36
|
+
return meta.attributes["content"].value.split(',') unless meta.nil?
|
37
|
+
end
|
38
|
+
|
39
|
+
def description
|
40
|
+
meta = search("//meta[@name='description']").first
|
41
|
+
if meta.nil?
|
42
|
+
return false
|
255
43
|
end
|
256
|
-
|
257
|
-
@external_inbound_links.uniq!
|
44
|
+
return meta.attributes['content'].value
|
258
45
|
end
|
259
46
|
|
260
|
-
def
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
@invalid_links << outlink
|
265
|
-
next
|
266
|
-
end
|
267
|
-
next if w.links.nil?
|
268
|
-
w.links.each do |uri|
|
269
|
-
#uri = URI.parse(uri)
|
270
|
-
#next if uri.host.nil?
|
271
|
-
if Webpage.host_to_domain(uri) == @domain
|
272
|
-
@back_links << uri.to_s
|
273
|
-
else
|
274
|
-
@broken_outbound_links << uri.to_s
|
275
|
-
end
|
276
|
-
end
|
277
|
-
end
|
278
|
-
@back_links.uniq!
|
279
|
-
@broken_outbound_links.uniq!
|
47
|
+
def pagerank
|
48
|
+
require 'page_rankr'
|
49
|
+
@pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
|
50
|
+
return @pagerank
|
280
51
|
end
|
281
52
|
|
282
53
|
def scan_links
|
283
|
-
|
54
|
+
@external_outbound_links = Array.new
|
55
|
+
@internal_outbound_links = Array.new
|
56
|
+
@valid_links = Array.new
|
57
|
+
@invalid_links = Array.new
|
58
|
+
@nofollowed_links = Array.new
|
284
59
|
exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
|
285
|
-
|
60
|
+
links.each do |link|
|
286
61
|
#初步解析
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
62
|
+
=begin
|
63
|
+
uri = URI.parse(link.uri).normalize
|
64
|
+
href = uri.to_s
|
65
|
+
rescue URI::InvalidURIError => e
|
66
|
+
pp link
|
67
|
+
puts e
|
68
|
+
@invalid_links << link
|
69
|
+
next
|
70
|
+
=end
|
71
|
+
#忽略非http请求
|
72
|
+
if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
|
73
|
+
@invalid_links << link#todo 不同链接key重复,无法体现
|
74
|
+
next
|
75
|
+
end
|
76
|
+
#忽略非网页文件,忽略js按钮忽略邮件
|
77
|
+
if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
|
78
|
+
@invalid_links << link
|
292
79
|
next
|
293
80
|
end
|
294
|
-
|
295
|
-
if
|
296
|
-
@
|
81
|
+
#nofollow links
|
82
|
+
if link.rel.include?'nofollow'
|
83
|
+
@nofollowed_links << link
|
297
84
|
next
|
298
85
|
end
|
86
|
+
if link.respond_to?'fragment' and link.fragment.empty?
|
87
|
+
@invalid_links << link
|
88
|
+
next
|
89
|
+
end
|
90
|
+
pp link
|
299
91
|
#处理相对路径
|
300
|
-
if uri.relative?
|
301
|
-
@
|
302
|
-
|
303
|
-
|
92
|
+
if !link.uri.nil? and link.uri.relative?
|
93
|
+
@invalid_links << link
|
94
|
+
#puts @uri.merge(link)
|
95
|
+
#link.uri = @uri.merge(link.uri)
|
96
|
+
@internal_outbound_links << link unless link.uri == @uri
|
97
|
+
elsif link.uri.nil?
|
98
|
+
warn "warning: host nil #{link.uri}"
|
99
|
+
next
|
304
100
|
else
|
305
|
-
|
306
|
-
|
307
|
-
@internal_outbound_links << href
|
101
|
+
if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
|
102
|
+
@internal_outbound_links << link
|
308
103
|
else
|
309
|
-
@external_outbound_links <<
|
104
|
+
@external_outbound_links << link
|
310
105
|
end
|
311
106
|
end
|
312
|
-
@
|
107
|
+
@valid_links << link
|
108
|
+
end
|
109
|
+
@outbound_links = @internal_outbound_links + @external_outbound_links
|
110
|
+
@scanned = true
|
111
|
+
end
|
112
|
+
end
|
113
|
+
class URI::Generic
|
114
|
+
def absolute?()
|
115
|
+
if @scheme or path.start_with?'/'
|
116
|
+
true
|
117
|
+
else
|
118
|
+
false
|
119
|
+
end
|
120
|
+
end
|
121
|
+
def domain
|
122
|
+
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
123
|
+
return domain[1] unless domain.nil?
|
124
|
+
return nil
|
125
|
+
end
|
126
|
+
=begin
|
127
|
+
def normalize!
|
128
|
+
if path && path == ''
|
129
|
+
set_path('/')
|
130
|
+
end
|
131
|
+
if scheme && scheme != scheme.downcase
|
132
|
+
set_scheme(self.scheme.downcase)
|
313
133
|
end
|
314
|
-
|
134
|
+
if host && host != host.downcase
|
135
|
+
set_host(self.host.downcase)
|
136
|
+
end
|
137
|
+
set_fragment(nil) unless fragment.nil?
|
315
138
|
end
|
139
|
+
=end
|
316
140
|
end
|
141
|
+
=begin
|
142
|
+
class URI::Parser
|
143
|
+
def parse(uri)
|
144
|
+
scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
|
145
|
+
|
146
|
+
if scheme && URI.scheme_list.include?(scheme.upcase)
|
147
|
+
URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
|
148
|
+
else
|
149
|
+
URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
a = Mechanize.new
|
154
|
+
w = a.get('http://dict.youdao.com/w/abc/')
|
155
|
+
w.scan_links
|
156
|
+
pp w.internal_outbound_links
|
157
|
+
exit
|
158
|
+
w.links.each do |link|
|
159
|
+
puts link.rel
|
160
|
+
end
|
161
|
+
=end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,14 +11,15 @@ bindir: bin
|
|
11
11
|
cert_chain: []
|
12
12
|
date: 2012-04-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
|
-
description: to show seo oriented reports of the webpage,newbie's
|
14
|
+
description: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
|
15
|
+
work, careful
|
15
16
|
email: seoaqua@qq.com
|
16
17
|
executables: []
|
17
18
|
extensions: []
|
18
19
|
extra_rdoc_files: []
|
19
20
|
files:
|
20
21
|
- webpage.rb
|
21
|
-
homepage: http://
|
22
|
+
homepage: http://github.com/seoaqua/ruby-webpage
|
22
23
|
licenses: []
|
23
24
|
post_install_message:
|
24
25
|
rdoc_options: []
|
@@ -41,5 +42,6 @@ rubyforge_project:
|
|
41
42
|
rubygems_version: 1.8.21
|
42
43
|
signing_key:
|
43
44
|
specification_version: 3
|
44
|
-
summary: to show seo oriented reports of the webpage,newbie's
|
45
|
+
summary: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
|
46
|
+
work, careful
|
45
47
|
test_files: []
|