webpage 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/webpage.rb +121 -276
  2. metadata +6 -4
data/webpage.rb CHANGED
@@ -2,315 +2,160 @@
2
2
  require 'pp'
3
3
  require 'mechanize'
4
4
  require 'uri'
5
- class Webpage
6
- attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
7
- attr_accessor :ignored_exts
8
- def initialize(uri)
9
- @links = Array.new
10
- @relative_paths = Array.new
11
- @outbound_links = Array.new
12
- @internal_outbound_links = Array.new
13
- @external_outbound_links = Array.new
14
- @broken_outbound_links = Array.new
15
- @external_inbound_links = Array.new
16
- @back_links = Array.new
17
- @internal_inbound_links = Array.new
18
- @external_inbound_links = Array.new
19
- @internal_links = Array.new
20
- @invalid_links = Array.new
21
- @accessed_uri = Array.new
22
- @related_uris = Array.new
23
- @successful = false
24
- begin
25
- @uri = URI.parse(uri)
26
- raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
27
- @domain = Webpage.host_to_domain @uri.host
28
- agent = Mechanize.new
29
- agent.open_timeout = 3
30
- @page = agent.get @uri.to_s
31
- raise 'not webpage' unless @page.class == Mechanize::Page
32
- @page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
33
- @successful = true
34
- rescue Exception => e
35
- warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
36
- end
37
- end
38
-
39
- def encoding
40
- return @page.encoding
41
- end
42
-
43
- def keywords
44
- meta = @page.search("//meta[@name='keywords']").first
45
- return meta.attributes["content"].value.split(',') unless meta.nil?
46
- end
47
-
48
- def description
49
- meta = @page.search("//meta[@name='description']").first
50
- if meta.nil?
51
- return false
52
- end
53
- return meta.atrributes['content'].value
54
- end
55
-
56
- def body
57
- return @page.body
58
- #(return @page.body unless @page.body.include?'<html>') if @successful
59
- #return String.new
60
- end
61
-
62
- def text
63
- return Nokogiri::HTML(body).xpath("//text()").text
64
- #return body.gsub(/<\/?[^>]*>/, "")
65
- end
66
-
67
- def title
68
- return @page.title unless @page.title.nil?
69
- return false
70
- end
71
-
72
-
73
- #get all links from html content
74
- #1.$all = get all <a>
75
- #2.$href = get all href from $all
76
- #3.make all $href to be absolute path and put to @links
77
- =begin
78
- def links
79
- return @links unless @links.empty?
80
- begin
81
- agent = Mechanize.new
82
- agent.open_timeout = 5
83
- agent.get @uri do |page|
84
- page.links.each do |link| #1
85
- next if link.href.nil?
86
- uri = Webpage.uri_normalize(link.href)
87
- begin
88
- @links << @uri.merge(uri).to_s
89
- rescue URI::InvalidURIError,URI::InvalidComponentError
90
- warn "ignore\n #{uri} \n #{link.href}"
91
- end
92
- end
93
- end
94
- rescue Errno::ETIMEDOUT,Timeout::Error
95
- warn "timeout:#{@uri}"
96
- rescue NoMethodError => e
97
- warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
98
- rescue Zlib::GzipFile::Error,Mechanize::Error => e
99
- warn "gzip error:#{@uri}.#{e}"
100
- rescue Net::HTTP::Persistent::Error
101
- warn "network reset:#{@uri}"
102
- rescue SocketError =>e
103
- warn "#{e}.#{@uri}"
104
- end
105
- return Array.new if @links.empty?
106
- #@links = @links.uniq - @accessed_uri
107
- #@accessed_uri += @links
108
- @links.uniq!
109
- scan_links
110
- return @links
111
- end
112
- =end
113
-
114
- def report
115
- scan_links
116
- scan_outbound_links
117
- scan_inbound_links
118
- report = {
119
- :internal_links => @internal_links,
120
- :internal_outbound_links => @internal_outbound_links,
121
- :outbound_links => @outbound_links,
122
- :broken_outbound_links => @broken_outbound_links,
123
- :external_inbound_links => @external_inbound_links,
124
- :internal_inbound_links => @internal_inbound_links,
125
- :external_outbound_links => @external_outbound_links,
126
- :related_uris => @related_uris,
127
- :invalid_links => @invalid_links
128
- }
129
- end
130
-
131
-
132
- =begin
133
- def external_outbound_links
134
- return @external_outbound_links unless @external_outbound_links.empty?
135
- links
136
- return @external_outbound_links
137
- end
138
-
139
- def internal_outbound_links
140
- return @internal_outbound_links unless @internal_outbound_links.empty?
141
- links
142
- return @internal_outbound_links
143
- end
144
-
145
- def back_links#inbound links among all the outbound links
146
- return @back_links unless @back_links.empty?
147
- scan_outbound_links
148
- return @back_links
149
- end
150
-
151
- def broken_outbound_links
152
- return @broken_outbound_links unless @broken_outbound_links.empty?
153
- scan_outbound_links
154
- return @broken_outbound_links
155
- end
156
-
157
- def external_inbound_links#outter inbound links
158
- return @external_inbound_links unless @external_inbound_links.empty?
159
- scan_inbound_links
160
- return @external_inbound_links
161
- end
162
-
163
- def internal_inbound_links
164
- return @internal_inbound_links unless @internal_inbound_links.empty?
165
- scan_inbound_links
166
- return @internal_inbound_links
167
- end
168
- =end
169
- def pagerank
170
- return @pagerank unless @pagerank.nil?
171
- require 'page_rankr'
172
- @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
173
- return @pagerank
174
- end
175
-
176
- def ppl#pagerank per link
177
- pagerank
178
- return false if @pagerank.nil?
179
- scan_links
180
- return (@pagerank / @links.size)
181
- end
182
- def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
183
- scan_links
184
- raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
185
- seed_uris.concat(@external_outbound_links - checked_uris)
186
- related_keywords.concat(keywords)
187
- result = Array.new
188
- while seed_uris.size > 0 and result.size < max
189
- uri = seed_uris.first
190
- checked_uris << uri unless checked_uris.include?uri
191
- seed_uris.delete(uri)
192
- w = Webpage.new uri
193
- next unless w.successful
194
- text = w.body + w.title
195
- related_keywords.each do |word|
196
- if text.include?word
197
- #result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
198
- domain = Webpage.host_to_domain(URI.parse(uri).host)
199
- result << domain unless result.include? domain
200
- seed_uris.concat(w.external_outbound_links - checked_uris)
201
- break
202
- end
203
- end
204
- end
205
- return result
206
- end
207
-
208
- def link_to(target_uri)
209
- scan_links
210
- target_uri = Webpage.uri_normalize(target_uri)
211
- target_host = URI.parse(target_uri).host
212
- target_domain = Webpage.host_to_domain(target_host)
213
- type = 0 #not link to
214
- @links.each do |link|
215
- candidate_host = URI.parse(link).host
216
- if link == target_uri
217
- type = 3 #definitely link to
218
- break
219
- elsif URI.parse(link).host == target_host
220
- type = 2 if type < 2 #link to the host
221
- elsif Webpage.host_to_domain(candidate_host) == target_domain
222
- type = 1 if type < 1 #link to the root domain
223
- end
224
- end
225
- return type
226
- end
227
5
 
6
+ class WebHelper
228
7
  def self.uri_normalize(uri)
229
8
  uri = URI.parse(uri).normalize
230
9
  fragment = uri.fragment
231
- return uri.to_s.delete("##{fragment}")
10
+ uri = uri.to_s
11
+ uri.sub!(/##{fragment}$/,'') unless fragment.nil?
12
+ return uri
232
13
  #uri = uri.to_s.strip.sub(/\#.*$/,'')
233
14
  #uri.path = '/' if uri.path.nil?
234
15
  end
235
-
236
16
  def self.host_to_domain(host)
237
17
  domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
238
18
  return domain[1] unless domain.nil?
239
19
  return false
240
20
  end
241
-
242
21
  def self.uri_encode(str)
243
22
  return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
244
23
  end
24
+ end
245
25
 
246
- def scan_inbound_links
247
- scan_links
248
- @back_links.each do |inlink|
249
- inlink = URI.parse inlink
250
- if @domain == Webpage.host_to_domain(inlink.host)
251
- @internal_inbound_links << inlink.to_s
252
- else
253
- @external_inbound_links << inlink.to_s
254
- end
26
+ class Mechanize::Page
27
+ #@invalid_links = Hash.new
28
+ attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
29
+ public
30
+ def text
31
+ return Nokogiri::HTML(body).xpath("//text()").text
32
+ #return body.gsub(/<\/?[^>]*>/, "")
33
+ end
34
+ def keywords
35
+ meta = search("//meta[@name='keywords']").first
36
+ return meta.attributes["content"].value.split(',') unless meta.nil?
37
+ end
38
+
39
+ def description
40
+ meta = search("//meta[@name='description']").first
41
+ if meta.nil?
42
+ return false
255
43
  end
256
- @internal_inbound_links.uniq!
257
- @external_inbound_links.uniq!
44
+ return meta.attributes['content'].value
258
45
  end
259
46
 
260
- def scan_outbound_links
261
- @outbound_links.each do |outlink|
262
- w = Webpage.new(outlink)
263
- unless w.successful
264
- @invalid_links << outlink
265
- next
266
- end
267
- next if w.links.nil?
268
- w.links.each do |uri|
269
- #uri = URI.parse(uri)
270
- #next if uri.host.nil?
271
- if Webpage.host_to_domain(uri) == @domain
272
- @back_links << uri.to_s
273
- else
274
- @broken_outbound_links << uri.to_s
275
- end
276
- end
277
- end
278
- @back_links.uniq!
279
- @broken_outbound_links.uniq!
47
+ def pagerank
48
+ require 'page_rankr'
49
+ @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
50
+ return @pagerank
280
51
  end
281
52
 
282
53
  def scan_links
283
- return unless @links.empty?
54
+ @external_outbound_links = Array.new
55
+ @internal_outbound_links = Array.new
56
+ @valid_links = Array.new
57
+ @invalid_links = Array.new
58
+ @nofollowed_links = Array.new
284
59
  exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
285
- @page.links.each do |link|
60
+ links.each do |link|
286
61
  #初步解析
287
- begin
288
- uri = URI.parse(link.href)
289
- href = uri.to_s
290
- rescue URI::InvalidURIError => e
291
- @invalid_links << href
62
+ =begin
63
+ uri = URI.parse(link.uri).normalize
64
+ href = uri.to_s
65
+ rescue URI::InvalidURIError => e
66
+ pp link
67
+ puts e
68
+ @invalid_links << link
69
+ next
70
+ =end
71
+ #忽略非http请求
72
+ if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
73
+ @invalid_links << link#todo 不同链接key重复,无法体现
74
+ next
75
+ end
76
+ #忽略非网页文件,忽略js按钮忽略邮件
77
+ if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
78
+ @invalid_links << link
292
79
  next
293
80
  end
294
- #忽略非网页文件
295
- if exts_to_ignored.include?href[-4,4]
296
- @ignored_uris << href
81
+ #nofollow links
82
+ if link.rel.include?'nofollow'
83
+ @nofollowed_links << link
297
84
  next
298
85
  end
86
+ if link.respond_to?'fragment' and link.fragment.empty?
87
+ @invalid_links << link
88
+ next
89
+ end
90
+ pp link
299
91
  #处理相对路径
300
- if uri.relative?
301
- @relative_paths << href
302
- href = @uri.merge(href).to_s
303
- @internal_outbound_links << href
92
+ if !link.uri.nil? and link.uri.relative?
93
+ @invalid_links << link
94
+ #puts @uri.merge(link)
95
+ #link.uri = @uri.merge(link.uri)
96
+ @internal_outbound_links << link unless link.uri == @uri
97
+ elsif link.uri.nil?
98
+ warn "warning: host nil #{link.uri}"
99
+ next
304
100
  else
305
- href = Webpage.uri_normalize(href)
306
- if Webpage.host_to_domain(uri.host) == @domain
307
- @internal_outbound_links << href
101
+ if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
102
+ @internal_outbound_links << link
308
103
  else
309
- @external_outbound_links << href
104
+ @external_outbound_links << link
310
105
  end
311
106
  end
312
- @links << href
107
+ @valid_links << link
108
+ end
109
+ @outbound_links = @internal_outbound_links + @external_outbound_links
110
+ @scanned = true
111
+ end
112
+ end
113
+ class URI::Generic
114
+ def absolute?()
115
+ if @scheme or path.start_with?'/'
116
+ true
117
+ else
118
+ false
119
+ end
120
+ end
121
+ def domain
122
+ domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
123
+ return domain[1] unless domain.nil?
124
+ return nil
125
+ end
126
+ =begin
127
+ def normalize!
128
+ if path && path == ''
129
+ set_path('/')
130
+ end
131
+ if scheme && scheme != scheme.downcase
132
+ set_scheme(self.scheme.downcase)
313
133
  end
314
- @outbound_links = @internal_outbound_links.uniq! + @external_outbound_links.uniq!
134
+ if host && host != host.downcase
135
+ set_host(self.host.downcase)
136
+ end
137
+ set_fragment(nil) unless fragment.nil?
315
138
  end
139
+ =end
316
140
  end
141
+ =begin
142
+ class URI::Parser
143
+ def parse(uri)
144
+ scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
145
+
146
+ if scheme && URI.scheme_list.include?(scheme.upcase)
147
+ URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
148
+ else
149
+ URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
150
+ end
151
+ end
152
+ end
153
+ a = Mechanize.new
154
+ w = a.get('http://dict.youdao.com/w/abc/')
155
+ w.scan_links
156
+ pp w.internal_outbound_links
157
+ exit
158
+ w.links.each do |link|
159
+ puts link.rel
160
+ end
161
+ =end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,14 +11,15 @@ bindir: bin
11
11
  cert_chain: []
12
12
  date: 2012-04-11 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description: to show seo oriented reports of the webpage,newbie's work, careful
14
+ description: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
15
+ work, careful
15
16
  email: seoaqua@qq.com
16
17
  executables: []
17
18
  extensions: []
18
19
  extra_rdoc_files: []
19
20
  files:
20
21
  - webpage.rb
21
- homepage: http://seoaqua.com
22
+ homepage: http://github.com/seoaqua/ruby-webpage
22
23
  licenses: []
23
24
  post_install_message:
24
25
  rdoc_options: []
@@ -41,5 +42,6 @@ rubyforge_project:
41
42
  rubygems_version: 1.8.21
42
43
  signing_key:
43
44
  specification_version: 3
44
- summary: to show seo oriented reports of the webpage,newbie's work, careful
45
+ summary: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
46
+ work, careful
45
47
  test_files: []