webpage 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/webpage.rb +121 -276
  2. metadata +6 -4
data/webpage.rb CHANGED
@@ -2,315 +2,160 @@
2
2
  require 'pp'
3
3
  require 'mechanize'
4
4
  require 'uri'
5
- class Webpage
6
- attr_reader :links,:relative_paths,:outbound_links,:successful,:related_uris,:invalid_links,:internal_links,:internal_outbound_links,:internal_inbound_links,:broken_outbound_links,:external_outbound_links,:external_inbound_links
7
- attr_accessor :ignored_exts
8
- def initialize(uri)
9
- @links = Array.new
10
- @relative_paths = Array.new
11
- @outbound_links = Array.new
12
- @internal_outbound_links = Array.new
13
- @external_outbound_links = Array.new
14
- @broken_outbound_links = Array.new
15
- @external_inbound_links = Array.new
16
- @back_links = Array.new
17
- @internal_inbound_links = Array.new
18
- @external_inbound_links = Array.new
19
- @internal_links = Array.new
20
- @invalid_links = Array.new
21
- @accessed_uri = Array.new
22
- @related_uris = Array.new
23
- @successful = false
24
- begin
25
- @uri = URI.parse(uri)
26
- raise 'not url' unless @uri.class == URI::HTTP or @uri.class == URI::HTTPS
27
- @domain = Webpage.host_to_domain @uri.host
28
- agent = Mechanize.new
29
- agent.open_timeout = 3
30
- @page = agent.get @uri.to_s
31
- raise 'not webpage' unless @page.class == Mechanize::Page
32
- @page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
33
- @successful = true
34
- rescue Exception => e
35
- warn "................\nget #{@uri} failed\n.#{e.backtrace.join("\n")}\n #{e}\nURI:.............."
36
- end
37
- end
38
-
39
- def encoding
40
- return @page.encoding
41
- end
42
-
43
- def keywords
44
- meta = @page.search("//meta[@name='keywords']").first
45
- return meta.attributes["content"].value.split(',') unless meta.nil?
46
- end
47
-
48
- def description
49
- meta = @page.search("//meta[@name='description']").first
50
- if meta.nil?
51
- return false
52
- end
53
- return meta.atrributes['content'].value
54
- end
55
-
56
- def body
57
- return @page.body
58
- #(return @page.body unless @page.body.include?'<html>') if @successful
59
- #return String.new
60
- end
61
-
62
- def text
63
- return Nokogiri::HTML(body).xpath("//text()").text
64
- #return body.gsub(/<\/?[^>]*>/, "")
65
- end
66
-
67
- def title
68
- return @page.title unless @page.title.nil?
69
- return false
70
- end
71
-
72
-
73
- #get all links from html content
74
- #1.$all = get all <a>
75
- #2.$href = get all href from $all
76
- #3.make all $href to be absolute path and put to @links
77
- =begin
78
- def links
79
- return @links unless @links.empty?
80
- begin
81
- agent = Mechanize.new
82
- agent.open_timeout = 5
83
- agent.get @uri do |page|
84
- page.links.each do |link| #1
85
- next if link.href.nil?
86
- uri = Webpage.uri_normalize(link.href)
87
- begin
88
- @links << @uri.merge(uri).to_s
89
- rescue URI::InvalidURIError,URI::InvalidComponentError
90
- warn "ignore\n #{uri} \n #{link.href}"
91
- end
92
- end
93
- end
94
- rescue Errno::ETIMEDOUT,Timeout::Error
95
- warn "timeout:#{@uri}"
96
- rescue NoMethodError => e
97
- warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
98
- rescue Zlib::GzipFile::Error,Mechanize::Error => e
99
- warn "gzip error:#{@uri}.#{e}"
100
- rescue Net::HTTP::Persistent::Error
101
- warn "network reset:#{@uri}"
102
- rescue SocketError =>e
103
- warn "#{e}.#{@uri}"
104
- end
105
- return Array.new if @links.empty?
106
- #@links = @links.uniq - @accessed_uri
107
- #@accessed_uri += @links
108
- @links.uniq!
109
- scan_links
110
- return @links
111
- end
112
- =end
113
-
114
- def report
115
- scan_links
116
- scan_outbound_links
117
- scan_inbound_links
118
- report = {
119
- :internal_links => @internal_links,
120
- :internal_outbound_links => @internal_outbound_links,
121
- :outbound_links => @outbound_links,
122
- :broken_outbound_links => @broken_outbound_links,
123
- :external_inbound_links => @external_inbound_links,
124
- :internal_inbound_links => @internal_inbound_links,
125
- :external_outbound_links => @external_outbound_links,
126
- :related_uris => @related_uris,
127
- :invalid_links => @invalid_links
128
- }
129
- end
130
-
131
-
132
- =begin
133
- def external_outbound_links
134
- return @external_outbound_links unless @external_outbound_links.empty?
135
- links
136
- return @external_outbound_links
137
- end
138
-
139
- def internal_outbound_links
140
- return @internal_outbound_links unless @internal_outbound_links.empty?
141
- links
142
- return @internal_outbound_links
143
- end
144
-
145
- def back_links#inbound links among all the outbound links
146
- return @back_links unless @back_links.empty?
147
- scan_outbound_links
148
- return @back_links
149
- end
150
-
151
- def broken_outbound_links
152
- return @broken_outbound_links unless @broken_outbound_links.empty?
153
- scan_outbound_links
154
- return @broken_outbound_links
155
- end
156
-
157
- def external_inbound_links#outter inbound links
158
- return @external_inbound_links unless @external_inbound_links.empty?
159
- scan_inbound_links
160
- return @external_inbound_links
161
- end
162
-
163
- def internal_inbound_links
164
- return @internal_inbound_links unless @internal_inbound_links.empty?
165
- scan_inbound_links
166
- return @internal_inbound_links
167
- end
168
- =end
169
- def pagerank
170
- return @pagerank unless @pagerank.nil?
171
- require 'page_rankr'
172
- @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
173
- return @pagerank
174
- end
175
-
176
- def ppl#pagerank per link
177
- pagerank
178
- return false if @pagerank.nil?
179
- scan_links
180
- return (@pagerank / @links.size)
181
- end
182
- def scan_related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,max=100)#todo: multi-threads
183
- scan_links
184
- raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
185
- seed_uris.concat(@external_outbound_links - checked_uris)
186
- related_keywords.concat(keywords)
187
- result = Array.new
188
- while seed_uris.size > 0 and result.size < max
189
- uri = seed_uris.first
190
- checked_uris << uri unless checked_uris.include?uri
191
- seed_uris.delete(uri)
192
- w = Webpage.new uri
193
- next unless w.successful
194
- text = w.body + w.title
195
- related_keywords.each do |word|
196
- if text.include?word
197
- #result.concat self.the_related_uris(related_keywords,seed_uris,checked_uris,max)
198
- domain = Webpage.host_to_domain(URI.parse(uri).host)
199
- result << domain unless result.include? domain
200
- seed_uris.concat(w.external_outbound_links - checked_uris)
201
- break
202
- end
203
- end
204
- end
205
- return result
206
- end
207
-
208
- def link_to(target_uri)
209
- scan_links
210
- target_uri = Webpage.uri_normalize(target_uri)
211
- target_host = URI.parse(target_uri).host
212
- target_domain = Webpage.host_to_domain(target_host)
213
- type = 0 #not link to
214
- @links.each do |link|
215
- candidate_host = URI.parse(link).host
216
- if link == target_uri
217
- type = 3 #definitely link to
218
- break
219
- elsif URI.parse(link).host == target_host
220
- type = 2 if type < 2 #link to the host
221
- elsif Webpage.host_to_domain(candidate_host) == target_domain
222
- type = 1 if type < 1 #link to the root domain
223
- end
224
- end
225
- return type
226
- end
227
5
 
6
+ class WebHelper
228
7
  def self.uri_normalize(uri)
229
8
  uri = URI.parse(uri).normalize
230
9
  fragment = uri.fragment
231
- return uri.to_s.delete("##{fragment}")
10
+ uri = uri.to_s
11
+ uri.sub!(/##{fragment}$/,'') unless fragment.nil?
12
+ return uri
232
13
  #uri = uri.to_s.strip.sub(/\#.*$/,'')
233
14
  #uri.path = '/' if uri.path.nil?
234
15
  end
235
-
236
16
  def self.host_to_domain(host)
237
17
  domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
238
18
  return domain[1] unless domain.nil?
239
19
  return false
240
20
  end
241
-
242
21
  def self.uri_encode(str)
243
22
  return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
244
23
  end
24
+ end
245
25
 
246
- def scan_inbound_links
247
- scan_links
248
- @back_links.each do |inlink|
249
- inlink = URI.parse inlink
250
- if @domain == Webpage.host_to_domain(inlink.host)
251
- @internal_inbound_links << inlink.to_s
252
- else
253
- @external_inbound_links << inlink.to_s
254
- end
26
+ class Mechanize::Page
27
+ #@invalid_links = Hash.new
28
+ attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
29
+ public
30
+ def text
31
+ return Nokogiri::HTML(body).xpath("//text()").text
32
+ #return body.gsub(/<\/?[^>]*>/, "")
33
+ end
34
+ def keywords
35
+ meta = search("//meta[@name='keywords']").first
36
+ return meta.attributes["content"].value.split(',') unless meta.nil?
37
+ end
38
+
39
+ def description
40
+ meta = search("//meta[@name='description']").first
41
+ if meta.nil?
42
+ return false
255
43
  end
256
- @internal_inbound_links.uniq!
257
- @external_inbound_links.uniq!
44
+ return meta.attributes['content'].value
258
45
  end
259
46
 
260
- def scan_outbound_links
261
- @outbound_links.each do |outlink|
262
- w = Webpage.new(outlink)
263
- unless w.successful
264
- @invalid_links << outlink
265
- next
266
- end
267
- next if w.links.nil?
268
- w.links.each do |uri|
269
- #uri = URI.parse(uri)
270
- #next if uri.host.nil?
271
- if Webpage.host_to_domain(uri) == @domain
272
- @back_links << uri.to_s
273
- else
274
- @broken_outbound_links << uri.to_s
275
- end
276
- end
277
- end
278
- @back_links.uniq!
279
- @broken_outbound_links.uniq!
47
+ def pagerank
48
+ require 'page_rankr'
49
+ @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
50
+ return @pagerank
280
51
  end
281
52
 
282
53
  def scan_links
283
- return unless @links.empty?
54
+ @external_outbound_links = Array.new
55
+ @internal_outbound_links = Array.new
56
+ @valid_links = Array.new
57
+ @invalid_links = Array.new
58
+ @nofollowed_links = Array.new
284
59
  exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
285
- @page.links.each do |link|
60
+ links.each do |link|
286
61
  #初步解析
287
- begin
288
- uri = URI.parse(link.href)
289
- href = uri.to_s
290
- rescue URI::InvalidURIError => e
291
- @invalid_links << href
62
+ =begin
63
+ uri = URI.parse(link.uri).normalize
64
+ href = uri.to_s
65
+ rescue URI::InvalidURIError => e
66
+ pp link
67
+ puts e
68
+ @invalid_links << link
69
+ next
70
+ =end
71
+ #忽略非http请求
72
+ if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
73
+ @invalid_links << link#todo 不同链接key重复,无法体现
74
+ next
75
+ end
76
+ #忽略非网页文件,忽略js按钮忽略邮件
77
+ if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
78
+ @invalid_links << link
292
79
  next
293
80
  end
294
- #忽略非网页文件
295
- if exts_to_ignored.include?href[-4,4]
296
- @ignored_uris << href
81
+ #nofollow links
82
+ if link.rel.include?'nofollow'
83
+ @nofollowed_links << link
297
84
  next
298
85
  end
86
+ if link.respond_to?'fragment' and link.fragment.empty?
87
+ @invalid_links << link
88
+ next
89
+ end
90
+ pp link
299
91
  #处理相对路径
300
- if uri.relative?
301
- @relative_paths << href
302
- href = @uri.merge(href).to_s
303
- @internal_outbound_links << href
92
+ if !link.uri.nil? and link.uri.relative?
93
+ @invalid_links << link
94
+ #puts @uri.merge(link)
95
+ #link.uri = @uri.merge(link.uri)
96
+ @internal_outbound_links << link unless link.uri == @uri
97
+ elsif link.uri.nil?
98
+ warn "warning: host nil #{link.uri}"
99
+ next
304
100
  else
305
- href = Webpage.uri_normalize(href)
306
- if Webpage.host_to_domain(uri.host) == @domain
307
- @internal_outbound_links << href
101
+ if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
102
+ @internal_outbound_links << link
308
103
  else
309
- @external_outbound_links << href
104
+ @external_outbound_links << link
310
105
  end
311
106
  end
312
- @links << href
107
+ @valid_links << link
108
+ end
109
+ @outbound_links = @internal_outbound_links + @external_outbound_links
110
+ @scanned = true
111
+ end
112
+ end
113
+ class URI::Generic
114
+ def absolute?()
115
+ if @scheme or path.start_with?'/'
116
+ true
117
+ else
118
+ false
119
+ end
120
+ end
121
+ def domain
122
+ domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
123
+ return domain[1] unless domain.nil?
124
+ return nil
125
+ end
126
+ =begin
127
+ def normalize!
128
+ if path && path == ''
129
+ set_path('/')
130
+ end
131
+ if scheme && scheme != scheme.downcase
132
+ set_scheme(self.scheme.downcase)
313
133
  end
314
- @outbound_links = @internal_outbound_links.uniq! + @external_outbound_links.uniq!
134
+ if host && host != host.downcase
135
+ set_host(self.host.downcase)
136
+ end
137
+ set_fragment(nil) unless fragment.nil?
315
138
  end
139
+ =end
316
140
  end
141
+ =begin
142
+ class URI::Parser
143
+ def parse(uri)
144
+ scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
145
+
146
+ if scheme && URI.scheme_list.include?(scheme.upcase)
147
+ URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
148
+ else
149
+ URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
150
+ end
151
+ end
152
+ end
153
+ a = Mechanize.new
154
+ w = a.get('http://dict.youdao.com/w/abc/')
155
+ w.scan_links
156
+ pp w.internal_outbound_links
157
+ exit
158
+ w.links.each do |link|
159
+ puts link.rel
160
+ end
161
+ =end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,14 +11,15 @@ bindir: bin
11
11
  cert_chain: []
12
12
  date: 2012-04-11 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description: to show seo oriented reports of the webpage,newbie's work, careful
14
+ description: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
15
+ work, careful
15
16
  email: seoaqua@qq.com
16
17
  executables: []
17
18
  extensions: []
18
19
  extra_rdoc_files: []
19
20
  files:
20
21
  - webpage.rb
21
- homepage: http://seoaqua.com
22
+ homepage: http://github.com/seoaqua/ruby-webpage
22
23
  licenses: []
23
24
  post_install_message:
24
25
  rdoc_options: []
@@ -41,5 +42,6 @@ rubyforge_project:
41
42
  rubygems_version: 1.8.21
42
43
  signing_key:
43
44
  specification_version: 3
44
- summary: to show seo oriented reports of the webpage,newbie's work, careful
45
+ summary: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
46
+ work, careful
45
47
  test_files: []