webpage 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/webpage.rb +128 -0
  2. metadata +7 -7
  3. data/lib/webpage.rb +0 -0
data/webpage.rb ADDED
@@ -0,0 +1,128 @@
1
+ #coding:UTF-8
2
+ require 'pp'
3
+ require 'mechanize'
4
+ require 'uri'
5
+ class Webpage
6
+ attr_reader:links
7
+ def initialize(uri)
8
+ @uri = URI.parse(encode(uri))
9
+ @outbound_links = Array.new
10
+ @outter_inbound_links = Array.new
11
+ @inbound_links = Array.new
12
+ @internal_links = Array.new
13
+ @links = Array.new
14
+ @uri_dirname = File.dirname(@uri.path)
15
+ @uri_domain = host_to_domain @uri.host
16
+ @accessed_uri = Array.new
17
+ end
18
+ def encode(str)
19
+ return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
20
+ end
21
+ def host_to_domain(host)
22
+ return (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)[1]
23
+ end
24
+
25
+ # Collect all links from the page's HTML content:
26
+ # 1. take every <a> element (link) on the page
27
+ # 2. read the href of each link
28
+ # 3. resolve each href to an absolute URL and append it to @links
29
+ def links
30
+ return @links unless @links.empty?
31
+ begin
32
+ agent = Mechanize.new
33
+ agent.open_timeout = 5
34
+ agent.get @uri do |page|
35
+ page.links.each do |link| #1
36
+ next if link.href.nil?
37
+ uri = encode(link.href.strip)
38
+ begin
39
+ @links << @uri.merge(uri).to_s
40
+ rescue URI::InvalidURIError,URI::InvalidComponentError
41
+ warn "ignore\n #{uri} \n #{link.href}"
42
+ end
43
+ end
44
+ end
45
+ rescue Errno::ETIMEDOUT,Timeout::Error
46
+ warn "timeout:#{@uri}"
47
+ rescue NoMethodError => e
48
+ warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
49
+ rescue Zlib::GzipFile::Error,Mechanize::Error => e
50
+ warn "gzip error:#{@uri}.#{e}"
51
+ rescue Net::HTTP::Persistent::Error
52
+ warn "network reset:#{@uri}"
53
+ rescue SocketError =>e
54
+ warn "#{e}.#{@uri}"
55
+ end
56
+ return Array.new if @links.empty?
57
+ @links = @links.uniq - @accessed_uri
58
+ @accessed_uri += @links
59
+ @links.each do |a|
60
+ uri = URI.parse(encode(a))
61
+ next if uri.host.nil?
62
+ if uri.host.end_with?@uri_domain
63
+ @internal_links << a
64
+ else
65
+ @outbound_links << a
66
+ end
67
+ end
68
+ return @links
69
+ end
70
+
71
+ def internal_links
72
+ return @internal_links if links
73
+ return false
74
+ end
75
+
76
+ def outbound_links
77
+ return @outbound_links if links
78
+ return false
79
+ end
80
+
81
+ def inbound_links
82
+ return @inbound_links unless @inbound_links.empty?
83
+ outbound_links.each do |outlink|
84
+ begin
85
+ w = Webpage.new(outlink)
86
+ rescue URI::InvalidURIError
87
+ warn "bad uri:#{outlink}"
88
+ next
89
+ end
90
+ w.links.each do |uri|
91
+ next unless uri.start_with?'http'
92
+ begin
93
+ uri = URI.parse(encode(uri))
94
+ next if uri.host.nil?
95
+ @inbound_links << uri.to_s if uri.host.end_with?@uri_domain
96
+ rescue URI::InvalidURIError
97
+ warn "bad uri:#{uri}"
98
+ end
99
+ end
100
+ end
101
+ return @inbound_links.uniq
102
+ end
103
+
104
+ def outter_inbound_links
105
+ return @outter_inbound_links unless @outter_inbound_links.empty?
106
+ inbound_links.each do |inlink|
107
+ inlink = URI.parse inlink
108
+ @outter_inbound_links << inlink.to_s unless @uri_domain == host_to_domain(inlink.host)
109
+ end
110
+ return @outter_inbound_links
111
+ end
112
+
113
+ def friend_links#inbound && outbound
114
+ end
115
+
116
+ def pagerank
117
+ return @pagerank unless @pagerank
118
+ require 'PageRankr'
119
+ @pagerank = PageRankr.ranks(@uri.to_s, :google)
120
+ return @pagerank
121
+ end
122
+
123
+ def ppl#pagerank_per_link
124
+ return (@pagerank / links.count)
125
+ end
126
+ end
127
+ w = Webpage.new('http://auto.163.com')
128
+ puts w.outter_inbound_links
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,16 +9,16 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-11 00:00:00.000000000 Z
12
+ date: 2012-04-11 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description: to show a report of the webpage
14
+ description: to show seo oriented reports of the webpage,newbie's work, careful
15
15
  email: seoaqua@qq.com
16
16
  executables: []
17
17
  extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
- - lib/webpage.rb
21
- homepage: http://seoaqua.com/
20
+ - webpage.rb
21
+ homepage: http://seoaqua.com
22
22
  licenses: []
23
23
  post_install_message:
24
24
  rdoc_options: []
@@ -38,8 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
38
  version: '0'
39
39
  requirements: []
40
40
  rubyforge_project:
41
- rubygems_version: 1.8.11
41
+ rubygems_version: 1.8.21
42
42
  signing_key:
43
43
  specification_version: 3
44
- summary: to show a report of the webpage
44
+ summary: to show seo oriented reports of the webpage,newbie's work, careful
45
45
  test_files: []
data/lib/webpage.rb DELETED
File without changes