webpage 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/webpage.rb +128 -0
- metadata +7 -7
- data/lib/webpage.rb +0 -0
data/webpage.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
#coding:UTF-8
|
2
|
+
require 'pp'
|
3
|
+
require 'mechanize'
|
4
|
+
require 'uri'
|
5
|
+
# Fetches a web page and classifies the links found on it relative to the
# page's own domain (internal, outbound, inbound).
#
# Network access goes through Mechanize and is performed lazily, the first
# time one of the link accessors is called.
class Webpage
  # Characters that must be percent-escaped: everything except the RFC 2396
  # "unreserved" set plus the URI delimiters `# : / ? % & =`.
  UNSAFE_CHARS = %r{[^\-_.!~*'()a-zA-Z\d#:/?%&=]}

  # Last two dot-separated labels of a host name.
  DOMAIN_TAIL = /([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)\z/

  # @param uri [String] absolute URI of the page to analyse
  # @raise [URI::InvalidURIError] if +uri+ cannot be parsed or has no host
  def initialize(uri)
    @uri = URI.parse(encode(uri))
    # Without a host there is no domain to classify links against.
    raise URI::InvalidURIError, "no host in uri: #{uri}" if @uri.host.nil?

    @outbound_links = []
    @outter_inbound_links = []
    @inbound_links = []
    @internal_links = []
    @links = []
    @uri_dirname = File.dirname(@uri.path)
    @uri_domain = host_to_domain(@uri.host)
    @accessed_uri = []
    @links_fetched = false
    @inbound_links_resolved = false
  end

  # Percent-escapes every character of +str+ matched by UNSAFE_CHARS, byte by
  # byte. `%` itself is left alone, so already-escaped input is unchanged.
  # (Stand-in for URI.encode, which no longer exists in Ruby 3.0+.)
  #
  # @param str [String]
  # @return [String] the escaped string, tagged as US-ASCII
  def encode(str)
    escaped = str.gsub(UNSAFE_CHARS) do |char|
      char.each_byte.map { |byte| format('%%%02X', byte) }.join
    end
    escaped.force_encoding(Encoding::US_ASCII)
  end

  # Reduces a host name to its last two labels (`auto.163.com` -> `163.com`).
  # A host with fewer than two labels (e.g. `localhost`) is returned as is.
  #
  # @param host [String, nil]
  # @return [String, nil] nil only when +host+ is nil
  def host_to_domain(host)
    return nil if host.nil?

    host[DOMAIN_TAIL, 1] || host
  end

  # All distinct links on the page, as absolute URI strings.
  #
  # The page is fetched on the first call; as a side effect the links are
  # sorted into the internal and outbound lists. A successful fetch is
  # remembered even when the page has no links; a failed fetch is not, so a
  # later call tries again.
  #
  # @return [Array<String>] empty when the page could not be fetched
  def links
    return @links if @links_fetched

    hrefs = fetch_hrefs
    return [] if hrefs.nil?

    @links = absolutize(hrefs).uniq
    @accessed_uri.concat(@links)
    classify_links
    @links_fetched = true
    @links
  end

  # @return [Array<String>] links whose host ends with this page's domain
  def internal_links
    links
    @internal_links
  end

  # @return [Array<String>] links whose host does not end with this page's domain
  def outbound_links
    links
    @outbound_links
  end

  # Fetches every outbound link and collects the links found there whose host
  # ends with this page's domain, i.e. links pointing back at this domain.
  # One HTTP request is made per outbound link.
  #
  # @return [Array<String>] distinct absolute URI strings
  def inbound_links
    return @inbound_links if @inbound_links_resolved

    outbound_links.each do |outlink|
      begin
        remote = Webpage.new(outlink)
      rescue URI::InvalidURIError
        warn "bad uri:#{outlink}"
        next
      end
      collect_inbound_from(remote)
    end
    @inbound_links.uniq!
    # Only remember the result when this page itself was fetched successfully.
    @inbound_links_resolved = @links_fetched
    @inbound_links
  end

  # Inbound links whose two-label domain differs from this page's domain.
  #
  # NOTE(review): inbound links are selected with a plain suffix match, so
  # the only entries that can survive this filter are look-alike hosts such
  # as `evil163.com` for `163.com` -- confirm this is the intended meaning.
  #
  # @return [Array<String>]
  def outter_inbound_links
    return @outter_inbound_links unless @outter_inbound_links.empty?

    inbound_links.each do |inlink|
      parsed = URI.parse(inlink)
      @outter_inbound_links << parsed.to_s unless @uri_domain == host_to_domain(parsed.host)
    end
    @outter_inbound_links
  end

  # Not implemented yet; returns nil. The original note describes the intent
  # as "inbound && outbound".
  def friend_links
  end

  # Rank of this page as reported by the PageRankr gem; computed once.
  #
  # NOTE(review): the gem's documented require path appears to be
  # 'page_rankr' -- confirm before relying on this call.
  #
  # @return [Object] whatever PageRankr.ranks returns
  def pagerank
    return @pagerank if @pagerank

    require 'PageRankr'
    @pagerank = PageRankr.ranks(@uri.to_s, :google)
  end

  # Pagerank per link: the page's rank divided by the number of links on it.
  #
  # @return [Float, nil] nil when no rank is available or the page has no links
  def ppl
    rank = pagerank
    # presumably PageRankr.ranks returns a hash keyed by tracker
    # (e.g. {:google => 5}); a bare number is accepted too -- TODO confirm
    rank = rank[:google] if rank.respond_to?(:key?)
    count = links.count
    return nil if rank.nil? || count.zero?

    rank.fdiv(count)
  end

  private

  # Downloads the page and returns the raw href of every link on it, or nil
  # when the download failed (each failure is reported via warn).
  def fetch_hrefs
    agent = Mechanize.new
    agent.open_timeout = 5
    agent.get(@uri).links.map(&:href).compact
  rescue Errno::ETIMEDOUT, Timeout::Error
    warn "timeout:#{@uri}"
    nil
  rescue NoMethodError => e
    # Raised when the fetched object has no #links (not an HTML page).
    warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
    nil
  rescue Zlib::GzipFile::Error, Mechanize::Error => e
    warn "gzip error:#{@uri}.#{e}"
    nil
  rescue Net::HTTP::Persistent::Error
    warn "network reset:#{@uri}"
    nil
  rescue SocketError => e
    warn "#{e}.#{@uri}"
    nil
  end

  # Resolves each href against the page URI; hrefs that cannot be parsed are
  # reported via warn and skipped.
  def absolutize(hrefs)
    hrefs.each_with_object([]) do |href, absolute|
      escaped = encode(href.strip)
      begin
        absolute << @uri.merge(escaped).to_s
      rescue URI::InvalidURIError, URI::InvalidComponentError
        warn "ignore\n #{escaped} \n #{href}"
      end
    end
  end

  # Sorts @links into @internal_links / @outbound_links; links without a
  # host are left out of both lists.
  def classify_links
    @links.each do |link|
      host = URI.parse(encode(link)).host
      next if host.nil?

      (own_domain?(host) ? @internal_links : @outbound_links) << link
    end
  end

  # Appends to @inbound_links every http(s) link on +remote+ whose host ends
  # with this page's domain.
  def collect_inbound_from(remote)
    remote.links.each do |link|
      next unless link.start_with?('http')

      begin
        parsed = URI.parse(encode(link))
        next if parsed.host.nil?

        @inbound_links << parsed.to_s if own_domain?(parsed.host)
      rescue URI::InvalidURIError
        warn "bad uri:#{link}"
      end
    end
  end

  # NOTE(review): plain suffix match, no dot boundary -- `evil163.com` also
  # counts as `163.com`. Left as is because outter_inbound_links depends on it.
  def own_domain?(host)
    host.end_with?(@uri_domain)
  end
end
|
127
|
+
# Demo: print the "outter" inbound links of a sample page. Guarded so the
# crawl (one HTTP request per outbound link) only happens when this file is
# executed directly, not every time it is required as a library.
if __FILE__ == $0
  page = Webpage.new('http://auto.163.com')
  puts page.outter_inbound_links
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,16 +9,16 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-04-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
|
-
description: to show
|
14
|
+
description: to show seo oriented reports of the webpage,newbie's work, careful
|
15
15
|
email: seoaqua@qq.com
|
16
16
|
executables: []
|
17
17
|
extensions: []
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
|
-
-
|
21
|
-
homepage: http://seoaqua.com
|
20
|
+
- webpage.rb
|
21
|
+
homepage: http://seoaqua.com
|
22
22
|
licenses: []
|
23
23
|
post_install_message:
|
24
24
|
rdoc_options: []
|
@@ -38,8 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
38
38
|
version: '0'
|
39
39
|
requirements: []
|
40
40
|
rubyforge_project:
|
41
|
-
rubygems_version: 1.8.
|
41
|
+
rubygems_version: 1.8.21
|
42
42
|
signing_key:
|
43
43
|
specification_version: 3
|
44
|
-
summary: to show
|
44
|
+
summary: to show seo oriented reports of the webpage,newbie's work, careful
|
45
45
|
test_files: []
|
data/lib/webpage.rb
DELETED
File without changes
|