webpage 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/webpage.rb +128 -0
- metadata +7 -7
- data/lib/webpage.rb +0 -0
data/webpage.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
#coding:UTF-8
|
2
|
+
require 'pp'
|
3
|
+
require 'mechanize'
|
4
|
+
require 'uri'
|
5
|
+
# Represents one web page and classifies the links found on it.
#
# The page is downloaded lazily with Mechanize the first time #links (or any
# method built on it) is called. Links are then split into "internal" (host
# belongs to the page's own domain) and "outbound" (any other host).
class Webpage
  # Characters that #encode percent-escapes: everything except the RFC 2396
  # "unreserved" set (- _ . ! ~ * ' ( ) letters digits) and the URI
  # delimiters # : / ? % & =
  UNSAFE_CHARS = /[^\-_.!~*'()a-zA-Z\d#:\/?%&=]/.freeze

  # @param uri [String] absolute URL of the page, e.g. "http://auto.163.com"
  # @raise [URI::InvalidURIError] if the string cannot be parsed or has no host
  def initialize(uri)
    @uri = URI.parse(encode(uri))
    # Without a host there is no domain to compare links against; fail with
    # the same error class #inbound_links already rescues.
    raise URI::InvalidURIError, "no host in uri: #{uri}" if @uri.host.nil?

    @outbound_links = []
    @outter_inbound_links = []
    @inbound_links = []
    @internal_links = []
    @links = []
    @links_fetched = false
    @uri_dirname = File.dirname(@uri.path)
    @uri_domain = host_to_domain(@uri.host)
    @accessed_uri = []
  end

  # Percent-escapes every character matched by UNSAFE_CHARS, byte by byte.
  #
  # Implemented by hand because URI.encode was removed in Ruby 3.0.
  #
  # @param str [String]
  # @return [String] a new string; characters already allowed are untouched
  def encode(str)
    str.gsub(UNSAFE_CHARS) do |char|
      char.each_byte.map { |byte| format('%%%02X', byte) }.join
    end
  end

  # Reduces a host name to its last two dot-separated labels
  # ("auto.163.com" => "163.com").
  #
  # NOTE(review): this is a plain "last two labels" rule, so multi-part
  # suffixes such as "co.uk" and IP addresses are not handled specially.
  #
  # @param host [String, nil]
  # @return [String, nil] the two-label domain; the host itself when it has
  #   fewer than two labels (e.g. "localhost"); nil when host is nil
  def host_to_domain(host)
    return nil if host.nil?

    found = host.match(/\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)\z/)
    found ? found[1] : host
  end

  # Returns every link on the page as an absolute URL string.
  #
  # 1. collect all <a> elements
  # 2. take the href of each one
  # 3. resolve each href against the page URL and store it in @links
  #
  # The result is cached after a successful download (even when the page has
  # no links). A failed download is not cached, so the next call tries again.
  #
  # @return [Array<String>] unique absolute URLs; empty when the download
  #   failed or the page has no links
  def links
    return @links if @links_fetched || !@links.empty?

    @links_fetched = fetch_links
    return @links if @links.empty?

    @links = @links.uniq - @accessed_uri
    @accessed_uri += @links
    classify_links
    @links
  end

  # @return [Array<String>] links whose host belongs to this page's domain
  def internal_links
    links # make sure the page has been fetched and classified
    @internal_links
  end

  # @return [Array<String>] links whose host is outside this page's domain
  def outbound_links
    links # make sure the page has been fetched and classified
    @outbound_links
  end

  # Visits every outbound link and collects the http(s) links found there
  # that point back to this page's domain.
  #
  # This downloads one page per outbound link, so it can be slow.
  #
  # @return [Array<String>] unique URLs on this page's domain that are linked
  #   from the pages this page links out to
  def inbound_links
    return @inbound_links unless @inbound_links.empty?

    outbound_links.each do |outlink|
      begin
        remote = Webpage.new(outlink)
      rescue URI::InvalidURIError
        warn "bad uri:#{outlink}"
        next
      end
      remote.links.each do |link|
        next unless link.start_with?('http')

        begin
          target = URI.parse(encode(link))
          next if target.host.nil?

          @inbound_links << target.to_s if same_domain?(target.host)
        rescue URI::InvalidURIError
          warn "bad uri:#{link}"
        end
      end
    end
    # De-duplicate the cached array itself so later calls return the same
    # list as the first one.
    @inbound_links.uniq!
    @inbound_links
  end

  # Selects the inbound links whose two-label domain differs from this
  # page's domain.
  #
  # NOTE(review): #inbound_links only keeps URLs on this page's domain, so
  # this filter looks like it can never select anything - confirm the
  # intended meaning of "outter inbound" with the author.
  #
  # @return [Array<String>]
  def outter_inbound_links
    return @outter_inbound_links unless @outter_inbound_links.empty?

    inbound_links.each do |inlink|
      parsed = URI.parse(inlink)
      next if @uri_domain == host_to_domain(parsed.host)

      @outter_inbound_links << parsed.to_s
    end
    @outter_inbound_links
  end

  # TODO: not implemented yet; the original note reads "inbound && outbound".
  # @return [nil]
  def friend_links
  end

  # Looks up the Google PageRank of this page through the PageRankr gem and
  # caches the answer.
  #
  # @return [Object] whatever PageRankr.ranks returns for the :google tracker
  # @raise [LoadError] if the PageRankr gem is not installed
  def pagerank
    return @pagerank if @pagerank

    # The original required 'PageRankr'. NOTE(review): the gem's documented
    # require path appears to be 'page_rankr' - try it first, then fall back.
    begin
      require 'page_rankr'
    rescue LoadError
      require 'PageRankr'
    end
    @pagerank = PageRankr.ranks(@uri.to_s, :google)
  end

  # PageRank per link: the page's rank divided by the number of links on it.
  #
  # NOTE(review): PageRankr.ranks presumably returns a Hash keyed by tracker
  # name; the :google entry is used in that case - confirm against the gem.
  # Integer operands use integer division, as in the original.
  #
  # @return [Numeric, nil] nil when the rank is unknown or the page has no links
  def ppl
    rank = pagerank
    rank = rank[:google] if rank.respond_to?(:key?)
    link_count = links.count
    return nil if rank.nil? || link_count.zero?

    rank / link_count
  end

  private

  # Downloads the page and appends every resolvable href to @links.
  # Network and parsing problems are reported with warn and swallowed.
  #
  # @return [Boolean] true when the page was downloaded and scanned
  def fetch_links
    agent = Mechanize.new
    agent.open_timeout = 5
    agent.get(@uri) do |page|
      page.links.each do |link|
        next if link.href.nil?

        href = encode(link.href.strip)
        begin
          @links << @uri.merge(href).to_s
        rescue URI::InvalidURIError, URI::InvalidComponentError
          warn "ignore\n #{href} \n #{link.href}"
        end
      end
    end
    true
  rescue Errno::ETIMEDOUT, Timeout::Error
    warn "timeout:#{@uri}"
    false
  rescue NoMethodError => e
    # Kept from the original: raised when the fetched object has no #links.
    warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
    false
  rescue Zlib::GzipFile::Error, Mechanize::Error => e
    warn "gzip error:#{@uri}.#{e}"
    false
  rescue Net::HTTP::Persistent::Error
    warn "network reset:#{@uri}"
    false
  rescue SocketError => e
    warn "#{e}.#{@uri}"
    false
  end

  # Sorts every entry of @links into @internal_links or @outbound_links.
  # Links without a host are put in neither list.
  def classify_links
    @links.each do |link|
      host = URI.parse(encode(link)).host
      next if host.nil?

      bucket = same_domain?(host) ? @internal_links : @outbound_links
      bucket << link
    end
  end

  # True when host is this page's domain or one of its sub-domains.
  # A bare end_with? check would also accept "notexample.com" for
  # "example.com", so the match must start at a label boundary.
  def same_domain?(host)
    candidate = host.downcase
    domain = @uri_domain.downcase
    candidate == domain || candidate.end_with?(".#{domain}")
  end
end
|
127
|
+
# Demo crawl. Runs only when this file is executed directly, so that merely
# requiring the library does not start downloading pages.
if __FILE__ == $PROGRAM_NAME
  w = Webpage.new('http://auto.163.com')
  puts w.outter_inbound_links
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,16 +9,16 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-04-11 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
|
-
description: to show
|
14
|
+
description: to show seo oriented reports of the webpage,newbie's work, careful
|
15
15
|
email: seoaqua@qq.com
|
16
16
|
executables: []
|
17
17
|
extensions: []
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
|
-
-
|
21
|
-
homepage: http://seoaqua.com
|
20
|
+
- webpage.rb
|
21
|
+
homepage: http://seoaqua.com
|
22
22
|
licenses: []
|
23
23
|
post_install_message:
|
24
24
|
rdoc_options: []
|
@@ -38,8 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
38
38
|
version: '0'
|
39
39
|
requirements: []
|
40
40
|
rubyforge_project:
|
41
|
-
rubygems_version: 1.8.
|
41
|
+
rubygems_version: 1.8.21
|
42
42
|
signing_key:
|
43
43
|
specification_version: 3
|
44
|
-
summary: to show
|
44
|
+
summary: to show seo oriented reports of the webpage,newbie's work, careful
|
45
45
|
test_files: []
|
data/lib/webpage.rb
DELETED
File without changes
|