webpage 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/webpage.rb +128 -0
  2. metadata +7 -7
  3. data/lib/webpage.rb +0 -0
data/webpage.rb ADDED
@@ -0,0 +1,128 @@
1
+ #coding:UTF-8
2
+ require 'pp'
3
+ require 'mechanize'
4
+ require 'uri'
5
+ class Webpage
6
+ attr_reader:links
7
+ def initialize(uri)
8
+ @uri = URI.parse(encode(uri))
9
+ @outbound_links = Array.new
10
+ @outter_inbound_links = Array.new
11
+ @inbound_links = Array.new
12
+ @internal_links = Array.new
13
+ @links = Array.new
14
+ @uri_dirname = File.dirname(@uri.path)
15
+ @uri_domain = host_to_domain @uri.host
16
+ @accessed_uri = Array.new
17
+ end
18
+ def encode(str)
19
+ return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
20
+ end
21
+ def host_to_domain(host)
22
+ return (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)[1]
23
+ end
24
+
25
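+ # Collect every link found in the page's HTML content:
26
+ # 1. iterate over each link (<a> element) on the fetched page
27
+ # 2. read its href, skipping links that have none
28
+ # 3. resolve each href against the page URI to an absolute URL and append it to @links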
29
+ def links
30
+ return @links unless @links.empty?
31
+ begin
32
+ agent = Mechanize.new
33
+ agent.open_timeout = 5
34
+ agent.get @uri do |page|
35
+ page.links.each do |link| #1
36
+ next if link.href.nil?
37
+ uri = encode(link.href.strip)
38
+ begin
39
+ @links << @uri.merge(uri).to_s
40
+ rescue URI::InvalidURIError,URI::InvalidComponentError
41
+ warn "ignore\n #{uri} \n #{link.href}"
42
+ end
43
+ end
44
+ end
45
+ rescue Errno::ETIMEDOUT,Timeout::Error
46
+ warn "timeout:#{@uri}"
47
+ rescue NoMethodError => e
48
+ warn "no method, mechanize recognize this as a file:#{@uri}.#{e}"
49
+ rescue Zlib::GzipFile::Error,Mechanize::Error => e
50
+ warn "gzip error:#{@uri}.#{e}"
51
+ rescue Net::HTTP::Persistent::Error
52
+ warn "network reset:#{@uri}"
53
+ rescue SocketError =>e
54
+ warn "#{e}.#{@uri}"
55
+ end
56
+ return Array.new if @links.empty?
57
+ @links = @links.uniq - @accessed_uri
58
+ @accessed_uri += @links
59
+ @links.each do |a|
60
+ uri = URI.parse(encode(a))
61
+ next if uri.host.nil?
62
+ if uri.host.end_with?@uri_domain
63
+ @internal_links << a
64
+ else
65
+ @outbound_links << a
66
+ end
67
+ end
68
+ return @links
69
+ end
70
+
71
+ def internal_links
72
+ return @internal_links if links
73
+ return false
74
+ end
75
+
76
+ def outbound_links
77
+ return @outbound_links if links
78
+ return false
79
+ end
80
+
81
+ def inbound_links
82
+ return @inbound_links unless @inbound_links.empty?
83
+ outbound_links.each do |outlink|
84
+ begin
85
+ w = Webpage.new(outlink)
86
+ rescue URI::InvalidURIError
87
+ warn "bad uri:#{outlink}"
88
+ next
89
+ end
90
+ w.links.each do |uri|
91
+ next unless uri.start_with?'http'
92
+ begin
93
+ uri = URI.parse(encode(uri))
94
+ next if uri.host.nil?
95
+ @inbound_links << uri.to_s if uri.host.end_with?@uri_domain
96
+ rescue URI::InvalidURIError
97
+ warn "bad uri:#{uri}"
98
+ end
99
+ end
100
+ end
101
+ return @inbound_links.uniq
102
+ end
103
+
104
+ def outter_inbound_links
105
+ return @outter_inbound_links unless @outter_inbound_links.empty?
106
+ inbound_links.each do |inlink|
107
+ inlink = URI.parse inlink
108
+ @outter_inbound_links << inlink.to_s unless @uri_domain == host_to_domain(inlink.host)
109
+ end
110
+ return @outter_inbound_links
111
+ end
112
+
113
+ def friend_links#inbound && outbound
114
+ end
115
+
116
+ def pagerank
117
+ return @pagerank unless @pagerank
118
+ require 'PageRankr'
119
+ @pagerank = PageRankr.ranks(@uri.to_s, :google)
120
+ return @pagerank
121
+ end
122
+
123
+ def ppl#pagerank_per_link
124
+ return (@pagerank / links.count)
125
+ end
126
+ end
127
+ w = Webpage.new('http://auto.163.com')
128
+ puts w.outter_inbound_links
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,16 +9,16 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-11 00:00:00.000000000 Z
12
+ date: 2012-04-11 00:00:00.000000000 Z
13
13
  dependencies: []
14
- description: to show a report of the webpage
14
+ description: to show seo oriented reports of the webpage,newbie's work, careful
15
15
  email: seoaqua@qq.com
16
16
  executables: []
17
17
  extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
- - lib/webpage.rb
21
- homepage: http://seoaqua.com/
20
+ - webpage.rb
21
+ homepage: http://seoaqua.com
22
22
  licenses: []
23
23
  post_install_message:
24
24
  rdoc_options: []
@@ -38,8 +38,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
38
  version: '0'
39
39
  requirements: []
40
40
  rubyforge_project:
41
- rubygems_version: 1.8.11
41
+ rubygems_version: 1.8.21
42
42
  signing_key:
43
43
  specification_version: 3
44
- summary: to show a report of the webpage
44
+ summary: to show seo oriented reports of the webpage,newbie's work, careful
45
45
  test_files: []
data/lib/webpage.rb DELETED
File without changes