webpage 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/webpage.rb +206 -49
  2. metadata +1 -1
data/webpage.rb CHANGED
@@ -3,25 +3,69 @@ require 'pp'
3
3
  require 'mechanize'
4
4
  require 'uri'
5
5
  class Webpage
6
- attr_reader:links
6
+ attr_reader:links,:successful,:related_uris
7
7
  def initialize(uri)
8
- @uri = URI.parse(encode(uri))
8
+ @uri = URI.parse(uri_encode(uri))
9
9
  @outbound_links = Array.new
10
- @outter_inbound_links = Array.new
11
- @inbound_links = Array.new
10
+ @internal_outbound_links = Array.new
11
+ @external_outbound_links = Array.new
12
+ @broken_outbound_links = Array.new
13
+ @external_inbound_links = Array.new
14
+ @back_links = Array.new
15
+ @internal_inbound_links = Array.new
16
+ @external_inbound_links = Array.new
12
17
  @internal_links = Array.new
13
18
  @links = Array.new
14
19
  @uri_dirname = File.dirname(@uri.path)
15
20
  @uri_domain = host_to_domain @uri.host
16
21
  @accessed_uri = Array.new
22
+ @page = ''
23
+ @related_uris = Array.new
24
+ @successful = false
25
+ begin
26
+ agent = Mechanize.new
27
+ agent.open_timeout = 5
28
+ @page = agent.get @uri.to_s
29
+ @page.body = @page.body.force_encoding(@page.encoding).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
30
+ @page.links.each do |link| #1
31
+ next if link.href.nil?
32
+ uri = uri_encode(link.href.strip)
33
+ begin
34
+ @links << @uri.merge(uri).to_s
35
+ rescue URI::InvalidURIError,URI::InvalidComponentError
36
+ warn "ignore\nparsed: #{uri} \noriginal: #{link.href}"
37
+ end
38
+ end
39
+ @successful = true
40
+ rescue Exception => e
41
+ warn "#{e}:#{@uri}"
42
+ end
43
+ @links.uniq!
44
+ scan_links
17
45
  end
18
- def encode(str)
19
- return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
46
+
47
+ def encoding
48
+ return @page.encoding
20
49
  end
21
- def host_to_domain(host)
22
- return (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)[1]
50
+ def keywords
51
+ return @page.search("//meta[@name='keywords']").first.attributes["content"].value.split(',')
52
+ end
53
+
54
+ def body
55
+ return @page.body unless @page.body.include?'<html>'
56
+ return String.new
57
+ end
58
+
59
+ def text
60
+ return body.gsub(/<\/?[^>]*>/, "")
61
+ end
62
+
63
+ def title
64
+ return @page.title unless @page.title.nil?
65
+ return String.new
23
66
  end
24
67
 
68
+
25
69
  #get all links from html content
26
70
  #1.$all = get all <a>
27
71
  #2.$href = get all href from $all
@@ -34,7 +78,7 @@ class Webpage
34
78
  agent.get @uri do |page|
35
79
  page.links.each do |link| #1
36
80
  next if link.href.nil?
37
- uri = encode(link.href.strip)
81
+ uri = uri_encode(link.href.strip)
38
82
  begin
39
83
  @links << @uri.merge(uri).to_s
40
84
  rescue URI::InvalidURIError,URI::InvalidComponentError
@@ -54,32 +98,117 @@ class Webpage
54
98
  warn "#{e}.#{@uri}"
55
99
  end
56
100
  return Array.new if @links.empty?
57
- @links = @links.uniq - @accessed_uri
58
- @accessed_uri += @links
59
- @links.each do |a|
60
- uri = URI.parse(encode(a))
61
- next if uri.host.nil?
62
- if uri.host.end_with?@uri_domain
63
- @internal_links << a
64
- else
65
- @outbound_links << a
66
- end
67
- end
101
+ #@links = @links.uniq - @accessed_uri
102
+ #@accessed_uri += @links
103
+ @links.uniq!
104
+ puts @links
105
+ scan_links
68
106
  return @links
69
107
  end
70
-
108
+
71
109
  def internal_links
72
- return @internal_links if links
73
- return false
110
+ return @internal_links unless @internal_links.empty?
111
+ scan_links
112
+ return @internal_links
113
+ end
114
+ def external_outbound_links
115
+ return @external_outbound_links unless @external_outbound_links.empty?
116
+ links
117
+ return @external_outbound_links
118
+ end
119
+ def internal_outbound_links
120
+ return @internal_outbound_links unless @internal_outbound_links.empty?
121
+ links
122
+ return @internal_outbound_links
74
123
  end
75
-
76
124
  def outbound_links
77
- return @outbound_links if links
78
- return false
125
+ return external_outbound_links + internal_outbound_links
126
+ end
127
+ def back_links#inbound links among all the outbound links
128
+ return @back_links unless @back_links.empty?
129
+ scan_outbound_links
130
+ return @back_links
131
+ end
132
+
133
+ def broken_outbound_links
134
+ return @broken_outbound_links unless @broken_outbound_links.empty?
135
+ scan_outbound_links
136
+ return @broken_outbound_links
137
+ end
138
+
139
+ def external_inbound_links#outter inbound links
140
+ return @external_inbound_links unless @external_inbound_links.empty?
141
+ scan_inbound_links
142
+ return @external_inbound_links
143
+ end
144
+
145
+ def internal_inbound_links
146
+ return @internal_inbound_links unless @internal_inbound_links.empty?
147
+ scan_inbound_links
148
+ return @internal_inbound_links
79
149
  end
80
150
 
81
- def inbound_links
82
- return @inbound_links unless @inbound_links.empty?
151
+ def pagerank
152
+ return @pagerank unless @pagerank
153
+ require 'PageRankr'
154
+ @pagerank = PageRankr.ranks(@uri.to_s, :google)
155
+ return @pagerank
156
+ end
157
+
158
+ def ppl#pagerank per link
159
+ return (@pagerank / links.count)
160
+ end
161
+
162
+ def related_uris(related_keywords=Array.new,seed_uris=Array.new,checked_uris=Array.new,related=Array.new)
163
+ raise "related_keywords is not array,but a #{related_keywords.class}" unless related_keywords.class == Array and seed_uris.class == Array and checked_uris.class == Array
164
+ related_keywords.concat(keywords).uniq!
165
+ seed_uris.concat(external_outbound_links).uniq!
166
+ while seed_uris.size>0
167
+ uri = seed_uris.first
168
+ checked_uris << uri unless checked_uris.include?uri
169
+ seed_uris.delete(uri)
170
+ w = Webpage.new uri
171
+ next unless w.successful
172
+ text = w.body + w.title
173
+ related_keywords.each do |word|
174
+ if text.include?word
175
+ related << uri
176
+ seed_uris.concat(w.external_outbound_links).uniq!
177
+ break
178
+ end
179
+ end
180
+ end
181
+ return related
182
+ end
183
+
184
+ private
185
+ def scan_links
186
+ @links.each do |a|
187
+ begin
188
+ uri = URI.parse(uri_encode(a))
189
+ rescue URI::InvalidURIError =>e
190
+ puts "#{e}:#{uri}"
191
+ next
192
+ end
193
+ next if uri.host.nil?
194
+ if uri.host.end_with?@uri_domain
195
+ @internal_links << a
196
+ elsif uri.scheme.start_with?'http'
197
+ if host_to_domain(uri.host) == @uri_domain
198
+ @internal_outbound_links << uri.to_s
199
+ else
200
+ @external_outbound_links << uri.to_s
201
+ end
202
+ #@outbound_links << a
203
+ end
204
+ end
205
+ @back_links.uniq!
206
+ #@outbound_links.uniq!
207
+ @internal_outbound_links.uniq!
208
+ @external_outbound_links.uniq!
209
+ end
210
+
211
+ def scan_outbound_links
83
212
  outbound_links.each do |outlink|
84
213
  begin
85
214
  w = Webpage.new(outlink)
@@ -87,42 +216,70 @@ class Webpage
87
216
  warn "bad uri:#{outlink}"
88
217
  next
89
218
  end
219
+ next if w.links.nil?
90
220
  w.links.each do |uri|
91
221
  next unless uri.start_with?'http'
92
222
  begin
93
- uri = URI.parse(encode(uri))
223
+ uri = URI.parse(uri_encode(uri))
94
224
  next if uri.host.nil?
95
- @inbound_links << uri.to_s if uri.host.end_with?@uri_domain
225
+ if uri.host.end_with?@uri_domain
226
+ @back_links << uri.to_s
227
+ else
228
+ @broken_outbound_links << uri.to_s
229
+ end
96
230
  rescue URI::InvalidURIError
97
231
  warn "bad uri:#{uri}"
98
232
  end
99
233
  end
100
234
  end
101
- return @inbound_links.uniq
235
+ @back_links.uniq!
236
+ @broken_outbound_links.uniq!
102
237
  end
103
238
 
104
- def outter_inbound_links
105
- return @outter_inbound_links unless @outter_inbound_links.empty?
106
- inbound_links.each do |inlink|
239
+ def scan_inbound_links
240
+ back_links.each do |inlink|
107
241
  inlink = URI.parse inlink
108
- @outter_inbound_links << inlink.to_s unless @uri_domain == host_to_domain(inlink.host)
242
+ if @uri_domain == host_to_domain(inlink.host)
243
+ @internal_inbound_links << inlink.to_s
244
+ else
245
+ @external_inbound_links << inlink.to_s
246
+ end
109
247
  end
110
- return @outter_inbound_links
248
+ @internal_inbound_links.uniq!
249
+ @external_inbound_links.uniq!
111
250
  end
112
-
113
- def friend_links#inbound && outbound
251
+
252
+ def uri_encode(str)
253
+ return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
114
254
  end
115
-
116
- def pagerank
117
- return @pagerank unless @pagerank
118
- require 'PageRankr'
119
- @pagerank = PageRankr.ranks(@uri.to_s, :google)
120
- return @pagerank
255
+ def host_to_domain(host)
256
+ domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
257
+ return domain[1] unless domain.nil?
258
+ return false
121
259
  end
260
+ end
122
261
 
123
- def ppl#pagerank_per_link
124
- return (@pagerank / links.count)
125
- end
262
+ w = Webpage.new('http://cidian.youdao.com')
263
+ #puts w.external_outbound_links
264
+ related_keywords = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
265
+ puts w.related_uris(related_keywords)
266
+ exit
267
+ require 'yaml'
268
+ filename = './cidian.yaml'
269
+ if File.exists?(filename)
270
+ cached_cidian = YAML.load(File.read(filename))
271
+ else
272
+ cached_cidian = Hash.new
273
+ cached_cidian[:seed_uris] = Array.new
274
+ cached_cidian[:checked_uris] = Array.new
275
+ cached_cidian[:related_keywords] = %w(词典 辞典 辞海 译 英语 法语 日语 韩语 语言)
276
+ cached_cidian[:related_uris] = Array.new
277
+ end
278
+ at_exit do
279
+ File.open(filename,'w'){|f|f.puts(cached_cidian.to_yaml)}
126
280
  end
127
- w = Webpage.new('http://auto.163.com')
128
- puts w.outter_inbound_links
281
+ #puts w.related_uris(cached_cidian[:seed_uris],cached_cidian[:related_keywords],cached_cidian[:checked_uris],cached_cidian[:related_uris])
282
+ puts w.external_inbound_links
283
+ puts w.internal_inbound_links
284
+ puts w.pagerank
285
+ puts w.external_inbound_links
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: