webpage 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +2 -0
- data/lib/webpage/common.rb +14 -0
- data/lib/webpage.rb +49 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/webpage_spec.rb +59 -0
- data/webpage.gemspec +12 -0
- metadata +29 -10
- data/webpage.rb +0 -161
data/.rspec
ADDED
data/lib/webpage.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#coding:UTF-8
|
2
|
+
require 'mechanize'
|
3
|
+
require 'webpage/common'
|
4
|
+
# Wraps the HTML source of one web page and exposes its text, meta data and
# outgoing links.
#
# Options read by this class:
# * +:encoding+ - encoding the raw body is really in; when present the body
#   is transcoded to UTF-8 and unconvertible bytes are dropped.
# * +:basepath+ - absolute URI of the page; required as soon as a relative
#   href has to be resolved (see #link_to?).
#
# NOTE(review): +fuzzy_uri+ comes from 'webpage/common', which is not visible
# here. It is assumed to return a URI-like object responding to +absolute?+,
# +host+ and +to_s+ - confirm against that file.
class Webpage
  # @param body [String] raw HTML of the page
  # @param options [Hash] see the class comment
  # @raise [ArgumentError] when +body+ is nil or false
  def initialize(body, options = {})
    # The comma matters: `raise ArgumentError 'msg'` would call a method
    # named ArgumentError and blow up with NoMethodError instead.
    raise ArgumentError, 'body cannot be empty' unless body

    @options = options
    @body = body
    if @options.key?(:encoding)
      # force_encoding retags its receiver in place, so work on a copy to
      # leave the caller's string alone (and to accept frozen strings).
      @body = body.dup.force_encoding(@options[:encoding])
                  .encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
    end
    @nokogiri = Nokogiri::HTML(@body)
  end

  # @return [String] concatenation of every text node in the document
  def text
    @nokogiri.xpath('//text()').text
  end

  # @return [Array<String>] comma-separated entries of all
  #   <meta name="keywords"> tags (memoized)
  def keywords
    # Join the tags with ',' so the last keyword of one tag is not fused
    # with the first keyword of the next; tags without content are skipped.
    @keywords ||= @nokogiri.xpath("//meta[@name='keywords']")
                           .map { |meta| meta['content'] }
                           .compact
                           .join(',')
                           .split(',')
  end

  # @return [String] content of the <meta name="description"> tag(s),
  #   concatenated; empty string when there is none (memoized)
  def description
    @description ||= @nokogiri.xpath("//meta[@name='description']")
                              .map { |meta| meta['content'] }
                              .join
  end

  # @return [Array] every <a> and <area> node of the document (memoized)
  def links
    @links ||= %w(a area).flat_map { |tag| @nokogiri.xpath("//#{tag}").to_a }
  end

  # @param target_uri [String, #to_s] absolute URI to look for
  # @return [Boolean] whether any link, once made absolute, equals +target_uri+
  # @raise [RuntimeError] when a relative href is met and no usable
  #   +:basepath+ option was given
  def link_to?(target_uri)
    target = target_uri.to_s
    links.any? { |link| make_href_absolute(link['href']) == target }
  end

  # @param host [String] host name to look for
  # @return [Boolean] whether the host of any link's href equals +host+
  def link_to_host?(host)
    # Anchors carry their target in 'href'; there is no 'uri' attribute.
    links.any? { |link| fuzzy_uri(link['href'].to_s).host == host }
  end

  private

  # Resolves +href+ against the +:basepath+ option.
  #
  # @param href [String, nil] href attribute of a link
  # @return [String] the absolute URI, always as a String so that callers
  #   can compare it with plain strings
  # @raise [RuntimeError] when +href+ is relative and +:basepath+ is missing
  #   or not absolute
  def make_href_absolute(href)
    href = fuzzy_uri(href.to_s)
    return href.to_s if href.absolute?

    raise 'need :basepath in options when initialize' unless @options.key?(:basepath)

    basepath = fuzzy_uri(@options[:basepath])
    raise 'basepath should be absolute' unless basepath.absolute?

    URI.join(basepath.to_s, href.to_s).to_s
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
config.run_all_when_everything_filtered = true
|
10
|
+
config.filter_run :focus
|
11
|
+
|
12
|
+
# Run specs in random order to surface order dependencies. If you find an
|
13
|
+
# order dependency and want to debug it, you can fix the order by providing
|
14
|
+
# the seed, which is printed after each run.
|
15
|
+
# --seed 1234
|
16
|
+
config.order = 'random'
|
17
|
+
end
|
data/spec/webpage_spec.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#coding:UTF-8
|
2
|
+
require 'webpage'
|
3
|
+
# Integration spec for Webpage.
#
# The page under test is downloaded once, when this file is loaded, so the
# examples need network access and the content-based expectations in the
# second group depend on what the live site currently serves.
uri = 'http://www.hudong.com/'
to_uri = 'http://123.hudong.com/'
to_host = 'hudong.com'
response = Mechanize.new.get uri
page = Webpage.new(response.body, {:basepath => uri})

# Type-level contract of the public API.
describe Webpage do
  it "text should be String" do
    page.text.class.should == String
  end

  it "links should be an array" do
    page.links.class.should == Array
  end

  it "links' elements should be Nokogiri::XML::Element" do
    page.links.each do |link|
      link.class.should == Nokogiri::XML::Element
    end
  end

  it "description should be text" do
    page.description.class.should == String
  end

  it "keywords should be array" do
    page.keywords.class.should == Array
  end

  it "keywords' values should be strings" do
    page.keywords.each do |keyword|
      keyword.class.should == String
    end
  end

  it "link_to? should return bool" do
    [TrueClass, FalseClass].should include(page.link_to?(to_uri).class)
  end

  it "link_to_host? should return bool" do
    [TrueClass, FalseClass].should include(page.link_to_host?(to_host).class)
  end
end

# Expectations tied to the content of the downloaded page.
describe "the instance webpage" do
  it "text should be big enough" do
    page.text.size.should > 500
  end

  it "should have the correct description" do
    page.description.should == '互动百科是基于中文维基技术(维客,wiki百科)的网络百科全书,是全球最大中文百科网及百科全书。互动百科中文网,助您轻松百科探秘'
  end

  it "should have 9 keywords" do
    page.keywords.size.should == 9
  end

  it "should link_to #{to_uri}" do
    page.link_to?(to_uri).should be_true
  end
end
|
data/webpage.gemspec
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Gem specification for the webpage gem.
Gem::Specification.new do |s|
  s.name = %q{webpage}
  s.version = '0.0.6'
  s.authors = ["seoaqua"]
  s.date = %q{2012-07-29}
  s.description = %q{a tool to extract some basic data from a webpage}
  s.email = %q{seoaqua@qq.com}
  # Package whatever git tracks; building therefore has to run inside the
  # git checkout.
  s.files = `git ls-files`.split("\n")
  s.homepage = %q{https://github.com/seoaqua/webpage}
  s.summary = s.description
  # lib/webpage.rb does `require 'mechanize'` at load time, so the gem cannot
  # even be required without it: declare it as a runtime dependency, not a
  # development-only one.
  s.add_dependency 'mechanize'
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,17 +9,37 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
13
|
-
dependencies:
|
14
|
-
|
15
|
-
|
12
|
+
date: 2012-07-29 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: mechanize
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: a tool to extract some basic data from a webpage
|
16
31
|
email: seoaqua@qq.com
|
17
32
|
executables: []
|
18
33
|
extensions: []
|
19
34
|
extra_rdoc_files: []
|
20
35
|
files:
|
21
|
-
-
|
22
|
-
|
36
|
+
- .rspec
|
37
|
+
- lib/webpage.rb
|
38
|
+
- lib/webpage/common.rb
|
39
|
+
- spec/spec_helper.rb
|
40
|
+
- spec/webpage_spec.rb
|
41
|
+
- webpage.gemspec
|
42
|
+
homepage: https://github.com/seoaqua/webpage
|
23
43
|
licenses: []
|
24
44
|
post_install_message:
|
25
45
|
rdoc_options: []
|
@@ -39,9 +59,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
39
59
|
version: '0'
|
40
60
|
requirements: []
|
41
61
|
rubyforge_project:
|
42
|
-
rubygems_version: 1.8.
|
62
|
+
rubygems_version: 1.8.24
|
43
63
|
signing_key:
|
44
64
|
specification_version: 3
|
45
|
-
summary:
|
46
|
-
work, careful
|
65
|
+
summary: a tool to extract some basic data from a webpage
|
47
66
|
test_files: []
|
data/webpage.rb
DELETED
@@ -1,161 +0,0 @@
|
|
1
|
-
#coding:UTF-8
|
2
|
-
require 'pp'
|
3
|
-
require 'mechanize'
|
4
|
-
require 'uri'
|
5
|
-
|
6
|
-
class WebHelper
|
7
|
-
def self.uri_normalize(uri)
|
8
|
-
uri = URI.parse(uri).normalize
|
9
|
-
fragment = uri.fragment
|
10
|
-
uri = uri.to_s
|
11
|
-
uri.sub!(/##{fragment}$/,'') unless fragment.nil?
|
12
|
-
return uri
|
13
|
-
#uri = uri.to_s.strip.sub(/\#.*$/,'')
|
14
|
-
#uri.path = '/' if uri.path.nil?
|
15
|
-
end
|
16
|
-
def self.host_to_domain(host)
|
17
|
-
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
18
|
-
return domain[1] unless domain.nil?
|
19
|
-
return false
|
20
|
-
end
|
21
|
-
def self.uri_encode(str)
|
22
|
-
return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
class Mechanize::Page
|
27
|
-
#@invalid_links = Hash.new
|
28
|
-
attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
|
29
|
-
public
|
30
|
-
def text
|
31
|
-
return Nokogiri::HTML(body).xpath("//text()").text
|
32
|
-
#return body.gsub(/<\/?[^>]*>/, "")
|
33
|
-
end
|
34
|
-
def keywords
|
35
|
-
meta = search("//meta[@name='keywords']").first
|
36
|
-
return meta.attributes["content"].value.split(',') unless meta.nil?
|
37
|
-
end
|
38
|
-
|
39
|
-
def description
|
40
|
-
meta = search("//meta[@name='description']").first
|
41
|
-
if meta.nil?
|
42
|
-
return false
|
43
|
-
end
|
44
|
-
return meta.attributes['content'].value
|
45
|
-
end
|
46
|
-
|
47
|
-
def pagerank
|
48
|
-
require 'page_rankr'
|
49
|
-
@pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
|
50
|
-
return @pagerank
|
51
|
-
end
|
52
|
-
|
53
|
-
def scan_links
|
54
|
-
@external_outbound_links = Array.new
|
55
|
-
@internal_outbound_links = Array.new
|
56
|
-
@valid_links = Array.new
|
57
|
-
@invalid_links = Array.new
|
58
|
-
@nofollowed_links = Array.new
|
59
|
-
exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
|
60
|
-
links.each do |link|
|
61
|
-
#初步解析
|
62
|
-
=begin
|
63
|
-
uri = URI.parse(link.uri).normalize
|
64
|
-
href = uri.to_s
|
65
|
-
rescue URI::InvalidURIError => e
|
66
|
-
pp link
|
67
|
-
puts e
|
68
|
-
@invalid_links << link
|
69
|
-
next
|
70
|
-
=end
|
71
|
-
#忽略非http请求
|
72
|
-
if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
|
73
|
-
@invalid_links << link#todo 不同链接key重复,无法体现
|
74
|
-
next
|
75
|
-
end
|
76
|
-
#忽略非网页文件,忽略js按钮忽略邮件
|
77
|
-
if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
|
78
|
-
@invalid_links << link
|
79
|
-
next
|
80
|
-
end
|
81
|
-
#nofollow links
|
82
|
-
if link.rel.include?'nofollow'
|
83
|
-
@nofollowed_links << link
|
84
|
-
next
|
85
|
-
end
|
86
|
-
if link.respond_to?'fragment' and link.fragment.empty?
|
87
|
-
@invalid_links << link
|
88
|
-
next
|
89
|
-
end
|
90
|
-
pp link
|
91
|
-
#处理相对路径
|
92
|
-
if !link.uri.nil? and link.uri.relative?
|
93
|
-
@invalid_links << link
|
94
|
-
#puts @uri.merge(link)
|
95
|
-
#link.uri = @uri.merge(link.uri)
|
96
|
-
@internal_outbound_links << link unless link.uri == @uri
|
97
|
-
elsif link.uri.nil?
|
98
|
-
warn "warning: host nil #{link.uri}"
|
99
|
-
next
|
100
|
-
else
|
101
|
-
if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
|
102
|
-
@internal_outbound_links << link
|
103
|
-
else
|
104
|
-
@external_outbound_links << link
|
105
|
-
end
|
106
|
-
end
|
107
|
-
@valid_links << link
|
108
|
-
end
|
109
|
-
@outbound_links = @internal_outbound_links + @external_outbound_links
|
110
|
-
@scanned = true
|
111
|
-
end
|
112
|
-
end
|
113
|
-
class URI::Generic
|
114
|
-
def absolute?()
|
115
|
-
if @scheme or path.start_with?'/'
|
116
|
-
true
|
117
|
-
else
|
118
|
-
false
|
119
|
-
end
|
120
|
-
end
|
121
|
-
def domain
|
122
|
-
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
123
|
-
return domain[1] unless domain.nil?
|
124
|
-
return nil
|
125
|
-
end
|
126
|
-
=begin
|
127
|
-
def normalize!
|
128
|
-
if path && path == ''
|
129
|
-
set_path('/')
|
130
|
-
end
|
131
|
-
if scheme && scheme != scheme.downcase
|
132
|
-
set_scheme(self.scheme.downcase)
|
133
|
-
end
|
134
|
-
if host && host != host.downcase
|
135
|
-
set_host(self.host.downcase)
|
136
|
-
end
|
137
|
-
set_fragment(nil) unless fragment.nil?
|
138
|
-
end
|
139
|
-
=end
|
140
|
-
end
|
141
|
-
=begin
|
142
|
-
class URI::Parser
|
143
|
-
def parse(uri)
|
144
|
-
scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
|
145
|
-
|
146
|
-
if scheme && URI.scheme_list.include?(scheme.upcase)
|
147
|
-
URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
|
148
|
-
else
|
149
|
-
URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
a = Mechanize.new
|
154
|
-
w = a.get('http://dict.youdao.com/w/abc/')
|
155
|
-
w.scan_links
|
156
|
-
pp w.internal_outbound_links
|
157
|
-
exit
|
158
|
-
w.links.each do |link|
|
159
|
-
puts link.rel
|
160
|
-
end
|
161
|
-
=end
|