webpage 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +2 -0
- data/lib/webpage/common.rb +14 -0
- data/lib/webpage.rb +49 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/webpage_spec.rb +59 -0
- data/webpage.gemspec +12 -0
- metadata +29 -10
- data/webpage.rb +0 -161
data/.rspec
ADDED
data/lib/webpage.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#coding:UTF-8
|
2
|
+
require 'mechanize'
|
3
|
+
require 'webpage/common'
|
4
|
+
# Wraps the HTML of a single web page and exposes a few basic facts about it:
# its text content, meta keywords/description, and the links it contains.
#
# NOTE(review): the class uses Nokogiri without requiring it directly
# (presumably it is loaded through the file's `require 'mechanize'`) and calls
# a `fuzzy_uri` helper that is not defined in this class (presumably provided
# by 'webpage/common') -- confirm both.
class Webpage
  # @param body [String] raw HTML of the page
  # @param options [Hash] optional settings read by this class:
  #   :encoding - encoding the raw body is in; when present the body is
  #               re-tagged with it and transcoded to UTF-8, dropping
  #               invalid/undefined characters
  #   :basepath - absolute URI used to resolve relative hrefs in #link_to?
  # @raise [ArgumentError] when body is nil or false
  def initialize(body, options = {})
    # The comma matters: `raise ArgumentError 'msg'` is parsed as a call to a
    # method named ArgumentError and fails with NoMethodError instead.
    raise ArgumentError, 'body cannot be empty' unless body

    @options = options
    @body = body
    if @options.has_key?(:encoding)
      # force_encoding mutates its receiver, so work on a copy rather than on
      # the caller's string (which may also be frozen).
      retagged = @body.dup.force_encoding(@options[:encoding])
      @body = retagged.encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
    end
    @nokogiri = Nokogiri::HTML(@body)
  end

  # @return [String] concatenation of every text node in the document
  def text
    @nokogiri.xpath('//text()').text
  end

  # @return [Array<String>] comma-separated entries of every
  #   <meta name="keywords"> tag; empty when the page has none.
  #   Entries are not stripped of surrounding whitespace.
  def keywords
    # Split each tag on its own so the last keyword of one tag is not fused
    # with the first keyword of the next.
    @keywords ||= @nokogiri.xpath("//meta[@name='keywords']").flat_map { |meta| meta['content'].to_s.split(',') }
  end

  # @return [String] contents of all <meta name="description"> tags joined
  #   together; "" when the page has none
  def description
    @description ||= @nokogiri.xpath("//meta[@name='description']").map { |meta| meta['content'] }.join
  end

  # @return [Array] every <a> element followed by every <area> element
  def links
    @links ||= %w(a area).flat_map { |tag| @nokogiri.xpath("//#{tag}").to_a }
  end

  # Tells whether any link on the page resolves to exactly +target_uri+.
  #
  # @param target_uri [String, #to_s] absolute URI to look for
  # @return [Boolean]
  # @raise [RuntimeError] when a relative href is encountered and no absolute
  #   :basepath option was supplied
  def link_to?(target_uri)
    wanted = target_uri.to_s
    links.any? { |link| make_href_absolute(link['href']) == wanted }
  end

  # Tells whether any link on the page points at exactly +host+.
  #
  # @param host [String] host name compared verbatim against each link's host
  # @return [Boolean]
  def link_to_host?(host)
    # The target of a link lives in its 'href' attribute, as in #link_to?.
    links.any? { |link| fuzzy_uri(link['href'].to_s).host == host }
  end

  private

  # Resolves +href+ against the :basepath option.
  #
  # @param href [String, nil] value of a link's href attribute
  # @return [String] absolute URI; always a String so callers can compare it
  #   with ==
  # @raise [RuntimeError] when href is relative and :basepath is missing or
  #   not absolute
  def make_href_absolute(href)
    href = fuzzy_uri(href.to_s)
    return href.to_s if href.absolute?
    raise 'need :basepath in options when initialize' unless @options.has_key?(:basepath)

    basepath = fuzzy_uri(@options[:basepath])
    raise 'basepath should be absolute' unless basepath.absolute?

    URI.join(basepath, href).to_s
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true

  # Together: run only examples tagged :focus, or the whole suite when that
  # filter leaves nothing to run.
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true

  # Randomised ordering exposes examples that depend on each other. To replay
  # a particular ordering, pass the seed printed after the run:
  #     --seed 1234
  rspec.order = 'random'
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#coding:UTF-8
|
2
|
+
require 'webpage'
|
3
|
+
# Integration spec for Webpage. It downloads a live page once, when the file is
# loaded, so it needs network access and depends on the current content of
# that site.
uri = 'http://www.hudong.com/'
to_uri = 'http://123.hudong.com/'
to_host = 'hudong.com'
fetched = Mechanize.new.get uri
page = Webpage.new(fetched.body, {:basepath=>uri})

# Type-level checks: what each public method returns.
describe Webpage do
  it "text should be String" do
    page.text.class.should == String
  end

  it "links should be an array" do
    page.links.class.should == Array
  end

  # Title now names the class the assertion actually checks.
  it "links' elements should be Nokogiri::XML::Element" do
    page.links.each do |link|
      link.class.should == Nokogiri::XML::Element
    end
  end

  it "description should be text" do
    page.description.class.should == String
  end

  it "keywords should be array" do
    page.keywords.class.should == Array
  end

  it "keywords' values should be strings" do
    page.keywords.each do |keyword|
      keyword.class.should == String
    end
  end

  it "link_to? should return bool" do
    [TrueClass,FalseClass].should include page.link_to?(to_uri).class
  end

  it "link_to_host? should return bool" do
    [TrueClass,FalseClass].should include page.link_to_host?(to_host).class
  end
end

# Content-level checks against the page fetched above.
describe "the instance webpage" do
  it "text should be big enough" do
    page.text.size.should > 500
  end

  it "should have the correct description" do
    page.description.should == '互动百科是基于中文维基技术(维客,wiki百科)的网络百科全书,是全球最大中文百科网及百科全书。互动百科中文网,助您轻松百科探秘'
  end

  # Title now states the count the assertion actually checks.
  it "should have 9 keywords" do
    page.keywords.size.should == 9
  end

  it "should link_to #{to_uri}" do
    page.link_to?(to_uri).should be_true
  end
end
|
data/webpage.gemspec
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Gem specification for the webpage gem.
Gem::Specification.new do |s|
  s.name = %q{webpage}
  s.version = '0.0.6'
  s.authors = ["seoaqua"]
  s.date = %q{2012-07-29}
  s.description = %q{a tool to extract some basic data from a webpage}
  s.email = %q{seoaqua@qq.com}
  # Package exactly the files tracked by git; this shells out, so the gem must
  # be built from inside a git checkout.
  s.files = `git ls-files`.split("\n")
  s.homepage = %q{https://github.com/seoaqua/webpage}
  s.summary = s.description
  # lib/webpage.rb does `require 'mechanize'` at load time, so mechanize has to
  # be installed together with the gem, not only in development.
  s.add_runtime_dependency 'mechanize'
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,17 +9,37 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
13
|
-
dependencies:
|
14
|
-
|
15
|
-
|
12
|
+
date: 2012-07-29 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: mechanize
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: a tool to extract some basic data from a webpage
|
16
31
|
email: seoaqua@qq.com
|
17
32
|
executables: []
|
18
33
|
extensions: []
|
19
34
|
extra_rdoc_files: []
|
20
35
|
files:
|
21
|
-
-
|
22
|
-
|
36
|
+
- .rspec
|
37
|
+
- lib/webpage.rb
|
38
|
+
- lib/webpage/common.rb
|
39
|
+
- spec/spec_helper.rb
|
40
|
+
- spec/webpage_spec.rb
|
41
|
+
- webpage.gemspec
|
42
|
+
homepage: https://github.com/seoaqua/webpage
|
23
43
|
licenses: []
|
24
44
|
post_install_message:
|
25
45
|
rdoc_options: []
|
@@ -39,9 +59,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
39
59
|
version: '0'
|
40
60
|
requirements: []
|
41
61
|
rubyforge_project:
|
42
|
-
rubygems_version: 1.8.
|
62
|
+
rubygems_version: 1.8.24
|
43
63
|
signing_key:
|
44
64
|
specification_version: 3
|
45
|
-
summary:
|
46
|
-
work, careful
|
65
|
+
summary: a tool to extract some basic data from a webpage
|
47
66
|
test_files: []
|
data/webpage.rb
DELETED
@@ -1,161 +0,0 @@
|
|
1
|
-
#coding:UTF-8
|
2
|
-
require 'pp'
|
3
|
-
require 'mechanize'
|
4
|
-
require 'uri'
|
5
|
-
|
6
|
-
class WebHelper
|
7
|
-
def self.uri_normalize(uri)
|
8
|
-
uri = URI.parse(uri).normalize
|
9
|
-
fragment = uri.fragment
|
10
|
-
uri = uri.to_s
|
11
|
-
uri.sub!(/##{fragment}$/,'') unless fragment.nil?
|
12
|
-
return uri
|
13
|
-
#uri = uri.to_s.strip.sub(/\#.*$/,'')
|
14
|
-
#uri.path = '/' if uri.path.nil?
|
15
|
-
end
|
16
|
-
def self.host_to_domain(host)
|
17
|
-
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
18
|
-
return domain[1] unless domain.nil?
|
19
|
-
return false
|
20
|
-
end
|
21
|
-
def self.uri_encode(str)
|
22
|
-
return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
class Mechanize::Page
|
27
|
-
#@invalid_links = Hash.new
|
28
|
-
attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
|
29
|
-
public
|
30
|
-
def text
|
31
|
-
return Nokogiri::HTML(body).xpath("//text()").text
|
32
|
-
#return body.gsub(/<\/?[^>]*>/, "")
|
33
|
-
end
|
34
|
-
def keywords
|
35
|
-
meta = search("//meta[@name='keywords']").first
|
36
|
-
return meta.attributes["content"].value.split(',') unless meta.nil?
|
37
|
-
end
|
38
|
-
|
39
|
-
def description
|
40
|
-
meta = search("//meta[@name='description']").first
|
41
|
-
if meta.nil?
|
42
|
-
return false
|
43
|
-
end
|
44
|
-
return meta.attributes['content'].value
|
45
|
-
end
|
46
|
-
|
47
|
-
def pagerank
|
48
|
-
require 'page_rankr'
|
49
|
-
@pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
|
50
|
-
return @pagerank
|
51
|
-
end
|
52
|
-
|
53
|
-
def scan_links
|
54
|
-
@external_outbound_links = Array.new
|
55
|
-
@internal_outbound_links = Array.new
|
56
|
-
@valid_links = Array.new
|
57
|
-
@invalid_links = Array.new
|
58
|
-
@nofollowed_links = Array.new
|
59
|
-
exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
|
60
|
-
links.each do |link|
|
61
|
-
#初步解析
|
62
|
-
=begin
|
63
|
-
uri = URI.parse(link.uri).normalize
|
64
|
-
href = uri.to_s
|
65
|
-
rescue URI::InvalidURIError => e
|
66
|
-
pp link
|
67
|
-
puts e
|
68
|
-
@invalid_links << link
|
69
|
-
next
|
70
|
-
=end
|
71
|
-
#忽略非http请求
|
72
|
-
if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
|
73
|
-
@invalid_links << link#todo 不同链接key重复,无法体现
|
74
|
-
next
|
75
|
-
end
|
76
|
-
#忽略非网页文件,忽略js按钮忽略邮件
|
77
|
-
if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
|
78
|
-
@invalid_links << link
|
79
|
-
next
|
80
|
-
end
|
81
|
-
#nofollow links
|
82
|
-
if link.rel.include?'nofollow'
|
83
|
-
@nofollowed_links << link
|
84
|
-
next
|
85
|
-
end
|
86
|
-
if link.respond_to?'fragment' and link.fragment.empty?
|
87
|
-
@invalid_links << link
|
88
|
-
next
|
89
|
-
end
|
90
|
-
pp link
|
91
|
-
#处理相对路径
|
92
|
-
if !link.uri.nil? and link.uri.relative?
|
93
|
-
@invalid_links << link
|
94
|
-
#puts @uri.merge(link)
|
95
|
-
#link.uri = @uri.merge(link.uri)
|
96
|
-
@internal_outbound_links << link unless link.uri == @uri
|
97
|
-
elsif link.uri.nil?
|
98
|
-
warn "warning: host nil #{link.uri}"
|
99
|
-
next
|
100
|
-
else
|
101
|
-
if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
|
102
|
-
@internal_outbound_links << link
|
103
|
-
else
|
104
|
-
@external_outbound_links << link
|
105
|
-
end
|
106
|
-
end
|
107
|
-
@valid_links << link
|
108
|
-
end
|
109
|
-
@outbound_links = @internal_outbound_links + @external_outbound_links
|
110
|
-
@scanned = true
|
111
|
-
end
|
112
|
-
end
|
113
|
-
class URI::Generic
|
114
|
-
def absolute?()
|
115
|
-
if @scheme or path.start_with?'/'
|
116
|
-
true
|
117
|
-
else
|
118
|
-
false
|
119
|
-
end
|
120
|
-
end
|
121
|
-
def domain
|
122
|
-
domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
|
123
|
-
return domain[1] unless domain.nil?
|
124
|
-
return nil
|
125
|
-
end
|
126
|
-
=begin
|
127
|
-
def normalize!
|
128
|
-
if path && path == ''
|
129
|
-
set_path('/')
|
130
|
-
end
|
131
|
-
if scheme && scheme != scheme.downcase
|
132
|
-
set_scheme(self.scheme.downcase)
|
133
|
-
end
|
134
|
-
if host && host != host.downcase
|
135
|
-
set_host(self.host.downcase)
|
136
|
-
end
|
137
|
-
set_fragment(nil) unless fragment.nil?
|
138
|
-
end
|
139
|
-
=end
|
140
|
-
end
|
141
|
-
=begin
|
142
|
-
class URI::Parser
|
143
|
-
def parse(uri)
|
144
|
-
scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
|
145
|
-
|
146
|
-
if scheme && URI.scheme_list.include?(scheme.upcase)
|
147
|
-
URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
|
148
|
-
else
|
149
|
-
URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
153
|
-
a = Mechanize.new
|
154
|
-
w = a.get('http://dict.youdao.com/w/abc/')
|
155
|
-
w.scan_links
|
156
|
-
pp w.internal_outbound_links
|
157
|
-
exit
|
158
|
-
w.links.each do |link|
|
159
|
-
puts link.rel
|
160
|
-
end
|
161
|
-
=end
|