webpage 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/lib/webpage/common.rb ADDED
@@ -0,0 +1,14 @@
1
# Leniently parses +uri+ and returns it as a normalized URI object.
#
# The value is first parsed as-is. If that raises URI::InvalidURIError
# (for example because it contains spaces or non-ASCII characters), the
# string is percent-escaped and parsed a second time. An error raised by
# the second attempt is not rescued and propagates to the caller.
#
# uri - a URI string (or URI object) to parse.
#
# Returns the normalized URI.
def fuzzy_uri(uri)
  URI(uri).normalize
rescue URI::InvalidURIError
  # URI.encode was removed in Ruby 3.0; the parser's #escape performs the
  # same percent-escaping of unsafe characters.
  URI(URI::DEFAULT_PARSER.escape(uri)).normalize
end
8
# Reopens Mechanize's link class so that #uri parses the href with
# fuzzy_uri instead of a strict parser.
class Mechanize::Page::Link
  # Returns the memoized URI built from @href via fuzzy_uri, or nil when
  # the link has no href.
  def uri
    return @uri if @uri
    @uri = @href ? fuzzy_uri(@href) : nil
  end
end
data/lib/webpage.rb ADDED
@@ -0,0 +1,49 @@
1
+ #coding:UTF-8
2
+ require 'mechanize'
3
+ require 'webpage/common'
4
# Wraps an HTML document parsed with Nokogiri and exposes its text, meta
# keywords/description and outgoing links.
class Webpage
  # body    - HTML source as a String; must not be nil.
  # options - Hash of optional settings:
  #           :encoding - encoding name of +body+. When present the body is
  #                       transcoded to UTF-8 and invalid/undefined byte
  #                       sequences are dropped.
  #           :basepath - absolute URI used to resolve relative hrefs in
  #                       #link_to?.
  #
  # Raises ArgumentError when body is nil.
  def initialize(body, options = {})
    # The comma is required: without it Ruby tries to call a method named
    # ArgumentError and raises NoMethodError instead.
    raise ArgumentError, 'body cannot be empty' unless body
    @options = options
    @body = body
    if @options.key?(:encoding)
      # force_encoding mutates its receiver, so work on a copy rather than
      # on the caller's (possibly frozen) string.
      @body = body.dup.force_encoding(@options[:encoding]).encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
    end
    @nokogiri = Nokogiri::HTML(@body)
  end

  # Returns the concatenated content of every text node in the document.
  def text
    @nokogiri.xpath('//text()').text
  end

  # Returns an Array of the comma-separated entries found in the
  # <meta name="keywords"> tags. Contents of several tags are joined with a
  # comma so that neighbouring keywords are not fused together.
  def keywords
    @keywords ||= meta_contents('keywords').join(',').split(',')
  end

  # Returns the concatenated content of the <meta name="description"> tags
  # (an empty String when there are none).
  def description
    @description ||= meta_contents('description').join
  end

  # Returns an Array with every <a> element followed by every <area>
  # element of the document.
  def links
    @links ||= %w(a area).flat_map { |tag| @nokogiri.xpath("//#{tag}").to_a }
  end

  # Returns true when any link, resolved to an absolute URI, equals
  # +target_uri+. Both sides are normalized before being compared; links
  # whose href cannot be parsed are skipped.
  #
  # Raises RuntimeError when a relative href is met and no absolute
  # :basepath option was given.
  def link_to?(target_uri)
    wanted = fuzzy_uri(target_uri.to_s).to_s
    links.any? do |link|
      begin
        make_href_absolute(link['href']) == wanted
      rescue URI::InvalidURIError
        false
      end
    end
  end

  # Returns true when the href of any link has exactly +host+ as its host.
  # Relative hrefs carry no host and therefore never match a host name;
  # links whose href cannot be parsed are skipped.
  def link_to_host?(host)
    links.any? do |link|
      begin
        fuzzy_uri(link['href'].to_s).host == host
      rescue URI::InvalidURIError
        false
      end
    end
  end

  private

  # Returns the non-nil content attributes of all <meta name=NAME> tags.
  def meta_contents(name)
    @nokogiri.xpath("//meta[@name='#{name}']").map { |meta| meta['content'] }.compact
  end

  # Resolves +href+ against the :basepath option and returns the absolute,
  # normalized URI as a String (always a String, so it can be compared
  # with ==).
  def make_href_absolute(href)
    href = fuzzy_uri(href.to_s)
    return href.to_s if href.absolute?
    raise 'need :basepath in options when initialize' unless @options.key?(:basepath)
    basepath = fuzzy_uri(@options[:basepath])
    raise 'basepath should be absolute' unless basepath.absolute?
    URI.join(basepath, href).normalize.to_s
  end
end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,17 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+ RSpec.configure do |config|
8
+ config.treat_symbols_as_metadata_keys_with_true_values = true
9
+ config.run_all_when_everything_filtered = true
10
+ config.filter_run :focus
11
+
12
+ # Run specs in random order to surface order dependencies. If you find an
13
+ # order dependency and want to debug it, you can fix the order by providing
14
+ # the seed, which is printed after each run.
15
+ # --seed 1234
16
+ config.order = 'random'
17
+ end
data/spec/webpage_spec.rb ADDED
@@ -0,0 +1,59 @@
1
+ #coding:UTF-8
2
+ require 'webpage'
3
# These specs download the page below once, when the file is loaded, and
# run every example against that single Webpage instance.
# NOTE(review): the exact description/keyword expectations presumably
# mirror the live site at the time of writing - confirm before relying on them.
uri = 'http://www.hudong.com/'
to_uri = 'http://123.hudong.com/'
to_host = 'hudong.com'
fetched = Mechanize.new.get uri
page = Webpage.new(fetched.body, {:basepath => uri})

describe Webpage do
  it "text should be a String" do
    page.text.class.should == String
  end

  it "links should be an Array" do
    page.links.class.should == Array
  end

  it "links' elements should be Nokogiri::XML::Element" do
    page.links.each do |link|
      link.class.should == Nokogiri::XML::Element
    end
  end

  it "description should be a String" do
    page.description.class.should == String
  end

  it "keywords should be an Array" do
    page.keywords.class.should == Array
  end

  it "keywords' values should be Strings" do
    page.keywords.each do |keyword|
      keyword.class.should == String
    end
  end

  it "link_to? should return a boolean" do
    [TrueClass,FalseClass].should include page.link_to?(to_uri).class
  end

  it "link_to_host? should return a boolean" do
    [TrueClass,FalseClass].should include page.link_to_host?(to_host).class
  end
end

describe "the instance webpage" do
  it "text should be big enough" do
    page.text.size.should > 500
  end

  it "should have the correct description" do
    page.description.should == '互动百科是基于中文维基技术(维客,wiki百科)的网络百科全书,是全球最大中文百科网及百科全书。互动百科中文网,助您轻松百科探秘'
  end

  it "should have 9 keywords" do
    page.keywords.size.should == 9
  end

  it "should link_to #{to_uri}" do
    page.link_to?(to_uri).should be_true
  end
end
data/webpage.gemspec ADDED
@@ -0,0 +1,12 @@
1
# Gem specification for webpage 0.0.6.
Gem::Specification.new do |s|
  s.name        = %q{webpage}
  s.version     = '0.0.6'
  s.authors     = ["seoaqua"]
  s.date        = %q{2012-07-29}
  s.description = %q{a tool to extract some basic data from a webpage}
  s.email       = %q{seoaqua@qq.com}
  # Package every file tracked by git.
  s.files       = `git ls-files`.split("\n")
  s.homepage    = %q{https://github.com/seoaqua/webpage}
  s.summary     = s.description
  # lib/webpage.rb does `require 'mechanize'` at load time, so the gem is
  # unusable without it: it must be a runtime dependency, not a
  # development-only one.
  s.add_runtime_dependency 'mechanize'
  # The specs use the RSpec 2 `should` / `be_true` API.
  s.add_development_dependency 'rspec', '~> 2.0'
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webpage
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,17 +9,37 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-11 00:00:00.000000000 Z
13
- dependencies: []
14
- description: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
15
- work, careful
12
+ date: 2012-07-29 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: a tool to extract some basic data from a webpage
16
31
  email: seoaqua@qq.com
17
32
  executables: []
18
33
  extensions: []
19
34
  extra_rdoc_files: []
20
35
  files:
21
- - webpage.rb
22
- homepage: http://github.com/seoaqua/ruby-webpage
36
+ - .rspec
37
+ - lib/webpage.rb
38
+ - lib/webpage/common.rb
39
+ - spec/spec_helper.rb
40
+ - spec/webpage_spec.rb
41
+ - webpage.gemspec
42
+ homepage: https://github.com/seoaqua/webpage
23
43
  licenses: []
24
44
  post_install_message:
25
45
  rdoc_options: []
@@ -39,9 +59,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
39
59
  version: '0'
40
60
  requirements: []
41
61
  rubyforge_project:
42
- rubygems_version: 1.8.21
62
+ rubygems_version: 1.8.24
43
63
  signing_key:
44
64
  specification_version: 3
45
- summary: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
46
- work, careful
65
+ summary: a tool to extract some basic data from a webpage
47
66
  test_files: []
data/webpage.rb DELETED
@@ -1,161 +0,0 @@
1
- #coding:UTF-8
2
- require 'pp'
3
- require 'mechanize'
4
- require 'uri'
5
-
6
- class WebHelper
7
- def self.uri_normalize(uri)
8
- uri = URI.parse(uri).normalize
9
- fragment = uri.fragment
10
- uri = uri.to_s
11
- uri.sub!(/##{fragment}$/,'') unless fragment.nil?
12
- return uri
13
- #uri = uri.to_s.strip.sub(/\#.*$/,'')
14
- #uri.path = '/' if uri.path.nil?
15
- end
16
- def self.host_to_domain(host)
17
- domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
18
- return domain[1] unless domain.nil?
19
- return false
20
- end
21
- def self.uri_encode(str)
22
- return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
23
- end
24
- end
25
-
26
- class Mechanize::Page
27
- #@invalid_links = Hash.new
28
- attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
29
- public
30
- def text
31
- return Nokogiri::HTML(body).xpath("//text()").text
32
- #return body.gsub(/<\/?[^>]*>/, "")
33
- end
34
- def keywords
35
- meta = search("//meta[@name='keywords']").first
36
- return meta.attributes["content"].value.split(',') unless meta.nil?
37
- end
38
-
39
- def description
40
- meta = search("//meta[@name='description']").first
41
- if meta.nil?
42
- return false
43
- end
44
- return meta.attributes['content'].value
45
- end
46
-
47
- def pagerank
48
- require 'page_rankr'
49
- @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
50
- return @pagerank
51
- end
52
-
53
- def scan_links
54
- @external_outbound_links = Array.new
55
- @internal_outbound_links = Array.new
56
- @valid_links = Array.new
57
- @invalid_links = Array.new
58
- @nofollowed_links = Array.new
59
- exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
60
- links.each do |link|
61
- #初步解析
62
- =begin
63
- uri = URI.parse(link.uri).normalize
64
- href = uri.to_s
65
- rescue URI::InvalidURIError => e
66
- pp link
67
- puts e
68
- @invalid_links << link
69
- next
70
- =end
71
- #忽略非http请求
72
- if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
73
- @invalid_links << link#todo 不同链接key重复,无法体现
74
- next
75
- end
76
- #忽略非网页文件,忽略js按钮忽略邮件
77
- if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
78
- @invalid_links << link
79
- next
80
- end
81
- #nofollow links
82
- if link.rel.include?'nofollow'
83
- @nofollowed_links << link
84
- next
85
- end
86
- if link.respond_to?'fragment' and link.fragment.empty?
87
- @invalid_links << link
88
- next
89
- end
90
- pp link
91
- #处理相对路径
92
- if !link.uri.nil? and link.uri.relative?
93
- @invalid_links << link
94
- #puts @uri.merge(link)
95
- #link.uri = @uri.merge(link.uri)
96
- @internal_outbound_links << link unless link.uri == @uri
97
- elsif link.uri.nil?
98
- warn "warning: host nil #{link.uri}"
99
- next
100
- else
101
- if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
102
- @internal_outbound_links << link
103
- else
104
- @external_outbound_links << link
105
- end
106
- end
107
- @valid_links << link
108
- end
109
- @outbound_links = @internal_outbound_links + @external_outbound_links
110
- @scanned = true
111
- end
112
- end
113
- class URI::Generic
114
- def absolute?()
115
- if @scheme or path.start_with?'/'
116
- true
117
- else
118
- false
119
- end
120
- end
121
- def domain
122
- domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
123
- return domain[1] unless domain.nil?
124
- return nil
125
- end
126
- =begin
127
- def normalize!
128
- if path && path == ''
129
- set_path('/')
130
- end
131
- if scheme && scheme != scheme.downcase
132
- set_scheme(self.scheme.downcase)
133
- end
134
- if host && host != host.downcase
135
- set_host(self.host.downcase)
136
- end
137
- set_fragment(nil) unless fragment.nil?
138
- end
139
- =end
140
- end
141
- =begin
142
- class URI::Parser
143
- def parse(uri)
144
- scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
145
-
146
- if scheme && URI.scheme_list.include?(scheme.upcase)
147
- URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
148
- else
149
- URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
150
- end
151
- end
152
- end
153
- a = Mechanize.new
154
- w = a.get('http://dict.youdao.com/w/abc/')
155
- w.scan_links
156
- pp w.internal_outbound_links
157
- exit
158
- w.links.each do |link|
159
- puts link.rel
160
- end
161
- =end