webpage 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format progress
data/lib/webpage/common.rb ADDED
@@ -0,0 +1,14 @@
+ def fuzzy_uri(uri)
+   begin
+     URI(uri).normalize
+   rescue URI::InvalidURIError
+     URI(URI.encode(uri)).normalize
+   end
+ end
+ class Mechanize::Page::Link
+   def uri
+     @uri ||= if @href then
+       fuzzy_uri(@href)
+     end
+   end
+ end
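
The new helper file defines a top-level `fuzzy_uri` that normalizes a URI string and, if `URI()` rejects it, retries after percent-encoding it; `Mechanize::Page::Link#uri` is then overridden to build its cached URI through that helper. A minimal sketch of the intended behaviour (inputs are illustrative; note that `URI.encode` was removed in Ruby 3.0, so the rescue branch only works on older Rubies):

    require 'webpage'   # pulls in webpage/common, which defines fuzzy_uri at the top level

    # A well-formed URI is parsed and normalized (scheme and host lowercased,
    # an empty path becomes "/").
    fuzzy_uri('HTTP://Example.COM').to_s
    #=> "http://example.com/"

    # A URI that URI() rejects -- here because of the embedded space -- is
    # percent-encoded and parsed again.
    fuzzy_uri('http://example.com/some path').to_s
    #=> "http://example.com/some%20path"
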
data/lib/webpage.rb ADDED
@@ -0,0 +1,49 @@
+ #coding:UTF-8
+ require 'mechanize'
+ require 'webpage/common'
+ class Webpage
+   def initialize(body,options={})
+     raise ArgumentError 'body cannot be empty' unless body
+     @body = body
+     @options = options
+     @body = @body.force_encoding(@options[:encoding]).encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "") if @options.has_key?:encoding
+     @nokogiri = Nokogiri::HTML(@body)
+   end
+
+   def text
+     return @nokogiri.xpath("//text()").text
+     #return body.gsub(/<\/?[^>]*>/, "")
+   end
+
+   def keywords
+     @keywords ||= @nokogiri.xpath("//meta[@name='keywords']").map{|meta|meta['content']}.flatten.join.split(',')
+     #content = meta.attributes["content"].value unless meta.nil?
+     #return content.split(',') unless content.nil?
+   end
+
+   def description
+     @description ||= @nokogiri.xpath("//meta[@name='description']").map{|meta|meta['content']}.flatten.join
+   end
+
+   def links
+     @links ||= %w(a area).map do |tag|
+       @nokogiri.xpath("//#{tag}")
+     end.flatten
+   end
+   def link_to?(target_uri)
+     links.any?{|link|make_href_absolute(link['href']) == target_uri}
+   end
+   def link_to_host?(host)
+     links.any?{|link|fuzzy_uri(link['uri'].to_s).host == host}
+   end
+
+   private
+   def make_href_absolute(href)
+     href = fuzzy_uri(href.to_s)
+     return href.to_s if href.absolute?
+     raise 'need :basepath in options when initialize' unless @options.has_key?:basepath
+     basepath = fuzzy_uri(@options[:basepath])
+     raise 'basepath should be absolute' unless basepath.absolute?
+     URI.join(basepath,href)
+   end
+ end
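
Webpage itself wraps an HTML string in a Nokogiri document and exposes the page text, meta keywords and description, and link helpers; `link_to?` resolves relative hrefs against the `:basepath` option. A minimal usage sketch, mirroring the spec below (the URL and the checked link are illustrative only):

    require 'webpage'

    uri  = 'http://www.example.com/'               # illustrative URL
    html = Mechanize.new.get(uri).body             # any HTML string works here
    page = Webpage.new(html, :basepath => uri)     # :basepath is needed by link_to?

    page.text                                   # visible text, via the //text() XPath
    page.keywords                               # Array split from <meta name="keywords">
    page.description                            # String from <meta name="description">
    page.links.size                             # all <a> and <area> nodes
    page.link_to?('http://www.example.com/a')   # intended: true if some href resolves to this URI
    page.link_to_host?('example.com')           # intended: true if some link points at this host
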
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,17 @@
+ # This file was generated by the `rspec --init` command. Conventionally, all
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+ # Require this file using `require "spec_helper"` to ensure that it is only
+ # loaded once.
+ #
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+
+   # Run specs in random order to surface order dependencies. If you find an
+   # order dependency and want to debug it, you can fix the order by providing
+   # the seed, which is printed after each run.
+   # --seed 1234
+   config.order = 'random'
+ end
data/spec/webpage_spec.rb ADDED
@@ -0,0 +1,59 @@
+ #coding:UTF-8
+ require 'webpage'
+ uri = 'http://www.hudong.com/'
+ to_uri = 'http://123.hudong.com/'
+ to_host = 'hudong.com'
+ page = Mechanize.new.get uri
+ page = Webpage.new(page.body,{:basepath=>uri})
+ describe Webpage do
+   it "text should be String" do
+     page.text.class.should == String
+   end
+
+   it "links should be an array" do
+     page.links.class.should == Array
+   end
+   it "links' elements should be Webpage::Link" do
+     page.links.each do |link|
+       link.class.should == Nokogiri::XML::Element
+     end
+   end
+
+   it "description should be text" do
+     page.description.class.should == String
+   end
+
+   it "keywords should be array" do
+     page.keywords.class.should == Array
+   end
+
+   it "keywords's values should be strings" do
+     page.keywords.each do |keyword|
+       keyword.class.should == String
+     end
+   end
+
+   it "link_to? should return bool" do
+     [TrueClass,FalseClass].should include page.link_to?(to_uri).class
+   end
+
+   it "link_to_host? should return bool" do
+     [TrueClass,FalseClass].should include page.link_to_host?(to_host).class
+   end
+ end
+
+ describe "the instance webpage" do
+   it "text should be big enought" do
+     page.text.size.should > 500
+   end
+   it "should has correct description " do
+     page.description.should == '互动百科是基于中文维基技术(维客,wiki百科)的网络百科全书,是全球最大中文百科网及百科全书。互动百科中文网,助您轻松百科探秘'
+   end
+   it "should has 7 keywords" do
+     page.keywords.size.should == 9
+   end
+
+   it "should link_to #{to_uri}" do
+     page.link_to?(to_uri).should be_true
+   end
+ end
data/webpage.gemspec ADDED
@@ -0,0 +1,12 @@
+ Gem::Specification.new do |s|
+   s.name = %q{webpage}
+   s.version = '0.0.6'
+   s.authors = ["seoaqua"]
+   s.date = %q{2012-07-29}
+   s.description = %q{a tool to extract some basic data from a webpage}
+   s.email = %q{seoaqua@qq.com}
+   s.files = `git ls-files`.split("\n")
+   s.homepage = %q{https://github.com/seoaqua/webpage}
+   s.summary = s.description
+   s.add_development_dependency 'mechanize'
+ end
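
One thing worth noting in the gemspec: mechanize is declared only as a development dependency, while lib/webpage.rb requires it at runtime, so a consuming project has to pull it in itself. An illustrative Gemfile for such a project:

    # Gemfile (illustrative) for an application using webpage 0.0.6
    source 'https://rubygems.org'

    gem 'webpage', '0.0.6'
    gem 'mechanize'   # required at runtime by lib/webpage.rb but not declared as a runtime dependency
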
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: webpage
  version: !ruby/object:Gem::Version
-   version: 0.0.5
+   version: 0.0.6
  prerelease:
  platform: ruby
  authors:
@@ -9,17 +9,37 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-04-11 00:00:00.000000000 Z
- dependencies: []
- description: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
-   work, careful
+ date: 2012-07-29 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: a tool to extract some basic data from a webpage
  email: seoaqua@qq.com
  executables: []
  extensions: []
  extra_rdoc_files: []
  files:
- - webpage.rb
- homepage: http://github.com/seoaqua/ruby-webpage
+ - .rspec
+ - lib/webpage.rb
+ - lib/webpage/common.rb
+ - spec/spec_helper.rb
+ - spec/webpage_spec.rb
+ - webpage.gemspec
+ homepage: https://github.com/seoaqua/webpage
  licenses: []
  post_install_message:
  rdoc_options: []
@@ -39,9 +59,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
        version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.21
+ rubygems_version: 1.8.24
  signing_key:
  specification_version: 3
- summary: modify Mechanize::Page to show seo oriented reports of the webpage,newbie's
-   work, careful
+ summary: a tool to extract some basic data from a webpage
  test_files: []
data/webpage.rb DELETED
@@ -1,161 +0,0 @@
- #coding:UTF-8
- require 'pp'
- require 'mechanize'
- require 'uri'
-
- class WebHelper
-   def self.uri_normalize(uri)
-     uri = URI.parse(uri).normalize
-     fragment = uri.fragment
-     uri = uri.to_s
-     uri.sub!(/##{fragment}$/,'') unless fragment.nil?
-     return uri
-     #uri = uri.to_s.strip.sub(/\#.*$/,'')
-     #uri.path = '/' if uri.path.nil?
-   end
-   def self.host_to_domain(host)
-     domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
-     return domain[1] unless domain.nil?
-     return false
-   end
-   def self.uri_encode(str)
-     return URI.encode(str,Regexp.new("[^#{URI::PATTERN::UNRESERVED+'#:/?%&='}]"))
-   end
- end
-
- class Mechanize::Page
-   #@invalid_links = Hash.new
-   attr_reader :valid_links,:invalid_links,:outbound_links,:internal_outbound_links,:external_outbound_links
-   public
-   def text
-     return Nokogiri::HTML(body).xpath("//text()").text
-     #return body.gsub(/<\/?[^>]*>/, "")
-   end
-   def keywords
-     meta = search("//meta[@name='keywords']").first
-     return meta.attributes["content"].value.split(',') unless meta.nil?
-   end
-
-   def description
-     meta = search("//meta[@name='description']").first
-     if meta.nil?
-       return false
-     end
-     return meta.attributes['content'].value
-   end
-
-   def pagerank
-     require 'page_rankr'
-     @pagerank = PageRankr.ranks(@uri.to_s, :google)[:google]
-     return @pagerank
-   end
-
-   def scan_links
-     @external_outbound_links = Array.new
-     @internal_outbound_links = Array.new
-     @valid_links = Array.new
-     @invalid_links = Array.new
-     @nofollowed_links = Array.new
-     exts_to_ignored = %w(.exe .jpg .png .gif .msi .pdf .swf) #decide before download the uri
-     links.each do |link|
-       # preliminary parsing
- =begin
- uri = URI.parse(link.uri).normalize
- href = uri.to_s
- rescue URI::InvalidURIError => e
- pp link
- puts e
- @invalid_links << link
- next
- =end
-       # ignore non-HTTP(S) requests
-       if link.uri.respond_to?'scheme' and !link.uri.scheme.nil? and link.uri.scheme != 'http' and link.uri.scheme != 'https'
-         @invalid_links << link #todo duplicate keys for different links cannot be represented
-         next
-       end
-       # ignore non-webpage files, javascript buttons and mailto links
-       if !link.href.nil? and exts_to_ignored.include?link.href[-4,4]# or href.start_with?'javascript:' or href.start_with?'mailto:'
-         @invalid_links << link
-         next
-       end
-       #nofollow links
-       if link.rel.include?'nofollow'
-         @nofollowed_links << link
-         next
-       end
-       if link.respond_to?'fragment' and link.fragment.empty?
-         @invalid_links << link
-         next
-       end
-       pp link
-       # handle relative paths
-       if !link.uri.nil? and link.uri.relative?
-         @invalid_links << link
-         #puts @uri.merge(link)
-         #link.uri = @uri.merge(link.uri)
-         @internal_outbound_links << link unless link.uri == @uri
-       elsif link.uri.nil?
-         warn "warning: host nil #{link.uri}"
-         next
-       else
-         if link.uri.to_s.start_with?'/' or @uri.merge(link.uri).domain == @uri.domain
-           @internal_outbound_links << link
-         else
-           @external_outbound_links << link
-         end
-       end
-       @valid_links << link
-     end
-     @outbound_links = @internal_outbound_links + @external_outbound_links
-     @scanned = true
-   end
- end
- class URI::Generic
-   def absolute?()
-     if @scheme or path.start_with?'/'
-       true
-     else
-       false
-     end
-   end
-   def domain
-     domain = (host.match /\.?([a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+)$/)
-     return domain[1] unless domain.nil?
-     return nil
-   end
- =begin
- def normalize!
-   if path && path == ''
-     set_path('/')
-   end
-   if scheme && scheme != scheme.downcase
-     set_scheme(self.scheme.downcase)
-   end
-   if host && host != host.downcase
-     set_host(self.host.downcase)
-   end
-   set_fragment(nil) unless fragment.nil?
- end
- =end
- end
- =begin
- class URI::Parser
-   def parse(uri)
-     scheme, userinfo, host, port, registry, path, opaque, query, fragment = self.split(uri)
-
-     if scheme && URI.scheme_list.include?(scheme.upcase)
-       URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
-     else
-       URI::Generic.new(scheme, userinfo, host, port, registry, path, opaque, query, nil, self)
-     end
-   end
- end
- a = Mechanize.new
- w = a.get('http://dict.youdao.com/w/abc/')
- w.scan_links
- pp w.internal_outbound_links
- exit
- w.links.each do |link|
-   puts link.rel
- end
- =end