seospider 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 984a688994b3020f4b7187b54694925197c3eac8
4
+ data.tar.gz: d252376604c47e26e2c5495496134c28b46b34ee
5
+ SHA512:
6
+ metadata.gz: 66624790fe10be37ae65053c814063ddbd54d610e4d5e1f008b880682efcdbebb97ca7b68db6c6d6b14cd4cddfe5b7ff79f6e97425c5e83a007d049fbaffeb5d
7
+ data.tar.gz: 7598dde6a43ae5a8fd57fadc56cfe4065ce649033366570ae0940b9e811bbaa12761c07fef527dcfbe25706c90411931ec77cd19cdcdae51ea1716988d4e8d38
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in seospider.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 MingQian Zhang
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Seospider
2
+
3
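+ SEO Spider is an HTTP client for SEO checks: it fetches a page without following redirects and reports its status, response time, title, meta tags, canonical link, headings and links.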
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'seospider'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install seospider
18
+
19
+ ## Usage
20
+
21
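+ From the command line, run `ss -u URL` to print the page report as YAML. From Ruby, call `Seospider.get(url)` to get the same report as a hash.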
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/ss ADDED
@@ -0,0 +1,34 @@
1
#!/usr/bin/env ruby

# Command-line front end for Seospider: fetches one URL and prints the
# resulting SEO report as YAML on standard output.
#
# The URL may be given with -u/--url or as the first positional argument.

require 'seospider'
require 'optparse'
require 'pp'
# YAML.dump is used below, so load it explicitly instead of relying on a
# dependency happening to require it.
require 'yaml'

usage = "Usage:
ss -u url
"

options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = usage

  opts.on("-u URL", "--url URL", "URL to check") do |v|
    options[:url] = v
  end
end
option_parser.parse!

# Fall back to the first remaining positional argument when -u was not given.
url = options[:url] || ARGV[0]

# Without a URL there is nothing to fetch: show the help text and exit with a
# non-zero status rather than passing nil into Seospider.get.
abort option_parser.help if url.nil? || url.empty?

page = Seospider.get(url)

puts YAML.dump(page)
data/install.rb ADDED
@@ -0,0 +1,15 @@
1
# Development helper: rebuilds the gem found next to this script and installs
# it locally, replacing any previously installed versions.
#
# Steps: remove old *.gem artifacts, `gem build`, `gem uninstall` every
# installed version, then `gem install` the freshly built package. The script
# aborts with a non-zero status as soon as building or installing fails, so
# the success message is only printed when the gem was really installed.

require 'fileutils'

path = File.expand_path('..', __FILE__)
gemspec = Dir[File.join(path, '*.gemspec')].first
abort "No .gemspec file found in #{path}" if gemspec.nil?
gem_name = File.basename(gemspec, '.gemspec')

puts "Installing gem #{gem_name}"
puts "Gem source path is #{path}"

Dir.chdir(path) do
  # rm_f does not complain when no previously built package exists.
  FileUtils.rm_f(Dir["#{gem_name}-*.gem"])
  puts "#{gem_name}-*.gem deleted"

  # Arguments are passed as a list so paths and names are never re-parsed by
  # a shell.
  system('gem', 'build', "#{gem_name}.gemspec") || abort("gem build failed for #{gem_name}")

  # Uninstall is best-effort: the gem may simply not be installed yet.
  system('gem', 'uninstall', gem_name, '-x', '-a', '-I')

  built_gems = Dir["#{gem_name}-*.gem"]
  abort "No built gem found for #{gem_name}" if built_gems.empty?
  system('gem', 'install', *built_gems) || abort("gem install failed for #{gem_name}")
end

puts "#{gem_name} gem installed successfully!!!"
@@ -0,0 +1,10 @@
1
+ require 'httparty'
2
+
3
module Seospider
  # HTTP client used to fetch pages. It mixes in HTTParty, so requests are
  # made through HTTParty's class-level API (e.g. Client.get).
  class Client
    include HTTParty

    # Redirects are not followed, so a redirecting URL yields the redirect
    # response itself (status code and Location header) rather than its target.
    follow_redirects(false)
  end
end
@@ -0,0 +1,111 @@
1
+ require 'nokogiri'
2
+ require 'seospider/client'
3
+ require 'uri'
4
+ require 'webrobots'
5
+
6
module Seospider
  # Fetches one URL and extracts SEO-relevant data from the response.
  #
  # All work happens in the constructor; the outcome is available through
  # #result as a Hash with these keys: :url, :status, :location,
  # :response_time, :headers, :canonical, :title, :meta_keywords,
  # :meta_description, :meta_robots, :h1..:h6 and :links.
  #
  # Every public method whose name starts with "_parse_" is treated as an
  # extractor: #parse calls it and stores its return value under the method
  # name minus that prefix. Extractors return nil when the page has no
  # matching element.
  class Parser
    # Escaper reproducing the behaviour of URI.escape, which was removed in
    # Ruby 3.0. Prefer the explicit RFC 2396 parser where the uri library
    # provides it; otherwise use the default parser.
    HREF_ESCAPER = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

    attr_reader :result

    # @param attrs [Hash] options:
    #   :url        - URL to fetch (required)
    #   :user_agent - agent name handed to the robots.txt checker;
    #                 defaults to 'Baiduspider'
    # NOTE(review): the user agent is only given to WebRobots here; it is not
    # passed to Client.get as a request header - confirm this is intended.
    def initialize(attrs)
      @url = attrs[:url]
      @uri = URI(@url)
      @user_agent = attrs[:user_agent] || 'Baiduspider'
      @webrobots = WebRobots.new(@user_agent)
      @debug = false

      time_start = Time.now
      r = Client.get(@url)
      response_time = Time.now - time_start

      # A response may carry no body at all, hence to_s. Invalid byte
      # sequences are replaced so the HTML parser always gets valid UTF-8.
      @html = r.body.to_s.encode('UTF-8', 'UTF-8', :invalid => :replace)
      @doc = Nokogiri::HTML(@html)

      @result = {:url => @url,
                 :status => r.response.code.to_i,
                 :location => r.headers['location'],
                 :response_time => response_time,
                 :canonical => '',
                 :title => '',
                 :meta_keywords => '',
                 :meta_description => '',
                 :meta_robots => '',
                 :h1 => '',
                 :h2 => '',
                 :h3 => '',
                 :links => '',
                 :headers => r.headers.to_hash
      }

      parse
    end

    # Runs every _parse_* extractor and stores its value in the result hash.
    # An extractor that raises is recorded as nil, and the error is reported
    # through warn only when @debug is set.
    #
    # @return [Hash] the populated result hash
    def parse
      methods.grep(/\A_parse_/).each do |extractor|
        value = begin
          send(extractor)
        rescue StandardError => e
          # StandardError only: interrupts and exit requests must propagate.
          warn "#{e.class} - #{e.message} -- #{extractor}" if @debug
          nil
        end

        @result[extractor.to_s.sub('_parse_', '').to_sym] = value
      end

      @result
    end

    # @return [String, nil] text of the first <title> element
    def _parse_title
      node = @doc.search('title').first
      node && node.content
    end

    # @return [String, nil] content of the first description meta tag
    def _parse_meta_description
      first_attribute('meta[@name="description"]', 'content')
    end

    # @return [String, nil] content of the first keywords meta tag
    def _parse_meta_keywords
      first_attribute('meta[@name="keywords"]', 'content')
    end

    # @return [String, nil] content of the first robots meta tag
    def _parse_meta_robots
      first_attribute('meta[@name="robots"]', 'content')
    end

    # Collects every <a> element of the page.
    #
    # @return [Array<Hash>] one hash per link with keys :href, :text, :rel and
    #   :disallow (robots.txt verdict for same-host links, nil otherwise)
    def _parse_links
      @doc.search('a').map do |a|
        href = a['href']
        {href: href, text: a.content.strip, rel: a['rel'], disallow: robots_disallow(href)}
      end
    end

    # @return [String, nil] href of the first canonical link element
    def _parse_canonical
      first_attribute('link[@rel="canonical"]', 'href')
    end

    # Defines _parse_h1 .. _parse_h6, each returning the stripped text of all
    # headings of that level as an Array of Strings.
    (1..6).each do |level|
      define_method "_parse_h#{level}" do
        @doc.search("h#{level}").map { |heading| heading.content.strip }
      end
    end

    private

    # Returns the given attribute of the first node matching selector, or nil
    # when nothing matches.
    def first_attribute(selector, attribute)
      node = @doc.search(selector).first
      node && node[attribute]
    end

    # Resolves href against the page URL and asks the robots.txt checker about
    # it when it points at the same host. Returns nil for other hosts and for
    # hrefs that cannot be parsed as a URI, so one malformed link does not
    # discard the whole link list.
    def robots_disallow(href)
      url = @uri.merge(HREF_ESCAPER.escape(href.to_s))
      url.host == @uri.host ? @webrobots.disallowed?(url.to_s) : nil
    rescue URI::Error
      nil
    end
  end
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the seospider gem.
module Seospider
  # Version string of the gem.
  VERSION = '0.0.1'
end
data/lib/seospider.rb ADDED
@@ -0,0 +1,8 @@
1
+ require 'seospider/version'
2
+ require 'seospider/parser'
3
+
4
module Seospider
  class << self
    # Fetches and analyses a single page.
    #
    # @param url [String] URL handed to Parser as its :url option
    # @return the value of Parser#result for that URL
    def get(url)
      parser = Parser.new(url: url)
      parser.result
    end
  end
end
data/seospider.gemspec ADDED
@@ -0,0 +1,26 @@
1
# coding: utf-8
# Gem specification for seospider. The version is read from
# lib/seospider/version.rb, so lib/ is put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'seospider/version'

Gem::Specification.new do |spec|
  spec.name          = "seospider"
  spec.version       = Seospider::VERSION
  spec.authors       = ["MingQian Zhang"]
  spec.email         = ["zmingqian@qq.com"]
  spec.description   = %q{SEO Spider : HTTP Client for SEO}
  spec.summary       = %q{SEO Spider}
  spec.homepage      = "https://github.com/mqzhang/seospider"
  spec.license       = "MIT"

  # NUL-separated listing keeps file names containing whitespace or newlines
  # intact.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # ">= 1.3" rather than "~> 1.3" so Bundler 2.x can be used for development.
  spec.add_development_dependency "bundler", ">= 1.3"
  spec.add_development_dependency "rake"
  spec.add_runtime_dependency 'nokogiri'
  spec.add_runtime_dependency 'httparty'
  spec.add_runtime_dependency 'webrobots'
end
metadata ADDED
@@ -0,0 +1,127 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: seospider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - MingQian Zhang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: httparty
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: webrobots
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: 'SEO Spider : HTTP Client for SEO'
84
+ email:
85
+ - zmingqian@qq.com
86
+ executables:
87
+ - ss
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - .gitignore
92
+ - Gemfile
93
+ - LICENSE.txt
94
+ - README.md
95
+ - Rakefile
96
+ - bin/ss
97
+ - install.rb
98
+ - lib/seospider.rb
99
+ - lib/seospider/client.rb
100
+ - lib/seospider/parser.rb
101
+ - lib/seospider/version.rb
102
+ - seospider.gemspec
103
+ homepage: https://github.com/mqzhang/seospider
104
+ licenses:
105
+ - MIT
106
+ metadata: {}
107
+ post_install_message:
108
+ rdoc_options: []
109
+ require_paths:
110
+ - lib
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project:
123
+ rubygems_version: 2.0.0
124
+ signing_key:
125
+ specification_version: 4
126
+ summary: SEO SPider
127
+ test_files: []