tspider 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 95c0d16da560e6dcfa202a5200f921b2c7a11ed7
4
+ data.tar.gz: b51c3f6908b93bd5bef97eab7e656269076cc939
5
+ SHA512:
6
+ metadata.gz: d30570b33595b6a2d02674cc73f6d5ada6f1fa9a7a055b9235afaece12d086f17216f88886df36af299e0d6a11b951b305b31234b0de3acfd7d05d49d5611e2c
7
+ data.tar.gz: 521b79d2cbe67b7f4104c074a655019480fdb8e45fa554c755ba362cad7349cd92d2681fdf5111bf0e4ebe784ac318d421afc9ba5311da2441bd2ffa57dce15c
data/.gitignore ADDED
@@ -0,0 +1,16 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
15
+ .idea
16
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in tspider.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 mqzhang
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Tspider
2
+
3
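+ A spider for SEO: it fetches a page and extracts its status, title, meta tags, canonical URL, headings, links and response headers.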
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'tspider'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install tspider
20
+
21
+ ## Usage
22
+
23
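+ Call `Tspider.get(url)` (optionally passing a user-agent string such as `Tspider::UA::BAIDU` as the second argument) and read the extracted fields from the returned page's `opf` hash, e.g. `Tspider.get('http://www.baidu.com/').opf`.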
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/tspider/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,11 @@
1
+ require 'httparty'
2
+ require 'tspider/ua'
3
+
4
+ module Tspider
5
# HTTP client used to fetch pages. Mixing in HTTParty gives the class its
# class-level request methods (e.g. Client.get).
class Client
  include HTTParty

  # Redirect responses are handed back to the caller instead of being followed.
  follow_redirects(false)
end
11
+ end
@@ -0,0 +1,111 @@
1
+ require 'tspider/client'
2
+ require 'nokogiri'
3
+ require 'uri'
4
+ require 'webrobots'
5
+
6
+ module Tspider
7
# Fetches one URL and extracts on-page SEO signals from the response.
#
# The HTTP request is issued from the constructor. Afterwards the parsed
# document can be queried through the reader methods below, and a summary
# of every extracted field is available as #opf.
class Page
  attr_reader :opf, :html, :doc, :url, :status, :location, :response_time, :response

  # @param attrs [Hash] :url is the address to fetch; :user_agent is sent as
  #   the User-Agent header and handed to WebRobots for robots.txt checks.
  def initialize(attrs)
    @url = attrs[:url]
    @uri = URI(@url)
    @user_agent = attrs[:user_agent]
    @webrobots = WebRobots.new(@user_agent)
    @debug = false

    time_start = Time.now
    r = Client.get(@url, :headers => {"User-Agent" => @user_agent})
    time_end = Time.now
    @response = r
    @response_time = time_end - time_start
    @status = r.response.code.to_i

    # Fall back to an empty string so a response without a body does not
    # raise NoMethodError on nil.
    body = r.body || String.new
    @html = body.encode!('UTF-8', 'UTF-8', :invalid => :replace)
    @doc = Nokogiri::HTML(@html)
    @location = r.headers['location']
    @headers = r.headers.to_hash

    @opf = {:url => @url,
            :status => @status,
            # Was `@Location`, an instance variable that is never assigned,
            # so the summary always reported nil here.
            :location => @location,
            :response_time => @response_time,
            :canonical => canonical,
            :title => title,
            :meta_keywords => meta_keywords,
            :meta_description => meta_description,
            :meta_robots => meta_robots,
            :h1 => h1,
            :h2 => h2,
            :h3 => h3,
            :links => links,
            :headers => @headers
    }
  end

  # @return [String, nil] stripped text of the first <title> element, or nil
  #   when the document has none.
  def title
    # The title lives in the element's text, not in a `content` attribute.
    node = safe_search('title', [0])
    node && node.content.strip
  end

  # @return [String, nil] `content` attribute of the first description meta tag.
  def meta_description
    safe_search('meta[@name="description"]', [0, 'content'])
  end

  # @return [String, nil] `content` attribute of the first keywords meta tag.
  def meta_keywords
    safe_search('meta[@name="keywords"]', [0, 'content'])
  end

  # @return [String, nil] `content` attribute of the first robots meta tag.
  def meta_robots
    safe_search('meta[@name="robots"]', [0, 'content'])
  end

  # @return [Array<Hash>] one hash per <a> element with :href, :text, :rel and
  #   :disallow (robots.txt verdict for same-host links, nil otherwise).
  def links
    safe_search('a').map do |a|
      href = a['href']
      {href: href, text: a.content.strip, rel: a['rel'], disallow: robots_disallow(href)}
    end
  end

  # @return [String, nil] `href` of the first rel="canonical" link element.
  def canonical
    safe_search('link[@rel="canonical"]', [0, 'href'])
  end

  # Defines h1..h6, each returning the stripped text of every heading of
  # that level as an Array of Strings.
  (1..6).each do |i|
    define_method "h#{i}" do
      safe_search("h#{i}").map { |h| h.content.strip }
    end
  end

  private

  # Resolves +href+ against the page URL and asks WebRobots whether it is
  # disallowed. Returns nil for links on another host and for hrefs that
  # cannot be parsed as a URI, so one malformed link cannot abort the page.
  def robots_disallow(href)
    # URI.escape was removed in Ruby 3.0; the parser-level escape performs
    # the same escaping.
    target = @uri.merge(URI::DEFAULT_PARSER.escape(href.to_s))
    target.host == @uri.host ? @webrobots.disallowed?(target.to_s) : nil
  rescue URI::Error
    nil
  end

  # Runs +search_value+ against the parsed document and then walks
  # +select_path+ (indexes / attribute names) through the result.
  # Returns nil as soon as an intermediate value cannot be indexed.
  def safe_search(search_value, select_path = [])
    select_path.inject(@doc.search(search_value)) do |value, key|
      return nil unless value.respond_to?(:[])
      value[key]
    end
  end
end
111
+ end
data/lib/tspider/ua.rb ADDED
@@ -0,0 +1,7 @@
1
+ module Tspider
2
# User-Agent strings that can be passed to Tspider.get.
# The values are frozen so a caller cannot mutate the shared constants.
module UA
  # Identifies the spider itself.
  DEFAULT = 'Tspider (https://github.com/semseo/tspider)'.freeze
  # Baiduspider user-agent string.
  BAIDU = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'.freeze
  # Mobile (Android) user-agent string that references the Baidu spider page.
  BAIDU_MOBILE = 'Mozilla/5.0 (Linux;u;Android 2.3.7;zh-cn;) AppleWebKit/533.1 (KHTML,like Gecko) Version/4.0 Mobile Safari/533.1 (compatible; +http://www.baidu.com/search/spider.html)'.freeze
end
7
+ end
@@ -0,0 +1,3 @@
1
module Tspider
  # Gem version string; frozen so it cannot be mutated at runtime.
  VERSION = "0.0.1".freeze
end
data/lib/tspider.rb ADDED
@@ -0,0 +1,8 @@
1
+ require 'tspider/version'
2
+ require 'tspider/page'
3
+
4
module Tspider
  class << self
    # Builds a Page for +url+, passing +ua+ as the user agent.
    #
    # @param url [String] address handed to Page as :url
    # @param ua [String] user-agent string handed to Page as :user_agent;
    #   defaults to Tspider::UA::DEFAULT
    # @return [Page]
    def get(url, ua = ::Tspider::UA::DEFAULT)
      page_attrs = {:url => url, :user_agent => ua}
      Page.new(page_attrs)
    end
  end
end
@@ -0,0 +1,93 @@
1
+ require 'tspider'
2
+
3
+ # This file was generated by the `rspec --init` command. Conventionally, all
4
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
5
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
6
+ # this file to always be loaded, without a need to explicitly require it in any
7
+ # files.
8
+ #
9
+ # Given that it is always loaded, you are encouraged to keep this file as
10
+ # light-weight as possible. Requiring heavyweight dependencies from this file
11
+ # will add to the boot time of your test suite on EVERY test run, even for an
12
+ # individual file that may not need all of that loaded. Instead, consider making
13
+ # a separate helper file that requires the additional dependencies and performs
14
+ # the additional setup, and require it from the spec files that actually need
15
+ # it.
16
+ #
17
+ # The `.rspec` file also contains a few flags that are not defaults but that
18
+ # users commonly want.
19
+ #
20
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
21
RSpec.configure do |config|
  # rspec-expectations settings. A different assertion library could be
  # selected here instead of :rspec.
  config.expect_with(:rspec) do |expectations|
    # Include the text of helper methods defined with `chain` in the
    # `description` and `failure_message` of custom matchers, e.g.
    #   be_bigger_than(2).and_smaller_than(4).description
    #   # => "be bigger than 2 and smaller than 4"   (instead of "be bigger than 2")
    # The generated template notes this becomes the default in RSpec 4.
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks settings. A different test-double library could be selected
  # here by changing the `mock_with` argument.
  config.mock_with(:rspec) do |mocks|
    # Refuse to mock or stub a method that does not exist on the real object.
    # The generated template notes this becomes the default in RSpec 4.
    mocks.verify_partial_doubles = true
  end

  # The remaining settings are suggestions from `rspec --init` and are left
  # disabled. Uncomment individual lines to opt in.
  #
  # Limit a run to examples or groups tagged with `:focus`; when nothing is
  # tagged, everything runs.
  #   config.filter_run :focus
  #   config.run_all_when_everything_filtered = true
  #
  # Restrict the available syntax to the recommended non-monkey-patched form.
  # See:
  #   - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
  #   - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  #   - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
  #   config.disable_monkey_patching!
  #
  # Enable Ruby warnings (can be noisy because of issues in dependencies).
  #   config.warnings = true
  #
  # Use the documentation formatter when a single spec file is run, unless a
  # formatter has already been configured (e.g. via a command-line flag).
  #   if config.files_to_run.one?
  #     config.default_formatter = 'doc'
  #   end
  #
  # Print the 10 slowest examples and example groups after the run.
  #   config.profile_examples = 10
  #
  # Run specs in random order to surface order dependencies. The seed is
  # printed after each run and can be replayed with `--seed 1234`.
  #   config.order = :random
  #
  # Seed global randomization from the `--seed` CLI option so failures caused
  # by randomization can be reproduced deterministically.
  #   Kernel.srand config.seed
end
@@ -0,0 +1,5 @@
1
+ require 'spec_helper'
2
+ require 'pp'
3
+
4
+
5
# Exercises Tspider.get against a canned HTTP response so the suite makes
# assertions and does not depend on network access to a live site.
RSpec.describe Tspider do
  let(:html) do
    '<html><head><title>Example</title></head><body><h1> Heading </h1></body></html>'
  end

  # Stand-in exposing only what Page reads from the HTTP response:
  # #response.code, #body and #headers.
  let(:http_response) do
    double('http response',
           response: double('raw response', code: '200'),
           body: html.dup,
           headers: { 'location' => nil })
  end

  before do
    allow(Tspider::Client).to receive(:get).and_return(http_response)
  end

  it 'collects the fetched page into #opf' do
    page = Tspider.get('http://example.com/')

    expect(page.status).to eq(200)
    expect(page.h1).to eq(['Heading'])
    expect(page.opf).to include(url: 'http://example.com/', status: 200, h1: ['Heading'], links: [])
  end
end
data/tspider.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'tspider/version'
5
+
6
# Gem packaging metadata for tspider.
Gem::Specification.new do |spec|
  spec.name          = "tspider"
  spec.version       = Tspider::VERSION
  spec.authors       = ["mqzhang"]
  spec.email         = ["zmingqian@qq.com"]
  spec.summary       = %q{A spider for SEO.}
  spec.description   = %q{A spider for SEO.}
  # Same project URL the gem advertises in its default User-Agent string.
  spec.homepage      = "https://github.com/semseo/tspider"
  spec.license       = "MIT"

  # Package every file tracked by git; bin/ entries become executables and
  # test/spec/features files are listed as test files.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  # The repository ships .rspec and spec/spec_helper.rb (RSpec 3 settings),
  # so RSpec must be declared for the suite to be runnable.
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_runtime_dependency 'nokogiri'
  spec.add_runtime_dependency 'httparty'
  spec.add_runtime_dependency 'webrobots'
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tspider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - mqzhang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: httparty
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: webrobots
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: A spider for SEO.
84
+ email:
85
+ - zmingqian@qq.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - .gitignore
91
+ - .rspec
92
+ - Gemfile
93
+ - LICENSE.txt
94
+ - README.md
95
+ - Rakefile
96
+ - lib/tspider.rb
97
+ - lib/tspider/client.rb
98
+ - lib/tspider/page.rb
99
+ - lib/tspider/ua.rb
100
+ - lib/tspider/version.rb
101
+ - spec/spec_helper.rb
102
+ - spec/tspider_spec.rb
103
+ - tspider.gemspec
104
+ homepage: ''
105
+ licenses:
106
+ - MIT
107
+ metadata: {}
108
+ post_install_message:
109
+ rdoc_options: []
110
+ require_paths:
111
+ - lib
112
+ required_ruby_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ requirements: []
123
+ rubyforge_project:
124
+ rubygems_version: 2.0.14
125
+ signing_key:
126
+ specification_version: 4
127
+ summary: A spider for SEO.
128
+ test_files:
129
+ - spec/spec_helper.rb
130
+ - spec/tspider_spec.rb