tspider 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 95c0d16da560e6dcfa202a5200f921b2c7a11ed7
4
+ data.tar.gz: b51c3f6908b93bd5bef97eab7e656269076cc939
5
+ SHA512:
6
+ metadata.gz: d30570b33595b6a2d02674cc73f6d5ada6f1fa9a7a055b9235afaece12d086f17216f88886df36af299e0d6a11b951b305b31234b0de3acfd7d05d49d5611e2c
7
+ data.tar.gz: 521b79d2cbe67b7f4104c074a655019480fdb8e45fa554c755ba362cad7349cd92d2681fdf5111bf0e4ebe784ac318d421afc9ba5311da2441bd2ffa57dce15c
data/.gitignore ADDED
@@ -0,0 +1,16 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
15
+ .idea
16
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in tspider.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 mqzhang
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Tspider
2
+
3
+ Tspider is a spider for SEO: it fetches a page and extracts its title, meta tags, canonical link, headings and links.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'tspider'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install tspider
20
+
21
+ ## Usage
22
+
23
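+ Call `Tspider.get(url)` (an optional second argument overrides the User-Agent string) and read the extracted data from `opf`, e.g. `Tspider.get('http://www.baidu.com/').opf`.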
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/semseo/tspider/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,11 @@
1
+ require 'httparty'
2
+ require 'tspider/ua'
3
+
4
+ module Tspider
5
+ class Client # HTTParty-based HTTP client; Tspider::Page fetches URLs through Client.get
6
+ include HTTParty
7
+ # Leave redirects unfollowed so the caller sees the redirect status and Location header itself.
8
+ follow_redirects false
9
+
10
+ end
11
+ end
@@ -0,0 +1,111 @@
1
+ require 'tspider/client'
2
+ require 'nokogiri'
3
+ require 'uri'
4
+ require 'webrobots'
5
+
6
+ module Tspider
7
+ class Page
8
+ attr_reader :opf, :html, :doc, :url, :status, :location, :response_time, :response
9
+
10
+ def initialize(attrs)
11
+ @url = attrs[:url]
12
+ @uri = URI(@url)
13
+ @user_agent = attrs[:user_agent]
14
+ @webrobots = WebRobots.new(@user_agent)
15
+ @debug = false
16
+
17
+ time_start = Time.now
18
+ r = Client.get(@url, :headers => {"User-Agent" => @user_agent})
19
+ time_end = Time.now
20
+ @response = r
21
+ @response_time = time_end - time_start
22
+ @status = r.response.code.to_i
23
+
24
+ @html = r.body.encode!('UTF-8', 'UTF-8', :invalid => :replace)
25
+ @doc = Nokogiri::HTML(@html)
26
+ @location = r.headers['location']
27
+ @headers = r.headers.to_hash
28
+
29
+ @opf = {:url => @url,
30
+ :status => @status,
31
+ :location => @Location,
32
+ :response_time => @response_time,
33
+ :canonical => canonical,
34
+ :title => title,
35
+ :meta_keywords => meta_keywords,
36
+ :meta_description => meta_description,
37
+ :meta_robots => meta_robots,
38
+ :h1 => h1,
39
+ :h2 => h2,
40
+ :h3 => h3,
41
+ :links => links,
42
+ :headers => @headers
43
+ }
44
+
45
+ end
46
+
47
+ def title
48
+ safe_search('title',[0,'content'])
49
+ end
50
+
51
+ def meta_description
52
+ safe_search('meta[@name="description"]',[0,'content'])
53
+ end
54
+
55
+ def meta_keywords
56
+ safe_search('meta[@name="keywords"]',[0,'content'])
57
+ end
58
+
59
+ def meta_robots
60
+ safe_search('meta[@name="robots"]',[0,'content'])
61
+ end
62
+
63
+ def links
64
+ links = []
65
+ safe_search('a').each do |a|
66
+ href = a['href']
67
+ text = a.content.strip
68
+ rel = a['rel']
69
+
70
+ url = @uri.merge(URI.escape(href.to_s))
71
+ if url.host == @uri.host
72
+ disallow = @webrobots.disallowed?(url.to_s)
73
+ else
74
+ disallow = nil
75
+ end
76
+
77
+ links << {href: href, text: text, rel: rel, disallow: disallow}
78
+ end
79
+ links
80
+ end
81
+
82
+ def canonical
83
+ safe_search('link[@rel="canonical"]',[0,'href'])
84
+ end
85
+
86
+ (1..6).each do |i|
87
+ define_method "h#{i}" do
88
+ result = []
89
+ safe_search("h#{i}").each do |h|
90
+ result << h.content.strip
91
+ end
92
+ result
93
+ end
94
+ end
95
+
96
+ private
97
+
98
+ def safe_search(search_value,select_path=[])
99
+ value = @doc.search(search_value)
100
+ select_path.each do |key|
101
+ begin
102
+ value = value[key]
103
+ rescue NoMethodError
104
+ return nil
105
+ end
106
+ end
107
+ value
108
+ end
109
+
110
+ end
111
+ end
data/lib/tspider/ua.rb ADDED
@@ -0,0 +1,7 @@
1
+ module Tspider
2
+ module UA
3
+ DEFAULT = 'Tspider (https://github.com/semseo/tspider)'
4
+ BAIDU = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
5
+ BAIDU_MOBILE = 'Mozilla/5.0 (Linux;u;Android 2.3.7;zh-cn;) AppleWebKit/533.1 (KHTML,like Gecko) Version/4.0 Mobile Safari/533.1 (compatible; +http://www.baidu.com/search/spider.html)'
6
+ end
7
+ end
@@ -0,0 +1,3 @@
1
+ module Tspider
2
+ VERSION = "0.0.1" # gem version; tspider.gemspec reads this constant
3
+ end
data/lib/tspider.rb ADDED
@@ -0,0 +1,8 @@
1
+ require 'tspider/version'
2
+ require 'tspider/page'
3
+
4
+ module Tspider
5
+ def self.get(url,ua = ::Tspider::UA::DEFAULT) # builds a Page for url; ua is the User-Agent string (UA::DEFAULT when omitted)
6
+ Page.new(url: url,user_agent: ua)
7
+ end
8
+ end
@@ -0,0 +1,93 @@
1
+ require 'tspider'
2
+
3
+ # This file was generated by the `rspec --init` command. Conventionally, all
4
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
5
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
6
+ # this file to always be loaded, without a need to explicitly require it in any
7
+ # files.
8
+ #
9
+ # Given that it is always loaded, you are encouraged to keep this file as
10
+ # light-weight as possible. Requiring heavyweight dependencies from this file
11
+ # will add to the boot time of your test suite on EVERY test run, even for an
12
+ # individual file that may not need all of that loaded. Instead, consider making
13
+ # a separate helper file that requires the additional dependencies and performs
14
+ # the additional setup, and require it from the spec files that actually need
15
+ # it.
16
+ #
17
+ # The `.rspec` file also contains a few flags that are not defaults but that
18
+ # users commonly want.
19
+ #
20
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
21
+ RSpec.configure do |config|
22
+ # rspec-expectations config goes here. You can use an alternate
23
+ # assertion/expectation library such as wrong or the stdlib/minitest
24
+ # assertions if you prefer.
25
+ config.expect_with :rspec do |expectations|
26
+ # This option will default to `true` in RSpec 4. It makes the `description`
27
+ # and `failure_message` of custom matchers include text for helper methods
28
+ # defined using `chain`, e.g.:
29
+ # be_bigger_than(2).and_smaller_than(4).description
30
+ # # => "be bigger than 2 and smaller than 4"
31
+ # ...rather than:
32
+ # # => "be bigger than 2"
33
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
34
+ end
35
+
36
+ # rspec-mocks config goes here. You can use an alternate test double
37
+ # library (such as bogus or mocha) by changing the `mock_with` option here.
38
+ config.mock_with :rspec do |mocks|
39
+ # Prevents you from mocking or stubbing a method that does not exist on
40
+ # a real object. This is generally recommended, and will default to
41
+ # `true` in RSpec 4.
42
+ mocks.verify_partial_doubles = true
43
+ end
44
+
45
+ # The settings below are suggested to provide a good initial experience
46
+ # with RSpec, but feel free to customize to your heart's content.
47
+ =begin
48
+ # These two settings work together to allow you to limit a spec run
49
+ # to individual examples or groups you care about by tagging them with
50
+ # `:focus` metadata. When nothing is tagged with `:focus`, all examples
51
+ # get run.
52
+ config.filter_run :focus
53
+ config.run_all_when_everything_filtered = true
54
+
55
+ # Limits the available syntax to the non-monkey patched syntax that is
56
+ # recommended. For more details, see:
57
+ # - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
58
+ # - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
59
+ # - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
60
+ config.disable_monkey_patching!
61
+
62
+ # This setting enables warnings. It's recommended, but in some cases may
63
+ # be too noisy due to issues in dependencies.
64
+ config.warnings = true
65
+
66
+ # Many RSpec users commonly either run the entire suite or an individual
67
+ # file, and it's useful to allow more verbose output when running an
68
+ # individual spec file.
69
+ if config.files_to_run.one?
70
+ # Use the documentation formatter for detailed output,
71
+ # unless a formatter has already been configured
72
+ # (e.g. via a command-line flag).
73
+ config.default_formatter = 'doc'
74
+ end
75
+
76
+ # Print the 10 slowest examples and example groups at the
77
+ # end of the spec run, to help surface which specs are running
78
+ # particularly slow.
79
+ config.profile_examples = 10
80
+
81
+ # Run specs in random order to surface order dependencies. If you find an
82
+ # order dependency and want to debug it, you can fix the order by providing
83
+ # the seed, which is printed after each run.
84
+ # --seed 1234
85
+ config.order = :random
86
+
87
+ # Seed global randomization in this process using the `--seed` CLI option.
88
+ # Setting this allows you to use `--seed` to deterministically reproduce
89
+ # test failures related to randomization by passing the same `--seed` value
90
+ # as the one that triggered the failure.
91
+ Kernel.srand config.seed
92
+ =end
93
+ end
@@ -0,0 +1,5 @@
1
+ require 'spec_helper'
2
+ require 'pp'
3
+ # Manual smoke check rather than an RSpec example: loading this file performs a
4
+ # live HTTP request to http://www.baidu.com/ and pretty-prints the resulting summary hash.
5
+ pp Tspider.get('http://www.baidu.com/').opf
data/tspider.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'tspider/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "tspider"
8
+ spec.version = Tspider::VERSION
9
+ spec.authors = ["mqzhang"]
10
+ spec.email = ["zmingqian@qq.com"]
11
+ spec.summary = %q{A spider for SEO.}
12
+ spec.description = %q{A spider for SEO.}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+ # Package every file tracked by git; executables and test files are picked out of that list by path.
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+ # bundler and rake are development-only; nokogiri, httparty and webrobots are needed at runtime.
21
+ spec.add_development_dependency "bundler", "~> 1.7"
22
+ spec.add_development_dependency "rake", "~> 10.0"
23
+ spec.add_runtime_dependency 'nokogiri'
24
+ spec.add_runtime_dependency 'httparty'
25
+ spec.add_runtime_dependency 'webrobots'
26
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tspider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - mqzhang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: httparty
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: webrobots
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: A spider for SEO.
84
+ email:
85
+ - zmingqian@qq.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - .gitignore
91
+ - .rspec
92
+ - Gemfile
93
+ - LICENSE.txt
94
+ - README.md
95
+ - Rakefile
96
+ - lib/tspider.rb
97
+ - lib/tspider/client.rb
98
+ - lib/tspider/page.rb
99
+ - lib/tspider/ua.rb
100
+ - lib/tspider/version.rb
101
+ - spec/spec_helper.rb
102
+ - spec/tspider_spec.rb
103
+ - tspider.gemspec
104
+ homepage: ''
105
+ licenses:
106
+ - MIT
107
+ metadata: {}
108
+ post_install_message:
109
+ rdoc_options: []
110
+ require_paths:
111
+ - lib
112
+ required_ruby_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ requirements: []
123
+ rubyforge_project:
124
+ rubygems_version: 2.0.14
125
+ signing_key:
126
+ specification_version: 4
127
+ summary: A spider for SEO.
128
+ test_files:
129
+ - spec/spec_helper.rb
130
+ - spec/tspider_spec.rb